From 881e40be880fdebb5d3ebfbbce18de442c01d9e5 Mon Sep 17 00:00:00 2001
From: Fletterio <fj.letterio@gmail.com>
Date: Wed, 22 Jan 2025 12:43:15 -0300
Subject: [PATCH 001/529] Update Bloom example, removed memory barriers on FFT

Signed-off-by: kevyuu <kevin.kayu@gmail.com>
---
 28_FFTBloom/app_resources/fft_common.hlsl | 5 -----
 1 file changed, 5 deletions(-)

diff --git a/28_FFTBloom/app_resources/fft_common.hlsl b/28_FFTBloom/app_resources/fft_common.hlsl
index 295c05223..41f8821cc 100644
--- a/28_FFTBloom/app_resources/fft_common.hlsl
+++ b/28_FFTBloom/app_resources/fft_common.hlsl
@@ -32,11 +32,6 @@ struct PreloadedAccessorCommonBase
 	NBL_CONSTEXPR_STATIC_INLINE uint16_t ElementsPerInvocation = FFTParameters::ElementsPerInvocation;
 	NBL_CONSTEXPR_STATIC_INLINE uint16_t WorkgroupSize = FFTParameters::WorkgroupSize;
 	NBL_CONSTEXPR_STATIC_INLINE uint16_t TotalSize = FFTParameters::TotalSize;
-
-	void memoryBarrier()
-	{
-		// Preloaded Accessors don't access any memory in this stage, so we don't need to do anything here
-	}
 };
 
 struct PreloadedAccessorBase : PreloadedAccessorCommonBase

From 444c91729670f1c804b656899ef7d483ca9f30b8 Mon Sep 17 00:00:00 2001
From: kevyuu <kevin.kayu@gmail.com>
Date: Wed, 22 Jan 2025 11:05:21 +0700
Subject: [PATCH 002/529] Implement Ray Tracing Demo

- Multiple HitGroup. Each with closesthit and anythit shader
- Multiple Miss Shader Group.

Signed-off-by: kevyuu <kevin.kayu@gmail.com>
---
 71_RayTracingPipeline/CMakeLists.txt          |   28 +
 .../app_resources/common.hlsl                 |  101 ++
 .../app_resources/random.hlsl                 |   34 +
 .../app_resources/raytrace.rahit.hlsl         |   27 +
 .../app_resources/raytrace.rchit.hlsl         |  152 ++
 .../app_resources/raytrace.rgen.hlsl          |   72 +
 .../app_resources/raytrace.rmiss.hlsl         |    8 +
 .../app_resources/raytraceShadow.rmiss.hlsl   |    7 +
 71_RayTracingPipeline/include/common.hpp      |   93 ++
 71_RayTracingPipeline/main.cpp                | 1289 +++++++++++++++++
 CMakeLists.txt                                |    1 +
 common/include/CCamera.hpp                    |   15 +-
 12 files changed, 1825 insertions(+), 2 deletions(-)
 create mode 100644 71_RayTracingPipeline/CMakeLists.txt
 create mode 100644 71_RayTracingPipeline/app_resources/common.hlsl
 create mode 100644 71_RayTracingPipeline/app_resources/random.hlsl
 create mode 100644 71_RayTracingPipeline/app_resources/raytrace.rahit.hlsl
 create mode 100644 71_RayTracingPipeline/app_resources/raytrace.rchit.hlsl
 create mode 100644 71_RayTracingPipeline/app_resources/raytrace.rgen.hlsl
 create mode 100644 71_RayTracingPipeline/app_resources/raytrace.rmiss.hlsl
 create mode 100644 71_RayTracingPipeline/app_resources/raytraceShadow.rmiss.hlsl
 create mode 100644 71_RayTracingPipeline/include/common.hpp
 create mode 100644 71_RayTracingPipeline/main.cpp

diff --git a/71_RayTracingPipeline/CMakeLists.txt b/71_RayTracingPipeline/CMakeLists.txt
new file mode 100644
index 000000000..4a555f4ce
--- /dev/null
+++ b/71_RayTracingPipeline/CMakeLists.txt
@@ -0,0 +1,28 @@
+set(NBL_INCLUDE_SEARCH_DIRECTORIES
+	"${CMAKE_CURRENT_SOURCE_DIR}/include"
+)
+
+include(common RESULT_VARIABLE RES)
+if(NOT RES)
+	message(FATAL_ERROR "common.cmake not found. Should be in {repo_root}/cmake directory")
+endif()
+
+nbl_create_executable_project("" "" "${NBL_INCLUDE_SEARCH_DIRECTORIES}" "" "${NBL_EXECUTABLE_PROJECT_CREATION_PCH_TARGET}")
+
+if(NBL_EMBED_BUILTIN_RESOURCES)
+	set(_BR_TARGET_ ${EXECUTABLE_NAME}_builtinResourceData)
+	set(RESOURCE_DIR "app_resources")
+
+	get_filename_component(_SEARCH_DIRECTORIES_ "${CMAKE_CURRENT_SOURCE_DIR}" ABSOLUTE)
+	get_filename_component(_OUTPUT_DIRECTORY_SOURCE_ "${CMAKE_CURRENT_BINARY_DIR}/src" ABSOLUTE)
+	get_filename_component(_OUTPUT_DIRECTORY_HEADER_ "${CMAKE_CURRENT_BINARY_DIR}/include" ABSOLUTE)
+
+    file(GLOB_RECURSE BUILTIN_RESOURCE_FILES RELATIVE "${CMAKE_CURRENT_SOURCE_DIR}/${RESOURCE_DIR}" CONFIGURE_DEPENDS "${CMAKE_CURRENT_SOURCE_DIR}/${RESOURCE_DIR}/*")
+    foreach(RES_FILE ${BUILTIN_RESOURCE_FILES})
+      LIST_BUILTIN_RESOURCE(RESOURCES_TO_EMBED "${RES_FILE}")
+    endforeach()
+
+	ADD_CUSTOM_BUILTIN_RESOURCES(${_BR_TARGET_} RESOURCES_TO_EMBED "${_SEARCH_DIRECTORIES_}" "${RESOURCE_DIR}" "nbl::this_example::builtin" "${_OUTPUT_DIRECTORY_HEADER_}" "${_OUTPUT_DIRECTORY_SOURCE_}")
+
+	LINK_BUILTIN_RESOURCES_TO_TARGET(${EXECUTABLE_NAME} ${_BR_TARGET_})
+endif()
diff --git a/71_RayTracingPipeline/app_resources/common.hlsl b/71_RayTracingPipeline/app_resources/common.hlsl
new file mode 100644
index 000000000..3b6c36abc
--- /dev/null
+++ b/71_RayTracingPipeline/app_resources/common.hlsl
@@ -0,0 +1,101 @@
+#ifndef RQG_COMMON_HLSL
+#define RQG_COMMON_HLSL
+
+#include "nbl/builtin/hlsl/cpp_compat.hlsl"
+
+NBL_CONSTEXPR uint32_t WorkgroupSize = 16;
+
+struct Material
+{
+	float32_t3 ambient;
+    float32_t3 diffuse;
+    float32_t3 specular;
+    float32_t shininess;
+    float32_t dissolve; // 1 == opaque; 0 == fully transparent
+    uint32_t illum; // illumination model (see http://www.fileformat.info/format/material/)
+};
+
+struct SGeomInfo
+{
+    uint64_t vertexBufferAddress;
+    uint64_t indexBufferAddress;
+
+    uint32_t vertexStride : 29;
+    uint32_t indexType : 2; // 16 bit, 32 bit or none
+    uint32_t smoothNormals : 1;	// flat for cube, rectangle, disk
+
+    uint32_t objType;
+
+    Material material;
+};
+
+struct SPushConstants
+{
+    uint64_t geometryInfoBuffer;
+    uint32_t frameCounter;
+
+    float32_t3 camPos;
+    float32_t4x4 invMVP;
+
+};
+
+#ifdef __HLSL_VERSION
+
+struct [raypayload] ColorPayload
+{
+	float32_t3 hitValue;
+    uint32_t seed;
+};
+
+struct [raypayload] ShadowPayload
+{
+	bool isShadowed;
+    uint32_t seed;
+};
+
+enum ObjectType : uint32_t  // matches c++
+{
+    OT_CUBE = 0,
+    OT_SPHERE,
+    OT_CYLINDER,
+    OT_RECTANGLE,
+    OT_DISK,
+    OT_ARROW,
+    OT_CONE,
+    OT_ICOSPHERE,
+
+    OT_COUNT
+};
+
+static uint32_t s_offsetsToNormalBytes[OT_COUNT] = { 18, 24, 24, 20, 20, 24, 16, 12 };	// based on normals data position
+float32_t3 computeDiffuse(Material mat, float32_t3 light_dir, float32_t3 normal)
+{
+	// Lambertian
+	float32_t dotNL = max(dot(normal, light_dir), 0.0);
+	float32_t3 c = mat.diffuse * dotNL;
+	if (mat.illum >= 1)
+		c += mat.ambient;
+	return c;
+}
+
+float32_t3 computeSpecular(Material mat, float32_t3 view_dir, 
+	float32_t3 light_dir, float32_t3 normal)
+{
+	if (mat.illum < 2)
+		return float32_t3(0, 0, 0);
+
+	// Compute specular only if not in shadow
+	const float32_t kPi = 3.14159265;
+	const float32_t kShininess = max(mat.shininess, 4.0);
+
+	// Specular
+	const float32_t kEnergyConservation = (2.0 + kShininess) / (2.0 * kPi);
+	float32_t3 V = normalize(-view_dir);
+	float32_t3 R = reflect(-light_dir, normal);
+	float32_t specular = kEnergyConservation * pow(max(dot(V, R), 0.0), kShininess);
+
+	return float32_t3(mat.specular * specular);
+}
+#endif
+
+#endif  // RQG_COMMON_HLSL
diff --git a/71_RayTracingPipeline/app_resources/random.hlsl b/71_RayTracingPipeline/app_resources/random.hlsl
new file mode 100644
index 000000000..e01d7ff6c
--- /dev/null
+++ b/71_RayTracingPipeline/app_resources/random.hlsl
@@ -0,0 +1,34 @@
+// Generate a random unsigned int from two unsigned int values, using 16 pairs
+// of rounds of the Tiny Encryption Algorithm. See Zafar, Olano, and Curtis,
+// "GPU Random Numbers via the Tiny Encryption Algorithm"
+uint32_t tea(uint32_t val0, uint32_t val1)
+{
+  uint32_t v0 = val0;
+  uint32_t v1 = val1;
+  uint32_t s0 = 0;
+
+  for(uint32_t n = 0; n < 16; n++)
+  {
+    s0 += 0x9e3779b9;
+    v0 += ((v1 << 4) + 0xa341316c) ^ (v1 + s0) ^ ((v1 >> 5) + 0xc8013ea4);
+    v1 += ((v0 << 4) + 0xad90777d) ^ (v0 + s0) ^ ((v0 >> 5) + 0x7e95761e);
+  }
+
+  return v0;
+}
+
+// Generate a random unsigned int in [0, 2^24) given the previous RNG state
+// using the Numerical Recipes linear congruential generator
+uint32_t lcg(inout uint32_t prev)
+{
+  uint32_t LCG_A = 1664525u;
+  uint32_t LCG_C = 1013904223u;
+  prev       = (LCG_A * prev + LCG_C);
+  return prev & 0x00FFFFFF;
+}
+
+// Generate a random float32_t in [0, 1) given the previous RNG state
+float32_t rnd(inout uint32_t prev)
+{
+  return (float32_t(lcg(prev)) / float32_t(0x01000000));
+}
diff --git a/71_RayTracingPipeline/app_resources/raytrace.rahit.hlsl b/71_RayTracingPipeline/app_resources/raytrace.rahit.hlsl
new file mode 100644
index 000000000..f68d607aa
--- /dev/null
+++ b/71_RayTracingPipeline/app_resources/raytrace.rahit.hlsl
@@ -0,0 +1,27 @@
+#include "common.hlsl"
+#include "random.hlsl"
+
+[[vk::push_constant]] SPushConstants pc;
+
+[[vk::binding(0, 0)]] RaytracingAccelerationStructure topLevelAS;
+
+#if defined(USE_COLOR_PAYLOAD)
+using AnyHitPayload = ColorPayload;
+#elif defined(USE_SHADOW_PAYLOAD)
+using AnyHitPayload = ShadowPayload;
+#endif
+
+[shader("anyhit")]
+void main(inout AnyHitPayload p, in BuiltInTriangleIntersectionAttributes attribs)
+{
+    const int instID = InstanceID();
+    const SGeomInfo geom = vk::RawBufferLoad < SGeomInfo > (pc.geometryInfoBuffer + instID * sizeof(SGeomInfo));
+    
+    if (geom.material.illum != 4)
+        return;
+
+    if (geom.material.dissolve == 0.0)
+        IgnoreHit();
+    else if (rnd(p.seed) > geom.material.dissolve)
+        IgnoreHit();
+}
diff --git a/71_RayTracingPipeline/app_resources/raytrace.rchit.hlsl b/71_RayTracingPipeline/app_resources/raytrace.rchit.hlsl
new file mode 100644
index 000000000..b77412ff7
--- /dev/null
+++ b/71_RayTracingPipeline/app_resources/raytrace.rchit.hlsl
@@ -0,0 +1,152 @@
+#include "common.hlsl"
+
+[[vk::push_constant]] SPushConstants pc;
+
+[[vk::binding(0, 0)]] RaytracingAccelerationStructure topLevelAS;
+
+float3 unpackNormals3x10(uint32_t v)
+{
+    // host side changes float32_t3 to EF_A2B10G10R10_SNORM_PACK32
+    // follows unpacking scheme from https://github.com/KhronosGroup/SPIRV-Cross/blob/main/reference/shaders-hlsl/frag/unorm-snorm-packing.frag
+    int signedValue = int(v);
+    int3 pn = int3(signedValue << 22, signedValue << 12, signedValue << 2) >> 22;
+    return clamp(float3(pn) / 511.0, -1.0, 1.0);
+}
+
+struct VertexData {
+    float32_t3 position;
+    float32_t3 normal;
+};
+
+VertexData fetchVertexData(int instID, int primID, SGeomInfo geom, float2 bary)
+{
+    uint idxOffset = primID * 3;
+
+    const uint indexType = geom.indexType;
+    const uint vertexStride = geom.vertexStride;
+
+    const uint32_t objType = geom.objType;
+    const uint64_t indexBufferAddress = geom.indexBufferAddress;
+
+    uint i0, i1, i2;
+    switch (indexType)
+    {
+        case 0: // EIT_16BIT
+        {
+            i0 = uint32_t(vk::RawBufferLoad<uint16_t>(indexBufferAddress + (idxOffset + 0) * sizeof(uint16_t), 2u));
+            i1 = uint32_t(vk::RawBufferLoad<uint16_t>(indexBufferAddress + (idxOffset + 1) * sizeof(uint16_t), 2u));
+            i2 = uint32_t(vk::RawBufferLoad<uint16_t>(indexBufferAddress + (idxOffset + 2) * sizeof(uint16_t), 2u));
+        }
+        break;
+        case 1: // EIT_32BIT
+        {
+            i0 = vk::RawBufferLoad<uint32_t>(indexBufferAddress + (idxOffset + 0) * sizeof(uint32_t));
+            i1 = vk::RawBufferLoad<uint32_t>(indexBufferAddress + (idxOffset + 1) * sizeof(uint32_t));
+            i2 = vk::RawBufferLoad<uint32_t>(indexBufferAddress + (idxOffset + 2) * sizeof(uint32_t));
+        }
+        break;
+        default:    // EIT_NONE
+        {
+            i0 = idxOffset;
+            i1 = idxOffset + 1;
+            i2 = idxOffset + 2;
+        }
+    }
+
+    const uint64_t vertexBufferAddress = geom.vertexBufferAddress;
+	float32_t3 p0 = vk::RawBufferLoad<float32_t3>(vertexBufferAddress + i0 * vertexStride);
+	float32_t3 p1 = vk::RawBufferLoad<float32_t3>(vertexBufferAddress + i1 * vertexStride);
+	float32_t3 p2 = vk::RawBufferLoad<float32_t3>(vertexBufferAddress + i2 * vertexStride);
+
+    const uint64_t normalVertexBufferAddress = vertexBufferAddress + s_offsetsToNormalBytes[objType];
+    float3 n0, n1, n2;
+    switch (objType)
+    {
+        case OT_CUBE:
+        {
+            uint32_t v0 = vk::RawBufferLoad<uint32_t>(normalVertexBufferAddress + i0 * vertexStride, 2u);
+            uint32_t v1 = vk::RawBufferLoad<uint32_t>(normalVertexBufferAddress + i1 * vertexStride, 2u);
+            uint32_t v2 = vk::RawBufferLoad<uint32_t>(normalVertexBufferAddress + i2 * vertexStride, 2u);
+
+            n0 = normalize(nbl::hlsl::spirv::unpackSnorm4x8(v0).xyz);
+            n1 = normalize(nbl::hlsl::spirv::unpackSnorm4x8(v1).xyz);
+            n2 = normalize(nbl::hlsl::spirv::unpackSnorm4x8(v2).xyz);
+        }
+        break;
+        case OT_SPHERE:
+        case OT_CYLINDER:
+        case OT_ARROW:
+        case OT_CONE:
+        {
+            uint32_t v0 = vk::RawBufferLoad<uint32_t>(normalVertexBufferAddress + i0 * vertexStride);
+            uint32_t v1 = vk::RawBufferLoad<uint32_t>(normalVertexBufferAddress + i1 * vertexStride);
+            uint32_t v2 = vk::RawBufferLoad<uint32_t>(normalVertexBufferAddress + i2 * vertexStride);
+
+            n0 = normalize(unpackNormals3x10(v0));
+            n1 = normalize(unpackNormals3x10(v1));
+            n2 = normalize(unpackNormals3x10(v2));
+        }
+        break;
+        case OT_RECTANGLE:
+        case OT_DISK:
+        case OT_ICOSPHERE:
+        default:
+        {
+            n0 = normalize(vk::RawBufferLoad<float3>(normalVertexBufferAddress + i0 * vertexStride));
+            n1 = normalize(vk::RawBufferLoad<float3>(normalVertexBufferAddress + i1 * vertexStride));
+            n2 = normalize(vk::RawBufferLoad<float3>(normalVertexBufferAddress + i2 * vertexStride));
+        }
+    }
+
+    float3 barycentrics = float3(0.0, bary);
+    barycentrics.x = 1.0 - barycentrics.y - barycentrics.z;        
+
+    VertexData data;
+    data.position = barycentrics.x * p0 + barycentrics.y * p1 + barycentrics.z * p2;
+    data.normal = normalize(barycentrics.x * n0 + barycentrics.y * n1 + barycentrics.z * n2);
+    return data;
+}
+
+[shader("closesthit")]
+void main(inout ColorPayload p, in BuiltInTriangleIntersectionAttributes attribs)
+{
+	const int instID = InstanceID();
+	const int primID = PrimitiveIndex();
+    const SGeomInfo geom = vk::RawBufferLoad<SGeomInfo>(pc.geometryInfoBuffer + instID * sizeof(SGeomInfo));
+    const VertexData vertexData = fetchVertexData(instID, primID, geom, attribs.barycentrics);
+    const float32_t3 worldPosition = mul(ObjectToWorld3x4(), float32_t4(vertexData.position, 1));
+    const float32_t3 worldNormal = mul(vertexData.normal, WorldToObject3x4()).xyz;
+
+    const float32_t lightIntensity = 1;
+    const float32_t3 lightDirection = normalize(float32_t3(1, 1, -1));
+
+    float32_t3 diffuse = computeDiffuse(geom.material, lightDirection, worldNormal);
+    float32_t3 specular = float32_t3(0, 0, 0);
+    float32_t attenuation = 1;
+
+    if (dot(worldNormal, lightDirection) > 0)
+    {
+        RayDesc rayDesc;
+    rayDesc.Origin = WorldRayOrigin() + WorldRayDirection() * RayTCurrent() + worldNormal * 0.02f;
+        rayDesc.Direction = lightDirection;
+        rayDesc.TMin = 0.001;
+        rayDesc.TMax = 1000;
+
+        uint flags = RAY_FLAG_SKIP_CLOSEST_HIT_SHADER;
+        ShadowPayload shadowPayload;
+        shadowPayload.isShadowed = true;
+        shadowPayload.seed = p.seed;
+        TraceRay(topLevelAS, flags, 0xFF, 1, 0, 1, rayDesc, shadowPayload);
+        p.seed = shadowPayload.seed;
+
+        if (shadowPayload.isShadowed)
+        {
+            attenuation = 0.3;
+        }
+        else
+        {
+            specular = computeSpecular(geom.material, WorldRayDirection(), lightDirection, worldNormal);
+        }
+    }
+	p.hitValue = (lightIntensity * attenuation * (specular + diffuse));	
+}
\ No newline at end of file
diff --git a/71_RayTracingPipeline/app_resources/raytrace.rgen.hlsl b/71_RayTracingPipeline/app_resources/raytrace.rgen.hlsl
new file mode 100644
index 000000000..90b950f76
--- /dev/null
+++ b/71_RayTracingPipeline/app_resources/raytrace.rgen.hlsl
@@ -0,0 +1,72 @@
+#include "common.hlsl"
+#include "random.hlsl"
+
+#include "nbl/builtin/hlsl/jit/device_capabilities.hlsl"
+
+#include "nbl/builtin/hlsl/glsl_compat/core.hlsl"
+#include "nbl/builtin/hlsl/spirv_intrinsics/raytracing.hlsl"
+
+static const int32_t s_sampleCount = 10;
+
+[[vk::push_constant]] SPushConstants pc;
+
+[[vk::binding(0, 0)]] RaytracingAccelerationStructure topLevelAS;
+
+[[vk::binding(1, 0)]] RWTexture2D<float32_t4> colorImage;
+
+float32_t3 reinhardTonemap(float32_t3 v)
+{
+    return v / (1.0f + v);
+}
+
+[shader("raygeneration")]
+void main()
+{
+    uint32_t3 launchID = DispatchRaysIndex();
+    uint32_t3 launchSize = DispatchRaysDimensions();
+    uint32_t2 coords = launchID.xy;
+    uint32_t seed = tea(launchID.y * launchSize.x + launchID.x, pc.frameCounter);
+
+    float32_t3 hitValues = float32_t3(0, 0, 0);
+    for (uint32_t sample_i = 0; sample_i < s_sampleCount; sample_i++)
+    {
+        const float32_t r1 = rnd(seed);
+        const float32_t r2 = rnd(seed);
+        const float32_t2 subpixelJitter = pc.frameCounter == 0 ? float32_t2(0.5f, 0.5f) : float32_t2(r1, r2);
+
+        const float32_t2 pixelCenter = float32_t2(coords) + subpixelJitter;
+        const float32_t2 inUV = pixelCenter / float32_t2(launchSize.xy);
+
+        const float32_t2 d = inUV * 2.0 - 1.0;
+        const float32_t4 tmp = mul(pc.invMVP, float32_t4(d.x, d.y, 1, 1));
+        const float32_t3 targetPos = tmp.xyz / tmp.w;
+
+        float32_t3 direction = normalize(targetPos - pc.camPos);
+
+        RayDesc rayDesc;
+        rayDesc.Origin = pc.camPos;
+        rayDesc.Direction = direction;
+        rayDesc.TMin = 0.01;
+        rayDesc.TMax = 1000.0;
+        
+        ColorPayload payload;
+        payload.seed = seed;
+        payload.hitValue = float32_t3(0, 0, 0);
+        TraceRay(topLevelAS, RAY_FLAG_NONE, 0xff, 0, 0, 0, rayDesc, payload);
+
+        hitValues += payload.hitValue;
+    }
+
+    float32_t3 hitValue = hitValues / s_sampleCount;
+
+    if (pc.frameCounter > 0)
+    {
+        float32_t a = 1.0f / float32_t(pc.frameCounter + 1);
+        float32_t3 oldColor = colorImage[coords].xyz;
+        colorImage[coords] = float32_t4(lerp(oldColor, hitValue, a), 1.0f);
+    }
+    else
+    {
+        colorImage[coords] = float32_t4(hitValue, 1.0f);
+    }
+}
diff --git a/71_RayTracingPipeline/app_resources/raytrace.rmiss.hlsl b/71_RayTracingPipeline/app_resources/raytrace.rmiss.hlsl
new file mode 100644
index 000000000..70db3b0e4
--- /dev/null
+++ b/71_RayTracingPipeline/app_resources/raytrace.rmiss.hlsl
@@ -0,0 +1,8 @@
+#include "common.hlsl"
+
+[shader("miss")]
+void main(inout ColorPayload p)
+{
+    p.hitValue = float32_t3(0.3, 0.3, 0.6);
+
+}
diff --git a/71_RayTracingPipeline/app_resources/raytraceShadow.rmiss.hlsl b/71_RayTracingPipeline/app_resources/raytraceShadow.rmiss.hlsl
new file mode 100644
index 000000000..295e721f2
--- /dev/null
+++ b/71_RayTracingPipeline/app_resources/raytraceShadow.rmiss.hlsl
@@ -0,0 +1,7 @@
+#include "common.hlsl"
+
+[shader("miss")]
+void main(inout ShadowPayload p)
+{
+	p.isShadowed = false;
+}
diff --git a/71_RayTracingPipeline/include/common.hpp b/71_RayTracingPipeline/include/common.hpp
new file mode 100644
index 000000000..e50cb4473
--- /dev/null
+++ b/71_RayTracingPipeline/include/common.hpp
@@ -0,0 +1,93 @@
+#ifndef __NBL_THIS_EXAMPLE_COMMON_H_INCLUDED__
+#define __NBL_THIS_EXAMPLE_COMMON_H_INCLUDED__
+
+#include <nabla.h>
+#include "nbl/asset/utils/CGeometryCreator.h"
+#include "nbl/application_templates/MonoAssetManagerAndBuiltinResourceApplication.hpp"
+
+#include "SimpleWindowedApplication.hpp"
+
+#include "InputSystem.hpp"
+#include "CEventCallback.hpp"
+
+#include "CCamera.hpp"
+
+#include <nbl/builtin/hlsl/cpp_compat.hlsl>
+#include <nbl/builtin/hlsl/cpp_compat/matrix.hlsl>
+#include <nbl/asset/IRayTracingPipeline.h>
+
+using namespace nbl;
+using namespace core;
+using namespace hlsl;
+using namespace system;
+using namespace asset;
+using namespace ui;
+using namespace video;
+using namespace scene;
+
+#include "app_resources/common.hlsl"
+
+namespace nbl::scene
+{
+
+enum ObjectType : uint8_t
+{
+	OT_CUBE,
+	OT_SPHERE,
+	OT_CYLINDER,
+	OT_RECTANGLE,
+	OT_DISK,
+	OT_ARROW,
+	OT_CONE,
+	OT_ICOSPHERE,
+
+	OT_COUNT,
+	OT_UNKNOWN = std::numeric_limits<uint8_t>::max()
+};
+
+static constexpr uint32_t s_smoothNormals[OT_COUNT] = { 0, 1, 1, 0, 0, 1, 1, 1 };
+
+struct ObjectMeta
+{
+	ObjectType type = OT_UNKNOWN;
+	std::string_view name = "Unknown";
+};
+
+struct ObjectDrawHookCpu
+{
+	nbl::core::matrix3x4SIMD model;
+	nbl::asset::SBasicViewParameters viewParameters;
+	ObjectMeta meta;
+};
+
+struct ReferenceObjectCpu
+{
+	ObjectMeta meta;
+	nbl::asset::CGeometryCreator::return_type data;
+	Material material;
+  core::matrix3x4SIMD transform;
+};
+
+struct ReferenceObjectGpu
+{
+	struct Bindings
+	{
+		nbl::asset::SBufferBinding<IGPUBuffer> vertex, index;
+	};
+
+	ObjectMeta meta;
+	Bindings bindings;
+	uint32_t vertexStride;
+	nbl::asset::E_INDEX_TYPE indexType = nbl::asset::E_INDEX_TYPE::EIT_UNKNOWN;
+	uint32_t indexCount = {};
+	Material material;
+  core::matrix3x4SIMD transform;
+
+	const bool useIndex() const
+	{
+		return bindings.index.buffer && (indexType != E_INDEX_TYPE::EIT_UNKNOWN);
+	}
+};
+}
+
+#endif // __NBL_THIS_EXAMPLE_COMMON_H_INCLUDED__
diff --git a/71_RayTracingPipeline/main.cpp b/71_RayTracingPipeline/main.cpp
new file mode 100644
index 000000000..54a692317
--- /dev/null
+++ b/71_RayTracingPipeline/main.cpp
@@ -0,0 +1,1289 @@
+// Copyright (C) 2018-2024 - DevSH Graphics Programming Sp. z O.O.
+// This file is part of the "Nabla Engine".
+// For conditions of distribution and use, see copyright notice in nabla.h
+
+#include "common.hpp"
+
+class RaytracingPipelineApp final : public examples::SimpleWindowedApplication, public application_templates::MonoAssetManagerAndBuiltinResourceApplication
+{
+  using device_base_t = examples::SimpleWindowedApplication;
+  using asset_base_t = application_templates::MonoAssetManagerAndBuiltinResourceApplication;
+  using clock_t = std::chrono::steady_clock;
+
+  constexpr static inline uint32_t WIN_W = 1280, WIN_H = 720;
+  constexpr static inline uint32_t MaxFramesInFlight = 3u;
+
+  constexpr static inline clock_t::duration DisplayImageDuration = std::chrono::milliseconds(900);
+
+  struct ShaderBindingTable
+  {
+    SStridedBufferRegion<IGPUBuffer> raygenGroupRegion;
+    SStridedBufferRegion<IGPUBuffer> hitGroupsRegion;
+    SStridedBufferRegion<IGPUBuffer> missGroupsRegion;
+    SStridedBufferRegion<IGPUBuffer> callableGroupsRegion;
+  };
+
+  struct CameraView
+  {
+    float32_t3 position;
+    float32_t3 target;
+    float32_t3 upVector;
+  };
+
+public:
+  inline RaytracingPipelineApp(const path& _localInputCWD, const path& _localOutputCWD, const path& _sharedInputCWD, const path& _sharedOutputCWD)
+    : IApplicationFramework(_localInputCWD, _localOutputCWD, _sharedInputCWD, _sharedOutputCWD) {
+  }
+
+  inline SPhysicalDeviceFeatures getRequiredDeviceFeatures() const override
+  {
+    auto retval = device_base_t::getRequiredDeviceFeatures();
+    retval.rayTracingPipeline = true;
+    retval.accelerationStructure = true;
+    retval.rayQuery = true;
+    return retval;
+  }
+
+  inline SPhysicalDeviceFeatures getPreferredDeviceFeatures() const override
+  {
+    auto retval = device_base_t::getPreferredDeviceFeatures();
+    retval.accelerationStructureHostCommands = true;
+    return retval;
+  }
+
+  inline core::vector<video::SPhysicalDeviceFilter::SurfaceCompatibility> getSurfaces() const override
+  {
+    if (!m_surface)
+    {
+      {
+        auto windowCallback = core::make_smart_refctd_ptr<CEventCallback>(smart_refctd_ptr(m_inputSystem), smart_refctd_ptr(m_logger));
+        IWindow::SCreationParams params = {};
+        params.callback = core::make_smart_refctd_ptr<ISimpleManagedSurface::ICallback>();
+        params.width = WIN_W;
+        params.height = WIN_H;
+        params.x = 32;
+        params.y = 32;
+        params.flags = ui::IWindow::ECF_HIDDEN | IWindow::ECF_BORDERLESS | IWindow::ECF_RESIZABLE;
+        params.windowCaption = "RaytracingPipelineApp";
+        params.callback = windowCallback;
+        const_cast<std::remove_const_t<decltype(m_window)>&>(m_window) = m_winMgr->createWindow(std::move(params));
+      }
+
+      auto surface = CSurfaceVulkanWin32::create(smart_refctd_ptr(m_api), smart_refctd_ptr_static_cast<IWindowWin32>(m_window));
+      const_cast<std::remove_const_t<decltype(m_surface)>&>(m_surface) = CSimpleResizeSurface<ISimpleManagedSurface::ISwapchainResources>::create(std::move(surface));
+    }
+
+    if (m_surface)
+      return { {m_surface->getSurface()/*,EQF_NONE*/} };
+
+    return {};
+  }
+
+  // so that we can use the same queue for asset converter and rendering
+  inline core::vector<queue_req_t> getQueueRequirements() const override
+  {
+    auto reqs = device_base_t::getQueueRequirements();
+    reqs.front().requiredFlags |= IQueue::FAMILY_FLAGS::TRANSFER_BIT;
+    return reqs;
+  }
+
+  inline bool onAppInitialized(smart_refctd_ptr<ISystem>&& system) override
+  {
+    m_inputSystem = make_smart_refctd_ptr<InputSystem>(logger_opt_smart_ptr(smart_refctd_ptr(m_logger)));
+
+    if (!device_base_t::onAppInitialized(smart_refctd_ptr(system)))
+      return false;
+
+    if (!asset_base_t::onAppInitialized(smart_refctd_ptr(system)))
+      return false;
+
+    const auto compileShader = [&]<typename... Args>(const std::string& filePath, const std::string& header = "") -> smart_refctd_ptr<IGPUShader>
+      {
+        IAssetLoader::SAssetLoadParams lparams = {};
+        lparams.logger = m_logger.get();
+        lparams.workingDirectory = "";
+        auto bundle = m_assetMgr->getAsset(filePath, lparams);
+        if (bundle.getContents().empty() || bundle.getAssetType() != IAsset::ET_SHADER)
+        {
+          m_logger->log("Shader %s not found!", ILogger::ELL_ERROR, filePath);
+          exit(-1);
+        }
+
+        const auto assets = bundle.getContents();
+        assert(assets.size() == 1);
+        smart_refctd_ptr<ICPUShader> sourceRaw = IAsset::castDown<ICPUShader>(assets[0]);
+        if (!sourceRaw)
+          m_logger->log("Fail to load shader source", ILogger::ELL_ERROR, filePath);
+        smart_refctd_ptr<ICPUShader> source = CHLSLCompiler::createOverridenCopy(
+          sourceRaw.get(),
+          "%s\n",
+          header.c_str()
+        );
+
+        return m_device->createShader(source.get());
+      };
+
+    // shader
+    const auto raygenShader = compileShader("app_resources/raytrace.rgen.hlsl");
+    const auto closestHitShader = compileShader("app_resources/raytrace.rchit.hlsl");
+    const auto anyHitShaderColorPayload = compileShader("app_resources/raytrace.rahit.hlsl", "#define USE_COLOR_PAYLOAD\n");
+    const auto anyHitShaderShadowPayload = compileShader("app_resources/raytrace.rahit.hlsl", "#define USE_SHADOW_PAYLOAD\n");
+    const auto missShader = compileShader("app_resources/raytrace.rmiss.hlsl");
+    const auto shadowMissShader = compileShader("app_resources/raytraceShadow.rmiss.hlsl");
+
+    m_semaphore = m_device->createSemaphore(m_realFrameIx);
+    if (!m_semaphore)
+      return logFail("Failed to Create a Semaphore!");
+
+    ISwapchain::SCreationParams swapchainParams = { .surface = core::smart_refctd_ptr<nbl::video::ISurface>(m_surface->getSurface()) };
+    if (!swapchainParams.deduceFormat(m_physicalDevice))
+      return logFail("Could not choose a Surface Format for the Swapchain!");
+
+    auto gQueue = getGraphicsQueue();
+    if (!m_surface || !m_surface->init(gQueue, std::make_unique<ISimpleManagedSurface::ISwapchainResources>(), swapchainParams.sharedParams))
+      return logFail("Could not create Window & Surface or initialize the Surface!");
+
+    auto pool = m_device->createCommandPool(gQueue->getFamilyIndex(), IGPUCommandPool::CREATE_FLAGS::RESET_COMMAND_BUFFER_BIT);
+
+    m_converter = CAssetConverter::create({ .device = m_device.get(), .optimizer = {} });
+
+    for (auto i = 0u; i < MaxFramesInFlight; i++)
+    {
+      if (!pool)
+        return logFail("Couldn't create Command Pool!");
+      if (!pool->createCommandBuffers(IGPUCommandPool::BUFFER_LEVEL::PRIMARY, { m_cmdBufs.data() + i, 1 }))
+        return logFail("Couldn't create Command Buffer!");
+    }
+
+    m_winMgr->setWindowSize(m_window.get(), WIN_W, WIN_H);
+    m_surface->recreateSwapchain();
+
+    // create output images
+    m_hdrImage = m_device->createImage({
+        {
+          .type = IGPUImage::ET_2D,
+          .samples = asset::ICPUImage::ESCF_1_BIT,
+          .format = asset::EF_R16G16B16A16_SFLOAT,
+          .extent = {WIN_W, WIN_H, 1},
+          .mipLevels = 1,
+          .arrayLayers = 1,
+          .flags = IImage::ECF_NONE,
+          .usage = core::bitflag(asset::IImage::EUF_STORAGE_BIT) | asset::IImage::EUF_TRANSFER_SRC_BIT
+        }
+      });
+
+    if (!m_hdrImage || !m_device->allocate(m_hdrImage->getMemoryReqs(), m_hdrImage.get()).isValid())
+      return logFail("Could not create HDR Image");
+
+    auto assetManager = make_smart_refctd_ptr<nbl::asset::IAssetManager>(smart_refctd_ptr(system));
+    auto* geometryCreator = assetManager->getGeometryCreator();
+
+    auto cQueue = getComputeQueue();
+
+    // create geometry objects
+    if (!createGeometries(gQueue, geometryCreator))
+      return logFail("Could not create geometries from geometry creator");
+
+    if (!createAccelerationStructures(cQueue))
+      return logFail("Could not create acceleration structures");
+
+
+    // create pipelines
+    {
+      // descriptors
+      const IGPUDescriptorSetLayout::SBinding bindings[] = {
+        {
+          .binding = 0,
+          .type = asset::IDescriptor::E_TYPE::ET_ACCELERATION_STRUCTURE,
+          .createFlags = IGPUDescriptorSetLayout::SBinding::E_CREATE_FLAGS::ECF_NONE,
+          .stageFlags = asset::IShader::E_SHADER_STAGE::ESS_ALL_RAY_TRACING,
+          .count = 1,
+        },
+        {
+          .binding = 1,
+          .type = asset::IDescriptor::E_TYPE::ET_STORAGE_IMAGE,
+          .createFlags = IGPUDescriptorSetLayout::SBinding::E_CREATE_FLAGS::ECF_NONE,
+          .stageFlags = asset::IShader::E_SHADER_STAGE::ESS_ALL_RAY_TRACING,
+          .count = 1,
+        }
+      };
+      const auto descriptorSetLayout = m_device->createDescriptorSetLayout(bindings);
+
+      const std::array<IGPUDescriptorSetLayout*, ICPUPipelineLayout::DESCRIPTOR_SET_COUNT> dsLayoutPtrs = { descriptorSetLayout.get() };
+      m_renderPool = m_device->createDescriptorPoolForDSLayouts(IDescriptorPool::ECF_UPDATE_AFTER_BIND_BIT, std::span(dsLayoutPtrs.begin(), dsLayoutPtrs.end()));
+      if (!m_renderPool)
+        return logFail("Could not create descriptor pool");
+      m_renderDs = m_renderPool->createDescriptorSet(descriptorSetLayout);
+      if (!m_renderDs)
+        return logFail("Could not create descriptor set");
+
+      const SPushConstantRange pcRange = {
+        .stageFlags = IShader::E_SHADER_STAGE::ESS_ALL_RAY_TRACING,
+        .offset = 0u,
+        .size = sizeof(SPushConstants),
+      };
+      const auto pipelineLayout = m_device->createPipelineLayout({ &pcRange, 1 }, smart_refctd_ptr(descriptorSetLayout), nullptr, nullptr, nullptr);
+
+      IGPURayTracingPipeline::SCreationParams params = {};
+
+
+      const IGPUShader::SSpecInfo shaders[] = {
+          {.shader = raygenShader.get()},
+          {.shader = closestHitShader.get()},
+          {.shader = anyHitShaderColorPayload.get()},
+          {.shader = anyHitShaderShadowPayload.get()},
+          {.shader = missShader.get()},
+          {.shader = shadowMissShader.get()},
+      };
+
+      params.layout = pipelineLayout.get();
+      params.shaders = std::span(shaders, std::size(shaders));
+      params.cached.shaderGroups.raygenGroup = {
+        .shaderIndex = 0,
+      };
+      params.cached.shaderGroups.hitGroups.push_back({ .closestHitShaderIndex = 1, .anyHitShaderIndex = 2 });
+      params.cached.shaderGroups.hitGroups.push_back({ .closestHitShaderIndex = 1, .anyHitShaderIndex = 3 });
+      params.cached.shaderGroups.missGroups.push_back({ .shaderIndex = 4 });
+      params.cached.shaderGroups.missGroups.push_back({ .shaderIndex = 5 });
+      params.cached.maxRecursionDepth = 2;
+      if (!m_device->createRayTracingPipelines(nullptr, { &params, 1 }, &m_rayTracingPipeline))
+        return logFail("Failed to create ray tracing pipeline");
+      m_logger->log("Ray Tracing Pipeline Created!",system::ILogger::ELL_INFO);
+
+      //create shader binding table
+      if (!createShaderBindingTable(gQueue, m_rayTracingPipeline))
+        return logFail("Could not create shader binding table");
+    }
+
+
+    // write descriptors
+    IGPUDescriptorSet::SDescriptorInfo infos[2];
+    infos[0].desc = m_gpuTlas;
+    infos[1].desc = m_device->createImageView({
+        .flags = IGPUImageView::ECF_NONE,
+        .subUsages = IGPUImage::E_USAGE_FLAGS::EUF_STORAGE_BIT,
+        .image = m_hdrImage,
+        .viewType = IGPUImageView::E_TYPE::ET_2D,
+        .format = asset::EF_R16G16B16A16_SFLOAT
+      });
+    if (!infos[1].desc)
+      return logFail("Failed to create image view");
+    infos[1].info.image.imageLayout = IImage::LAYOUT::GENERAL;
+    IGPUDescriptorSet::SWriteDescriptorSet writes[3] = {
+        {.dstSet = m_renderDs.get(), .binding = 0, .arrayElement = 0, .count = 1, .info = &infos[0]},
+        {.dstSet = m_renderDs.get(), .binding = 1, .arrayElement = 0, .count = 1, .info = &infos[1]}
+    };
+    m_device->updateDescriptorSets(std::span(writes, 2), {});
+
+    // camera
+    {
+      core::vectorSIMDf cameraPosition(-5.81655884, 2.58630896, -4.23974705);
+      core::vectorSIMDf cameraTarget(-0.349590302, -0.213266611, 0.317821503);
+      matrix4SIMD projectionMatrix = matrix4SIMD::buildProjectionMatrixPerspectiveFovLH(core::radians(60.0f), float(WIN_W) / WIN_H, 0.1, 1000);
+      m_camera = Camera(cameraPosition, cameraTarget, projectionMatrix, 1.069f, 0.4f);
+    }
+
+    m_winMgr->show(m_window.get());
+    m_oracle.reportBeginFrameRecord();
+
+    return true;
+  }
+
+  inline void workLoopBody() override
+  {
+    const auto resourceIx = m_realFrameIx % MaxFramesInFlight;
+
+    const uint32_t framesInFlight = core::min(MaxFramesInFlight, m_surface->getMaxAcquiresInFlight());
+
+    if (m_realFrameIx >= framesInFlight)
+    {
+      const ISemaphore::SWaitInfo cbDonePending[] =
+      {
+          {
+            .semaphore = m_semaphore.get(),
+            .value = m_realFrameIx + 1 - framesInFlight
+          }
+      };
+      if (m_device->blockForSemaphores(cbDonePending) != ISemaphore::WAIT_RESULT::SUCCESS)
+        return;
+    }
+
+    m_inputSystem->getDefaultMouse(&m_mouse);
+    m_inputSystem->getDefaultKeyboard(&m_keyboard);
+
+    auto updatePresentationTimestamp = [&]()
+      {
+        m_currentImageAcquire = m_surface->acquireNextImage();
+
+        m_oracle.reportEndFrameRecord();
+        const auto timestamp = m_oracle.getNextPresentationTimeStamp();
+        m_oracle.reportBeginFrameRecord();
+
+        return timestamp;
+      };
+
+    const auto nextPresentationTimestamp = updatePresentationTimestamp();
+
+    if (!m_currentImageAcquire)
+      return;
+
+    static bool first = true;
+    if (first)
+    {
+      m_api->startCapture();
+      first = false;
+    }
+
+    auto* const cmdbuf = m_cmdBufs.data()[resourceIx].get();
+    cmdbuf->reset(IGPUCommandBuffer::RESET_FLAGS::RELEASE_RESOURCES_BIT);
+    cmdbuf->begin(IGPUCommandBuffer::USAGE::ONE_TIME_SUBMIT_BIT);
+    cmdbuf->beginDebugMarker("RaytracingPipelineApp Frame");
+    {
+      m_camera.beginInputProcessing(nextPresentationTimestamp);
+      m_mouse.consumeEvents([&](const IMouseEventChannel::range_t& events) -> void
+      {
+        if (m_camera.mouseProcess(events)) 
+        {
+          m_frameAccumulationCounter = 0;
+        }
+      }, m_logger.get());
+      m_keyboard.consumeEvents([&](const IKeyboardEventChannel::range_t& events) -> void
+      {
+        if (m_camera.keyboardProcess(events))
+        {
+          m_frameAccumulationCounter = 0;
+        }
+      }, m_logger.get());
+      m_camera.endInputProcessing(nextPresentationTimestamp);
+
+    }
+
+    const auto viewMatrix = m_camera.getViewMatrix();
+    const auto projectionMatrix = m_camera.getProjectionMatrix();
+    const auto viewProjectionMatrix = m_camera.getConcatenatedMatrix();
+
+    core::matrix3x4SIMD modelMatrix;
+    modelMatrix.setTranslation(nbl::core::vectorSIMDf(0, 0, 0, 0));
+    modelMatrix.setRotation(quaternion(0, 0, 0));
+
+    core::matrix4SIMD modelViewProjectionMatrix = core::concatenateBFollowedByA(viewProjectionMatrix, modelMatrix);
+    core::matrix4SIMD invModelViewProjectionMatrix;
+    modelViewProjectionMatrix.getInverseTransform(invModelViewProjectionMatrix);
+
+    auto* queue = getGraphicsQueue();
+
+    {
+      IGPUCommandBuffer::SPipelineBarrierDependencyInfo::image_barrier_t imageBarriers[1];
+      imageBarriers[0].barrier = {
+         .dep = {
+           .srcStageMask = PIPELINE_STAGE_FLAGS::NONE,
+           .srcAccessMask = ACCESS_FLAGS::NONE,
+           .dstStageMask = PIPELINE_STAGE_FLAGS::RAY_TRACING_SHADER_BIT,
+           .dstAccessMask = ACCESS_FLAGS::SHADER_WRITE_BITS
+        }
+      };
+      imageBarriers[0].image = m_hdrImage.get();
+      imageBarriers[0].subresourceRange = {
+        .aspectMask = IImage::EAF_COLOR_BIT,
+        .baseMipLevel = 0u,
+        .levelCount = 1u,
+        .baseArrayLayer = 0u,
+        .layerCount = 1u
+      };
+      imageBarriers[0].oldLayout = IImage::LAYOUT::UNDEFINED;
+      imageBarriers[0].newLayout = IImage::LAYOUT::GENERAL;
+      cmdbuf->pipelineBarrier(E_DEPENDENCY_FLAGS::EDF_NONE, { .imgBarriers = imageBarriers });
+    }
+
+    // do ray query
+    SPushConstants pc;
+    pc.geometryInfoBuffer = m_geometryInfoBuffer->getDeviceAddress();
+    pc.frameCounter = m_frameAccumulationCounter;
+    const core::vector3df camPos = m_camera.getPosition().getAsVector3df();
+    pc.camPos = { camPos.X, camPos.Y, camPos.Z };
+    memcpy(&pc.invMVP, invModelViewProjectionMatrix.pointer(), sizeof(pc.invMVP));
+
+    cmdbuf->bindRayTracingPipeline(m_rayTracingPipeline.get());
+    cmdbuf->pushConstants(m_rayTracingPipeline->getLayout(), IShader::E_SHADER_STAGE::ESS_ALL_RAY_TRACING, 0, sizeof(SPushConstants), &pc);
+    cmdbuf->bindDescriptorSets(EPBP_RAY_TRACING, m_rayTracingPipeline->getLayout(), 0, 1, &m_renderDs.get());
+    cmdbuf->traceRays(m_shaderBindingTable.raygenGroupRegion, 
+      m_shaderBindingTable.missGroupsRegion,
+      m_shaderBindingTable.hitGroupsRegion,
+      m_shaderBindingTable.callableGroupsRegion,
+      WIN_W, WIN_H, 1);
+
+    // blit
+    {
+      IGPUCommandBuffer::SPipelineBarrierDependencyInfo::image_barrier_t imageBarriers[2];
+      imageBarriers[0].barrier = {
+         .dep = {
+           .srcStageMask = PIPELINE_STAGE_FLAGS::RAY_TRACING_SHADER_BIT,
+           .srcAccessMask = ACCESS_FLAGS::SHADER_WRITE_BITS,
+           .dstStageMask = PIPELINE_STAGE_FLAGS::BLIT_BIT,
+           .dstAccessMask = ACCESS_FLAGS::TRANSFER_WRITE_BIT
+        }
+      };
+      imageBarriers[0].image = m_hdrImage.get();
+      imageBarriers[0].subresourceRange = {
+        .aspectMask = IImage::EAF_COLOR_BIT,
+        .baseMipLevel = 0u,
+        .levelCount = 1u,
+        .baseArrayLayer = 0u,
+        .layerCount = 1u
+      };
+      imageBarriers[0].oldLayout = IImage::LAYOUT::UNDEFINED;
+      imageBarriers[0].newLayout = IImage::LAYOUT::TRANSFER_SRC_OPTIMAL;
+
+      imageBarriers[1].barrier = {
+         .dep = {
+           .srcStageMask = PIPELINE_STAGE_FLAGS::NONE,
+           .srcAccessMask = ACCESS_FLAGS::NONE,
+           .dstStageMask = PIPELINE_STAGE_FLAGS::BLIT_BIT,
+           .dstAccessMask = ACCESS_FLAGS::TRANSFER_WRITE_BIT
+        }
+      };
+      imageBarriers[1].image = m_surface->getSwapchainResources()->getImage(m_currentImageAcquire.imageIndex);
+      imageBarriers[1].subresourceRange = {
+        .aspectMask = IImage::EAF_COLOR_BIT,
+        .baseMipLevel = 0u,
+        .levelCount = 1u,
+        .baseArrayLayer = 0u,
+        .layerCount = 1u
+      };
+      imageBarriers[1].oldLayout = IImage::LAYOUT::UNDEFINED;
+      imageBarriers[1].newLayout = IImage::LAYOUT::TRANSFER_DST_OPTIMAL;
+
+      cmdbuf->pipelineBarrier(E_DEPENDENCY_FLAGS::EDF_NONE, { .imgBarriers = imageBarriers });
+    }
+
+    {
+      IGPUCommandBuffer::SImageBlit regions[] = { {
+        .srcMinCoord = {0,0,0},
+        .srcMaxCoord = {WIN_W,WIN_H,1},
+        .dstMinCoord = {0,0,0},
+        .dstMaxCoord = {WIN_W,WIN_H,1},
+        .layerCount = 1,
+        .srcBaseLayer = 0,
+        .dstBaseLayer = 0,
+        .srcMipLevel = 0,
+        .dstMipLevel = 0,
+        .aspectMask = IGPUImage::E_ASPECT_FLAGS::EAF_COLOR_BIT
+      } };
+
+      auto srcImg = m_hdrImage.get();
+      auto scRes = static_cast<CDefaultSwapchainFramebuffers*>(m_surface->getSwapchainResources());
+      auto dstImg = scRes->getImage(m_currentImageAcquire.imageIndex);
+
+      cmdbuf->blitImage(srcImg, IImage::LAYOUT::TRANSFER_SRC_OPTIMAL, dstImg, IImage::LAYOUT::TRANSFER_DST_OPTIMAL, regions, ISampler::ETF_NEAREST);
+    }
+
+    // TODO: transition to present
+    {
+      IGPUCommandBuffer::SPipelineBarrierDependencyInfo::image_barrier_t imageBarriers[1];
+      imageBarriers[0].barrier = {
+         .dep = {
+           .srcStageMask = PIPELINE_STAGE_FLAGS::BLIT_BIT,
+           .srcAccessMask = ACCESS_FLAGS::TRANSFER_WRITE_BIT,
+           .dstStageMask = PIPELINE_STAGE_FLAGS::NONE,
+           .dstAccessMask = ACCESS_FLAGS::NONE
+        }
+      };
+      imageBarriers[0].image = m_surface->getSwapchainResources()->getImage(m_currentImageAcquire.imageIndex);
+      imageBarriers[0].subresourceRange = {
+        .aspectMask = IImage::EAF_COLOR_BIT,
+        .baseMipLevel = 0u,
+        .levelCount = 1u,
+        .baseArrayLayer = 0u,
+        .layerCount = 1u
+      };
+      imageBarriers[0].oldLayout = IImage::LAYOUT::TRANSFER_DST_OPTIMAL;
+      imageBarriers[0].newLayout = IImage::LAYOUT::PRESENT_SRC;
+
+      cmdbuf->pipelineBarrier(E_DEPENDENCY_FLAGS::EDF_NONE, { .imgBarriers = imageBarriers });
+    }
+
+    cmdbuf->endDebugMarker();
+    cmdbuf->end();
+
+    {
+      const IQueue::SSubmitInfo::SSemaphoreInfo rendered[] =
+      {
+        {
+          .semaphore = m_semaphore.get(),
+          .value = ++m_realFrameIx,
+          .stageMask = PIPELINE_STAGE_FLAGS::ALL_TRANSFER_BITS
+        }
+      };
+      {
+        {
+          const IQueue::SSubmitInfo::SCommandBufferInfo commandBuffers[] =
+          {
+            {.cmdbuf = cmdbuf }
+          };
+
+          const IQueue::SSubmitInfo::SSemaphoreInfo acquired[] =
+          {
+            {
+              .semaphore = m_currentImageAcquire.semaphore,
+              .value = m_currentImageAcquire.acquireCount,
+              .stageMask = PIPELINE_STAGE_FLAGS::NONE
+            }
+          };
+          const IQueue::SSubmitInfo infos[] =
+          {
+            {
+              .waitSemaphores = acquired,
+              .commandBuffers = commandBuffers,
+              .signalSemaphores = rendered
+            }
+          };
+
+          if (queue->submit(infos) == IQueue::RESULT::SUCCESS)
+          {
+            const nbl::video::ISemaphore::SWaitInfo waitInfos[] =
+            { {
+              .semaphore = m_semaphore.get(),
+              .value = m_realFrameIx
+            } };
+
+            m_device->blockForSemaphores(waitInfos); // this is not solution, quick wa to not throw validation errors
+          }
+          else
+            --m_realFrameIx;
+        }
+      }
+
+      std::string caption = "[Nabla Engine] Ray Tracing Pipeline";
+      {
+        caption += ", displaying [all objects]";
+        m_window->setCaption(caption);
+      }
+      m_surface->present(m_currentImageAcquire.imageIndex, rendered);
+    }
+
+    m_frameAccumulationCounter++;
+  }
+
+  inline bool keepRunning() override
+  {
+    if (m_surface->irrecoverable())
+      return false;
+
+    return true;
+  }
+
+  inline bool onAppTerminated() override
+  {
+    return device_base_t::onAppTerminated();
+  }
+
+private:
+  uint32_t getWorkgroupCount(uint32_t dim, uint32_t size)
+  {
+    return (dim + size - 1) / size;
+  }
+
+  smart_refctd_ptr<IGPUBuffer> createBuffer(IGPUBuffer::SCreationParams& params)
+  {
+    smart_refctd_ptr<IGPUBuffer> buffer;
+    buffer = m_device->createBuffer(std::move(params));
+    auto bufReqs = buffer->getMemoryReqs();
+    bufReqs.memoryTypeBits &= m_physicalDevice->getDeviceLocalMemoryTypeBits();
+    m_device->allocate(bufReqs, buffer.get(), IDeviceMemoryAllocation::EMAF_DEVICE_ADDRESS_BIT);
+
+    return buffer;
+  }
+
+  smart_refctd_ptr<IGPUCommandBuffer> getSingleUseCommandBufferAndBegin(smart_refctd_ptr<IGPUCommandPool> pool)
+  {
+    smart_refctd_ptr<IGPUCommandBuffer> cmdbuf;
+    if (!pool->createCommandBuffers(IGPUCommandPool::BUFFER_LEVEL::PRIMARY, 1u, &cmdbuf))
+      return nullptr;
+
+    cmdbuf->reset(IGPUCommandBuffer::RESET_FLAGS::RELEASE_RESOURCES_BIT);
+    cmdbuf->begin(IGPUCommandBuffer::USAGE::ONE_TIME_SUBMIT_BIT);
+
+    return cmdbuf;
+  }
+
+  void cmdbufSubmitAndWait(smart_refctd_ptr<IGPUCommandBuffer> cmdbuf, CThreadSafeQueueAdapter* queue, uint64_t startValue)
+  {
+    cmdbuf->end();
+
+    uint64_t finishedValue = startValue + 1;
+
+    // submit builds
+    {
+      auto completed = m_device->createSemaphore(startValue);
+
+      std::array<IQueue::SSubmitInfo::SSemaphoreInfo, 1u> signals;
+      {
+        auto& signal = signals.front();
+        signal.value = finishedValue;
+        signal.stageMask = bitflag(PIPELINE_STAGE_FLAGS::ALL_TRANSFER_BITS);
+        signal.semaphore = completed.get();
+      }
+
+      const IQueue::SSubmitInfo::SCommandBufferInfo commandBuffers[1] = { {
+        .cmdbuf = cmdbuf.get()
+      } };
+
+      const IQueue::SSubmitInfo infos[] =
+      {
+        {
+          .waitSemaphores = {},
+          .commandBuffers = commandBuffers,
+          .signalSemaphores = signals
+        }
+      };
+
+      if (queue->submit(infos) != IQueue::RESULT::SUCCESS)
+      {
+        m_logger->log("Failed to submit geometry transfer upload operations!", ILogger::ELL_ERROR);
+        return;
+      }
+
+      const ISemaphore::SWaitInfo info[] =
+      { {
+        .semaphore = completed.get(),
+        .value = finishedValue
+      } };
+
+      m_device->blockForSemaphores(info);
+    }
+  }
+
+  bool createGeometries(video::CThreadSafeQueueAdapter* queue, const IGeometryCreator* gc)
+  {
+    auto pool = m_device->createCommandPool(queue->getFamilyIndex(), IGPUCommandPool::CREATE_FLAGS::RESET_COMMAND_BUFFER_BIT);
+    if (!pool)
+      return logFail("Couldn't create Command Pool for geometry creation!");
+
+    const auto defaultMaterial = Material{
+      .ambient = {0.1, 0.1, 0.1},
+      .diffuse = {0.8, 0.3, 0.3},
+      .specular = {0.8, 0.8, 0.8},
+      .shininess = 1.0f,
+      .illum = 2
+    };
+
+    auto getTranslationMatrix = [](float32_t x, float32_t y, float32_t z)
+      {
+        core::matrix3x4SIMD transform;
+        transform.setTranslation(nbl::core::vectorSIMDf(x, y, z, 0));
+        return transform;
+      };
+    
+    core::matrix3x4SIMD planeTransform;
+    planeTransform.setRotation(quaternion::fromAngleAxis(core::radians(-90.0f), vector3df_SIMD{1, 0, 0}));
+
+    const auto cpuObjects = std::array{
+      ReferenceObjectCpu {
+        .meta = {.type = OT_RECTANGLE, .name = "Plane Mesh"},
+        .data = gc->createRectangleMesh(nbl::core::vector2df_SIMD(10, 10)),
+        .material = defaultMaterial,
+        .transform = planeTransform,
+      },
+      ReferenceObjectCpu {
+        .meta = {.type = OT_CUBE, .name = "Cube Mesh"},
+        .data = gc->createCubeMesh(nbl::core::vector3df(1, 1, 1)),
+        .material = defaultMaterial,
+        .transform = getTranslationMatrix(0, 0.5f, 0),
+      },
+      ReferenceObjectCpu {
+        .meta = {.type = OT_SPHERE, .name = "Sphere Mesh"},
+        .data = gc->createSphereMesh(2, 16, 16),
+        .material = {
+          .ambient = {0.1, 0.1, 0.1},
+          .diffuse = {0.2, 0.2, 0.8},
+          .specular = {0.8, 0.8, 0.8},
+          .shininess = 1.0f,
+          .illum = 2
+        },
+        .transform = getTranslationMatrix(-5.0f, 1.0f, 0),
+      },
+      ReferenceObjectCpu {
+        .meta = {.type = OT_SPHERE, .name = "Transparent Sphere Mesh"},
+        .data = gc->createSphereMesh(2, 16, 16),
+        .material = {
+          .ambient = {0.1, 0.1, 0.1},
+          .diffuse = {0.2, 0.8, 0.2},
+          .specular = {0.8, 0.8, 0.8},
+          .shininess = 1.0f,
+          .dissolve = 0.2,
+          .illum = 4
+        },
+        .transform = getTranslationMatrix(5.0f, 1.0f, 0),
+      },
+    };
+
+    struct ScratchVIBindings
+    {
+      nbl::asset::SBufferBinding<ICPUBuffer> vertex, index;
+    };
+    std::array<ScratchVIBindings, std::size(cpuObjects)> scratchBuffers;
+
+    for (uint32_t i = 0; i < cpuObjects.size(); i++)
+    {
+      const auto& cpuObject = cpuObjects[i];
+
+      auto vBuffer = smart_refctd_ptr(cpuObject.data.bindings[0].buffer); // no offset
+      auto vUsage = bitflag(IGPUBuffer::EUF_STORAGE_BUFFER_BIT) | IGPUBuffer::EUF_TRANSFER_DST_BIT | IGPUBuffer::EUF_INLINE_UPDATE_VIA_CMDBUF |
+        IGPUBuffer::EUF_ACCELERATION_STRUCTURE_BUILD_INPUT_READ_ONLY_BIT | IGPUBuffer::EUF_SHADER_DEVICE_ADDRESS_BIT;
+      vBuffer->addUsageFlags(vUsage);
+      vBuffer->setContentHash(vBuffer->computeContentHash());
+
+      auto iBuffer = smart_refctd_ptr(cpuObject.data.indexBuffer.buffer); // no offset
+      auto iUsage = bitflag(IGPUBuffer::EUF_STORAGE_BUFFER_BIT) | IGPUBuffer::EUF_TRANSFER_DST_BIT | IGPUBuffer::EUF_INLINE_UPDATE_VIA_CMDBUF |
+        IGPUBuffer::EUF_ACCELERATION_STRUCTURE_BUILD_INPUT_READ_ONLY_BIT | IGPUBuffer::EUF_SHADER_DEVICE_ADDRESS_BIT;
+
+      if (cpuObject.data.indexType != EIT_UNKNOWN)
+        if (iBuffer)
+        {
+          iBuffer->addUsageFlags(iUsage);
+          iBuffer->setContentHash(iBuffer->computeContentHash());
+        }
+
+      scratchBuffers[i] = {
+        .vertex = {.offset = 0, .buffer = vBuffer},
+        .index = {.offset = 0, .buffer = iBuffer},
+      };
+
+    }
+
+    auto cmdbuf = getSingleUseCommandBufferAndBegin(pool);
+    cmdbuf->beginDebugMarker("Build geometry vertex and index buffers");
+
+    CAssetConverter::SInputs inputs = {};
+    inputs.logger = m_logger.get();
+    std::array<ICPUBuffer*, std::size(cpuObjects) * 2u> tmpBuffers;
+    {
+      for (uint32_t i = 0; i < cpuObjects.size(); i++)
+      {
+        tmpBuffers[2 * i + 0] = scratchBuffers[i].vertex.buffer.get();
+        tmpBuffers[2 * i + 1] = scratchBuffers[i].index.buffer.get();
+      }
+
+      std::get<CAssetConverter::SInputs::asset_span_t<ICPUBuffer>>(inputs.assets) = tmpBuffers;
+    }
+
+    auto reservation = m_converter->reserve(inputs);
+    {
+      auto prepass = [&]<typename asset_type_t>(const auto & references) -> bool
+      {
+        auto objects = reservation.getGPUObjects<asset_type_t>();
+        uint32_t counter = {};
+        for (auto& object : objects)
+        {
+          auto gpu = object.value;
+          auto* reference = references[counter];
+
+          if (reference)
+          {
+            if (!gpu)
+            {
+              m_logger->log("Failed to convert a CPU object to GPU!", ILogger::ELL_ERROR);
+              return false;
+            }
+          }
+          counter++;
+        }
+        return true;
+      };
+
+      prepass.template operator() < ICPUBuffer > (tmpBuffers);
+    }
+
+    auto geomInfoBuffer = ICPUBuffer::create({ std::size(cpuObjects) * sizeof(SGeomInfo) });
+    SGeomInfo* geomInfos = reinterpret_cast<SGeomInfo*>(geomInfoBuffer->getPointer());
+
+    m_gpuObjects.reserve(std::size(cpuObjects));
+    // convert
+    {
+      // not sure if need this (probably not, originally for transition img view)
+      auto semaphore = m_device->createSemaphore(0u);
+
+      std::array<IQueue::SSubmitInfo::SCommandBufferInfo, 1> cmdbufs = {};
+      cmdbufs.front().cmdbuf = cmdbuf.get();
+
+      SIntendedSubmitInfo transfer = {};
+      transfer.queue = queue;
+      transfer.scratchCommandBuffers = cmdbufs;
+      transfer.scratchSemaphore = {
+        .semaphore = semaphore.get(),
+        .value = 0u,
+        .stageMask = PIPELINE_STAGE_FLAGS::ALL_TRANSFER_BITS
+      };
+
+      CAssetConverter::SConvertParams params = {};
+      params.utilities = m_utils.get();
+      params.transfer = &transfer;
+
+      auto future = reservation.convert(params);
+      if (future.copy() != IQueue::RESULT::SUCCESS)
+      {
+        m_logger->log("Failed to await submission feature!", ILogger::ELL_ERROR);
+        return false;
+      }
+
+      auto&& buffers = reservation.getGPUObjects<ICPUBuffer>();
+      for (uint32_t i = 0; i < cpuObjects.size(); i++)
+      {
+        auto& cpuObject = cpuObjects[i];
+
+        m_gpuObjects.push_back(ReferenceObjectGpu{
+          .meta = cpuObject.meta,
+          .bindings = {
+            .vertex = {.offset = 0, .buffer = buffers[2 * i + 0].value },
+            .index = {.offset = 0, .buffer = buffers[2 * i + 1].value },
+          },
+          .vertexStride = cpuObject.data.inputParams.bindings[0].stride,
+          .indexType = cpuObject.data.indexType,
+          .indexCount = cpuObject.data.indexCount,
+          .material = cpuObject.material,
+          .transform = cpuObject.transform,
+        });
+      }
+
+      for (uint32_t i = 0; i < m_gpuObjects.size(); i++)
+      {
+        const auto& gpuObject = m_gpuObjects[i];
+        const uint64_t vertexBufferAddress = gpuObject.bindings.vertex.buffer->getDeviceAddress();
+        geomInfos[i] = {
+          .vertexBufferAddress = vertexBufferAddress,
+          .indexBufferAddress = gpuObject.useIndex() ? gpuObject.bindings.index.buffer->getDeviceAddress() : vertexBufferAddress,
+          .vertexStride = gpuObject.vertexStride,
+          .indexType = gpuObject.indexType,
+          .smoothNormals = s_smoothNormals[gpuObject.meta.type],
+          .objType = gpuObject.meta.type,
+          .material = gpuObject.material,
+        };
+      }
+    }
+
+    {
+      IGPUBuffer::SCreationParams params;
+      params.usage = IGPUBuffer::EUF_STORAGE_BUFFER_BIT | IGPUBuffer::EUF_TRANSFER_DST_BIT | IGPUBuffer::EUF_INLINE_UPDATE_VIA_CMDBUF | IGPUBuffer::EUF_SHADER_DEVICE_ADDRESS_BIT;
+      params.size = geomInfoBuffer->getSize();
+      m_utils->createFilledDeviceLocalBufferOnDedMem(SIntendedSubmitInfo{ .queue = queue }, std::move(params), geomInfos).move_into(m_geometryInfoBuffer);
+    }
+
+    return true;
+  }
+
+  bool createShaderBindingTable(video::CThreadSafeQueueAdapter* queue, const smart_refctd_ptr<video::IGPURayTracingPipeline>& pipeline)
+  {
+    const auto& limits = m_device->getPhysicalDevice()->getLimits();
+    const auto handleSize = limits.shaderGroupHandleSize;
+    const auto handleSizeAligned = nbl::core::alignUp(handleSize, limits.shaderGroupHandleAlignment);
+
+    auto& raygenRegion = m_shaderBindingTable.raygenGroupRegion;
+    auto& hitRegion = m_shaderBindingTable.hitGroupsRegion;
+    auto& missRegion = m_shaderBindingTable.missGroupsRegion;
+    auto& callableRegion = m_shaderBindingTable.callableGroupsRegion;
+
+    raygenRegion = {
+      .offset = 0,
+      .stride = core::alignUp(handleSizeAligned, limits.shaderGroupBaseAlignment),
+      .size = core::alignUp(handleSizeAligned, limits.shaderGroupBaseAlignment)
+    };
+
+    missRegion = {
+      .offset = raygenRegion.size,
+      .stride = handleSizeAligned,
+      .size = core::alignUp(pipeline->getMissGroupCount(), limits.shaderGroupBaseAlignment),
+    };
+
+    hitRegion = {
+      .offset = missRegion.offset + missRegion.size,
+      .stride = handleSizeAligned,
+      .size = core::alignUp(pipeline->getHitGroupCount(), limits.shaderGroupBaseAlignment),
+    };
+
+    callableRegion = {
+      .offset = hitRegion.offset + hitRegion.size,
+      .stride = handleSizeAligned,
+      .size = core::alignUp(pipeline->getCallableGroupCount(), limits.shaderGroupBaseAlignment),
+    };
+
+    const auto bufferSize = raygenRegion.size + missRegion.size + hitRegion.size + callableRegion.size;
+
+    ICPUBuffer::SCreationParams cpuBufferParams;
+    cpuBufferParams.size = bufferSize;
+    auto cpuBuffer = ICPUBuffer::create(std::move(cpuBufferParams));
+    uint8_t* pData = reinterpret_cast<uint8_t*>(cpuBuffer->getPointer());
+    
+    // copy raygen region
+    memcpy(pData, pipeline->getRaygenGroupShaderHandle().data(), handleSize);
+
+    // copy miss region
+    uint8_t* pMissData = pData + missRegion.offset;
+    for (int32_t missIx = 0; missIx < pipeline->getMissGroupCount(); missIx++)
+    {
+      memcpy(pMissData, pipeline->getMissGroupShaderHandle(missIx).data(), handleSize);
+      pMissData += missRegion.stride;
+    }
+
+    // copy hit region
+    uint8_t* pHitData = pData + hitRegion.offset;
+    for (int32_t hitIx = 0; hitIx < pipeline->getHitGroupCount(); hitIx++)
+    {
+      memcpy(pHitData, pipeline->getHitGroupShaderHandle(hitIx).data(), handleSize);
+      pHitData += hitRegion.stride;
+    }
+
+    // copy callable region
+    uint8_t* pCallableData = pData + callableRegion.offset;
+    for (int32_t callableIx = 0; callableIx < pipeline->getCallableGroupCount(); callableIx++)
+    {
+      memcpy(pCallableData, pipeline->getCallableGroupShaderHandle(callableIx).data(), handleSize);
+      pCallableData += callableRegion.stride;
+    }
+
+    {
+      IGPUBuffer::SCreationParams params;
+      params.usage = IGPUBuffer::EUF_TRANSFER_DST_BIT | IGPUBuffer::EUF_INLINE_UPDATE_VIA_CMDBUF | IGPUBuffer::EUF_SHADER_DEVICE_ADDRESS_BIT | IGPUBuffer::EUF_SHADER_BINDING_TABLE_BIT;
+      params.size = bufferSize;
+      m_utils->createFilledDeviceLocalBufferOnDedMem(SIntendedSubmitInfo{ .queue = queue }, std::move(params), pData).move_into(raygenRegion.buffer);
+      m_logger->log("Device address : %d", ILogger::ELL_INFO, raygenRegion.buffer->getDeviceAddress());
+      missRegion.buffer = core::smart_refctd_ptr(raygenRegion.buffer);
+      hitRegion.buffer = core::smart_refctd_ptr(raygenRegion.buffer);
+      callableRegion.buffer = core::smart_refctd_ptr(raygenRegion.buffer);
+    }
+
+    return true;
+  }
+
+  bool createAccelerationStructures(video::CThreadSafeQueueAdapter* queue)
+  {
+    IQueryPool::SCreationParams qParams{ .queryCount = static_cast<uint32_t>(m_gpuObjects.size()), .queryType = IQueryPool::ACCELERATION_STRUCTURE_COMPACTED_SIZE};
+    smart_refctd_ptr<IQueryPool> queryPool = m_device->createQueryPool(std::move(qParams));
+
+    auto pool = m_device->createCommandPool(queue->getFamilyIndex(), IGPUCommandPool::CREATE_FLAGS::RESET_COMMAND_BUFFER_BIT | IGPUCommandPool::CREATE_FLAGS::TRANSIENT_BIT);
+    if (!pool)
+      return logFail("Couldn't create Command Pool for blas/tlas creation!");
+
+    m_api->startCapture();
+#ifdef TRY_BUILD_FOR_NGFX // NSight is "debugger-challenged" it can't capture anything not happenning "during a frame", so we need to trick it
+    m_currentImageAcquire = m_surface->acquireNextImage();
+    {
+      const IQueue::SSubmitInfo::SSemaphoreInfo acquired[] = { {
+        .semaphore = m_currentImageAcquire.semaphore,
+        .value = m_currentImageAcquire.acquireCount,
+        .stageMask = PIPELINE_STAGE_FLAGS::ALL_COMMANDS_BITS
+      } };
+      m_surface->present(m_currentImageAcquire.imageIndex, acquired);
+    }
+    m_currentImageAcquire = m_surface->acquireNextImage();
+#endif
+    size_t totalScratchSize = 0;
+
+    // build bottom level ASes
+    {
+      core::vector<IGPUBottomLevelAccelerationStructure::DeviceBuildInfo> blasBuildInfos(m_gpuObjects.size());
+      core::vector<uint32_t> primitiveCounts(m_gpuObjects.size());
+      core::vector<IGPUBottomLevelAccelerationStructure::Triangles<const IGPUBuffer>> triangles(m_gpuObjects.size());
+      core::vector<uint32_t> scratchSizes(m_gpuObjects.size());
+      m_gpuBlasList.resize(m_gpuObjects.size());
+
+      for (uint32_t i = 0; i < m_gpuObjects.size(); i++)
+      {
+        const auto& gpuObject = m_gpuObjects[i];
+
+        const uint32_t vertexStride = gpuObject.vertexStride;
+        const uint32_t numVertices = gpuObject.bindings.vertex.buffer->getSize() / vertexStride;
+        if (gpuObject.useIndex())
+          primitiveCounts[i] = gpuObject.indexCount / 3;
+        else
+          primitiveCounts[i] = numVertices / 3;
+
+        triangles[i].vertexData[0] = gpuObject.bindings.vertex;
+        triangles[i].indexData = gpuObject.useIndex() ? gpuObject.bindings.index : gpuObject.bindings.vertex;
+        triangles[i].maxVertex = numVertices - 1;
+        triangles[i].vertexStride = vertexStride;
+        triangles[i].vertexFormat = EF_R32G32B32_SFLOAT;
+        triangles[i].indexType = gpuObject.indexType;
+        triangles[i].geometryFlags = IGPUBottomLevelAccelerationStructure::GEOMETRY_FLAGS::NO_DUPLICATE_ANY_HIT_INVOCATION_BIT;
+
+        auto blasFlags = bitflag(IGPUBottomLevelAccelerationStructure::BUILD_FLAGS::PREFER_FAST_TRACE_BIT) | IGPUBottomLevelAccelerationStructure::BUILD_FLAGS::ALLOW_COMPACTION_BIT;
+        if (m_physicalDevice->getProperties().limits.rayTracingPositionFetch)
+          blasFlags |= IGPUBottomLevelAccelerationStructure::BUILD_FLAGS::ALLOW_DATA_ACCESS_KHR;
+
+        blasBuildInfos[i].buildFlags = blasFlags;
+        blasBuildInfos[i].geometryCount = 1;	// only 1 geometry object per blas
+        blasBuildInfos[i].srcAS = nullptr;
+        blasBuildInfos[i].dstAS = nullptr;
+        blasBuildInfos[i].triangles = &triangles[i];
+        blasBuildInfos[i].scratch = {};
+
+        ILogicalDevice::AccelerationStructureBuildSizes buildSizes;
+        {
+          const uint32_t maxPrimCount[1] = { primitiveCounts[i] };
+          buildSizes = m_device->getAccelerationStructureBuildSizes(blasFlags, false, std::span{ &triangles[i], 1 }, maxPrimCount);
+          if (!buildSizes)
+            return logFail("Failed to get BLAS build sizes");
+        }
+
+        scratchSizes[i] = buildSizes.buildScratchSize;
+        totalScratchSize += buildSizes.buildScratchSize;
+
+        {
+          IGPUBuffer::SCreationParams params;
+          params.usage = bitflag(IGPUBuffer::EUF_SHADER_DEVICE_ADDRESS_BIT) | IGPUBuffer::EUF_ACCELERATION_STRUCTURE_STORAGE_BIT;
+          params.size = buildSizes.accelerationStructureSize;
+          smart_refctd_ptr<IGPUBuffer> asBuffer = createBuffer(params);
+
+          IGPUBottomLevelAccelerationStructure::SCreationParams blasParams;
+          blasParams.bufferRange.buffer = asBuffer;
+          blasParams.bufferRange.offset = 0u;
+          blasParams.bufferRange.size = buildSizes.accelerationStructureSize;
+          blasParams.flags = IGPUBottomLevelAccelerationStructure::SCreationParams::FLAGS::NONE;
+          m_gpuBlasList[i] = m_device->createBottomLevelAccelerationStructure(std::move(blasParams));
+          if (!m_gpuBlasList[i])
+            return logFail("Could not create BLAS");
+        }
+      }
+
+      auto cmdbufBlas = getSingleUseCommandBufferAndBegin(pool);
+      cmdbufBlas->beginDebugMarker("Build BLAS");
+
+      cmdbufBlas->resetQueryPool(queryPool.get(), 0, m_gpuObjects.size());
+
+      smart_refctd_ptr<IGPUBuffer> scratchBuffer;
+      {
+        IGPUBuffer::SCreationParams params;
+        params.usage = bitflag(IGPUBuffer::EUF_SHADER_DEVICE_ADDRESS_BIT) | IGPUBuffer::EUF_STORAGE_BUFFER_BIT;
+        params.size = totalScratchSize;
+        scratchBuffer = createBuffer(params);
+      }
+
+      uint32_t queryCount = 0;
+      core::vector<IGPUBottomLevelAccelerationStructure::BuildRangeInfo> buildRangeInfos(m_gpuObjects.size());
+      core::vector<IGPUBottomLevelAccelerationStructure::BuildRangeInfo*> pRangeInfos(m_gpuObjects.size());
+      for (uint32_t i = 0; i < m_gpuObjects.size(); i++)
+      {
+        blasBuildInfos[i].dstAS = m_gpuBlasList[i].get();
+        blasBuildInfos[i].scratch.buffer = scratchBuffer;
+        blasBuildInfos[i].scratch.offset = (i == 0) ? 0u : blasBuildInfos[i - 1].scratch.offset + scratchSizes[i - 1];
+
+        buildRangeInfos[i].primitiveCount = primitiveCounts[i];
+        buildRangeInfos[i].primitiveByteOffset = 0u;
+        buildRangeInfos[i].firstVertex = 0u;
+        buildRangeInfos[i].transformByteOffset = 0u;
+
+        pRangeInfos[i] = &buildRangeInfos[i];
+      }
+
+      if (!cmdbufBlas->buildAccelerationStructures(std::span(blasBuildInfos), pRangeInfos.data()))
+        return logFail("Failed to build BLAS");
+
+      {
+        SMemoryBarrier memBarrier;
+        memBarrier.srcStageMask = PIPELINE_STAGE_FLAGS::ACCELERATION_STRUCTURE_BUILD_BIT;
+        memBarrier.srcAccessMask = ACCESS_FLAGS::ACCELERATION_STRUCTURE_WRITE_BIT;
+        memBarrier.dstStageMask = PIPELINE_STAGE_FLAGS::ACCELERATION_STRUCTURE_BUILD_BIT;
+        memBarrier.dstAccessMask = ACCESS_FLAGS::ACCELERATION_STRUCTURE_READ_BIT;
+        cmdbufBlas->pipelineBarrier(E_DEPENDENCY_FLAGS::EDF_NONE, { .memBarriers = {&memBarrier, 1} });
+      }
+
+
+      core::vector<const IGPUAccelerationStructure*> ases(m_gpuObjects.size());
+      for (uint32_t i = 0; i < m_gpuObjects.size(); i++)
+        ases[i] = m_gpuBlasList[i].get();
+      if (!cmdbufBlas->writeAccelerationStructureProperties(std::span(ases), IQueryPool::ACCELERATION_STRUCTURE_COMPACTED_SIZE,
+        queryPool.get(), queryCount++))
+        return logFail("Failed to write acceleration structure properties!");
+
+      cmdbufBlas->endDebugMarker();
+      cmdbufSubmitAndWait(cmdbufBlas, queue, 39);
+    }
+
+    auto cmdbufCompact = getSingleUseCommandBufferAndBegin(pool);
+    cmdbufCompact->beginDebugMarker("Compact BLAS");
+
+    // compact blas
+    {
+      core::vector<size_t> asSizes(m_gpuObjects.size(), 0);
+      if (!m_device->getQueryPoolResults(queryPool.get(), 0, m_gpuObjects.size(), asSizes.data(), sizeof(size_t), IQueryPool::WAIT_BIT))
+        return logFail("Could not get query pool results for AS sizes");
+
+      core::vector<smart_refctd_ptr<IGPUBottomLevelAccelerationStructure>> cleanupBlas(m_gpuObjects.size());
+      for (uint32_t i = 0; i < m_gpuObjects.size(); i++)
+      {
+        cleanupBlas[i] = m_gpuBlasList[i];
+        {
+          IGPUBuffer::SCreationParams params;
+          params.usage = bitflag(IGPUBuffer::EUF_SHADER_DEVICE_ADDRESS_BIT) | IGPUBuffer::EUF_ACCELERATION_STRUCTURE_STORAGE_BIT;
+          params.size = asSizes[i];
+          smart_refctd_ptr<IGPUBuffer> asBuffer = createBuffer(params);
+
+          IGPUBottomLevelAccelerationStructure::SCreationParams blasParams;
+          blasParams.bufferRange.buffer = asBuffer;
+          blasParams.bufferRange.offset = 0u;
+          blasParams.bufferRange.size = asSizes[i];
+          blasParams.flags = IGPUBottomLevelAccelerationStructure::SCreationParams::FLAGS::NONE;
+          m_gpuBlasList[i] = m_device->createBottomLevelAccelerationStructure(std::move(blasParams));
+          if (!m_gpuBlasList[i])
+            return logFail("Could not create compacted BLAS");
+        }
+
+        IGPUBottomLevelAccelerationStructure::CopyInfo copyInfo;
+        copyInfo.src = cleanupBlas[i].get();
+        copyInfo.dst = m_gpuBlasList[i].get();
+        copyInfo.mode = IGPUBottomLevelAccelerationStructure::COPY_MODE::COMPACT;
+        if (!cmdbufCompact->copyAccelerationStructure(copyInfo))
+          return logFail("Failed to copy AS to compact");
+      }
+    }
+
+    cmdbufCompact->endDebugMarker();
+    cmdbufSubmitAndWait(cmdbufCompact, queue, 40);
+
+    auto cmdbufTlas = getSingleUseCommandBufferAndBegin(pool);
+    cmdbufTlas->beginDebugMarker("Build TLAS");
+
+    // build top level AS
+    {
+      const uint32_t instancesCount = m_gpuObjects.size();
+      core::vector<IGPUTopLevelAccelerationStructure::DeviceStaticInstance> instances(m_gpuObjects.size());
+      for (uint32_t i = 0; i < instancesCount; i++)
+      {
+        instances[i].base.blas.deviceAddress = m_gpuBlasList[i]->getReferenceForDeviceOperations().deviceAddress;
+        instances[i].base.mask = 0xFF;
+        instances[i].base.instanceCustomIndex = i;
+        instances[i].base.instanceShaderBindingTableRecordOffset = 0;
+        instances[i].base.flags = static_cast<uint32_t>(IGPUTopLevelAccelerationStructure::INSTANCE_FLAGS::TRIANGLE_FACING_CULL_DISABLE_BIT);
+        instances[i].transform = m_gpuObjects[i].transform;
+      }
+
+      {
+        size_t bufSize = instancesCount * sizeof(IGPUTopLevelAccelerationStructure::DeviceStaticInstance);
+        IGPUBuffer::SCreationParams params;
+        params.usage = bitflag(IGPUBuffer::EUF_ACCELERATION_STRUCTURE_BUILD_INPUT_READ_ONLY_BIT) | IGPUBuffer::EUF_STORAGE_BUFFER_BIT |
+          IGPUBuffer::EUF_INLINE_UPDATE_VIA_CMDBUF | IGPUBuffer::EUF_TRANSFER_DST_BIT | IGPUBuffer::EUF_SHADER_DEVICE_ADDRESS_BIT;
+        params.size = bufSize;
+        m_instanceBuffer = createBuffer(params);
+
+        SBufferRange<IGPUBuffer> range = { .offset = 0u, .size = bufSize, .buffer = m_instanceBuffer };
+        cmdbufTlas->updateBuffer(range, instances.data());
+      }
+
+      // make sure instances upload complete first
+      {
+        SMemoryBarrier memBarrier;
+        memBarrier.srcStageMask = PIPELINE_STAGE_FLAGS::ALL_TRANSFER_BITS;
+        memBarrier.srcAccessMask = ACCESS_FLAGS::TRANSFER_WRITE_BIT;
+        memBarrier.dstStageMask = PIPELINE_STAGE_FLAGS::ACCELERATION_STRUCTURE_BUILD_BIT;
+        memBarrier.dstAccessMask = ACCESS_FLAGS::ACCELERATION_STRUCTURE_WRITE_BIT;
+        cmdbufTlas->pipelineBarrier(E_DEPENDENCY_FLAGS::EDF_NONE, { .memBarriers = {&memBarrier, 1} });
+      }
+
+      auto tlasFlags = bitflag(IGPUTopLevelAccelerationStructure::BUILD_FLAGS::PREFER_FAST_TRACE_BIT);
+
+      IGPUTopLevelAccelerationStructure::DeviceBuildInfo tlasBuildInfo;
+      tlasBuildInfo.buildFlags = tlasFlags;
+      tlasBuildInfo.srcAS = nullptr;
+      tlasBuildInfo.dstAS = nullptr;
+      tlasBuildInfo.instanceData.buffer = m_instanceBuffer;
+      tlasBuildInfo.instanceData.offset = 0u;
+      tlasBuildInfo.scratch = {};
+
+      auto buildSizes = m_device->getAccelerationStructureBuildSizes(tlasFlags, false, instancesCount);
+      if (!buildSizes)
+        return logFail("Failed to get TLAS build sizes");
+
+      {
+        IGPUBuffer::SCreationParams params;
+        params.usage = bitflag(IGPUBuffer::EUF_SHADER_DEVICE_ADDRESS_BIT) | IGPUBuffer::EUF_ACCELERATION_STRUCTURE_STORAGE_BIT;
+        params.size = buildSizes.accelerationStructureSize;
+        smart_refctd_ptr<IGPUBuffer> asBuffer = createBuffer(params);
+
+        IGPUTopLevelAccelerationStructure::SCreationParams tlasParams;
+        tlasParams.bufferRange.buffer = asBuffer;
+        tlasParams.bufferRange.offset = 0u;
+        tlasParams.bufferRange.size = buildSizes.accelerationStructureSize;
+        tlasParams.flags = IGPUTopLevelAccelerationStructure::SCreationParams::FLAGS::NONE;
+        m_gpuTlas = m_device->createTopLevelAccelerationStructure(std::move(tlasParams));
+        if (!m_gpuTlas)
+          return logFail("Could not create TLAS");
+      }
+
+      smart_refctd_ptr<IGPUBuffer> scratchBuffer;
+      {
+        IGPUBuffer::SCreationParams params;
+        params.usage = bitflag(IGPUBuffer::EUF_SHADER_DEVICE_ADDRESS_BIT) | IGPUBuffer::EUF_STORAGE_BUFFER_BIT;
+        params.size = buildSizes.buildScratchSize;
+        scratchBuffer = createBuffer(params);
+      }
+
+      tlasBuildInfo.dstAS = m_gpuTlas.get();
+      tlasBuildInfo.scratch.buffer = scratchBuffer;
+      tlasBuildInfo.scratch.offset = 0u;
+
+      IGPUTopLevelAccelerationStructure::BuildRangeInfo buildRangeInfo[1u];
+      buildRangeInfo[0].instanceCount = instancesCount;
+      buildRangeInfo[0].instanceByteOffset = 0u;
+      IGPUTopLevelAccelerationStructure::BuildRangeInfo* pRangeInfos;
+      pRangeInfos = &buildRangeInfo[0];
+
+      if (!cmdbufTlas->buildAccelerationStructures({ &tlasBuildInfo, 1 }, pRangeInfos))
+        return logFail("Failed to build TLAS");
+    }
+
+    cmdbufTlas->endDebugMarker();
+    cmdbufSubmitAndWait(cmdbufTlas, queue, 45);
+
+#ifdef TRY_BUILD_FOR_NGFX
+    {
+      const IQueue::SSubmitInfo::SSemaphoreInfo acquired[] = { {
+        .semaphore = m_currentImageAcquire.semaphore,
+        .value = m_currentImageAcquire.acquireCount,
+        .stageMask = PIPELINE_STAGE_FLAGS::ALL_COMMANDS_BITS
+      } };
+      m_surface->present(m_currentImageAcquire.imageIndex, acquired);
+    }
+#endif
+    m_api->endCapture();
+
+    return true;
+  }
+
+
+  smart_refctd_ptr<IWindow> m_window;
+  smart_refctd_ptr<CSimpleResizeSurface<ISimpleManagedSurface::ISwapchainResources>> m_surface;
+  smart_refctd_ptr<ISemaphore> m_semaphore;
+  uint64_t m_realFrameIx = 0;
+  uint32_t m_frameAccumulationCounter = -1;
+  std::array<smart_refctd_ptr<IGPUCommandBuffer>, MaxFramesInFlight> m_cmdBufs;
+  ISimpleManagedSurface::SAcquireResult m_currentImageAcquire = {};
+
+  core::smart_refctd_ptr<InputSystem> m_inputSystem;
+  InputSystem::ChannelReader<IMouseEventChannel> m_mouse;
+  InputSystem::ChannelReader<IKeyboardEventChannel> m_keyboard;
+
+  Camera m_camera = Camera(core::vectorSIMDf(0, 0, 0), core::vectorSIMDf(0, 0, 0), core::matrix4SIMD());
+  CameraView m_oldCameraView;
+  video::CDumbPresentationOracle m_oracle;
+
+  std::vector<ReferenceObjectGpu> m_gpuObjects;
+
+  std::vector<smart_refctd_ptr<IGPUBottomLevelAccelerationStructure>> m_gpuBlasList;
+  smart_refctd_ptr<IGPUTopLevelAccelerationStructure> m_gpuTlas;
+  smart_refctd_ptr<IGPUBuffer> m_instanceBuffer;
+
+  smart_refctd_ptr<IGPUBuffer> m_geometryInfoBuffer;
+  ShaderBindingTable m_shaderBindingTable;
+  smart_refctd_ptr<IGPUImage> m_hdrImage;
+
+  smart_refctd_ptr<IGPURayTracingPipeline> m_rayTracingPipeline;
+  smart_refctd_ptr<IGPUDescriptorSet> m_renderDs;
+  smart_refctd_ptr<IDescriptorPool> m_renderPool;
+
+  smart_refctd_ptr<CAssetConverter> m_converter;
+  smart_refctd_ptr<IGPUBuffer> m_sbtBuffer;
+
+  uint16_t gcIndex = {};
+
+};
+
+NBL_MAIN_FUNC(RaytracingPipelineApp)
diff --git a/CMakeLists.txt b/CMakeLists.txt
index d840850a6..bd200d8a1 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -93,6 +93,7 @@ if(NBL_BUILD_EXAMPLES)
 	add_subdirectory(67_RayQueryGeometry EXCLUDE_FROM_ALL)
 
   add_subdirectory(70_FLIPFluids EXCLUDE_FROM_ALL)
+	add_subdirectory(71_RayTracingPipeline EXCLUDE_FROM_ALL)
 
 	NBL_HOOK_COMMON_API("${NBL_COMMON_API_TARGETS}")
 endif()
diff --git a/common/include/CCamera.hpp b/common/include/CCamera.hpp
index 1b0fe9c0f..d9f31a260 100644
--- a/common/include/CCamera.hpp
+++ b/common/include/CCamera.hpp
@@ -132,8 +132,10 @@ class Camera
 
 public:
 
-	void mouseProcess(const nbl::ui::IMouseEventChannel::range_t& events)
+	// return whether camera is moved by mouse
+	bool mouseProcess(const nbl::ui::IMouseEventChannel::range_t& events)
 	{
+		bool cameraMoved = false;
 		for (auto eventIt=events.begin(); eventIt!=events.end(); eventIt++)
 		{
 			auto ev = *eventIt;
@@ -179,11 +181,15 @@ class Camera
 				mat.transformVect(localTarget);
 				
 				setTarget(localTarget + pos);
+
+				cameraMoved = true;
 			}
 		}
+		return cameraMoved;
 	}
 
-	void keyboardProcess(const nbl::ui::IKeyboardEventChannel::range_t& events)
+	// return whether camera is moved by keyboard
+	bool keyboardProcess(const nbl::ui::IKeyboardEventChannel::range_t& events)
 	{
 		for(uint32_t k = 0; k < E_CAMERA_MOVE_KEYS::ECMK_COUNT; ++k)
 			perActionDt[k] = 0.0;
@@ -194,12 +200,14 @@ class Camera
 		* And If an UP event was sent It will get subtracted it from this value. (Currently Disabled Because we Need better Oracle)
 		*/
 
+		bool cameraMoved = false;
 		for(uint32_t k = 0; k < E_CAMERA_MOVE_KEYS::ECMK_COUNT; ++k) 
 			if(keysDown[k]) 
 			{
 				auto timeDiff = std::chrono::duration_cast<std::chrono::milliseconds>(nextPresentationTimeStamp - lastVirtualUpTimeStamp).count();
 				assert(timeDiff >= 0);
 				perActionDt[k] += timeDiff;
+				cameraMoved = true;
 			}
 
 		for (auto eventIt=events.begin(); eventIt!=events.end(); eventIt++)
@@ -237,8 +245,11 @@ class Camera
 					position = initialPosition;
 					target = initialTarget;
 					recomputeViewMatrix();
+					cameraMoved = true;
 				}
 		}
+
+		return cameraMoved;
 	}
 
 	void beginInputProcessing(std::chrono::microseconds _nextPresentationTimeStamp)

From c991e20d986b6d427b64925b3730f6723b62079e Mon Sep 17 00:00:00 2001
From: kevyuu <kevin.kayu@gmail.com>
Date: Wed, 22 Jan 2025 21:34:34 +0700
Subject: [PATCH 003/529] Add ImGui Overlay

Signed-off-by: kevyuu <kevin.kayu@gmail.com>
---
 71_RayTracingPipeline/CMakeLists.txt          |  45 +-
 .../app_resources/present.frag.hlsl           |  19 +
 71_RayTracingPipeline/include/common.hpp      |   4 +
 71_RayTracingPipeline/main.cpp                | 710 ++++++++++++------
 4 files changed, 528 insertions(+), 250 deletions(-)
 create mode 100644 71_RayTracingPipeline/app_resources/present.frag.hlsl

diff --git a/71_RayTracingPipeline/CMakeLists.txt b/71_RayTracingPipeline/CMakeLists.txt
index 4a555f4ce..07b0fd396 100644
--- a/71_RayTracingPipeline/CMakeLists.txt
+++ b/71_RayTracingPipeline/CMakeLists.txt
@@ -1,28 +1,37 @@
-set(NBL_INCLUDE_SEARCH_DIRECTORIES
-	"${CMAKE_CURRENT_SOURCE_DIR}/include"
-)
-
 include(common RESULT_VARIABLE RES)
 if(NOT RES)
-	message(FATAL_ERROR "common.cmake not found. Should be in {repo_root}/cmake directory")
+        message(FATAL_ERROR "common.cmake not found. Should be in {repo_root}/cmake directory")
 endif()
 
-nbl_create_executable_project("" "" "${NBL_INCLUDE_SEARCH_DIRECTORIES}" "" "${NBL_EXECUTABLE_PROJECT_CREATION_PCH_TARGET}")
+if(NBL_BUILD_IMGUI)
+	set(NBL_INCLUDE_SERACH_DIRECTORIES
+		"${CMAKE_CURRENT_SOURCE_DIR}/include"
+	)
+
+	list(APPEND NBL_LIBRARIES 
+		imtestengine
+		"${NBL_EXT_IMGUI_UI_LIB}"
+	)
+
+	nbl_create_executable_project("" "" "${NBL_INCLUDE_SERACH_DIRECTORIES}" "${NBL_LIBRARIES}" "${NBL_EXECUTABLE_PROJECT_CREATION_PCH_TARGET}")
 
-if(NBL_EMBED_BUILTIN_RESOURCES)
-	set(_BR_TARGET_ ${EXECUTABLE_NAME}_builtinResourceData)
-	set(RESOURCE_DIR "app_resources")
+	if(NBL_EMBED_BUILTIN_RESOURCES)
+		set(_BR_TARGET_ ${EXECUTABLE_NAME}_builtinResourceData)
+		set(RESOURCE_DIR "app_resources")
 
-	get_filename_component(_SEARCH_DIRECTORIES_ "${CMAKE_CURRENT_SOURCE_DIR}" ABSOLUTE)
-	get_filename_component(_OUTPUT_DIRECTORY_SOURCE_ "${CMAKE_CURRENT_BINARY_DIR}/src" ABSOLUTE)
-	get_filename_component(_OUTPUT_DIRECTORY_HEADER_ "${CMAKE_CURRENT_BINARY_DIR}/include" ABSOLUTE)
+		get_filename_component(_SEARCH_DIRECTORIES_ "${CMAKE_CURRENT_SOURCE_DIR}" ABSOLUTE)
+		get_filename_component(_OUTPUT_DIRECTORY_SOURCE_ "${CMAKE_CURRENT_BINARY_DIR}/src" ABSOLUTE)
+		get_filename_component(_OUTPUT_DIRECTORY_HEADER_ "${CMAKE_CURRENT_BINARY_DIR}/include" ABSOLUTE)
 
-    file(GLOB_RECURSE BUILTIN_RESOURCE_FILES RELATIVE "${CMAKE_CURRENT_SOURCE_DIR}/${RESOURCE_DIR}" CONFIGURE_DEPENDS "${CMAKE_CURRENT_SOURCE_DIR}/${RESOURCE_DIR}/*")
-    foreach(RES_FILE ${BUILTIN_RESOURCE_FILES})
-      LIST_BUILTIN_RESOURCE(RESOURCES_TO_EMBED "${RES_FILE}")
-    endforeach()
+		file(GLOB_RECURSE BUILTIN_RESOURCE_FILES RELATIVE "${CMAKE_CURRENT_SOURCE_DIR}/${RESOURCE_DIR}" "${CMAKE_CURRENT_SOURCE_DIR}/${RESOURCE_DIR}/*")
+		foreach(RES_FILE ${BUILTIN_RESOURCE_FILES})
+			LIST_BUILTIN_RESOURCE(RESOURCES_TO_EMBED "${RES_FILE}")
+		endforeach()
 
-	ADD_CUSTOM_BUILTIN_RESOURCES(${_BR_TARGET_} RESOURCES_TO_EMBED "${_SEARCH_DIRECTORIES_}" "${RESOURCE_DIR}" "nbl::this_example::builtin" "${_OUTPUT_DIRECTORY_HEADER_}" "${_OUTPUT_DIRECTORY_SOURCE_}")
+		ADD_CUSTOM_BUILTIN_RESOURCES(${_BR_TARGET_} RESOURCES_TO_EMBED "${_SEARCH_DIRECTORIES_}" "${RESOURCE_DIR}" "nbl::this_example::builtin" "${_OUTPUT_DIRECTORY_HEADER_}" "${_OUTPUT_DIRECTORY_SOURCE_}")
 
-	LINK_BUILTIN_RESOURCES_TO_TARGET(${EXECUTABLE_NAME} ${_BR_TARGET_})
+		LINK_BUILTIN_RESOURCES_TO_TARGET(${EXECUTABLE_NAME} ${_BR_TARGET_})
+	endif()
 endif()
+
+
diff --git a/71_RayTracingPipeline/app_resources/present.frag.hlsl b/71_RayTracingPipeline/app_resources/present.frag.hlsl
new file mode 100644
index 000000000..00ab6e31d
--- /dev/null
+++ b/71_RayTracingPipeline/app_resources/present.frag.hlsl
@@ -0,0 +1,19 @@
+// Copyright (C) 2024-2024 - DevSH Graphics Programming Sp. z O.O.
+// This file is part of the "Nabla Engine".
+// For conditions of distribution and use, see copyright notice in nabla.h
+
+#pragma wave shader_stage(fragment)
+
+// vertex shader is provided by the fullScreenTriangle extension
+#include <nbl/builtin/hlsl/ext/FullScreenTriangle/SVertexAttributes.hlsl>
+using namespace nbl::hlsl;
+using namespace ext::FullScreenTriangle;
+
+// binding 0 set 0
+[[vk::combinedImageSampler]] [[vk::binding(0, 0)]] Texture2D texture;
+[[vk::combinedImageSampler]] [[vk::binding(0, 0)]] SamplerState samplerState;
+
+[[vk::location(0)]] float32_t4 main(SVertexAttributes vxAttr) : SV_Target0
+{
+    return float32_t4(texture.Sample(samplerState, vxAttr.uv).rgb, 1.0f);
+}
diff --git a/71_RayTracingPipeline/include/common.hpp b/71_RayTracingPipeline/include/common.hpp
index e50cb4473..3a8411fd2 100644
--- a/71_RayTracingPipeline/include/common.hpp
+++ b/71_RayTracingPipeline/include/common.hpp
@@ -16,6 +16,10 @@
 #include <nbl/builtin/hlsl/cpp_compat/matrix.hlsl>
 #include <nbl/asset/IRayTracingPipeline.h>
 
+#include "nbl/ui/ICursorControl.h"
+#include "nbl/ext/ImGui/ImGui.h"
+#include "imgui/imgui_internal.h"
+
 using namespace nbl;
 using namespace core;
 using namespace hlsl;
diff --git a/71_RayTracingPipeline/main.cpp b/71_RayTracingPipeline/main.cpp
index 54a692317..4fc992c90 100644
--- a/71_RayTracingPipeline/main.cpp
+++ b/71_RayTracingPipeline/main.cpp
@@ -3,6 +3,7 @@
 // For conditions of distribution and use, see copyright notice in nabla.h
 
 #include "common.hpp"
+#include "nbl/ext/FullScreenTriangle/FullScreenTriangle.h"
 
 class RaytracingPipelineApp final : public examples::SimpleWindowedApplication, public application_templates::MonoAssetManagerAndBuiltinResourceApplication
 {
@@ -12,6 +13,15 @@ class RaytracingPipelineApp final : public examples::SimpleWindowedApplication,
 
   constexpr static inline uint32_t WIN_W = 1280, WIN_H = 720;
   constexpr static inline uint32_t MaxFramesInFlight = 3u;
+  constexpr static inline uint8_t MaxUITextureCount = 1u;
+
+  enum E_LIGHT_TYPE : uint8_t
+  {
+    ELT_SPHERE,
+    ELT_TRIANGLE,
+    ELT_RECTANGLE,
+    ELT_COUNT
+  };
 
   constexpr static inline clock_t::duration DisplayImageDuration = std::chrono::milliseconds(900);
 
@@ -23,12 +33,6 @@ class RaytracingPipelineApp final : public examples::SimpleWindowedApplication,
     SStridedBufferRegion<IGPUBuffer> callableGroupsRegion;
   };
 
-  struct CameraView
-  {
-    float32_t3 position;
-    float32_t3 target;
-    float32_t3 upVector;
-  };
 
 public:
   inline RaytracingPipelineApp(const path& _localInputCWD, const path& _localOutputCWD, const path& _sharedInputCWD, const path& _sharedOutputCWD)
@@ -97,31 +101,32 @@ class RaytracingPipelineApp final : public examples::SimpleWindowedApplication,
     if (!asset_base_t::onAppInitialized(smart_refctd_ptr(system)))
       return false;
 
-    const auto compileShader = [&]<typename... Args>(const std::string& filePath, const std::string& header = "") -> smart_refctd_ptr<IGPUShader>
+
+    const auto compileShader = [&](const std::string & filePath, const std::string & header = "") -> smart_refctd_ptr<IGPUShader>
+    {
+      IAssetLoader::SAssetLoadParams lparams = {};
+      lparams.logger = m_logger.get();
+      lparams.workingDirectory = "";
+      auto bundle = m_assetMgr->getAsset(filePath, lparams);
+      if (bundle.getContents().empty() || bundle.getAssetType() != IAsset::ET_SHADER)
       {
-        IAssetLoader::SAssetLoadParams lparams = {};
-        lparams.logger = m_logger.get();
-        lparams.workingDirectory = "";
-        auto bundle = m_assetMgr->getAsset(filePath, lparams);
-        if (bundle.getContents().empty() || bundle.getAssetType() != IAsset::ET_SHADER)
-        {
-          m_logger->log("Shader %s not found!", ILogger::ELL_ERROR, filePath);
-          exit(-1);
-        }
+        m_logger->log("Shader %s not found!", ILogger::ELL_ERROR, filePath);
+        exit(-1);
+      }
 
-        const auto assets = bundle.getContents();
-        assert(assets.size() == 1);
-        smart_refctd_ptr<ICPUShader> sourceRaw = IAsset::castDown<ICPUShader>(assets[0]);
-        if (!sourceRaw)
-          m_logger->log("Fail to load shader source", ILogger::ELL_ERROR, filePath);
-        smart_refctd_ptr<ICPUShader> source = CHLSLCompiler::createOverridenCopy(
-          sourceRaw.get(),
-          "%s\n",
-          header.c_str()
-        );
-
-        return m_device->createShader(source.get());
-      };
+      const auto assets = bundle.getContents();
+      assert(assets.size() == 1);
+      smart_refctd_ptr<ICPUShader> sourceRaw = IAsset::castDown<ICPUShader>(assets[0]);
+      if (!sourceRaw)
+        m_logger->log("Fail to load shader source", ILogger::ELL_ERROR, filePath);
+      smart_refctd_ptr<ICPUShader> source = CHLSLCompiler::createOverridenCopy(
+        sourceRaw.get(),
+        "%s\n",
+        header.c_str()
+      );
+
+      return m_device->createShader(source.get());
+    };
 
     // shader
     const auto raygenShader = compileShader("app_resources/raytrace.rgen.hlsl");
@@ -135,13 +140,49 @@ class RaytracingPipelineApp final : public examples::SimpleWindowedApplication,
     if (!m_semaphore)
       return logFail("Failed to Create a Semaphore!");
 
-    ISwapchain::SCreationParams swapchainParams = { .surface = core::smart_refctd_ptr<nbl::video::ISurface>(m_surface->getSurface()) };
-    if (!swapchainParams.deduceFormat(m_physicalDevice))
-      return logFail("Could not choose a Surface Format for the Swapchain!");
-
     auto gQueue = getGraphicsQueue();
-    if (!m_surface || !m_surface->init(gQueue, std::make_unique<ISimpleManagedSurface::ISwapchainResources>(), swapchainParams.sharedParams))
-      return logFail("Could not create Window & Surface or initialize the Surface!");
+
+    // Create renderpass and init surface
+    nbl::video::IGPURenderpass* renderpass;
+    {
+      ISwapchain::SCreationParams swapchainParams = { .surface = smart_refctd_ptr<ISurface>(m_surface->getSurface()) };
+      if (!swapchainParams.deduceFormat(m_physicalDevice))
+        return logFail("Could not choose a Surface Format for the Swapchain!");
+
+      const static IGPURenderpass::SCreationParams::SSubpassDependency dependencies[] =
+      {
+        {
+          .srcSubpass = IGPURenderpass::SCreationParams::SSubpassDependency::External,
+          .dstSubpass = 0,
+          .memoryBarrier =
+          {
+            .srcStageMask = asset::PIPELINE_STAGE_FLAGS::COPY_BIT,
+            .srcAccessMask = asset::ACCESS_FLAGS::TRANSFER_WRITE_BIT,
+            .dstStageMask = asset::PIPELINE_STAGE_FLAGS::COLOR_ATTACHMENT_OUTPUT_BIT,
+            .dstAccessMask = asset::ACCESS_FLAGS::COLOR_ATTACHMENT_WRITE_BIT
+          }
+        },
+        {
+          .srcSubpass = 0,
+          .dstSubpass = IGPURenderpass::SCreationParams::SSubpassDependency::External,
+          .memoryBarrier =
+          {
+            .srcStageMask = asset::PIPELINE_STAGE_FLAGS::COLOR_ATTACHMENT_OUTPUT_BIT,
+            .srcAccessMask = asset::ACCESS_FLAGS::COLOR_ATTACHMENT_WRITE_BIT
+          }
+        },
+        IGPURenderpass::SCreationParams::DependenciesEnd
+      };
+
+      auto scResources = std::make_unique<CDefaultSwapchainFramebuffers>(m_device.get(), swapchainParams.surfaceFormat.format, dependencies);
+      renderpass = scResources->getRenderpass();
+
+      if (!renderpass)
+        return logFail("Failed to create Renderpass!");
+
+      if (!m_surface || !m_surface->init(gQueue, std::move(scResources), swapchainParams.sharedParams))
+        return logFail("Could not create Window & Surface or initialize the Surface!");
+    }
 
     auto pool = m_device->createCommandPool(gQueue->getFamilyIndex(), IGPUCommandPool::CREATE_FLAGS::RESET_COMMAND_BUFFER_BIT);
 
@@ -158,23 +199,33 @@ class RaytracingPipelineApp final : public examples::SimpleWindowedApplication,
     m_winMgr->setWindowSize(m_window.get(), WIN_W, WIN_H);
     m_surface->recreateSwapchain();
 
+
     // create output images
     m_hdrImage = m_device->createImage({
         {
           .type = IGPUImage::ET_2D,
-          .samples = asset::ICPUImage::ESCF_1_BIT,
-          .format = asset::EF_R16G16B16A16_SFLOAT,
+          .samples = ICPUImage::ESCF_1_BIT,
+          .format = EF_R16G16B16A16_SFLOAT,
           .extent = {WIN_W, WIN_H, 1},
           .mipLevels = 1,
           .arrayLayers = 1,
           .flags = IImage::ECF_NONE,
-          .usage = core::bitflag(asset::IImage::EUF_STORAGE_BIT) | asset::IImage::EUF_TRANSFER_SRC_BIT
+          .usage = bitflag(IImage::EUF_STORAGE_BIT) | IImage::EUF_TRANSFER_SRC_BIT | IImage::EUF_SAMPLED_BIT
         }
       });
 
     if (!m_hdrImage || !m_device->allocate(m_hdrImage->getMemoryReqs(), m_hdrImage.get()).isValid())
       return logFail("Could not create HDR Image");
 
+    m_hdrImageView = m_device->createImageView({
+      .flags = IGPUImageView::ECF_NONE,
+      .subUsages = IGPUImage::E_USAGE_FLAGS::EUF_STORAGE_BIT | IGPUImage::E_USAGE_FLAGS::EUF_SAMPLED_BIT,
+      .image = m_hdrImage,
+      .viewType = IGPUImageView::E_TYPE::ET_2D,
+      .format = asset::EF_R16G16B16A16_SFLOAT
+    });
+
+
     auto assetManager = make_smart_refctd_ptr<nbl::asset::IAssetManager>(smart_refctd_ptr(system));
     auto* geometryCreator = assetManager->getGeometryCreator();
 
@@ -187,10 +238,13 @@ class RaytracingPipelineApp final : public examples::SimpleWindowedApplication,
     if (!createAccelerationStructures(cQueue))
       return logFail("Could not create acceleration structures");
 
+    ISampler::SParams samplerParams = {
+      .AnisotropicFilter = 0
+    };
+    auto defaultSampler = m_device->createSampler(samplerParams);
 
-    // create pipelines
+    // ray trace pipeline and descriptor set layout setup
     {
-      // descriptors
       const IGPUDescriptorSetLayout::SBinding bindings[] = {
         {
           .binding = 0,
@@ -210,12 +264,8 @@ class RaytracingPipelineApp final : public examples::SimpleWindowedApplication,
       const auto descriptorSetLayout = m_device->createDescriptorSetLayout(bindings);
 
       const std::array<IGPUDescriptorSetLayout*, ICPUPipelineLayout::DESCRIPTOR_SET_COUNT> dsLayoutPtrs = { descriptorSetLayout.get() };
-      m_renderPool = m_device->createDescriptorPoolForDSLayouts(IDescriptorPool::ECF_UPDATE_AFTER_BIND_BIT, std::span(dsLayoutPtrs.begin(), dsLayoutPtrs.end()));
-      if (!m_renderPool)
-        return logFail("Could not create descriptor pool");
-      m_renderDs = m_renderPool->createDescriptorSet(descriptorSetLayout);
-      if (!m_renderDs)
-        return logFail("Could not create descriptor set");
+      m_rayTracingDsPool = m_device->createDescriptorPoolForDSLayouts(IDescriptorPool::ECF_UPDATE_AFTER_BIND_BIT, std::span(dsLayoutPtrs.begin(), dsLayoutPtrs.end()));
+      m_rayTracingDs = m_rayTracingDsPool->createDescriptorSet(descriptorSetLayout);
 
       const SPushConstantRange pcRange = {
         .stageFlags = IShader::E_SHADER_STAGE::ESS_ALL_RAY_TRACING,
@@ -226,7 +276,6 @@ class RaytracingPipelineApp final : public examples::SimpleWindowedApplication,
 
       IGPURayTracingPipeline::SCreationParams params = {};
 
-
       const IGPUShader::SSpecInfo shaders[] = {
           {.shader = raygenShader.get()},
           {.shader = closestHitShader.get()},
@@ -248,115 +297,241 @@ class RaytracingPipelineApp final : public examples::SimpleWindowedApplication,
       params.cached.maxRecursionDepth = 2;
       if (!m_device->createRayTracingPipelines(nullptr, { &params, 1 }, &m_rayTracingPipeline))
         return logFail("Failed to create ray tracing pipeline");
-      m_logger->log("Ray Tracing Pipeline Created!",system::ILogger::ELL_INFO);
+      m_logger->log("Ray Tracing Pipeline Created!", system::ILogger::ELL_INFO);
 
-      //create shader binding table
       if (!createShaderBindingTable(gQueue, m_rayTracingPipeline))
         return logFail("Could not create shader binding table");
     }
 
+    {
+      const IGPUDescriptorSetLayout::SBinding bindings[] = {
+        {
+          .binding = 0u,
+          .type = nbl::asset::IDescriptor::E_TYPE::ET_COMBINED_IMAGE_SAMPLER,
+          .createFlags = ICPUDescriptorSetLayout::SBinding::E_CREATE_FLAGS::ECF_NONE,
+          .stageFlags = IShader::E_SHADER_STAGE::ESS_FRAGMENT,
+          .count = 1u,
+          .immutableSamplers = &defaultSampler
+        }
+      };
+      auto gpuPresentDescriptorSetLayout = m_device->createDescriptorSetLayout(bindings);
+      const video::IGPUDescriptorSetLayout* const layouts[] = { gpuPresentDescriptorSetLayout.get() };
+      const uint32_t setCounts[] = { 1u };
+      m_presentDsPool = m_device->createDescriptorPoolForDSLayouts(IDescriptorPool::E_CREATE_FLAGS::ECF_NONE, layouts, setCounts);
+      m_presentDs = m_presentDsPool->createDescriptorSet(gpuPresentDescriptorSetLayout);
+
+      auto scRes = static_cast<CDefaultSwapchainFramebuffers*>(m_surface->getSwapchainResources());
+      ext::FullScreenTriangle::ProtoPipeline fsTriProtoPPln(m_assetMgr.get(), m_device.get(), m_logger.get());
+      if (!fsTriProtoPPln)
+        return logFail("Failed to create Full Screen Triangle protopipeline or load its vertex shader!");
+
+      // Load Fragment Shader
+      auto fragmentShader = compileShader("app_resources/present.frag.hlsl");
+      if (!fragmentShader)
+        return logFail("Failed to Load and Compile Fragment Shader: lumaMeterShader!");
+
+      const IGPUShader::SSpecInfo fragSpec = {
+        .entryPoint = "main",
+        .shader = fragmentShader.get()
+      };
+
+      auto presentLayout = m_device->createPipelineLayout(
+        {},
+        core::smart_refctd_ptr(gpuPresentDescriptorSetLayout),
+        nullptr,
+        nullptr,
+        nullptr
+      );
+      m_presentPipeline = fsTriProtoPPln.createPipeline(fragSpec, presentLayout.get(), scRes->getRenderpass());
+      if (!m_presentPipeline)
+        return logFail("Could not create Graphics Pipeline!");
+    }
 
     // write descriptors
-    IGPUDescriptorSet::SDescriptorInfo infos[2];
+    IGPUDescriptorSet::SDescriptorInfo infos[3];
     infos[0].desc = m_gpuTlas;
-    infos[1].desc = m_device->createImageView({
-        .flags = IGPUImageView::ECF_NONE,
-        .subUsages = IGPUImage::E_USAGE_FLAGS::EUF_STORAGE_BIT,
-        .image = m_hdrImage,
-        .viewType = IGPUImageView::E_TYPE::ET_2D,
-        .format = asset::EF_R16G16B16A16_SFLOAT
-      });
+
+    infos[1].desc = m_hdrImageView;
     if (!infos[1].desc)
       return logFail("Failed to create image view");
     infos[1].info.image.imageLayout = IImage::LAYOUT::GENERAL;
-    IGPUDescriptorSet::SWriteDescriptorSet writes[3] = {
-        {.dstSet = m_renderDs.get(), .binding = 0, .arrayElement = 0, .count = 1, .info = &infos[0]},
-        {.dstSet = m_renderDs.get(), .binding = 1, .arrayElement = 0, .count = 1, .info = &infos[1]}
+
+    infos[2].desc = m_hdrImageView;
+    infos[2].info.image.imageLayout = IImage::LAYOUT::READ_ONLY_OPTIMAL;
+
+    IGPUDescriptorSet::SWriteDescriptorSet writes[] = {
+        {.dstSet = m_rayTracingDs.get(), .binding = 0, .arrayElement = 0, .count = 1, .info = &infos[0]},
+        {.dstSet = m_rayTracingDs.get(), .binding = 1, .arrayElement = 0, .count = 1, .info = &infos[1]},
+        {.dstSet = m_presentDs.get(), .binding = 0, .arrayElement = 0, .count = 1, .info = &infos[2] },
     };
-    m_device->updateDescriptorSets(std::span(writes, 2), {});
+    m_device->updateDescriptorSets(std::span(writes), {});
+
+    // gui descriptor setup
+    {
+      using binding_flags_t = IGPUDescriptorSetLayout::SBinding::E_CREATE_FLAGS;
+      {
+        IGPUSampler::SParams params;
+        params.AnisotropicFilter = 1u;
+        params.TextureWrapU = ISampler::ETC_REPEAT;
+        params.TextureWrapV = ISampler::ETC_REPEAT;
+        params.TextureWrapW = ISampler::ETC_REPEAT;
+
+        m_ui.samplers.gui = m_device->createSampler(params);
+        m_ui.samplers.gui->setObjectDebugName("Nabla IMGUI UI Sampler");
+      }
+
+      std::array<core::smart_refctd_ptr<IGPUSampler>, 69u> immutableSamplers;
+      for (auto& it : immutableSamplers)
+        it = smart_refctd_ptr(m_ui.samplers.scene);
+
+      immutableSamplers[nbl::ext::imgui::UI::FontAtlasTexId] = smart_refctd_ptr(m_ui.samplers.gui);
+
+      nbl::ext::imgui::UI::SCreationParameters params;
 
-    // camera
+      params.resources.texturesInfo = { .setIx = 0u, .bindingIx = 0u };
+      params.resources.samplersInfo = { .setIx = 0u, .bindingIx = 1u };
+      params.assetManager = m_assetMgr;
+      params.pipelineCache = nullptr;
+      params.pipelineLayout = nbl::ext::imgui::UI::createDefaultPipelineLayout(m_utils->getLogicalDevice(), params.resources.texturesInfo, params.resources.samplersInfo, MaxUITextureCount);
+      params.renderpass = smart_refctd_ptr<IGPURenderpass>(renderpass);
+      params.streamingBuffer = nullptr;
+      params.subpassIx = 0u;
+      params.transfer = getTransferUpQueue();
+      params.utilities = m_utils;
+      {
+        m_ui.manager = ext::imgui::UI::create(std::move(params));
+
+        // note that we use default layout provided by our extension, but you are free to create your own by filling nbl::ext::imgui::UI::S_CREATION_PARAMETERS::resources
+        const auto* descriptorSetLayout = m_ui.manager->getPipeline()->getLayout()->getDescriptorSetLayout(0u);
+        const auto& params = m_ui.manager->getCreationParameters();
+
+        IDescriptorPool::SCreateInfo descriptorPoolInfo = {};
+        descriptorPoolInfo.maxDescriptorCount[static_cast<uint32_t>(asset::IDescriptor::E_TYPE::ET_SAMPLER)] = (uint32_t)nbl::ext::imgui::UI::DefaultSamplerIx::COUNT;
+        descriptorPoolInfo.maxDescriptorCount[static_cast<uint32_t>(asset::IDescriptor::E_TYPE::ET_SAMPLED_IMAGE)] = MaxUITextureCount;
+        descriptorPoolInfo.maxSets = 1u;
+        descriptorPoolInfo.flags = IDescriptorPool::E_CREATE_FLAGS::ECF_UPDATE_AFTER_BIND_BIT;
+
+        m_guiDescriptorSetPool = m_device->createDescriptorPool(std::move(descriptorPoolInfo));
+        assert(m_guiDescriptorSetPool);
+
+        m_guiDescriptorSetPool->createDescriptorSets(1u, &descriptorSetLayout, &m_ui.descriptorSet);
+        assert(m_ui.descriptorSet);
+      }
+    }
+
+    m_ui.manager->registerListener(
+      [this]() -> void {
+        ImGuiIO& io = ImGui::GetIO();
+
+        m_camera.setProjectionMatrix([&]()
+        {
+          static matrix4SIMD projection;
+
+          projection = matrix4SIMD::buildProjectionMatrixPerspectiveFovRH(core::radians(fov), io.DisplaySize.x / io.DisplaySize.y, zNear, zFar);
+
+          return projection;
+        }());
+
+        ImGui::SetNextWindowPos(ImVec2(1024, 100), ImGuiCond_Appearing);
+        ImGui::SetNextWindowSize(ImVec2(256, 256), ImGuiCond_Appearing);
+
+        // create a window and insert the inspector
+        ImGui::SetNextWindowPos(ImVec2(10, 10), ImGuiCond_Appearing);
+        ImGui::SetNextWindowSize(ImVec2(320, 340), ImGuiCond_Appearing);
+        ImGui::Begin("Controls");
+
+        ImGui::SameLine();
+
+        ImGui::Text("Camera");
+
+        ImGui::SliderFloat("Move speed", &moveSpeed, 0.1f, 10.f);
+        ImGui::SliderFloat("Rotate speed", &rotateSpeed, 0.1f, 10.f);
+        ImGui::SliderFloat("Fov", &fov, 20.f, 150.f);
+        ImGui::SliderFloat("zNear", &zNear, 0.1f, 100.f);
+        ImGui::SliderFloat("zFar", &zFar, 110.f, 10000.f);
+
+        ImGui::Text("X: %f Y: %f", io.MousePos.x, io.MousePos.y);
+
+        ImGui::End();
+      }
+    );
+
+    // Set Camera
     {
-      core::vectorSIMDf cameraPosition(-5.81655884, 2.58630896, -4.23974705);
-      core::vectorSIMDf cameraTarget(-0.349590302, -0.213266611, 0.317821503);
-      matrix4SIMD projectionMatrix = matrix4SIMD::buildProjectionMatrixPerspectiveFovLH(core::radians(60.0f), float(WIN_W) / WIN_H, 0.1, 1000);
-      m_camera = Camera(cameraPosition, cameraTarget, projectionMatrix, 1.069f, 0.4f);
+      core::vectorSIMDf cameraPosition(0, 5, -10);
+      matrix4SIMD proj = matrix4SIMD::buildProjectionMatrixPerspectiveFovRH(
+        core::radians(60.0f),
+        WIN_W / WIN_H,
+        0.01f,
+        500.0f
+      );
+      m_camera = Camera(cameraPosition, core::vectorSIMDf(0, 0, 0), proj);
     }
 
+    m_winMgr->setWindowSize(m_window.get(), WIN_W, WIN_H);
+    m_surface->recreateSwapchain();
     m_winMgr->show(m_window.get());
     m_oracle.reportBeginFrameRecord();
+    m_camera.mapKeysToWASD();
 
     return true;
   }
 
-  inline void workLoopBody() override
+  bool updateGUIDescriptorSet()
   {
-    const auto resourceIx = m_realFrameIx % MaxFramesInFlight;
+    // texture atlas, note we don't create info & write pair for the font sampler because UI extension's is immutable and baked into DS layout
+    static std::array<IGPUDescriptorSet::SDescriptorInfo, MaxUITextureCount> descriptorInfo;
+    static IGPUDescriptorSet::SWriteDescriptorSet writes[MaxUITextureCount];
 
-    const uint32_t framesInFlight = core::min(MaxFramesInFlight, m_surface->getMaxAcquiresInFlight());
+    descriptorInfo[nbl::ext::imgui::UI::FontAtlasTexId].info.image.imageLayout = IImage::LAYOUT::READ_ONLY_OPTIMAL;
+    descriptorInfo[nbl::ext::imgui::UI::FontAtlasTexId].desc = smart_refctd_ptr<IGPUImageView>(m_ui.manager->getFontAtlasView());
+
+    for (uint32_t i = 0; i < descriptorInfo.size(); ++i)
+    {
+      writes[i].dstSet = m_ui.descriptorSet.get();
+      writes[i].binding = 0u;
+      writes[i].arrayElement = i;
+      writes[i].count = 1u;
+    }
+    writes[nbl::ext::imgui::UI::FontAtlasTexId].info = descriptorInfo.data() + nbl::ext::imgui::UI::FontAtlasTexId;
+
+    return m_device->updateDescriptorSets(writes, {});
+  }
 
+  inline void workLoopBody() override
+  {
+    // framesInFlight: ensuring safe execution of command buffers and acquires, `framesInFlight` only affect semaphore waits, don't use this to index your resources because it can change with swapchain recreation.
+    const uint32_t framesInFlight = core::min(MaxFramesInFlight, m_surface->getMaxAcquiresInFlight());
+    // We block for semaphores for 2 reasons here:
+      // A) Resource: Can't use resource like a command buffer BEFORE previous use is finished! [MaxFramesInFlight]
+      // B) Acquire: Can't have more acquires in flight than a certain threshold returned by swapchain or your surface helper class. [MaxAcquiresInFlight]
     if (m_realFrameIx >= framesInFlight)
     {
-      const ISemaphore::SWaitInfo cbDonePending[] =
+      const ISemaphore::SWaitInfo cbDonePending[] = 
       {
-          {
-            .semaphore = m_semaphore.get(),
-            .value = m_realFrameIx + 1 - framesInFlight
-          }
+        {
+          .semaphore = m_semaphore.get(),
+          .value = m_realFrameIx + 1 - framesInFlight
+        }
       };
       if (m_device->blockForSemaphores(cbDonePending) != ISemaphore::WAIT_RESULT::SUCCESS)
         return;
     }
+    const auto resourceIx = m_realFrameIx % MaxFramesInFlight;
 
-    m_inputSystem->getDefaultMouse(&m_mouse);
-    m_inputSystem->getDefaultKeyboard(&m_keyboard);
-
-    auto updatePresentationTimestamp = [&]()
-      {
-        m_currentImageAcquire = m_surface->acquireNextImage();
-
-        m_oracle.reportEndFrameRecord();
-        const auto timestamp = m_oracle.getNextPresentationTimeStamp();
-        m_oracle.reportBeginFrameRecord();
+    m_api->startCapture();
 
-        return timestamp;
-      };
+    update();
 
-    const auto nextPresentationTimestamp = updatePresentationTimestamp();
+    auto queue = getGraphicsQueue();
+    auto cmdbuf = m_cmdBufs[resourceIx].get();
 
-    if (!m_currentImageAcquire)
+    if (!keepRunning())
       return;
 
-    static bool first = true;
-    if (first)
-    {
-      m_api->startCapture();
-      first = false;
-    }
-
-    auto* const cmdbuf = m_cmdBufs.data()[resourceIx].get();
     cmdbuf->reset(IGPUCommandBuffer::RESET_FLAGS::RELEASE_RESOURCES_BIT);
     cmdbuf->begin(IGPUCommandBuffer::USAGE::ONE_TIME_SUBMIT_BIT);
     cmdbuf->beginDebugMarker("RaytracingPipelineApp Frame");
-    {
-      m_camera.beginInputProcessing(nextPresentationTimestamp);
-      m_mouse.consumeEvents([&](const IMouseEventChannel::range_t& events) -> void
-      {
-        if (m_camera.mouseProcess(events)) 
-        {
-          m_frameAccumulationCounter = 0;
-        }
-      }, m_logger.get());
-      m_keyboard.consumeEvents([&](const IKeyboardEventChannel::range_t& events) -> void
-      {
-        if (m_camera.keyboardProcess(events))
-        {
-          m_frameAccumulationCounter = 0;
-        }
-      }, m_logger.get());
-      m_camera.endInputProcessing(nextPresentationTimestamp);
-
-    }
 
     const auto viewMatrix = m_camera.getViewMatrix();
     const auto projectionMatrix = m_camera.getProjectionMatrix();
@@ -370,14 +545,12 @@ class RaytracingPipelineApp final : public examples::SimpleWindowedApplication,
     core::matrix4SIMD invModelViewProjectionMatrix;
     modelViewProjectionMatrix.getInverseTransform(invModelViewProjectionMatrix);
 
-    auto* queue = getGraphicsQueue();
-
     {
       IGPUCommandBuffer::SPipelineBarrierDependencyInfo::image_barrier_t imageBarriers[1];
       imageBarriers[0].barrier = {
          .dep = {
-           .srcStageMask = PIPELINE_STAGE_FLAGS::NONE,
-           .srcAccessMask = ACCESS_FLAGS::NONE,
+           .srcStageMask = PIPELINE_STAGE_FLAGS::FRAGMENT_SHADER_BIT, // previous frame read from framgent shader
+           .srcAccessMask = ACCESS_FLAGS::SHADER_READ_BITS,
            .dstStageMask = PIPELINE_STAGE_FLAGS::RAY_TRACING_SHADER_BIT,
            .dstAccessMask = ACCESS_FLAGS::SHADER_WRITE_BITS
         }
@@ -390,37 +563,39 @@ class RaytracingPipelineApp final : public examples::SimpleWindowedApplication,
         .baseArrayLayer = 0u,
         .layerCount = 1u
       };
-      imageBarriers[0].oldLayout = IImage::LAYOUT::UNDEFINED;
+      imageBarriers[0].oldLayout = m_frameAccumulationCounter == 0 ? IImage::LAYOUT::UNDEFINED : IImage::LAYOUT::READ_ONLY_OPTIMAL;
       imageBarriers[0].newLayout = IImage::LAYOUT::GENERAL;
       cmdbuf->pipelineBarrier(E_DEPENDENCY_FLAGS::EDF_NONE, { .imgBarriers = imageBarriers });
     }
 
-    // do ray query
-    SPushConstants pc;
-    pc.geometryInfoBuffer = m_geometryInfoBuffer->getDeviceAddress();
-    pc.frameCounter = m_frameAccumulationCounter;
-    const core::vector3df camPos = m_camera.getPosition().getAsVector3df();
-    pc.camPos = { camPos.X, camPos.Y, camPos.Z };
-    memcpy(&pc.invMVP, invModelViewProjectionMatrix.pointer(), sizeof(pc.invMVP));
-
-    cmdbuf->bindRayTracingPipeline(m_rayTracingPipeline.get());
-    cmdbuf->pushConstants(m_rayTracingPipeline->getLayout(), IShader::E_SHADER_STAGE::ESS_ALL_RAY_TRACING, 0, sizeof(SPushConstants), &pc);
-    cmdbuf->bindDescriptorSets(EPBP_RAY_TRACING, m_rayTracingPipeline->getLayout(), 0, 1, &m_renderDs.get());
-    cmdbuf->traceRays(m_shaderBindingTable.raygenGroupRegion, 
-      m_shaderBindingTable.missGroupsRegion,
-      m_shaderBindingTable.hitGroupsRegion,
-      m_shaderBindingTable.callableGroupsRegion,
-      WIN_W, WIN_H, 1);
-
-    // blit
+    // Trace Rays Pass
+    {
+      SPushConstants pc;
+      pc.geometryInfoBuffer = m_geometryInfoBuffer->getDeviceAddress();
+      pc.frameCounter = m_frameAccumulationCounter;
+      const core::vector3df camPos = m_camera.getPosition().getAsVector3df();
+      pc.camPos = { camPos.X, camPos.Y, camPos.Z };
+      memcpy(&pc.invMVP, invModelViewProjectionMatrix.pointer(), sizeof(pc.invMVP));
+
+      cmdbuf->bindRayTracingPipeline(m_rayTracingPipeline.get());
+      cmdbuf->pushConstants(m_rayTracingPipeline->getLayout(), IShader::E_SHADER_STAGE::ESS_ALL_RAY_TRACING, 0, sizeof(SPushConstants), &pc);
+      cmdbuf->bindDescriptorSets(EPBP_RAY_TRACING, m_rayTracingPipeline->getLayout(), 0, 1, &m_rayTracingDs.get());
+      cmdbuf->traceRays(m_shaderBindingTable.raygenGroupRegion,
+        m_shaderBindingTable.missGroupsRegion,
+        m_shaderBindingTable.hitGroupsRegion,
+        m_shaderBindingTable.callableGroupsRegion,
+        WIN_W, WIN_H, 1);
+    }
+
+    // pipeline barrier
     {
-      IGPUCommandBuffer::SPipelineBarrierDependencyInfo::image_barrier_t imageBarriers[2];
+      IGPUCommandBuffer::SPipelineBarrierDependencyInfo::image_barrier_t imageBarriers[1];
       imageBarriers[0].barrier = {
-         .dep = {
-           .srcStageMask = PIPELINE_STAGE_FLAGS::RAY_TRACING_SHADER_BIT,
-           .srcAccessMask = ACCESS_FLAGS::SHADER_WRITE_BITS,
-           .dstStageMask = PIPELINE_STAGE_FLAGS::BLIT_BIT,
-           .dstAccessMask = ACCESS_FLAGS::TRANSFER_WRITE_BIT
+        .dep = {
+          .srcStageMask = PIPELINE_STAGE_FLAGS::RAY_TRACING_SHADER_BIT,
+          .srcAccessMask = ACCESS_FLAGS::SHADER_WRITE_BITS,
+          .dstStageMask = PIPELINE_STAGE_FLAGS::COLOR_ATTACHMENT_OUTPUT_BIT,
+          .dstAccessMask = ACCESS_FLAGS::COLOR_ATTACHMENT_WRITE_BIT
         }
       };
       imageBarriers[0].image = m_hdrImage.get();
@@ -431,75 +606,58 @@ class RaytracingPipelineApp final : public examples::SimpleWindowedApplication,
         .baseArrayLayer = 0u,
         .layerCount = 1u
       };
-      imageBarriers[0].oldLayout = IImage::LAYOUT::UNDEFINED;
-      imageBarriers[0].newLayout = IImage::LAYOUT::TRANSFER_SRC_OPTIMAL;
-
-      imageBarriers[1].barrier = {
-         .dep = {
-           .srcStageMask = PIPELINE_STAGE_FLAGS::NONE,
-           .srcAccessMask = ACCESS_FLAGS::NONE,
-           .dstStageMask = PIPELINE_STAGE_FLAGS::BLIT_BIT,
-           .dstAccessMask = ACCESS_FLAGS::TRANSFER_WRITE_BIT
-        }
-      };
-      imageBarriers[1].image = m_surface->getSwapchainResources()->getImage(m_currentImageAcquire.imageIndex);
-      imageBarriers[1].subresourceRange = {
-        .aspectMask = IImage::EAF_COLOR_BIT,
-        .baseMipLevel = 0u,
-        .levelCount = 1u,
-        .baseArrayLayer = 0u,
-        .layerCount = 1u
-      };
-      imageBarriers[1].oldLayout = IImage::LAYOUT::UNDEFINED;
-      imageBarriers[1].newLayout = IImage::LAYOUT::TRANSFER_DST_OPTIMAL;
+      imageBarriers[0].oldLayout = IImage::LAYOUT::GENERAL;
+      imageBarriers[0].newLayout = IImage::LAYOUT::READ_ONLY_OPTIMAL;
 
       cmdbuf->pipelineBarrier(E_DEPENDENCY_FLAGS::EDF_NONE, { .imgBarriers = imageBarriers });
     }
 
     {
-      IGPUCommandBuffer::SImageBlit regions[] = { {
-        .srcMinCoord = {0,0,0},
-        .srcMaxCoord = {WIN_W,WIN_H,1},
-        .dstMinCoord = {0,0,0},
-        .dstMaxCoord = {WIN_W,WIN_H,1},
-        .layerCount = 1,
-        .srcBaseLayer = 0,
-        .dstBaseLayer = 0,
-        .srcMipLevel = 0,
-        .dstMipLevel = 0,
-        .aspectMask = IGPUImage::E_ASPECT_FLAGS::EAF_COLOR_BIT
-      } };
+			asset::SViewport viewport;
+			{
+				viewport.minDepth = 1.f;
+				viewport.maxDepth = 0.f;
+				viewport.x = 0u;
+				viewport.y = 0u;
+				viewport.width = WIN_W;
+				viewport.height = WIN_H;
+			}
+			cmdbuf->setViewport(0u, 1u, &viewport);
 
-      auto srcImg = m_hdrImage.get();
-      auto scRes = static_cast<CDefaultSwapchainFramebuffers*>(m_surface->getSwapchainResources());
-      auto dstImg = scRes->getImage(m_currentImageAcquire.imageIndex);
 
-      cmdbuf->blitImage(srcImg, IImage::LAYOUT::TRANSFER_SRC_OPTIMAL, dstImg, IImage::LAYOUT::TRANSFER_DST_OPTIMAL, regions, ISampler::ETF_NEAREST);
-    }
+			VkRect2D defaultScisors[] = { {.offset = {(int32_t)viewport.x, (int32_t)viewport.y}, .extent = {(uint32_t)viewport.width, (uint32_t)viewport.height}} };
+			cmdbuf->setScissor(defaultScisors);
 
-    // TODO: transition to present
-    {
-      IGPUCommandBuffer::SPipelineBarrierDependencyInfo::image_barrier_t imageBarriers[1];
-      imageBarriers[0].barrier = {
-         .dep = {
-           .srcStageMask = PIPELINE_STAGE_FLAGS::BLIT_BIT,
-           .srcAccessMask = ACCESS_FLAGS::TRANSFER_WRITE_BIT,
-           .dstStageMask = PIPELINE_STAGE_FLAGS::NONE,
-           .dstAccessMask = ACCESS_FLAGS::NONE
-        }
+      auto scRes = static_cast<CDefaultSwapchainFramebuffers*>(m_surface->getSwapchainResources());
+      const VkRect2D currentRenderArea =
+      {
+        .offset = {0,0},
+        .extent = {m_window->getWidth(),m_window->getHeight()}
       };
-      imageBarriers[0].image = m_surface->getSwapchainResources()->getImage(m_currentImageAcquire.imageIndex);
-      imageBarriers[0].subresourceRange = {
-        .aspectMask = IImage::EAF_COLOR_BIT,
-        .baseMipLevel = 0u,
-        .levelCount = 1u,
-        .baseArrayLayer = 0u,
-        .layerCount = 1u
+      const IGPUCommandBuffer::SClearColorValue clearColor = { .float32 = {0.f,0.f,0.f,1.f} };
+      const IGPUCommandBuffer::SRenderpassBeginInfo info =
+      {
+        .framebuffer = scRes->getFramebuffer(m_currentImageAcquire.imageIndex),
+        .colorClearValues = &clearColor,
+        .depthStencilClearValues = nullptr,
+        .renderArea = currentRenderArea
       };
-      imageBarriers[0].oldLayout = IImage::LAYOUT::TRANSFER_DST_OPTIMAL;
-      imageBarriers[0].newLayout = IImage::LAYOUT::PRESENT_SRC;
+      nbl::video::ISemaphore::SWaitInfo waitInfo = { .semaphore = m_semaphore.get(), .value = m_realFrameIx + 1u };
+
+      cmdbuf->beginRenderPass(info, IGPUCommandBuffer::SUBPASS_CONTENTS::INLINE);
+
+      cmdbuf->bindGraphicsPipeline(m_presentPipeline.get());
+      cmdbuf->bindDescriptorSets(EPBP_GRAPHICS, m_presentPipeline->getLayout(), 0, 1u, &m_presentDs.get());
+      ext::FullScreenTriangle::recordDrawCall(cmdbuf);
+
+      const auto uiParams = m_ui.manager->getCreationParameters();
+      auto* uiPipeline = m_ui.manager->getPipeline();
+      cmdbuf->bindGraphicsPipeline(uiPipeline);
+      cmdbuf->bindDescriptorSets(EPBP_GRAPHICS, uiPipeline->getLayout(), uiParams.resources.texturesInfo.setIx, 1u, &m_ui.descriptorSet.get());
+      m_ui.manager->render(cmdbuf, waitInfo);
+
+      cmdbuf->endRenderPass();
 
-      cmdbuf->pipelineBarrier(E_DEPENDENCY_FLAGS::EDF_NONE, { .imgBarriers = imageBarriers });
     }
 
     cmdbuf->endDebugMarker();
@@ -538,32 +696,102 @@ class RaytracingPipelineApp final : public examples::SimpleWindowedApplication,
             }
           };
 
-          if (queue->submit(infos) == IQueue::RESULT::SUCCESS)
-          {
-            const nbl::video::ISemaphore::SWaitInfo waitInfos[] =
-            { {
-              .semaphore = m_semaphore.get(),
-              .value = m_realFrameIx
-            } };
+          updateGUIDescriptorSet();
 
-            m_device->blockForSemaphores(waitInfos); // this is not solution, quick wa to not throw validation errors
-          }
-          else
-            --m_realFrameIx;
+          if (queue->submit(infos) != IQueue::RESULT::SUCCESS)
+            m_realFrameIx--;
         }
       }
 
-      std::string caption = "[Nabla Engine] Ray Tracing Pipeline";
-      {
-        caption += ", displaying [all objects]";
-        m_window->setCaption(caption);
-      }
+      m_window->setCaption("[Nabla Engine] Ray Tracing Pipeline");
       m_surface->present(m_currentImageAcquire.imageIndex, rendered);
     }
-
+    m_api->endCapture();
     m_frameAccumulationCounter++;
   }
 
+  inline void update()
+  {
+    m_camera.setMoveSpeed(moveSpeed);
+    m_camera.setRotateSpeed(rotateSpeed);
+
+    static std::chrono::microseconds previousEventTimestamp{};
+
+    m_inputSystem->getDefaultMouse(&m_mouse);
+    m_inputSystem->getDefaultKeyboard(&m_keyboard);
+
+    auto updatePresentationTimestamp = [&]()
+      {
+        m_currentImageAcquire = m_surface->acquireNextImage();
+
+        m_oracle.reportEndFrameRecord();
+        const auto timestamp = m_oracle.getNextPresentationTimeStamp();
+        m_oracle.reportBeginFrameRecord();
+
+        return timestamp;
+      };
+
+    const auto nextPresentationTimestamp = updatePresentationTimestamp();
+
+    struct
+    {
+      std::vector<SMouseEvent> mouse{};
+      std::vector<SKeyboardEvent> keyboard{};
+    } capturedEvents;
+
+    m_camera.beginInputProcessing(nextPresentationTimestamp);
+    {
+      bool camera_moved = false;
+      m_mouse.consumeEvents([&](const IMouseEventChannel::range_t& events) -> void
+        {
+          camera_moved |= m_camera.mouseProcess(events); // don't capture the events, only let camera handle them with its impl
+
+          for (const auto& e : events) // here capture
+          {
+            if (e.timeStamp < previousEventTimestamp)
+              continue;
+
+            previousEventTimestamp = e.timeStamp;
+            capturedEvents.mouse.emplace_back(e);
+
+          }
+        }, m_logger.get());
+
+      m_keyboard.consumeEvents([&](const IKeyboardEventChannel::range_t& events) -> void
+        {
+          camera_moved |= m_camera.keyboardProcess(events); // don't capture the events, only let camera handle them with its impl
+
+          for (const auto& e : events) // here capture
+          {
+            if (e.timeStamp < previousEventTimestamp)
+              continue;
+
+            previousEventTimestamp = e.timeStamp;
+            capturedEvents.keyboard.emplace_back(e);
+          }
+        }, m_logger.get());
+
+      if (camera_moved)
+        m_frameAccumulationCounter = 0;
+    }
+    m_camera.endInputProcessing(nextPresentationTimestamp);
+
+    const core::SRange<const nbl::ui::SMouseEvent> mouseEvents(capturedEvents.mouse.data(), capturedEvents.mouse.data() + capturedEvents.mouse.size());
+    const core::SRange<const nbl::ui::SKeyboardEvent> keyboardEvents(capturedEvents.keyboard.data(), capturedEvents.keyboard.data() + capturedEvents.keyboard.size());
+    const auto cursorPosition = m_window->getCursorControl()->getPosition();
+    const auto mousePosition = float32_t2(cursorPosition.x, cursorPosition.y) - float32_t2(m_window->getX(), m_window->getY());
+
+    const ext::imgui::UI::SUpdateParameters params =
+    {
+      .mousePosition = mousePosition,
+      .displaySize = { m_window->getWidth(), m_window->getHeight() },
+      .mouseEvents = mouseEvents,
+      .keyboardEvents = keyboardEvents
+    };
+
+    m_ui.manager->update(params);
+  }
+
   inline bool keepRunning() override
   {
     if (m_surface->irrecoverable())
@@ -673,9 +901,9 @@ class RaytracingPipelineApp final : public examples::SimpleWindowedApplication,
         transform.setTranslation(nbl::core::vectorSIMDf(x, y, z, 0));
         return transform;
       };
-    
+
     core::matrix3x4SIMD planeTransform;
-    planeTransform.setRotation(quaternion::fromAngleAxis(core::radians(-90.0f), vector3df_SIMD{1, 0, 0}));
+    planeTransform.setRotation(quaternion::fromAngleAxis(core::radians(-90.0f), vector3df_SIMD{ 1, 0, 0 }));
 
     const auto cpuObjects = std::array{
       ReferenceObjectCpu {
@@ -842,7 +1070,7 @@ class RaytracingPipelineApp final : public examples::SimpleWindowedApplication,
           .indexCount = cpuObject.data.indexCount,
           .material = cpuObject.material,
           .transform = cpuObject.transform,
-        });
+          });
       }
 
       for (uint32_t i = 0; i < m_gpuObjects.size(); i++)
@@ -912,7 +1140,7 @@ class RaytracingPipelineApp final : public examples::SimpleWindowedApplication,
     cpuBufferParams.size = bufferSize;
     auto cpuBuffer = ICPUBuffer::create(std::move(cpuBufferParams));
     uint8_t* pData = reinterpret_cast<uint8_t*>(cpuBuffer->getPointer());
-    
+
     // copy raygen region
     memcpy(pData, pipeline->getRaygenGroupShaderHandle().data(), handleSize);
 
@@ -956,7 +1184,7 @@ class RaytracingPipelineApp final : public examples::SimpleWindowedApplication,
 
   bool createAccelerationStructures(video::CThreadSafeQueueAdapter* queue)
   {
-    IQueryPool::SCreationParams qParams{ .queryCount = static_cast<uint32_t>(m_gpuObjects.size()), .queryType = IQueryPool::ACCELERATION_STRUCTURE_COMPACTED_SIZE};
+    IQueryPool::SCreationParams qParams{ .queryCount = static_cast<uint32_t>(m_gpuObjects.size()), .queryType = IQueryPool::ACCELERATION_STRUCTURE_COMPACTED_SIZE };
     smart_refctd_ptr<IQueryPool> queryPool = m_device->createQueryPool(std::move(qParams));
 
     auto pool = m_device->createCommandPool(queue->getFamilyIndex(), IGPUCommandPool::CREATE_FLAGS::RESET_COMMAND_BUFFER_BIT | IGPUCommandPool::CREATE_FLAGS::TRANSIENT_BIT);
@@ -1253,7 +1481,7 @@ class RaytracingPipelineApp final : public examples::SimpleWindowedApplication,
   smart_refctd_ptr<CSimpleResizeSurface<ISimpleManagedSurface::ISwapchainResources>> m_surface;
   smart_refctd_ptr<ISemaphore> m_semaphore;
   uint64_t m_realFrameIx = 0;
-  uint32_t m_frameAccumulationCounter = -1;
+  uint32_t m_frameAccumulationCounter = 0;
   std::array<smart_refctd_ptr<IGPUCommandBuffer>, MaxFramesInFlight> m_cmdBufs;
   ISimpleManagedSurface::SAcquireResult m_currentImageAcquire = {};
 
@@ -1261,10 +1489,26 @@ class RaytracingPipelineApp final : public examples::SimpleWindowedApplication,
   InputSystem::ChannelReader<IMouseEventChannel> m_mouse;
   InputSystem::ChannelReader<IKeyboardEventChannel> m_keyboard;
 
+  float fov = 60.f, zNear = 0.1f, zFar = 10000.f, moveSpeed = 1.f, rotateSpeed = 1.f;
+  float viewWidth = 10.f;
+  float camYAngle = 165.f / 180.f * 3.14159f;
+  float camXAngle = 32.f / 180.f * 3.14159f;
   Camera m_camera = Camera(core::vectorSIMDf(0, 0, 0), core::vectorSIMDf(0, 0, 0), core::matrix4SIMD());
-  CameraView m_oldCameraView;
   video::CDumbPresentationOracle m_oracle;
 
+  struct C_UI
+  {
+    nbl::core::smart_refctd_ptr<nbl::ext::imgui::UI> manager;
+
+    struct
+    {
+      core::smart_refctd_ptr<video::IGPUSampler> gui, scene;
+    } samplers;
+
+    core::smart_refctd_ptr<IGPUDescriptorSet> descriptorSet;
+  } m_ui;
+  core::smart_refctd_ptr<IDescriptorPool> m_guiDescriptorSetPool;
+
   std::vector<ReferenceObjectGpu> m_gpuObjects;
 
   std::vector<smart_refctd_ptr<IGPUBottomLevelAccelerationStructure>> m_gpuBlasList;
@@ -1272,17 +1516,19 @@ class RaytracingPipelineApp final : public examples::SimpleWindowedApplication,
   smart_refctd_ptr<IGPUBuffer> m_instanceBuffer;
 
   smart_refctd_ptr<IGPUBuffer> m_geometryInfoBuffer;
-  ShaderBindingTable m_shaderBindingTable;
   smart_refctd_ptr<IGPUImage> m_hdrImage;
+  smart_refctd_ptr<IGPUImageView> m_hdrImageView;
 
+  smart_refctd_ptr<IDescriptorPool> m_rayTracingDsPool;
+  smart_refctd_ptr<IGPUDescriptorSet> m_rayTracingDs;
   smart_refctd_ptr<IGPURayTracingPipeline> m_rayTracingPipeline;
-  smart_refctd_ptr<IGPUDescriptorSet> m_renderDs;
-  smart_refctd_ptr<IDescriptorPool> m_renderPool;
+  ShaderBindingTable m_shaderBindingTable;
 
-  smart_refctd_ptr<CAssetConverter> m_converter;
-  smart_refctd_ptr<IGPUBuffer> m_sbtBuffer;
+  smart_refctd_ptr<IGPUDescriptorSet> m_presentDs;
+  smart_refctd_ptr<IDescriptorPool> m_presentDsPool;
+  smart_refctd_ptr<IGPUGraphicsPipeline> m_presentPipeline;
 
-  uint16_t gcIndex = {};
+  smart_refctd_ptr<CAssetConverter> m_converter;
 
 };
 

From 6ac8f88906b02ad2464961c9106d79474bfa191d Mon Sep 17 00:00:00 2001
From: kevyuu <kevin.kayu@gmail.com>
Date: Wed, 22 Jan 2025 22:49:22 +0700
Subject: [PATCH 004/529] Implement multiple light type

Signed-off-by: kevyuu <kevin.kayu@gmail.com>
---
 .../app_resources/common.hlsl                 |  37 ++++-
 .../app_resources/raytrace.rchit.hlsl         | 137 +++++++++++-------
 71_RayTracingPipeline/main.cpp                |  60 ++++++--
 3 files changed, 167 insertions(+), 67 deletions(-)

diff --git a/71_RayTracingPipeline/app_resources/common.hlsl b/71_RayTracingPipeline/app_resources/common.hlsl
index 3b6c36abc..d28e646fe 100644
--- a/71_RayTracingPipeline/app_resources/common.hlsl
+++ b/71_RayTracingPipeline/app_resources/common.hlsl
@@ -29,14 +29,47 @@ struct SGeomInfo
     Material material;
 };
 
+enum E_LIGHT_TYPE : int32_t
+{
+    ELT_DIRECTIONAL,
+    ELT_POINT,
+    ELT_SPOT,
+    ELT_COUNT
+};
+
+struct Light
+{
+    float32_t3 direction;
+    float32_t3 position;
+    float32_t intensity;
+    float32_t innerCutoff;
+    float32_t outerCutoff;
+    int32_t type;
+
+#ifndef __HLSL_VERSION
+    bool operator==(const Light&) const = default;
+#endif
+
+};
+
 struct SPushConstants
 {
-    uint64_t geometryInfoBuffer;
-    uint32_t frameCounter;
+    Light light;
 
     float32_t3 camPos;
     float32_t4x4 invMVP;
 
+    uint64_t geometryInfoBuffer;
+    uint32_t frameCounter;
+};
+
+
+struct RayLight
+{
+    float32_t3  inHitPosition;
+    float32_t outLightDistance;
+    float32_t3  outLightDir;
+    float32_t outIntensity;
 };
 
 #ifdef __HLSL_VERSION
diff --git a/71_RayTracingPipeline/app_resources/raytrace.rchit.hlsl b/71_RayTracingPipeline/app_resources/raytrace.rchit.hlsl
index b77412ff7..d8c527389 100644
--- a/71_RayTracingPipeline/app_resources/raytrace.rchit.hlsl
+++ b/71_RayTracingPipeline/app_resources/raytrace.rchit.hlsl
@@ -13,7 +13,8 @@ float3 unpackNormals3x10(uint32_t v)
     return clamp(float3(pn) / 511.0, -1.0, 1.0);
 }
 
-struct VertexData {
+struct VertexData
+{
     float32_t3 position;
     float32_t3 normal;
 };
@@ -33,30 +34,30 @@ VertexData fetchVertexData(int instID, int primID, SGeomInfo geom, float2 bary)
     {
         case 0: // EIT_16BIT
         {
-            i0 = uint32_t(vk::RawBufferLoad<uint16_t>(indexBufferAddress + (idxOffset + 0) * sizeof(uint16_t), 2u));
-            i1 = uint32_t(vk::RawBufferLoad<uint16_t>(indexBufferAddress + (idxOffset + 1) * sizeof(uint16_t), 2u));
-            i2 = uint32_t(vk::RawBufferLoad<uint16_t>(indexBufferAddress + (idxOffset + 2) * sizeof(uint16_t), 2u));
-        }
-        break;
+                i0 = uint32_t(vk::RawBufferLoad < uint16_t > (indexBufferAddress + (idxOffset + 0) * sizeof(uint16_t), 2u));
+                i1 = uint32_t(vk::RawBufferLoad < uint16_t > (indexBufferAddress + (idxOffset + 1) * sizeof(uint16_t), 2u));
+                i2 = uint32_t(vk::RawBufferLoad < uint16_t > (indexBufferAddress + (idxOffset + 2) * sizeof(uint16_t), 2u));
+            }
+            break;
         case 1: // EIT_32BIT
         {
-            i0 = vk::RawBufferLoad<uint32_t>(indexBufferAddress + (idxOffset + 0) * sizeof(uint32_t));
-            i1 = vk::RawBufferLoad<uint32_t>(indexBufferAddress + (idxOffset + 1) * sizeof(uint32_t));
-            i2 = vk::RawBufferLoad<uint32_t>(indexBufferAddress + (idxOffset + 2) * sizeof(uint32_t));
-        }
-        break;
-        default:    // EIT_NONE
+                i0 = vk::RawBufferLoad < uint32_t > (indexBufferAddress + (idxOffset + 0) * sizeof(uint32_t));
+                i1 = vk::RawBufferLoad < uint32_t > (indexBufferAddress + (idxOffset + 1) * sizeof(uint32_t));
+                i2 = vk::RawBufferLoad < uint32_t > (indexBufferAddress + (idxOffset + 2) * sizeof(uint32_t));
+            }
+            break;
+        default: // EIT_NONE
         {
-            i0 = idxOffset;
-            i1 = idxOffset + 1;
-            i2 = idxOffset + 2;
-        }
+                i0 = idxOffset;
+                i1 = idxOffset + 1;
+                i2 = idxOffset + 2;
+            }
     }
 
     const uint64_t vertexBufferAddress = geom.vertexBufferAddress;
-	float32_t3 p0 = vk::RawBufferLoad<float32_t3>(vertexBufferAddress + i0 * vertexStride);
-	float32_t3 p1 = vk::RawBufferLoad<float32_t3>(vertexBufferAddress + i1 * vertexStride);
-	float32_t3 p2 = vk::RawBufferLoad<float32_t3>(vertexBufferAddress + i2 * vertexStride);
+    float32_t3 p0 = vk::RawBufferLoad < float32_t3 > (vertexBufferAddress + i0 * vertexStride);
+    float32_t3 p1 = vk::RawBufferLoad < float32_t3 > (vertexBufferAddress + i1 * vertexStride);
+    float32_t3 p2 = vk::RawBufferLoad < float32_t3 > (vertexBufferAddress + i2 * vertexStride);
 
     const uint64_t normalVertexBufferAddress = vertexBufferAddress + s_offsetsToNormalBytes[objType];
     float3 n0, n1, n2;
@@ -64,42 +65,45 @@ VertexData fetchVertexData(int instID, int primID, SGeomInfo geom, float2 bary)
     {
         case OT_CUBE:
         {
-            uint32_t v0 = vk::RawBufferLoad<uint32_t>(normalVertexBufferAddress + i0 * vertexStride, 2u);
-            uint32_t v1 = vk::RawBufferLoad<uint32_t>(normalVertexBufferAddress + i1 * vertexStride, 2u);
-            uint32_t v2 = vk::RawBufferLoad<uint32_t>(normalVertexBufferAddress + i2 * vertexStride, 2u);
-
-            n0 = normalize(nbl::hlsl::spirv::unpackSnorm4x8(v0).xyz);
-            n1 = normalize(nbl::hlsl::spirv::unpackSnorm4x8(v1).xyz);
-            n2 = normalize(nbl::hlsl::spirv::unpackSnorm4x8(v2).xyz);
-        }
-        break;
+                uint32_t v0 = vk::RawBufferLoad < uint32_t > (normalVertexBufferAddress + i0 * vertexStride, 2u);
+                uint32_t v1 = vk::RawBufferLoad < uint32_t > (normalVertexBufferAddress + i1 * vertexStride, 2u);
+                uint32_t v2 = vk::RawBufferLoad < uint32_t > (normalVertexBufferAddress + i2 * vertexStride, 2u);
+
+                n0 = normalize(nbl::hlsl::spirv::unpackSnorm4x8(v0).xyz);
+                n1 = normalize(nbl::hlsl::spirv::unpackSnorm4x8(v1).xyz);
+                n2 = normalize(nbl::hlsl::spirv::unpackSnorm4x8(v2).xyz);
+            }
+            break;
         case OT_SPHERE:
         case OT_CYLINDER:
         case OT_ARROW:
         case OT_CONE:
         {
-            uint32_t v0 = vk::RawBufferLoad<uint32_t>(normalVertexBufferAddress + i0 * vertexStride);
-            uint32_t v1 = vk::RawBufferLoad<uint32_t>(normalVertexBufferAddress + i1 * vertexStride);
-            uint32_t v2 = vk::RawBufferLoad<uint32_t>(normalVertexBufferAddress + i2 * vertexStride);
-
-            n0 = normalize(unpackNormals3x10(v0));
-            n1 = normalize(unpackNormals3x10(v1));
-            n2 = normalize(unpackNormals3x10(v2));
-        }
-        break;
+                uint32_t v0 = vk::RawBufferLoad < uint32_t > (normalVertexBufferAddress + i0 * vertexStride);
+                uint32_t v1 = vk::RawBufferLoad < uint32_t > (normalVertexBufferAddress + i1 * vertexStride);
+                uint32_t v2 = vk::RawBufferLoad < uint32_t > (normalVertexBufferAddress + i2 * vertexStride);
+
+                n0 = normalize(unpackNormals3x10(v0));
+                n1 = normalize(unpackNormals3x10(v1));
+                n2 = normalize(unpackNormals3x10(v2));
+            }
+            break;
         case OT_RECTANGLE:
         case OT_DISK:
         case OT_ICOSPHERE:
         default:
         {
-            n0 = normalize(vk::RawBufferLoad<float3>(normalVertexBufferAddress + i0 * vertexStride));
-            n1 = normalize(vk::RawBufferLoad<float3>(normalVertexBufferAddress + i1 * vertexStride));
-            n2 = normalize(vk::RawBufferLoad<float3>(normalVertexBufferAddress + i2 * vertexStride));
-        }
+                n0 = normalize(vk::RawBufferLoad <
+                float3 > (normalVertexBufferAddress + i0 * vertexStride));
+                n1 = normalize(vk::RawBufferLoad <
+                float3 > (normalVertexBufferAddress + i1 * vertexStride));
+                n2 = normalize(vk::RawBufferLoad <
+                float3 > (normalVertexBufferAddress + i2 * vertexStride));
+            }
     }
 
     float3 barycentrics = float3(0.0, bary);
-    barycentrics.x = 1.0 - barycentrics.y - barycentrics.z;        
+    barycentrics.x = 1.0 - barycentrics.y - barycentrics.z;
 
     VertexData data;
     data.position = barycentrics.x * p0 + barycentrics.y * p1 + barycentrics.z * p2;
@@ -110,27 +114,52 @@ VertexData fetchVertexData(int instID, int primID, SGeomInfo geom, float2 bary)
 [shader("closesthit")]
 void main(inout ColorPayload p, in BuiltInTriangleIntersectionAttributes attribs)
 {
-	const int instID = InstanceID();
-	const int primID = PrimitiveIndex();
-    const SGeomInfo geom = vk::RawBufferLoad<SGeomInfo>(pc.geometryInfoBuffer + instID * sizeof(SGeomInfo));
+    const int instID = InstanceID();
+    const int primID = PrimitiveIndex();
+    const SGeomInfo geom = vk::RawBufferLoad < SGeomInfo > (pc.geometryInfoBuffer + instID * sizeof(SGeomInfo));
     const VertexData vertexData = fetchVertexData(instID, primID, geom, attribs.barycentrics);
     const float32_t3 worldPosition = mul(ObjectToWorld3x4(), float32_t4(vertexData.position, 1));
     const float32_t3 worldNormal = mul(vertexData.normal, WorldToObject3x4()).xyz;
 
-    const float32_t lightIntensity = 1;
-    const float32_t3 lightDirection = normalize(float32_t3(1, 1, -1));
+    RayLight cLight;
+    cLight.inHitPosition = worldPosition;
+    if (pc.light.type == 0)
+    {
+        cLight.outLightDir = normalize(-pc.light.direction);
+        cLight.outIntensity = 1.0;
+        cLight.outLightDistance = 10000000;
+    }
+    if (pc.light.type == 1)
+    {
+        float32_t3 lDir = pc.light.position - cLight.inHitPosition;
+        float lightDistance = length(lDir);
+        cLight.outIntensity = pc.light.intensity / (lightDistance * lightDistance);
+        cLight.outLightDir = normalize(lDir);
+        cLight.outLightDistance = lightDistance;
+    }
+    else if (pc.light.type == 2)
+    {
+        float32_t3 lDir = pc.light.position - cLight.inHitPosition;
+        cLight.outLightDistance = length(lDir);
+        cLight.outIntensity = pc.light.intensity / (cLight.outLightDistance * cLight.outLightDistance);
+        cLight.outLightDir = normalize(lDir);
+        float theta = dot(cLight.outLightDir, normalize(-pc.light.direction));
+        float epsilon = pc.light.innerCutoff - pc.light.outerCutoff;
+        float spotIntensity = clamp((theta - pc.light.outerCutoff) / epsilon, 0.0, 1.0);
+        cLight.outIntensity *= spotIntensity;
+    }
 
-    float32_t3 diffuse = computeDiffuse(geom.material, lightDirection, worldNormal);
+    float32_t3 diffuse = computeDiffuse(geom.material, cLight.outLightDir, worldNormal);
     float32_t3 specular = float32_t3(0, 0, 0);
     float32_t attenuation = 1;
 
-    if (dot(worldNormal, lightDirection) > 0)
+    if (dot(worldNormal, cLight.outLightDir) > 0)
     {
         RayDesc rayDesc;
-    rayDesc.Origin = WorldRayOrigin() + WorldRayDirection() * RayTCurrent() + worldNormal * 0.02f;
-        rayDesc.Direction = lightDirection;
+        rayDesc.Origin = WorldRayOrigin() + WorldRayDirection() * RayTCurrent() + worldNormal * 0.02f;
+        rayDesc.Direction = cLight.outLightDir;
         rayDesc.TMin = 0.001;
-        rayDesc.TMax = 1000;
+        rayDesc.TMax = cLight.outLightDistance;
 
         uint flags = RAY_FLAG_SKIP_CLOSEST_HIT_SHADER;
         ShadowPayload shadowPayload;
@@ -145,8 +174,8 @@ void main(inout ColorPayload p, in BuiltInTriangleIntersectionAttributes attribs
         }
         else
         {
-            specular = computeSpecular(geom.material, WorldRayDirection(), lightDirection, worldNormal);
+            specular = computeSpecular(geom.material, WorldRayDirection(), cLight.outLightDir, worldNormal);
         }
     }
-	p.hitValue = (lightIntensity * attenuation * (specular + diffuse));	
+    p.hitValue = (cLight.outIntensity * attenuation * (diffuse + specular));
 }
\ No newline at end of file
diff --git a/71_RayTracingPipeline/main.cpp b/71_RayTracingPipeline/main.cpp
index 4fc992c90..c83498896 100644
--- a/71_RayTracingPipeline/main.cpp
+++ b/71_RayTracingPipeline/main.cpp
@@ -15,12 +15,10 @@ class RaytracingPipelineApp final : public examples::SimpleWindowedApplication,
   constexpr static inline uint32_t MaxFramesInFlight = 3u;
   constexpr static inline uint8_t MaxUITextureCount = 1u;
 
-  enum E_LIGHT_TYPE : uint8_t
-  {
-    ELT_SPHERE,
-    ELT_TRIANGLE,
-    ELT_RECTANGLE,
-    ELT_COUNT
+  static constexpr const char* s_lightTypeNames[E_LIGHT_TYPE::ELT_COUNT] = {
+    "Directional",
+    "Point",
+    "Spot"
   };
 
   constexpr static inline clock_t::duration DisplayImageDuration = std::chrono::milliseconds(900);
@@ -36,7 +34,8 @@ class RaytracingPipelineApp final : public examples::SimpleWindowedApplication,
 
 public:
   inline RaytracingPipelineApp(const path& _localInputCWD, const path& _localOutputCWD, const path& _sharedInputCWD, const path& _sharedOutputCWD)
-    : IApplicationFramework(_localInputCWD, _localOutputCWD, _sharedInputCWD, _sharedOutputCWD) {
+    : IApplicationFramework(_localInputCWD, _localOutputCWD, _sharedInputCWD, _sharedOutputCWD)
+  {
   }
 
   inline SPhysicalDeviceFeatures getRequiredDeviceFeatures() const override
@@ -229,13 +228,11 @@ class RaytracingPipelineApp final : public examples::SimpleWindowedApplication,
     auto assetManager = make_smart_refctd_ptr<nbl::asset::IAssetManager>(smart_refctd_ptr(system));
     auto* geometryCreator = assetManager->getGeometryCreator();
 
-    auto cQueue = getComputeQueue();
-
     // create geometry objects
     if (!createGeometries(gQueue, geometryCreator))
       return logFail("Could not create geometries from geometry creator");
 
-    if (!createAccelerationStructures(cQueue))
+    if (!createAccelerationStructures(getComputeQueue()))
       return logFail("Could not create acceleration structures");
 
     ISampler::SParams samplerParams = {
@@ -449,6 +446,37 @@ class RaytracingPipelineApp final : public examples::SimpleWindowedApplication,
         ImGui::SliderFloat("Fov", &fov, 20.f, 150.f);
         ImGui::SliderFloat("zNear", &zNear, 0.1f, 100.f);
         ImGui::SliderFloat("zFar", &zFar, 110.f, 10000.f);
+        Light m_oldLight = m_light;
+        ImGui::ListBox("LightType", &m_light.type, s_lightTypeNames, ELT_COUNT);
+        if (m_light.type == ELT_DIRECTIONAL)
+        {
+          ImGui::SliderFloat3("Light Direction", &m_light.direction.x, -1.f, 1.f);
+        } else if (m_light.type == ELT_POINT)
+        {
+          ImGui::SliderFloat3("Light Position", &m_light.position.x, -20.f, 20.f);
+          ImGui::SliderFloat("Light Intensity", &m_light.intensity, 0.0f, 500.f);
+        } else if (m_light.type == ELT_SPOT)
+        {
+          ImGui::SliderFloat3("Light Direction", &m_light.direction.x, -1.f, 1.f);
+          ImGui::SliderFloat3("Light Position", &m_light.position.x, -20.f, 20.f);
+          ImGui::SliderFloat("Light Intensity", &m_light.intensity, 0.0f, 500.f);
+
+          float32_t dInnerCutoff = degrees(acos(m_light.innerCutoff));
+          float32_t dOuterCutoff = degrees(acos(m_light.outerCutoff));
+          if (ImGui::SliderFloat("Light Inner Cutoff", &dInnerCutoff, 0.0f, 45.0f))
+          {
+            dInnerCutoff = dInnerCutoff > dOuterCutoff ? dOuterCutoff : dInnerCutoff;
+            m_light.innerCutoff = cos(radians(dInnerCutoff));
+          }
+          if (ImGui::SliderFloat("Light Outer Cutoff", &dOuterCutoff, 0.0f, 45.0f))
+          {
+            m_light.outerCutoff = cos(radians(dOuterCutoff));
+          }
+        }
+        if (m_light != m_oldLight)
+        {
+          m_frameAccumulationCounter = 0;
+        }
 
         ImGui::Text("X: %f Y: %f", io.MousePos.x, io.MousePos.y);
 
@@ -571,6 +599,7 @@ class RaytracingPipelineApp final : public examples::SimpleWindowedApplication,
     // Trace Rays Pass
     {
       SPushConstants pc;
+      pc.light = m_light;
       pc.geometryInfoBuffer = m_geometryInfoBuffer->getDeviceAddress();
       pc.frameCounter = m_frameAccumulationCounter;
       const core::vector3df camPos = m_camera.getPosition().getAsVector3df();
@@ -1494,6 +1523,16 @@ class RaytracingPipelineApp final : public examples::SimpleWindowedApplication,
   float camYAngle = 165.f / 180.f * 3.14159f;
   float camXAngle = 32.f / 180.f * 3.14159f;
   Camera m_camera = Camera(core::vectorSIMDf(0, 0, 0), core::vectorSIMDf(0, 0, 0), core::matrix4SIMD());
+
+  Light m_light = {
+    .direction = {-1.0f, -1.0f, -0.4f},
+    .position = {10.0f, 15.0f, 8.0f},
+    .intensity = 100.0f,
+    .innerCutoff = 0.939692621f, // {cos(radians(20.0f))},
+    .outerCutoff = 0.866025404f, // {cos(radians(30.0f))}, 
+    .type = ELT_DIRECTIONAL
+  };
+
   video::CDumbPresentationOracle m_oracle;
 
   struct C_UI
@@ -1531,5 +1570,4 @@ class RaytracingPipelineApp final : public examples::SimpleWindowedApplication,
   smart_refctd_ptr<CAssetConverter> m_converter;
 
 };
-
 NBL_MAIN_FUNC(RaytracingPipelineApp)

From 9091da112b9b5763d2b340045e261fe0032d6bdc Mon Sep 17 00:00:00 2001
From: kevyuu <kevin.kayu@gmail.com>
Date: Thu, 23 Jan 2025 00:02:44 +0700
Subject: [PATCH 005/529] Implement callable shader

Signed-off-by: kevyuu <kevin.kayu@gmail.com>
---
 .../app_resources/common.hlsl                 | 12 ++++----
 .../app_resources/lgiht_spot.rcall.hlsl       | 16 ++++++++++
 .../light_directional.rcall.hlsl              | 11 +++++++
 .../app_resources/light_point.rcall.hlsl      | 13 ++++++++
 .../app_resources/raytrace.rahit.hlsl         |  3 +-
 .../app_resources/raytrace.rchit.hlsl         | 30 ++-----------------
 .../app_resources/raytrace.rgen.hlsl          |  1 -
 71_RayTracingPipeline/main.cpp                | 21 +++++++++----
 8 files changed, 66 insertions(+), 41 deletions(-)
 create mode 100644 71_RayTracingPipeline/app_resources/lgiht_spot.rcall.hlsl
 create mode 100644 71_RayTracingPipeline/app_resources/light_directional.rcall.hlsl
 create mode 100644 71_RayTracingPipeline/app_resources/light_point.rcall.hlsl

diff --git a/71_RayTracingPipeline/app_resources/common.hlsl b/71_RayTracingPipeline/app_resources/common.hlsl
index d28e646fe..ce82181c3 100644
--- a/71_RayTracingPipeline/app_resources/common.hlsl
+++ b/71_RayTracingPipeline/app_resources/common.hlsl
@@ -66,9 +66,9 @@ struct SPushConstants
 
 struct RayLight
 {
-    float32_t3  inHitPosition;
+    float32_t3 inHitPosition;
     float32_t outLightDistance;
-    float32_t3  outLightDir;
+    float32_t3 outLightDir;
     float32_t outIntensity;
 };
 
@@ -76,14 +76,14 @@ struct RayLight
 
 struct [raypayload] ColorPayload
 {
-	float32_t3 hitValue;
-    uint32_t seed;
+	float32_t3 hitValue : read(caller) : write(closesthit,miss);
+    uint32_t seed : read(closesthit,anyhit) : write(caller);
 };
 
 struct [raypayload] ShadowPayload
 {
-	bool isShadowed;
-    uint32_t seed;
+	bool isShadowed : read(caller) : write(caller,miss);
+    uint32_t seed : read(anyhit) : write(caller);
 };
 
 enum ObjectType : uint32_t  // matches c++
diff --git a/71_RayTracingPipeline/app_resources/lgiht_spot.rcall.hlsl b/71_RayTracingPipeline/app_resources/lgiht_spot.rcall.hlsl
new file mode 100644
index 000000000..5dbc5a830
--- /dev/null
+++ b/71_RayTracingPipeline/app_resources/lgiht_spot.rcall.hlsl
@@ -0,0 +1,16 @@
+#include "common.hlsl"
+
+[[vk::push_constant]] SPushConstants pc;
+
+[shader("callable")]
+void main(inout RayLight cLight)
+{
+    float32_t3 lDir = pc.light.position - cLight.inHitPosition;
+    cLight.outLightDistance = length(lDir);
+    cLight.outIntensity = pc.light.intensity / (cLight.outLightDistance * cLight.outLightDistance);
+    cLight.outLightDir = normalize(lDir);
+    float theta = dot(cLight.outLightDir, normalize(-pc.light.direction));
+    float epsilon = pc.light.innerCutoff - pc.light.outerCutoff;
+    float spotIntensity = clamp((theta - pc.light.outerCutoff) / epsilon, 0.0, 1.0);
+    cLight.outIntensity *= spotIntensity;
+}
diff --git a/71_RayTracingPipeline/app_resources/light_directional.rcall.hlsl b/71_RayTracingPipeline/app_resources/light_directional.rcall.hlsl
new file mode 100644
index 000000000..d4aeca85e
--- /dev/null
+++ b/71_RayTracingPipeline/app_resources/light_directional.rcall.hlsl
@@ -0,0 +1,11 @@
+#include "common.hlsl"
+
+[[vk::push_constant]] SPushConstants pc;
+
+[shader("callable")]
+void main(inout RayLight cLight)
+{
+    cLight.outLightDir = normalize(-pc.light.direction);
+    cLight.outIntensity = 1.0;
+    cLight.outLightDistance = 10000000;
+}
diff --git a/71_RayTracingPipeline/app_resources/light_point.rcall.hlsl b/71_RayTracingPipeline/app_resources/light_point.rcall.hlsl
new file mode 100644
index 000000000..e82d17ec8
--- /dev/null
+++ b/71_RayTracingPipeline/app_resources/light_point.rcall.hlsl
@@ -0,0 +1,13 @@
+#include "common.hlsl"
+
+[[vk::push_constant]] SPushConstants pc;
+
+[shader("callable")]
+void main(inout RayLight cLight)
+{
+    float32_t3 lDir = pc.light.position - cLight.inHitPosition;
+    float lightDistance = length(lDir);
+    cLight.outIntensity = pc.light.intensity / (lightDistance * lightDistance);
+    cLight.outLightDir = normalize(lDir);
+    cLight.outLightDistance = lightDistance;
+}
\ No newline at end of file
diff --git a/71_RayTracingPipeline/app_resources/raytrace.rahit.hlsl b/71_RayTracingPipeline/app_resources/raytrace.rahit.hlsl
index f68d607aa..660e506c4 100644
--- a/71_RayTracingPipeline/app_resources/raytrace.rahit.hlsl
+++ b/71_RayTracingPipeline/app_resources/raytrace.rahit.hlsl
@@ -20,8 +20,9 @@ void main(inout AnyHitPayload p, in BuiltInTriangleIntersectionAttributes attrib
     if (geom.material.illum != 4)
         return;
 
+    uint32_t seed = p.seed;
     if (geom.material.dissolve == 0.0)
         IgnoreHit();
-    else if (rnd(p.seed) > geom.material.dissolve)
+    else if (rnd(seed) > geom.material.dissolve)
         IgnoreHit();
 }
diff --git a/71_RayTracingPipeline/app_resources/raytrace.rchit.hlsl b/71_RayTracingPipeline/app_resources/raytrace.rchit.hlsl
index d8c527389..c89b69142 100644
--- a/71_RayTracingPipeline/app_resources/raytrace.rchit.hlsl
+++ b/71_RayTracingPipeline/app_resources/raytrace.rchit.hlsl
@@ -123,31 +123,7 @@ void main(inout ColorPayload p, in BuiltInTriangleIntersectionAttributes attribs
 
     RayLight cLight;
     cLight.inHitPosition = worldPosition;
-    if (pc.light.type == 0)
-    {
-        cLight.outLightDir = normalize(-pc.light.direction);
-        cLight.outIntensity = 1.0;
-        cLight.outLightDistance = 10000000;
-    }
-    if (pc.light.type == 1)
-    {
-        float32_t3 lDir = pc.light.position - cLight.inHitPosition;
-        float lightDistance = length(lDir);
-        cLight.outIntensity = pc.light.intensity / (lightDistance * lightDistance);
-        cLight.outLightDir = normalize(lDir);
-        cLight.outLightDistance = lightDistance;
-    }
-    else if (pc.light.type == 2)
-    {
-        float32_t3 lDir = pc.light.position - cLight.inHitPosition;
-        cLight.outLightDistance = length(lDir);
-        cLight.outIntensity = pc.light.intensity / (cLight.outLightDistance * cLight.outLightDistance);
-        cLight.outLightDir = normalize(lDir);
-        float theta = dot(cLight.outLightDir, normalize(-pc.light.direction));
-        float epsilon = pc.light.innerCutoff - pc.light.outerCutoff;
-        float spotIntensity = clamp((theta - pc.light.outerCutoff) / epsilon, 0.0, 1.0);
-        cLight.outIntensity *= spotIntensity;
-    }
+    CallShader(pc.light.type, cLight);
 
     float32_t3 diffuse = computeDiffuse(geom.material, cLight.outLightDir, worldNormal);
     float32_t3 specular = float32_t3(0, 0, 0);
@@ -166,9 +142,9 @@ void main(inout ColorPayload p, in BuiltInTriangleIntersectionAttributes attribs
         shadowPayload.isShadowed = true;
         shadowPayload.seed = p.seed;
         TraceRay(topLevelAS, flags, 0xFF, 1, 0, 1, rayDesc, shadowPayload);
-        p.seed = shadowPayload.seed;
 
-        if (shadowPayload.isShadowed)
+        bool isShadowed = shadowPayload.isShadowed;
+        if (isShadowed)
         {
             attenuation = 0.3;
         }
diff --git a/71_RayTracingPipeline/app_resources/raytrace.rgen.hlsl b/71_RayTracingPipeline/app_resources/raytrace.rgen.hlsl
index 90b950f76..efbbcd56e 100644
--- a/71_RayTracingPipeline/app_resources/raytrace.rgen.hlsl
+++ b/71_RayTracingPipeline/app_resources/raytrace.rgen.hlsl
@@ -51,7 +51,6 @@ void main()
         
         ColorPayload payload;
         payload.seed = seed;
-        payload.hitValue = float32_t3(0, 0, 0);
         TraceRay(topLevelAS, RAY_FLAG_NONE, 0xff, 0, 0, 0, rayDesc, payload);
 
         hitValues += payload.hitValue;
diff --git a/71_RayTracingPipeline/main.cpp b/71_RayTracingPipeline/main.cpp
index c83498896..51001f4f8 100644
--- a/71_RayTracingPipeline/main.cpp
+++ b/71_RayTracingPipeline/main.cpp
@@ -134,6 +134,9 @@ class RaytracingPipelineApp final : public examples::SimpleWindowedApplication,
     const auto anyHitShaderShadowPayload = compileShader("app_resources/raytrace.rahit.hlsl", "#define USE_SHADOW_PAYLOAD\n");
     const auto missShader = compileShader("app_resources/raytrace.rmiss.hlsl");
     const auto shadowMissShader = compileShader("app_resources/raytraceShadow.rmiss.hlsl");
+    const auto directionalLightCallShader = compileShader("app_resources/light_directional.rcall.hlsl");
+    const auto pointLightCallShader = compileShader("app_resources/light_point.rcall.hlsl");
+    const auto spotLightCallShader = compileShader("app_resources/light_spot.rcall.hlsl");
 
     m_semaphore = m_device->createSemaphore(m_realFrameIx);
     if (!m_semaphore)
@@ -275,11 +278,14 @@ class RaytracingPipelineApp final : public examples::SimpleWindowedApplication,
 
       const IGPUShader::SSpecInfo shaders[] = {
           {.shader = raygenShader.get()},
+          {.shader = missShader.get()},
+          {.shader = shadowMissShader.get()},
           {.shader = closestHitShader.get()},
           {.shader = anyHitShaderColorPayload.get()},
           {.shader = anyHitShaderShadowPayload.get()},
-          {.shader = missShader.get()},
-          {.shader = shadowMissShader.get()},
+          {.shader = directionalLightCallShader.get()},
+          {.shader = pointLightCallShader.get()},
+          {.shader = spotLightCallShader.get()},
       };
 
       params.layout = pipelineLayout.get();
@@ -287,10 +293,13 @@ class RaytracingPipelineApp final : public examples::SimpleWindowedApplication,
       params.cached.shaderGroups.raygenGroup = {
         .shaderIndex = 0,
       };
-      params.cached.shaderGroups.hitGroups.push_back({ .closestHitShaderIndex = 1, .anyHitShaderIndex = 2 });
-      params.cached.shaderGroups.hitGroups.push_back({ .closestHitShaderIndex = 1, .anyHitShaderIndex = 3 });
-      params.cached.shaderGroups.missGroups.push_back({ .shaderIndex = 4 });
-      params.cached.shaderGroups.missGroups.push_back({ .shaderIndex = 5 });
+      params.cached.shaderGroups.missGroups.push_back({ .shaderIndex = 1 });
+      params.cached.shaderGroups.missGroups.push_back({ .shaderIndex = 2 });
+      params.cached.shaderGroups.hitGroups.push_back({ .closestHitShaderIndex = 3, .anyHitShaderIndex = 4 });
+      params.cached.shaderGroups.hitGroups.push_back({ .closestHitShaderIndex = 3, .anyHitShaderIndex = 5 });
+      params.cached.shaderGroups.callableGroups.push_back({.shaderIndex = 6});
+      params.cached.shaderGroups.callableGroups.push_back({.shaderIndex = 7});
+      params.cached.shaderGroups.callableGroups.push_back({.shaderIndex = 8});
       params.cached.maxRecursionDepth = 2;
       if (!m_device->createRayTracingPipelines(nullptr, { &params, 1 }, &m_rayTracingPipeline))
         return logFail("Failed to create ray tracing pipeline");

From d303356516823fedea27e4a3da03d56d753b66a8 Mon Sep 17 00:00:00 2001
From: kevyuu <kevin.kayu@gmail.com>
Date: Fri, 24 Jan 2025 15:35:51 +0700
Subject: [PATCH 006/529] Implement procedural geometry intersection shader
 demo

Signed-off-by: kevyuu <kevin.kayu@gmail.com>
---
 .../app_resources/common.hlsl                 |  39 ++-
 ..._spot.rcall.hlsl => light_spot.rcall.hlsl} |   0
 .../app_resources/raytrace.rahit.hlsl         |   2 +-
 .../app_resources/raytrace.rchit.hlsl         |  10 +-
 .../app_resources/raytrace.rgen.hlsl          |   6 +-
 .../app_resources/raytrace.rint.hlsl          |  54 ++++
 .../raytrace_procedural.rchit.hlsl            |  61 ++++
 71_RayTracingPipeline/main.cpp                | 304 +++++++++++++-----
 8 files changed, 378 insertions(+), 98 deletions(-)
 rename 71_RayTracingPipeline/app_resources/{lgiht_spot.rcall.hlsl => light_spot.rcall.hlsl} (100%)
 create mode 100644 71_RayTracingPipeline/app_resources/raytrace.rint.hlsl
 create mode 100644 71_RayTracingPipeline/app_resources/raytrace_procedural.rchit.hlsl

diff --git a/71_RayTracingPipeline/app_resources/common.hlsl b/71_RayTracingPipeline/app_resources/common.hlsl
index ce82181c3..50306b516 100644
--- a/71_RayTracingPipeline/app_resources/common.hlsl
+++ b/71_RayTracingPipeline/app_resources/common.hlsl
@@ -15,7 +15,20 @@ struct Material
     uint32_t illum; // illumination model (see http://www.fileformat.info/format/material/)
 };
 
-struct SGeomInfo
+struct SProceduralGeomInfo
+{
+    float32_t3 center;
+    float32_t radius;
+    Material material;
+};
+
+struct Aabb
+{
+    float32_t3 minimum;
+    float32_t3 maximum;
+};
+
+struct STriangleGeomInfo
 {
     uint64_t vertexBufferAddress;
     uint64_t indexBufferAddress;
@@ -29,6 +42,13 @@ struct SGeomInfo
     Material material;
 };
 
+enum E_GEOM_TYPE : int32_t
+{
+    EGT_TRIANGLES,
+    EGT_PROCEDURAL,
+    EGT_COUNT
+};
+
 enum E_LIGHT_TYPE : int32_t
 {
     ELT_DIRECTIONAL,
@@ -37,6 +57,20 @@ enum E_LIGHT_TYPE : int32_t
     ELT_COUNT
 };
 
+enum E_RAY_TYPE : int32_t
+{
+    ERT_PRIMARY, // Ray shoot from camera
+    ERT_OCCLUSION,
+    ERT_COUNT
+};
+
+enum E_MISS_TYPE : int32_t
+{
+    EMT_PRIMARY,
+    EMT_OCCLUSION,
+    EMT_COUNT
+};
+
 struct Light
 {
     float32_t3 direction;
@@ -59,7 +93,8 @@ struct SPushConstants
     float32_t3 camPos;
     float32_t4x4 invMVP;
 
-    uint64_t geometryInfoBuffer;
+    uint64_t proceduralGeomInfoBuffer;
+    uint64_t triangleGeomInfoBuffer;
     uint32_t frameCounter;
 };
 
diff --git a/71_RayTracingPipeline/app_resources/lgiht_spot.rcall.hlsl b/71_RayTracingPipeline/app_resources/light_spot.rcall.hlsl
similarity index 100%
rename from 71_RayTracingPipeline/app_resources/lgiht_spot.rcall.hlsl
rename to 71_RayTracingPipeline/app_resources/light_spot.rcall.hlsl
diff --git a/71_RayTracingPipeline/app_resources/raytrace.rahit.hlsl b/71_RayTracingPipeline/app_resources/raytrace.rahit.hlsl
index 660e506c4..5db6d70fa 100644
--- a/71_RayTracingPipeline/app_resources/raytrace.rahit.hlsl
+++ b/71_RayTracingPipeline/app_resources/raytrace.rahit.hlsl
@@ -15,7 +15,7 @@ using AnyHitPayload = ShadowPayload;
 void main(inout AnyHitPayload p, in BuiltInTriangleIntersectionAttributes attribs)
 {
     const int instID = InstanceID();
-    const SGeomInfo geom = vk::RawBufferLoad < SGeomInfo > (pc.geometryInfoBuffer + instID * sizeof(SGeomInfo));
+    const STriangleGeomInfo geom = vk::RawBufferLoad < STriangleGeomInfo > (pc.triangleGeomInfoBuffer + instID * sizeof(STriangleGeomInfo));
     
     if (geom.material.illum != 4)
         return;
diff --git a/71_RayTracingPipeline/app_resources/raytrace.rchit.hlsl b/71_RayTracingPipeline/app_resources/raytrace.rchit.hlsl
index c89b69142..734491e7d 100644
--- a/71_RayTracingPipeline/app_resources/raytrace.rchit.hlsl
+++ b/71_RayTracingPipeline/app_resources/raytrace.rchit.hlsl
@@ -19,7 +19,7 @@ struct VertexData
     float32_t3 normal;
 };
 
-VertexData fetchVertexData(int instID, int primID, SGeomInfo geom, float2 bary)
+VertexData fetchVertexData(int instID, int primID, STriangleGeomInfo geom, float2 bary)
 {
     uint idxOffset = primID * 3;
 
@@ -116,7 +116,7 @@ void main(inout ColorPayload p, in BuiltInTriangleIntersectionAttributes attribs
 {
     const int instID = InstanceID();
     const int primID = PrimitiveIndex();
-    const SGeomInfo geom = vk::RawBufferLoad < SGeomInfo > (pc.geometryInfoBuffer + instID * sizeof(SGeomInfo));
+    const STriangleGeomInfo geom = vk::RawBufferLoad < STriangleGeomInfo > (pc.triangleGeomInfoBuffer + instID * sizeof(STriangleGeomInfo));
     const VertexData vertexData = fetchVertexData(instID, primID, geom, attribs.barycentrics);
     const float32_t3 worldPosition = mul(ObjectToWorld3x4(), float32_t4(vertexData.position, 1));
     const float32_t3 worldNormal = mul(vertexData.normal, WorldToObject3x4()).xyz;
@@ -132,16 +132,16 @@ void main(inout ColorPayload p, in BuiltInTriangleIntersectionAttributes attribs
     if (dot(worldNormal, cLight.outLightDir) > 0)
     {
         RayDesc rayDesc;
-        rayDesc.Origin = WorldRayOrigin() + WorldRayDirection() * RayTCurrent() + worldNormal * 0.02f;
+        rayDesc.Origin = WorldRayOrigin() + WorldRayDirection() * RayTCurrent();
         rayDesc.Direction = cLight.outLightDir;
-        rayDesc.TMin = 0.001;
+        rayDesc.TMin = 0.01;
         rayDesc.TMax = cLight.outLightDistance;
 
         uint flags = RAY_FLAG_SKIP_CLOSEST_HIT_SHADER;
         ShadowPayload shadowPayload;
         shadowPayload.isShadowed = true;
         shadowPayload.seed = p.seed;
-        TraceRay(topLevelAS, flags, 0xFF, 1, 0, 1, rayDesc, shadowPayload);
+        TraceRay(topLevelAS, flags, 0xFF, ERT_OCCLUSION, 0, EMT_OCCLUSION, rayDesc, shadowPayload);
 
         bool isShadowed = shadowPayload.isShadowed;
         if (isShadowed)
diff --git a/71_RayTracingPipeline/app_resources/raytrace.rgen.hlsl b/71_RayTracingPipeline/app_resources/raytrace.rgen.hlsl
index efbbcd56e..43b052630 100644
--- a/71_RayTracingPipeline/app_resources/raytrace.rgen.hlsl
+++ b/71_RayTracingPipeline/app_resources/raytrace.rgen.hlsl
@@ -46,12 +46,12 @@ void main()
         RayDesc rayDesc;
         rayDesc.Origin = pc.camPos;
         rayDesc.Direction = direction;
-        rayDesc.TMin = 0.01;
-        rayDesc.TMax = 1000.0;
+        rayDesc.TMin = 0.001;
+        rayDesc.TMax = 10000.0;
         
         ColorPayload payload;
         payload.seed = seed;
-        TraceRay(topLevelAS, RAY_FLAG_NONE, 0xff, 0, 0, 0, rayDesc, payload);
+        TraceRay(topLevelAS, RAY_FLAG_NONE, 0xff, ERT_PRIMARY, 0, EMT_PRIMARY, rayDesc, payload);
 
         hitValues += payload.hitValue;
     }
diff --git a/71_RayTracingPipeline/app_resources/raytrace.rint.hlsl b/71_RayTracingPipeline/app_resources/raytrace.rint.hlsl
new file mode 100644
index 000000000..f302543b6
--- /dev/null
+++ b/71_RayTracingPipeline/app_resources/raytrace.rint.hlsl
@@ -0,0 +1,54 @@
+#include "common.hlsl"
+
+[[vk::push_constant]] SPushConstants pc;
+
+struct Ray
+{
+    float32_t3 origin;
+    float32_t3 direction;
+};
+
+struct Attrib
+{
+    float3 HitAttribute;
+};
+
+// Ray-Sphere intersection
+// http://viclw17.github.io/2018/07/16/raytracing-ray-sphere-intersection/
+float32_t hitSphere(SProceduralGeomInfo s, Ray r)
+{
+    float32_t3 oc = r.origin - s.center;
+    float32_t a = dot(r.direction, r.direction);
+    float32_t b = 2.0 * dot(oc, r.direction);
+    float32_t c = dot(oc, oc) - s.radius * s.radius;
+    float32_t discriminant = b * b - 4 * a * c;
+
+    if (discriminant < 0)
+    {
+        return -1.0;
+    }
+    else
+    {
+        return (-b - sqrt(discriminant)) / (2.0 * a);
+    }
+}
+
+[shader("intersection")]
+void main()
+{
+    Ray ray;
+    ray.origin = WorldRayOrigin();
+    ray.direction = WorldRayDirection();
+
+    const int primID = PrimitiveIndex();
+
+    // Sphere data
+    SProceduralGeomInfo sphere = vk::RawBufferLoad < SProceduralGeomInfo > (pc.proceduralGeomInfoBuffer + primID * sizeof(SProceduralGeomInfo));
+
+    float32_t tHit = hitSphere(sphere, ray);
+    
+    Attrib attrib;
+    // Report hit point
+    if (tHit > 0)
+        ReportHit(tHit, 0, attrib);
+}
\ No newline at end of file
diff --git a/71_RayTracingPipeline/app_resources/raytrace_procedural.rchit.hlsl b/71_RayTracingPipeline/app_resources/raytrace_procedural.rchit.hlsl
new file mode 100644
index 000000000..ef3503346
--- /dev/null
+++ b/71_RayTracingPipeline/app_resources/raytrace_procedural.rchit.hlsl
@@ -0,0 +1,61 @@
+#include "common.hlsl"
+
+[[vk::push_constant]] SPushConstants pc;
+
+[[vk::binding(0, 0)]] RaytracingAccelerationStructure topLevelAS;
+
+[shader("closesthit")]
+void main(inout ColorPayload p, in BuiltInTriangleIntersectionAttributes attribs)
+{
+    const int instID = InstanceID();
+    const int primID = PrimitiveIndex();
+    float32_t3 worldPosition = WorldRayOrigin() + WorldRayDirection() * RayTCurrent();
+
+    SProceduralGeomInfo sphere = vk::RawBufferLoad < SProceduralGeomInfo > (pc.proceduralGeomInfoBuffer + primID * sizeof(SProceduralGeomInfo));
+
+    // Computing the normal at hit position
+    float32_t3 worldNormal = normalize(worldPosition - sphere.center);
+
+    RayLight cLight;
+    cLight.inHitPosition = worldPosition;
+    CallShader(pc.light.type, cLight);
+
+    // Material of the object
+    Material mat = sphere.material;
+
+    // Diffuse
+    float3 diffuse = computeDiffuse(sphere.material, cLight.outLightDir, worldNormal);
+    float3 specular = float3(0, 0, 0);
+    float attenuation = 1;
+
+    // Tracing shadow ray only if the light is visible from the surface
+    if (dot(worldNormal, cLight.outLightDir) > 0)
+    {
+        RayDesc rayDesc;
+        rayDesc.Origin = WorldRayOrigin() + WorldRayDirection() * RayTCurrent();
+        rayDesc.Direction = cLight.outLightDir;
+        rayDesc.TMin = 0.01;
+        rayDesc.TMax = cLight.outLightDistance;
+
+        uint flags =
+            RAY_FLAG_ACCEPT_FIRST_HIT_AND_END_SEARCH | RAY_FLAG_FORCE_OPAQUE |
+            RAY_FLAG_SKIP_CLOSEST_HIT_SHADER;
+
+        ShadowPayload shadowPayload;
+        shadowPayload.isShadowed = true;
+        shadowPayload.seed = p.seed;
+        TraceRay(topLevelAS, flags, 0xFF, ERT_OCCLUSION, 0, EMT_PRIMARY, rayDesc, shadowPayload);
+
+        bool isShadowed = shadowPayload.isShadowed;
+        if (isShadowed)
+        {
+            attenuation = 0.3;
+        }
+        else
+        {
+            specular = computeSpecular(sphere.material, WorldRayDirection(), cLight.outLightDir, worldNormal);
+        }
+    }
+
+    p.hitValue = (cLight.outIntensity * attenuation * (diffuse + specular));
+}
\ No newline at end of file
diff --git a/71_RayTracingPipeline/main.cpp b/71_RayTracingPipeline/main.cpp
index 51001f4f8..ac3befb5e 100644
--- a/71_RayTracingPipeline/main.cpp
+++ b/71_RayTracingPipeline/main.cpp
@@ -14,6 +14,7 @@ class RaytracingPipelineApp final : public examples::SimpleWindowedApplication,
   constexpr static inline uint32_t WIN_W = 1280, WIN_H = 720;
   constexpr static inline uint32_t MaxFramesInFlight = 3u;
   constexpr static inline uint8_t MaxUITextureCount = 1u;
+  constexpr static inline uint32_t NumberOfProceduralGeometries = 5;
 
   static constexpr const char* s_lightTypeNames[E_LIGHT_TYPE::ELT_COUNT] = {
     "Directional",
@@ -130,6 +131,8 @@ class RaytracingPipelineApp final : public examples::SimpleWindowedApplication,
     // shader
     const auto raygenShader = compileShader("app_resources/raytrace.rgen.hlsl");
     const auto closestHitShader = compileShader("app_resources/raytrace.rchit.hlsl");
+    const auto proceduralClosestHitShader = compileShader("app_resources/raytrace_procedural.rchit.hlsl");
+    const auto intersectionHitShader = compileShader("app_resources/raytrace.rint.hlsl");
     const auto anyHitShaderColorPayload = compileShader("app_resources/raytrace.rahit.hlsl", "#define USE_COLOR_PAYLOAD\n");
     const auto anyHitShaderShadowPayload = compileShader("app_resources/raytrace.rahit.hlsl", "#define USE_SHADOW_PAYLOAD\n");
     const auto missShader = compileShader("app_resources/raytrace.rmiss.hlsl");
@@ -276,37 +279,85 @@ class RaytracingPipelineApp final : public examples::SimpleWindowedApplication,
 
       IGPURayTracingPipeline::SCreationParams params = {};
 
-      const IGPUShader::SSpecInfo shaders[] = {
-          {.shader = raygenShader.get()},
-          {.shader = missShader.get()},
-          {.shader = shadowMissShader.get()},
-          {.shader = closestHitShader.get()},
-          {.shader = anyHitShaderColorPayload.get()},
-          {.shader = anyHitShaderShadowPayload.get()},
-          {.shader = directionalLightCallShader.get()},
-          {.shader = pointLightCallShader.get()},
-          {.shader = spotLightCallShader.get()},
+      enum RtDemoShader
+      {
+        RTDS_RAYGEN,
+        RTDS_MISS,
+        RTDS_SHADOW_MISS,
+        RTDS_CLOSEST_HIT,
+        RTDS_SPHERE_CLOSEST_HIT,
+        RTDS_ANYHIT_COLOR,
+        RTDS_ANYHIT_SHADOW,
+        RTDS_INTERSECTION,
+        RTDS_DIRECTIONAL_CALL,
+        RTDS_POINT_CALL,
+        RTDS_SPOT_CALL,
+        RTDS_COUNT
       };
 
+      IGPUShader::SSpecInfo shaders[RTDS_COUNT];
+      shaders[RTDS_RAYGEN] = {.shader = raygenShader.get()};
+      shaders[RTDS_MISS] = {.shader = missShader.get()};
+      shaders[RTDS_SHADOW_MISS] = {.shader = shadowMissShader.get()};
+      shaders[RTDS_CLOSEST_HIT] = {.shader = closestHitShader.get()};
+      shaders[RTDS_SPHERE_CLOSEST_HIT] = {.shader = proceduralClosestHitShader.get()};
+      shaders[RTDS_ANYHIT_COLOR] = {.shader = anyHitShaderColorPayload.get()};
+      shaders[RTDS_ANYHIT_SHADOW] = {.shader = anyHitShaderShadowPayload.get()};
+      shaders[RTDS_INTERSECTION] = {.shader = intersectionHitShader.get() };
+      shaders[RTDS_DIRECTIONAL_CALL] = {.shader = directionalLightCallShader.get()};
+      shaders[RTDS_POINT_CALL] = {.shader = pointLightCallShader.get()};
+      shaders[RTDS_SPOT_CALL] = {.shader = spotLightCallShader.get()};
+
       params.layout = pipelineLayout.get();
       params.shaders = std::span(shaders, std::size(shaders));
-      params.cached.shaderGroups.raygenGroup = {
-        .shaderIndex = 0,
+
+      auto& shaderGroups = params.cached.shaderGroups;
+
+      shaderGroups.raygenGroup = { .shaderIndex = RTDS_RAYGEN };
+
+      shaderGroups.missGroups.resize(E_MISS_TYPE::EMT_COUNT, {});
+      shaderGroups.missGroups[EMT_PRIMARY] = { .shaderIndex = RTDS_MISS };
+      shaderGroups.missGroups[EMT_OCCLUSION] = { .shaderIndex = RTDS_SHADOW_MISS };
+
+      auto getHitGroupIndex = [](E_GEOM_TYPE geomType, E_RAY_TYPE rayType)
+        {
+          return geomType * ERT_COUNT + rayType;
+        };
+      shaderGroups.hitGroups.resize(E_RAY_TYPE::ERT_COUNT * E_GEOM_TYPE::EGT_COUNT);
+      shaderGroups.hitGroups[getHitGroupIndex(EGT_TRIANGLES, ERT_PRIMARY)] = {
+        .closestHitShaderIndex = RTDS_CLOSEST_HIT,
+        .anyHitShaderIndex = RTDS_ANYHIT_COLOR,
+      };
+      shaderGroups.hitGroups[getHitGroupIndex(EGT_TRIANGLES, ERT_OCCLUSION)] = {
+        .closestHitShaderIndex = RTDS_CLOSEST_HIT,
+        .anyHitShaderIndex = RTDS_ANYHIT_SHADOW,
+      };
+      shaderGroups.hitGroups[getHitGroupIndex(EGT_PROCEDURAL, ERT_PRIMARY)] = {
+        .closestHitShaderIndex = RTDS_SPHERE_CLOSEST_HIT,
+        .anyHitShaderIndex = RTDS_ANYHIT_COLOR,
+        .intersectionShaderIndex = RTDS_INTERSECTION,
+      };
+      shaderGroups.hitGroups[getHitGroupIndex(EGT_PROCEDURAL, ERT_OCCLUSION)] = {
+        .closestHitShaderIndex = RTDS_CLOSEST_HIT,
+        .anyHitShaderIndex = RTDS_ANYHIT_SHADOW,
+        .intersectionShaderIndex = RTDS_INTERSECTION,
       };
-      params.cached.shaderGroups.missGroups.push_back({ .shaderIndex = 1 });
-      params.cached.shaderGroups.missGroups.push_back({ .shaderIndex = 2 });
-      params.cached.shaderGroups.hitGroups.push_back({ .closestHitShaderIndex = 3, .anyHitShaderIndex = 4 });
-      params.cached.shaderGroups.hitGroups.push_back({ .closestHitShaderIndex = 3, .anyHitShaderIndex = 5 });
-      params.cached.shaderGroups.callableGroups.push_back({.shaderIndex = 6});
-      params.cached.shaderGroups.callableGroups.push_back({.shaderIndex = 7});
-      params.cached.shaderGroups.callableGroups.push_back({.shaderIndex = 8});
+
+      shaderGroups.callableGroups.resize(ELT_COUNT);
+      shaderGroups.callableGroups[ELT_DIRECTIONAL] = { .shaderIndex = RTDS_DIRECTIONAL_CALL };
+      shaderGroups.callableGroups[ELT_POINT] = { .shaderIndex = RTDS_POINT_CALL };
+      shaderGroups.callableGroups[ELT_SPOT] = { .shaderIndex = RTDS_SPOT_CALL };
+
       params.cached.maxRecursionDepth = 2;
+
       if (!m_device->createRayTracingPipelines(nullptr, { &params, 1 }, &m_rayTracingPipeline))
         return logFail("Failed to create ray tracing pipeline");
       m_logger->log("Ray Tracing Pipeline Created!", system::ILogger::ELL_INFO);
 
       if (!createShaderBindingTable(gQueue, m_rayTracingPipeline))
         return logFail("Could not create shader binding table");
+
+      m_logger->log("Shader binding table created", system::ILogger::ELL_INFO);
     }
 
     {
@@ -609,7 +660,8 @@ class RaytracingPipelineApp final : public examples::SimpleWindowedApplication,
     {
       SPushConstants pc;
       pc.light = m_light;
-      pc.geometryInfoBuffer = m_geometryInfoBuffer->getDeviceAddress();
+      pc.proceduralGeomInfoBuffer = m_proceduralGeomInfoBuffer->getDeviceAddress();
+      pc.triangleGeomInfoBuffer = m_triangleGeomInfoBuffer->getDeviceAddress();
       pc.frameCounter = m_frameAccumulationCounter;
       const core::vector3df camPos = m_camera.getPosition().getAsVector3df();
       pc.camPos = { camPos.X, camPos.Y, camPos.Z };
@@ -957,8 +1009,8 @@ class RaytracingPipelineApp final : public examples::SimpleWindowedApplication,
         .transform = getTranslationMatrix(0, 0.5f, 0),
       },
       ReferenceObjectCpu {
-        .meta = {.type = OT_SPHERE, .name = "Sphere Mesh"},
-        .data = gc->createSphereMesh(2, 16, 16),
+        .meta = {.type = OT_CUBE, .name = "Cube Mesh 2"},
+        .data = gc->createCubeMesh(nbl::core::vector3df(1.5, 1.5, 1.5)),
         .material = {
           .ambient = {0.1, 0.1, 0.1},
           .diffuse = {0.2, 0.2, 0.8},
@@ -969,8 +1021,8 @@ class RaytracingPipelineApp final : public examples::SimpleWindowedApplication,
         .transform = getTranslationMatrix(-5.0f, 1.0f, 0),
       },
       ReferenceObjectCpu {
-        .meta = {.type = OT_SPHERE, .name = "Transparent Sphere Mesh"},
-        .data = gc->createSphereMesh(2, 16, 16),
+        .meta = {.type = OT_CUBE, .name = "Transparent Cube Mesh"},
+        .data = gc->createCubeMesh(nbl::core::vector3df(1.5, 1.5, 1.5)),
         .material = {
           .ambient = {0.1, 0.1, 0.1},
           .diffuse = {0.2, 0.8, 0.2},
@@ -1060,10 +1112,10 @@ class RaytracingPipelineApp final : public examples::SimpleWindowedApplication,
       prepass.template operator() < ICPUBuffer > (tmpBuffers);
     }
 
-    auto geomInfoBuffer = ICPUBuffer::create({ std::size(cpuObjects) * sizeof(SGeomInfo) });
-    SGeomInfo* geomInfos = reinterpret_cast<SGeomInfo*>(geomInfoBuffer->getPointer());
+    auto geomInfoBuffer = ICPUBuffer::create({ std::size(cpuObjects) * sizeof(STriangleGeomInfo) });
+    STriangleGeomInfo* geomInfos = reinterpret_cast<STriangleGeomInfo*>(geomInfoBuffer->getPointer());
 
-    m_gpuObjects.reserve(std::size(cpuObjects));
+    m_gpuTriangleGeometries.reserve(std::size(cpuObjects));
     // convert
     {
       // not sure if need this (probably not, originally for transition img view)
@@ -1097,7 +1149,7 @@ class RaytracingPipelineApp final : public examples::SimpleWindowedApplication,
       {
         auto& cpuObject = cpuObjects[i];
 
-        m_gpuObjects.push_back(ReferenceObjectGpu{
+        m_gpuTriangleGeometries.push_back(ReferenceObjectGpu{
           .meta = cpuObject.meta,
           .bindings = {
             .vertex = {.offset = 0, .buffer = buffers[2 * i + 0].value },
@@ -1111,9 +1163,9 @@ class RaytracingPipelineApp final : public examples::SimpleWindowedApplication,
           });
       }
 
-      for (uint32_t i = 0; i < m_gpuObjects.size(); i++)
+      for (uint32_t i = 0; i < m_gpuTriangleGeometries.size(); i++)
       {
-        const auto& gpuObject = m_gpuObjects[i];
+        const auto& gpuObject = m_gpuTriangleGeometries[i];
         const uint64_t vertexBufferAddress = gpuObject.bindings.vertex.buffer->getDeviceAddress();
         geomInfos[i] = {
           .vertexBufferAddress = vertexBufferAddress,
@@ -1131,7 +1183,50 @@ class RaytracingPipelineApp final : public examples::SimpleWindowedApplication,
       IGPUBuffer::SCreationParams params;
       params.usage = IGPUBuffer::EUF_STORAGE_BUFFER_BIT | IGPUBuffer::EUF_TRANSFER_DST_BIT | IGPUBuffer::EUF_INLINE_UPDATE_VIA_CMDBUF | IGPUBuffer::EUF_SHADER_DEVICE_ADDRESS_BIT;
       params.size = geomInfoBuffer->getSize();
-      m_utils->createFilledDeviceLocalBufferOnDedMem(SIntendedSubmitInfo{ .queue = queue }, std::move(params), geomInfos).move_into(m_geometryInfoBuffer);
+      m_utils->createFilledDeviceLocalBufferOnDedMem(SIntendedSubmitInfo{ .queue = queue }, std::move(params), geomInfos).move_into(m_triangleGeomInfoBuffer);
+    }
+
+    // intersection geometries setup
+    {
+      auto spheresInfoBuffer = ICPUBuffer::create({ NumberOfProceduralGeometries * sizeof(SProceduralGeomInfo) });
+      SProceduralGeomInfo* sphereInfos = reinterpret_cast<SProceduralGeomInfo*>(spheresInfoBuffer->getPointer());
+      core::vector<Aabb> aabbs;
+      for (int32_t i = 0; i < NumberOfProceduralGeometries; i++)
+      {
+        const auto middle_i = NumberOfProceduralGeometries / 2.0;
+        SProceduralGeomInfo sphere = {
+          .center = float32_t3((i - middle_i) * 4.0, 2, 5.0),
+          .radius = 1,
+          .material = {
+            .ambient = {0.1, 0.1, 0.1},
+            .diffuse = {0.3, 0.2 * i, 0.3},
+            .specular = {0.8, 0.8, 0.8},
+            .shininess = 1.0f,
+            .illum = 2
+          },
+        };
+
+        sphereInfos[i] = sphere;
+        aabbs.push_back({
+          .minimum = sphere.center - sphere.radius,
+          .maximum = sphere.center + sphere.radius,
+        });
+      }
+
+      {
+        IGPUBuffer::SCreationParams params;
+        params.usage = IGPUBuffer::EUF_STORAGE_BUFFER_BIT | IGPUBuffer::EUF_TRANSFER_DST_BIT | IGPUBuffer::EUF_INLINE_UPDATE_VIA_CMDBUF | IGPUBuffer::EUF_SHADER_DEVICE_ADDRESS_BIT;
+        params.size = spheresInfoBuffer->getSize();
+        m_utils->createFilledDeviceLocalBufferOnDedMem(SIntendedSubmitInfo{ .queue = queue }, std::move(params), sphereInfos).move_into(m_proceduralGeomInfoBuffer);
+        m_logger->log("Device address : %d", ILogger::ELL_INFO, m_proceduralGeomInfoBuffer->getDeviceAddress());
+      }
+
+      {
+        IGPUBuffer::SCreationParams params;
+        params.usage = IGPUBuffer::EUF_TRANSFER_DST_BIT | IGPUBuffer::EUF_SHADER_DEVICE_ADDRESS_BIT | IGPUBuffer::EUF_ACCELERATION_STRUCTURE_BUILD_INPUT_READ_ONLY_BIT;
+        params.size = aabbs.size() * sizeof(Aabb);
+        m_utils->createFilledDeviceLocalBufferOnDedMem(SIntendedSubmitInfo{ .queue = queue }, std::move(params), aabbs.data()).move_into(m_proceduralAabbBuffer);
+      }
     }
 
     return true;
@@ -1157,19 +1252,19 @@ class RaytracingPipelineApp final : public examples::SimpleWindowedApplication,
     missRegion = {
       .offset = raygenRegion.size,
       .stride = handleSizeAligned,
-      .size = core::alignUp(pipeline->getMissGroupCount(), limits.shaderGroupBaseAlignment),
+      .size = core::alignUp(pipeline->getMissGroupCount() * handleSizeAligned, limits.shaderGroupBaseAlignment),
     };
 
     hitRegion = {
       .offset = missRegion.offset + missRegion.size,
       .stride = handleSizeAligned,
-      .size = core::alignUp(pipeline->getHitGroupCount(), limits.shaderGroupBaseAlignment),
+      .size = core::alignUp(pipeline->getHitGroupCount() * handleSizeAligned, limits.shaderGroupBaseAlignment),
     };
 
     callableRegion = {
       .offset = hitRegion.offset + hitRegion.size,
       .stride = handleSizeAligned,
-      .size = core::alignUp(pipeline->getCallableGroupCount(), limits.shaderGroupBaseAlignment),
+      .size = core::alignUp(pipeline->getCallableGroupCount() * handleSizeAligned, limits.shaderGroupBaseAlignment),
     };
 
     const auto bufferSize = raygenRegion.size + missRegion.size + hitRegion.size + callableRegion.size;
@@ -1222,7 +1317,12 @@ class RaytracingPipelineApp final : public examples::SimpleWindowedApplication,
 
   bool createAccelerationStructures(video::CThreadSafeQueueAdapter* queue)
   {
-    IQueryPool::SCreationParams qParams{ .queryCount = static_cast<uint32_t>(m_gpuObjects.size()), .queryType = IQueryPool::ACCELERATION_STRUCTURE_COMPACTED_SIZE };
+    // plus 1 blas for procedural geometry contains {{var::NumberOfProcedural}}
+    // spheres. Each sphere is a primitive instead one instance or geometry
+    const auto blasCount = m_gpuTriangleGeometries.size() + 1;
+    const auto proceduralBlasIdx = blasCount - 1;
+
+    IQueryPool::SCreationParams qParams{ .queryCount = static_cast<uint32_t>(blasCount), .queryType = IQueryPool::ACCELERATION_STRUCTURE_COMPACTED_SIZE };
     smart_refctd_ptr<IQueryPool> queryPool = m_device->createQueryPool(std::move(qParams));
 
     auto pool = m_device->createCommandPool(queue->getFamilyIndex(), IGPUCommandPool::CREATE_FLAGS::RESET_COMMAND_BUFFER_BIT | IGPUCommandPool::CREATE_FLAGS::TRANSIENT_BIT);
@@ -1244,48 +1344,72 @@ class RaytracingPipelineApp final : public examples::SimpleWindowedApplication,
 #endif
     size_t totalScratchSize = 0;
 
+
     // build bottom level ASes
     {
-      core::vector<IGPUBottomLevelAccelerationStructure::DeviceBuildInfo> blasBuildInfos(m_gpuObjects.size());
-      core::vector<uint32_t> primitiveCounts(m_gpuObjects.size());
-      core::vector<IGPUBottomLevelAccelerationStructure::Triangles<const IGPUBuffer>> triangles(m_gpuObjects.size());
-      core::vector<uint32_t> scratchSizes(m_gpuObjects.size());
-      m_gpuBlasList.resize(m_gpuObjects.size());
-
-      for (uint32_t i = 0; i < m_gpuObjects.size(); i++)
+      core::vector<uint32_t> primitiveCounts(blasCount);
+      core::vector<IGPUBottomLevelAccelerationStructure::Triangles<const IGPUBuffer>> triangles(m_gpuTriangleGeometries.size());
+      core::vector<uint32_t> scratchSizes(blasCount);
+      IGPUBottomLevelAccelerationStructure::AABBs<const IGPUBuffer> aabbs;
+
+      auto blasFlags = bitflag(IGPUBottomLevelAccelerationStructure::BUILD_FLAGS::PREFER_FAST_TRACE_BIT) | IGPUBottomLevelAccelerationStructure::BUILD_FLAGS::ALLOW_COMPACTION_BIT;
+      if (m_physicalDevice->getProperties().limits.rayTracingPositionFetch)
+        blasFlags |= IGPUBottomLevelAccelerationStructure::BUILD_FLAGS::ALLOW_DATA_ACCESS_KHR;
+
+      IGPUBottomLevelAccelerationStructure::DeviceBuildInfo initBuildInfo;
+      initBuildInfo.buildFlags = blasFlags;
+      initBuildInfo.geometryCount = 1;	// only 1 geometry object per blas
+      initBuildInfo.srcAS = nullptr;
+      initBuildInfo.dstAS = nullptr;
+      initBuildInfo.scratch = {};
+
+      auto blasBuildInfos = core::vector(blasCount, initBuildInfo);
+
+      m_gpuBlasList.resize(blasCount);
+      // setup blas info for triangle geometries
+      for (uint32_t i = 0; i < blasCount; i++)
       {
-        const auto& gpuObject = m_gpuObjects[i];
-
-        const uint32_t vertexStride = gpuObject.vertexStride;
-        const uint32_t numVertices = gpuObject.bindings.vertex.buffer->getSize() / vertexStride;
-        if (gpuObject.useIndex())
-          primitiveCounts[i] = gpuObject.indexCount / 3;
-        else
-          primitiveCounts[i] = numVertices / 3;
-
-        triangles[i].vertexData[0] = gpuObject.bindings.vertex;
-        triangles[i].indexData = gpuObject.useIndex() ? gpuObject.bindings.index : gpuObject.bindings.vertex;
-        triangles[i].maxVertex = numVertices - 1;
-        triangles[i].vertexStride = vertexStride;
-        triangles[i].vertexFormat = EF_R32G32B32_SFLOAT;
-        triangles[i].indexType = gpuObject.indexType;
-        triangles[i].geometryFlags = IGPUBottomLevelAccelerationStructure::GEOMETRY_FLAGS::NO_DUPLICATE_ANY_HIT_INVOCATION_BIT;
-
-        auto blasFlags = bitflag(IGPUBottomLevelAccelerationStructure::BUILD_FLAGS::PREFER_FAST_TRACE_BIT) | IGPUBottomLevelAccelerationStructure::BUILD_FLAGS::ALLOW_COMPACTION_BIT;
-        if (m_physicalDevice->getProperties().limits.rayTracingPositionFetch)
-          blasFlags |= IGPUBottomLevelAccelerationStructure::BUILD_FLAGS::ALLOW_DATA_ACCESS_KHR;
-
-        blasBuildInfos[i].buildFlags = blasFlags;
-        blasBuildInfos[i].geometryCount = 1;	// only 1 geometry object per blas
-        blasBuildInfos[i].srcAS = nullptr;
-        blasBuildInfos[i].dstAS = nullptr;
-        blasBuildInfos[i].triangles = &triangles[i];
-        blasBuildInfos[i].scratch = {};
-
+        bool isProcedural = i == proceduralBlasIdx;
+        if (isProcedural)
+        {
+          aabbs.data.buffer = smart_refctd_ptr<IGPUBuffer>(m_proceduralAabbBuffer);
+          aabbs.stride = sizeof(Aabb);
+          aabbs.geometryFlags = IGPUBottomLevelAccelerationStructure::GEOMETRY_FLAGS::OPAQUE_BIT; // only allow opaque for now
+
+          primitiveCounts[proceduralBlasIdx] = NumberOfProceduralGeometries;
+          blasBuildInfos[proceduralBlasIdx].aabbs = &aabbs;
+          blasBuildInfos[proceduralBlasIdx].buildFlags |= IGPUBottomLevelAccelerationStructure::BUILD_FLAGS::GEOMETRY_TYPE_IS_AABB_BIT;
+        } else
+        {
+          const auto& gpuObject = m_gpuTriangleGeometries[i];
+
+          const uint32_t vertexStride = gpuObject.vertexStride;
+          const uint32_t numVertices = gpuObject.bindings.vertex.buffer->getSize() / vertexStride;
+          if (gpuObject.useIndex())
+            primitiveCounts[i] = gpuObject.indexCount / 3;
+          else
+            primitiveCounts[i] = numVertices / 3;
+
+          triangles[i].vertexData[0] = gpuObject.bindings.vertex;
+          triangles[i].indexData = gpuObject.useIndex() ? gpuObject.bindings.index : gpuObject.bindings.vertex;
+          triangles[i].maxVertex = numVertices - 1;
+          triangles[i].vertexStride = vertexStride;
+          triangles[i].vertexFormat = EF_R32G32B32_SFLOAT;
+          triangles[i].indexType = gpuObject.indexType;
+          triangles[i].geometryFlags = IGPUBottomLevelAccelerationStructure::GEOMETRY_FLAGS::NO_DUPLICATE_ANY_HIT_INVOCATION_BIT;
+
+          blasBuildInfos[i].triangles = &triangles[i];
+        }
         ILogicalDevice::AccelerationStructureBuildSizes buildSizes;
         {
           const uint32_t maxPrimCount[1] = { primitiveCounts[i] };
-          buildSizes = m_device->getAccelerationStructureBuildSizes(blasFlags, false, std::span{ &triangles[i], 1 }, maxPrimCount);
+          if (isProcedural)
+          {
+            buildSizes = m_device->getAccelerationStructureBuildSizes(blasBuildInfos[i].buildFlags, false, std::span{&aabbs, 1}, maxPrimCount);
+          } else
+          {
+            buildSizes = m_device->getAccelerationStructureBuildSizes(blasBuildInfos[i].buildFlags, false, std::span{&triangles[i], 1}, maxPrimCount);
+          }
           if (!buildSizes)
             return logFail("Failed to get BLAS build sizes");
         }
@@ -1310,10 +1434,11 @@ class RaytracingPipelineApp final : public examples::SimpleWindowedApplication,
         }
       }
 
+
       auto cmdbufBlas = getSingleUseCommandBufferAndBegin(pool);
       cmdbufBlas->beginDebugMarker("Build BLAS");
 
-      cmdbufBlas->resetQueryPool(queryPool.get(), 0, m_gpuObjects.size());
+      cmdbufBlas->resetQueryPool(queryPool.get(), 0, blasCount);
 
       smart_refctd_ptr<IGPUBuffer> scratchBuffer;
       {
@@ -1324,9 +1449,9 @@ class RaytracingPipelineApp final : public examples::SimpleWindowedApplication,
       }
 
       uint32_t queryCount = 0;
-      core::vector<IGPUBottomLevelAccelerationStructure::BuildRangeInfo> buildRangeInfos(m_gpuObjects.size());
-      core::vector<IGPUBottomLevelAccelerationStructure::BuildRangeInfo*> pRangeInfos(m_gpuObjects.size());
-      for (uint32_t i = 0; i < m_gpuObjects.size(); i++)
+      core::vector<IGPUBottomLevelAccelerationStructure::BuildRangeInfo> buildRangeInfos(blasCount);
+      core::vector<IGPUBottomLevelAccelerationStructure::BuildRangeInfo*> pRangeInfos(blasCount);
+      for (uint32_t i = 0; i < blasCount; i++)
       {
         blasBuildInfos[i].dstAS = m_gpuBlasList[i].get();
         blasBuildInfos[i].scratch.buffer = scratchBuffer;
@@ -1353,8 +1478,8 @@ class RaytracingPipelineApp final : public examples::SimpleWindowedApplication,
       }
 
 
-      core::vector<const IGPUAccelerationStructure*> ases(m_gpuObjects.size());
-      for (uint32_t i = 0; i < m_gpuObjects.size(); i++)
+      core::vector<const IGPUAccelerationStructure*> ases(blasCount);
+      for (uint32_t i = 0; i < blasCount; i++)
         ases[i] = m_gpuBlasList[i].get();
       if (!cmdbufBlas->writeAccelerationStructureProperties(std::span(ases), IQueryPool::ACCELERATION_STRUCTURE_COMPACTED_SIZE,
         queryPool.get(), queryCount++))
@@ -1369,12 +1494,12 @@ class RaytracingPipelineApp final : public examples::SimpleWindowedApplication,
 
     // compact blas
     {
-      core::vector<size_t> asSizes(m_gpuObjects.size(), 0);
-      if (!m_device->getQueryPoolResults(queryPool.get(), 0, m_gpuObjects.size(), asSizes.data(), sizeof(size_t), IQueryPool::WAIT_BIT))
+      core::vector<size_t> asSizes(blasCount);
+      if (!m_device->getQueryPoolResults(queryPool.get(), 0, blasCount, asSizes.data(), sizeof(size_t), IQueryPool::WAIT_BIT))
         return logFail("Could not get query pool results for AS sizes");
 
-      core::vector<smart_refctd_ptr<IGPUBottomLevelAccelerationStructure>> cleanupBlas(m_gpuObjects.size());
-      for (uint32_t i = 0; i < m_gpuObjects.size(); i++)
+      core::vector<smart_refctd_ptr<IGPUBottomLevelAccelerationStructure>> cleanupBlas(blasCount);
+      for (uint32_t i = 0; i < blasCount; i++)
       {
         cleanupBlas[i] = m_gpuBlasList[i];
         {
@@ -1410,16 +1535,17 @@ class RaytracingPipelineApp final : public examples::SimpleWindowedApplication,
 
     // build top level AS
     {
-      const uint32_t instancesCount = m_gpuObjects.size();
-      core::vector<IGPUTopLevelAccelerationStructure::DeviceStaticInstance> instances(m_gpuObjects.size());
+      const uint32_t instancesCount = m_gpuBlasList.size();
+      core::vector<IGPUTopLevelAccelerationStructure::DeviceStaticInstance> instances(instancesCount);
       for (uint32_t i = 0; i < instancesCount; i++)
       {
+        const auto isProceduralInstance = i == proceduralBlasIdx;
         instances[i].base.blas.deviceAddress = m_gpuBlasList[i]->getReferenceForDeviceOperations().deviceAddress;
         instances[i].base.mask = 0xFF;
         instances[i].base.instanceCustomIndex = i;
-        instances[i].base.instanceShaderBindingTableRecordOffset = 0;
+        instances[i].base.instanceShaderBindingTableRecordOffset = isProceduralInstance ? 2 : 0;
         instances[i].base.flags = static_cast<uint32_t>(IGPUTopLevelAccelerationStructure::INSTANCE_FLAGS::TRIANGLE_FACING_CULL_DISABLE_BIT);
-        instances[i].transform = m_gpuObjects[i].transform;
+        instances[i].transform = isProceduralInstance ? matrix3x4SIMD() : m_gpuTriangleGeometries[i].transform;
       }
 
       {
@@ -1557,13 +1683,17 @@ class RaytracingPipelineApp final : public examples::SimpleWindowedApplication,
   } m_ui;
   core::smart_refctd_ptr<IDescriptorPool> m_guiDescriptorSetPool;
 
-  std::vector<ReferenceObjectGpu> m_gpuObjects;
+  core::vector<ReferenceObjectGpu> m_gpuTriangleGeometries;
+  core::vector<SProceduralGeomInfo> m_gpuIntersectionSpheres;
+  uint32_t m_intersectionHitGroupIdx;
 
   std::vector<smart_refctd_ptr<IGPUBottomLevelAccelerationStructure>> m_gpuBlasList;
   smart_refctd_ptr<IGPUTopLevelAccelerationStructure> m_gpuTlas;
   smart_refctd_ptr<IGPUBuffer> m_instanceBuffer;
 
-  smart_refctd_ptr<IGPUBuffer> m_geometryInfoBuffer;
+  smart_refctd_ptr<IGPUBuffer> m_triangleGeomInfoBuffer;
+  smart_refctd_ptr<IGPUBuffer> m_proceduralGeomInfoBuffer;
+  smart_refctd_ptr<IGPUBuffer> m_proceduralAabbBuffer;
   smart_refctd_ptr<IGPUImage> m_hdrImage;
   smart_refctd_ptr<IGPUImageView> m_hdrImageView;
 

From f261f7c42e1b1465e225b8671ecbdb97a8f2385b Mon Sep 17 00:00:00 2001
From: kevyuu <kevin.kayu@gmail.com>
Date: Fri, 24 Jan 2025 16:23:42 +0700
Subject: [PATCH 007/529] Add Readme

Signed-off-by: kevyuu <kevin.kayu@gmail.com>
---
 71_RayTracingPipeline/Readme.md               |  11 ++++++++
 .../app_resources/common.hlsl                 |  24 +++++++++---------
 .../docs/Images/final_result.png              | Bin 0 -> 103835 bytes
 .../docs/Images/shader_binding_table.png      | Bin 0 -> 8569 bytes
 71_RayTracingPipeline/main.cpp                |  11 ++++----
 5 files changed, 29 insertions(+), 17 deletions(-)
 create mode 100644 71_RayTracingPipeline/Readme.md
 create mode 100644 71_RayTracingPipeline/docs/Images/final_result.png
 create mode 100644 71_RayTracingPipeline/docs/Images/shader_binding_table.png

diff --git a/71_RayTracingPipeline/Readme.md b/71_RayTracingPipeline/Readme.md
new file mode 100644
index 000000000..4317be9c3
--- /dev/null
+++ b/71_RayTracingPipeline/Readme.md
@@ -0,0 +1,11 @@
+# Vulkan Ray Tracing Pipeline Demo
+![finalResult](docs/Images/final_result.png)
+
+The scene is rendered using two ray. The first ray(primary ray) is shoot from the camera/generation shader and the second ray(occlusion ray) is shoot from the closest hit shader.
+To test intersection shader, the acceleration structures consist of two types of geometries. The cubes are stored as triangle geometries while the spheres are stored as procedural geometries.
+To test callable shader, we calculate lighting information of different type in its own callable shader
+
+## Shader Table Layout
+![shaderBindingTable](docs/Images/shader_binding_table.png)
+
+
diff --git a/71_RayTracingPipeline/app_resources/common.hlsl b/71_RayTracingPipeline/app_resources/common.hlsl
index 50306b516..a35bd3fcd 100644
--- a/71_RayTracingPipeline/app_resources/common.hlsl
+++ b/71_RayTracingPipeline/app_resources/common.hlsl
@@ -42,35 +42,35 @@ struct STriangleGeomInfo
     Material material;
 };
 
-enum E_GEOM_TYPE : int32_t
+enum E_GEOM_TYPE : uint16_t
 {
     EGT_TRIANGLES,
     EGT_PROCEDURAL,
     EGT_COUNT
 };
 
-enum E_LIGHT_TYPE : int32_t
-{
-    ELT_DIRECTIONAL,
-    ELT_POINT,
-    ELT_SPOT,
-    ELT_COUNT
-};
-
-enum E_RAY_TYPE : int32_t
+enum E_RAY_TYPE : uint16_t
 {
     ERT_PRIMARY, // Ray shoot from camera
     ERT_OCCLUSION,
     ERT_COUNT
 };
 
-enum E_MISS_TYPE : int32_t
+enum E_MISS_TYPE : uint16_t
 {
     EMT_PRIMARY,
     EMT_OCCLUSION,
     EMT_COUNT
 };
 
+enum E_LIGHT_TYPE : uint16_t
+{
+    ELT_DIRECTIONAL,
+    ELT_POINT,
+    ELT_SPOT,
+    ELT_COUNT
+};
+
 struct Light
 {
     float32_t3 direction;
@@ -78,7 +78,7 @@ struct Light
     float32_t intensity;
     float32_t innerCutoff;
     float32_t outerCutoff;
-    int32_t type;
+    int type;
 
 #ifndef __HLSL_VERSION
     bool operator==(const Light&) const = default;
diff --git a/71_RayTracingPipeline/docs/Images/final_result.png b/71_RayTracingPipeline/docs/Images/final_result.png
new file mode 100644
index 0000000000000000000000000000000000000000..af1f2b9b88c16271ba23a15333ee4ea0e9549d9a
GIT binary patch
literal 103835
zcmb@u1z1#F+cu1WqDb8aN;oPdBB0dJqkte%5{fh^DGUuGF^n`yN=hr;-QC?aFbv%>
zbTbU!M#1NP?&p2K<Nf~s-N%7AvuDp<YhCAco>#2Rm)Fu_L{}-U;^E;DiN6$)!^69X
z!^1n@LvRUrN2&r;2>d!{Atxq;m(fAB0KB-U_gv~Z9$vODAyx|?czwn6rIH069&zo-
z&$&jEByBvr<9KnA=L(<Hmyc|+9wXtPPq>enC2wGeL)>TgBP}{SC)$TXXc5jAuO&S5
z=ukAC1xBry^*kDSG>w=g_bf@frEl%^ZmvQ0so+uux0Tz?ADAxrKfWwX$l>;qEbgT^
z9*h((drF{ecSU)3z&v%<9$_P0k(Sn$mR4fef8=m=pKUWDfFXCWzdUo%Gaxi{d&WD1
z6!Y0DGC<2A<x=HcK5T5E3F4f1XD#&&MElm1A)A8}LT>`Zjf1abtSz-75p!$#G^jg=
zJI{^U)~z<L*-~FR;(OWcwqI6-+|rcT+_iWO>0Ms6Bg}j?phEiqMYP9h@fvY%73@%S
zFXpIWtW!ipgd+nDlX94obTWwyO^iCA(76BL^oY4qONIbG@%y;$Q4JzHJ-RQxUlTh!
z!Qprtdv=0M>fWXQf0sPm?3346Le0FBkmy9wJ&38*F<vn%J-ru)y4+<%<tAUH0n)>R
z*j)y`n0pQ9@$1kxT368c*U0O}eSg32;h}&2=^oA5nXqTWB@JV-zJiZr_}5+3Ja^mJ
zJDV&z1@w^gjfxc;HzO-o;j6toWKy@(r%u12VW>b9d%rC_GBPN7KQ#JF<ZH;QdH2`t
zQhZv82Up&Y|5=F3{RJ}HM~TN<(JkJT;ALgjz*vWO1Pi1o#Rgy@zOxa;h!@Zx_YRN7
zj^U+!Od0%N2#=hcmXcudue117=S2ETXH1KyT4ERdFBVc-tNghNgZehUs)1`oFtS!+
zbJR99sb~1EoKx<3Srx}iFmM_FkjNOUk1(ql);hc70ZM|;T5$}qyNZic+i*dW<Y+!A
zyndhh;MraX*F)N~kD(0sZefphOpXh%`_ae6EFXNfYumiHKJ5LO*P&LU6Xerka$?+&
z(W1SAtkw{>yWX|qKkExw)?eF=T(uX&t&tqRo*O3Pe{<{lKtM|`fk}8n(jlQSdz<wc
zSVZ{f{$}8oRcwa|iW9K%gnaZtT3fq-F<{tVM?<TR&%ewT<Lvl9Lf;Ps@a{VmJLxF+
z&Ye<G5le1<+}?G$RDq9wzLrqze-W(zA6!De)44J+8pn({TmJ_!BiV`7pqdJXXsj3H
zDfSOkQPd~?sElN3R6d&#7X|r2{S~**8EQps>6Z_`por19m;%QljE6@=BM2e%F<%Tr
z@!J=>Kk+<4?g{VBCYU~)YpL0;&e1Hsp6?xyM}6k2-9>y)fct{ML^K9`F1FI9cl-<f
z+?)b-`2leCEh<N_*SNJo;Yi1+p8>{csE)!%5DeDDE?L{=QH!mfeIc36j%8gm;W2tz
zblA1E_v@00pX}}*6mhWZEN4`Rh)lV`w&{LNKE}*i(|G0izo6m%KeF+^F^s<0%A(!Y
z#&zp29&T3K_@hq`e@v>uzx;xbeuP=w-zbaMlm^EF$68o3Mv_-<y(ksoh~l1H9iQ}o
z{5A!J9@YB4M*AP6X8A+lXv<{i_s}0apt$5*`E;GK5|?VtC}1>|kR`jMzk7fFs=J!3
z&e@E_Z)3e3WzTaPZSHFvIzjLcZ|D2`MmMmCmz^swbl{s=7V+;->tV?pzJCyC^2WR7
zdtNg4ZvM!<YK+rxLl9r7?AeeOp6v0B7f>G1c@I+N`|Oa{Jh|v-{EE^C(`WtN$!Vrq
zFklBPd1hR1w`x%Lxta6XS~aYNL?Td6HgNpR0hKN#g8@R^>U4a0@M}1F)gF&(VCeSu
z>j<R{ig#+=EUV#KS6`)^A|VFzVGTU{>D7uhB^_xk7s31`C|}D<KIHC0{{qT<nNtvd
z4Y65F&a}dqr{(9*O%LfsXB!^*s5{iKK*!>J5gjb?o0B0G`XTi43R&b--tGrQ-8R+G
zTBvAxY$R;kbu>D?sEg&@-D@qE@`)(&)B?x3NYO0G#Qr-W1;jmfm!$2m>kM&k?fNoK
zk>*Vfh|J+*Dy;YQi<XBsTQK-%qQlYgDTVb6kN;39Vsvtah<iJC^djHFpHF$S|BYqB
zp7F6w`g-X9^#?cqIDIBIfD!3QolkM|;6coly3&KcFzloKyW^9bXM*&<iaw5SQ4R-%
z%YXfzTlYlWxQQgTA@evz?G?#z5u;HbFN%^N-KqD9C{dZ{$QjCr=`SHZXa?dD58LCA
z72`71g20lOZ%+qAuQRxrpTdCS@g>CN2*ODZC-lpzqGwMs1T)ZIQhk{~v_)#B!$szL
z;KftW5>o=7L|Y42#*x{@4s_#$k~sA_G_wRBzKgOh<v8^GYn1pz#Rm0yaBj(`0ODN+
zq66yJ1UcwD`R8x5I<>rVA_-^RM#yO~tjTBNG=9GZp70mF=A$fT_aRX%<NiYnIPtKF
zX{lC$6~>)t!RfH(Mxer(CxD;sQCjfR!8Y!rd0)m3i}D0x_}$|$#63$wzo%r`r5$Cv
zXsA|7WeCOa*z64HEh|^XhD2vc48s<tqH`q{%J$+}QSr?;ZZ6}CP5l%!wp2GAHVp^-
zU``GG^H`72@$DxLm8{3NKfUm_^-LwXtRs79I$7X}(g=)$d)rd^o#4{&KPgvhO)Rf7
z#;H=={j+iDPqez`S(G&8m3VM!JF<VoNb^;ZixQTDR1Na*Mtqq+)Y$O%&`V!T6i_Xx
z2WMD~G5fdE|Gf--Qq+inH<TXSRV`~crNVzenra;!BLO;&P<u63&kFBmA={qxsC4$E
z`L9v@--mtbQzjyIX5N)!d)pfQ`*R2kAumjo$7RW%77G4I4b|x5#Qj2WkCcqy{+&`0
zJ6GhXyaSUNHxVhN#@;vbygF1fV}_|Y;T9ijQPs7^A123V(HrnNc{s^EH$x5rr{`dN
zB4vi(>=RS#Ts?Nqb*)oMcCzKNILa9C1#0n$yP_9LzD(RbC4(1QYp`cpb<yFsOQ@x*
zO<B`5N$1ZCozfR?1VGfuvNHJ6Cl{Z-`GEJ+ZLHL7A7WbYqHpZcnG1kN`(eOD6*U<6
zQO2*;(DP-Ot%bU6saVRHjEd@|gBST9)(Y8yb^rY}6nfWiqItvBAnLoaz(3c{7(_qf
zzYkY|Gl0>6xw1qpO8iFjl-HR-PN=8L#|v1yFp)oc_3x+uotgd~2(JSIeCP$+7AX4U
z{a>q|wPoE2WlPT)l%Kc=T}&+i1!9)}XW*SE-w7196SeB00eyOM)R<pkPoz`L(kbtV
zT*DD_yO=~D=d<qhKG9ND9gY($%>6^EyxhR@KO`UT{$w6|;3VG_Amfmb+4g1l(&QrA
zR^gA(F#rtre()R2K&UcKkf+B4aeojHvio_2#^l87{)<k-Y-bS_1p-)nVs7##*Lr-*
zhZgeh<kU<-vWI6vw_XEK>T-iYsW^OtW5W5|xsj~<RH(}gjoUX_&XCp&&~Dp$Ai6n-
zeoGkMH!r~Ohr;MpW|I>u2J&082L_@7^rGPn8QXle`&S}04)*3OqXs-R4r0_|XaVT)
z|3zXY8@m~EzFM>NIh>bjzwj`lqU$*K=FP!k(kD~|B?Z;C3{+HZhu%v8{Js#JOW#Ld
zAV&Um!~mo8)sKFGkFb35<X`?He;V=phc4yC0=(0`_9<nxy8i4#{|Jjv-%gk&z~c;H
z|2X~Md;DmBh<-GHsLSM;XK~X%a0rCj7{7m!i|icLI?VL{U0V&YnCufQ1$7>y$d^%4
z4`DUirLrN~nY&=doka?S4CF|~dLiK?epZW#tTQ|f@VmJIPjpv(a_i=I{8p!>lPmfJ
z0~U!r%dg>XZ()f39y6@o<?(uc+nabo{nINZW6Hw2Z_dOfz`c6xuK4wX`&3@Jlvd(9
zwunAL8UT;q30GszmyKIb*rql1pOQ?sHXs0&RmE8>O|GyB^JjKoSNvfy=&28j1f1}c
zKgT0e@S^|nHtwOa!hA?wKJ5DG)zs>MLoi19+pQ5%9-O{Mh7T;T9ql81LLVQ(0e7)>
z%v&JzAM!q;zQkvvI2PAyc^Thn0@`aVcJyzQ<6+$-ed&LQkvaevJ9lJGNZ~AY`iDmR
zuSP|fmus+v0K^yzS$CE?w*0u8rTcm98A&w&<2Z}3AA~(SW6S@<b!PQg<z6YoIAmu`
z{Vw5-m0uK}cj8H2RT|ilh4~%-d+3*8l-K5J(&PM=K*wXpgDjYe=g)`L@3yM1cs9s~
z`K>fFKt6<6i}nf1-U6T5u(}|X21jaIZ18$bROB*A%?t30^ivacSLf>`9=O$37Jd7i
z@i^Hbkk=!1tb<F|%1=QoPfa{_#VK~920Gai`FvYz!n%4}Q)9v!ICaW*!-XcuJqg~C
zn_5bJFRL~8pIAu)p;Ot)wMC0Q-WbeRN;7v-W&;KFES5*6gafNCnuzKnSKok%Q{C?F
z7)FNM)jfYVdokqP{jhL<m%?E~1K)u8ETO?U`cu(D*<5DZlb96kE?&4NGzjbR{ia`u
zFNQs<3B*L^>s3b+)>_N;==1vI(NKV>woiwbx5*|Wu$`Zw=96Er6TyGD%x;}N7B7Q?
zms-kmfAYL~6jM|6{G3+SSQ0d{=!nN)m$Fc*2;OJ=8g<k?u{pVV<O{f9(!Iz0kBFf$
zqAjk4R?r$uYypXJ>GeZG3SEuM^;5o(`}wF?YwzDus$aIO41`AuBKQbCI+nCy=ISHN
zb_MLhSy)Y>_W^b$Qo!seynu|xrTQfw7*p?2T0VM~*u~eLz<0<Pkjmkc|Ni=N{fHBk
z{YeX5_oA6m8fReoze(>|tO49G?Ac|my+dz}<2Ps3C%}%isQht~YBf`efs+Ua$o$+~
z&(usLN{xEmeXT?4+RX0>0{`xGR75U@b5NMRwow(hmU1d14<7Jd+o9VNei*_*L8@cs
zU*N1MaOUsbTKTEf(UdlGPJ*W<eeeK9X=nYnl|b<w{@Y7|!@sav7srhwE>W$7oZWgp
ztA&5N4_RW(Gk`)_Y%J$?4(2<Eu|l*$JoMiwYIqzE#T;$#P074W-$_zCl|Tws{{BB-
z@Q<q5+2R4;|BuY)%ys{x(DvV6l9yUNW`aG)+>tz~p#3Q~hH01H!ruRJh|Prm)O)Xg
z<~AaE;GHc64Hv|?C|(KU<7DDX{b!6xMr{pTQcUJxHld1s`Rs9HKToyopJiJt1RU9E
zLQNI#<sTdInaD_Q6J7fJ=Z(BW>+RXQ|9T1eFT9|;Rc0_=ePXRsEr2xxePzIXN%OTp
zH+S)Y5VrFcCi6cEFGKy~ti(=ym#jtZ0{Ig8zW{A#{&zE~{C$DCDlQ`U+yAxtus<?M
zM)r?9_J6b5`fr)Te<0>pVQ#%K+D{oH3slp79USW1K#F{N{9i!Q<)b>aIzPj(+%sVf
zr1(gW%vLFT=%lL#B`^2?F!+pbCIzQpvMO1!C~0u?GgdSI4~dC)@*gpPwK!nFEzfyx
z{YFO2i1iQWUujCd#DW$j*wtGZ*5zgHCEjV$SK>T-yjJH0@qUEm4~Rm@jabKDNs6-H
zV7E5V^ZM(nR`~c0okU;u<{xueA&sZWX5JXF(Hi*GcPFUev7yFVq9!PIfSMzgYB;G=
zAbI7-^TAz&t!{m^=#}G|P7^ux@DaBs?NZK8;8UEU_yESX_<41Eaw*5I0h2F9p*6P{
zgQ^SSb6&iB#T{V=FV2ZOkX^i&;SdNtUG|0{$7{F($|_p#3X1%dd*e%NgGn?;{_4Zd
zd#Jj|oUh}LxX-gben)=O9DfNWBnPSfKy+7Pk0&nO!fE^YgcXBSG*I@Z-MqpM9u7uJ
zs$3NdljBAOVuj>$*qOh1L4{8YU1~h$1i8pTAmwy@-Aeo~2Q|?v`!!u8VzMi~nxy2!
zGo%&8jN+ewsx}G~YA%)%5@tRbn`*v3_jYEbGrf5PGU9LNkiZPHk6|uEq|p`H=dq9r
zmOR#TSd`+~$p&j5OcchMS}VmB>DZkRKulzTNi`^=)~$+;8xt5;A(}qSWUn?^jI5RV
zlwK3la_NfO7h{2k3;PWH-Sg;qK0vLzs{@*0I*ae+JM3dvc8|Ra`1jVs(C|a?V$7Zr
zk0KvKVrt@y_I4aqv>=OEirDmgpR7{aV_SJEQ6H+-)@(kc_C&T~+(wmzKTl3u_b*1^
zTYeU`$oG=_`+X9PkWXf@7o4BR5U$^z_3nt|Dh==#JtCqY1GuC#)>=$>1?L*fzZdYP
z*uM2n@#~^{iEUFwHjl#^BW&8fi}mv-u*u6cE5=1I*~u3?jyDpYn~_vX$hny;r4whp
zF4a7C#Qd$8Zef=DsvlreeTM(KUeUGxw^S@%UH07DCtlCts@&JnTxi|Ck~|=7IHXv&
zDrQ{CAM#mSqPVtOLv#E(WOlDMbHbsFi}a?x>xKLxKjyM@!|TSNe7NawQbq0L&dnQz
z(Q&u{pATJ6iBZ<yp?4A3E6M$L^9@Br6nKY%2|+0TDZ*Te6{%3MW`6V8Z`rlmytCOl
zccGj5p<>s43qr$2`}DcWY--z7z^%&Z!5pZBYf#DuBdK$Ag(XSwlR8JYKC$*1IK!H6
zk5bQ$3`=DGS_7FhZjxAsuNFvrLWm^NO9DI9B@vamkHpx-EIBml`M=J7<0Nr$U|FlJ
zCBd%g4`pSNksXAxwUHexltByX^7{RE-m36!zhE3G5@s98<`;K8S-P=x`5PzVS>GaU
zHH~EePy3YKno*byjA~hOs!aOkKyXO$$&xasxR{Z0n`r3WnBRf28rMLZeSf$~T3B-%
z7zwuem<En39t|82l|XG57(q4r##M57H{T4k7%vK*c$_{rz|{2DME8rooYVpI`ON!$
z&^Y#R^Cbh-^3@`Qt;r~VM23x~3sA%-I(%o6X4q9&kTWZ@l)98tl9OT)Ik0SBpJ%0Y
z&#5Y3${>w1u6ST8!JE>Y<~S{neaYJQ#N7f1e4-ZZ1~Cl_X#q}|Qe`866c@OkByU>5
z;Y0qzQD{GTb_VTge$!ZD)y_^=2Lh}qH3Y^Cp`Jl*i0*T8Val{xve1KpZ+r8ewyxx9
zM2vbSkFW{kl^dj%)2XPa^ekA$4P#o~>Pf6_dBSzfU-0-JJ!RBQusS}uJ;&|eZ4p@#
zudULx>sCBm79zUs7ZSHisyZV(Jio!MHs67L`*%{(($bqYdUDbpXEybC)2XRfWG^#+
za2R1yQBqz{?59l4DJY{ilrYSx(6&|H5k2l#l&sW+TUg0+1%tIM@HcPT*d58ljZ7F+
zp4`;{(*zXXW5nx#4Ak=BOo{nMEt5H}ri_Y_YJqaW-EdzESW*bur4}?%n4R3)FzeP)
z8;jc#y(^U~RZBTD=ev_Duh+U|lxUoo8jE8K_42{iCRIvu(G1zYO-NywS*~%X3oo`Q
zd3pkn7axFV^M*6fVxShyC1Gu?w|itBXDgD>e&Hv`c=FtetwVcy0tYxeCp+7WearBB
zvT=eGE1sp}z4MUAQ$YOTzze=!Q;|L!46`-S>{%m8Jg{yXB|T6oR+V<H6e_;g=p#VO
zKtA@t@B}gsiBI#GH~htnVywo^mRibRXU`S6Mf2su;D%~%k&4^&)UB<pys*UW(IRW%
z;Nalt1}k>34tG9z;^riWMnp@g`A8JXt~P4Gkp6L|I`}B`3e?oXw?VWPbo2JLi=59?
z#{&BeQ#THnI>pN!)O>%5^5kg%D-dbDvXRpks7o8&yrj=ZbW|cWqz`RE9fYY@k_43M
zf`kNJ94NN5N(yM1z-6}OJ^*x=QhCLi=A0^m4Au7=4%~cjwFH$~99&$}c4LP!v$vt=
zey!qZJwT4PkUqEfMR~AB;Yg5mZhGwMqM?<NhDu0&5GVCSVztIN{au1%4!p98<*}{{
z6ClxDe6hV^SvWWKZJNCTL--L?Nu~7tuddv~tG^>8|4<dhI6uah2?5=Ps}l$IfWHzQ
zk?!FOLj>~4wZ^=|pGTKUWD*_@WMnQh33~0QXec4&Va~|q;VTgA1ZbG9CAcsytLT1E
zZmOJM?t}R2hI0HR^!xRk&Fw}W(}kb{fS9`XCfWr&Jl?MXF`wnTY2SgFx>^Y~#i#)D
zjK{^s(j_Wt$0*y|`*TjuByZ=tQ`*h9^P67Q=nGFSo*3D>fSjzgI<V%Q_uS|bu6!&n
zZ()t`Yx1~3PAR9Z-_$3h($ep9<0_B5(W5qr46xYKOaUAYrLxn}tUGEHW|00ceLKf!
z<U6@RpppBD%OzVs-m^Pc81w&CxY!g4A|ixy%O3GnmuMCT>~M<?ftIb*ZshrXBv%yl
zisYBp>l559@gZgf>K2yo6jl)uJ`qi?@?ZDM!*wL$dRO%4-R}XsU|Xo};;^KSH-8>N
ze!#gnx}kp2q2tSSlGb<$h%S;|#UX*tqR^;mD?7L^AtCNN>#O^Ue)e3MKSAm+G|`=E
zkWl5@QaTLR`O1VmW#&k*%Is>g4z8lDteCM!J^WU&nKGLSeknO1*<rModG`oSK&qew
zmh?!}m@%^ci(!E7Fs*o_zrsnW<z%qnlfgc`QJ}q&9imgf!CY@7n-iI4sZ6~}fe2Uk
zbFb>ehZ7t<i6OE2PP1mQ6q!C@+>?F%RXmSPr=5d%&QNo?lhpnv;aeudey*`}$-${|
zjBn!<9vaLi#$v1NW5!S$E1#D|_YDT!Ka)6WXbP>lho=;glAfCwSTt+Si1b^2JG6A-
z81ywha3P`BPE~bZuF5!V8_>G4Uei5rJtqE)AOUO<NXu7AA1bYwxs-Tc9WKAPWTm8{
zwrMi&xR2ThQ*Z2Ko)V?r*6Ceo2y=5hBs+K>n@t-Y&SVYExj`OozIdV4ym9J5p(;O%
zo=?=bfK7&(ciZ0|-yqkf2gk~DDh9o(crYWR^lIP#QqgVv@2%6cdrhdz(lZ^_?f#)z
zUX1d#5@ER-A?=HW@!=EOct8eAU+;s<lqKhX;VLzJ6RC&Bsq!Rf(LpWz7GsJkFI3rA
z9^buW)<`jP@1*b~mD0BL^`**InEmZ;_bw9HM*n!H<S*%2c10|;F4?+|9=uW;08|Oj
zLssVy3dN+Tg<T6(RlLQL*%IHVV%}OGYd+g_her^udTY~@P46z}=65#V9Ge@<wfJ_t
z=SlCRg_=f+N)qT;eY{Mc>r)Cgq-1Yg7<u1<3N6%$#~ugh?u6YX1V-XrC?LS_zK`^(
zqa?cEVdIT!P*o*`5v}nk_Suul{GFXuN5B*f&$3s%cWR}0xXPDyC!{L~j!nVpE_S>c
z8%_`xhawAB>En@B!+km>C7WIr?@HBlW(JT-Fd^*CblbOHbNkt%$J4hs)>6YR*Hrl1
zj4OK)90GkkYJO7mgy>!F_QIi_-u$OxNyF?zY*0hg`0AKcWWqvXe_+sxqRyKN2DJ!b
zNx<MA48@!w<kv9Tc!4*#Fu`$0)Nv){c`67JyQ45~WgzwB$j(`!IKM!|zbwT^iq(n(
zB^;!X;~A%gZP2me)mcpw`}g;QmwP@jv2e`PTppON*{3l#pNM-LcuTv1pLFkLJI(E_
zb2SyroB9)uhC36wkBbN`Vkgv`%{FJ%LXP;J3@>plN~t_4guQRD#)u9RUTml8{W!&E
zUcSw_U}>`m^f0@NP}$?GRM)h4sBlD-m*`!k#NkA!&aB3#20M$Z=ZFstr9?z5{j4mY
zLm%z5d<os3^q72r3SWRc{;(nh=>rAIIy|SOoI)sn+^<VOA2_;2@S98C(z~U@1zz%9
zt7R;EPJrcDX7?2>{0R{2IYRn~j1l(C@cjlin3!(v3hF@X2N9myJ&&1zgoP2w)NRG-
zS4XWq3k8@_0HOL3%L{7ofb($=Wq-;RK^Q)cb^MsRijt1*Q12shf=(5Pr+EIwPs~ts
z^kdG?l1jSIy2An4Jy8#|NxS2gZh_-14#NGRI@=tbZ$}96D7W2=GN87wB{jqOg>uYJ
zHL=6!F%CgHWYm%29n$m7EdpUF?9&Mf*<Y|p)+smKn?S*<$sD@4NG>`^ZB}jHq?9tz
zj&?&>uh4r<4jUQS(wxIk!?V4k77b_j`08EAB5cJp*%$Kh_(om)6rI&A2!eX<@NgcU
z+oYJod?N+cntL!CsAK5&05$a(aU`SB%cVi&3yKGr_5zsVSiTfw&oGV4vJQ0qr}Ij7
zBaW$g&eIxde`7Yx9G7jy!wVC+l|UZ%c1;KZ`^Y-ZYj=BQ&fS4})pArbD-j5x#z_+Q
zQRWw&S@t}Mj*Qmk#xB;mIeafY(qp?58eSnR4YH{W2HdCn(_nA79WWXowQ=OWlDJ>8
zRDOT0*I<MYX9}$}BRIMjaHrBYe`q6^3ZW@JYiPly78?BOXN)6A8uMasHe<}a8y-F@
zyE+g*nq-I6<mg>Yffdg52fc|jCD#o!;8|%Y)1{YRUFk?e8r3aS76r$KK8cZ^<DNl2
zUeXTc+~aI*FR{`1zAI!gLg<pIYNMXPT504Pb>gz;ekpWJZ3>gEq9UyJG6btM(${M@
z;*n|g#3%6w%P8W-Y+|BL3I`oPbgO-Y#+IP-kWDZR<!tuomJpwd4aLg+<=p#YEiGRm
z=`5F>DuXzWiQ5O?GCUq$;(oAYaX*vLeq_2vFk;bL9I41PGZS+BjEgeWHUe-}%eCJz
z4R2b>9U`1%A82Jo8Qg2zOL$b!z_8Ui^U8mr{KlxGB2Vdq+}vLh(GfxyvM#;!V%}ZW
zfnv`J95=dEBM1(YsTBZgWd9t?DO|3325uo4F79@$&gvxj^mK>}_*`glX)CMr6=L<n
z&d9&YvI`zT_Hdx4b7hSu)b`?cC)e_&PGL)raen-WzFp$s0Hy8aF*BZWWb#`XRQ$SM
zdY<;gWLV#;{P3R|V%n*>bm`I(6wuAxtc8&v@}H5H5VsffY6ULBja{PM98$Tq2qg({
zrf|`S(bU8pQ5Of?yx=Cxch|ABVC-IND{HGaW#+H84{y`&iVwBiolV)S-iI#@bK_xK
zQ0VvMp`84w4<A2vvv|9s(iL@5-hC6wxEE?HS`wKts$V-2_aV08{?gLu%krqIPM@7#
z9Tx-_p5^msH~324=$q9*^^5idM^4W{=heM7>If=0o@)exfGu2cy>~J{fA{d1fATxl
zv-FrLoGtH#gH<Sqy<>}uEvy&DTctVUc}wZ)BVb5%q9bL;E|U?$BaSuUCU<$5B2wGN
zMoCx@OfsM)v^QrwJ|VhW{LKMJQ0eWqV%V3(n$-P{?XdD!@HhYC>>feWNl}ZQ<0cWt
zvt%5coF*Z}6&nk%`0x3!6^x}-pIYuvV7s?eG7^{$gc7S|t`!a0AOWvd5kOQMf!`iQ
zDkhx-5WW%<FwPjU70Wt&(njD-I$gT=^844-_wt7F=%(mGm?Yxa^pL~_lEsWOp$F}T
z2NPB%9NQT!l$4aVA6bMT)XjxC35gpelveZkSkk@wq<OJeEG6vT7NJ_L%!Nu_Low0a
zdL^a(Z#*kEOGRUwrg{CDxZ>Zb1oUjBDBKzi+NgQM*s1YtW0F!X0xo7QFX7Le{))a*
zt6*I1&1p6XI2G|vIq7Mi`V4H%g_}HYBB@hL_byQj*xDGa%!g~d>*=*qn|Ifg7Op&S
zL#RQ>m(w$B5;xprmaLGQ7RdG9leDaK7T<x-ir0j~r1A(_Dq=AGm6;K!y`V#3(}VHg
znL4ORQ?tZs%lc-j`7%1}@m-UaZ&S=l+yjBduFRooNv+G>Fr9<>a5k%HI!5Xvrsht#
z1Y3||JLN}l+TH>Ipvd~76}}NrxzV|7o~FA=WV79`xJ*ITJz=_n#%H`GGYpgmn@!6e
zZQv}_Y{vF8B2-8jXI}^>&S~s_gFaPXDVk!4fBkSc-#OiMnVK;q$G>x$?!(<kYcZ2W
zM|m?!Z#TpT5-*&E)(>fJIJwU4ZXm_ex<7k6eh96<xMRz^D(F%;WTfr`s%*`Wz2UW(
z+LqC}oc%_Lznp$kOhi|&l}#<&Upb#%K8P{-ZHgSDV%uTt<M{W@Sk+tceylGAqmedT
zhNIxw{0`k5|7L5+Xsd5n&V}hwKWSYl=TXVRm{lgA?sV^WXFJT)X7kHx0sc3ck)a@&
z(f)6_n`+IM1o10YJ4)kK9}ecPKZPK|6&~eUV?Hx+(yFM2tOS>5MDx`n-Rx{pEE%ir
zeEX9M>UQ1`NoDdPpXgGROvEdb*mxu%nuL3#aO7@hZyijfqR)Rn7ded<SZx`Sfbzf#
z9yR(>g?te6Y3>GMFzpGC#~sWqHE5?c>CP5Qs4|0#V}EkWB`5hAMn>{hImR~}q<ycm
zG=-~G)f0^zlz8`7C1`ddIA-2FU43GA5unPG{Qx&kwr=R7+k;-*6`<LuRw*$zlZ{hm
zbKNN$954DIB*@w>DqL|qpi-dCkB#)Te`hSfOBbK^vM~Hxd-?@_RI*&i<knZ8o!aDz
z+mVT|O(>UD9!TgJEFD6A>Ak|3?ug{=nZoX_Za8m^Np5`Xktuk#oHrh<uINKm+={j1
z(LpjQ8nu4gTy4qDl~Pju?(bS`6B->EGIWO(T{|!6Vry)v@pIgJS=b|_b=Fb=<nYYw
z!fmnRsCCfn?9o&vtfVAOK9L!-!f9Yst~-#Mka@=k57?@D(yKKCe7T#uC2V!Skojxb
zu~Sp6UTMzj?G{@R7ewRS+>9@7gSGsb)-|-nL7x4_x63*eJwfcbD&~jpRaSUV!MVNE
zJRe(9Z1P%~bPqpY1X-9!WE3%mN{fme70cU?G(o@zGPQ!>;;zoV;w}=$=%@j^&ns1`
zsgYd0WTn=hbO5y#$fs@WMCzDnDc4r)1M>=vT0dWzoTBjPCS%a6*-cmQN7j&58!Zhk
z!-B%qH<W8OTI!0&8CIypPfNNhAwwIZGX)%l8BL-~H=FgyC6A}f9`TcrVTeyA_w?7~
z80TKB2(WBadp;<n-W1(6@FRuIdA{CLWDL^IRt_?$*z#xQDKZ~Q!y=<`d3P;H&vzuh
zSG@H|#oSg`f`vJ6CWD5#7+R68HE#KniJPV%KdAagvOs^NLn{?b#TimwppgpRvp5$`
zc=V(OATlX=lVGdBXamn)7m~17I<m8Kov8fU<af}f?oIIY{JPWQj&S4$WA$0x*|@i<
z3(2Jvd-&45FUAwKKzC>Lx?SzR7ZeoOGHk+D5`fB_@CQYAD;K!2S_q%YDC3L@b23<i
zbJHzwqtW=1Cm`cV7c*9VmCUDR(P>Q##^n31?ra-fO5YZ`#_t}bM6+S-Ix&T<)C@AH
zeCH){!L@Ut;ZX&KWam2xk>IpjrRK&N%p?&;EetHs7QGeqkm%A_77)*vpPg^sv_d{j
z5PHSOa{1**X+g9=J`&&l_Dmvjg|N+P8YFr+l&rKXdT_sw$3r=tsMM1C%`9@JFP+JJ
zvMvpQ5iB<zYfEeE(``G<d=zB#q9ruvMmvqKSi@*it*e8zdF+nudB{b3DC-)bj0$rm
zQH{2w!=9dV6?3m{Ixa?44JSXe$gxazovd36*qZ?myMCL(#)Dm|)+W8x&ZfrIwx$Ie
zFABDsPtQdMH}v5P=a^?dtUO4Tp`NFNqhUK~_Tw8f({mRc=We=X#Ye2Rd{I_e;TsJE
zRc76XSm>$MR*^&x2a-g5Y}#L7PQGc)at*y`G|#yY>=-R_UTtWXx3fDBK^N8HJLrDo
zRH3*^t<XN3qQ<KYz!h?s5m6k^DCSGm4b(xgy1Ke$l$JMA<bbdlUZfuFvy$t}a+lzs
zz^Kc;+a6N@<ogEsD&UZIm#jhNsq)X=F}&|@v^oNJ6o~{;pd6McO<I-|aj_iAtvlmi
zJm#~0$3z_T(tT_j>%;6Dp_kb<Dm=kryW1`*Pqn@WH{k0yZY(4-3V83_Q-0F8(+WB+
zx&-svi?}z)^7wt}Uo}+=%sddJKS{)hAA^ZXNGH0OCAuu0WfjuzH$01?xU{3jZuZ%@
zfl!mQ52(1A)KYGfU&C%~jUKcw?Q2oT77&*hSz}iS%GE+u#cp+|1<^E`Q|wISslzsX
zl6wMuR$4>FZfqKN^_r=@PMlD0#YTqj;*WxOA6Q!Sl6<l9p{)9T<^B&9B*y9Iy_(7d
zBK&Nl3qoK{N-LYAbk=6w56yS4kdaYeI@viVd+p?L0OIp5lDkx$O^2+yGmgD?LL*yV
z61)B!ziQ|pE_pivI_Z@2Y=5%NOTnDSW+ESH)QsbM%Exqk7=6f-{N_<XU91@OZp8Tq
zDi;ful1T!m;v43xip|BOWeR6@TocgcgMGcuozf=F3K>kj^%-e}IkrbtFGHCPrZv_@
zt@F$9y1a3{S(lLq!$34!ZqukM(c18WMKSm~1IuuJb4atb&cyN?9`n5E$R0jSWKSw=
zk@-{N=6eN=+oJpV>w`I3AR&IfVolcDYfvsOlZ@fruLV%9GQ6S3hmO(I3yBuJl|Ea9
zs>hw)kBth$b8>c4Bo#Zw1(9~%l0dCT5~zTCyg$i~jue=_mhGyF3R)VzWqGU{_i^BG
zBi*B_l~tUew#Nlg7$wM@JClQ*VLCj%v(|<!!>$S*8HA?yPexnxSmaoJV?IB;-PgF~
zjwI%kclD7B;k7MU3k@wm!{Zp|652Pmf**h1kOtGJ=za`O4mW5Ooz+<=C>`~9%)Zdh
zOS*$k?*e})_L*HeA+fWmJGO8rzd5+sS{t{_m-8u(mN1vlwox5jb>V09Xc-Ot7W*mz
z(UIyn`L+EgB_shV1M*d}ZsR7Kx}@{76nk1*Nkm6meVQj72gBiT#GpK!b1N5lT-#?1
z*0z}u%Xxxu*Us&Z{p*W|zWJ~I%mvolIcIw#yglYkKLOj4+uZk0w%m7*7I8;^<MYv(
zUKV=DPoOrHuE#uiVB);^Tw*GNTCY{()4tBWV&!3J4?0m^dR5TsmEiFgnI_A4{}mpR
zPlBXF4>AN3DxV|K7TZP6YugRMSi~sytECFCrC`npLcNdFUwR4D^So=5%e4|^DrH#I
z?W5JgjdJxygK5OB+A3!vKAPCPT1BZe-Pp2qcwQ~*Krqd*SDReY9<HVS>8iUu#nw<&
z*Y?keDC+*ibI~?qd$EV6&ZK-3EY-Z=Rk29ZIn6`j%r~7))P(|fLd*a%a~c1=k$Fd{
z!+zaiP2sJ>x&Y7qeB=<Wi$rLw!*1X;I7!ecxB<RYI1|FE9(X)EhZv>mb;?+cmq8;c
zv5GQ+$v-usJ*K2L?eFEhP=$ZR#T220GME|@b9$h#vMLgqe9{fAilLsRz2bdtH-{$z
za5Rfl!Q@t0Tbp$`B~0NYrUPny;uU^ilgq0nR5XXAY0yyjeac|b#Gm)0DVr5DkvioS
z88B0I4ULd(&RnCGC&fzw2dxR^;idvFVe+@B@%0&~nZNedk0e)q(r~5sy0om+c-U2T
z*+dngq`u!By?8Ry>E9_xo{Z0wE;Vd=I%G6FM2tl{Ub#j8GbtsbYfVChwfOea=Rp8{
z7jF<UNH;<Dm-S|*cE?!j(JRpupNwpkam}5$H0<(a$isbXJDCDp?2Z}QAc~|eZ*9Ae
zq!jeTs#-oJ)qhvI(dg4pLVAI6Sa0|T+JT}+K_9IR%x7A{*$^9HmmFr~hWO|FZ*1T<
z&z6;LPp*<^0d;fEQYix?8^>tSXg|2@N2eqUb~6N7hGz<eOEEByB^h!jfgP%)G@Z`f
z6TN_1I@%8)I^OCV)2t$YH*jV3IeT&M=P}#O&i1G1tWJNRv`F?ll@|zPkFGG~T}9X+
z)CJ)h(wSrKId^|{C0%a0U<FSX8s1K;-%5@bKV(mMMYq4~dt8K#J{}nJx9f=L&WYqw
zV`&}RpBI~{JaX0WQ;~_xiEkk7M25uh=VTTgc;NDAjUmCQ0<vkgKd;eKVhsq2hI-_m
zHV*amjrv|1kq^y~1ldVEivLP}a1%No@)S~}wl0%hKgL`mc+tYx1w;>-&y77Cc#G7A
zwiE6>fjNSl1i>g7VMuBDB?lmfz5qEcK@c31?oAwNUl1hPCtbO}Rm)h-HEof2=w`(W
zwOOx+aBZ$rL0^f8IIo}V?s)pig5&{8++xOy-<*ZPcg#PaRC-oBMZdQE_27*gl(^?V
zGBV%jmqz6m!52W?RvA=?V9yseV<WIunh+Loo{)f$Ow~J|%1fV1cXYI{!l!9AkdUqW
zJTPq0bS~MkiEYzdD&2)sbBk>y(wQBelXpik88U73<fl^X6a!#CY7p3Bx}`V0Jmii+
zX+LUL3kWUt>3hT(8WsjLld3fY=OQOPcOkgD+hKO;1(5TsRCw=|oCO|#xB(K~bX`=d
zB99=Q#h00?-#+3%=OXuo=jTP&io-w66Z8Z^g1sC3+d9+(&23P@q-5Q+|7vpNUGM1B
znXc&N;Smh)Hr#))4)LnLsQ;x}IVnRA>QVAz@Cd(UL@u}AKV-~S!eQ-TX=9sQBM_OP
zQOMamYOx-JHlK9iH(l_j3J=eXKc}nlO^nR!T1A)Hp`kE8Xv`08V1Rb;Vlr&qaLvSJ
zb1X?kDpQW?r<uWK+b&PnxbC9zn**ASo0mgFbgXribw9Gpg{Lfd^J8TZLuqb^&ub*H
z$4=n!u5n_TbM5@M2}-QING+?Q!Yvh&LOEWFG?8E1YwNSPPJ+%RkKH%z^>Nu9yBFUR
z!N1it7O0xBJ4*``YH=qie*H--68mUYWJMq};ILCW-j5^>enq40D6EJpf!}(miVF^X
zaIh)i@53;`ZJB^@`(9!jdmkAbq67UP%&SY?S&^R~AG_z%(Qa^OR>><cXSj~d%IwhW
zU{UyJ-_Y!EsJbGLBO#*~(}dU+t7MXBUFuOaS=6r-U#q%+sKu(-^kqmz_0)v3@r`W>
zReqE*ng2<HyNcgDV?KL?IkI2k#PN<M@e$!jXY<QsZL9rc=2%YHxq!$?u<!mg>z_ZD
zQd!r82LWi8Wgd(b#TGARL<iN^6P|3AqsIqaHs&^<!20jRQt#L-zgPF^)TZ^FfOB13
zJj}Xzo!D`pu0MQZkXd+mE8jC!hBw${%Je{)mRd{%q3myZEy**Ddgk`zPOdXxF3Ayu
zJ&tliYz$YxF+f3q@+y5b^Bq`F)&rVRdMSFOKWd$vb1ghW`Y>_-7U2s(A}J7;9HdF>
zio0!$>(otYHDvbOWf6|w95Tv{edRwrg>FYRECCMMV*)n5#14jes#aioe~tS^9lin(
z4}0j}E@aW2$%Lo|M%yJlOq|PApTEpSS{*IW{Zi<445!&PV4>D7u64V$z3o93T|~B{
z^pCR47d>Vor>;b<#Mkn)4x?9vbH214)D?iXTTD}ThW;f4;kKA1+hn5^m|71`RYjJ*
zn#&n&5~S`&?if#YTM$9eUa>19-)l#T6;05N;nr#C&5Nj%$~4#-r<F0K>JlsWf;XQL
zchb)vrDpC#e`Oz!5FBY5EYNL63;=}=GO4R4nVE<PP0~&FjSq#hGn=cd&5f%pO{#SC
zcvmjc2r&P3{(AazxB{ho@a-o2MO$Up#UGi1PAg0)EUyO?`KBH6Xg`8{l24caLo1Ep
zWuxo2c6TkT*OMaB+JUM<aHmW&7Ryu06>2_1IeRoN@FayK7Q?ZXOS6Y==RFXr<g_bP
z=PPDwQ&TDbDJC%KQ?9$Ra<F_J;)T~&#i~oYAg!!y+sgu!jN^Vs<AHxoMhyGv&dh-x
zx3E?e((XodNRT1d2wg<KeIr@C0JV4Acv8s$1RJXOiwDZC1xrelmXEdgia^X?hAP>t
zzU9YK38Nj1Y_y?hhxZ3x4-faZ?C7V6i+snd6VcgzL9UZ;m!%9Fl#S(chJyD?SBQ?D
zXemVbY~{kHtbRs=aQ)DxLgRt#vf2LP0+y{_fcUU&alp?F{-eSXbZJ$m&0ORh<}Xa4
zQ6cmGirbCSRJnyRnxOPTosUaW$psqU7}?(_XTB`n&IzQ11is(oYz70azDeC|l>WsC
z`gQyO*e!ACWWV_h=ac7062G@LIqR95Bb&<<z7{<3Ir6`Ic#bJUaJIEvgF-R-#?_0b
zvy|WsPAZ8Um60s|$0Q#d^!UH#IUjdC0YfONmzmcGdqvaqr(gqQ5Xu|H5gAi$y$x@r
z--g>1m&p3ju~%v(i0av2uWsvGlFJzJdzJiPCI-gtYtJ%fp#~`*jVdpQCd9W2R38qf
zsUlRb?ere6jRRx!FB^|)1*;*x@31WswrHDg41TD5RIFMZ+@;<Oqm*k&lThcPi#B*U
zP}W~Yh6K;;#XId@GG2_aK#$W|vu(@yNgJaq1(`x?ycTK?LxG1fT!C#2ve!<Us~_o^
zNFrNXL-AFc#%@s@OfW8Z7N>hc$rPiz5>IL20zeB_wvoBFa#@+ZQ--UL^u^n7L4Swk
z)Xw$pkYJY<4TnSM{v7atwKWurHG$yFia%kC)I2q#O-jSR<?E`etE4u)Vq!qHn_EcM
zR4un@h$nwvUmf@CB0(O=&o115*Im^ixguTmz2-S+z!|n!k0!wy4Z8Ex2%>L(D$E!I
zdtC-$>5|`)FH>M{p^;_x!r6k9{NESQ6|0JxyGFpKzN@~?8A{V$hVXx<nYRca+(p~H
zzMu~izKvExm1i1JLQO`MM<kWlX5{^uJ3<BTto%iN>nt_ljd)VBinf1yw>179yZlf^
zS3`Htcix*xo<=wSwip3EbbT6~%54Cj|5AcW*NOq`4`2RX(u4w3V%w~dv=F3nSBbJP
z+M^<mc5w9Zapm!}%=8clTNUi(>pKE3wBOKLwbbGFefKm|#*m3-+gd%C0>9WeAyrq|
zL@!oj>d1e{UlKzp_b7OaxB_qjS;g6BYHO1<d_>#IP=kUpc`IjN6ZYW~Wa;*M<HFvo
zPB*Uk)gMK3QJqa#Eb!PeA2S0qxrHcs8ceIvB>wZ=lg+YAZN>J|3>N{#?DXEMeF!01
zBb}W%`Y^Cvt}QIMJ9@RE(RPK{JNd(yiSDrKe%|yW23p0Wg$f{ZIcc7JtseKCH?5wp
zXieNO$9e2Oa9i5Jr@KfM>jJ7MI*2=306jhBU;#tnJ|5*TQ!3tG$OSjFZ*U#vf~1-@
zR5Lv4pa}k~&tdYw%xy&GN0~vz_v6hm9MtB6L31#b{T%tNX+^;U+2r8Xf{QZA$HfOm
zYTBMv){(}d11dI32D3BDu}L%+LfQ}B3L#6A!_`9C`U=}t9i48N<C|Oz=BqFKFlwP|
z?s)b5E%N2<h!#&-2dGKLbFWaEPiBr@po9Bs!-;cu=4xC?T0_}M202lTFNOj?>MA{|
z4|apkB<FNk)g;NGpO<Kk8Y-FOfgDQj8(BZkz2vYX5W7lq=wQCe+UC5<Q$}+1tgB4p
zDQvY=6YqO_`%AOgw~T?e%KW~0(4Xu|`z5l(?uXJdGZne`p<hawG6P6WY2846H$Fd8
zg-cdC7^uS0>n<&J18zwcd6rcb2Or)4eUY?w$&e&21?$N72Um~FRaKO}9q)h<y&2R-
z`R`?M;n6sQWPBlrPL%vsE~nck43*;7#eUuAi<pSCf(_e|Dj1h%&*F*$TX(bCO-9^n
zXNcF9@ci8Q+rVa>2+faURG7K*<Ophui!8e46B07xdb1Z0Cd&a+jUfMFW4oG+ZtIzN
zy8LCU<XgjXDS>gSj1t+ZTeGusuY@aw)Me)S{q6wUCd&HBHYVVcLJ&A;h=;zc6K%(g
z9o<B(kJed#0zJqoSXCpZFJTOi4qlvu`?H!~yw@Gq{NMHgHXWDq)PSBTn__cQB*Vr=
z@?op_`aAVk_?tu$tWi;<k7)bs$kjnv%(@49E=31LM?*G7aCG(${@2LfM|o0mk+DFM
zQ6qVL?Y22Y`L9BfCx8SwLTwk_N5oViq^JfB&jlvzF58<&&Co{QRz(f_?qDiOB?~lS
zmj?80%EF?6wyN(1M1^L}{t#7HCQ6)vc-sOMPwxt@H<y=I1kdaYSrxw?@tcYd?%Pn2
z>#(-U)66w=_NVUYsDSQNq_hIPIKoxcuE~tCMETZInoRYvPBkP6)GWp1p2yHf8~8nK
zo+e%0j;5L#8=%$G@={Ef4XFe0LT`RdpYBIC0WdRrE_;b;`Wu}TF1n!PuhInPC1znl
zKP@dT-lIRZMa=5a-@f|59G1-|x}qgpYjDw(n|V;u_|B>pGy7_}CZ$?~9h$!YMVXU*
z@HLMjRW5>8cL#U~PEjNeaTeEd*s!onPk(>~O=nBc@sxTjS-)^0xp4kVku9?nYM>7~
z(>9kPr^sN`)d8a`)Val`mb(b`gxJ}Nqd;~2B;2<W6K}Q%Tj7+^4qLwSyo#fWlpn`(
zZvlM&%eRHnQ3HaPmigo)ZT41m%ydO(nQq$PPO4Yca!oVSmAK~Uz)f-R=F9zqt^BWp
z9jsp#a{Vx`ILG{HX51peP<|>2$eDqKl86+JnNP`#+!7!aF5X!8ThQlgio3#O)LJ?+
z7^|NeGHc`nM_Adn);?E;?{5i1rhdHTV<aR5gupvSdlW)g4AU+NE55S_I3*PIxKmOh
zUpeSi{)3yFxtTP_uj<2Nk4;-l1Q|jj_RDo~-sXv+d@MzPK5SXmER-4#N!q?L(agZe
zx<j2De}qe_r4~Qf4nopugaz1ZwX6hrG~7A!ZhB#!=x?nIgd541ADK5UI4QKwO56}~
zeXdpi<AC6y_f<6M8s}FvgPU#SoVkaOFX?^Pjf_ZU*HN)~Dv!)#AC9fTh7d*Jf6syd
znzF$|n<sZ+Hx^O1^}hA+qcFy9;3S$Tf`3k4N~JajeS8LlQ76n9c*t$Hl{AhE8?#$t
z&K&<-l$CW4ar^-kAg`M-8Rh$Jr+k1R23XnQj^I+1Y64RbHdvBe#oYX(JWJoAd1G5x
zL<Wa81<eHuHOB8;1Q*D%NqQD5=pv0r2w#x;8`;1ltsJAIp_F*2d#{(C6vm!cn@^-U
zLa9&`d3h>hIK;5tO_VdRH?u8<a^dMMTWREbT0UJeV*76rIhHMGfDY$`f(A^UM3Mlz
z#_rmTP3;D)4eiDwE^F}Z%FN^kAU+n4h_*rr=4CQ8*ei>p6Vfuv-jCK2B`f$YnUQwy
z{d(g4{Tc;AWD29t%;c;3uw|d7$l9OTD_|?$zK+*kNw^EyVMRCA@#$x@%{|yd4tC{?
zJ1(IHO?nBpMDeU8e=d;6UdUFnCbYoFv0c$4i193%62C@${#PO5B>U=y#4M@WXh6||
z_hMLoi**I+{?0^ZJ*GBTY~WJCLbC6?goMxxYvoY^c5`f>y0oBjh2KF{aMO{bX>u!%
z`3irc3z9w|K`*?5>*>*c{4hGY_vs6wP3L^{qn1~>nzKczrm|rK!xt7+5OgXdsiR+q
zL~BXC;&SeP<-;IdZ5^ug7F9zF>dP#$WR|>v#zC$5LXYndClCxtg|=5P<$rv8(%I(@
zLgK^Y&BdJsUF>;NSVMY0cXq~L?Al!|s&|#1m^Ki|KQ5I`STXLwQO$7K4}7L*EdU)0
ze1#7(Qk^f6{mgw!vcBah@W8sWfBOu^@vrWpiIe@Yw}HMMhr@=_w^He8pzIVl^Hb#&
zo1%qAnr-Fau6PPafjnhZx1@)~!zj3&oWX)$O@5?3RCiW_x<jxox!hJfr%G4J1)gv%
z_UN@&SEu3JwrdwDC5Hs{rk<$S<&P>iR8VSd**^Rmta*`@w+cZ=G$kEDyZsHA3%}9)
z4xTT(T&v+Z?9S|)Mr7105Be1=&KYvrZ1vFDY>vQYPzNQ<BX*$TC6@i=Dc{|Zrr2X8
zm+y1ehv`@H9X*f?L4k*xp)qCqp&J>LgMRWsC1|elM}+{kOL7=<6Uq!7GJ;hy#Vt}-
z+~MCzgNr_AnrLhMkd|41+R48kclg}i77?_+Z(;4-^UxZsEr@vWG2bZ)m72KwJ`nhe
zJOgVcEx_I%7mWhHplclx;L4@-Qpld@KvlaB*S6T!{jA(rX)B*ys!?ahlj?UdrEQUX
z-;tnXsEResu@enmwh1|2Npinx&<v1qEEIeCw`v)&k$D4u>)>}=&f7Qnz3AQ2cmC03
zCa+kim|hlpn8@9Cu5pdC?ITKpB`!UP6E>ZT><RDgwK?}wBYB3J+_uJcv~S9>7ycAg
zi#a#^J}+9ZWUrS`joUD|`&J?oSBq{GdqA>?2En;}oi?*YgAdllGMWH1f8wzDMMv`|
zC8)AYt_%T@K7(eKm0Ar-Yta4VMd8Dw;MR?%cnP`YnQ5`r3MS@Fe0V0>X3_=Q<1R?+
zJL*t6ZF=4DTqi`XrKI95+3-G{IGiFo|0iY=HK8zJZY(kDDaE6b-b!hxs8ig%CPYRa
zT-2@T_(R5Iz|rdQa2CmRry4Ws<-u|`0eD|V_r3a+lD3)57KQvY<VNjF2yBzMJa|mu
zh1lW!wbq+$5Q3(T8b=V@dn|ibHVtk$Zk6-UVN+PJ5WOXew^+is%RqTh%v(vqj(@o0
z49Q(~hg)Zy)4YcrMty8)>G9Qen*epcJE^^%Y|8GghUu~}Kx|B2j3Vq__;gMMs&6ts
z(pe+DvR!Lr@|JHX!zdWDeJDv*!?PLfGg!OJ%dv4+w^>2|nZC^Cizgm_Bv3Na#h1@w
z4R@S-5oWwkP~#|szB6V+*QdhtRo1@DVXdH9t)EYF<Ft=3<)u)E+7}g77E6ZLZAWp+
zd)rw+b>{^1#My{>)+>XK@I6DaLt>OML^i(2Vvd<XW^uk!H}-y}dZn?Ubz^8T&pzwe
zq56KEFv&tvEv<7DNgTBniXp~$9A{)>l=%~5JMjL%C{KAKwqIw}Qtcvbbr&l){`s4d
z`rwRMtdn%K!@HIlgHloPt0N`k2mOzLJuRD?@{t*~$c^wYf4O?Ta-{dh^$2FqvCSP=
zIUzRg11@(Jyz6k)&Bk_4-;Vu`T48Gj)vB1HPx0{Rwrev9b~)~+hGrg+fDbWJ;qnW)
zbaWQ1LPPmF*L?NcP97Bhl>%snvFJ?W*kA0ku7vxScHY!^Wip_}6#0Lcdh38D+cs`k
z0b#eq;6??h5e6vTJwiZ~P*4$x(IMSEnxP;lBAp_o(p{rP8Uz{Lj2I(FE#SL&p7(jb
z_aFWSj`KRt<M`E4|MAMR<QHJ_S`8&c2yG~YOwOl@uInvm+mj;8{=H0?GVLx|JID0R
zX>aiy`)En#C4^$R!o66QM{<ew(sswW=GlB2SE3aecFZ9~J<+YN7#28NR#aa{E6#4H
zZd3I{D2l}2@SP8GQYm}adh75F02yNJfcFwH3AJmhjv?%!z8+WTla#dl8eS&`2pNCW
zaBg`i3Fyw%TabA?S9Yc0#_*;6p0&?l=ufG=-?YA2Lv8DpU8vc_0$Q3`ScluAO?&nV
z?1w>GKl`&NrjPX_Ld-dnpMpR}vj_F@2h_5`=cI&TCM-*LuaA_K<oX93H3AMWB=!0(
zw@|rly8(oz*Gnjgc}cX!bfjDRh)#HOecSMtDF?#Yk<C=(pHiWq;WXsY4RfDEaE3|-
zJ>L8e-YnIA>}^^9!3UJp{%>|XlsDQ={!&x^AEfzf__aI&hqOVWhXyseUw^S5pTi)j
zd8Fzb<FLbJ*J)o@)P6+F5YRWUv-aHiu!`9lloqF`KFl}^Lp=8n(e?@;+D@aV=cADV
zV}dTzxwh=LK{inZ8=YA48H*d7tJB=tv?SN}!s^%kmARhKKd_jW&}MjMq4GOgg)RP7
z)U_-hnAfR9#bG^4@9B!Et<k)JT-Q05TjJ%!|6xYs#RXo&N@$<dri1Nl^KeWT+Fl4=
zp*=hPQPzio!3`%^XpPc!xRO8j7wYt~7wY!sf4;vqIh#>Zv3<@H&+X}WJ{@8ice4oN
zaI+#4_AjbXAifwoQ(9({bDnwh^2`T%n&dexr9nJ9Y>QC31{|hOn$LN)`T-At&o)p2
zFILfXKmCHYcWLpi3Q(UOK&)P$1<2vL!{&eUeC`x7tI8#j%3PF)aXX9a)6^eLUNdD7
zm4xVz8#HGYJz?W@&GZ0`b?Du{Yp~ENxz|aMV8b^6Y7}q@hTKCcwFftFtzl(ImoB}5
zaa0j-<Nx<^Iq$a>b6?mDn++=96_cPzer$&P*!ARlc$a?}UEU`BS1!*nR+kjfSbV36
zkH~kyg^ot1gHXy4JF9yU-XL6gOBR{qOb$!`8y9>>am!pAlWK(TYo8hOtWB$C&e7^_
zHbVP7&<%!Tb2*BLjXF!&U&oD*Ln<<r>dwk}W#6&ws;Q(mwGj%(*PHK2CI1}3td(^)
z$Gu@WVVOBVPS=xquso%~mxGV$w(8U6V{-r}^mz+WKb0x{S=K)z<^220nsq~E^&O|f
z$RICE)xOBEI2!qua?Uejow&iFz7AZqPIn-D(c_&$*ofr$G=0x3&odP2frVV_ol;#R
z4*i)rli@cxr4@gZE@q@!S#BRU{9}GyrL}8aDk_dWK*_n4)aea$;f+S3V?i$;=*zTZ
zLDbJS*#iM%jLt65yN}NWm@J~z7dML=Lor7UoZ$Ws>YeFQgKRuRvfS*b5bwn7)*#Nk
zL%`rM0xo1sZ!mp2A`kYVT0rgrhJ?`G>VxE^<?i_2cb|PQq73Yg{{w8Mq15_@e%liX
z87BOXhvtep;Zu(j$8AQAm;4X!(k{KJn%qYgrFt2|z!lM~L5RJdA+AH>8f{Swo}g3X
zj|PrbIL}3u{+U|qaD};fV(4xtXh)a~jGZ%9m5;@XdEjVeWrq*?nxPVHVqP1JtiBo>
z?AZo3C!veasGFa;T6(E!fJ<GR`zp@-1a=vHwVeeGb5Z@U7{AOK@txhaq{|iv>I`us
zg<nEt7Ld#F6aa>r^a;Wi*_N~CmeVdLHSWZOh;vGIu}8Jc*1-kx1@{`Dpz(|0;Qfr|
zCgeQR@fpVG^lHtA`@e%uNM8pA*y<c^<ToBQzId-KgiGJo!Tuc!c<B8N)(hC3^aD*c
z)=LB4WnZJ6lbsI$eWvsVZN71W?sXcIBb7#&q<SKNzipwg049av6YtP1jA#>LBc5S=
zQ)H|}4gEH9VUoiE>Zo@ci<Q~mE`9LFOh~3V_)kvgN!E8_O^@wrNAI7(5%7rFh@YC~
zQFyJ7m(1ZB|H79K#c^4R4z0@@yfU=g16-RmXEZH3*_4j+ya1|Cr(hoEfce5IRo0E@
zZF-e$Z^q)6chO90bRb?WT`OeBGSq;w@GjSDv8Sz!o1t#am|xTvK;DngiPMPBSR``B
zjb$dyHY2&7x(L2=bhO_{mKc^hwxJ0<K;FuABG#k;IGi)|70}zCuN;3GZ!LaW^wR!V
zq)5I}vRr#$La`)%2Eky&Sa)%?hqZTHZ9?5~CHW4H$%t|C9vs8f%`SI`E${mHf<&;m
zB2H}VP82*<OXsb`{GgO3vg$$ic2rbAcV#O50oP*x`knNCJP%<{2odFCWJ<ptCaaY`
zsLkG8XAi${c69!Yj!JcL^wpmm8vM3aFb~rR;GYrwnFm2Jp`9Uc-pS*8Y~ksi>^+gA
zA?zNp>~i1_be^Ao_$+0QNnnIpaQ79gA9)I=#%Es=286<l;omKV?+@yN9bYNp&9vi$
z=<1Ek(u8(3otC#m(J^mEo8)%Yr{uJmJ~MUr<+4|bL9cY#x7frz_wavEYLC!t#hKfj
zx7A#!$W)mTc=3J;VOWQ)6}A|5TtCX*CsR5sm&u&3L1C#qBHk6LEKGCc(QDtXRW;lz
zfb7JzN=N!S0Cq>0vL{WULT>E#X+j~=*s2#e!R6MnB}ktmFiGVE>6*+cTJ(J>e!2xo
z04Axs*v`kL?6bPd$IpkkzLhCjK;MY=9Y$jwYW{&=4q^$%5`7h_c1`u(b%GrGyOmpr
zV1dJ`@DUdkb{Tgmm&uQuV@^*S4r}Aks*A}&=%W{=v-RX<tnibK4e3L|%LOrX3nLBD
z^?NC$XZrl?HNT?|jg;^sf0JYpA51&vGZe$Alkd#&HMn<TP8<sj0xr3M`G?>re4CD7
zoN1(c{n&Z-G}O51S9-|5?4HCyoy&jna+j8lB1!2b{H(`zE&SwFen`_H88QWYss}CS
zNSKZ|9+iZ}z1m;v56s=FmDPK@NXcS4iINx;MgjljXlZs-V01^cY*VZfU2b}lI)qGy
zcBR%iD!5)fXCl3p`9rdrneB8^nxPO*OaU{f>R6A^hcis)3QXIV+a-D3$@aT3J+Hn2
z$%Sl<a$M!=GiIKAJ^c|ahp9AlIYr%3T{PgoamNz*Q*wam(Mk7CPAm}voZ@;oC2qkW
zeO&RydYPINBQJmx6hNXMB5uswam0>ElH=e1)ULQFR}OXj87<pt)#i2wH=S3QIp<yR
z=5w*}JT-9|aC`WnG}5gKb)i<tyibZrwX6Gt+LUBtuEN57i$s{DC47p7nV~QjLM9RW
zQ}xs)9Lk;w@QO^!XYE<EXA>9<yuR>AkfORhd*+r$CX06$OicKXkRZ;(gK6tpm;!X0
z`X@r;<nCEJ7`}i*)jB-#r1xvhzvS_buQ*Ta&&S+7BgPN*4I_hJ=|F_iba{GV^Om{r
zCMU;CQ;*`&B%Kxk3<_m+wpHn9GV`&dCdqR!IcV0J=_;f{=^F(IS7~##aG2Xll{b3K
zTLh3?$|`hEh)eWOlB>v5xSI;EXU@ljfb*!>^?xd7GLiY`a^?u!I{L+d4)q^l2(?j>
zOO$xe4j2MMA!zA*pTjvX`%;N&cC)A}l#FE10TH|qb6*#wc)O(c4z<%tSvM{}|G8H!
zzx$7Rk_ImCt_UFx02VLkw|<l<+C{?y8P?wXi<)P~@A4G{5jkQbNQg554H<L@E}(iD
zuQiRzSrI;xzIMJeW(z<iJ|~%0RaH#+uZ*f4ZX|7r-$j{>>iaXCHs}Xj<*5eb+OC7w
z;DX!KlyH`<%?qo>+vnyg8--UF@q<@JBH{3W0k^m1oBg}2xv)-F?IIv#h6s2YnNFuP
zRW(@;BG(1?SSKbX3XGeM697at{5Rb5GNW}`LF2p3+2ox5!9z&u=abpzU;exp`q9R;
zrwkkUyk-7Ofo775>|N4X*tf#gTi5-kC(!<?TT&C&N7$QwCr1vIHM1XwD4pjp*Ze@E
z^!4Ve>}VF6Tg{?c%&CPaka&~&vuFmt#!N%hXy7s7<&0w?S5C1@I+lS`C3h`IOEmy^
z5IR4Ax@sr0rN|g&0;G07zS59%9?y5n7c4m{fcJLRrVkd(&)pnm45chc@XMmrH|0B8
z7)@S+xwCIfhn(|tV|grC1vEDv4ld}&ryMAU8e2JaNy0T%f-59Ko$%9lunr;OgzzG7
zaCYlP{9>9_y=G@5HKZJPCrA9BR>~X(%*Gv1pAT^IcLqEnuQS^~=m)T-E1)*Vijhl*
zfOB4`+!lakmCTHQ^R8+K_|Pgr_o{*jJfv-2XvX`U4n^AYu+VJUW7iA?#ty|p4qdrM
zp`jmT@NOV~Ncl3hk)eAL2n?;=MUJWY4GY{XegR<0?Kwgm0K*YGsWB7i*XT}a)cVq2
z`4;KQMeMh(Vo7QcdUV=>5`iVs)7e}%Ozlg)W6=Aw@O?$_e6kYlY9({S`k>gKBi&6e
zDU_?vzE|P3JOOmbJr=e!?6atI&u=-Ymh9%-JIerVHsiJJ4I;idDgd~6J>i}Pb0qWf
z<c}}!`O8Y|Kf71hAvtRYm|EhZ;HQ-azt{7fky=^u?vG|7y5YBJ5>IMC|L2I;(!m~K
z(9}DB`rZ#E8yFl>3n8r*UYky=v{Y1E!KF~&j2~!@%HMbNK(A_MLQISkr^gd(I~SNJ
z2)mYjJPu^ofGr;UDA37x0n&&^Se}CHE}^vo!CFb@3kH(C<}5(_gqjmXDHR}vWiMxu
zod372Oe-d=xCA{tU!fM74D0n=uh2i3BxNV!UzwKHJ*nJ0k0;}cJ$@+BdOzR&IiOBU
z{r<REPZTr1%Q-vC1N$9NNSbyaU7cTdzQd^=*T~?*(l_-{r?aC#N_IFh3Up|kZe}#c
zQJBRuAchhAJ?8Ly`pWP^wvps#4uqjkPk4jQKJ+G!-(qO1JzPrT!bs&2A?AO)XkIFx
z?ON8DTElLjntOh<&_^`2#@D%0ge&Xe+UE411URjVS=4B7$QAm$<-PoWl>bFq<szNX
zWBU63jr@ybyGE&TR{ef_FPa&`L(ZE=hMG2`SouI8oUx3~U^ic<|MyZ_qHCjYhbk>i
zO$qHSCkU@|x=?6IzBVnz{u($Gb%1Y-N=nJGj=PD%kzX{VuE)Dago4Q82L|m8Zv-mG
z-r(JJU;yY1>g4pJx-!!*Oh|%B1AeFJAkP3M0Sim7_4cgyUc%w3XWf^=qC#U15A_6-
zkNlQn{J|bHK|Nr%VIdrw(o^ofF90a_VOm2AS`iF3+tNGP{jX@EV(nASWq(zVj8z`i
zu*KUd4M+m)RW{I~7PzFl%0;YQET06!O8;VNttwv(0x1T>ig?Ytdmf|PtJe7@Nwl<N
z{q8L$34=O<=MovN4Zn=^5`82#9_oJaD$UJq$rDn65(y!4cy3&0#EFk^L}x2I%lwok
zxe_aG(@sGjjXnn<p8?lJss$NzF4$E@ctJoT&NxYK7vPhD%Pr!6b?R(bd#udS{48lD
zrrtSQK<i(8#~jaP#!shkP>Rsw9~ewQ(GM7Izk4v!N>E&N5+UoFhj!I4b_{3mcCO}3
zDKo0{^)warJ5*V1VGUWt2ZK(oNKuZm`kLXgTOJCB$!qyvFC)GGFb1#C9_r`18P+;r
zqp|hFYtz|TQ*eg$@p4K<tkBofgJ>C@b+wRpN`DlfY*IRB-IDijF|{cCYPPXs<$E@d
z;;#PCYSSkrDYe!QBcwI&W$pqg2ni*VK_ChS%AtXQ-z1*}ft!aul2ExHJ&E{JuB8UG
z$~6Eu1>PJt5^MN*9R984bk8w)fncSi1#9b_+}2EASSU<1r!{37iS585kG6gsv<9B_
z1Eyl)awoW7!FC8pY}(l^;<;_;l5AZSE?myNTiaJz`vrCI&L%4udER{E`fZ^-0GB%?
zR6l#Laq(!c?9BEa`=;%Ar$YhQSZ*S&?@$9XwKoJAwIx@K2ohYS&Z@-3?}*LR!;L>O
z)8L6RWQtV_W+UrKK1TOGf9~NRP@!w~&Ij*&4Zn;@_>aCI{O=RP2Fy39#<ClVnNovH
zAC#REwtOms5AVJ$F-18nanXdw6?cMUN&nj3OSDX6Lo;h5u4rSljmB4=LbmA7|30`W
zpkFXRye|_=!JjUKIJJ4*<#n}X6wX|NNCM&L$s}Xg`NUhs!rRl+Erk!0&QvB%oHPrU
zKHO@L?o56AvQut1CR?hHJoJv;WFg@sOzY>0*<L;aW>N^PK@Y#}-?dY(YSAi9*++O2
zQQJNZHia9Wx3-=H=)qto*G~+s-#+1Jynr{jqGRLe9x-pkNW)>qCgYgI1BtETm<^4m
zU-5O6B=)1faphoOy7+Z;&5hK3bnWtCimeyQ+{|XUltQL91|JTH9c22<S6utN6bb+$
zV!J_QCNbt&0kaMhBCUxmnaw|4zO26WcrsD@9{6lBiiCy7Oux*c!QY<oKdvwy-vjrj
z%sz%*3?X-XYW)VqU8~wNfbniuScNq-oBC*bHGT*1&W#L51|3%ehPqk%mfBXc3wS-y
zxS9M~NLPq!@w^}fs}>YqCPB*b{*M~GvAgPM${`O+qAq-5f!v?`1474WJkZ2a9#zcw
z3&Xz6!%s|Hz8OpYxO%&ki^};nDkD8Tb#(MCX!4+dolMF{`+neaRr;d-g4v+`KavGB
zF#7$wv`438??!{p&MLF!W1fflH<tSG)op`XOAUV=oj+Sh5U`)0{FtFYeG5hu?xROl
zbspu=U*X<<$aLMGRt13jB7<~OEj1p$^G_^GUjcJ4#_g{z4>tL2{kVCUmh1FQ$xk=N
ztikX+J{YHQa8wpNm&7K|%-~C$6!E*@iT~X&R@g!>ywjk(akPh0S?fzu8^=}e0`$)R
zUn;t$_<A@e@Il@}B{Kvx3wI&ggx+Ds5_`sRPK?fVdS(3=d-+1Wm1k$E6zgd8>TckM
z>ipfS%ces2H@&DUNsivv^%frVv5>V-B>r^1X+`dHmqmlMEOlYfpc%009#g79<)*={
zV<KmHq=AUGLGTtCYAq&>p7PkZ2^hf8c@%JGBxER)AR?yVh>G6Gxl~?VV%X2i0wsCY
zZyg-8MZV9FcHX5(Kkipc^A4ZB^LJP2LaG#oDyM-J%`(aG^9UQgF*zy&X8vG?#wC1k
zh_x|<+MK1YoE_;|;^erI{5B%Aeu<JyDuW}Ip3Z4hXnJ|vI52b|$z88RTN4!s9kQ-O
z$9SUxM=`&r|D@djyaN7i^hk<j-~FHEu|Z#hCzW2?C)K!AxybuedlX)6(*JUGW=rCn
z4EN=|UyDWSGs?bzH<JfUFh8|(tjl?UU_5z52onIkQVGG54sC}{%M2E(z%3k&l8X}W
ze}3>MS>QE^dM?280(K*e0Bqc!?auwD%XrZ!xNdtiLB+(;2~f{U==)gv8J-JWY(l5P
z_9dL)C&W2$(BoTgCCWHAk1GFN@K6>hGjIC*dCgl*1B9IV*&CIL8ceK2k*W@o(I@q1
zEr!CaJx@&+5e|(Oe?GxO<%r9fk*MBl9)i_ld^ff?d7-gr>MxdmtbszzSF0vFWy`X+
zwGF4HHgKNIom6>$pu2XcIFNGkwsbTx)(>jQ&gJWDZ_;=Q=|tOhHM`}1mdZ957}iAv
z&TlSPJsl;czuhW#R7{y1va9<aM*dsOptp~--se6X$;Y00%f3q}@>9Yz%`|-6Umt!n
zEkWBY+yO>)bu#fs*qr`rgPhuA58Af|x3l+&WT&S6p$Gf?Qfot)(VIialv)&XQdeTw
zw&*FiIlH3erSNoI+!OoOgIK1X5lI-X5*_YpnU{+dFBfw<I(6cF=PekBLq78FL3|37
zro&Rrq{@UTM#S5Z`@gT;q$d2Tl>0077*?EudU)J#og+WH9DEMDFt~JOVyAsmGt)m)
zuMgRSd=EB8=htuTzp?8LK^2urj?V)Uiu;??=Q*v}0N5C`0oe~yn|M@x@-zxdO$I2#
zZo9z%u@6rKAO#fjTJR%zok<1b3v11YiGV&Q822guw{gwer$6ga{US%?Z)Ov5EV2dK
zP3dE!p--`VE#8aigMdY`_@rv4(ddsMKJ7gIHZ5^GV&Q_fb5MQ{3ajSh@cl3lu`n!H
zuUfReFe)W|AXLslM=Xz3B*M23l<9~Wj<uY-!57eK2l3|vM*OWvK0sOc(;VG!6tz0R
zl3>3qmf-Z2>0xcZubIh36%^vUbFXQhFEyK>r`*}LGwQc#QQdNuVWrsSn>kwhKe-cK
z6JaQE^`PPXz~D_H*quBT>`IP(l>QCRP}kzN5><Rqx`t31P_U-3Jmr2>@`C6tB}#vl
zm8`8{)1Gz#cf;&3pT3aXQt0)sV-Tf?1$`d*D73bxGLHE0Mzv6)IlIraS~iG4KRK9=
zbi=lPKjSp6o8dI>lH8d2p!En8y+_{@qQv0DaJ<Cq)+jW=Apcbt_hJ1#eDChDon2RU
zUfGKQOeB`)QhJTrKD;<xxH$24YBp)~)P!$W^sG*HghiD<z`Z$jcwY9r3$w!d6_BWH
zEwJED#tc};;2UsGV5EA7>5rZRU+NpXvDC}>>i;x+_~{fp^z689%_lo}{#Gn_w*3T<
z@zrMtLwBDKJACYF&C7}Qwm(6%t9+o-Vf-~0@emLlv0@Wui65GM8nIrRKZ#$g4h>1!
z5B%Q>-8+<H<RQ5}E(RQQz1QF@6w}tN^OTW|Cc@jD;*_DV5yJclc+g>$q|3)*iEepm
zJoWKUlS2YbEZaoV6?n|EhEt*LHgQd_G0}=;GD#8}Vta>+XkRcnddK$BLjQ3~i8Z<<
z-fu7H14pC&-A`wRpDcSGJyEN2p$yIYA1}zRo*$^bBWIL!GAnv_>PZ(B|Diem7jbOu
zjLo?0{c#)M!XcMR<fiYW(hV%`)P~aIxvW&}5N5Qn@hiD3(+$G98mLrRW7eHHRw+-p
z*k#p`D5Uex(zhcB<g+N!mD)~tg+S>{ojtkAHZ_Tdrt5=RGXt0D<JY1=^_U~B$<7K+
zqDRk%^)MN{GAn;x@b*k3g4r+n%D+BGw5=R*Nx%MfCsNHc<p=La3jN;ddvC%TTsQCk
zBM@|T@e_}!@<t6nZkh5J-FiMYhAdS4f66Ggtr&%+0&Y-?c&4aHCcDj$a|;L6nWGx9
zqlJJ&?K7!i#UIeTmBr|@mV>8|(-8i~v49Un>tA>`C8$|(EFB>v`;yw;NepF@Rk6vZ
z{E|vjDOX06+N4_MFNvu)gwm)$e;LXFrwznHVVatSC?fi#G|C+$HgMx6(flABIxsr)
zcM8ZpAQ{6z%D?XHd2sk|BwN3IfDgVm+MYDE^~CKjHQ~C{R@$cZz`9Drf`SK;hVxVv
z6iqXGF25MG34dx<x@#_Wwrl<~{Uq48bT=R(xRiZsPf-ekS84^#tlQiPAl6&0qbv01
z!1<T)zPyy-P+6@w?e$esox$38MFL3i_gzgwB#(AerrqZk(5_n0Ij|`wgeH_(eUs~s
zs&7GI_w>a~>?j0t)OaICjdo=3eJ(qhJkvxjICeZ;0#-d*7Cst0NqLn`MQz+t@ueUJ
z$$-DP&8vh9SJKrWOpU!`VCH7)s<4FZyb0Tp4c{Q2$pKPCF-3JfC0D=ljfMQd?WzlM
zWJ*E4w&R%8o~~?J^o3(9Z~?)4R8;t19C@-nI~m0B9>sfNpcg7-k)^c0LEH|iWtl>N
z21RpH!)lP86rSX435kP=I>P8D3(5wg34c=0Z*x_f45yU@tZltK3zk2OttNvmP>r?#
zwjo`G)0}}94|<2GfnKC%9Bk({mJ`$tRMJT+UA-_Tbn(dh<(APk!}$ljX@<HxA}teJ
z;QXqyf+)80h=mi7{J^f^$&*E&97td430VPhQOLq%Ce^wQVA9q~bqzRXdz4Sw$S+tA
z&(6@ac^iul1zk2?ARF(=9CE1ECqG=fW6&(a1xT`o$Cqr|N-zO`;Er9x5y;^EwgAZ+
z2b%<TWtdr<{(Q~QfXS5&7u)+xps%_?DJ1J#Om~CvKdpX9ko>+`{U7coPcvBzlDH#d
zl7MhdO(>okv6rz&8h^7Vmm5WSZ|OS|cA9q6x=W*+@k@WSm^16%lsS}3nv`r6SEtZ7
zb47O|Ud3*8Qt3xHgVyN3FR2!>+EcOoO2rO@WktmZ#h1=Ux^7S_vmZ0k+#Hp177rbT
zZ$QlNbjU}<7m?#4dn5fOA!LgDJp5fAS8?$q^HyHf0G!`W*FPl&wN8NV^gkbE8KCxW
zJvF%KE7eZ2D82(dKV3O!oAbdqKoW6{4F{KM-*Y`+Y%XgR+1M>8^Nt8IOaDC`?L<td
zKVQa&T{=e8XOo9;!ymt$w@nC@0Bt1>7y{z_f+3G|O_BWZbKXX~1kt_OKZ++gT_-Bf
zispC;WKN&lvjNeoq^#Q1<byuB5_@*``ib6|=_T<bBu^&#rqF8J?n?@|B<?Vr&coQ(
zPAN2J=o*{k#+bDjsy8wyR5n=7DDIpn%IaBvR$m`@R@OT^%h#k%^|HWHafQF6f1I^H
zVZqLRBw1*k)FuS$!^n~$joTAbHN#soP7ZUR<fqStp<ukeS@13KZyvi&&1F~ulo_{a
z)lc#=uvwp;zZP06baJ?~OATeJ?>{ac)01{rsa$)1vRVZGKN>kZJ%JY<2a9L#XWf1U
zDn0NkO#AiZ(0V1O`gRSS76}-8DJ?_vZA9LH+gWS79zz31)Xbtpl|l+2ii4}p384;z
z(56kwY3^^GzvU_5ygiDp(}F5$v|9rRn`+f72z?g@`sMKIIFR_vvbbsBNcVwO%~lZS
zuR{?=3+Of^$Se_pw#(znTk23|dc1tPZ^-VExXG&u<WAv_n^iWTfaUN%<-J&|YZYiE
z;l{k34l<y@rf~dyxQgx(-4p-0OQ~2<OY5ysK)!tBLW6JG{PDpa>iJ(e>3?2LnhK#k
zVD!`4qE-PqpsJ%&zw?r^K(+Jb4^o~aPH7i-0rYSqPil9THSxf&TlBK;5x0X0n+3zM
zIV)jG>1LQgy3(7kzci-891N_ye(=t_bo#RH!c}XhpTd7LFyW6fe%alFE_Uv|ZQ?NY
z23Gmh=L`$vxky2zfJQY&RLo<xbW}OK&1I|PZmPd!cXQp~dZN3|^)F$o5B>NO2b;GC
z8E{KF6N0Ch+xu_tkz|BF6K=PW@51;AQ1V=1T_J<zzBf`R(t#4-DH>=z4$p=#U0Gi5
zI3GBJhNq-k@?|yg=C9b^G2-{rO$uz#g&!E4r<%n5bwqNNA@T2lz-L*uq0A=E?qxuP
zB7Qz3bZb=~JlGeola<61gI4`?CM@+4-rHW-L=nEl<piCXGWc^Gj+gXemA*R>Sfs`r
zH8LESFb-~z*1i?Wp(4lMH0pkO<-O$Vp5l6G31%ny$y`Xa8Wgl5tnv)noc7NV%at^6
zE$Ax4l`U;mEjbc&y+Aj@J;Z5*LpeLxn6eKSUYhDDryu3;&sVmdC7qiSmf}Zvt{MK7
z`n?SC6!A4_Nj9iD!*R20f&BZVdw<pbsfNN=qZd<Kj;9?=+anB3O_P#MgI+o}t=*z^
z3+QTg2GmpDN(4@UcMdLguNkbVj!TyD?-uKTmWla$HkR3QHrF!N0g?TKFqHXO12C@|
z00^8l0ea=Q^XHkuUKLoL6Ka2ts2lz`Jj<YC;lqCpKq6X#t-N)d5T*oc#~$213=pj2
zqdxPJF)>bv$#-zCt>@M_WYrV#iqz70GiR*JJo&R#Ak%y9!X)M&b5-HVv5yuQbf*6t
zt(^Gpwq_5p4OcG~3d;Ml3X`}WtyLm*WA(Cb%W%FgDn1_yNQk!GzV8F@6ZYcqbiIrR
zkNs+Y>RXpCYggSY9(t88@6=cDP_H!AUrXPbPy0)Av4x^T;-GD&p7KF=GqeGi*%h=h
zC}W3Nk3G%U`Itd8M+<iXDZ1m0%tWEq_$#x#kM1{d{6vY?InONG7Ssf4OD_2TC1%vz
zfn_cYDv|vUT=;w?kFWzZ_;erO9cEmL)4)!Chvxrem-F+3Pyggo*02_2D^u3ot}bK1
zRDwna*rWzrOZFn?=fufjlUH)tbn*n!h5o))&xz`7tp!*|7)y03I17M%LTH!Rrnxu9
zgg$v2`Cyf*skRxN+{EZ3{6HJ@8@Z+$gxjxwDbfp@i$hc%{ahiJz*9IP*272WvHR9H
z@;>j$<_Oe=KPbJ;{2wXNi%cvq9>iNV`R-(xGGc2`FjS?cei>?wbr>q-Gw+>_+*xcD
z6e%kPI%m5DeWp=e_u-i`M3<!7u)yYP#@cst2A9G+o}H(*`1Ow}LCY*@@jwG-3^=q+
zTfMbk4GeFawzCI7y>?p;HW3^NV${^=5p{p{O|16?S7Ti@fu=fUhD`e)HSR?vt>N#o
z-gh09#v{V6hH+~c(R7Et4>i_Pj->%hWN=uMt4^>BRTz+l71|3v?ZC7<ZLg{z7Z2*?
ze0G8oh7spIRM>~JnLcPL1FJ1{llN#3GyBFnYYml@LHdXFvoV=A1>ze&#vjOj`Cees
zO-*{g_Lb@Q%Yh#6El(BTJ(z7?sR@2iPL1CSkAkQS%ns+LF4^{xTpJg4`pY>2Q?i@7
zG|Y^G%;tzRHLpth(M>Lo3*_&fJvl-n!6N@h>o_Vcr?CEEC*X9)UdHchn2U7nJ@sj@
zTKIJSCY6)-_iSmNaVKX-J0Q1_c8LWYvPXm2p}Y<1cfBs4A|A-wE!u~>aOQ~>`%LLp
zz`=tnj|FFLfR`Xh@#U2gYFxR()|_O@3ME8Z;`L0*IXy3@7SNrs`mljcU1{vXks|)|
zS97yCVlBniLL@;C+y+1*6>&~;9vxoSN2UCC{Jgf2+-iSw<LjLqWwu;eT$P8?H)j?|
zrJe(Bx@o0;PB7tNY^{;Y7ry3N^^S+PNpECB8@m)^yM(0Nyxd4V)w;6?21C{gZFeE{
zKX6Jm)szMQ-`E#+=|~Q4e1>H3`mRLq$w`KM@rCLV_RCF@-z=>Uhr$CWX)zoSi*tOe
z(uniDVB{J+WLxr!%%6<09YJyqitI*>z&%pq605XRd;hNCrxT24>^63v${P!>cR+#s
zyc~p|@06D8GT7RrY-mAp96SNQ?5;~4D`;zNH}$L7M#m}vv4vobw=RND`Le(Kan$S+
zvrc`JazNJFkRA$%fVj5D7~p>^Z#-0EpF_x~<G41)J_2&6O*=)o#$gkJkS|L)^!B{&
z^F{OEQcvgmho2Wo{FBzLAN)2N)AQsl!&Oidks&s5M?<fkN-hMMNG6v($(6m|V6l5w
z`f{Due<ZG>^?Yk$2b1~V16L1XX9L!&&lZz3-mbk|?jOt%dSyS|mgC)t`+P?ms(GY2
zqcQGO^X7cBE@XsGnX^O6rxVOBVT&B#O6fd4oqRVM3?e*>&!G)Xi*oJ$dPehiyba8i
zN$F-2r^t}S*`~S|wMdOQJy?(XXNfIo<W~O%AQ;L6S9go#D&X5direHcQIS^^N74X|
zO5y=^i7I}yWbfZe%?qXo1KkSuH@zL8rOZ3}Ct5aIP=XA*Zfmou0Qm3?N@2+un>6uL
z!s|CCUz&F7JA)MxOvMa1^f8ARDjefo$?BP<gOq7Ix1ie<J**=r?rzB-N_L;3Vex>r
zieWA7-b_I4wo*5D&nH{HI4^Wx&ZhG`?)Ei!4*-~ch6QemoOP}B%XZMC_B)4to_#kk
zA948dRq-#J8`x&%EVhx7q0|&=)P-wko=U8Hg^>yeF-=T0Q2HHHQn27sWz<h{wbt!e
z(}-jKCJ-BT^X_lFH{QA*Z}WJ6IA(O76*(!X%eR}i?cV&FxQ?}NIlOu41(wp^*V!!b
z&m+zjH+SoevMwRY!MWLIY|9;r#Nj8j#DU$6#m%Oa=3&S@`9RaGkr)&zn1dnsb=){j
zb?t|zx82y*NLB?4^zaD(>-yt)@=+ZuAx@_D&%jrX74xpkph2}FwkFQq1`?Uej<V&_
zV*y(v@#R{_6d#R2c|LgxSc^R0o;-XS+l8#1(RknZ;mC`&m!4?e#iq!{y`d@{MNf}v
zH~N8+@a$w#3*6|^7I&4sPbQtAhar=m9U*UrBwgj%8sUK9MG|zO_0w2}ZF8vf^=;Op
zka94joI&&}WOj`ShIiwiq+FYX)GFs76y5miXUe0lgnDJj{tQaN3O}GX&!y(uC+cv>
zd?nygod9<7O)bCH)4VM$!_`}%wXZwef)68^R#PvC+16gx-9J(|p%-INfx0X?TU7J<
zPYYAsOS_BzTp$DBc{IMDG(A*Q#ev_j)PeOm3>in~g>KghfgHzBHcThig0OnW@3I?M
z9ULkx;2O<tq{ed{-<4mqekkI(eac$F+0@Up?4$Q*Q?Q~})<Z-9fm%C6+umpme>TB*
z(2ZqE$ch)+I=mho2vxgXEyO&MJW((QnXDzRi{3Z{g0T+<;^#$Y(qu`)@>)%<yrUi-
z@kM^+AB?1C3gL6V9#=^7#4F<Kuq;@FRufMRs#No=FKxB(K(DQaXw&EKUzXKqR8zP!
z%@8CVWN<%VOHfhQt{gGu#gK4BY}}u(OTmY^1bEQ3%;%@SnP<BQztNx4WZhb7KSAy^
zH+!^J`l*=E0WXIU9UJ)e4%aSXe^G=t6yKKd+hs)iaUj!!FY(Xo${Plz(tIaud?KfL
z<kZ>Sg6t1_2|GoaPRNa$(&$c|x8c(TScm)gsds>!m3&KbkjW*&$}Z9soXZv_5#h_s
zfXJa+`%#c-3L9=lgZ|8d0CRf;Fy^tpw4L6Fme-H>iv4LM2|HzYupBO-@(7nVv(#BK
zMK^Mb&`}He$90PXuSOq@oh?!gEb>Bq9J@GlM%%?(wj<2jkVmTJh8jl34)i20&?C86
z?ylN;MpsBx<`7P|KPl4Q{zWEV%q;iFgn+r{szK{M?b#U6&$BQ;m*LuJ3N$>A0S8Z(
z`Vnm*hiNiFHEv#)=i$YL8Nhu<Z9w+s-+!@y#3kLqN!URFEL3&*_?lZ#+CfSh65xTl
zajCvUGYpSW_mTa&+tE;4kZO_k!mV(;@Dbnf@@1*Zp(X5|S~@v%Z2~Gm$f`HV3m5G_
zY~&7wy;8)Vfo7k-2$yG7Jngr6|J}nnMfW>3lOn(Ekh~enuFTrVNc1pMDu2ZBY3`ek
zdL`c&!=M=*F>Six(138d*zGaQ@y&ztXvK`mhxOM%iZ3Xr{#|i$xE};-;vCZQXYVf>
z8P)<Wm?N`OCJ9uplPy|s4;UnU2%X6ACD`72-yn&qZ%>TX*XsIR8F&ws)!UsE#gfji
zK3dWYqg?AM(68Zd3lct`EWG;CdQxF|P;nyg#3^^26@PW2)7p))MtcUOgZu0a)ud!s
zWz$D6_=66vgx*eKC^wHYqf<_mdwI)=VU<g~Q<O8KQ&#8QsO%WW4ELleYwSpR$=_<-
zSpF_vB?as-zhxqjW3tJse5ia2;>mDs?wLhRq*t_9kbcE)X%x88UBMmO^;jt)48-ro
z&kI(8?%d;^gv5cZPtDlQ`cnJ>8ZJp7=Eh4Vl;X3UB;l|7;qeOhE;e>a3Vz!yyAm*^
z=1=h}&R$L>yIy;!D8E6@WiM9ESw-*fv+sYh=L!+GX+`VR{40VY<9M)&V{8}YL=Gxk
z=G);J1*C~DT+`FV{^Yiu)b{hC#2EPRMP0N{fsy)TZYbiR598Ks4M~L&zMxZ?dK(@!
zFLYL;ok~K-e;axGG}mT#GKST6AixLHZQru<2EHQ;XTDlgc${|PkIYEc>Ln&F^3W<?
zH}BA(l;V=<unPVdRyej544MVnJ{czem6lz2k|)QhjJp%btLa0liP@ZXBY0r2GtqaA
z0X&vBMDj+DI{#feVnW_|YCwm|zg(orU`RvS;FJ@CeasGaG*vvQ05FX;ncl^n=I}-8
zVGt~iJ0JTHc&Gnt+sYmNLu$g{xTHJn%R!@8mRLDGJ9!l-+(GNFCMLv<RehsEfcGTs
zS5;^R_2R6kI+&Z-iBkM&IrAp}YY-Q)lggY)(4sVtR!T~O=8<nS$pTt`kF&NoXLg54
znKRDPI%p2e@|<piDUamonHwbNKV3cr#w)je#C<;=S8ImDHD8zvVAvg59iNMo^<FO9
zCawhUfmiP<9*d-=XV*!r^tPavQ1{R;_a|%LInlH#Zh4`g%K^-|lYd*+wBHdPPx_=R
zh;I5~oBv~tVIZB|C6ALUkeuS}0c#SpHpY_2e~ssF<Z8eOH+nw$LvoSabx#j7d{pgP
zzo?j^`_Vq7X9}At|77mTy#{ifJ)lJDntzg37Lqg(to|>4!~k!vo}0Z%K`THrP~Ya$
z3dn`o7ko3%ZUWL9pO#(zOoEgyB@3huQ=b-IhX_~#85by5aC{rz&JJ2)2dJJ64oIYE
zuFsjhXaFS=W}@z)py+mt<(4Ri_Z*B)!FbWuzv`|O#3krLcNGgEMV@@4yE;D`9{oG|
zIw?+j&CdfoZSY0q)sCD&k}kB8(`R0h=jEZfPE>;-;&ZK2nz!8Ou+Fw^6CwJ$7(~hK
zjgX=h_s{lBLkT-Eh&w~VI4dpf)w$nt6rLd=gCG8MicVYn2`O94)QefGwFiLAxZ~Hq
zsO;F)QciYgVz1C^L^=31JdX;oR4Fga0}Y-tuvoxQfK<rSBr*4{I?0@sjXFuB13Y}y
z%a>ysF{P7jyBJe6{};6p@N)Z{@F<aR_qi7ze$(E(kHIWU#!rs}zIe6aD9LE`*Foja
zk}}Lfzx3G#qwa6w_RqS1{n<U8+8o9V<Z^;OCUw<FEj!379Ep(b9Glm^`x!4cap#|5
z%SqYRJN8Y|xlVc~!gjD)#mbihwQ{vC%(!Mfw&1Ome{NgL&D|S33{z?{KdEV~;B)y|
z>c<m1u**R!ZVa>ttnzk`8D9GwfjH=5EPHl7=V=H{NlZUH`%7CKw65Ko>^{HQA~s*#
z0wVT}d0W9aw>=6^x%WCHNTSyc^%-dJ#*<7$^Y$6$&u8lf!IT4pxk!H&bm}mh;CQF|
z4{YW<YZhLIN<DRx92*d(#7{-0bOJn^+e_wX!O2d5c{H5utnr9H`XV)$EmJ(0HBu<k
z3w)lVxlB@j;@crurq~_AMo`s3I~96uV3V)+MoS3azsM1q!?pX`Gs@Vg&HI{Z_{<F+
zouX0)MLW|LH&t{&p_eBcnP}{QM>{(ZIOsi(7_X)DHjME0R$)i7aT^26mrI4sjjB-$
z!39tXXAr48R<>H19uZ{mjRWr$6iSVG7S9mE+=5_<i14UM+o2wQq^cV56ZG#W9Y$J`
z3B)Z;A<rcBgkeoz@-)V(4v1eVOA!DcJpH=G-S;A=xiC_oM`UPl;5T`)P=5df{(2}w
zCQnNFwTa2!(Riw<0Episj^_2O->AHv5Il1FYMJ3Pi{aek`iXz%52aqc@_h7g=Sj2U
zJ&ICRbPGbXSqg(Dec!!*INu?EygLDK$TRnH%ajSB?E-q`1kD)(-KK9Vb%(F{vS2B#
z-62khT|cILzbt!LjEOUX^+)sG@k{9MF|(^&W@areQ(8MGC+6<^%Zz%0!+`gnI&#gc
zIEX;od$S5C@|bnYJ4Dt{%!tbtdb2sdQX#+2_s}0c6?lvWL7R^l@Yj|us+LL|(Xt)G
z^A&H%Wh1=XIl(B5>LQ}ytgdU&1hlc`dHZPc7cPHogE5Fpx=XY}u?l>F<^r7$48Iii
zJPD@54zqIDXs_eTt#?NCZ6UlyM({b@Ls2oGC)o$zmVu2D8_%A~wj@iP<g_FLaZ6ca
z)_%BJd%l4i{;h#Dhu65BW-{N_F_OM*S~m53vbjKqZ`7<=)An6~ut@NVeU2*^^9X#V
zHW^A!2a)%3p1wA-{ObmDC<0s}$o~i!n82uI)q2G7B-r`hSDu6N624BhpvI;%i#Se7
z>kx(rX@yBb!qndt<!8|j$L0{h)lQ^%y?D^#)Y7DI&!G})EJc@WEy9u-Sjbxz8KmZ~
z_IIih$bk9EF|42m)LpHp#i(;oY2KIM<hRtc@CDL>|8CssJvgxq&5%4_fe>@#-7iS+
z$uzW#T8k-QNwJAmtiJOs?u2fhRVWojTeeh_f7fqjU_Xm}VqhTFbj%&@taEFy+N@yn
z?aS}_`mzZ(c2&2VE{bC7EhD34Z=#qeYBwFKVhCYK?>yW;-c7$HMS_kKe_>J4q$%p_
zHJdXQ&maYg-KN}xceBoI=hfaLA~K=g<aOmvbpynUvzpeAHxCd0rDOH)Kq}3dC-$zc
zl6Ef;HXJ%}c6F;;w8e@S-_J5WezGVj0tBCc9I9-gupP3qXe4>Gqs{@;)Rwa46Jn~?
zHW5Y-rw~r9tEO~mPxnh1rb{v2TSxdy5~(SJ+Z%rP^p|Fv!AgUnI57;q3SH1PoJC0O
zbYVB~Udl4`2DkDrFKr38dJcaUl?qnR#^1SQED;kq4tSt>ZgF*#74pdv>|djn_@s=&
zib1UOp3Zm|0yv6oQ_%K-VxzXpH%3LPWmO&(Y6|NRha+<<jtLQ}2q(}Mj=Uh1es8Ih
zH;k^j=eQotp&}hStr`&F$t;n-l)XuwORuhH&K}A(Z5cP}Ur#PfcEd3S{yff7^};TG
zk=hGng6pib4-cXD+37~o?{NBd)k<@Wztn1#dSy_<))T8Z+T+GV4CO-sOx3?HZTmZ_
z#Y+VX<R_7r2cgxbf8QA3+oE1CoX?-EegLv}FsL$zIn$=VVTFSwaG+9n+2lI;^I^Uh
z=&H41d`L9>Sy=h5oDh<CA@JO<p_Zx)Ot!}eBsH9k6-wNY_g;-go`}<UTEs~l*9coI
zK+Q1tH(ic-`IjoIwM)rC3})=HJpA#r%RnvMUT>j5Fn5A+8d(;kcQMhnzD_gEM6=Of
z#_pa?ZbXGAOvjskz99BFNU0)}_7;=&GxVDq88O5&aPL&rZiPejj3`#1n6&mV0h2O(
z>u-845d8`eUg<vdBV|@w{Uo2=zl-4Hc(+`b@Tr)KHTx3cMCM*Bu9MbbVEr0}rveXY
zU^RZR(|wFGxODbfa1pRwpxue6h_zFJT0L6ah`Xx4XbXmV$V!2wPzaJ?S7$XYIgg`s
zesH&OSL_ngM}j_I2J^5#v3FhtB3Hb}>=iP7biT*jH4=?zV@e49Q#)7eZI9P{TQ!F@
zAIID1zruL2N3N@HEs@^~lS}0KruqzB+JgAxf7<q5(wY4#>pcp`5PknS`)21dyIu9Y
zT$c<aMjgN8A*I4k!Na`2?tDG0jecf{yo8!zKX{wr2DjOxOpf#^DZDZS#5c#+T~Eik
zRSu!0b7}zMwt?jWlPKPUFYmusS)0FMd1>3nT?eYc8d8sUf05&D-z=`r6I8EhAq~kL
z41Ia%j-B&qp_D7KJBJTt%J3xbHa;E7lIN>=`*-5CTMzB?VX5UaUZbpW4PBEQwq3+-
zD=vX}4k3Ly`}4OlpFqC5W+Z6*aqPa%1}0L@WTl0572-GRVBQt=)UAx_5n$q>*$}Ff
zEQUVb5G(j4wyKz0DDK=UY_~xF-{OC?feihZ8^z{Wx|3lkoy}+Fg)an&LNG#scwO_t
zN&8;=HF|_05P*t+TuLE%0<&_Lyc+e+&N2RJF6Q5J+F`|L6Vxzd6CvX~|Ga{S^!{3=
zt2Pj#xm0x}3`%>!Ph5TLYI83rKD`&bNPo`t4bT{u$6Vhx9zk3g`O<Jjc=DG3Ar#m@
z6}u<qF;_NsAO5R+dqE4}$`J^`Cd9)kWz%sU<Mj!V;kAbcZQ-rF`Fev$ZeYhRZ~$cO
zRiJ<>Os?crA0Q?wTs@z$dDO>%!m^_KJK?qL<2FM0$q)XA=H5Y0+PZ~m^r7G^P#=AY
zc_M^NiozxZ<8?Z~W^AOy8x5`$l<uN(2&7kI#l@t1*JCIxzGY_>IN%?#j$0bFL}eRl
zWDD+*^XIV@ZZ0!lS>iYjUk6g&CT)+gVQSSLQZ()G5vDLe+Yq!QvnVq-{Kz=DcN_!6
zt-Slchu~|SupTb^1=X8>AM`jxD!I-De)5r`HWUk9Kb89>s!qF%_hMz1c=yR3*aY<A
z!R7*!&O*1gtm{g0UQA4-G*n1~=@t$jU1adQhc<L&qc|o&hhg}BtxNRQLJ}8_&;li@
zp6#5pmSF%69MH%X#zT@BHRAiUN(#yXPW^BqYa$Y(WeCR6Y*^)BkTR?(MRETy`S*lI
z?R(7F(8^Y3C?QHI{rALw$L)WobYYtBSN`7On=V#r>JlKs!Y_;#KDi)|lFsS=F18P8
zuFf`qQ-L*Y$4~Za9_sm%(pCr7A2Z?Y_Y9hD<Lws_V*w(6gdX7ftp|4vg-$(qNPCPi
z3$!UXlGA-!z{0ManaMLSHZmj#>`-ZSO--tEsYQu09Z#M6AKzr#WIFX5v-I$w@Rl|`
zakUnSFKRTqos|1V;VxKy@Al!YznK-D;!+HVcrrZ#W+sm6H278nw)|J0yDe{injO5T
zV(Aa`)pU}Y-g(hPDn)#g%x%=k<8nA^6tqm7lLIc=5X7^d%!wHPjE;Bu_ZTR-H-KM|
zev)gv6QTRl(ul$ZwCd@ZV>qFY&~Dk88W^RT&Z8wudp||#B+LR{1MTl|F_*`ZhF|<>
zgpoeRfctDZ_H%V(|9xZ^=Gx~(1>&dx)3Oe-9qq7$N$VI(2`5Y;F6r$eS)*G8`3zpZ
zbf3u96a-YTBp`!;Zlkr#JB4%)$8IR5wLt&3RRHrtb7KWjhJ0|Dl!m&+7N~KHY3<m^
z`4e@aE=JpAw)p6|#RZ&e&EZ-*>Y<)7Oy@s&nu;H~R;ctLQ{2V6a=LFBXI!4y5s=54
zBqQ?dY%zdR%@HbLHxIsi+>@mwSBq_1`2}L}Rqf=d!4Ds5tmY!l3(ihPZ;h1pi4WR-
zD7#Q{zjnCmTeCRV&19ZUOY-cvOvP0LHfUTTqakg&7lmKWbc1<PE^sYQy8C}eeb62e
zg6T3HGEyh^<=o5euU2_nyn*M|!eN5A&fyn-j)=lj)@=9T#{+3YbDd=|es+Wr9^+e{
z=z$gYr4Fkgs#U-nvN6JZ4@VBHJFn$Q9{fF5_~1Y+p8b>DPRf_!&9CtKCu-2>e8*;1
zCxm_S9qk~!bnHYY{fy;55Os!(C0RevH2tc5h^?aM*Jf+Q#J6%{8jDe2Alq-4pJ&?x
z1Df6w`8W64)?O|%L~oR-kAdQ=(7>q6P<3Z52nSR(Z;yT?SC=A;*{dKZ&APht{P_-<
z#(-P%2AF4Twn$|gf_0_w(~q;ph5A=_o48f5hqXN}00W2#*QM=A>nYEcYwZ71)?Y`E
z`_=(YH&GLb;2P93=)0Ne+90QY<JMv>&KJ^!UeDJ&=((zHj66oC9}s<7A1)9l>Toao
zGGyaVidlQNzMT=Y58s^K{iiGsI5}K**(}q}xo5ul&cFNP2zfhJr}kb^U7_zcmU3Iy
zVGJqw->waq#P8^Q4nF(kMPh%FmEC;N39K@({^4*Qbo<OD<IjU&!p&ain!5&lo6)T*
z`HR|XVXIFi7dw9|idnnYANt$9fc+xSEUJ$;ln+AQ+1_iBp#m7^^b`4e#`=IO$xZWf
zfjFy;DCQKBn{uL6FmHkx)?N}G!FEOl`?E=NXqmX|Pz8gI2i$mmQbz^8VY<GzxA(+F
z@$1&Ce<HBMX?3z9;7)dS#u)1M<@_e>xIsm8Zg-={=HOtdzv^J=gyR$S`u7CdtUobA
z^!BYOZNgZeJYYBvpWc1}(e(a4A6Vp=1}+En(9d%PlE=2GK8_l>s_%N+3j`4cIFZ>o
z6V%)QIV6msZEetdlJOqUbqEh7=7IV1Xn|sLPkQ81#*y=oM(*p({B@vPe(^#-PLxdu
z4LZDnnY?ur;s9E{#}(pE!Mg$JiCT}1Z_zK%)h~9!PXYp#qDDXm{dGuTSTXeA+c~Kw
z1<p;VvmQ0__21&gk0hi7eQhqgHzH~Ca3Kc?K<AW?aPdNV!{<}kQ#JeH*)cw)re9ow
zb^}<*xEHfZ{{mpfJHId7Ffgpk0h3JQJI52b_3xYGCtHk;K~_IfoJfDJ2KAiJUdxw9
zzP3NUYCn@N(xEuEL)z{LBd$4=^%k5paDVC<%J|eXJlNk$^pb(qzeV>~ew_Q5;Q1}V
zNz=&PChh0g-4lIpYm<XA*eA#Kz;oWUJ21)Yhuet{)4U?!Hd%w|!we~5x!I}WXweIF
zlahA0q?XNGB%Gy>D@XM;Vv&w3`^+)aYvB1hBEb84EjIO;^L-|6JVE61OluTensQbi
z*sKIZdI_$Ch741}8=d=cGS@gt1z$%Y9)c(CV)}mlN@J7rZZd%c7hf)Dl#K;e-sgwz
zf|KKtSIK<_`^HsdNOhpYtB$;#)w(m@_h)E<UQMxfCip$xr_6X?rO{Yt-o!NNY|~zU
z+o>=#hn7gcV;(0ooDu*rZ5i_)HPzjeF#fo%qR6yoeuI%r`ez<(ZI2>@qgjA4jLwB;
zSGBQ@7U(E;ROOUL5J6vuB7-NH;%HG<AEo|(Y`ynC+yDDL-j*6|t!isa(dx8GsFK=k
zQT5aoHDc7N*n323RH#)QYAagW+NHJ_v3F2n2C2PbMI^!Zq0iUrb9=u(pYK25humG)
z<GRi{*ExA@Zdoonkq5zNxl1PFs}#fr?Ov8gy#TxbvVqs!1lDwhRWASMU#|O<UHO4D
z88$5CM1<bh$a^qaQGe-og;><LC5O0?og+%o(XaiV<@JyqoP)do84EC%CV*-42}M#J
zom<h<<I_7+AXHf8ALQG9JDnK2c7&=P;Nff8j}|IJz^Gpig?f}zRj45+se@xGZGY&;
zSSFgv5rFb}KLbs9YNp)Xn~dax?zM|hTy)<(gGlW5e#rOfa<|UA={Wh$IF`rqQREvu
zE&We-{Ra0aD#d>>Mk(iw|AKim#Upp`B?1dr{VnX6^U=?4p9s3?<vcawJ6+0@Am=Lc
z{_LEoB$uBxo8pqa3VnEv8B9S{DRm%<IWs+F9DQ)$mF+SG9Q+x1XQnC5Ph*PC-p%SH
za<|g4(R8=Xh!JLdF5g3dU|Hd1!yC49y#_XZc%^M&gW#3NNo-fuMtevO<2eo$oCqf$
zsG?gc^DblxVW@?h3BCRZsZkluY{{bvq4y{EM@q8ht_ka?)$s*~v7d4Rd^s9<*T1cl
zt0|SY*x#E_F@o+SyN9_rhh^?%UJCv>I^~Qy#B!->rk;t$2I3?Ndr*V#PI|psV<%?9
z_#uZ9ikfGJb8^bqOtl`nT(L@Uqe#R$xOqRTIF0b?E{{L<S$$%r4h6u{$yW!edQlhV
zViVMNUAibQ!<Q6)z;L$s!{<+qZcxJUR01@?1wkDL?t`A>J+0avxLS48tGIh00EGXB
zM~wWIi1hCfTK7Wr1G=^Il<4o9=SuY2Bx1&MFH)yi&Xh&MmA<hVWLQZc4x-B>)J(qE
z=bl<SU%^yHeZPUwJ0zk2`8BHV!~Wiip<sZH?b<I(1M~p;5#&}1(+S0kRa4(PekP67
z%;P7Ajz)Hf9cERc`&_V3`TG+~Ky#%6FD$~Ax}4{;(IR)Cqtt{B#44^_=BCVC^=<@N
z#NtnqZ--Q!C9wA{dF^#qhx<AU0>^y~xqum3ZEnn{W~g#^C?}=W4g?We^w@qpy}*d{
zO?1DjbIb7jKGTRb;z?WVq(lY&kfR1Xbqs=YurNbbe8~f4{7MdJd(p{!5u%3B$4Pj`
zzo@Bg4FX<qlt_vBc+p_)w?l(1diwHZ45Lk$4<Okto_QV!M0>L3G0m`#WeSSy_Cx#|
z?x=$45%y7v6f2%N?ln|Dd)IZwXkhBH{UFkO&lP2@(tzscFf~H0nYKr~;-F$mEIdKw
zjZ#)&nNwlS;b8!Yy|c{nKu3T?Ks2m(ZoWRfhQ}H<e9-y{h_dznLpJ5y&Ki23<$@IZ
z;i9H(P0Euwkh$Dno-6*cUW%&j2h$xQ^}a<MkLFBe$nDOH`xU?!S$m`wDtfWdG4cd4
zxMqDoQEV?VAno$(uix)1VIJ22o#}JajBsilXV;Kc`Tga=-uGk8B)$v(*Z!>SsD;Bf
zh`~*0`!4mAr~8M$9IJy8r|~KEdpK+NvS+1NZ<tqHM<Dxnj5l;TG%IfBrM;Qx*ALN1
zzX=Df1Z&<+;&${ly+%lb{Y4ilTVdFgCXoZrFZF=+??{IKL0=$)<Pn&jPCm>dk+2<+
zQq$kdM(UpUSR;)b1v)ff|7tVIC0YYb6PcFTVCLGXsye!7cD?^BbauI7MQLNd_Q_4i
z(ZXWoL$4U1e6ujDfOihs=;JkffVy)H;@J-wjlfgozod5PhF^hIYDR;ujei~8_zZTS
zMwF&G3FQQN*=S(o0`Irp&=I#VF@uc&<X|Nx%z_1UjZ23oL^irz?QsNY(FFP;h3$uZ
zls~->6N&wt)s8@uD!ntAJ+_?uIO;O%Pg^7ovdn&cHdI!b-<mIQX`rExUu4*fh+-SG
zkGt2m#yRIa1e;vt49=u7MXx<ZeSiL)oXsKtR52+YzW>TlULSsx1Aknvw4136xLR0)
zhJ0S?Q)5Q${&>7`<g?>jb3?R=ZST=2DK2C0m|vvhYeot?DuRe~RsILnsdv3~FuTQ>
z<}G3Mdcf=!x?84A4$v8dy-2ad^I1BSvjM1>um&hwaPIo}64(Lu>0V74Rkfz(QThii
zyr0&yo{e41pM{M&!(LR=ZdsAZ`5pc4vM<S#Z}%tOq;|sIZ|~o7zvH8J>8F>$NW8nP
zHt%FDe|dax(r>vA^kMl_zKDTj#>q1Zb-l2@U$n=lLEp87f1sqM`fW<pS08#%4w}z*
zI-EZ-Wv{H}QHIIz%D$Ylc#V^(A~GPrnth}KdZvGstL>GSl3>6#Yufk<d5K@>GYiqP
zqmVEI&21pqI$7}5)EaE<|LE?CIg-OG%mO(%*Bh<GBtX@6&mLKL!m#FDt$}ZOM<vHu
zYck5#e11+oDjR)tz`9q5uD!k6qkAN4UdIu>-@km>R82CXpJg|ZU=3D<1}3&dD6(vI
zOE)Rdhg5>`@PeFRz>fhGq5wy|WjiU*IU#q{|4DI@n|)^nqv*UC{Fe2=8qYg1im(b8
z;R0gmi05<Jvjst&C9w;0s!&KW2cpuE_IfFgbC;K)lk(EZOwaELC_JHc2UMjo5u^s#
zg1G^YnhOlSuRc)2+JgiPATR&6=<?fHtwk98_Qqi&sC^F>s6WCMO5Lc^60F-$t`jMK
z6Hp*D>$R($W$qyfY`lo^o&1(vyDMm9!|>j(hkq;gvv?*c=N<^L_8vG%REAy~5_iu#
ziP-<;K;p5NDfap-7G!v|B7rmp5ZLi|9i*z`4||-z_Ok(+0L#1i4N=unlYWEOD``JX
zm%2NiSJ)5JeKJXsOaj&39?cu6OYsK+nJY6HAMJT%(^l_8fuE}|<1F3eeJBf^8r`)`
z&bSOM8np;ebtSU13coGQV4?N1a%%2oiu>Lyy5-Xq{7V0mS-maDXk#=t39tu-%vHnH
z6e|P+$zGXS#etAN+Xh?hBFOQ;OMDx9|AvXbwm8uc%JpN*fK8(FU{CD6Q{julmUDqG
zRq1rHxorVf2Ozn|f(FZZhMXU#tidyKHdJ;`0-zS&2&3a)VS+JhV-w<4?iW6x&NI(_
zfMOeY<rQe5aT$1hkgI;UO*C0eGFK&aUou8|we2*Z2wyG($;@b9rFg4|s45zLA30cI
z$Z|Qbh&xC|R2jsZP1~^oBy_Bv)`?^<A5$D-x*>DlU_NU1lM#m#qP#FyLni=m$BtTo
zKAsvB2Ck;Gv2lF~yT89oT$`5Dm41njtQ#wT;6&U*u6OhJ(TRL`X6Ur^0yFvQXiE_v
z|9amq>HBY5M(};Q!>QS3=Z`ykpNflAHZqJH%C%owFNaTh{12Lv?>{<Hx{9-9EgPf9
zPFr2`b>I$>Bce+ytRA>m^`upbEkTA>>U67gv>qnq4E^*{NT?CF+cunoubU~Pn|$Ok
zFz=W~4%;--&b~Y}o9q%9KRu6!(<901S=p4WdHi<mJ_O$#pt(Qeko<rk5oQaxD+?-@
zR;GAZ3}Agpy>->j^LMQJYEhHzO2AIC+Yq6QJX2(?n&Iu4(Hq)$xk*)VY186EhMzsR
zy%x0!CFs+Qi6S{Ghs@P{A9Mgr>kIx7$KS0(@VhmGI5^^;iocj^jC$%vj5Ut$WIwX&
z3w+E2RP(y#cpAK75?oV<ctiQ>I3mlBkuI#OKv#XwoY498!z0Y~l|&%#3EMiwt?GO1
z0r!aI4uDPZbT6O_=Lh*pS-D_#BKh1uBLmx8zE@45b_&yHR{*B^Ex?BaNsgeq`CTWK
z^9f_YxNGG=F79W$VYirjJ9P4whV(|ymJrnLoMH$I<7Roi=AT$@iBo=ed@KJWYI(d@
zYgA&V?7sf`u?nO-zev;uUw!GfMB)}Y`NKg7AOtD+`*jxFn8ELdLf?Y^2CwnQf)que
zn~iLp6M&=WgP=72`YSVXao?B5bBZRfBSI#{f9mRF1UXD77vM4~K0i)Di!tn4E*q@c
z5Q8il&;XCINq>4w)cN9a=@h*4XeAH!5h5!2q$mmL7_^y*`lCM0@E9$N)CxVfy~mkp
zw}K{g+|M(4JJ%bX?{(|v*o`jtio50`Ecq@Gf!ls8dlM1RG}i*CVyE7Sn=Ul<6kLnq
zaNYhejV(OHx1^e9d)v<Znh&GdZAPv6rL6=rO?x0fWGcze^~vYL8mAICMl)?fdfppm
z+mT|eL}V(`Qt}+{w=U}LF?-kbDWlAh5D5U890LBwn^D__$=fFQ=6pN4pRs+2WK}b1
zU-N#A;n!#n#fda237cU5nR)S)G`+w7SpNPA0e-14u)F}*8nN+`*E&W#funQggJocY
zm&h)Ab0+JI;G<m)rii2LShj`Z0FJ_E+gj6^=S1k;&b^XmVc}z`<0~HbDKH&g*|S6@
zK1P-En)8w`#~x(ExX#VyfCh6Ux<y)QHGReZ*Yqq&W%;!Wm~Ka`0%J2%*&1-%t)^{7
ziadT%1l3U=G5#|mIi2JSJsO47-+&!18#;B&Y?8C%mRar*x6dJf&1hx$lX~5}MW^*k
zO8F8%O!P${`V`%$I{(*P_5VWC9Lt=ayvc0y3(F>kg}8T{K5esEfm`VTFUHcuL6<hG
za{|A*Jf!-(|D20Dc_4HS?y2qngUG@{ug%MpjMQ+73jIoL<r_fE$+2un4(O>VbCCdx
z@S`2`Xoj-v69KBwo!2No#KfNvh1=8_*y0ly%O0E?hyy7uz_&U@b&$TRq>rOa$}=+p
z?ERY5#@p_ao`08>V1v>sLg<3pj(yFA?|fhfa^_-_Cju;=O9P~7YJ4~9L;Pxsx8fw&
z390zo5CE;<Q2X&P%KTXJ?G)yb`axm=wjo6WBNO(DTk7=*mXqm0i*g9dA%W$`;cJq~
zY#A(*X}(f-T>QMAR$!z7ou3dO-ntjj&q`Ih9vXz@5*^AuCvxyOPMV*tyzP!^^Hq;p
z^5+-}_VH7yN*`uCUqo|zsxWHawfvCf(muit8WS1)3ur_H(cZbH8E?2a(U0v8q00A0
zgh|ZteI5Tc;%4+Si)GC<laX0UIbg29fvBgAIc|*u=l+KQ^Tm~OG{60JD!%BX-@wWr
z@>L9a{x1d@rff|O{eo!SS^CtvHux|Qe<u@+Fgdr38Op_JzksT3boKuiPqv~^Z*6<*
zW&ma!#-&Gtwx&vMGa-S~!zNU%Fw|>v)Q2MXqB10enI@>Y>X?`Ees}#tev(6#!SDFs
z6tfLv@mg+y3&P}u!|2(=2^J;EEw*f`q)%oBU_p}!QWmiY%LvW5Lknb7Q;4PkEPp3)
zLq&3NkiAbw-zLb+r)!1LW89@rc_PT{tz80!t=_r88mhF_j{o@2_m?BB$LBf1cb+`z
zz4+(Ar?+!+a318ct>SQf9@E*cIm!?|D(*CXU=R7$<2)Sq*brt;dQ{Z2aBz=x1d#GP
z{XkF?@01K-g{<BKyi!>jQ`3JqCIIkLs+!VRO#{0}n6LdpVd=s;d$d12psHvDp<OMZ
z5v%M|nfxNB_17tCiZ&E=g;>t<=_jy>NNz+naL-Cmi5P8zbKc0|r$+o2r@+4ZcBpHt
z?w~iGPI(ME9D7S00(i!^?ip`)o?qQP`v9@>Ton&U?C4IdO%5dc#nCoD)!)V+9PR<t
z_*e#d7P=Ryx7ert_x+fQ{B(5jV8oggB^t3If2<HO&F~g9EYdMkEx7hCSWyt%*Y_W_
zWo&W%UkUD$qAnz5xZ=;aVbR#TO}E<TAN649+t;!mt;M{b->`pMhJ;G_mrvQkdY?gV
ztxWzHssX%klM;SAKLA&s9_^|*Q;{C<=!=sw<BK{D<l!IHdpene0dAtDd}~}Zht{8<
z_N2bk&V?!h^WMv6X~l4@{e7Fowsi6QUEjniQY3p{_rXCGDo=Zpsa)~8IPB0-8sgJg
z7y~+%TFY({6&A}mZT5)=QSVxoYR7GqYtK1-aI#2T1E|3c?*J@deyCU{rg=~3g(Fyw
zhCvyJ1#l}ZXabi+av}083{KZkHOs<G>f9&;H@6>!>F*p#!4EKX?#61+Fn7R#lHO_v
zkk*|znEB?KEJ0~1$4QJ?@k~^w5acB@&5IZ#l${RXt{*Zc(S7W5&!s;j03*1%`U``o
z1kE*{27rS*rhox3=@1KB2d3ZfQM)Jg7M)Z8i{Q$ll-qE~DXmSOHy;7j(th6Eo|EX>
z+WIM$7Fe0tW2e0TASorO<myR|qPzt4x~aE2$$(iLkf)^_gWJ1%WlhGmCLO>JuMPRD
zj_>`_TLulR0`AiPo8OOE;{1ASkiO3boNlDd`)sK|oDP-zS67YwHd>-6E**~t4h4Xd
zQJ?n>8<vceH!mn__n$d?&VRY;XT0*{MU~>!mB}plnkqMLa{2!DEu_B(Jt#yez=Q3m
z%siUj6DUKfC#!9EhV<V1aQrpj%3{_P;6Rwzt7|Ke0isdv@j<HfM%KQpt(sD`?72$#
z_xiycaiR01apc2TfttzxzF~=NuKSj?PhZFAwp_ew^U}?EOnW)Yc-uFsPR*p`L9Ro<
z<72;oigG!SXU{bYhl=Mv&5IoGp&qjk8!K&#YJmHfTqc62$&knuPf#*|bgMBfsw&W<
zzNzsAb~N1yQ3(tAriHYR!Uo7sa%D4>C~49x&7goK9TOuK>Z}@qnBB1&BH(1@WdYX5
zt@_z}=|@hoRJD6NH`cu(u8+Jsxki1N-DyqfANY2_io3Q`7I8Cf-0sP~<bkR>;9*&+
z_hIk%m(_6?Fa`l#^~z4n9^UY1R^ksIa{m*rKZv1JMrLa?gL315XPIL&nQ~RX2+~|P
z&pwt@q^p`CC6_KlS|WdSqn24AheJ*|QJ&}Glz%H<{&!n(a2h)Zc2abWI*FK_f7Dk8
zaKczN<aOQcfc?)ml96o6zK69ygqucTxL;aoKUic(ax94MZ!wlwQ3`x+*(SgH_~<F`
zKX3WczB|Dn#RjlX^N_=dv)k;er$wQwm_H#Ze6;=dGpe$*{HiL@@Mz!VMhl6Tf;>B!
zy|4X^e7fc{yS!mDjLQu!oW7OSq06`u>KthtYW1J?nJf>CGjj(%eGk^`HJT4H`c%3Z
zKREH&P$wsBp<{qQz~m47qx#0`XWGj^7Whp`=Pxt!OyiPw^t6ac(i2sNMOAjn{B+7!
zMA|(>$bcv93Q=fCtsf=5LuVhw5n7od*#w2~vLA?at5LuHeySi1EgBaJ^hEgRW?USP
z6Lid+jyt1nRB1zL6$rXz`%g|c*WMlV4~`F1sqMVu=v?tamxv$QIAFWfyEQZn*fxm#
z-1rtGbPN(@#bw6w{6i{mRGfemYV99WGf2_`5(ry-;6wTIV1FqNp@R(aSFbG$deG!3
zXfRlz3t&S?&DDUml3l>Pky5t(1^i9>+iqC-hGWJM^vfvfn0>QAK3DwD@xqryk3pIs
za((X8Twl;mHl5o+{pddr1SK~P+QiLwjwYAlNnY;4V{D@sA~kw?ub2*@@o*7hI=Bva
zJ<dFpG20mWG9~s<ZR<PiQ|vyxWItx1mc1_>&3Eqjn3cRs4}^$S(*P<S;xm&bo`T;D
z`ev74hA6&U83WaFwVU^~PeNFXu9=Kqxf`-em|nyN%*84y+y3|6m}EYt)-EI@Pk1>z
z`*Pl+VnDW=G0BO?o2tL}$-Cz7XV06;{p)B=(6BPdQM-DrJlv7H1m(&i#20cmFkZE|
z!dAIb<cpdws(}^sK_8`$z_He{_08gcapfKYr>fVLqqoMTcf~K%qHFWBDgB`3{#<A4
z1TS^V!5YqDKAVW?9Qc?>gnBV>mTI54=DI8uE6^6B>^`4p4MRn^i$%V4=@Jfj1FVmH
zS<B~1%u2s9%YnoVWlqX^9*=E?{y_);)$`=QB7mYhcC?9Gq1%|`LLVXaDZn_&N2<H?
zJ!n=9NO6%cFLQj>?8v+KeE%O{e$jjIZ_Vv7D*-hpCyeZEQgxz~a}S}u>y><Jqy}GT
z%M<&j%pll1(&Wo3TLBU3Hf8$!9TW^Tf!4ZV@^B*DlRng@kh{L5d{ZOSokP2~zxU1!
zpfy!8oirCM%MKWJ;B`G{tE)ST_Hk*>&z{xDIe16?gwM<Ck~Mughe>14&EjL~Y5scO
z7BtEG!1)XR(RMG5E0t(47xdG!gI`9J3%5(}7tDbu%+FOWHtYL|2ph>3w5J;b6S=QN
zMfr4?`5b&P<AWX_j@##_ncVIm>~(icxNSGyKyIt$OM}$QZHg-J06q`g7P#b6c!xy4
z1I<&Q9JYP6(70&jlLEchXCYH;U?R*I+wW5S{P*DXl4u1q-15q0<$Pesq~_%2$V#e9
zq6|EwUb2I}U>S}U1XP3yK@o#qQoN5VWL6R#+wCYV-mj$jua>JJnND%6R`JDuRtmY7
z+WS5C;&?pDNb_xkBipR)WkfE0TP;|=_kKCxKl?-cU%!SA8Xri&Lcz8>Q^Znr49}g!
ztjhd=NzNhUFgs_DH;dyPgKnzym@NI10(`vk^76_(0avxYMTsL?-}tXr5hEA?pRa5D
z>rpgVdTAtsgDNwRqqIMh8mD}HfJmthd#?8{F|L-3kPjuDrX%J43XSbxF5|1z=u{3Z
zSO&g_jw3}~+b-xE_<+4hGlh%XBGlf#o{l-dj%t-w3+26gduwqJ{pgW8RVaWvD1*b_
z9~XQm+_IzCIaNIwE&KJl<Kz$1mF~`m6}R~&vx3Tvj#>?Al0|keK+5Y3ofBW6y58$S
zE{i|?D%sf#<onMuA`o7U$`f>Jt0LmKsDS9~(NU8>Qvt}a|9%O*-|R=<b<G5Brt&&2
zoT>*(@<aUEZaK#Jefhzo^6@<2NTlYMQ*uzx5a1YfM{{R%`kV?cK(ipZokpbJ*XW~a
z{1{Iu(fg1aqP^%nc!!d_AiQiV+%2fV2scKHFa!{gJ#7ofoKWR9gAmGdn8{#2Yp?)C
z>oK88DUhpS)6#@rWSQf{Y%2rPWjC{XJFgqq5LE;Wf2MW8^WXj%3hCG7*N<Ovcxd<I
zvH7R|934=ZV*h+ljQ{&+YowLLYJKW3V3OwqoQ!v(PqxJ@X&wK-fitK5mOdJauFFot
zDEk>BZ_#=~E1AfhPq+SSr)`?i`gt$*SU!=s=Kl~lCwRQV?1Udt{*Js~%Eny=;{G2b
z%l%OpEU&@aao&eC-qp3$`O9%Dn-B7$BflsSk9E*Z!jE5jDVJ3A1c>R(mHVR@o-z{c
z4}S}^Th%qx#F)&gvItpe^&Tw#JZ18`61sFK9!hPMTe8)O1LL-}_C}|H7(>1I6u!ij
zjrE2Jw)Jj*y!1TNI^gY-Kdz-m{zPO#y&^a}ufGF=8UIfu!n<|n{$%e-w3ob}?=p7a
z<I)9#tTf9%jjunW0*DQ#Q@KI?$hh+;M-b3T9Vm^sZaLGl^nOfzt~bS0@0sec1q(dt
zYe9LJsmP=FLNDXg7Q?YY_NZ5ExwI@XiY6co#9kxOG3~SCbW^+w`KfAnGA%CKyXzHV
z$xUcTri=GpGNFmT@yfhB>(1+PCi+SHdpiLyELFy-S>{}HfY&o%zpeSPZMOiqGZIs+
zNBlHP;(4m9Q_sGvHey9W0c(9_kkAkDA1;8m%KK_IG3E~h$~!!A>8jr$0LSqK1Xj;F
z@2|es>vY>T-v}Tbi_V5_9LKsH&oMd~B^zFL5r{kgksSX2;I0m9Ow_-n7x0_~#{lvC
zF%E>ZUIr|Y+$Ut|k35);G@oc+O}crh?(y|UZ>*!>|MTnEKZs%7;kuE7Pa7Mz_0SGF
z8uigd=4jD=x9aeodv4?nEvbGHi$x~Rr58)EF9WwzyXzbm4^M->jW}V&XQgh$3fAeZ
zR}!*;5WeHHtSq{XRaJduQa{HH5c6a{CXju!T%U^TMWaT&KOB6S6xFQ>J#Lzi<;zBR
z`}WS|AK^hvEySx`G08Hw{GF7;97mid-tLbYvtT99uputXW6Xj%zF5+OG<Rmk0?3jv
z-q$gt#j}Xdu+%%zgFYPXGn63msYJ*ex5I1qx&%OSI<jP9Zj7A<B8SZ|ce!RQ3w@|*
zTbiv_e2|;$ZE@w5!<S>}oB)|Q4jOxLj!D!_2Y_p<&Y;B;64XE30`dp=o(hgY)PHNN
z$vhr5;Cs0O<o3v9ZlIGbNj(38ZkCkS-_TCRFx^hH(3TgtP(rL@<Y>9*PXLsbDGQgB
zid8-u?R&Y%`k?4N;Zt+1`1QZ2o1Iu8RAhbQ-e`6EC8_b<53Gxd|3^Ivpfv2*zJ987
znZ_IMoBiT$Q6CO=e?1J~fAkIUqd-dujEf?i<@3(qpb2_^F1pWru-+_~QXu9>fz}fW
zaB=8HEb>evh-AC4p4%?Ag5l3iq0X3oJ{N~`m8jHxgF1V%<gUY`R9_=;Gm2CVIOv#_
z#0G~A`hn_XMfN78uKmLj4Sr*#oCqKPhJf4QF>uAzpn-CH2XNG3tGJu(tY}~JOq-W<
zCy1%nbxqNfjxC$ao-nfwQI!s}x-;9fgQ_~aS)8A}d`Ey~i2eL5rYtsJ=0J#F{ToU;
z%pGvW<ekk_5)BJl&jA&lqxL1Pl|-|%drYDq_uKGuYHlvMB|6EylJ)$MVXVm$D==;$
z&=&WKtt}cGEDugSpMNrUl(j9=TJ>IJ!#_z*n6fKb6CX9u1NvTW2(`-I{JjeBsWty)
zwQ(xIc0^zIu|de`&jFvhoMSl&FpLhN3a{=Q^OqLaC}5(w|6VK0eq)zJS5#50uwkKT
zi5w%o#be=*0cbzsh(<}bY2GGj&zirlL|^`^_cRI};jc#(n%5V4gptT*!5JHKzvK=w
zPQAi2lP6zgIO>o=oaFts?>$V4`r#Q)6PK^ElH?wL)_t*Z67gk>F>EZROya<re84gx
zY?FZ~7!Zt->6TKsLE;#-!dZv@@2vu8nUPA$2O0PeJ<k?w8yJlq@y^+OI%w$S&pu)j
zCs>XIG}XDLBx?16Nm3oaww~YJlUD9XK>fu&uiJ5HZ#CDQl}1?o@bSb;o>z|eD!oh!
zD48s#-XB-it*LuinnuyDh`HqgwiyIGdbgtc*?f96$gkPiIZsK2*)a8;Sd$Ix^3C1#
z?k;Ddnx2S|n2^`yhLw4heIA9J0^5vD4~b0>>0o2nxb$am(K0O%`Ee{qP-&(2`xz4I
zAX^YN%OSsv@{tkZR#)hBJp-sHg-&)(?4CF&azY>FjjLbbR5=x*{$WNz$hTYMilmJt
z4lw-yy4+JPV2kI_SGLIL9~9Q8;vB62^n0Vuz&3BNcQ!M~6Ck1gno>={-T*jh7KOn;
zFyB~u1AA+vaOp;)L^AWnLFnR8BqtCkx^Q{R%XQYA?f!o94wKX0;U&21RNzQ$x{G<0
za8dnkf&(w!+o+PX5dW71d12}OY`$lMyL)pxoot%=Wr034R__1Nln85NAjAd)QJgO`
zUaowgM!<jd5JLZ*TmDBy9xd{1w14~>E?!fthkSTS<??*vog>TGf{pl}w{H$<`|~sG
z;g29M{3+l5J7MB4y7N>WsXi5~b39q`Sti5Odod$tjU=;7%7#N$!8KTAr;KBdq7|XS
zo^Ob2A4XgzBufh!yqK7Zqd8=`nNgnsXM<Bx4g4#w`p*FdZoxI;pU+bIS#$f*0c*)m
za&yX%VJ=Kt?)$aQD4<u0pK9V9VYG5vJ)0>&X#g{^xS?S5{!Gh~A?$6aSLau7k1+QK
zFG>0UXJy)wn&++f=tG$(M1D7aDb;)Y*+^Yg1H-)EWlr3r?IXJly;nPxg^DoZ5ZVRj
z^XdfCTY5wNE59~=Ug>)G$=k%cq6SuPzRJ_L5)ru=v~6X+qwZO2EIbu}m6?>WEH_CI
zGbz7q{LV%!!QN>Y{n%7Jgq}GBxA*nLg;zgR*W~=1CuwhC^L3Do<SxSnt)Qo{^nl)@
z)Bu^hoAseI^o6FiCVgz;J(RIJDK+)}dsMqPmi?uNiLJT_sHyzQet$ZT6jnohY$AaN
zJEZw#g<psdL!iz7xl$uc#C<n#-P&>Rwoa99B+wcIHeOfMZN~Jp6~klLvHrx3EUB-T
z-o0@cdl)kVWuX1xs(*#iigVX?n||8pYcoyiNwFl1{aP($C*U9OocB7rAoe_${s$qC
z`S)1MpSfB5HjUS{-qc06F8Y(RfD|)w?S2fuD%;@L+b7YlL(Hb=b+NI}|G7|I$OBV$
z6JSlRY)mh`Wb$*rt=<uG@a||XCil?9$#5^ycQ*3Kz;lYLPV@VXA+iWl$mNIDgZ7Zx
zH8l2Pir3CoAFDQDMZM=~n<zShn-ggU6M%x0h2%`#yIQljrhIon;I3l7@N2!kHvHzN
zxgkgW;rNmp5j|yOj{3wFTLJJgu`{`{5jDPAx+}ca`qcaOWTzdY)onDfHQa<Y8kopt
zU6RCwWPk-~T%5Pt)##JDl;XD-)uTBP>vAV#8XRkB&BiD6?e(urh9FTP^U;V+BhGoa
zEN`|y;f}uc5Pif#YVf#Bws4JlsPSp*rc`E=LHo{%lB)wYu%9Gc0#8RtTd`QY`1#SN
zo4VHF)vG2u4YmW#f3Covm*#ZTZ|Zz*ye+-PG>Le{vgQbJZb;f*m>KwZ%hfNi_O0z@
zK|I@(4+sd=oUHQM?CDl77mR@GR+`Btzv&<oH?<9mnz-sWW9$vS{d0z_=s?ZUm6VpH
zSFC^sV;2<<)k4=(zo-+&O1ryW138u1esV2WXzn7ivND3zUd>-<zp~^!nCkdf)XrV(
z_BdmI-M9aIR=-z|uY(mYdY9I6-%cYEv0zBO>g0D+K*Q5O;O)16_Wo3^*8gsAXt?D2
zCmug<vKz*Wq*q~A&-5&-Xhe)tW;PG9`%B9uq%BtNal)=SiQfU2Xf!91SIEoDbNl=4
zbJFVp@F2l?qDL8d{Ke?m$?}hRbHw|t^SsK$f-2u#f%7mSZltXscyadDl95_rUE1ii
z+}u$Q(j|+jRE#|Qgq1k`LGeCVMh25kxajL^BRCwqU+fkV!x9|@C9pZF@PT({c_k%c
zEiulstudMe)-@as!@o%@ZbV7;*x&VgefedJ<7*cs+m-lxW8bDp-QU{9`|XVetg+t1
zio!D^vW1p$mX}n}+{U%tg4Si4(q>n_V#<N~@)o)C>~LiS%>eS>(f4)TbGGGNJ}aB3
z0R)WI^l{Jvt4-d&cO?vqc@MJFg5)Sc+w;1K$;q?kw&&Pb8VlFPjYD5{wZ;?&_3)BB
z!u_s(1pof>1CnFO5iKR_A*5@9<vuhgcNGQN%!$z5WSu%0*hSfO9VvGzcJ}nNNfVLw
z_*B5D<$vz0et}tZ_@(XQ_V3aCLtVa0!yMTrdhfDcs{JE%>5R7@r>utJ%Ln?4)h_^4
zA*7EsP42u%HFcOXo>e%ChSK6jD~r~Uo?fto0uwyF&*bhU^i#cWn_BGjDi@j_yD-+e
zZLm*7xh$;BXV}e$6upO-2&7c@O9=&f(E`a~{Ci5}W%N<lC^vZEjETqQ6+S*296N@q
zn*7Yvdl&ZSP0u~=?dW~KOK*CV;j1k@Q23aojF|Jc9!<hzTH0;!u$2!-EF7K+p~r$6
z=lLBQHo0)Px0vcp4b^jCL7-?^)Vt$)6m~F9wFB&qEerN5D=XIfk2x2@SX8=SD{RKL
z8p(he^hJ7ugK$rp6=p7mbmV<YM8O5Blw@xHJbsK9eH|!SKYN^Y*l|O2`>i2dQ{Zf^
z{TXq%{FE#KJGeO!5nGQu!p1eFi<vxQkp0Es(8J+Jy{yOgz;PN3&fF{<#Wr@vYSZ=`
zGp^k|bmPm&W;@KEc59E-rH%G?W9dO%zD!oUmy*(T&OT`Qf=KsUy6QXM!nOYl(SSEL
zfxutYDS9`|d#xlFok=J#(2-3ixS>*at^a5F$)(hKu0%Z8chqwH!{IMKjN-nTxNxEW
z%e0DKg!+iwIF$eH+1%D#;WS0vN^+I`_yV%|3_u$8S-QFk;D22Ew?&5hw8`gFCWd?U
zSGaxNw-rF^)tc^_&gxn_xhFM~Nxy%0ZF>l4Jy9;Vj)0q9gh1J%N=x$}gg5H~hf9`f
ztb81+Zz`GKKQaXn#Sb{@ZMb0Ptg}xQ%{MZfj<ptvl_UCkh&Z1R#^D-nW#OKNVZp64
zCZ2>ocJ;?}d~Uc^OzV~T25xw`g*DqQfiL#*w~LT%;#!@N2kn_fg5p+-+YY8WG3Z4b
zW3NC;IPOIwukj=YIVp7WHxmAoJSN-d6vDBYcrVmg-CJMzilxh4?fuSGrqYNqk@36h
z82iI~>;;id0KI$T8Q-4H<JN!Bg3Tr_;19&R)!R;P)zf-68YuA`q*S4SM-G>T*BIi>
z<eLs;tkZYjU-uG@j{V4OEU2D&ji;meN(FkK0K#B7uaTc28gEH#K5Pg8Oy4pO4_i0R
z-1xEkFw3Zh;V0tVD?fX()cK%JrvH4)ZXgR5fs%qRYg@mEXs9)YCa}LQ)NYj_KxX(D
z`+hEv4LFWoH1`c232l!1y!ReO0$Iorw*CPlP1$jWKASfToR-K$6mkQ)wE#3n`kB+U
z#%%;2HcZi7^vQLEy^<%37bN){i>u<kYs=HTcB2$iQK_6?e=<`Pfk=P}4UG4>_?W!e
zJ9j?U<(*AfSm4uDWN20ID|rA>Q0U-w*fd&2#Xxr&b9e(*oObw&&NPb(g8Peu!@%&x
z79bGkOD6K?Nfa9Ai^)49MF(E)k7ZFSBbWqZAwd>F{Ay;=Eem9x*SK@dz~>Ch1t}~R
zihOS0HsL12xlQ<plGhuu%wXmoHCB_^YVqHEm`~QcsvA>$MgK0f6`k`q=Bfo@0uovF
z{qoOTcnt7iqY4kpyHjLoE->;b6FSpRg^R)4BPyS9>b9m&=!CzE&71ToF(NlHpq~d9
zx8vIc9c3MOn2%azf(tJX3b-#Dw7Mt8scOl;dv6%#sm>w7Yv2p%-&A=Qbgehdq^h%I
zetIKRn?P_~T-5iSYi1_JF_RTfyYLFr{GJvd-+joCd}!DON%qtJ-s`P<H{a$#0D(0#
z7Im*$t!|P?X<2dQI}+VKDmLai;u@blvKkj)+eNzZ3)yxhv{`z7fL(-H|J(h1t=BBB
z(YCTzrUhIkPo-3_shHGHMVT|COSZ5XnwAANFI&CWg0!lS=jQZQva#^oM3_vb2Bs+~
z^;gxpe-)f6KEY>!i-^LXiIRA^d8WE@Kb%E5urbOdszp=K*_!ZLu3b`;w6tiU%pJ-i
zUt7(}=i+-6^eha_kEvLh{pi0KXGoGuMqzz+^ybw;SDgD>48K{vKkszgP&Rg#uuZ;+
zt_*IG&93E+@?mS;kY^lf<%cW2iA;&hAHJ^Kn<B((jmM4fR~E51a?d}-%HG_*xU{w5
zP^hER;KM2yhFR!sWMdNusYk9W`0ftfNK8+%biudQT!tin&I)9ZyP_Y{YVFGzrw7QG
ziEE!=^9dJ;MhCyHrB9nEL=3Cyr+nJoyQgNHpe9nG?*;a|U`;Od_#$jbD^MaTN|vo{
zE^?{3l>I)b?QzS8Pwq6*&>Zc5F*fxs^EC-mi#9u#9Ysb^2cgw|k`vcl!X0BBJd>$;
z_3s3N7``&(p5J$djaPZ3<-(^4vJ7^r{M@yosczX7Vm;1w8`R3_e&>$A#nB;~CO!pP
zUm5$D5O<j=#C5(gjgA;MRm!rCbQL&P7-`Oa1wY_Omp*+JBwo3QYfWy&&G$E|n~XsE
zw6t>bH`(3fhetb-1Ax4am=wom35!idbbre{cSsMY%o1~sF9_FfU81eM8axedy-XHB
zjM|-gq*r=tU6O(C)VI(caG-@6D`GZ>ZyqoXXvuTR3f2^P`@4BNTX{^1QF#`aS2$4&
zjC@l0<%^2Wr4<ZXa#jSU1ARLDKm3lCkID!ikw$v7OIDV2k^?V*m!cO|thn0o-2J|N
zd)c<Eo6>+OI!GYMpDc-gJ~Q^`3jZAs2K3%m7jdYxuj!-Xhy|^cg`1Dt-n5Dblg{kP
z6R#vAoCo<DOeMsj550V6b{w3mFvD1V-?oFj700Nzzxjpu<M+?5c4rYG1BRE!GNQwj
z2tpCqj@!ErmRd~6U<v^%6$bUN8K{`C#JO%TX4<(bopY*Y)2x89Vic#&)H!a%nCp`W
zMO=BBmwQ*5=DLWTCDyK#fvY6Nvm3m<L57R#+5%8BF}(e}S7EPhkWIk@b-yHq%z454
zKAxYsSkvlHm1^HIJh+<TLxxSz>l}UBi9AwM%xnv1VQZx92JAnp%WMS!t<kc@6~<V*
zeJDk4P%LkVofHXj#tG5seVIj_73T$)<VXn}=z47vXAFjYBjov`%!^5y5(Ac&nQm7F
zg6v#g$H09x1i+TXC&P>0joMK)@YF0f_>URxfG%-#NNfWa8utprz!CH~h}HzcqoY_V
z$TwBHLrZ)OScfpSf|NGvZ@_6}9H9hi9vnX2h4sj1YaAm6%Y6n7MwE}51c>xYIJ5pt
zwC4v^_DofCA=;dxF$yzpEY0Z`K*;jU;*-};?QZf>J7>zTbPNAH3qO~nq=Hd1AmpN0
z8yy$=whL}o=GYaQ+r%`u)Dch7c0L+TFURn8Pk{vJlq#WBMKZPE+B2K=K)Q(Q&X!Gi
znys#9EqbltB>HQxh1dtVh5R(>g!c>9Uxf0sUiWc_`n63<-33dZDiVP9inR<5%Q1}$
z9vNacLY$o=BbHxotcI@#OL0#WgEW2avWnD9F(0geUXjpVST})#^`x{2{@s*wWVX|}
zSs9t1V)-p&JZbrlNzW{lokP5$*AE|&NB?wcRIoBDJDz`;G<=N+Da5+zviqD4Fm*5W
zIq!LjzvfQ7{mi5T`KnXL(X1OSPI93k7#7p(a&D^biRHh2l-q4@UYXp^$u|=dmyeP&
zVK{kb-0)HB-PE@Z91HFp3t5(&cD$UOcy+@z!x1fw^zb-04skbWvJoSAxwxv8&yC!q
z6x198fm3FD0*mwJvlv^gv)TC^nat}@hq!xDc+DuDV*N01bmOffD}mZh))Y4~zZ~(6
za+n-zw$V;pn?Buvi>_|TT_e7l726ka(3Ci{Uu}pI{fN&3e15A19s1{LjPF0;8;(UM
zav_K^fBG4%38{&63&FtOaeT?{RO+(oc;>g!DR3|1y@mKi-E3)Cy!erz2N5SF@4qRG
z@3gz~e1Y&uym3$~wgHlk6iinLtlaco$+R~cm5RUe)9%Opp^t)!-iLQ-8!*aw(-=}o
zG_~9*6h&IK?9(liVH&<BDpBrWnnP$5--N&QNxjU&_EeCi?%el8Uq@e8c4w`&*11hL
z=Nkz<afi1S1*Q@ezm}68EiUtgAIjBOycp-RfcFJhSZrtaSFGRLsj*~En9s5FbC?5h
zY*hl|!PJ>R+JpkB2Hrz--QI|ny|dy}su%b<2|}=WUerVDeE$9>`^)uU&o&0iG@1Og
zQVu#|zudy2Jc=A5mfDRJ`e1N4R`gTkVj=o?uM<C*Tyw>fPWsXsy-tKiA~#2b{AXZH
zg??~?)2xoQ>OOU6dxq0}VlVNL&JM!TpEswtYIvk#D`9H!^)1JK4T%@V9?q7*as$Q2
z;RM`uOy4-h4SuR5Z^!pNTZ*lG@A@pC!}L>MV1cL|U`=t54?p}CrF#xO-BR)ml_68X
zy$}2^1Uj5=`wXH?bDMjG8s2E_QKpBvLX<HQTbRnlvo)$xu^7tcOLhLJBK8*qEWf^s
z=nu|?NL{@lRha3goV=sQT=gYI%<A6G+5^R3+OpR2ebk$Z986zu{ZYd`SEublX>@^v
z7Oqhb&_spTt-F<uT>KG&e41Lu+aH9IwSP<%(f(W`{({l0T0T$Hu_-gv9}ZW5XJxAI
zKTL@)Zjp>?=#5xy;xIPvqe#Vw83=^6y=XoCoFz(_?RZ|g-_mU2YqO3&TGVn?B_^!B
zjb1&xigWFg*euZTOI6ey{lGypPO3-H*}9m8eD5l1THPgJH`>nzK4B3&+FPAsVOtw-
zD?If<*tlNwiH2q|mOq1mxqgC)r6(E%fG)tGYTv`JD$zz^S3A9(dW!dqVA+@ZWoFXm
zRdUQ3f%C8LThBqS1PLB#RSu5VFbE=cGPD*Ren-B_lk#ji?!Q7d<H>^LuUf;6Ciq&O
z@^$|507VxQlWma;g$l<{45#V#@9r%vV6OeC+>lC3Tnq^Cf$XSD7?nh#Buq<~!>2vq
z-7`<!aP$~kZu2CR?ypKIZkSVBvL`lFDGtsA#uiJBREDswB|^4T_+DlLMof%g7JvT$
z<DfF@De_CzKnv5r7S7mAmILC=dvH@Lne)3+*OVV~I<0oLRD#dNrdtQWMMW3F#>h%R
zvB&!%T7IOrJ!7tov9tK!SAF-(l^z`(W_D@3c)^}VBU;L@ZNACBRpH#&W!Kx_1M_D_
z@6)zj_s8T&-WG5{f-bxI^v49);bz>-cP{hkOL8t#6>Q#cTz_IHoJE>xSe56NiDK*{
z^R=-He_s9>ihObRiKEb<!Q6uw;M2M7HZokWDqqPXz^BnaC37YPL=ht;OmRP5@|)%#
zQw+!qun;@vT*;{EjphmWK?_iZq3kRq%_1GDg@bbP_*Uod&Id%Fb3vN(T}Ac1%GW$F
z$#Scp^SF%|>CIGVw(SoW1HE|1MUhW^{!yulZC45R^dLGtI+&^slbBnS#ZKiTetBZt
z=UltDF25N5T?LaD?Oj44tAZgg#Gq*9!pyGZG{IKj0rASMeWu_%v{SxAY9#WyQ$|p=
z@=euPs^DO>1ICsG++k}c;$cE~sYDZmJw@r30r17DIJzW8?vF4Mm|XEC`r#HbEMCQt
z;Sz!RsC|t?GLL`0TV=7k49AXTO#O8*`X_*#-Qa>1psdMBuW${RRNUJa68(NM9QD@e
zU)v$*A8bVfn1xjjCO>o9?S@KbZV00re~NyE^Q4vGfhC5`$>>%u!}jz<$!MbXpDuJs
z5x=u`({16JhF11bgYwpd+py+6^FR)9d~HVsEyVEU>;m24>aP*N=VW&L%_v37u{${!
zOPMiIfgb)UYSc4FrfJP>cf~26$quEpuw<CnhDR#c*=4AfCGgKgbmN=WHK3|@vLZM#
z2?crF)S12^bef{E*n))F?6505Xcd2Nqu(F>b+%V$Ct)epAfT;tJd_c<8ah)D*U;Wu
zp>~u5xo5H<S+-Xg{Q5>9HPFL=pt(c+(~r%TRF-N~7V+YSn#hSheN~*ADm?ma=1|jJ
z<hHCCw)dNUN<RjZ`UdvV-JN1Z#a&NDdtY?!UaE6Sm$Y6PhG1G__J{?;!BflH01P7u
zj`PNd1cY0Tr0c($8j5nAa%Fndeph^~yd=j>^iU`eL7CgHd|A0j6_klEMk7+p4n74c
zih;k(-rBSxiq-ml6#&l{TcTp*hMKtHi<7alqrXG+O=-rit6~*ckKy8|z!Iru-k0&p
zQ&%}FMxTZ)^2b<XSoz!_E>kZQ9`iY3H2ED*SXtkA)5c2x@U~e2e|;{#`1}Vvh&F|3
z`CdANQK-aLqo{DNV*);r-ie$PvUxFv{0wF}DDlP;m~AgCNukr!6-0}#9wckZtx6rO
z*B?MCbI5n=9-s5{!PNO)H7Wz?!Rk5-{Ao3DK^N9zt@wz3HnyLVU97c>S(|p}?zgG?
zT}5m1Ml@s?zqrHkLxFoi<%NH|ne@9zhp{)U=1>HHPhvpsp-2iu`ONqfSug9!?3vPe
zMTQ6WLlp`8V_n)&k`M%ot&91qjwp#HTFm=fhNnf1WbT}CL69xIFJ5*e=AL1<?YQw%
zDjAZ*z5Z#X;QOb|fdUcHnIeJj@oI~f&-vcHDS2GnmGb$<n7;NK)sd@&-9?w4jJwp^
z-RekfW@L2q#A(1WK7Ns3lK>KNWRN=qaO-CZd+;@s544PRY<MThY?N#H%8dui*Q}~=
z&B?8aWGYGQxx}AJr~H_l2i;Albnw+J$*^sYTNwnDi(g9)cg;H>QOX0}WcmJFW8RXH
zhpR)R9F^E22JS5`xzuKP37CmK`sE981sm>WLSwU85K>&|#J0SXB`PRG_Eue$3ABJx
z+pRtJ9qX88rn1kMzzkAHFLs*fQeGQzQQBBK$!k71`R4h{^RaU5aR1q!GsM_5CKCZH
z#SLH+UT`(bM5!BEIBhxca*dv@J~{-RZ7xRkNBa>V<YwV(Z6#kq^@+)-#q37v%k_p?
z^D0;;YUF@MUyIy0YEAgg`Yxh_%(RoJ5syMx!wlNXtg_q!$2Tg1Xx5c8`ws`#v311<
zkw|8NG$pvA>9(FB<iSpjV-SO_b1Et+QbrQ7bosVUnvF<Jx$|qSjT*MraFmR&Y6nnC
z_EYLs=Yy&^c$0b9Chu7Nb2x?t{}PJb6$(EyuW(xShHpQoFxIQbTa>bY6vYJt3?E*(
zW*4~#+$A=+XfEg;Obk5lH$gB1&m&Z4wnoHFVu;v1II4yeqhf;Os=3+Dw4(<N<a{`;
zF(^7L+uDq;DBPI1v;U+rdc1YaKDoK9qO{_0w*H#6+Bw)0>+EwNa&3Ku&)4|jD)*<8
zSIjL(u#L&?xDzj8YVB_a1&9Z+f@*HY#kIIpX@Kaxq}aeGN9{sqjQ)g5aYq^MWwMP)
zK^@sO37ZS_t(%)BQCK#<*Y;}A1QGvOmX^wn2;Htq^Jq0b^hX}Kw&NK@QnT9Kh%?9<
z#u>3Xkq~A~@14mKKgub4mmZw~L;;gaDa1n*$kyfKh4>^(tKO<xUyZb9nD+XjnLQI*
z{(<I{Tpf&D9XR)9;-3qCI0#AZwWeTgM@zbZqJ_ULL!qvYYo5rOS@G^Bl4Y_EGhq?3
zG!}92+aN9oR8qkhr-Dy5ldL^p=APPhUNrV#(-DhnymhU&Vb^iNf{@Iy`MWu1aoxoX
z3Y=l(P|{Z(%xa8kAp~Wc3jR?ptkTNLIEpzqa9!-Y=qv<ZT3iff6eM%+kjjS_C|Cb(
zf6X_o`dXFAL~FijT|Ib6k7Cw)N};0RjPFUkrgv-^Vf|j%vPhX2>@G{@ii}SZp}Yi~
zEGmTsPm$?h3A0ZFa91#zW)l`BZpIFkGjv6*YKq`Fq6~?6v!+ri!ti7(8W*SrvyBt{
z7KN_tkrl?16~TC-7-lgv=5FnI3Fk}FS=_ZDY};1i(U?}jkSa+*FJn%rtwzl&QwrP&
z;xe>$wg~#;BI2?23*W@LIqvPRmN8l!3f?VmWN?-D)RN-{Wq;`Nnb+<A&dmUrkeQKm
z-;lKtQTJidP<zC_kC3RN%Nv!os&wAxJZ!(8a}Nz)zBpLc(pJn<Z%6gUm08l6R=U5I
zn>6Dd-{asO^#tLctB|&{lK+|V+-}{z-%5bq_1uk7-B^6aJw`imJ64(g+BKBLcWd|@
zNM-EO=WM6Bn~S3EZ;HjFAiZ8&m+bAz3VQy2W-ersv>OiUyj|JvWY?Em3zR!F6vFKU
zEM_8k(~6>9wH;MspdptdF6HKBv#d{oPk?~A0e95Su*kC%8ARps^nZO#aS6p1xFS^U
z4|W5PNp$rhnbK`GFCRM>6LtMEg9{BiIKnuII~Vn6swA>4**{)@PrAa+)<}jG=G$gC
zYdOf$2-&m9X5;h5B?5ak_+;_sC2F?l%S`du(i|(<aNWkr?-6~Q5AW6rvlC{ehT3?E
z;Uc&b%?oZ~^08IM>G$7{iXh=zEx)FH;MZrzKkD-piK`=TyphrvN|RD^%JexN;B6Ns
zlGDSFF)j2{HveqGEGAB=wc;P*zU_Q+DXT8i@Lf%L;cYHllE%!&50Hp}els^A>tlof
z4}aJ}jSt3jXB!%{##@n$EuA;V(HgXbjj_-(%fk+LKqdA5#zP9lhvmINl_jz#78XXD
zI;whHy`4EHp7RIW(^rpW<L$CEKc7f=-~+l!lNI^!LRKg4<A?gVWUE^Hq+YEsMwW~f
zt2c2I<87_q_U#@71aQ?1xVXBCRg{VU9op>m!Es&~#;Ib~sm%nj;+6HqFBabdY)<IC
zdl4s0Oe?wx{O~#GJBOljiT2BH7YygzMvXY6>{nL9V$%glm!_u{6`Gv0f@5m;6*E#$
z76xKLCSqg8bP~LnaG{qo;2K``6c?;KrS$&PG*A?$N_bFxn~l7_IARpK%}@}?MJ*<q
z7#QTwL?~}w4EcuARq~dzybyP|x~vLk(Cm}qa<)d0jf1`os(<+?>Nw+y@2<Ru(S(F{
zr+=PlM7XFs?yPZ{TlB3omCHF!uQ9UP_f$lNA^ip3U4Lq$P-Vu`)v+x#(EeRZqgCzK
zr8jp6biQ+chtVIV&Mr($);V&S*0_b%)tPg_KR6*)VfXCf$OX=C2EKl4efyTMRH9*z
z>iBA2St)rYcOHyssbX>U;3y(4S@iloo$eHSb4J+3onJ|!&we<e5iG89V_SFDl2!FW
zwX(F;wu`Rkq3fUJ-@<XT&j$x(&af$y$Mdq5P=7(gtQOxDrEKCYJ$P6EXo@M+k(y<A
zios=}g_9F2asllTHXze-rBxPHSQJ?pn^9<<u9qGg=XMd`FIz6lU#jzMt2ntg`|%Yv
z@-ABH68_;E!xhU=mLei1Z-H1kT8s^!)&&<Qp<D_y`avlt$Y3itZpLn{rPy}1snaa1
zB)DB!L~^UmY52k)7FGCii{bjrq!0F!98Wf54X@;YIugD*0|v8}ETO!+gRA8k-^;x-
z=V)I`(vLU$(uMy2*m}=+IJ>TWI6;VrPIMC`B+;WqPof8rM4~f#XNb-qf@p(?h!#W_
zy?3Mc-bOb>@53mg|4**#zMtoPe($IGFrW6`>)dOt<2cr_?{1Yqe~Hh=S_?y(kM%u*
zA!B|F*5l2KXUBy$K`ZUH8<ekq50a$hhjuzaCn*(JILr1?89|Vi2J>UPiF!x&U$sub
zg6rss1C&%~KaI<itmtF(r$8}p4<;f0%HWske^=6wd3mJIrS6%p2sQqh<k|{ptnRF!
z4>M#bN(57OGCQp%p-S`L1|Lbzr=NQvmSlUJB$>r6FuE}%6TAXAp6%`P8EfW@SB&JJ
zYHH;t!o*L{<f}6dZ?1V6f<fdg5oAgVt6{%Czi7SSop&jb<Qlg4pVv~*MI9;spZJ-4
zY?na1uUT!JE}aC>6O-zEP$oNQlG8q5hSwHCmd)stl=2h<`-GvS;VPmwC5C>)hJ8)p
z(H#5s_q~c70xvPI9(teg!Y>{i7R7b!%MjcAWttFZU12y~IL7eL|D|&&s$`ar=M=u`
zmjqbikaN@u>jbLh7a?!pU0;d9lNjIWM`?XQ`+K`8tmpCVvPYdRC#TMsg%s(Tx5P&_
zGk1x1C)4zWC#Sj*?04vtFhY^W8td&#C)Ad)h`5IUw(8IwMQc6vzI9>XuW!BQ<my#o
zE!2H^RqqzO&2^+6q>R6r_EaLGPBX?KvBoHQt@4pF*J0yW8`IH|LETe(BSVUIDnv;o
z-f=lH9PVM4xkf>qvYPRNUzJ>lFUVFGi{lQlI~`{9b?V?@w8P`mIAnthmu@46_amv@
z+{7?Cknh><mEhXW&f?E8-zL+C*T~J-L_-GXa=o^qu>mt7={8^7cMJP`yH55qJ8xGY
z1Zs7L;2XNPUbtzU6Bo#n|80@Us3&}V%RecFiCU+D5qA%}KF!u<Wns+I%^7@-07FB$
zz1cwFrImN&UFu$wBsfl#wY1X;q1f!^Of>cl(})42m!ab=_C+y}dMS<gNfW+I7_@Q(
zqPUtdLQ7S}K`Qc45!jyFA}O@Cgk-W#<}tMV1J2%XRj0=D%y*D2U&lf^oiwUImDnv1
z4cjIolkNVecsWxzeCu91DLxm&N3bLfg=4ee9Nd!pd`Dgqh^3_C#x(^r-h}HxfiD;F
zMOtQS&ZF*WbJwcHEUMJpgjP1(u80=7`LVW_z-59pFgJyZ4jqDs%vQ6X+i-kHttW0?
z%=jMCn<bjN(SEunzTgV-qL?dkV5uaz>z#z$nQg#vMPBTz_{8cy8l=2dIS8g@l``zi
zB8VGK3T%tc8b#L_w)5K+$r0bYG0icVJPfTL*wcOPHi&~6Kr-_30X=R(6FIFi?D^Yz
zdV?d#fw>hruzSZ23Gi7N6IYVj%rBT98%B`;Ij>K=&Ju6Vo$7Fqs*tU<3WLU3G<zsS
z25<|aMu}^_8ief{w~WFgBS|eHHVh2Hm}OjiRYW~^5@@I<O>5!>S-VE+c_oRYJhugb
zTsY@Ee+y8Te3j7X*DtG3Fk<+z7i8oW!1Bz;SAwT9+}pV4A;j;y5k-<pt2c%}O!$_&
z0a9|J#1PxlQ&b33DY57Cd2T^vriT##HI~|HV4I`StNaoU6gy!4mP+w7wco^K?q&Q_
zh!w*(>h~napN)FITRZIy0v>A7Z(yt-B5VrY95+k?4>f4hP*30h7JLzgeen!4C8ri=
zEmF{Sv|!J5eBMP6yEdIM>JuAJ2cp#LXP!Rvc&V@TYIG%MJ(o?5f&7XDE_ng9{L`*y
zM5<#u89P8t$0Z~1wc^D6DCH8vj78_<HU?;R9_rr-QW1qta|>M+(R?NrVzgZhy+hyP
zL#V3H5F}(nVRV1xg-YE$l0)2&8Yf*=6ih`OeLeSY7~_O?nVK!7?xeI}fiU(hDWAiS
zZu^_yRh`3mpVLEd;B100I-ha^hO<dIjl-gH8D@?}xDJPrga<p3MhcxV558cowQr5N
zX?!%7Zd%`G`P9LgHro*}*QUvxK+J>TEp`r4Qp=EwUFtE9^I*4?ZcW(G$QwuV7b~S@
zWw&Ny8gO>!Didw_g`N3a6);a&1P-6+|EtZhwU+)r(wxI2zHVnRQFcswR|0B((r>29
zV{L6M!0XJL29Y<})o-rOf1nNqD!Y3lLvy&0IpDs%N=ZaT_;~Tawi!PPUyz))WP1Ei
z4ARyuhmINN82*K9<oXTFz(3`*qc`lwelpRaX^hhXg4zcWV|S#Djd`Do-jkM=-U5U<
zro{S`L1AK0M!&unOYQkOTj++rxFGKij`O1GKn>EreOM$2)}=n_I`<CEA6XpxG0eH|
zr~(RoLl{eRX%F8}&q$O%g%O49&gD#*zp=Kf;UPT2E5#W#e|!*a{4AuKoXMj}tYPMk
zh6nM)>w6AEHRK!*x<=H)Hb)<!<Kk%pXRWb(9>l)!CH2t1Cx8|(Bv)gT+$0oq4X_J>
zc@2p)!E2K~3)d}xo7dJD?QFe-Al~!<*4YqPzLhB}C>D<W;-=!gS7Mh<;C(W5h&lt+
z(lNTD)z>DT&+p>kILkylL&YHje@1943o4&GdwHEPxqHj5XSYkYb+<#?CoH;xF7`Z~
znvqiC_vwUejlx9F?@Qt!3qoDz+sqz%>w~;^G@vacHb)EQR|^1ago`^th=sCxQTOSV
z7s|J<$u}isMVBq;{Em>>JB*o}j0^T`9TlE?jWgCU#DhqbzZ=6UKs$hA-NIQS*i)<|
zm+<Yhv+T~2`$nBxFCaK^dOEz<^*ftKVt->>g15348ISxusU)a(K{f1pUM3CGA{81R
zAa=VrENpswRUce*cUA^pYH`@fUiWRXAbz>>COPIT?_EshuW)P5M?dUnlgxkH{+w+(
z0_d9nJkWX85E^lD-f*}H@(13UmzXGBwawkZWqf)=>AYjwnFL<lzUmQMsPwia-tsVz
z@cs<~J-H<=@=YS7(fo}DL$ctS(O<~lti?@^!a^wbVv3Qev83@UE5nw8N~)UEQv;`M
zZE8kZBgk%B?g=WB!7TvfT>(Ci^V*xnu(7jGPJ1j4+->E+Jxil%N}T;dhaZP{{A!*f
zYy}?Y5IOr>$WWTakbGS~Db@hFDFda<vbS}zeR}^Qtj=`m9`Ff%V84d-BELt00bSfs
ze|%8wvxcPED(3b~CmL>T_hP2<$g+X$>G2fnORW2{4Z?nDBGs^%_0$Q@R23MP<R?lk
z$gNi&kV6%pu%GJ2FD!(iI$5r8fQMII4Y#K}<LY<H%F6*(MAB`m7#VnRhQyPQ_|7q<
z!FN;1KaRUha&hbM<-_<zZ2KM8`A|98P&Xc9_J=|~o8McPw}37?3<VqOZSaPT9|{hQ
z?Qv*x40M=^f5@}x&G;!Gi4NT=&M%G9!>{sQl+zRwYM&#55lXSb`R-O+7QLkQ)3_i0
zx9?%9Lt%M3-(Gae0wg<o^j8x8LDSXk_3-f9q}|R7*5CFN^TM7cg>@nLItqrvy3l(m
zO!@pT+oefHiY(Tahthu!wnUy}xVW9DZ-n=&2z2Cj6S$5g7ub;{P|Gm7o*t+g4%9ev
zXmqd{=+ro>Vl3q-LI&dfwWm6Qemz6vfBVIVkPCgR>Z5~3K0cphpq;vPJa@+Ab}-~a
zm>?|GE=gQy=LD!*3iNgj2^+BSyzh8;q>jJ!C#lKh0WNyQt=UmLkT;Y?e6Q7PqR5N-
zcdpsh13z}oF8Rm6=8U1D!GR?38cF!MlQRIi1cpS1#R2Snj3HUnLE9<=!`3~xex0<v
zIeRS@Ny2A0XORAOAOw$*mho$N=bXP~BD0SQ;YvnO%pZZtJ9_{PyE+<;xM&BM^RalZ
zeq^EK9*)n0$oN7VJs4>|*A@4xhDZt~G@)(8xA4bCum$Z`veV-4W~y^F=1Y-h_jRj3
zVVRj{rGl=K*398C^#MpK?jCz~b<Xcu`LyGIIF90y;P>Rip^V<^NuCB_TWDUdG&KYN
zq!_F|ag~YIiNF6r?p@oLmtCG$vf5!>oezr=z&5j|xYRV&1V4ZNOhXb1r^QC+;^fMi
zn*$oT16<&Xt1q%1*GY>PmT%~IzsfFy`_JC_&i9EDZBc)&lqk7vkS=^>@}*(6;3vhq
zT5F;h470KGh1V%%e)MV@Eit7vzZtEXS-AMw@NhRa9TtXnsJJgq#vp1oQQ)W#7P7^x
zcl5|KqM|g36u$v8>4O=O3Md^f*vsEQH&)o9AU=m0(VN2)WO*7kJ+Qk7If?;J=GnW`
zUcXvHanG2Cf;)SBO~g&yNg^35@#WMN+5~Ka51768r>s$xJavtgJ4FtQX}m%e1c*Xp
zk0il;q-cx`a|cO&z{FMfyZnx11+#*XFk=(R{RuJbHAf5z*_5Fq<#)Ayu^Y(LMF`XM
zuZb1=k2v)^bseqy{*i5LT!LTMx@Bx!V!-;EudBWJRgoX)1b9W2#m}#|-EL-dq($;H
zMc5;}595o;?~iw+^jI%YgEb=GHi>6_{`v)v>j{}{GL3nR=X@6W6tdZtyuLX;AQPcx
zzu0aWn*?u_55e=HIy{E_ZAd;sBURgkjW>yIfZTfqHowW|pu5ytHJ*eoAioE;3EbcR
z^&32_-k5H$T@nmF+|JEi?HQ0)$=Q2bEMyGdQdpyx+`40ZmyUae?L%;T{1&syk*opL
z@+-jidO%eYpL{5EqXv~(@3{c>p2yyDAGxoFHq}WLROo8}I_Jf>7t%Eqq&Z$Nfv}LW
zNh)7lvj5_$&Bo6bvGTwEi>sy^#aFO;fZdO@Fbl=TP^J6&cd8r~=?vW52w*UqFvf&b
zswoDjK&TDatk3oS_iwlyi=Wut&$~ar28+9N1ZljIyg2arG8b-&Y4Aq*4d)hW%KQtj
z#>cDZi<=obkoN1t?{^F&`Z*4{)_t1Blg7s8rI$d2!(M<@iO<4?y=h4VWCO-F3)H^<
zqw}U9yhE&g-GK9ZLv{==dw<2nDFVBTOHOV+-yX)FM@qdB^5Xo0U%W2rVuyx)hFSvg
z(mZS4F#w<CCc%*2Js;1;p2nLW@=D_^vVDHF0cXDJJZvP5&;C|>;%2+Y4iDE?m3rj4
z$K0|t6N(32S_6L;*0D_m`Qes6hVrv8*ZSJ&li!1Q>?5zTu39T|i1j<lvvLU}PvOT+
zm$S8T`IcXUs|AyuARh9k(g2aI$67)lFYu{-O~{rmNC4wHMEyMmyUQ(hWkIMZNSlBZ
zb|nQrH?1<n#Qk7k@ERi6V<@tnHXb_e8$?RsZg2O6y+HOgh#A+^D5~R{0zn<^tCr<`
zkNpjlka4<#(EBh_K2`-c^evTRpnvkSS86=#0fca>aC+aE2*$8x!Ogu}Nt*>UG`?Zh
zh$1}wC(lQb1My#IlsCyRbOXID+5%7fwRM6E25eLZ7(P$W?{XEFmPQp7@#3keE!v+Q
z;q>;-!18){rKIjeMNy?8>1MZ1hKI$`0QQe^#F?HJODQdl5X98f;-w<ltqn~~=+2;$
zuoSVquf8{8?WFZ}o4}6autsb=#O?ctHE{N;#NM^HsNf$DDvD*71iDfdH+%59(CA1d
zLB^*Dr(U_9(QwS*eu}UE@Pi4P1W}cvA9BVV`3$yMoE-KI)d6=f!dE9!rK=zDbOd!C
zP^CnTOFkFC0bMlW8FV1>Z4#KYV-DgU;_H}w5MUcLco6C0Zx8DEL~E6%_x$@l#3pXz
zVQ?%P$XC53wX+4F0iV3*qe*VX<WE;F_?&cWV17zW+|{i;Vmzv-9U8J7A~+V@lC2Ng
z`msZ8G7?tJ&+YP|L;g&TkDIYYEQY6X!29Ux>&Zz1suluenJIPW2a&g!?6Z+Yp?bmw
zM`a4DPuwy613pAC1^Rk?>bL4%+0U*$hRHS<BO=0@Y5-ZOx{>kw&EJ(zpEw;ye7U|w
znC|I;X^8uG{j7T~prM+!H)v8bxz6}OBacMQ?NcE_EGO+>Gan+~vOjnC@7zqaB)3i&
zV0|+_o|R0TJV*;@a8>o_xYWdq$QC&u*}rh%Iql!xKBBcQ2^9dob-l<<#DTy$v8rTO
zk_Js(zp+~CdW^~0(KZV1T8AxEpi^n2fDyQfUFHyZ9NHYV0(%)2^ZTVi2P%Rjk#A)5
z$!Y%$-OhV|Vl(qwC*t-TFjwBIV$dFt+j4)6^Kr{m`Hiq+ZIjNjhM89^LkxGw$#@4A
zocN*G0oE&T1_DXl<7qfoJUaJ*?s^)U7(C<|dYySKHjpy)&t&wFIP2!vdfo8UPXo7r
zfx#3)I#=hyWdl_AVxhuJnt1imioa_i6gOlHfKf#59R}#x+gbjAkmeHJWHNsiVx%BW
z!(P(1{eU#xZ|f?&-}M2K_79LroF4vcGTc-W3D0A6Yl>^&Y~Jn|)X*QGM1u4_Linea
znFJY`h{Cs+l?8O2t5q_GZ<wmif=<DPqFZ2}6`G^E96*rpSeT0oKf$}RFF|-AAV&Yo
zUNHVdksZUqqq$7>Sog0mRipY(i?Y_Com;<j^(b*g(EAEfCX$8&ZcDvf*vG;!ZKlZ0
zLjtP#!4ovFYZxuCni|!7cXZ4`Zycy)vUH&7aML_3(go22<T}N1#im3Ohdk}~iK-7g
z*9O6*xfeBJqVw2zME2JB76nvuz89cWq|`93Q$QnKm~I&GTqP{LB_@Kw&v>=Jm@Z<T
zA!Cn?B@uv-+Z>{r{JsW0eQj=Kd8>ej7{z5&{_?t6xl4e!@6sfnJb|+d?J=7uaBLw0
zv2%aKT8)DGx4JUwa3zS^I5M#n13mITkQpTh9e+7XDqnK7DQ2H)Ql(thV&gTx0|4E2
za_4FYgO|oB>ox0L(Pq!x_gQ;;2U=2<OIbBXAP$lBeZmoYxf%=ARA3XH_DVU&=4vJz
zxm5SK{*SJDBF%hSZ{jMNksp!lgw6@)fB5272Y`j}j~Alz?K!#oONg%k0o%i05@%}9
zU3c7pY=m_NEnHrHbX2X+=Nt1;QS^8$S3b?dr!fgMQAJAB3h^Cy@;z+h4eLS-T}n!D
z^0q7La2yZ>Hda1floFZ`Qy@xwe%qWIZ01VeK7ijUUu$w1&%r=6>t*};v0i`gCmt9N
zG5UI&Acv3ce&L;O((^-A1PWup4sJR9z_`riT=m$I9-j0X=9<z{iADs&!y!o#g*C6j
z1B1Aj+s%o{J;%z`geo$EsKV2?zqrK}Y;ZO34-p`eC6DRPTZnz`R;E$eSEhC-&r^MJ
zKaeZ8)zvi-KH$yZ_gJ_HudKVnC}RIv@D?4vM&3&^)`i><^9PZ3GIKhX337zxL)`Ry
zW5KAspU8Ig@SpY{GAYT!u2!>>zBmf+VPkqSDqR_1u&pnG)<=y~<+W+sDV<NRw5s_9
z7kV>#D9?I25;}+&&pE1Gwn;oM#UO6j^b-|f66V=5%(Q7D-Y|dFg37ORs|{#|*kAE~
zb@E@z%j0<}q4abriYo6MI}7`mRkS}fADR>v5ugyazz3#4%vw1vleDT5Zob36fO3{O
z@G^<WZ5-Qr!4EQhPi#jEZ*ctM<+S@=TY7&QXGnHS7Cb-PQg=<9?X<?@sS|pAExWVF
zSnTUkSZGm~d3=Yt9-DS1EmOImL1c*Aq(YP)3C{@ShJzF|VM=ehvgP7<jvy->6j-os
zNv)@Cq%pG)&7p)bNNW46B2_m$aFm?=Gj&S!m*^cHdQqGNVZg1qsmUUznUYr;h%|SR
z1Ub{GXJMV(^zSfq#v~)CvaV!WCC|Sq7Vv{+0%)%8g9n+0RvY552+)_2ET7hbyT1|>
z!^5lB+-!IK3%HvY4D&U$?j-8*in!y4Gn|RR@1QO{cY^z`JTU4q;X!gCVytDbzZAb}
zp>a~yghi3wj#l^Hfa~rS{(62piyP2i#nG&-@2Kk&YEq}{b2#bo8=c2x5OUD9=J*v{
z52k%9GB*Z^m$*_0Z82~hZ@}OBpN_v~c(r_#VV!Gy9c$s>XsE8B?t!Us#fkDEbdofR
z8tm>|Zn_ub_jSXH1K?#Luwa3c<Ad0KN{cneC5(gHh7R%z9XUDSnax3}ov6n=rLSX(
z{&P7PCs&i^0V>Y0<trhoU$`MoP&lBks`WfUAx~p5&VVpZ)<E6~QRf@*)m~<Pfb&^m
zP@QNbO#W$Radg@v`e;j87_#d2)ux*2UcQx|22O!iDUZk2*WzLThvfej8*jfdysXVM
z)e+)A@445e+H$yAShPJfr;l;n_3=4+uEE}gbhWGk7r2JcZJmolw*#g8>7t@$hNlkX
z&&;h4&Mc09a&4J0$+;3BW`*xvtIqr%1N(JT0K)#wkjyXTJ?ZKW=iXI;q$(jQpvr<*
zK%LQfH*z2_GykjQFBQgFYOuGcs3fpu8@%d28A!v{r$|6YFKC-H2`WtikR+1rjJb&8
z6VqPlI#3JfAg?6r9##!oF(&v79sgOA<$ynOb7<S@MGceXg)yc~kkwCIqCAlV)^cjD
zffV^qzH8^U|J<qhWOD=|GmBi8cUe&AZYW;<9YsTBA;_8O{_TTuo%7O+2xbv<`3*7_
z3WpQ&ir0|?;L-C_IDSn{$*OrthPI*U+ZBEZFTy16Q>i_!&%g^Rff17XL2hAV!Rkhz
zptP*#KS*gkVhX|LeGOx{bJQl&UkUAZ`x7}-8pD67@{a-g6Hg(&cKtP)oKMbN@M6=t
zs<CMkdE7L|t$K8)^DF@QU0LVMVKc#3vhOlCcK?#Y^_28Gi92*{(kk{^$sRm1){h?P
zT1{ir2&;hwqn3B2J3dUe+U2B#GHd)R-snw|n;#z=yK}rXiA^klx}{s=LCJ<0^SKKb
zZ&S^u4bq<{<l-u>Tyc@1oggXmGOUywZmOgTTtejQpw658wxluRg?c@69v$=OfxIsU
zdg+g$0W2lIu<qx4`<}~`D+T1r*gRFNk?dqf5v;5Ny+xfvBSXWGdO^(Jw$+8vKcGeo
zhA%cw>qNOWJbY(uD7qB3F(&H1FK>@tQ3O6kOn+jLsSOPB@fOes{w|EDm$-Vba$L-X
zA;xmcV3}jWjHzWAm=Rrx+%c$X0N2_5l!7SEP<Ag44{OJ!wIg$GCJpnnQC#%LHVmJU
z9^*n)XOUrp{8OPZ=X>v%v6DC-fl^S3ErCg4au!$AiPU<wAW6v$hy1h+<CF_hwc4O>
zQ_o~*<Ap4betBd#JT-~`l{wT|S(t^#ShFih&Bzcy0LXf0wm3n$w=edR6He2piYKb*
zT5}~*0UgxF+#_npUtg8h)GF$}#YrkaS`<e!0XHRZVk;=?`tU1O8u%>ryW>{!q(`!Q
zH4Lf3V>e_J<SF<pX_FqDj2Q1BRDt@RgKJ-Xu(}#QHum{FDjrF^EY+)9+`4X0N>t?4
zAc03mL~C2hP$4kZPze`jZ!<?Blon;z!UqcSR>w<C)>?SFx+O0Zzc}*nCUsVuUU(d7
zg=^faa&)9@YG<E&glRoNZ*eyP!**LBHjf&tOJn#BTu3%$^ZOoRpI4bX)JHN1@%6e1
z$P<-26<hEkF7!LDHzk;HMv8v+T@ec)jh|OkJYAqePDz$2FRD0Bjb_uS{fG$1P2G3Q
zPkU|5tX!^qNU8-R_R_E&fieTdDBVj7Qwqxxp7?6Aii5|ekh!AUbErY-!O0P9cGbkc
zELueYUsW0x)_nWKp>X=P#j($pgC!~Rnc(kAz;5#pF?=}+Cz>eouAf}B4JOIwkOw&X
zn7rtaItSKq`FB3b?JcS5G97`W40K?(BP~tNb!vP<eNy24X9vVxamN+;q2m}JCD1xK
zAC}5J)Texm4dMKBxRDDb*que-f@-e3j}zqA+G|%7eIcHDy4qiQa<{r01l%6YS9560
z+aGuytxcBlK6~Ct`*{oWSUj&9S<lvlF7}<#rTs$JlZ#1%sq-DsTyGdHkqyYdQmTq*
zYMfA}Fz5bdEH0+SS|(9-RC;VjZCtj84Z+Hm4i5(>MGzCpER#QjcNXVNk`Bcd5bhbe
z9#T%k$}S|(kWQ-Yh2@v&{lM!QYBgURB0xjDcRlS74#zcMwDW?BZhj=^x-aS!@7h7M
zbp7qL81C>>*H?KDOe2Y0cc<F)4a{?1=iiooTf1$3!K9MLY<nd8+gIkFo6R)G4;2eA
z^|^))oqoCrzx=Ppk+apt+g8RVCOok4op466)D9PyRRf&D{j+t|D=A(v&n(5G`Aij5
zaR+Eshr<M+Ss_KnOH3#w*Q`pD@+VNgFi{*|O2W;)?S;TZxjX;XTn(jAzSiP-1ci#H
zv@19vb+Ek4&A|biUVo;!GH49v8a=VF^bnn1=sSird%2k+4O~xD>c07luA>x#XuT|J
z)?(Yg9?3fyh3q?w-KE*}>@`=8-uj3jnfO2`@^5~;?umYI7+TY+O!6JIu8*MF3cV>a
zv(zO^m18vjg!qgh3iJp-9)JqI=PO~KkE<Z!j-k#I9Y3IMSh$Si^9|p?7~Blm5FW{<
zl1KjA*t;xPFt2n55TN#UeUc&hcBt(}KR+>pi&N~alVW4=G7HD~e*JN?&MrIwlK-s|
z{y!yGxIT`@!bcyZ_o-!ULNb_-j1c$i%y1W&%{AXtU)wGA*|au>SnN=7uucilu*cvo
zOa{~geWJ>BpBVJhWO|CxZGeT&z#dKV!nL0Pu1?ORs%(SOis0d*Vi?gR_erR^-i{})
zqy_bt0GcbZp7Y}ERKZ?QV}VuYziIG$iTt$I0HFGIvzGKQ40pZqI;vi+$`EuzU7juY
z?Rrol|FR-}?{U^=lJ+teu~hUSsLdiR{-?hdXXED{w9=&ChaNA-ELacub*>W6UAjy>
ztpdE*!c5QnTjU(CQ`z-&xN!s>pq*9)bkp475sn0rxP`Z78jSI3If~Y$(gn8hfkGS*
zL<%hSegd7_cuOIf)yZ$Zkh1$2cg|y5kiXCQj&0FZq{^Vgl)Uh!f}e(d5Toe=e?#F(
zl|^iOJSM@4Mf6g%29P7~w?}>x3860?BWKzd9($o|RX#M{#+qzx8-nQ#>Mf9dqoR^V
z28Q-b&epn?%1w?1<5$gLf%+pfE-p6kBnO>Fh@Xg9$I7j7jgU;BOV&R`ZKm;M!2)IT
z6_N%b^gRi=XKUIj@6sd=4J-tLpd?dzQd!S}cZ%~jvlR0X(<3c&b5o=j^zYHCy@zY&
z*iv|?gA64bB=OPV`o&xbs&<^!O<d?_TF3yW?YvH_)tBpwb2N#tMGw^q3@pvpR<;y)
zj^940#OQcPf8KoLK@qT9C+Vasplqyiu3|krcC@f48t&iwZYlPuoIj6`8iNc)nzb}^
z^zQ~o5zXU}NXw8d5kY)88bYz-?al1~yq$U~mz^1W^MT~R=}HK{=j__%Zs669G051J
zzrR<{SdT=AO<RK%!1!$q{v^IMyMeINHJ2hf;6QtB&+p7`$=oBcIo={I=7P#M^UwWN
z3K~oMWV2{uz;8V=RbJ$^C|Bi;W>X&MMUgUh_0G`$s)y1LB})>PqS3qpyaq1v1_{a5
z^2+=!Wn4{@ZG=Tj7mrzTZZOeoF5qZCgMFsetFDT<itMI&>obIL+{mK5DSQi|z^Z}S
zV`nd0tKq%le^yOur2LgL)5?V$x+L(YWlY1c2Ntwda{kPdR@R$lLWZ7>#uty6{_|wO
zohqm0dz?HAw&EMNg)T1JIwadArF-tq3NBN-6O^I<Iv+TC39S6sE6S#fJejHUqJy9Q
z4TXlj-D}^THtYa+3%aiz#wmat=xAeiJHSiK<%Na@mG1A{tMqw!e3+_oV<f-%{K_Z=
zK>hTfKzt??pVHmL84Ee{eMK~@n+S_NeqT?bm8Sw~IL2B}9PdNDX~S_}!a+%j3#Nko
zE-j8Pn~C23ts0CPgfVVW&!faM57#hfC<g}lS1Txq75>cEE^&p#%KIXtY=@s>6FA=k
zy!4mMA%DoC)>AkD+|^$kH%c5g<^wHJVH3gO&RW}E!+9doccH0?5<6K9-ODV4rfO_A
z*?d~+IGRSUW*e%V&LVP1P~7R`ZiJO2>HP~r`gE?=M+)|s7Sa5%e=@@6gDaRFke%gC
z@CxC&+D>ATM@8|{T-Y(S8erT1MQ|Z*)$s|oUGlLU%RQ_)n_|hN0ahD8!&~(|(gJ^N
zg%OD2FM4zy3%19Feui~g@;nzDAD8hDZ+9r_I{u)9kWqD04ebq}pS)sOs_N5x=fUQp
zM%elhDPZ1JQ(iq3$UM-y#i>gR3N-iLBY5s|{=D}iDnH@s+!*B7=f;j?O|51Lt`-Ul
zM!YaE22xXyTt9WR5HJkGMq*jSLk4sz^a|}Plp_<}L$4z)mr#ZnBd1=Hlk)-4`NR9^
zbYdkJ-()IOpW6s@WW8+W5q_ms+QmWXxHTi<b_C4N3DsX{gdQ3jqrQ5bX`%&Gj7^=7
z%L&PBqhRt>E=`je@8Gg|53X@#8PlfRvm=|Gz+-VZjX_?74neBFPw@!b629!+SHLU7
zNZA%4#5G7)_JH9^JegVGRd`RkjS&gh1AZ<&C2d*N&hkWT`hBap9J6PX3P}$zn7~Xt
zEbz3%+Jc)4_&NZrXt^bDp>rtBese`d@;#C5_BN^*d?q+iZJSVZ2{?ZYB}$!rFK4OP
zWN6_evn?Z#e}5Uq6&xPX4jL|cd@#)7a{<hz^Jl4|xn8yR552?GyTYR&84CHXW{0L{
zEVt2vDC6tqBHn|h@B_Vf#caxA$HMZOYw&^Qqxgv`GeYqa2&k5vcTShYQoH)?X7=D%
zpPRP>Iejo=>&DGp^&z|uu#oo!+U!deKX0kZ!tTC0k`ISVTqb%2Q`~z(#cX^v@J#3H
zX#TSgoHQw4rYO^PjZo`HDu=P_jGWpV)X6<;0)BsFA+|{qnKtQ`nSd8;6RM<Sk4A7k
z$86-Hx~&VFFwq>)BI=^`<JyLkgd*n=Hq*o{y0iF)3)2EWhYfW8i?%Nbu9OK(PzAs7
zcR;KD9mo^W?i#u}&4)Dcor+iLO1ce5ppGuiPSit2JqR?w|0f2N$oz!MFq29=O~H<<
zRRNvG=o)8~rqdP-6(MYqmv>X=<|gb-4bwiadI%3D9ICm=zpp<hhdCK^+^Rc)I}etN
zoGw~Ur|a1EN$9*v{1*GZIXLpce>MUx=ml6@5L0e?a;EF8g!&H`MsiQT$NZClzgKYV
zKLqNn+J{M6!!Gc|vbidE@;9<PGhmNp%IEGI9|r0C3`TZcy%&WII(~B5049pRz2D`-
z{0pBZY01=(%7uwZ|0e|gXGTme=^8vwmb!9>%VuLLccJVyw9$1J@5(_&CjZ|E;E|=D
z8~ocg4h}Rn@3$w2#G4a5QNbe;K#nThlwo6CZq}C6NjejKKpBR0U4}i5hVQtz*^iTm
zpMF)jg*`iy^O@Fj0pB?_RXS*fJHDbgWf6MTwsf+&R&907x|Y)b;twO@KVkme>p8iW
zYJx%jES9wjQC_4Ivei}jmaNO;6Z^kK04oV9Ztz!gFrdVf;aQ@hdJsEH1v(2uey#-^
zAn-NKW<?|YAtRcv{cM8)`vz|Mp-vj(QE?d=+O1F-c4i*D?|T4wSn;V0wV4r&k3lAT
zgxHc>UA6>DQY1z_?Uyz^V+yuOwx6&FSM~D;+xcgjMa9VV0xwj|XQat5Y-K3TyvlA2
zrQVq>Ct#c?#j6n&D?bjUB(qI~@%PlDN>4m74~n&KZuE>IFFl^Ri%VzT8f%K*(!;%J
zY+&cR?)%Iwu5YL$rZp3uM^Q6?M`)oIKScnV!>m^W_9r}8$jGShm88tLu&J@my@s3k
zml<|>$H*#Qx~EJszzJ*#v7#k6Cud_D&_JB{;Vq2D@!O-@!=hg*c`vene7$N<KC;9g
zZX&u#fQbqP;a6i6vhn{wgNi4@6|ADe##jr+G{i5JTbPHH*679E$s`X)l1pq~_3HuG
zS|M_e2>$3L5CJl0EO-9b9l+D~ysmV602t#(-|>OH&|BZx_);m3x@Du+AUQJ|M|s8#
zDdd*_=;gD+^xrGc@Y<CJTMTc3%gq^S!BhDuyGw(|NmbixUMwaw$u>{rmmLfuCDS21
zkktzinI*-{M#|2wrkrgRB5UIRk*WZfU*8M0TAJwns7cajvyq_3#AK827H2Y)>^HWw
z+%6`zPfKiW-HP^6lzI7q@3}c9P9NlkJe!Zi#sfgN9JY$+622-GUS`ITYcm#mnV)LE
zS!{dbEUq)b-9>tZ8^r2x**wV?L@&{uME+^6+MM`&_~IDNb?5&l6DU;@^@nvS0(L;>
z_g}0#C<{zb_*^0zZ&kX<l@L1}^$x5OH3~TGaTy8TNPfPQ+Eo$ET?~3?`iwuG%W-W}
z2!kV1v&mm1yX%(O7^^HgT>w%aQMkLLY52A9-~_g3SZMu5Lk}>=MBgW!l#K@jVvM%t
z5-gtt=mzfZ7Xj13SSo$f{QUMS&3D|60;uL^PSCu<Vm<ODb&L6sfJ*pY*p_?LcQfvB
z?Ks(Vx|FFxg^}zOp{akyWMt$gdH;L(ixpQ$VfbzKLOtUFe37e;$_9pte65yBitHn4
zn1xTe0V6Gd`QGq{iy1od%1cs|7w~HnjOWmOR!(m2{oAi_;1|aLzd4zfR-dRNkuu@x
z?s-UsK5h(K9+jn*QIfu9625%)SxcG_#)4Eq7#ic@9zv%%nx*Ub+Hw8`V_ER9pSG9Z
zfM;!#(W@sK?&30ewvuWRprh1s>2>BBc>JD?5@|Ax;`E(jKw0tc6_^4<fgGd1t;6&d
zP(h0LFow(VY=I>{;n=CF@{8@a8b>sF&h0;dXB1B205~2QTm>LX6-=nYEtX)BLMy<T
zQB{YVq(8~*lK+Vvxum77{pAW3z|)wwI^b+0dkJYc?-`uVa6eSDst?HJnPlht-=%<3
zWjL!iE`-tXu$>UIQ#w#GK>ldSMyr95vEf*sdMi|){6wbYYX4{>m%wv=c$frePq43r
zI?Wt@^$e(=o*f3eUYoa1pL*b*SXECAu9$FBFj?<q*ygHz_*KvVkzyUF?7r5*!uZCB
zEyv55WH+p)J`g7L-aWRxrt4yU=%oiPSpar5)2~-w%2YcE-toXxe_k9fY#EATpG3Qg
zUp}vmDhxLRD1G4yQ#paf>nJdswg@wP5d7aLU@xo=G7-n&BMddLF|xXMxeLC0hGaIe
zVWj=yb2fKeBG_AydTY!(h!m4<_<w?+8&ruxUnTi$3s#Bp6&tZWb8>QOOxSGzxd1BV
zZMc5u<D*2YLPnhleDj~c$W}4!&0VNJQBi?6ch-qZJt2fOxPJ29g-%_`ap1v7lsMAA
zdsli~y@jnGn0*3wjA!XbcSyIM0_%S^g#Rau;t%OV{V~>VSo$`z1R!b9Sb7;RC+i~J
z2V9LwA?7;FDy$_vMy4+Qw!s2|e@&05KcbZZHy*T8WL>+i*@u6zX?no8juVfWTFh(t
z-gykfD-=TdY_8Omr!K+=p6ryIu6Tl<_Ni<lTNXZZSEY}Q=Vs2Ui|ZV)Z^(Mnl*_K#
zEUtIw@inxHLf|oKR-+4vS<h6<H9BGtO+5sCt|si-3HRzm+3%(n%#V3N>Rs<&&I|8;
zjkrAZrk)T~w;3T$@GQYgkt;L9JPvs);$<;~P?v!PH-sl!+EIRFbRB!Jz~BC%M}|G!
zg~Oo4%(AvI0x&0v6373+NV{hl&-{K*&MH_lnoWBAYp0W}g+-7Bt>75SwDRTev7)=}
z`VyW$rC2`9)LWAK{L~!4GN^WnAWrUxysLKc!{<|OVI>vgWCKX@2oNLi)UT@)*})2)
zDmfnQn_^Z5eEKCh_*ir5r-+o!Y5vVz?OT(hcvl<PVhhPv2np1nP9-+X<FYP9!0ii@
z2g&6V<3#DD{R~TTNR;poU<ryl;$V7>lQmebJxyg;`OpDP+AtZ@{&BRLM(sV|9uH=g
z7DAW^|51EdkZHF)qxwA4&qz`zUftQE9~>~K!&fO_Hrcvlsn9{V`gBoXc2qi4(9iEY
z?)D!q)*0iZBjb48k9~E9IknjEeCO+;YoQWODR~<1+Sqb-+|o(nm8si%s8c!5A$3Z6
zwgYP`Mi4swcPUeKCrR%xe#i*|5GJ}d#r#a+K*JB2GA@k4<2-rTozZzg${dlKWLswm
zvt-os0ZZ~p9(-8jE)EK<xKF17_#$a%o(dhV8NCO0iaHJEM^8Uvp*`#ow<$bTOU;cd
z5)8|P-iiDqAV_mB!65zRPOJh|<fO>PyYN>!&yBQ|-h?UKh~U!O-@IGoi4pYj;vG$>
zDzz;8dg9r}Pup=@dVJska7YbLJ#5^l8Eek$FwvFugp%|U6_6>MZTPOejx;K~6MWXC
zIzQEyF!L8Q>Ah|VfYDZd-hc0Yd0S$bPA7TVDhDjEo|w^nI(&`uPjZACmMZ-)aq1LB
z`2f_qc4M-2x;^&yV&P_50$_zf1FY?-WNYkwEf^0M=rog-ZHg_($=Nn27;;ad&@A3a
z+3Nkn5gT#xwLj0Y+kZS>F;278lc$Y2guUzK3ahN4RB)v37bJuSpoF~O+T5;%;DKbV
zZ&UKk5<g2pw#OyYL+=<985jWDIEp#+S{h0Rqdw1`^f{kS2sJd;qb7Amqg&&h2+=aa
zh^6J)5{a&F!*lOhos;4)ne483V`4xP_zfo!M=ce2LRbeLr?<4vTYf-E@?UmdZ0L-C
z>|vX0N3zOwXlSSj7ZxPJ>NLlyl;*^3_D^B&EQ5K6_CAIdR1`ujoi0dbYbYuJ5GUz{
z(8{@%PtBf4Mdr|t!iVp}Gufm8ETGd#ExX<Sge+TL<c9IPu8k-}-LG#&%bq*qY|T#>
zQnsg0N^xk<TE*awRvV^u%A5AtwOP5L^^_Q|2npP)B|nhAtK{ip8_m+M5e3|+%x6+w
zC(8!OC+yQo61D_D4E}TZ5sFSt&tF93`AK2=yiiJ@If8nCChjyNOd?4KwQf8dl|25l
znPW@~iQ@5x^CVM8L-bYL-E*ipTFha8X8pfSG3z?3{AKtgGNAu0LF>M?)jk6D(cQG&
zX*2Rf)`d=`FYhGi)02HbP_z-Yr(TI=N#rL=hl)jyVn&QwP`o|}*im=ZQ6w+7xs=uF
zD>7|4hdU{HYE~*WqM`=l0Qj0B+(09Fba~v82C%1_9qr7OPy3(Uj~W?4&8{W~wmk~W
zMcp6IC0@&!{|8_-5Z(kvdF=v_SLVk}vk`W;sa2qJ!A~&q?A~Q$UGC8**(@n5^GX9B
zI|L9nr!^j<s+?44Rc8mxPr6sj#9Ey6CbY{l0LzkY<*<n(jnnh1%e^#-C#-!qxAMCA
z`kX}Hhrf<WDZ0+yU$Sv?_zdAV*H$P2U$wmgbd!J4M#2IgvRTG~e{;C+Q?G4&xlCe&
zN53*o{=nELvPQ4-TD<Th#8r5v!BhBrUuUj4{_`&2nm7S>Sj@pFHwFW@#SG9h_VA?E
zk4{!}CX?p+R!E(<nEyO+1jckpfnrtKm9Qyct_a`Yx#(W;MT_2QRGR2`QpYD1@A9*P
zk-((Tua-6nJ`7;s!f=P@>^*HiWz79vZ(*xP_ufbxzWiFjWB_K5WV`NP6endp6s{BU
zOU9!xFO_dmc!oM(#9^J>q0y716_19OfH2Jk{5b<yQh$*VrY&M!hoZ7G3F9?S*!J`4
zum}9LCn^dWO0oy{I-8d`p6#f_$v#pfYxA~vh*E437dWTBQOW4H1I0<~j|TNJLt95j
zNBeD{MRwj}zNVY?n8riy9~?Fn)?aK7DNzKDKmJ~>qEpD79}x2Z2ja@EO;!W2^XfDS
z9Uk+-$cj7OSIKh(5RYT-&yo6QT<X(~h^J9`Y6ljkGUzbS$7WYO$+8=m5sg70{*0Ty
z8FQmF*83FyaO7Ee!hET>j`B*DwY2VE(|FQ2(jAV7x4%36I_2nGUe>8FsPH{eVX2qN
z`{#H~YMy*BZiYTHf!`x!IMfA&(eg8Xj|DVRR;QLlrcOVkqJDeMtYOxzPUQF@?LB4s
z2`4lN#Oy3#0-_**orwN3(rvtU9#nreiyDM`74emnlc`lgsE)LJ7+wGdVQkk(Os@Zh
z4&FT2$DaNC6td+<c$|m5##g6>cj+4r`pBZCv;Q)PG@nwkyWO1Y20i_&j0OxtPghRX
zR-|>yx<fu1MvaUXG|TkJ>#GsYiYhxRefhZfzM)Yy^U1qj=AL})h`z^bR^8{!wJINl
z_@*AOu+m57hvA1Dk{W5Vqy{PaC5>mVX9aAbFTSr8LivRpIOVSKGaKvs!7eJe9WLbW
z==Eui2j}WhzQ9e1yvzU_K?vcVXD-w<^h<9J+p71nstXcFF*+Zw5e4no0wV9yaGy`a
zb;r`4zdfgeCP7*Uz^O24sf}6Xihb190iSrSO#sAg+zKi`an*0JM#Q<y#2*&D@LlaI
zXZxsKNU*n-;KpyALdE=lvU6rgq|vivoY%wzeK#N9Gg)T!jIOs6&sdoCUP2MyU;I%0
z`p3mT+@l~uE(~ZLdu3~i<+-J2XL(mjU#2@k<LLb)hkA1zZJyviZxCPJ>3^ds4455w
z&)>$30U9#zO|M<QV~NY>ThrC#r=ARwm325MF1UU~A4OOBmSw{7K+xDNF8~ECd*}+e
z2FyxGMTKO2o-6Nk>mU?^VL4;cc?PGGJaANGB`4tsV8V-p!=c~Ar2skR1c`{sb`TF&
zq{I)Nzd3s81%7PP-!Z0nGlgB;L8d>;Zpvk$B%1318I5qDcWx`;dYRNLKpI&9Kq&w+
zk~6SweErvdd!#=^*OJ#K`8?+bS*WPRj*|yWe-k~A8u1vR_NLMmJVqbRSc6}%L%_k;
z4buJp<=(Jd$dDf?zm}qfY;MNV?VvM<MYKe3jxP+CTNf>TI-dC~?sPnhYTTKH_K<d+
zwjxX|{6M`D=?%$2+Si?u1t|&~G*{+>!|2{(_W6?AZtVB#K0&L4G4c^|X$|O-<5*Fc
za4>&g%4Xr>>a{>pEi8~)I;7XqN*-zvurI4y4m8fed(CpB*Xo6%58~6$di?~Ff)7ag
z#4UO>1@|8c2}`_;Zx?WE1Fjsix-!Jw1h06Ra-wUn>_vJLT)~N1*Ae<x4C$-Et-qw>
zA9S^MvD@JIG1>^+njQX!?Y(z%Ha(Y+RpdXY=yfZ^CfH^HxQJ>ZDyb#xR8&iA!5|SY
zTC<gv`DTpUpS~ARhDy&(bvLvx_RQbgZKtg)QVF#nvqsb+u;%MA@N5z5z?ymB1$Qj3
zguq)h0?2xB0Ip81@QL!Ox#Rx(k0HN8BXqfGD~f9zGEdnq-Ql9zk}C`E<;R>%jh&0A
z)~(4R{0GO1in)T58{^9fi~{1B>-|+nVmrCJS&G`c<47YVC<iK>1j)Uhi8TYi|2=j&
z6~X@kV!?0pfkMqMf{T;8`D(c|*>nJ(&owNpslk1`7V|*9Vuf$WExhC269z-&HwEn$
z`YL=gJmWfQJU3l&H<)9pLa4P^lC`2$7*XYdX7e;~w1*loTSym`mXcT4PTV5ioh9(|
z@|xMWIdwnu+5FTefVa}70qPzZLQ9CXAGR3o{5CNJN0aKJ1Y!FudeSCWEsfZ^P3nhp
z&g4OB*5i<z9zj)_JvKhmlMRdY6}6M<QPNk!GQTWg#Ga4Atsli?ptt%`X)SWZ6&B+Z
zh=+RpDqtU3h*PZ;CV^)0zTGwX@x{$DpH%2fAlQoG+KJ&MAV$d;jD!goJzJddK16U~
ze7u)AwnUg#j^~bqCd$1fkbgUmWMNi7dt`~f_-Vg*s4o70IbnQ{9i=c<qx-7~`kwXK
zwL7WKE5WOQPXvFYw3JtZzlAnng8r9|rU8V#KP@fI&x?^s=u{3D*PvyB`u^duw1!5^
zthm$LwZEO#1hosz05okEbb9?2WRtddC8Fl_q*3!}35UDgN_}unXREan=283reA%)2
z?^$b_QL!aZ=Q1bspL5-fyD6JuZ!%VuluU@1L)WZ+BoY+E%yr15d<CtkTJH%`dH~Vg
zt=KA*6uf`eX39h1X_9U9pWq;;>sgKAu($aH!va>>VgE4LkA}LextVDqEVepN&?tGp
z@l}|_QizbK`3eO;ekt$7_*h$ta@gxT+cG<>yQi&T^+a&qyLt{l$p7gWwTCMHbq3*J
zbwXic!Mgr!YsE!nI6y_b<?iVzhTa5Fv+y+$c!sc|bdwVin?l);hnu0WZaN?^`Md)-
z@Ce@8?LXt0YrytCuH5m(Lzd_jt_S*PhS=G+iS%?Pvgs-97t@mFOP@auN!_nmqhhco
zGfv$e(a4gUg+7&vq35o72JM*KpbU<nm8Aus^if9YkLr_}QF@g-^bIq*scr-b24%iZ
zy)V^56v(@}DAT&1(DQd@4Ic#v1m?ZzKd~Oi@V{zpnFj2Bk_o5YJ@HDgaiO$o)#|S&
z3CZJn!ELj)Fb4IKkl-C1$Mc$<-O8`M01Pp-sOfyyGr{DKw&;4FPYIQmyetz3P>&#5
z-iE6CQX&FG_FnNitn;8S%sUFxCFSqE$xxEhALZUD=xU4F_V`pf<M%Ot3OOV{RgW6x
z(p@umZ0WNfq-C;2m90n&Q`g@|HUFd`J;^j+#F8>hx0As7ps#(8s5&G4@8r){YSCPk
zWN~)=^q-tb11jD?TRKYQSQkm#=o_GM5E5p_$?{Ms^1@`YFt5}w)%;u3g(~qs8U`ls
zm5b+PMZao0HpZ4soV7)t(+q7~Cs?cINc0G5Nh7=kEhSFKTiKaUe;%N}1WJAmjueSB
znR~Wh$QGi%`mI{m$<`P!p)*o0^RnC+U1f>cjwPt5DS4hC;$IfHPZ$v4Az>7i@<gt(
zonC6vqNSi#Qbef90()JPBAJGKna{Vw`VaLAnFpOktZ8VGCV{2@tIY|_b04{M_nrbh
z-YqvY4!XYTiwb?{kc8c}0nLLBU`|_0M;4LtCxpz3)}p66+uwrby224$G%r+XL_(LH
zV@14-mx*QG`LO=gmA!oTdWG{DPh>`?7wOZ$&;q4jmU#sY!3C;);-IJ+lWc)kFs72d
zw+^!8IgC*!n^U^@@BU|>8_pHN<%ndKGctb1R^qFW{&A}?O^ri+lzl?Ug%(U19DFx$
zIF!w|y|*iMZiu0RcyVm$Rff;r4X<i3u6124{Ilr>3kSCyt7eLhcwqOdW^XZpt3q;>
z<aIWT$Wl#H5<E#{!ra#inA|oKDsSI+att*O5r6UrgHIAE-AVjN?$z3~)dR|`O-Smd
zlA3%+V1!lOj<M;4^a*nf?Z@5>%MrL1E^|p15CDdZ>3hl=5kR4R;?k}ITzWMp{>r6Y
z*XYoLc!iPJT<+$JgmxU-UZXz@BNN}(vJoO|i%lJ7R1fB7Fwg15o$u*a+uv@_Qr|Uq
z4?8+&Y&e_8n_SIPJbnqY=`&8<qmQ7(5pAz#jt%?@Vp+)gkMDWZ{&+Z|6L_jHpSU=m
zFIZ*nW)S5)>l8BC^X^Y|(`iTNYbABI2(|<RY9|oU6b`hsWuBM}l|IR4yp%V(uhyol
zrdk7FPo`A=Ss#&cCNNo|P6;K|BOKSg(^|?E05)SS5uU^;?)_eowP+;cPF5Udo+^%&
zP{H^;q|)-7=~-Vpe<U}M?*$e!%H=Ey37Ch?z*v^{`|&<;?|h}^Y@GmnPhZ>tizrbH
zFRK8&%986xsHDV8;f1U=CD@v(AQURp>9DK&yp{*19pf95=~ktVPHU6n+FzElurjN2
zvJU3^^Q0X}a4B%=8;idiF=6DIq=e6{%ord&4&~fbrT(*YXY+oLDYC@b`=3H@%L>kC
z4^@idJ7B3%s!WS213-7^kt0Kd!%#6z?rZq7trTF9VNX|v%O(Xv;<td{TK>+yFz&`S
zYz%(y63O^Iid}B(+xNB{fyti=Q2^=!6r$p#K~5~Nc&o2d7Jzj6oKA-<vF@f3%rp#c
zx0{6%%J(M+nn!>FSu_ezzKC=d4;vlLPAGirg^$$MB%Hy;cJ`k2*T9rqjv>sqd=dE?
z)#3DW|99@ym}BvVe+CWy(?YqF$uXt`T6zW9hAG<YXI{Te@SS<k+rb;8tFyDJBa=J4
zw?KR~=d(0`|0SDmJnO-HH4FKmuoy92#r<XQMIhjuP8wno{>{CcRYD#aq5iNH`pPQa
zK*vvoks;)IABGOe4aqli?et7Mu4B4;aO^Nri7F=k8nfsWzUEz&hL;k9YylhwpTBcs
zMPO1^a09`WLpn5e-7hL=kR+GSq^JOOd5VzHZ*JKf!}p#cNd?>!C1!*(fSIoyxCX~-
z;(pM<3WGpyw}C+A?U$v--rkJddiD1T0ew3Uz2yBxa&lg($(pqCbwG3s5<d6j{&Pst
zzMNO>!mTE7T<&=Jx#Pp*WqV5N_o#t|rTvF9x-PYQgzvo8LV#NJC9bE~d~!X6z09^<
zSSbDOMMwI)l={nTH2tefjuZ%Vd_!$OE;oo;&xc%(0Vxg|-=mHO&vqzp2PwiL>QbB{
z%OpM&VA)J*EHVL<Rs_KEIip$6X$PFpK?mP{iSxY?WCnsFBlmwYb>D4ETES5{PcxZJ
zUj?Uhjo3$9G{jnrb`2sT_&z7gJT1dU=)ki9xz!gVL&iDv3@g?7{WcLR<4<&?@Grc)
zPzZTOU#`vZQn}xE*CiFB_Q~azEd=EV9alW80dVIZOMKaB#Mf9v;KXQbI~yeAW`ys_
zHaaw6zI^Rvnb5UmW!XbVf*#aX<R-7rg^&$W{O;gv^>FXKvB404u{dMXDqMSKE`wH-
z%<<o?=|((dmK#TY;E(uU>(9gtQk{IwsqNHdc0$)qd@IwN@8B$P!&jLLo@nRSN)iA7
z(g#B(<N5EF*h9w-mn(V|nkxb74><3a&)ITr^Nl`W9#MZ7Rx}u53_m*<u|EWiCB+q;
z+B*ww0Rszdq3~mwY3gEws9clN6LIA@u;%AQz=x8fBcfEU_p6}wGgn1~K4D_jukx|}
zXlCJp?ElBtS3gAAZQ&Y7DP7V63Q9N9(kdmw(2eBK9RnyG0wNMKlyna{bSd2+F-Yf7
zg1`V$;(Z6dbIy0~5BD#a+3#L^#j~FEEPuO%g2I{Nm+#P-!H)Gcf&h_DI~1TAjeRvt
zO9`bSjOn1!4XTU@4)_!CS^Kh(x$pd2+@uN!*<mdywPBfKnIJAtNT$l@3t#19Z)1=7
z{$QM=gB$J9g}_c|l(n(@{5m1#yVe5;ZSjtw`>Nz=pCX;LDWDsk)?haYt|WZn>MwYr
z1fZm32Hb`Xx1<A5)UA(jq^ZjNeAdgVSd^7VY5jXf1`VQWhGuQMs)dWcuP5D6*0?+y
zNM2srDerhK_=^uvcT}?nNVh){v}HD?`M+%R;~^w|&~k)U9l|~qyGfltX{ZqF{?D;i
zIJs`qNlM>I9$5(Sr^UH(PN(*6pGe`dkhHI{Sl48-Xy*K^_{)Z&iif>xYWL<TW!Z)p
zJosgY=U^DK8%1QH=XXd{nstO>qnk%1V7mZP|K@-c)=UO?<|+x>`b=le?7oa|pSo~2
zn5*)7e~qHY`V!?g9SWJ`f#CswGIf6{a;VhlYCoJ;RH%!~*OXEsxU=MDI5WXztm?BG
zoddgPr4+P*Qo6D36?(p#r!KWIURc8ybk8WRfrE6ifLbB#AON(+XXsFvoB;I7ttpXc
zu51uS>!y^{QH84=9bPWev&Lz^Ac#8(ZEb^MUFMKU7eA}YV4s|{(=v<kU;ib9Lrb69
z%*rR$s<6`#%fc#j@hoxVmu}(@DTqS-b|cDla^hJYD{XbrgI-Wz<0h)Iu;2+ZP)Vtv
z(kqlD?icgrh~h%f*&Y)F>>sjoTz(wcdM}SMwOZn!A{R;DJK=QWNcF5zq`ti=ovuO)
z2nS9^Nqxb7V0<&e>1M0zZ!!CYisfO7x9sk*JP8(~Y%m`I<NRanw9HrE<)r?(HuTYo
z;?;W_&^3aCEAdBET<Uf?eFOHdvfXzl%f?b4Z~eL78CUgU>UF5EHizX;97Z4iNylP~
z1f-TUQ38$ou~vO73w!3ZLEd#Bl9_jl6!f2!$GuEZ5XoRcJ5DS3aBgeIyzF5f8cbQf
zY_@=9UVs()YUE}?x#m_I8_P}^zEMWU8)TOSZ<@by2PpFj-&~!G7WNPiIyPNZM`zZ@
ze4+K5tM|BTEar*@S6EaE3;W!r<EpkxQv0!S2fEJ@K(v-)q#56%j@M(yeP?2VaKPEl
z#o`DNrG&qH7byKY+244Y;br*w-#WM2Fze7|d?wPW$8zgW{iokbJn$@h1hMP4i=smR
zb(Z9EUsWO5^PThaqX9HP{L+$Q>elLAkzK<NEo4aWfex~Lg5w&gA|;+^*+Ka-X7g|D
zDyD~@**@{^pC>na+Vd|LQV&482#>z;53%5>t0O?2vLTx4O#6NXX)`uq>f4U7+ij}^
zvK5oMT4cnFQSW`CO;=E=Pa&624Vyg)!f>!nE7~&Sa-R<w)sx;SQ^y`^rsnxqb~j1>
zQKV9eOgV>l^(Zn$e)8HR&DD%r33!@aDHs-}adv~}Pnn1yxzG0F)DAVgzM=X2C!ow5
z%Pi{DH2G&L2#-q-A%CF2eVz4fU((8;e8kXl+OoAVW$LpWt|~P%u=bsVUwYm<cCg8e
zNF#?^qd6wXLR%=w?OW7G+kq*WWghvQllM4r&@@1q?a#umX*BbfK>a<vR&<i7{jHTm
zZzd<=ssP)_b*_(q{+RRH+KZ>sQiQiLeVIi@F8`~mDp?&7tpIf!9i24^$l9Ti(@k_6
zLIO)9xx)nU@)Hg~Y%xy^_Il#Ccf7;v;zD|2{{v@}YstiKpGfzzP^?eig8?2Ph-Lug
z7M?@a4m5Hi!)#lGuY5MQ&qiOzToiVM2aqoMvcK`oKY0~#cvdK*gM6rit4i?q+8&4H
zMA^!^1<01CodY?MguSwt16csOY8a;Y*}`9`P7=rMkr#SC`m06>#!@i+y+IU%H_QR_
z)^`Nbmvb#wn|y=i{T5z$u?($nY`XV?2T^TR?bthG;|0sp->WOkhG;f-wjw<Naaz0B
zuhbi*!Mp71-9Gw%3~!-G*!_B~^IlE(Hh$e{ZWX*Gl6*1kmMwIxp!FVHhgycptyZ-9
zQ!65gPFv22{uDk0AV;I*O${7*j{-)^*IM-4=Th4Ole|M41_nAK;YoIeb%cx4sKl;B
zQKfO1yeE=I73`e#`1%UlYUcUFzjYuKjk3LI@AQ<~y>aV@g2xN5@%^wk%b1AX+dpOf
zM)u|_+-8OAeOIwns23)YBDyTXzxUBM4!FLYwsYLx$T?e{d|UgTiwm?u5It?<zuSrD
zy*u&bJ}rZl&du7o@bif;E4@Fc!~H{Tl*Cbd*}aDNuwq=paZ>lM>v<%t3N_4_)_is(
zazy>hjuQN<KiGF2Sm1T@&$BL!XZZ*zWi96aGumKu>_1^m>I568tA%H%bCJN$TYKEM
zNtOkYd73zwjTjr;JycyoM*)|aM7`P&WBzo859UU>y*`0_qi)-89ui?E|K>!nF#u??
zg^bR=l8e|$GkU((KJvNA>!<N_qb->s9b-83yPni*?jLsTo9_soHOt@%->lqQ-^}v+
z%@9!gU|}NXEybi&fZ01EKnwkc9w5A9s@i_ic~w!A*tN+Z659~;*}yNf(4-4n{=&ky
zA&w@H>zX&)%PSBk@~56~K*XtIDkk`b5iIjlP+;$>lHNn`<EEryO_YA<@K*7ze)5ZE
z+p<=Fk7Gl`;fmb$j!nTqnGR)@zJe#22?_jwl{~Kq>?WhbBhSU%prAXE6J=_FKLj`y
z5gnbdK02&cpE0rRDH<4J)j(Yd8)x~;?+df}8HDrIw?`?NnHxFl!POBzN$J`i-B0zs
zt1(~9`oMS>8BNfEAEa|5qt*E)psEY=9~^G1tjxzYC4*gzi8uBD8FqW6ab`7z9S9-`
z5`2P{hO;pC3(!;j&&|p9pMk6QF9&o?txd%5EzJiiIQ{98mOaRM{?R9a=xYrBl;9f%
zDhFhv%Id(8(3l$&PaL>R7+o=$05EOv9K60LamSa#RTVKW&}(7neeVyv=@he=zp2=}
z3|=`ke|~Xh`e;D`i!ijC?56?2z~M$grg1_C^$!Qcol7p_z%~S&y;fQ=F>$lYa}QlZ
z!?m2r8ztR=0o`E?#(4LE<4The)4qGLpU(ga6yOrp9WiOWhe^KcHc+~^D>(H1G=UAe
zvA2PN2LWb@Mg-f~`vZQZ)N?x(9L}#rT``bo8wE@R3?aP?AJ<V^3<6U*epyhx%^~uq
zT{`R_=kZ6;=s{)jx)Ub1`pw3vH>3vnP8w3I9vAY7NH0~XV_A6ebAzN2L@Y7-@k>ph
zty!H}{KY0lxw*g>#_TTCz$Nf;pF&=!f`0sqkTH%eVf1lN-Dk)7dXl~G)i{(nWNibh
ztB_pZPA9x>aV!fKVrdrQ^0W>PES#KOv3D_3fSy2T-)x=P&_3@4T{kx6#n@d;EVrC2
zi4H$6CGYPCKlhvEPt%)ghhMVjlVSS2SzX6{iK&72xH#Jzhl0mUB5qx&QvA{FCdw8g
zuQZX}YT9g^u=sn7Viz5Mon1kNEvOxIdwK#E9%smGVwC!x7$)}k2x!DNK*P;q_<x2O
zeJ)66A}OIyco<a|S>2x(ZSSu-H7_ftB#BhY>3*-IrsAbT^<$=yVsEjH(Q#z>>nQ>Z
z$%QT1S1gBn;dR&DMkbb!kvVTp^8pI&kh`9mw|R-?ErgZ}s5KtX2yG1`aTHMB9+|3&
zB!`*75Ww8s^(feVmz8gV%_rL+Z)Z*Ec2lA{-&=oD9x=)v<F$;)+RoZFPMH1g80n{i
zpwYVyR7{6LfL8GOOAQ!}k3D;3)L7YwrGs~Ak}NyX?6;xxeQ*?U7l(aNJZGbjOUMH2
z*_+;NUPK9RgsnHL`F>eJQ=L<IOOxl9A3%ShB_KVHn5luZ49(FK`{lF~s|oM9N%=Xe
zAEi<l1FR>`Eg;pxfGn#j9HNbHxh}N!ZAHJ{DALcIjOm(^(>=(|`9&bS{N`Lefx*GH
z^n7Yy?icBhJ{*le*}aiso5sPBhf@#3o#vVS--LYA={lV%p;4H!`>4ED0!|>26^Jp5
zol~jOzX$&0Bl4rsBbv8VAtWZqMI~DeRQekE27YU?*#_w75fEL--4V_>`7wP{yw1Tw
zCr?#X9ZkagP=t+bc|Ve@lqBN9)Z6rJoEBZrs)o4OzRRL~e1kil)_7T-Ont*#50JG7
zW9A-o-L7WtH9OnT?J;bJWC)Eh;-^c?Z<0wzOtE&h5*FD;DHAkS<|Ck+=g*Xs))T0H
zzAjKUCK2|PvVeS7h=3TaTBe+CzXnE@9%sl=u+XCYCo@Mm3R+qH*yA|6^Avebdq;jF
zvfs?K{-Alod`I7B8vyK`O#$y*5g=*U>PcW_WqpQALTZ15LDHXw+qk8l<}IL10`>2z
z^@xkRCW_`;S&0(%crs*tDsz_{6qtKi#(P(iw^gtcI6}@Y(Vu<jdBB5PGQ^2y8|Q%N
zFpg*R;uY<FobRLdvt{dn5SVq0bRDZ)M8oMs%_KWBcZgGliH2CQOFE7Hihs|8R#)hD
zNL6Yl)jF6odb2at#%_8#sguWIJwVk{wZ-xUA<$c1#}v@=z5r#A`<tBW;RAc+4>%6g
zU_h*P@L2+&icN+=y}u&XY{~`9v+4g|g_lQcL<`5i2D_s7=QDrb%iqFpm!+nr2@;v}
z<FQ|?!D|%TYn~0;<@+9>{8!3i5Lo_43V>iywPge@r9^yMx?cp#-?H8z@_o@wg=|;8
zJ9|bsW_PVz=)K&1bgS8WlihtWBa~oPQQPtAT}*|T$nTYNG()A+<8NnLam8fW+L^w!
z`n6}OE3&7kv+u(kG0F^oPk-7e7+Bb-atNJRdR_iPmDv!C#Dv=M-XRIBfSGGhtkGBW
zxJWV2BHqf1lTF>36OGB`6?PtcUd|cqV_9A1U98=_baV$;Hq&j4!2FaR*{jK?k1pSd
z{CVoLM@J{>f#2BJIBoFuAa?NHygzKeS{$D}Xr0eK^g%MnlSl|yq9@`0B{Yu@2t72{
zd_LOfvB<@ufx6|Ru^s0TU6C4s66sI%?)yX)+Zsyp=m<tohnGVuLZx!Z;dR{$LK7Hf
z(pYO%>|A(yb7o>oT#-uCA$}e~m~dK!E*LB8>FSP;d(1ny9F$c2FW(FRK%LfHUV!MI
z-F|3jZJZqq4eV@ocGe6~)NuYY?<MDF%?6A@6kwamaB^~1gzfAUsCT@sLG36ieZSg|
zMOA({;Fk>3yb2q88KO@qZK2JVbzrC|&iMt}bqvH7q3hV<m2XA0{GA1vwL9f7T@`TS
zb5laxC#)+oa!GAkP)6Gb46>bK*Dxq2GAzHCoF5y@!)Ty2BkUtLwH;K-t%=Ei5)8RL
z8ixB2o|sIw(%n&pYtURb2T0Pr^l&=#<)<IlTO->4>k}CDVi-;|Z%0xt-1MFW2q;Te
zzke{k<uPjbsuVG5dV3#Iv%teBi7yGX7pm{O1g0z<;67o<?5VqBX-Sk=N078UxDvJr
zE{nsLy=I!@b~ls=IPXuoOiPa9wSX_8&u+(vej}27Z&S9f!P;uKFG@vAKRegx!g{U%
z4=#z@)qStp9P3E4=%ZqxrQeuh(}WU;^y>SmUw}LBYq*V?)>0QatLRh;*ej=aG=5@<
zd_eYO^qWTVnR{(rN9<7Q@}5zXsktku<3eK>@P~nA9@ad@A#y#jmA}RKpB6gafYB+z
zTCt<d>0);!Li%r_pB$tka~NJ4>FM8gbKXF+L{41`;b2+1*isvdx!mT?XePYyial)u
z((0Fw7(>KCx|${p0~3`)CA8Mr<*U(BJ^_tl+=VMf2+Lk~|Bbt#Kot#i1j!a`&9!Ji
zC=R?^+v}RU+ZZ6KJM>&6+^G#ql_c9c^q<)NDi%5@vjiq~h(Xd-ofS{Li#t9(YQJjQ
z3&JQSbe0)7gIzy<k2J4;Dm6^=7MR5PVf8@L$Ow{asuEy@^`FG~k&BrN{h&<W&k5$B
zYicw8GV17-#IJ7@o0cMf?ql23U9Lt3x+f4ZOIX6|%I8|QMJ6kTifA`1)bcmi2GK4>
zvkJ4U<(&CHB!oTk_{Mn0D3)`$=?!<t%^DckiN2@&OkpJz63M_rF-;6W%k8wpC~b>D
zUDKU5Ih7^~HVRK_JXYNN-<;$5b+ar%aL&(lZ#$!bN%yU7P)d6GI*UAsVM74Dv8d-g
zVC!6k|2q+LTUigoPoe7vg~9(p73_GM=5WZyf^8%*W-o8Hq$IUEBt1;mM)=DyLhTuL
zqYk##5@XL=@Z)6uU*U0(RzVeamERWb7X62A3W+@kY+8rZG==p8nZjm_$u|N!=~Fs)
zKBvlwlt*Vemfm~OoK&nViPx47)Z&<qtGmJhe#k-?jqDqmd{1@335Ua6^Zl27%UMjG
zw?EJWsu@7j7fLPllKjh1-wRK&4<&%QvP}(-Z&TCv06y7q=~YX&dykx>2}4%Xf|e@t
zBlAe?E)kEfOBQEVYszIPuz&aKvF&s#obj`W%L~z~gO^efcA7`6hw&piE%`LUq8=7e
zC*xx-$CdPMGWoRZCN$%hQrrAvC-Lf3$%ahkTtXlXC|?ZSU`GWd)T;wt!gg3~n2i0m
zs(zxPBg~f2fPW41lnELB(j>jhWiLoTc&}Zu$xALUP&%#69S_ixJAJHyCE_?A)WN3K
z-UUX|=-i+Yxls~@l8_ymdWr0q2~QchbQLkwqd^(Sb6{E`4_>aW9P6ESWx062(PXS4
zIG$<yom`0RO-d#m)h-klC#EW7@4DShzFUx!ld2rXzK|!C+W3Y~*ay;+dyr)o1c!r-
zn57d$p{LtR`f(wwcT^>p?|O%yYZ+HnVgkCcwi@%)V>qXGi$0=%O1ef8BSn}{p1qNf
z-_k-a4A;9&^kb38VGMmYnFmdh{g}0&V0jamqBvhsxKGazcD1muNjz%P4yH4R7;+ht
zXOF8kW`LpXzOu3kX4vV!RjdDtCSK1ted_<l-(IZ*??@acbN-bt^S1S)R7lGH-Hf-L
zaUbeY`x-g6&8kxKDcQf>-61y9<~j6L8N|KYLeOxc7(vVO9LIq8%#9gDs%wPuJpP-7
zmlTVL`%`H$<mK1gw;1c7)6#XqVm^AqRgVuS;xZ6J)%`?mJC(KXV`DdGg&jkhem34k
zQBp3=bYd|wMXj9t;PeAVjNLX&-+150_P#wT&LpTno~RUb9mG<Ozjt{6ea@GEN0WH@
z5I08XX~Gc|xE4D&Ig4t$kdQ)2Z*#KqkU^nvuz&uU!{HWi4b}q5g6ssdmG_}>s`JA7
z3SFBk#moAsF9PD73eP=8%X%J!4{GL_l8uL04av5oHP|9QJp5MkHxtibMc0kKDk=Z4
zlc1SHoSK5$R49G~G}-nXW|uI2)DGAJu~`2_DIz;1JvA5@E@}q@jO)fRjm@|dJY(pS
z&cmmCtfmZEt{Q0km~fw(I<AhtWznPSH*DFS8i)aNsnk^4uVP>#e;#b#&iK$7_X=20
zQ#kZ+fi)R3VWMM*4k^~m2PddgtO~^8GM+Qp9mSjAB<sqP|15@fKMIe|{ORLdqovvK
zJ*mqOD2z6G9iSxh-gji3<OrDf2{fCq#o`GViQK@ii8sw|JO7NO$KVnvDQO`FW5JJ;
zLc0_IQG6ZsG}FwsdfW8hAQPzM5AS=nN`6nZ5hQEuynehh**X^1R3h!Kl?TI~%O+0{
zV`+#E8>hcxlMcKeu#Z6Au%fC@w0W{0@?;GU7TpRqVm%OUaJPT}+jhwFn7%4Gl2qNF
z6;s87E`mA`!U>nmi^M*+a*7`cbngtu@TFw}vDPUbF{J>ibQNZGqc}w=>=YV+Pv_Lx
zxdtR?r%K<M_>cH4vHbMsX`<!TkZ?nzowsI$S|7D<@Z~c9{k|_(nVb@Wa9UbxeE{I~
zpYQ8)Ho<j@ZiC=CGPdDYyhq+9<-*DM93t_(YSOc8;NcW&oa1}-Hi>JQF`wzqesg;}
z_aeziqTj?q=e^~@V1{|`d0nd1+o5x7!9D%t7+9<7{?WjD+liE)%WqLMd1C-_mx^*9
z?ZfEI7LOhDeb8hKxLS1jbI8+yvv{UvcO$9YD!k;kIe-<D&1K(Sep*ocZB~)W=pXRr
zo5~RPSGGS_Sde3Mxje1g+{+p|??du!tbT>rJk|u=R2q-pMyL<0>VSM-Vpw#mYH8eA
z{eyv@Iovui%&SJOp%I7)Ed|T&Z=w>D=BS~{v~u#hE%~?_%>d~3;#4>T?_aRn+I^cI
zjJQ3dbCbO@!29+G%(j@t^U?677+^7*q@*My1b1k({YId+oP8`PPS^6+tLhQGU$tX9
zWq)hNi6DX1y5-ft15&!nD?Hx|Cyrn?>4OfDS{OGXWw=W)S=qtX9bxEpUMd<H3m@#s
zXQk!JJX1Iw^(s>!XJii4@Nsu3Qwn-1Bb=Wuq+zlO{cM$$(E-Ub)@aU<6%ghdwi!+|
zTNwc4GLI{J!734SPln*bYTiG9DgDfVHJ1Hu9EP{UHmSG+x(lEr*v0(WCa(9ryNM2m
zcS^@-dSHsP`KJQeiGK%@&CG161Kc;+FNYSJ@tv<&>qxnjBNCoW)JRsk74<xv5{!#&
zgXa&>SwJRoXzJ((<S2Ktw}L;Msn#g8!VwgR+D0GQLB1H}SoWb{7EOn$rZqyXI%3A1
z?tj-=z@EA|(48=kU%+Z63nGadZ5P09?AyQQI?6(v_FoPxztXuBr7MZ>tg&Fwi8}Qm
zUUr%~I(S)zi<vMiq)%qGTJGbae@E5om$q;<yS)_oDuV2(foih8in%I!^K{4Ve3ZY;
zB!q~7M7Xi)KA?a{#_}~e7FUNrqd79)5fUNMsEg9}dDC5Q?D&3J;>`j<)B4=~*59Op
z3Y$}a3y2RXe^C5xR~K8xLGd8zxm(0kKPATdij?%115*_XC%r8Dl|+H}_hwqt4e(6f
z-y@V&^7CWrDWH^!n<SUDOoiM8*K}v)FVyP&_eD9J^_3u}X9G9X*!QFqYp}g<tzLBY
zYmes<Cg_e!h+Q?p0B&s$sL)()3-&rB=|$`-8rB1k_FoWp<e-~z7EI3Ap#`g0V@o#|
zN<Ci4x7cx0FJK6vnSD*jsMHf5@cFLrVzdwg$v~DIoPrlGN5f4wY8~cY)>i?7Emxd4
zL#v(01Bx8_-ISihs;Z0LSyiuNhZooGwCqdWpMaH}p5-PJe*#nUFoVdXJL+qY%)#%Q
z4HKrMV3<<RV%~`$T;J!Q*kzWlGv859<-0yJWV7CGvpaQ-<SeW=q5F~c+m~DdNyA(v
z3doT=k@?!J>c@gb5rPyx_$3M8r%(C-NW(8XYmZ4iA*hTYuA>|%R+i<Q{AeLZEk8pE
z81@vKX1<rso^pJA+f(;Sbblm)e9?#NxUuj6`SCh%B`r+~)Ge)MN@6KxO35**R?@U5
zP-n6|7uM$%IxUrqiLK?yl!)qIO(>ouV<{QPUHCvFRvISJC@iOiU&XhZBJU5xRT#)u
zb#qu}xgee6ouFcplAzPgS_^8<sfp(%L0WQN(p#UuRSjB@Q><f@=`YVO761)lWRvOV
zVXr(c=$EU2D-{byn>~Xjmd{qFmp<&!+Raccp3xX`0|E@nHO9X*8Mp}9BfT85HVz8a
zL^dSa*zU^@Ap4Tak{kl&pUyTyJ|~Eb^dLMz&Aa!_HIo5iZtQet%95OtPKOWx#wv3s
zpVbdz-Vgq;T>i`(_4Op}*YflEy@wkUc_75<9L-pg5Hn_uvo$IsPT0o-eUPhT@MgBv
zN8mC6u*rbmfVM4w70LK1;V?dOzyBoLw%A3bcr<oua8)~oj>D)2Ap;}lJQkZx0^U4m
z^Ir?YxxP#cGE4jd@XUJQJF|6hih(Vgi%<0#h}d2d*w@&jFG|G$@8sy1sn3770$Sy6
zu&jim@j~Moqb7*Yple=zn&e3hKxSF_Y{wMT@Kc|2s{D!Ubw=a9N`c!#AQaLo2(f80
zO^^l_66K9nr`rP5xO*qZUoc4l0S;k%t!k<%P^x#An}m0DDeNia>WPct|61_S8JDV@
z$60uCUjNHE4GD#=1N(8VA)>b!h^U5>s(j=tk8v~Bg;Had-)?#xH!&A}hVz-&7W-j{
zy2!AWNsqMFo*?|7wtg+K>%59Zm=7@B6qJglCxiysurninI<;EZY-u6ur^W@y?Mr7F
zKd=WVX$4WBZP*W`XrW1sx+&fwNh0{V#+b5yA<#4;N=23h<DBdUw}^Y0TNASu<eQqk
zTNqAx(}NnSuo!1;&5J}y^?Qr%8SL6*_IPV}i|rZJH$KG|N%jNK5&XaAY-LNm)U`@>
zLnPm9r}5_73^gg)8qQX=!&g081HyHVh7rE@V6}t-V9lxLy*}Ds??&hVbtFMUbmYpw
zg!|AgiP@v7^KU2YL&+N-EbXcaw&S(gikl)D3VDYztGQKsftDs2X0GLoJ;`Q*O`d}7
zeh0_R3t$Ci@J<$h(*H5?0@i2y`&nTF7*_*w`%d{uN6FRw+KYz|Z-PmIu8>Y7s3xU{
z*}Wq%4;byOxrHe}%t6jq+xIZ#pKf(Cu7!aj`(bhlE_TM+shEu5B&Kks%FbZ+*Q)Qn
zP}GhqG71HTjl|%s`9eEO+0=G>w(b@LuDuc<eEUIX24SD(5oCo`?msp{+WMcYx3c&_
znMRGaPp`R_ei7e%saQaxzN-B3k-JA_(=@|$6V={)HO^F--fg$LcQHk*co{cC)1QFN
z-940nSFe%Dkj3NGJZc>d$WUK!Z1h)V_x8U4&fgPEz7@HU^bNr0PHV=#Q1t!mkiu=|
z<kK&9vl{sKI$Dyp?(SMraF1!bjdhPQWt>b=hz-=H9yK1rQ_Py|3mE|hV_3aAg6|Ou
zn|TllDYLMs(t<8t((yvu_F$|ScAU$!Qoi@lC!M;1X41-eBclq>H&`dTGk1vQ-H7aU
zn<z0zbXn1py|J04pOHO3+cN#j@6IG9Nff*+EnKGIl)*_eG8_QF!)xO%6?0VMX|FCf
z3WgWyP1o2)I9`oyFZIR7B}i!!%%(NENTEJ;)&Uby65Ojys0ZVMh?0F!9T7blK{ABt
zFyhkCj`Ie{16p}x_{NRHWw>fk)ie1#DlPSKvhDwcb6`Yza5Xl!-jJR@7epTcqf6h1
zo}K)h!;8b<fQD@h0Jq!J)t4qI`5-giATDNSs`5-YUROHZ{}pa)TocycpeNYX`ce34
zOMQkkoBmCVKdiJ9bz2vvlae6xB5;=_n!n6BK5${SDmF{i1W+ZDp1S9Z6z~J)LqUJk
zCtE1$)mf^|SxPEA49v}lm|^bU<^TE2O`~Y$&8IHGAFW7*kS=|p|9^-UFBv&ki8x%L
za0b-6oqr9d@9zU5DAT~Qz2oCAM}zkUM@HOw&a1%#l+(3lw->9PxiMNc)BSQ-k7J4j
zc`gdkZh8P3G!0$H6<F>r@+H@hhqjR0U1jI}Q!GPCft!9_F*n`E80(X|dtQ7h3Ug3f
zWP?P+=JPO8Fs&LP6ncS^9OhYOuUZ&4wgSbf1hb)_k?Tb)mnnmq(=Un#M=O>5cM_}I
zIlpOj^;JZUDb^~&zy<PISveUy7MeNh&86A)$WhB(H}iy?Q@0wcHzD(MYwwzVhNT<S
zVgzOY-Oaf<x?ja?L_g;5=Ew;(K!XRahGh!v6L?Hx6!=;Nv8$hnDA-eA!a5d-i@z&F
zfXhB<sh)8VALPQx@An^5GSdAwgU2C*S{=ffYH3Ba4qW+b>n`+lS-okkHO2WkauJ5y
zFxE<i^%tL``rmJ%oT1^|t;l|J#|9Gk&Js<WGXdKpJ^Fl40(WLwDTXN}oiBy(haYK-
zIC@x@o|!OWFyp=17FrDQpKfHt>v17GIJy6_U4-FjyQAkvrpkZ@aK4X3J=~vW(KoB2
zxVk;FQZN|+jva{U$mF@M`_5R_w$x5ne0V*v?>l}Qgx$cd@)Bmz$0cIZRHTtQ?AtFZ
z%h?#{p-l9QXy#WPH<a2r(a?NsLehAkFqIpuOsG*B4g>u+;<q9nU^V*Gn{*ohp}wCF
zgKQ9Fm&Kzn4m~l+fg$r4`Z>N100_=~eNAV(V}87B_xGB=bNlyi&RT9S+<+hLH*L!I
zY5Zga;zmnK5eG3}B=VDRmb+aU7n!&J>bAe0%P<JCHl_|=kF=eRa{7~3@<*SFk<FEr
zM)(Y<<&et*N(#o!JM|I7u&e`6LZsT`MM0*+(-X2~QV!W%oddeBy>yb3g@92E0tHe!
zP#~>0+WrH5+1n9`E(#_!tUOKF40BRb8;{!WZ-@aSt48j0_QY}-Ok3Yt8Imc+UFcoI
zOqeJ?Gxt|PGT*Q5c@eKwsF?cFk>}T!$vl}>g$a7hgOqvxG|&N?u+8-j!^|`h??gwL
z{J`>R=b(HCYo?>hrkMR>c6AuBk0q@2hw$kr|JdlUN7=vz4HFaU@mGFribQJVjSuVY
zfJuC$1B3=DDTPE7LQ<-UhCTpL*R@3mpJ`f%oe>+g-6I5l8I4RnJpxkv{iO##W-D)F
z{C4dU0k9GgEaK=Kci13a>`*cgBl*z=?$JHq!e|S0>s`9on|{KCj*~v$=H`w$<jUT|
z1WR1mn+un#&<>sD87OUq7a(cI<KOF`9K<FAXVd<tWS+{(S>ImO4_3PpWD@je8gS50
zh!YLb4KKC@_ACu&fi*uLCdG($of-#czbf^>Xy_K}idymOyepTf`6NNGRsUkwlDJB0
zWQH2?ysacBc%JDLUS{x}K1c*^fCH%P@VcN`IeB>8dRBaF{sd_2?BaS$-~u&NsD#yR
zfx0;YD$L-!hS&AW%ewx%$=n@Nd&2i;o<0<1d0-c}g9m#7F;aWr;TmQvV(^=I=h66)
zW!)F~0qcc_%trHuOxr|4BM(9S%PR$7vVVCBj8XizYJP3w2Td572~BgN88CE7<JHeD
zO6{v+esb#LJ8eIUu)GUyb^wA5;K0oSTu7|$w#x0Z{F=MnGB275PS<UlLyXP^QUjtM
z)Q)3aW}t%AnFp%)G`^cIRxZE$u{Hdz|GioJGLUFrMuMG*7A|w*_S`2w)@K+(J7?Fv
zMdBF+6EL9*Yb-5%j>!?+%s`4_vBKD|6XMtm&Ja`>8oRa<4lsejsWHc0$z(mdu5;ZU
zjCoump#|OU9FxcB*kOIt+4JrM)%J^%AMYe*>#9?ORpuJ+pzq$D<GAM?x}AeVSdqV4
zEMsk=h}$A)ky;U1bc5a@+S$s5g){k=rjqw{FP1oLuCe9DNHgTlxfRN^BUYmidQ$^z
zsQyi5MJ}EAyi`VMz=J``?lHa|SL=@~zCPN1k(eEQ@_2)_4RLLqxE#PB&SJ;9ogJrg
z4=cD55Ngwp81Aj7^cmg(cn#RwuXAZw5y>*W=Mv!ALNgU7bs~HHrlff$ZvY4yrFU3N
zOS|_Ct8;`A0yGR;6Xhu4Vu~5dNlS}Qcy%;G(aJlfH?=i1>qD=7TF9dUZuJhHjJ6w1
z=i*s`zi$9_b?^>e5<tJ;HVPzSizINbIfOu_61o+e27S@IL%TWJWCTWw#Nf@3;_{MN
z2Q5>L7>ReQ^?<LHxKR4|SOplAPa{Ub9BRVR@8Z}nMsrRb_W>Pab6A}&9VZZ3TU`EN
zAN9ass@z)jS+i&_fUN-nT|z=#l5^8_;N-~|Nmrz-+aJqj`(NTp;xl~e%AYH-k;#h7
zC;TlPMHJUyK*(+7fg9m!3%U(wJ*<!m9w2__IZN^hMv4rrU*t~KW!wX0G}G9FIqe)H
zY~jUc(YTcXjWE^*f8>*IZbb$x+UxPuZbb_3pdvlIN@Pjk#QZ0+I^z%H%v5EaqJ1ns
z*rv=#j=ieuv(QUDes74BWs@3FvM@xAO@7A%D=JPKc>|0eu%c2Hy%VA94&^W2jc2(6
z!V-gZ5rSX+&D`j|cVd0b{RrUW^@zESx(Kmr-4R@(r6Fl!2?@fEyLT(Bb|Su;fU;-m
zGfG1xzK@Ry_hoMEh-W(4yUq@O7t|ryG5^S-C5d@_mQz2o0i!^6Ygi`76q&=GuQ^Dd
zB^U8O6urd6jqfRgprur*vK+^<WjARTuDGbP<A#Z!ZhIsFlY66msc$iQXgDW(p#<Qm
zXF%x%z6G0Oy6^O33~4bs{i0NTZ%(2-?~@k8ZmGf~+ZT+Bb30;9#xXN*$aEs{xYkA`
z&d@c~53``k2il~SKE$6hxnrED4b^MhR~-&FJL{)1f$bA7S=1MW+eU<XJ9N0Au|=)G
zhV{iN;z`}`#z$+~aB-259p?9IzWgANFaQ)3MJ1S#<-~L}u9Lo|9UC3-E;NTAOGd~0
z9v;QpY-_`JHbR!gJU!jpH%Ez^M0<lTPAcp6%q@teshnMGBmE+MF0U?h9DgkPEmD8z
zGc@O74w=*{Rn&AZ$u!vu8c+rVDLAh4BixY+0F7<DH9%HpwlmDvY(&e*WJi#(M&XjJ
zbJCX^cWF8Nh1JXWz5SeEG7r5;${NA)&MXn%!K6&?#_Kx8(y@E)$L?TMijUhpH$f8S
zDFw~2IFB_NwMppkcPp-Y=B|<;pbm(B8LeP#bco8$GHW9A4qpOl@o8Ww--Thj1FeXx
zi3y|1Vk-fFZ_a^3<FlB4)B0)Ar~iH}97I!Ad?Kk6x*&1X-i48Ao4}0UVaZ^vTq3Bw
z54x21_qzjZX!q8&vrncooS~gi)ab!4=wgi&346?|Gye%A!F-{M6oAnOFL#p89h|d;
z^w4a7f8!PP#Jr5tbxrZ{lo&v!0c{m`zehdT7mBFOkD!z3H84m(s3(va6bWfgM0q85
zvp!g$!;&6wLQa)cxa{iJ0|Rpc)z!{AH7!83xJRGWWQErTXppE54bIr?Yz_M)R?(HM
zeeXg*q6-Ba0#Cz7HUn{Q$Pi<hb+wiS15;@~?`#&N2G(F<XJ#gR`3ID<%E*i}EOtR&
zRvi$F?%4BuLw1Fc#}_aQ*FHKJ&}}eDFw#^%)iV|Kjj^`eVv3j40j6TBx#C$dW222h
z-c}@AiiJ~H0NZ{H1<V>mVk;qr`KXD;CFZyRW$&)%f_WZ2_fhgR+KHZm`S)AoWaT!?
z`$^)(+fqJds&O}S@ah9ZQSEyqgB{G)7y&I{EIsKVt4G)LRQS`W2;Ili>lCA};y1Zu
z#36wvH-U?QdS0bJzzxFfjdx4fKWOtN2{@Q1o0FZONe*O!543S_Ocn_cQ4uhnDNJ`K
zM%oUsURhqihE;JEVa$@rXSSWi68sk2h{wEW)P0oASIyGkC<bni?vM#y#6>r28^Eto
zFnFT7hA7e!X9dOaa<2;r@{Ud%3FMa|L_VKWz)e8~KaZ0)5gnvQ?aV+fcMk(DKEO9O
zbD<76-v*sFjWFJ-=kF?$cW@Ei`W7QJl(p)V4c!3io5W8~Scx-^KQ-B*<S#F$N!YXt
zD-@7-%#1jsnaVyq5iJQz8qlXeNV_93!XbDjGFrERpaPKPu*asJyQNIPUyz_(vNTMF
z=SrlAXs6RSPkgRF7)Sd7TW76QZXwO>EKPqMTA{1IRg?_*;1);Sa^FZBK2D%Bsj0Dl
zoly2lecJw3Hw>^=o<n&do@BeZN7@P+E?K|*mtl2<js6=&jTyM}bg+Q0`JdpbZ$~-U
zw1wCKPTMiNT{mk!>Eo$a)@lt>G#4Vk8c|za!J|W)*A+nEAUpRi<fB5;5gyzbgu?)8
zmJ!K>4G8hG;pLI<EX)aCZsMBGCG)8>&X^%w9ddXlMbanTAVU9j=L;(+Ta{14Kq*%o
zJJKp@%TRV`US0<&B45@jq~Obq9*+P4n5&`P6Wp>v?&HL#B4NyJ&LJ1#Ag%#hJ05({
zRTY?QxnV&iBrMF6GTj$a?jk!<?Rs!(oJwh4zbhQ5_|Mb*)I0Z;^s)~b_eG@bh`n9#
zceK7krKH(CkL;oxgS8iq#@v89NGZNvnnUZTCpFV=xzqC#fvku*%Pv<feBte?lrST$
z>YpC9y2<&t^EB67xW9ahq>u^i2SZHy97@gXs7Y&$Hph@ZiBA}X3vOH(>Ikf^>qh^C
z;vU=E-K*?Q5(F_9mW0Dk!Fbmxlg7>1jLabEb9Aut7Dg;?f^atB>HJeo|4g<-AAKpe
zv;2Mp2rt2d-Az@Y!Dzea{9xIy^;Yq|r-^~RR)^Nd;J7mmLH7wja@6j6^PXKLbLXYe
zuV^($MqGEjL*mz`NKxcF*zj2K_AQjywA4`Mf1(D3nXM1?6)N0%>278K-qQKKI#R^f
z6D$>x;a_;h@oYUl4O*)TQ<2kC+XLJzHG#P9U|fw7CsiuR0bP)m6=yn2CrpgFq_8ie
z*g)*=*kx2&b=>9LVv(Jmnr4%e{b4~jesRsKpy9ra*2&42r<n0D-?Te&1|vXo7Eu?8
z7gGFjTkA#rbs&g{ayjeflsJPXFL(XSyz$vDa3T&rz1Ma!+W=rSLD98CMpi@YDm(aF
zj9-y;jIXsF6KKXG%#DtovI83mQ{+%(S_iywtei(cD-H<*o(}8`WbBJiCnJ^Pt(+?=
zxZxC})75;=x$Pvs*6Eub{{F^JdG@qD&GcZ~k`Ou+G}*cV`?ixa^509*2!L#|L(VmM
zzRvaGIWp<f%#(?XlQpxaU!o?nX^5yfzOv@y%1(hMMZ>#)HAj!4)8q_>yto)pW;shA
zC&i}FQp!0%VV<L50d$gcG@~hE!H?g2&hv;J_(;*v)PnF=4L1zsYOI^_k*tcsXiwc5
zsjG<XzuyAi&BB*K>fktMf4;mGo42~ROn$vm(q$qB?Qg!#QU0?rs=d}qFO8?^N!Iar
z%I5a@`N`dP%8r;@foBiE#0Ek4`;OlftDa~%>b?Kj*kTHkwl;!&u)U}Pyj!51O5L~5
z0*B4Uh$4^<2S%5@2zqeXOk_x+4ZsNHK4-?6pB7%tasrQr;yT_<yww?*0}OQSuJa1i
zZBAAzgFQX>ytE6%!DB46_i=WEhFoUTSM><<SQ40`%3sHDA4lS9Rf80`(Zpt1NOH4!
zVf_V#Qj0Io!AC99{s}sn3cnjkLQ7S??3mcsKGXW0k~m5VR@Wl@`*T>$XFZxDhHpoH
zugknu)|`8(MU$GKHh+JSt)}&;Ut+9wh$|XhwsBX}pX|@8R)tr2z7QnyH_f$IK>GDv
zYc9eC>IA(Qdg%34hayj_W{@fY$i4pF#I-UzTX442P8#clLWmlqpN%XlE6d2zn7h+s
z*cP&*aH>5qHJ%pzhdNn(JdJc`6e&CcJQvJx6R=Fz;u1wCaVAp!b$n3_pSNwWOy06m
zl$y}_7QOs<QnNmw0)^gh!Or*P57cbBw}?hZIy$uecIO)_ITVX_*3&dfy^%!~m-wXc
z(>3W#C8rxggd>Q29Hae7Wmd4jGNyZa?L`;4RuL?Fszi2r_^HUSC)Z8|kp$h(h)I>(
zaCIJ`D?6g=P-EBcR<7R8CUtf<Mbrgwp-HkESi?sTc}7fhMv~QVAsgIo+*y2316-5E
z0Dgc7;0%pcEbV6z(C&nIxD@5RM87iAEf^JFGvuntBazy<$G0Id^md0%O;o1+JnvKd
zIGaa#8<sqW@HeTx{~h1T(G2hEsK{c!jHl(L$e%)fCUpsa@XmGvYABkh!HU@)-M6B$
zo1e0savFZfb!*-iauK;t4=yY?w6e%JqVY^V+cuBSc_Qm&^dA!Os~GAb4AF?ID?E&+
z)J~%`D>h?PDZ9uhb2wlKJvKTDXA9+3E3hIX^=;^{ZJ-BF0lf&8qpG4Y7h?j*R;P1|
zW8+Hs$#t2ZeaHLvCHdtRupPxZLwbZue*H08p7#Z=DhU(1-iAsjQ1gukbjo1zjHwZE
zb~2vV4`%B?>m$ylRu@x=yS7n@Bwe$QpIUJdb@@^cy~xk0l;&p$N(obt<B?<Umi;0v
zjDAJzt|kROmT)3tuUu5le(Clz<j&RmLJkbN4M=fVp^}$Mq{WF$u6B}v+2Dr6LUIP<
zOWXth*YFaZmFSy~YCZZ^ipA6!QwF&@fyLpCs1u{6=nTtJ_I~sa*g=aY9n>yCGI6%>
z<1-I1X0`|u-L~Agii;@{3i^6f`FGB-Q&f|P!TF!3?cY5wgPx?}8}G2kXZWc9>#^3{
z-+&LdCuI8F3Aoy|a7V{YhV<dtX|PrM_!xE@p_crv_$4))p5%+MUcrUVVHVHb!AHAn
zUw`@9Zt@XzG;4#-jisXzcGsk0pquHm3N=$zT)5>>ZSSeO&TANcf3cwy*;93e{clH>
z#+sE}QVN}Q3tb?YvaW^)-7jgtnJ(C^z>;)upEOCoE@S1R4}l@bedq`sR=r>sm;9rB
zcmDfn{I2H@+asNv--xQXQMLS(U2_}b<j@ZJsekX-z6#v)(9Tu@-4Y<A*gz^Z_2L@q
z%+bivU`Q1$Fy|z{Ob{R%rwJbJvl_a?OQEiByzk6Oa+a%8n!K=Q0KSf{z)WIhS~%Mu
z6zFtM$K-ZOw)Au&(*#~&B)pvF&|Nn60w(v77zXNjGYsEZ_46N?GXaTeyYDz3xTTcv
z@++0w3a)d+yuakPpz+x7durO?s3z3s3Y!7x4l75fm!(LkMyK5@uIVuwVizL14MCQb
z6*NrA@}&M=t8fY?ot#t>IV}{rHW1xA&Auo~$jUg*@?|ttLuetrW!gRMW2_0B)(zHi
z0YvKSb^lz0R3~X_h0??ULqT&81C%dWA3wWhkO^g9_~b}<^s)JFR4<7|;(&`ZeopkM
zPwzmMMzCk182UhRBg7b9<>0s|q;tOnimjXJ=v(4ccKz8q+0B4=+j_KGy+1)^a<g8j
zr%c{<I_=X|XUgWe7~kMHo+Nh%^3oi#6Zv6%KXF#RAmS`cw$5`gfVn?vy^!2~qKv(`
z{i-*H*w)nm1ATo10k4*+6T$+_VIEomT_Q2WW_HQCKmp0S6A*T^1b2n~osA>4`5m5j
zYf%JsV==dP{^rNp)NI^p9SMOh#Ict@?zP_nAOWq-hZE_<;A#AeE_Xt>n{XuZ_;|M<
z+DURTVjFE(U8~Ow7~p5hw#g?EuCr_S`k5R4;~^@u$@3z@XAa<jXg8>vT4}H=lU9qA
zgW-u;h-ZxzLsqhCOf1@NNjZFcT#bt0pUkiHb?fiTN2f;xM7Ipn)+n<nH`98dX+PG_
zw(WeRzHnkK3Jn|q&WpU|g!K%9L>24g`EpvMTYXN|T%)qt{TobkUP6%cur!Y+G`93N
zrx5Q&@4CH2<pa)YX6?+zhFo9$eJDo%7oc?uH6H7{=P!wM^J*sr;wZqv1M0?7b4Qh-
zjsz2ld(x0?kGDRJ=jG(G=^@&?Yr^FWNEhl>WLAVn;`JS{eR85PaVxp?2rdWc=a)wu
zm$yqf7C+O9`!~g;u$%wB-43wnMn8d^I_?>wThR8&HX14*=uwDJ+bN1ZNfQhpBz6Ml
zLd&1~&RPLI+t_PQ8T2+;ZSHN{pDncWVFM<f(=4)u`1m%S^^0i`#MhNJoaR!XK1bWY
z8_~!Z5518pb5Zs?-%~VINYKX>+%GPRZC5M^@d=hA1{cx3FDv^@v>aI1CbXV+5$?P`
zK7aFuyu{l+_dqKrF{<Zy>#c$fPUG{>@V~!o%E<{BTW0HxYbVLNmGnFGbvPN485cpw
zZnP+B2~4^1^A>d>9OqBj%i`Yi+mWvy8HGC%4mHd2EDEv2TipPOaBge~62*ZzK=_q^
zARl!Y8L50<n70h;R!dRL9NXB~o;_#`z;k8c6fAqfuM^#_&3XP!$?K$y>-rTwfIao{
zz|Odwh<5(5Vcih=7;BtNgFcXKNWz4XtshQP?9F(*^;({LoK9hQk$ipza?w}#O<!*+
zJE|Y_X`R!>=nXXWN74EFVlG$5u!fKYY??d6gZBo6>Y`q+u{-#4qD=J5uqUz<HET9<
z8`MLMg{6s-q~)j~Tk=OWW1%<y$s|HPI7#@CNF5w(+#Ft9p$r;rU&XW9(sJR>7wWrI
zb(cM(eCH@>c^0|f64rWbSlB)XW0pU_$Gkp{2)oZT0<hpgo&Wj51ks+{zs@jD)~-fR
zS#5oL{AkEqv{YvPv0Hvh6D_5VaKy|tjS^S3A1Ht%&hn5;ekOhdrb&A!3cB$G9s0mb
zws|QvxRKlR)aNKPU-?5IO88Ne>qR@ltEPIqjHx7n;GEHKAdDhk^=l2o(6d3NqHG1Q
zH-MOy0ts_q2J7E@&@R%;#NYK5tf_$jNIn*d<Yr}I=8gOI1+<WV(n;cmX4h@X%Ptf8
z5Y3Uy4a%Tbucq)k;(elfEnzvE9z^jY3e}ruY)=HdNaw{2FWnt(g%~#?a6c%DP3Tgy
zYnPAdi<{>a7Ut9>*65~wvlIGEC<M!vWE=8lKdL&@Bk9yFoL;Booe;*<IoE?A#37V*
zjy&zi08a*~U&%N(;bU#5cRG&NRC*T2@fbS_wCM|1ApVnDYjcnd*U6G^0f<rspj}b&
z4f3d$-M%V3Md^;95yw7XupvQ=s3qX=)DZbcX*H1eYVObp8}H^vB#L62czk1+kA4P+
zKG45+{oIKa-!G;CWUUMS=KU738%KMKQ#la{8_jg^Remj(){R|*kvoit6dNtuy@>1E
zer!gdr+6KLGg3p19Yd~FEbvSc;dX;Igcv2mca3^n+`5AuFig}jE;)LY=_x6Ntoz~N
z*0rI(`5M?-K-6$C>x8zJpTEXn0mw()xB)-&q9oWE-TSnTc!5_^bo_wl{7v;>!Yn(B
zK{9)pPCQ~0b+}ugoOmQ(8RjeeJ>48Kg8Yz;Dv4Q7OSAj9UQ?(*vutMK^QDr|tY&8p
z0(T27l#9u?8F}HB>1)6u(^@sv7VNmEVQ?L8Wpsme{`_9AQ~yfT8h~Z$`_3S8Cc*3V
z&bjR4hfH<}wv?5G`Jzz^BCKaoa0kSSU%G;Cqwq~x9tI9go2r7tUUNgcqaZWxth#rK
zOaj$<(gQ-sn8RIbg<-m2RBWZ8b?)Z5E82GW(dv2yb_hedL-gpP@Cbn4{<ZF}&}8+6
z5b57XGEdLuL>-yZ_$1#=0_V-uP2?%31Y9h)5D3o%0ox=;NHzKx?i@%m#v3m+6lU{V
zY|&MvB_V3)vOSgn1PcZ8G0=<l&m~*Vk;;(}?t!V^`$$D)ql&0Z*9m&PeelYwD0dr~
z5rb+5o;Vl<PM6#~y75~skp6EIayb-w_UQC{oNB2gQyh!;#UGmIpCZc1-srsX_@rSn
zCR$QOWbVFmu)87qnXVO7cEnyG4P%yoBM`6E2=p6UQOhoprH}dAc&ovZlm}4UOz_H!
z0SkqYJC<dEI<zhGb^rMB{eI}d?EYGTamdEi^3zeJcTdfIU5(f7-ezow>eXN?7F=Wo
zj}5KQmU9|zlXdx~l~=#cPU@~M{z-IGhJ)Ke@{K5$UA!VzN^)X$HsK!%)?R4f=o}y6
z#k2ccewnOCu8Bf!*3A&)T+~yzh8L~TfsWGOJH}ex>Od})|Ji=fo@ndH#4a<&)>Y<8
z%AKl2f)*H(xtQi*MVIvJd&A`lQ6D-K)vv*$-IAlXalcJJFI&JHXlRcAib)UA<N=<m
z_{D<$bSWU{@uP#?%um90*kIGL+A8S&T?*^&8MZTlY_-491Gjt9z-@nR!{l8iWcBg<
zWk(XfdLi01S&!B{)r3OlBu$0441{vZUTu_WJZSy>2B4C3R#D~@y2oZueZNtr!hQVE
zJXK2`26$5WYSzT$j9W?qCLc%7jq&28)f3|Gwmhty0qGC+NPNAazC&B1s9OB0!!DaX
z>0kd~_-+sD^pG?+U2`v&Jg>VNN-}-lk4cLQ1kEWW)byh-nlZe-*r3PZGR8aj1F%%w
z#eh4nwi^coRdy-96pMAG6-ycroi1eFJa$cVaGUKs%dnnwNpbR;r2`TLyuY3!^sUIB
z(^<y!P@VwPqqnrxb1!W)ev!RSJ}(jU(&O@2l;)30OLl~d!4ZX4i)h{nO`lmU)@a-5
zfjOt7hG1=beX?IXE)=OBrj`$$3g(f569oQb+tmzdMhcbU&VaLg2Jg101QI>Kd||fp
z8BAvI&li6DDH>_L+^61#g+U89=Wo5M5QTn<FIX2yCa66Fxmb?82(eXF7|F5&qna|<
z8OVIR;SH*}*G<R%NKgz-jE^zxqp$`;4m4l&Uw5g+Cw$T2$%yLGo3Pc3yq|ooa**4l
z1(~vxY51-~g?YgWUO!9d|3>GR%St?P9UD^q#7QIbYm~B)KNGe;s((wx1u272_p1m^
zW)$Fwj@&>|3mIwYMNB-XgGEh7+kn=J_E;&5RQHk(rrzuLLI(%T3Enev9<{7|T&z5K
zAVAd8wk>N;@48{##OSa>7e~#}Upg*Rki7T9K>V*sf9gIR5<M+9k-DYse?ake)}3_h
zTYXw#k_*2vmgV4pk{Ve|@orPq_Sg`|;dq1)EK!-O^&p)^ekci+rLC^4qg`EO^iGhu
zU24B<v+XOED4GZ+-fhN+%i?i->&FHE7{*n|!PUk%7AF;TE+?-Ue=@{*ZYwK-mt&bN
zxCu(-u6LK;kC5$CibK6*Ybnbb7fL)9cZO#z{r^eBE%^xipy-$t|KYy7`EG|5Bg#F$
z=wAC;!apGAe0FjxV`q-NXV{E4oL=N(FZLL4m^Uc;V>HS%sQw;ND=2T>@^nH(y?rs@
z#PTg(<5@WIB3mT@sYo@{4wXl7_^C0gfZ2Ts;b-ov{uf)Xk9=XTpZU*F3xY{}MF&|f
z@pa9gvLSuex^^}<&TR(!9>=MSEPH?W2o$HG#_(g^PI0SJR@R-RMieVKl%t!q;6Z+G
zf782#JWY{*{z24Tq}5Vin=c;W?@jVLVy@urHQ|XqPinRUX4xr?L{ZpM6G{v&L$V2v
zjyCy>NG=loZsZEwrI5fvq<7*+{^%D3`Dos>%*wzArNyr%jAQifbegr{vD{|B0WDC?
zAZ+34i5qrbpI?vzZ|Tn8x0JYcfm04ZC07#dp$2oZyeev~{uzZh5?jdZAD3B=EsU$u
zSP|sF-e7e7re1e35DFB^z|^{fTq3f$dMh;8eLvjh^&K2k`k-yvF+SkJ%I+EPvTYz=
zBI&P|55<%Jr>N_2g!=#gBqfraB7~0-Au_T<D0|)6JM-*O;VzOb<YRByZn%uIx9pX1
zciFOY*?a$v`h0)?!0X=k+^@&;`Fy<QY0N##zyES=x-2*TyjK3gCdJqwkn3V!<Zx@I
z>CTh7Z-^j$6)E>!mDnSj8Zbs{-}gAr(R0f*rX*8!ni})jMz{|Iu%W^&KmQId@Z@nI
zH|f%QoUG7h^;C93q&sqco^y4)@vcak@@T*}iNn+^`V6g3TICg+w5P!=3E*+d6j}Dw
ztG%{|VLdx@Ta24qvM0oS>D2>#L86%&oW^(2$9nE-Jw*;EW(CE*364-af<MSv_eSk9
z8?a&Yl;Z2Y{qFD-+rDeRe5G7klx(k*%L_dO^Zh&Cr0l)vZ^LUqWmRPPM0bmh^q>CG
z=4fDWyUpQ9Ve}s?9d+M@!!dMqL3%VCk|F@?DOjng!;iil;7f}k+^>%j*vIf_>|>aT
z2z%~WZHc-@Mp%`R^CoJ8*W<P!;7gJ83fuz&FQP*a%sa3=P53t^No_ykL6h93h?qh~
zbS1|<`?PuE7fgTZ8g1T7$o#UjdT=y$KLCBaZjQI)T3?^{jW`t=BU$0HV@O{wcAU1R
zqqYWoLS<!wsp&%<#MixQ_l}lv>=|<Fc{oy8WXxb$wCE|LVHEwYKSN9)@(dtO?b(%~
z9=n6;&%e*LyDi+511yF`$pjYujgLhfGw;u-!Ak<>K{avshSSSO+plZXpic&C;iwo1
z<F)E-X_mk6I9%mKv+C~5`_${Ek4B7O6*H7kzCz5<dt(M+KFbc<?q8BX(yUADt*n}#
zeX#Lpagd1VS3Iw@_toqk<qPK;8F7lN!(;8k0sT8|P{|8}e6E>bcH?C>OjE+XyXo>=
zoeO<K8Rq*FBz}=Y;K+bmq^=M(+TLl>J7t_%v@)on4RBcAs?bo&ca&Y`z~)A&XW(I(
zF@~je7vtH%Fo{3?*PQ+OH*cuVPY2BdjFIMxcU$jG(a5lH4e~@uK_*W5#ltbi>@z!$
zK#&(c<RERI-j0f=_87KZ4+C9?ywJVj4x4q+uI)#wPkNloH3%`7s+<c_GR|fO{ZzK)
zw4VG7!}5;WjhdnTM|hm<>QRfpher5Ei-`{-T{8j-8xKjFQ4^CYv;&TQR5~{bKqk8~
z?)`;~8xxvIP+T~-l{!)X<skDxyrjOa<&Lgg#B)&3X9wej+XneE#kD9<Pnv7j-uI!(
zj3ln0qsN#RujG*{M^uk)?>oPpkiv(6t3g6JJiHhliRWjuqiBhokuCLgJ!fq_2$!%_
zX4+SX6&ZGa6_g|233TB|L}7jva_Pb>2FN3wq*W#hTl?rFI^yF2cHOwRZnLB_8--h*
zxPrQ4%oY7S{@m^u3e#ZC_uKRY^e*nIWOFAkkaQXmIEkBvQ=B`38Feuz9>)gnoodXy
zpA5fmVkxw!&mh|y1(nj`YvksF{ex^_I4qLX&&`8a1?6;PqnF|8+PsAsog<CHz*h_X
zQGDC)%C9Vu2gGu3nB1$~JmZf843$l0ng(5ln0Lp<%7nHn>(<ko?mE!nO|M~4kh*li
zo1Dy*bxk~mqw=g0Z0ls7aP*XVkzdg7M@?@9op=<qv{<?s>`ixOhv@{Zk22U+5Wxjm
zZ2|C#rLEDSN)ppuOzQYw3t9A8-@A4A4PRw0mW$QcB<PGre*Es^v}gRQU7KlH2bBub
zi&5pvF;I|B`;BKYL6J0gSOId!_gFb``34(!z7PEbIT2gEv`0eDk?D<4ui;WtSp72@
z{@t?v`|}LQNCoCXA9|4{V4LmKb-^$CO`-U#%=jn7z)6?xCeAj!UnBwl91$Q?tNcDe
zvyzwky`|ILDo1<Ny*MrOy~Zi>!%Knh5};z=o~b;YkMjk`bknT5Fh_`uwa5>y0$LIF
zH7VBji7ITmANwbrunJF0NO7;pM9x=<uAuZVK>H$N`BfFoE-Ov`eB+#>C!5&3+P82O
zG}?3WwXf!AV9iiqr6(EVIGeRxoEwkRU2@Njywul;L7G&Q1l*rFTzp}m;M-AUv&wRA
zO(Vq1ZsxD_tN`0<<|ZM7kfn#s?XO%|@6~dRe9^2m-Bkt4%?8EPLi!uJMBupd;dYz%
zc>CbSIo<}jCwt2<N2g@(EmC>#NyDS48P5nre(~HeBj;fCG<J~uebX|p#B9a746wCZ
ze%E8~=D01sLcL-lB?&Y@UEMmH-rak@JBlM15jodajf-pJ$q2yyY7c+|b^Xwr+IHHW
z=>B;lP4q#=#<B43rGe0W9T>iyGnu(s3efI2DpJQYkLg<g@(aCPEnrFXotGKj6W`pR
zV_WGH#`4YEluuWlbm1Rw&(`+4Y)kwn%fS{JG5nr|1c_%MXLYb!$NoQtNt9eazkn+9
zq26O!sG7qPWqoAe$!G!F+P$QsrEfRPUtuAsUqhiHhYTw@r9cUkC|DRXdN0X<-ecRh
zxM$?}DbH684nVeI0*KdR=4S>Z-1m=R28J0J7=>tp*SNtus>o|<eQvyRl<M(&R$rYW
zlB+IfIZF`oMbHuG<jAQ9Kpv4>?9y^l{==!A#_GM9i0(<<${A~4q2KMhV0h*2Obp-X
z>H%u#$iwuHhspi(3_s~DvsI%%yE0-&kXYl6lH=OmYxX+BRW^ZHLK~OOtM-IjY}8?u
z{F%5%pf^fB#GV6@Z_xro4s^xG^#E9E6)u$)RlxDPZtkgV_ozkro#K1W#Qi3B>4Lcm
z_xWpKuFijdDlZvdFSyFQz16s>FMwWNV&O>d(@%vWI~{+Rd#M2vRW|NV&3CkI0ahjS
zel05iBNd~DNfZz|K;`ld9mk!pH)l3Xxtu<|(9%`f3VuB5KK0!70P#28lH5vi?VH9=
zVVzP}RT2Osw1Ma{2F(>n-x#(Q;C>bf?S7j{jTz?7b%BKJbnWq5nlglWghIK#z<RR#
z)^jU`u#$2GqyW>h(ultHu=G@vZM@iqL`h^ICRJUg^~iU;@C(MHA_EV&-~8jWl;Zd)
zs`EJkiq_py*rdGuP@q5`$*mIHlHU~WD~mUX_J}w&O9d@%ont#MCBEa#pne2pUtH*T
zv>)lLK_AiMfZ_)zXj_v*U0b1Mp^$$*jiPyvessOAD(-x9hhuOmJbhdrS2#ENw}uY7
zaS85ZLFTW6PkKWMOEf%rwRse?UiNOMj60KvW1(<sm=@+#rN{U+5NA>X(p?x&T@u<H
z+<EMlIQ<=A=;y+JVnl8H(nTSnq}zZaR!pq2*;`fD%$EE*jyh-hyiy#WyM+E+Pe_QO
z)6?)__n7Y4L2EGP+-`tcGw<U_k0*QIkTi3|GZ*kO<uA0La{H#JWSh6p_CAW@wFD{T
zR~VGje6QmW6Zx|ivnuZGQ!Ft$8NOY#DZ}J+SvLay3~^nzY)SI>xFd7$>*JHR+Y(i9
zofTrGn)uRl#ihJ?m{gC6-_#_tqlve2vQt0l^o+z9gA6g&<hP1})@Gv$dtZq$p3VPC
zNU83NC7-V=uHnPbf4=hdHNJNV#_ws(ZZsY8G+Pn#Upy7RkCJxhSXCc7X($5M`hc;z
zTUqP?Z*)oh!xtxm-dWp+pH$@W!8=8|P;}4!Lv@KI6|ISNmlJTD<9BZMeO0NYo?iF|
z3sW$5Vn#{q;J*Y<q8x3W9w>XULvOK-zoh5t=$*COp>gb|b7f7D&wyT@_b_aDGCk>j
z_53-Gk3|DoC+&J-dQ8+2{fRLy($lLzL=~=)+s)k3j_2^u@pG544skwfx~t&{YiP;k
zp6UOTL7~?FxV5m+=i2#CSSX>($OQ8zt&Be4?u)AJL!&C5k|r1R5){4Q+oT5{R&|Se
zIx2XF;D#&h)+kyjL=;PcW2_iguJ{DpGkYEw=FH#!441O7;%E4U4O?UT8rWfrxW-~^
zww0ipGeQ{ooklv$@z0E4;AYp)r*WGLT;>5u{_m(Q-iI6O3)-YFBz|zp6x~7MQfn{`
zM$rkM^&H4ym9o-UUCi%K+dWH8wRi7=>`*d+Qsj}ynhMflANmH;R&eb}4)KBEl<h*@
zbhN0%3_{|Mp@k8g&gQ!xP6#}>RxRAkCz5xEYnjzi&mpmflhqSzX&(CR;M8>2s3tK&
zXZ0~L2oC33kl89md|CA51R6y!KrnY{<VSABovr=)@#LzK*weGlf8-ba@99V9a{QI>
ziX#!<(*98G2DzJgn%1`2R~17z$-UARqu&;eq_EHE$uGgp@N`CP_yify<B0+%2@f2U
z#AA8&FmB}z1w~SgLnV?M)wA>RNK%$KhlpN1zWTn2H?j^LXonfEsv%<t?9YHZ3EPvw
z2Q_!HPETQO^C;d)9V&1`HTR|>sT?KY(3zp=X$rE{l_ZxzZHa3eak%lVel-vQP>aRs
zq9L)mNkFuZCz;gSFSRZM&^3CBJv}F-ev+l-@@cu+a@WN_V4ECm^IDDK;qF_FmL!=I
zq2@h&MM)`=?<g?-xy;Nhb){wjPi(K~V|L4$;YfO|u0T5pr9mmLOEzL6<ZmuwymDxS
z2+>#(FTj7q9hff>!j#(sh8P2HQw|jT7AM^vu{#B@{JBcL_>kTtO60#grav3JkwK7L
zSfxbC`~*U!wHtW@3$|(MG7K&Pnur!OBnclBs*~5elS!;EzK>btKt8r>k#$YM_r(Mt
zjY6bw*h=arK!g!lnp7o<uG&A+B0K6_uI_IJe@JSJAIo{a-2*u6p*Key^vik;iMNn@
zQ%_oPVXqHeXPV(tRsB5{7&F+4{WPeVjBT52=qvm*Lscu@Tf`?9_Iq)F`(}&p3AC!t
zIa-z-N_ti1kmUS>Ummv<yLh+G5>2Q!=*b)VWr6;toWo+x*z5^$F7EhlsXus`JI9tP
zr(O{~$Ex`XW~)2^j!ac`#ym!ujeCd6yoanz4D{1f;<4NE0TL|FZfS}#kH6i-H7^1@
zDB1G|q<g5cghvN|YRa5ZPs+SWFb)G~2G9p#;?J(-mgqT5y+EyC^p~WhmJ^{h@R-}o
zS=z5wBy78^v^+BlwVg#lHutP!zm(~@@N=HD+M1s{j1{Z8mc$U5ZhjwhpW9CIv7Erv
zSk>^~pW<o$eM$}9X5^ZcG?%&kWbrUDTu6iS*0tiD(n4cDDo=Qu1hArG77)I~<j4p9
zl~|4R60!s9?e%n+EER=$Yk3zop8;2;RKNAslTB)}ZUP{ZkMZa^-Cx7f3&Remx3WN1
zIi4M!(GWz^y_<N!q+zcxYM=WkvbZ-gFb4<kt*v;L(m-k7)TPI`Pw6A=5MoFrRzLcW
zF^lerV+b<b%|;5Iltf2SKa;-146SVT1Zo>BQ7?DBTx4;|!arleBY#P6fnQC&_iiVK
zg{hX*e=kXZ(4@8<5%gA%tA9x5v(8fYn=!8ApyY@R`Wdu1&Y>jfdt|2(cGkz-NB1cL
z#@1=#LPv~!)Sn1dsDpmrJNaIfVcdNWo04p|xMW|)A~Me-W=`1PylaU5FVSdga~3T9
z>~8WQH#?oa6>#)CD`Re6@#dUrAy36PILvmn)}IVM!Kpjt?+67psY;`M(1&z-PQ@Tq
ztjh$9es5UYiZU<h|DvPr8&q!i=v;*_KO_EA$v}*G4Jpe-9KFTTH<9cZEmj~$t-!a{
z&A^IwGvVd3@g<mom3logy2W?dhXGyC1rq7ZiD8`CTh2xp&Pu!+%ro{II;kxn(J%gu
zQeD_^E$lE)_^Xf6fZ+Tq#G$RK$>ZMaxNsHqi}3G}BL3BIn}mIZFe5J!Q_sjbX$Oho
zYRqif^Syh;J&xQUgq{zq?BW_$JJwKCHvu)6%kRDP9G|a5|HfJd_v^0bTUcZ(4qyL&
zeaU5by>M0F!Rl#iK{P7T5rYaZ+BsLXl#DBQmW0|3(^fKs6ue<1HDjD|8V5LFT%~&-
zK=`KOLjZil8(iO>S?4Hap=~RiFgZbc<w{yG(4Hy&vH66*DPH^O<`T=yF&;SCk@X{-
zJymz%AR$`az1E&kJAk_5eNv*WFVKlxN!Uq57IZo~LrZ=g1F4OAO-PNulpPQc&fFp!
zvJRK#f?o29;>{+f;F8NA&RpV}?9mC9uwc15q3gMT)@tzjIkfFP0#{03<dOpAN%Q0o
zxj51Xn(x#I-d@(id{S6IV;;N7`dv!9NF1|h_o93NI#`fZatwR?S1PGHt#~A<yGi2u
z(@x~I$G}~4V{Z*doJZh#8d_pQLbnZ40R3Sr3NRqVF#x*<(_;)791hL&6EdCp@<Yn3
zAhG%ivkcDYsCnug5=nM4sM~`Twjwm@*v#=1aQAOKC3Ew0p*r8)&}^={SQiP{dgbEG
zy6PpQK1we!kowh-d@UJJiAfpbT*|7WCdBrrjO8$Agt?XP1}voqn@N<4%ZP%kX=t_$
zU=k@Qeg=`P!1CW`9XnM?CO23^uST}656M3e7bEsRZD)5oZzn@H9LI3qF&DT2+=`TI
zB~oOPE%C`Y{1g~o&{b7b!xg(IjA*x8BSwC7skDMQ3W7IA0%Kxpbrt~(AlDZEcP><a
zD_p*g(^hKmFRtaENLf!33vdL!%cYH_m*eyvA(Qq5&ZqihmLrUGO%(S(Ch<u1<uky(
z$hE(JUjcjc)g?)-)gWp-%9B&r(>ii``7)T&P%TuHunn&kR2>8V6HTPY9tJ;bcGR{x
zcc6JqZKz+?v2$C<U+1As{iZ#g8fM@?Lf@eYke4{l++Bdv&xg21{(KNPV4$r^2KNFz
z5FqS^M~Ck5IJNVBb@iKAF)4@sbx@;pmL?B6nWHg%3kKKoQa$;n;a@+RWe4Y@vHHTY
zrfFf#I2*WG=^1?)1HE&`4=I-1(}*)GK%}E=Z-+b+qfaf^WF=o$H##OnUP*zMGfD#?
z$k`Yq2CZGq_}YxGW&gBXua}bIc~(xx#57vpHogCLx+*JC%C-8!GzJUMd0SHTT$3(0
z!Bo$>9W&u4;w_yil~Z@ELdQ@=l39VN%KONNJ6A(aV}i$b-1ZC83QIEVioM|>i51<}
zYRap3+qwcRrCC|T$;Jzq2R-k49ZX|0Do&(QkiBI0X5z&+$~=%5$W+ejecSsdSmDCH
zrh!h|c1LVUhyK)uWmlNXgQR5TANLJE8_l<_@}B&N-p2E70Q>=e_JZ(Ve$?TaG`o}K
zPrRgQitkT<&Hhb)KS)f-Pse&jV&ja5Syo$J1D{hCtTRc;1z-l>8oAk&EmmcT*kbx*
zN><#B(blCgE}Pm{1PS;e3l`5Swmcit^!`Eg9NK>;0>9!n+n!1%x?|?7uZ*#2v|(<e
zJ3e{@{a45CfmMS|D&2S$pxHs>HdY8bVKehCjE?8sGqVa>9UpjE>gcMa*^59SjPB==
z$_Qrs@Y@x?0X`G}#b?0hLZ{t@c<z9wG`p2%`oIaYhrTV!D|5bZy#)i_Jkmi=KJX&#
zo-sPdh(Z&TkDxw&b{k>NS?fuLx}k`4=G{;wo*F%b{qy&(bIo=+X?EUR@{T6tmq<X`
z)5#t~ME`Siz5cIT4K%D6xLQBJ<F-M6*$og)PV47#prfC`hE(upPu2OS`mx^h@*iZ4
z`BR*%@c{rw<z+*9cz;K!yUAa9yb#}RQu%9)VK=z=)!SGu_AH+ror(tpbGRS~Sv9;=
zWqdeQa_&UANQk0}CU|_1V!Q4-%ky8YQ1oWxXLq!9j#us1S&-hO9^)%-1DB%a!{q;_
z2uk%n0sa=O|NC1q7d83>{F)x1_Agc3G=S_r`Vqy2{dk3f93A@Tlu`C=K2QI#S(q7q
z7iKD?7G9GxP-d5%APZ;$^mCp-iT6R2t7ZJmzjE+CN&P2WU1z<_6ASb)&8o68<WE$&
zJk%@Xb4z;!UiQEdMU#pPL;cz)YdI~=AzAln{h+}|xj2Wl1}rFCmnj4ki!;Tz^G#sG
zG}LX-eAyK$kFgK!8<_W1!;XVeIhYlsojvh&`4xo@-eiLQWoZP{oiB`p&K}3QyRbrc
zTj6qaAU3FS*nD1qwPr$#-<xAYo{T<bt+lwE1O2g1W%X!H%LE1#;~@Wc-%;xXX?6qE
z5$_dy<Dlr)i{g;3ziuhF5QuLr_Prf!mIbfv{qC1t{j^3F!eh3j?E02-^`(01y<rK(
zeW4{il(*ifu2=prxR{~Wtfh5Fe=V6cwRr3Neje<c=Wq52{Jyj~_Udqcu;GRPzM=E}
z-oq+PGSD$`DDs)7eSc~fT*(y6<u(-zkFdDI@8fN?0}h&p?b;RHE!S9Mgr%zQBR8yB
zM@s>_g2rBr$z|J0UL(t=fIm~f_<fb@_`S{0(2|xc|6|JLeB;U59`3E6)XkS@h9(iI
zlGk)tOHNit_+&kVNmnMOzxKg0^C~S*&@g$>feGurzx+s{uM#ZhWB^uR5nXWnrxo^R
zxfw5CI^FWf&4r}s)`KixCW{b_?Jx%FIpQK$DXhWjC(~ssD)W!hlP@gydN5_bvu@2C
z^`^#vrppdKayi)zF_a%XSC9GlbKe1f872MeEv?}Sg<L;%{m*#Rn45s72(eo^+(rw*
z)#Q9nccAMEakaFKfE3fO64yOl4W!nLofDpS>ij$+>d)P#mEI`B)a2D?IP-04KjFLa
zC$k5-*?QPg$?UC-Y-S*vn4RH)6y61l4wRg5JXNM*|Cos8trJ`H1M&zI+K?y<_YWGn
zHCO;#lv%RN`ROMOtM-=~rL_W@i4c1V53$(SU7`$sx>MKX-p!*tbvGBx8?GUxA$Kh-
z!;Np{SU?Z0V!!8<F5f<|43l2ZdjxG-m>*W&_psCAt7l;$wn#(!p%`|=22#H78X^yB
z(sZ^r$Qj4SS5A?Re|?!`1H9bsez=`vMpWG+yPO}b-!%k)^auHFX}8gf1WAheWtasl
z&wi2NB~GjNKB{?JJmMYgZJ7JO$oj5GLC}0roinpUK&n6BiY1K_zi0Jx5@q;q+cmnG
zqS2k!!uE2O#A=v?d`~`C(Tv3J3mj#@chdYilbCrUzvR8*0mKEsfi>+1sA>(JF{I}#
zXY|ms)#<CZ94D{5y)>-9sQq=S_#{pB)-3<^CzBHInJiunN#*eV4wHhu`J}f-T<#(b
zfYpVJIuhHQOpLk&#0By+ttAtV^JH%4Kay;sj5BG(@pxoX3(@Ii7eJr>s3euys8RYa
zoXuXAW9HyxVW~Ydme6Iu4MsRh`#6iI{(JFwBtqgrQY~nywvvB~bstt>8_*{w42qm@
z+^b1MRGU{U#gE^t#LYs^grmvl({HJ7>?MZJ4{G1{8lv-0_)jLZ9pQKa(U_)9i3fh`
zQq63Tb23X3{A9)}d%C`Ec*?6bdG|@QgSlP-Ff_}$P<&;<Id(=6AxK6MkCJuXkr<s&
z*8hJ&XmjsGs%}OtBb#R+i{1gMag*b0@ngw3Ft%{YXZ4$8+tk>c%!l4^E8vuGtuW}F
z(;~LdCfd8Bt<jtH8#Ht}uJ-us3Qi#YTNk}}l+K*>;o=xQzj;cbN!W+4TGDTwwIWEP
z+W$4rqhc<ad(T}~@V&-k*SpLZVE+^eG&o>eewYIVt<>?WN@oFRtB0+6WH7gXIb<#4
z??!)t-iIpIKKL8)n|Ie_j#Bh3=T<~q_SEPYfCx7cvwkzLc%>A9XzB0UeksxOiFxRK
z=)sZRUNyF;x7y?QKE3W{trt-Py6S%^XShbX>bn14L-}S#A^s*@Ior+FK>3Kpm)C2H
z)_XFvhvt{Wqmv@lY|H#rP?S8vn*qQ)QA#qkN|u^W@8pQe$m*>fbx{BKuY{5UJQ8nS
zgYnVt@l!c*x&Qlxp3U)^NAI@FLw_!i(Gw;b6>00f7;z90t~6On9-*{v{e~68Bkj0S
zZC(#WhML>UrZ1zyQ&YDSeEO;%!Y10l45ZOU0CU5_V=qiWNhi?M&2T2?06)|k99^ln
zas!Wi0R95UlnL{=L9>vi(>@phO;t_srUnyZFl>+bi7DbofYC#QqRqbD^u}F^CO;nk
z-^b@?MpSqi>%C`L`stUo@O?xnstNj^h<UpEC&)Gz=*>dSPgjJW@1}36wFnU8Y3cH2
zL4WfORxH7RUsH{(WS2X5qr-t_Se1VdcJ3c9S9Q%$uzEPyGuZKYs5^0I%OLMx-&<Xw
zK#gQTV1Ihj_`9vJth5xN|4|jPZTA-=!6-XVj23w<wCOqfYUEO>qH?zQGzQ+4b&+~j
zB3KOU6%+BFK)V?#CjH2c2RrG#F@4|=%5naQoAGZN*7n(4ZIV5~I2>|LpIS$((<)Qu
zQ_m@S^rs4GY}VXJb2kJTCM<;)O7vuqYXgKj6^A+@-LiI^pHP}xb}z>M^VM;;-B#h#
zszIg?yvQh`$J_p|%$!~6_C(BmBl$`rvN;Esyf8V`xO_*qXTq|1v8=-0-@A?V$~)ik
zHR38qdF}UBKdi}6eOr8ls=|Bd|Dx+?L_7YHdK$yuynMby((E0Y=0A$sT09BXBPlAb
zmUy)KqoQlX>jmbg9d|g=*G8h?_{Mo7a6BoRhX3S4=Z&;bRcDcwcOd}tI6Vj}l+TaY
zcYKA3;s1~ScguJOucw$+&8D{u7A|yC;w5*~QGiXh_#*lm<<~{lqMuz@Wi4eu`u21}
zNvU%XXQ?$^dOAy|qAXb&sFp(H(9F_iOnH$0e?E&iYTruRw9AscRy8=eka+h)N$SW)
zGx!mEvk`D#v5_#v@mDhYu-<I)*W!Ta*ODq18MCQ_)(t!yV3_O8sRRV|{{;F}x{f64
z6t9;KEZ(vD++akCqEkktEy_53h*Xo7hz?olpo(L?unl4&@&c$-DH;L-7SomB&i75L
z^36E(kpYlZk&lWEA9vr(sQhP`_`T*Nmo^svqkE?p6Lr{S&XT9m^D$1%&y9o>CaTtY
z&(n^0vtN<i>K1lj9H!)&6ue(TNKNv@>xur1z>ga<i;<6EZ^~Ge;Ca%1>vc1OYPPh7
z<B$Q*=m#0*|7^rb#<8JlGXg$j7nd7|6^V|myc!oQpI(lO@^xz97jZ;0<qzzb_Ptvb
z-`<5ydD6x#TTLSDGbTDfY{<u`A4Otqf(e5RI_6KqVB=0abkk3R17;}*2xwIh)1P<V
z+AZvk$vw?F|0vO_?cXzg^7<poM8^@rmx-9paDW;tAmWSOP7%d(t3*i{NpfB{F4mwh
zJ&^tPNJi@?#DIk4oPgjtE4dn?>kBfgTT@?I(J`&NA%LsoT1}YfMM~bMLJRf^_H1tp
z=)Vt@7QP@m27d^L)OS)?$<U_2#S69yF{y9)c+WMa0YA_5|D5(^%2!BW!+%-;G#~k<
zjb3Ikkpe=NHTbn{=-5q0o<drLNAt`%lg3BvJzaD9-JBT1Z|BEj#V@U2ZFQD7dXULb
zoe~lRw8C6P7jnznJhr6FFNo8Jr$+CBb>Zy#c6ukTE_!JS)?WKgLrQYHFkf|0RG$!`
zk$%@dTnZ&#>B0aY^Bcmi8)F{fBypT~Kg+(YGE6kf-8oJ&%xPri-!*sId`)AC=zm5_
zZHsyH#+^t1{h5rB%zj$(d5J5sey>0ZiH>~(5dGJ9GyLkrp8a;&X!ErA3-DRI<^^G<
zVx{^3SLKuDn~c({ZfcC>bC3ciq)0}&&qe~FU{s4f0z4;h`?@~?L3X_DU69AhH_sYR
zh?69_+B#oy4rKhZrlv0-D9SgbIvF|YnS3W}oM}+OqKHGha`nC*bJnj?b!$BY7YWZ!
z#eAt;G>T+rNM4blEJpLpM|txm0fEX|$GnfYyW{h_p-XXo1hhkpX$}eBEh3Ro?_#e7
zv}&11zQjNH=5^xeW@g0XkmP1#K69nxUr~c4y>>G(-HVZJY`tJU;!SxQB1iTYPu$-#
zdZTjP8y^oko!krdOk0g8o;R5HJ%^MFPT%gVn6}AqPp~29{<QO)-XGX57AQ+V5b*8e
zPY!YLY04*=boaMg6f|lO;zZ<dqSCJMNC3vM&TMJ5ym<EEH-a2>bwc1tYWa9sUkc2}
z+b#3+vJ|hib@t5HuuIWegev8-HdGZXzT0>jJ63R^>MY@;C2eNkVq(9SEoBnSGVwSo
zvcLM~-R02}a4p!`Dct?6L(B6M#Ch>KR>gy?krHnz1bNA+$nG!Ggf;JdUcAmvoWzlQ
zV8|YDpS6tG(XVsNVq3!rNQ|!AyndxVT{k|;xUskJrA*Y?tTSUJnr~xAH#6x?9+9bP
zH{YTBrF_44Onu-u!WPRJGMm0!e??`irTjJQTBNOCmQ$+ts)z(8J<Z=*q(ZW3Zm<Hh
zIvSbYF#N?s=>To>`l~CZvq%>fx)+-e)5GvhV_J}ifPjFh-e~-@HN<+mEkQe$BRJBZ
z^6Qr(BWEX~>CHFP;6<u~O1%hSO~Vc=>-yNe?*=GhNYn<NBI}RV2!xJ(EMCbRxoB}R
zH&GY6avreJ%&>B^!WkbyqHTD4Nd>#C0<`<=Uub{0#<o;0l0Usta_b`qdh~KCq1J1M
z5yN|_E&^iop8C;oNBnN)eQF=We=j1{?L*)?qA|rVuI5>``iNEws6gof*6@tej8Qx$
z_Mp#f8E6BSWX7SE*t&mQoY8gq^WJxTc!>W$U6#9xsShN?|B*Fip*KWw9k7wq%5lgQ
zYKw~g7;+w04-!XIh`;fIcyuqNzg}p##v=XXodyogJ9>;^Co(Wc92UU~%VH-3DJ~s^
zfWbxD6z>nWySDG?BgjJ5LwIM&jV&olZpiuNFZ&=We4-QdGefbSollqGuNDp0m7NU?
z{VC{QPxj~j=;S)DXS@8Q?-domBy0N41*@5`mm;6VsswNHHiD|$&#AAQs!RY9`Vs3&
z&pM)Zqf^=8G9-p4m~HnAjQS@W{)L74PZjindI-JTb*Tl&`;g@|1h>gO@{Y?0L~w6d
z)+~Vn|7~$6BW!6*fz0TKMb}$m1zGobhNDVJM@(ZXty%j~#YQGkk2ZYd{UE`>7$OOF
z%zC0qlj(qZGp^a&#_te{)W7<tH_)r(;{F>6=?zaFZ##<A=*#{7@Q~0(;+VRjO0?&`
z*q)rrnK7E#O^eFQS>`*180~T2r;17>h_8m2<<fr%9>3eS0^G@Bjy<U|4;vMY^CMDv
zLaZ+6<U3O&=&X?SRBedK?{))-YyY_1LsWz2iy6jRSbCuQo+l=;<5fe+Q8JC`BTv*$
zCUfH|j;@6T@5CHYN(v=1=}Ci)_dl<`cu0ToaF%FR9^=LpdN1%ZTm%}8rxfove(lb0
znY?adn2O6J%{tt<3W?2S?dV$YuoyqK<Wo5OUE`thOU7sNFTb(P&axN0<g7M=41k(J
zJoNor&(=DkjmSjFR7_$ZF{5$DZc&$h-yG|w(Ce{~#(Ljpm#3zaNY|3_OGavr^ENq;
zx1AnV@fIdFf$GhQac|akEnp61EV`_+{DZ?>K~nib`FtQt>`J0{5Ux%=Rk8zJ#hz_!
zp5pxHAD&UgSH7_OEkq?*A+3~Y(WZUR>g;i&LdL(LDy%}H?CPF4uY#Qt*quS#&z{EV
ztJv}G$+HH|P|c{GHRCw8y>$8udpgdP+bQyQ;3Ok+7QE>Xcb^3E8|Bo=sKYnlo%X)z
zaU!whmBJdUQQ_ezYr>rciYrlaP0(LdAHCL&elPNnH?DV8&Mez?9r1917d{C~7TJ=d
zZz3}}Yjk*<Suaa8&b`y)F24V9>VwGwJm^ZdQpO19o=(WT$IlbNSZJ%hrPVDNc|_bS
zNh}Fl^6cI=SwhXq;g5gddKu=qBCiuT4A8gDQ6PYLlOsaHdgQmNo=(T{gmhgH^7!~F
zhEaxf((rmL)Ul>8n{-|5nP)$?nov7|Vn=<5a8;#_R>sO7a?|!P1;g&%SX3~!{ha^2
zq*UNas)U9!_FYNX!i6R~<vc;gv9#ARRM)@dqB~@kLuB_1@3wxNjko+nWpQ`g6J2Mj
zqv_Hja+MJX)CbTDo4>Scw%YK|ux%G!$dmrIqM*1Xdp3OAGLV#lBtG47%sIoyO)xZL
zAQlx;&?#Od_10ONr}dt&x4vjzB(B;|?lS&Tew8Fr-ERLyWtd^~YJKQAx6R$N>+v?>
zvT92IAY<sPH+5RScKr%_yDRoX<HUr>XE0;CerA2l_~*vy(etYVs>c?|#mB|?=>?)9
zAEW>ElV;lt<m0$45n%t?2Xg$Ld_t<JwbU8##cPtlKkLm-xU9gH64-`h8`)Sg8DfVD
zJrM%ua@+@^zSpo1T&qpf`wzvPcDb9g*+GiX1eg?_@Dto0h}^mR0XEGvdyU2H%9LK@
z0bO5i#0V_*KFu9YHdb9neY3T&#r-H{q6dDvNAS8tyxFbyojPl@`^{W5u?!H`BXgdk
zl7e0Hn_h#S9}BoeT)M_X`4KV^aj>o0`b<Wt{}u&-HjWnYg06v*HX-h#Pu2U7d)Shn
zsc!+*Wyn7vl&WsL>7lY;sLdO%M>RJC_*|VoymgA0aE_U9$+%T^8VR{0fzI*<tH<o1
z>DG3jeTA!&yi(H!7hX$>-r&TB5t~_g8!e?P!PTJ03&8H)DjEk-;g5OEPTThN4b&fG
zA;;wPEz)M}i80C}=n5&bLiehZ8ka#UlR81n?RjLOuxYCpE%+)saEE>PB_i%&t9B!+
zFWgDEp(dJjR@V{-4*FzIV|2Yi;u?vu4ll{0>ujHg>EBVL{0zv0Cp6IY`lsD1zS05d
zYe+6@;d)=s*3P_6)*`uRHeCJCI92P51)<^NTy2Ez;#P?tC62Vr?CmrbII@$zkKNq(
z$c&E(o>$gOER52MI~a`WZCH!+7xq8ayqOiIVA%d3gc#_T=qSHYIdpu@%vgh`SK_@f
zL`S-$bv|Pk8P|-#x-_oi9$dQe^ADze72KVTjIl@EjvksyGS_61mT$8*k0L%$VxpTx
zyMaf{u&AAz&-~@|Xvb<XC6#JO>yIX0YZ?5FQL_wJN3`%EoqJ>jG`rD^x@<6T`;B(3
zN@vI~tJ{H$BKWz%ad*n9c<)V@VBVA3E5U0~AB4iTTe`LH`%1X(<C>my4iz5uoEo->
zRelMgaW4~eWkm*mcO*OYra6U4bj(QJ08O{g9C}7y8cg;JQG90cu><-xWl<WMXs&zr
z0jmL{$VTWV*bDQEen}l9Q-ph4lR_9;5>`AisraO-gtx*&<|afGa>Ke4#85ZvPIqkU
zd)mLRhp&0vg0Bt@QnCYi9QFmBZnVG~kQX78^4bbK6O5Ad+tDq!9{{ZUb<1IpAbLC`
zLEGTBx?pW%`@ylcw7(ptjz+z~w@`J<^^OALl`vk3Fp$^my2;6Gq<yZr6(u%Dc*8h9
z!TqUG(EUh{+d;(k9J8pOFd;L(4}uj+Z}Hx^d}RqLE+s}ugGh~DRN4p|TGX`*avjVX
z6yYEe#PEi?HSsR%D?jAP{3DBAU(Ca$Qf@-;u6b^i7|)f&V~p%+RS(Bi-E%rO?S{{p
z3`w!eWNc}KO^>*|+7h>2($=~r2dQt~?GMDgD6A0_e`^@M=#E&)95Y{J65i7*)(AfO
zDt*o$C|h{-DVyW_SpTL5X<qsg&EAgsHhc+}O4y!2{p|IvyH24LVeyFP+I!W6#11W>
zRdkCB-WXM{jPYIXd^g}|_B3y@@kHpi=0(bjLy|$EP$Ka_3jG!Z%#HyBHAWf&782&f
zW@t;8Q!OLxZ}$%|mwIXAsqZffo9C)xS^T|CgX&%Ifp*z9Y$ilsIl96+Z8u+U{rG0%
zOv!oU7eUsc=T+PwaBeUe|140_-?f<bl#Ta3vV-2ES-)~K_A8xFHL2QmW~XxY$ggzb
zJ1O8OwdMMj!H_Q~q!(?l=lJiL^ByA)GXAR|_Au^dF7Ni%5xvkZc<bQFqj;@wH?l64
ze}bURaz>JJpK9|OX1cv@s>}>AfuL+@-F($oF*l$O=$PiQy)0gH*!F33>4DiPJ4NgB
z!%^DwXJfln#yiA!Jk(<Pj0;WrV!J(Gi(@d>@J~HJq+SdN7XA7DJx@)J%>xQzh07>J
zU~@(T^H`W8tMT*)=Gp$bn4}l4-jNoS*FH)0ovfTMX#FVF$y)8!(QRm*H7HclNj+}b
zeQ2j(asATV2t39HL|mjy6Ez3bd&bi{W^mfKzC=(PBsh4290sh6Zz-m1RnC)uO(%-@
zJA9%Lwrjo$W=TFEkZ<O&Htm2zy=Qj~uW?h0ACJixmhQ@k><x`3;$!$VddFZRV*=+?
zWX5vV<L>MfBBuJTU<F5`E=!`hZ>jPhc1c>Yc&`-yWNvIU6%4jiQM!@zuU{(d;)pL#
zgv65200-q0`dEukrMWKs1)V0zE12gbXQBKhzi}$qxj8Qn8vOTvFNbW!xNh`y>q7+c
zN*@drURaTpJ;pGqa)mMXvestjnLebR;M!M18}GVziZ(vq)pP9T`%&@r37_fPjeX7w
zQv4aoTUM3|=kDjhQtzI*Zp^VX<^{Weuid#rDZt&u*b&eAO;B7Z>x|9K@1u?wt*>2u
zZ+FdsyNx-Q%ha(>?ZFzXzN-4!qni}x0AJsIF4=4N6L-n`;i?yIfb;WZ=`xV(TMwi%
zxD5H^$F|=ntMJP+2W@$ViTRh&Nbf`fKZIUv)Ukq@zYmDaNA3RT>nb(OjQTlw0pDY_
z?gZY>yrzo~FMXbbgUXHM<m$S;Ts#U6EAx9hh>`X4hX~_;EZY*F1Y{|IHr~aY{wJz2
zP7Orr?}tNFW)+{=Duv1n+r+<`j8)ls!a_KFhnZKXY^!iVRILcO{Mu@+^!Dx3f0Fo_
i5|m@syB8MnPFZ4Mo#o6YXCVB!1g{h|UX;jN1pXi3?2hsP

literal 0
HcmV?d00001

diff --git a/71_RayTracingPipeline/docs/Images/shader_binding_table.png b/71_RayTracingPipeline/docs/Images/shader_binding_table.png
new file mode 100644
index 0000000000000000000000000000000000000000..b146adeec959656a937e5f8b05b10980f5a08700
GIT binary patch
literal 8569
zcmbVy2UJs8*LIX)7^%*nB1lnCIz);fU6f*hAWClmlqS8`1eKvl6C=Gzks>9u&`Ctl
z&=HUl0)!5sh0sYz_`~>q@Bjbb%vxVL>#pqFyUxA$+57Cf&p!J((T_peER0-?004mH
z!F`P<0KhNpwEu5@`<3=vM2w52i8J0$wC@5c`nmD6lV2Rvb=3iYssyHE8+zLLIgk6M
z-T(kA?Jm~#GidkX=K#Pp)(0Bu1_4$ZQ>|{;?n^7gMJAQI)rt1*nP0SKzOc>67S&ZT
z8A}dZ+;J-B|5*B7<h|TkH`q{8%xCCpcD0v`0&h!O+1^?gJxZi&tM+Y-;NyDN&IU1^
z+d5&2>g3mqZ;)N^xSfe<M4@sXew&}qS5{W;d@h|x3=kj4R*6}<Fl=X7G%R((VY=tF
zN<JWL&wY(q0m=^<$L=mBs>%;7Ymwe0B9%|&<bANzP>ME9=3K#xX_2;1rML545>;=P
zi>4i|ZSf}AoXW~yLn(X4yNrNJ-C|}s+J%?54#@;mzA@mX>s`Ccr(FO5v#{=)r_v-0
z7kZ)+L=PAjG>AAuyYQLon26>Z6a##g%avw6?E(OlS9ynqs>N9e@$&I4&d#a}!lGl*
z&jP8(fwBxgdyIo{NJxxTqf?ggMt3D%w*InmRZY#k5EZQu%0C`PLP)c*ug~87yxoyS
z^mH(3?K{^`7y!VBTR+hQ0C$`pbDoYeO_R|N|Hh>LpH%?>0&Jwy3_yQ>z)KFrEYPCH
z-*2e>^{2@Ifd2&ot5hAqB=)~QU?v<?^i)CrzX$ohi{K@PAnd_EsSuDmjQ=N{|Jz3U
zX9P+I2M5C|zH>yPJ})otbLq6s0OjQ6ft((nbaUqHaSY}fl2-r;{l{bY5K7*cAob|3
zwK#G{j5L$4!e8^v^k=NC#C_k(m}$+Dx1GM#qX9R@lj4quLQe%*GCtsxbiQSU1nfOj
zolL8D7)1bI8j6y#CI}p@e%Ih`DRStskQ7t1`l}_SV@rpHLuT7q>1BOH-4$~75*YA#
z(yApuUy-tpbvKBxv1APk1&8H5ST~c5Bw0N_&_U{`b?fggKXjbkFj;t^c<UBr7=NGC
z{wy#FD5a9Cbi@@xpr}Ap-eEK70vny}|CH?uR%wi;{=!x6h7^R$!BU8=$@0M*UW#vq
z6jyt;#(i_A&Z>Sh7K4=4#*^+TxH-E<51^MnPeqOm<=><Sq(-U=re%GbC+?2H!NWOf
zp`g&?!`uyqu$^lmi0#98((pYzsCla0DO5HXj@?K<2s>+39UE9)(Im(-`@9w2y>vrs
zn3}88j7e+CYKPvGhw>DNnYh(umpY9hHeE<hLf0IchM^}nNtt^p>^4F6__{)26=O?^
z(&4(Hy}4K+_D@u{I1;1$dks#{ieo`6tVNMc7Y*y&M5j`q+Y|YRt4B9rCUO3f6z4pA
zz#kaO<<pMA7jTXx8h*jfg|us$xE`ol;D${|xvYxev|&2yVay>Z?vPXt3PDWAB-@x|
zz+iNZJmGMHDz2N_1wGX6N3@8vc$>=&cpaEZx;1&dYh5hyzl+Lz;6+EYzv@-7#z=^_
zVi4oo*S4P;7E7Er<|~+P9eA}X3A!O1MW3VecG_y+Gt5$foVW9O&gMY$A=&1!A5-1M
zkIO6hh4V3)X-Ziy_2L)ii-B#LdT;$@B!2HK<=qlSYV+BV20`lx3)Z<y!mf+PRKyen
z`8H$o0Z9oF?@nxmvB7~0yTj%pqIPEiGqm_O>fjEJNHHsGgnc*`0X_msMr_{_RoKXe
zbXjD+@2lb7e6+V0W7#<zlU<RSqxrQtpB+DK(YYLS<2F89o4zW)en>6z<DlPx@8#=a
zIwLFlTCCL@q*q#;omEo_v7i`b#rsxMPh1n=`w9gddpXxZZ@^~U;9>32_+P-rBzLCn
z^KoPShpY3697V__Q^03Mea?tI4CN_E7iA;Rs3P3VUg|HRw`dEL{EVR>-uLx1G_urP
zh-O57%xNTxwy)SZ>g~foeV@i982lC&+jNH*B`^3bG>tVUW=v;4KohSDf8u4pmq5@L
z#^om7-7%*#fjQaS6xv=*e&HPgU+n9*^3Yd#2b{JttiO$%sO}0`_HLXK6VaA6W17EG
zK-l|Q5|Ss#45$Q%wpVJYw`oZ(>_k4|?0=}__CWnsYHi7}v+=c2DPZA6_eeS^mcd>e
z(V>DiV6Uc9yw0~WI~A1;_B4+3G=HmgI=N4Zj2x2Zk#Sbcd;MvCkmbXz&QhqeZAc$I
zDv%d9|EBVyQwUY%9n7un3w?CNS7Sb<hDLl>c1amlhG)fD6JxJLW&p_2C)t>k=ELal
zdl6nFhc5bvcngCcXY*aG9zM2KB>Wowp*O2%?GdL2aHsaM@>_o*gAb<tsnmm8;mxMr
zkl}&*htggGtq1x}L{jVSbyB7-c{&O=rc{LBRdIZI2=Xi>$noqebJk6Z!#2pz=ywm^
zPJZk!vSDY84Az^{cW5}^-2Q~)x&-bk3RY2QNPjrUepq=+9N8%Sr|M8jxksB$&75;n
z|IH*{MW>Hx_x&@(5H)tGW{cRDmWAp0A-BYGo{K-#a(<#9Ue|k&#Xz)-DHm~I$03Ac
z-9^ippg68#=JueCjUJoF7eqIpG+(#i>G0ow{lNV@+TW*V$<9f;vqW1oe-Sai_IiU4
z((*2>7!L`W(%!BE4#kY_DGEn@B)oYsRzt`eVEK5`l~_1fKFQr8OK?8(7<A%O)SP%4
z49j76JYGrtIN)D$^BS+(>|RkH>=+h3bldPCjBVQkg^?Lbm^Rtfo}0m*v(3ffZB=W(
zXj!?c>djpNXJ())=4yW(zVx0=z9;4QN`bFWkd)ZH^%fXK-&4^%Y<>sUOx_ZmNbuye
z=Sp^8vwG|vrj^vrpT49?>d#;&TBd9W)AaQfiGmv)nmS2v-+_O5E?ttT5&6CsG*1j5
zt&PF)1379%YS3fr75#zVae;V`nTl(l>xaANudu{TZ<vv<2GGK!VuVM6g|DvSm#!di
zQr{z4mj-_dD6PZFcT8rX*^ON@J;~TI%AIp=O@3~zUhZrCO0#{pT%owMm|g)cp<B3r
zJ7!+y%+t-^y6r!%$;d*8it7iiEMk%Lk5&%kb6UHhxF8?Pz4N%qrYu}LwRfop(^L@@
z<?c4sPp}EKKRV8MNA4f-v8)J^3?|`DsPZ4}F24-V%;dgKshETylPIt}^*;EbuFVD%
zqa33uI9M<{z1JVKnkL5EcLwkwX<b|U>6dB^6AKT7k7w=VqY`Sbbpv0r-NR9bl%y6B
z8Vqj851)|48sC)GDECxVa1W+KUI)j%49;-+ij%k!J5c_wY2I7uUBZn7rrK}QNM>DN
zv;Rb7d5gHpnLZM;6eOeoWNo)2jv#qXWY11J>~aREq}|*l4-3FgWA|zJwj;*z?ujR3
z_zF=7(UF`7-O_EEgupQ_?rXmr@pnMTo7q{9{5|k3D{gKI>*vnnSiT*c>&A5W%Mjy8
z;n=n4UEC!LKFMfW?ie$us;%=gWR;e)vF?{pwbva;G>?__ynQmGv2E|5CkfJyeZl~^
zbH(YT@ty?GaJavqy}Hk{>jS;h*PIgUlO;j?n1n?n`3oX#u~O(xh(LC7O|<)SI^syO
z^F)(NkYs)Ejj^alw3P4j>(^l!v>?ouB3-6vCd5!v+_>&M_NreBc*%Dk`+j!14?mvi
z*emnF;LkIF??z_4o^{2)0e@>RO5e?cYD?<nZGSK3vj7>DkYXWcKG~(M*nte74+KjQ
z56hw#D~s;_$E#sS#@Z77M#bf)dEq5LAFr#e(>I;6*9!7eE=+wC%SicyTQ);B6P5kp
zAjs={=7*SU(k}pj>-mIFX`AxPq&}<1$d#Bt61w(F=RW@F_b&oQ^D<$FtM$bLA1q8u
ziY-Pfy~bhu$cXwympNx;yAW@jKnQp3r;V@t>0r*}7fTL$o1mr2yQF)FgTV5KaS~hC
zQJk&jGcWo%r|CW1YgS~;buw5PBHM=~x#CqL+S$-JQ}?+J1l4Y*noD*vIONIAKFECS
zudq_Ro<2sx3yhPFpfOY~&>;>zYB!pr!t09bp9aif1L&7a`&h!=aOc2@pK##=RYJ(}
zE^MKm!N$u8QU$sl89V3IwV4(&)DNFOg1axyROa2#h#T-(l+JWeTxC`I+W#;miLqPx
zlRC|KS|Ry%DiXAB+!0gMjZ!z$F5;+JYtQnpRucs+Ke1@niAT@l;b2suS|~M>dPr<%
z)@?*EMoe(yJ+dou6%uRihxAeV*8O?T*8NUC7J<q%#Z5MSnRVK*aUI6~u(7vnACPn?
z-60r>a@`G7Di=pGF`Kt|#T2SXj)NeQ+|@^wBI@GmA#|?saPJ(~?aPKYmC4fGP!5&^
z6(UQm0@CEW-~09`LeqIL>K1L$L0H^)_O3ZJyW7S_+p{M9*+#8oiRP%?yXHkBrfg&P
z1>H|y<ENU?P1uVN>b;@gr%5l0<=4!n*NR!wxK{HM1n`ga=8O_G$;zp}*S#d&Y`)^S
z`P$5%Vc1`@XTSd2w;+^9#z^u>pGn!o=AZcrWt@BY3kp82t;Ca67dDef7hah_ei#E!
z;B{)&B|fH}czE6GmYhEplKkoLRv2_XE&ut!(Rk>xm{je1u`grCB<aQM<M-1NcSEjl
zfj2tjnu0Uqa?`;6;Bn8Aqcmf}qFj*tF~p>2BD8tPZV8R;q}p3da0SoxR9yjVU1%w1
zmgy6MIp~GKJ~~-l_7#_>rLVV@F+5p5c9+$*6dB(D^x0e2<P|m6ge|BR&#hH{a~`<j
zw4+6i4IPHXP*%K+Eje3uD(AjH?Qh=@?g)v_nUb7d&b@-4%D%p%kzs8$5sK1FmcZfY
z61}G)E){;NGj+q@VmC<A!ZUV!>+y;}Wssr0UBpP-iua15^cz3^!%$}zw$g?82hJb2
zQ_J~LN!))KVW@>qY*pKamaAXyYwQ_IdYB*=rnflQg#Kh5n!2>qMvixNk(k4_9Hv8T
z_p%bBZHNaWjAwUl<OSB8dGvySdKa)H9t?GJUXKQ%H%IfimriGajcCH@RHIzNe5D|J
zcEpJ==d;i4E+@Rd@|(7h*ZHYeO~_cKvtyu%CNJL5nU(Tg*euT%*2y{6cHN%rwvWDF
z>Q##<A{dFdkaw<o@1Ir=#NESZ@G{U9zU!o7H{Zj9LI)V17-DP7N%GIU<I;1#bbrAg
zmG3dvLYC!xh(X%Z^c-njox(|2y3sDrDqDla`n+d07~+Ug(t$)$)>3l0SVlHX$GNWw
zEqW{1m6YPxv2w1%pFE$T08Rsim5&$LnruUAlGr9fqZF3=YEItF;j34UFhg&~*>Se2
z9UOqla++>65f@sAHnFeSv-9)iAT1K&F5Kn<LhD_~l`fgvC9F(ZvYm~I4ebpcmiy-r
z^t)di4o8~EM!l^A%eQdUiu%5bC-yF8SL3wN9)o9Jw73o@=2~9!6Y!eQ6Gh#<h^3Qr
z+*Gy-3_*<alEV|^i*}N|EtiOP8LD{~S~DMcNTsf(%7O={G&<z#x(@9lb%C5JrK{vi
z-wHRW*rXK13x&xgstSUCzhi7>oCRQ#d%83}I!G;uSG6{~hwZC@>*kzI5z<!|FtPSS
zP9~}8X}aOS-8Z4fI)uv@p_NRRwy(&rv9tCk<Ni3?2@SPPp*)wIfznN?l}ASDVc)m~
zC0t|!#*f{F9_aKp?{{7cV%XK0&bI1YE08s+Qn41e)a#yvHa3rIao;>-yUq(S+xfv!
zvu?QMWZu@~C2?8H%ltqpx9mB#?YT<c51kJAqI1M@5;DB1r1_+F(R<fRlIb`*3c^d2
zc%X!K1AGo?S(kA+ScmN*+zT7#+9p-1GrUVz`k&lU0`ak~qX*&_3Y6aZd)jsE1><iB
zjjy-J7s7V@vSjCH4TdS|j%vm#_!y^ftEz#ujAx<1s=mcg8JT|Sm-lkBeB8^5>7&|!
z>I8!qK_H-&kiSq*UA>4upOvP+e!GbIfINq{c0*ohqG+i}q=awEM7B{woS^w_R1y!^
z)pv(lk(uRh=aJc%0f$`C0@A*rEaM+Sw)D7u`VRSWDWb#?64M>Ikg701t(kKY6|hG|
z2NVYvg+ui^R1*$^*m19=wLXa>lD%|mk%n-kQb&p%wmhTIu<RK&jgWW(DfksCT)hsp
z_)4|>s%UntCGlnqk#V2NGb^TuXN5s%To0na%^k@QHzjjaz+?MOt#+*5>wLnGXP3s#
zcD!n-1%1%wgD>1s!v0tU9z2Bz7cB9xWhIOq_3xH^$V&P~mQ9V1q?qQI8<Im<5eElP
zuohH#cF|R!j#CboeNp(D?64AsPt#xFsjSL`i<)Bb1vht@TK?YMs^rNM7L;*0Y_};@
zsTl+MoRZWkz~fude2YWtO05P|ObI#hVi^M2y}g$jBMIl>AJ{g+G!5;%9=HKv=)ZQq
zeMH@Tly4?)4ReWRCUv3&71<WJ7~{(-&66F8?%k%yzZ;HmoNk^ePVJCEN?ocTAToc|
z?mZhrrFjJ_aZdz^k=DA5D!A(+v6$YeIM!P1sxz#$7|O@ciiz94>q?%IO*Z;7jn3+i
z`8dmGwF(udb(M8l-h?6)l#)(8KJq5bqZ6DeP0sTr#F2*E-8>pZ@^(HdNtoshwj>Y~
z4Jh_TuF)Iisz~GE3Nf{X?Py6Mdoq9IFjY7RyN9G;)`O};!M^zes|M8;TQBOedc7hk
zPN*vbuoCaFrBM#gLLjQ5)SA*2l<z9oY#r{C5oc7JV{<wgsv?CoLnw6yAu;iBet~UP
z-&$~4ery}ky6Ot(&SZpi&r%)AI<2b=Ed{Kn-XA1?64za7F6xP$vPkdVM<4sBT$!KL
z%Xd<XI&nEk=@6%ygO%outKpg0_=P1`Ijx@WvM%qv)z+xd@U&fD{|N%07Fi6cwltA)
zgIs^ZGtSBElVfX2kwp8dOhOQTswUu1PJwl@XyYThDNDLZRN7wP>V~0o0{?AAysk=~
zzHCP@@jGG9VvP4}t54@%Mz+N|dZ$S`z*voCNk4Ic%z&3dN3M`#?c2Nf{+iwdq?<10
zqUwJ`8>4;)158p?vsc$s<UM50D~xSx1?V7p6!6<`OUw2Qr_+&>wFrG`YKuu)Ndt2&
zV~K254D*u#JnQq+M=;ii$HzQZ24XKO1(OP7?d2SfjcVXggMZ|$75Yr>itv-8b6@Sw
z33<x$nOk0HoY%8h9wBw#+-U%>Y8Ta(1fI8MsZOij8MNQT^^%nL_Fs6ioIl?Ji@j?|
zQ)u=(VJCI(-V^0tf=1r1jVkl-AS#Dvd*2?kJ?CZ99m-U2!tjuO#l~vWw<0GisMB&W
z^QE{bAV9pr#XzSSTvs`OX>h3u_LH1yvkS;o_BVANlz#ev)6oed+uEn>1+M2Fd!8n}
zVx`AgF-NhyD1&X<zbIt}GO$uaWvLWyT<)t2=3TU`37vQT5ICm>_xp<2qvgFuH)I$M
z2hS72@1e$W{dw1^n%hDw?LJG+=02Uit?j>C{s8-iq1c{Mk~${ayBM$+RHW#Il4j3l
zk<J2MCZ{<V^kx+3)mI+du$bM{Gj;BghBY(SUe_IdvZ%>tV)xzg;pRz2S!VMuwzXVl
zrab6zi}~>}IX|@@6DwC&V?48n?xq$c%yc*TTFK{ekXk!}#-%9{lYQ3L_ys9Nrwc0f
zY0XNF9dFmFW}e^bj?3eUKReG0f*MnmFE&C#Megw?(lT<xC9aDy=BBoYq4c6MghyIQ
z84bZsgRn$2V|jk26417mo2N?-JMzdQ&U;#dE2M*~z%`3_AY1|KD)7paX6`Fw_`_RX
zH2qC>%D{`2EjU(PJ9S7uUpnb-uu38oyQM%hII(#Z5X#ZYM{!D}3u*J%L`2>J558UF
zY<9>gfDas?g@E1;-=Hq)rU{<ZK+*PB+6nh1({XZr$NgnKLQjJe&J+UgI)g9xv>FeH
z#ut^U3bOAgZ^Pv~QAoz_C+INDGyRX%i`=Cl21R8_)z)bxdCaw$F6q;c2qj{{brV6|
zjXT%09!bS9#mk`GAsT~jEPk{q$Kctf&>QhY#d%ys!xPSkZ_?rFr82%*y<6kDA~W)-
z_|l$O058Ym_INYY_4?~E5dpv;oRM@US)X$4yRq~_5a=zH7RQ-ER`A{HRpwHeEh2Au
zJA0BcGv!K;jwDVR+P+0pPT!3Ch$l;8IBc?IQFvX`p33PCWi9F+!_in#sss53OV5-3
zipM-di<!A2are|s#WpLk`cid|(@H)u8wlh0oGZe0t|k@sP;^k5^be8w?#YtN$(v%f
zwa|&>>?FOuUC_R^AGmbl2Bq1<N;Dy*Bd0;|w0wVEvEaEMR|0w%jDqThQjZlV#Ch%V
zT#-SGzF0WxY&wsvk;KjQO8sgJH{mt=YbS1RsZe{hZ-{{O!_i)MLGah3&i=?Ii>)K5
ze42l;SabJrR;qy7CAVSQWX}cMI+=d0Y1JP2OnIHc{@tf6c@xV;{vysnU{czTKiC*h
zJSf?#zqs=?q3cHbG1_Y_=Js$3c<kv{i~2@r6R-;v@qq3~o)(WymsJr|!IQ!jIikiJ
zD7&*p$a3wg%M#DX^QCD0u|{(AE%AyfMdC;h&@3W1jWHuELcqJje}uNEvS_lf7k2)@
zI{e2y!Z+mJjPX-3lp#wMRG<|Ld!D_*Q}l>TIOwYOgX*wx@ZDcTe{xNhNthAL`tuTe
zZKXi0{jf1%u>3xZ{8FaM$0wF+YLJPq)RUzLj_$}iSR2xQxjR3*>|KR;&j*W%ID(H(
zr@}&6wt+_+EV(G-(!Dp)w-&cH?#3Sf_{(#a6a!548%mVcT}Hex)blT2Ea^XKmcbU5
z_PQJ1E8}~pR&iOD!)+hur{-xBdai$VjzvkWU#2V=>C_PTLpZSP!*k0vso9yinnvqp
z<n9Ny+YaKs@AFpd1}l%f*~SRAdnHAI9Y@NK_oN!9PyI)odgJ6>=%BETU2DmTEbOAQ
z%Q@RxtF&j6FehVLG`J_@bmqn3$ORw9w0IR)`u6@&OIF8H#UqL8^lPJ!B4YfD>vc;{
z(n{U7{5_=H_gc?;!^Lh3Mp^aJa|E^<`aW)Q^`X7}qZdL|Uz0HQ$A$IY%2}dnT_jt5
z*?Ou@@2MT+wW#!ft29z6{Q&`>XdW1@tEAgQnY3JbZR4;e$xv&Ubu2h0#%-&D_4FFF
zETOLt+|_@r8OG#CPAseT-RRCK;;$zF?Z$V}uC+;8pFV=FV_y^$BfeF)yJp1K<wGoX
zQ%4o!yIkWHj%Z~xfs?a<x3t0r%f*D-jvTSC^9D0i@DIRWisWgY(9F`#PKPbig36Si
zS=atx5Eyg5-UV745i?+BoAFnC$)E0kOdFIr%QsYWv4#XpYCS8-54DpNJqY5b)VEFE
z`E-(dM_D{{kUW?Z(Re<4Lw*>ocUY~QEV8B`Sg*9S06raf0N_fJjidN5IGi(4&W#@>
zKwJ2a<Y+v;UaGLZHAI7uO&e_{wyNxu{seq(!PDf4rHV|*YR~M9i4|Vce59#2eBo8g
z?#$=633lTp`zZxS@%tP$mwsCo$|}0l>K-Fz^~KJr$H}9SOMxV8%%FmasV6U>pWCX&
zC>+(7bfPw56adyLem?iR{()!85bWH7UE+^|qoUxrC<6PS(FFpyo5LY;aP3}*{K`37
zP)N>iNA-|Sl*u2&y(`2$^Q^-cmK+i^8d2EuVAgQ_4L`36+qA$E^=k6Qz1}kb94Zu;
zMLk;PGmChY?uq*yOv?)X;TVYqo<8HWeHZ;ZllZ@2)wcescmAEr1OR;$v${`9X@9kU
zrs4j_2wHBY>4*ybolX7Blm#j}o!WlBt^6O@)BiaF08mE|_D?Eo-7Oj}$^5&{{{?^h
zUm@_@BoK-Q*E;@xCOt<jVd<w<04yF`?>;CdU%vCnkaK?>Q{xDdvNZBl<k<;LGz1nQ
zLr)}uRxMFfF9FEc`(R$Yw*bVg0Dc+Yb1Dbob}tG*0t*IQVR-UDC{^HudVtFJpOi~e
zMf8#NPKo%R1bjwC0{Z$XSN?zLY8q=!B@_N%)_im(L5&(^bp}v(?mtt+)+|)g8hqs<
zz&hh!#J2O+0WMS<4gg^P{I@@R>!(mCvO!q^AtCQ0BN@_k1f_Ws7BMvrOTK9V(0^|m
z$O)w;;-F+2-TEK!<8~Vpk+ucE1e&C6BKWzPpkB9-fI6YTkJrE_Z>Bx5&q+Vz2x^+N
heFXn91FTDfs#Z0vo4agJEIw83fhI_!;;!}U{{!}yoZJ8a

literal 0
HcmV?d00001

diff --git a/71_RayTracingPipeline/main.cpp b/71_RayTracingPipeline/main.cpp
index ac3befb5e..b5cde7410 100644
--- a/71_RayTracingPipeline/main.cpp
+++ b/71_RayTracingPipeline/main.cpp
@@ -1188,9 +1188,10 @@ class RaytracingPipelineApp final : public examples::SimpleWindowedApplication,
 
     // intersection geometries setup
     {
-      auto spheresInfoBuffer = ICPUBuffer::create({ NumberOfProceduralGeometries * sizeof(SProceduralGeomInfo) });
-      SProceduralGeomInfo* sphereInfos = reinterpret_cast<SProceduralGeomInfo*>(spheresInfoBuffer->getPointer());
+      core::vector<SProceduralGeomInfo> proceduralGeoms;
+      proceduralGeoms.reserve(NumberOfProceduralGeometries);
       core::vector<Aabb> aabbs;
+      aabbs.reserve(NumberOfProceduralGeometries);
       for (int32_t i = 0; i < NumberOfProceduralGeometries; i++)
       {
         const auto middle_i = NumberOfProceduralGeometries / 2.0;
@@ -1206,7 +1207,7 @@ class RaytracingPipelineApp final : public examples::SimpleWindowedApplication,
           },
         };
 
-        sphereInfos[i] = sphere;
+        proceduralGeoms.push_back(sphere);
         aabbs.push_back({
           .minimum = sphere.center - sphere.radius,
           .maximum = sphere.center + sphere.radius,
@@ -1216,8 +1217,8 @@ class RaytracingPipelineApp final : public examples::SimpleWindowedApplication,
       {
         IGPUBuffer::SCreationParams params;
         params.usage = IGPUBuffer::EUF_STORAGE_BUFFER_BIT | IGPUBuffer::EUF_TRANSFER_DST_BIT | IGPUBuffer::EUF_INLINE_UPDATE_VIA_CMDBUF | IGPUBuffer::EUF_SHADER_DEVICE_ADDRESS_BIT;
-        params.size = spheresInfoBuffer->getSize();
-        m_utils->createFilledDeviceLocalBufferOnDedMem(SIntendedSubmitInfo{ .queue = queue }, std::move(params), sphereInfos).move_into(m_proceduralGeomInfoBuffer);
+        params.size = proceduralGeoms.size() * sizeof(SProceduralGeomInfo);
+        m_utils->createFilledDeviceLocalBufferOnDedMem(SIntendedSubmitInfo{ .queue = queue }, std::move(params), proceduralGeoms.data()).move_into(m_proceduralGeomInfoBuffer);
         m_logger->log("Device address : %d", ILogger::ELL_INFO, m_proceduralGeomInfoBuffer->getDeviceAddress());
       }
 

From 7b5059cd913f89424c48180fcaaa82d6f0301cd6 Mon Sep 17 00:00:00 2001
From: kevyuu <kevin.kayu@gmail.com>
Date: Fri, 24 Jan 2025 19:20:28 +0700
Subject: [PATCH 008/529] Fix alignment issue of scratch buffer

Signed-off-by: kevyuu <kevin.kayu@gmail.com>
---
 71_RayTracingPipeline/main.cpp | 14 +++++++++++---
 1 file changed, 11 insertions(+), 3 deletions(-)

diff --git a/71_RayTracingPipeline/main.cpp b/71_RayTracingPipeline/main.cpp
index b5cde7410..2905e0427 100644
--- a/71_RayTracingPipeline/main.cpp
+++ b/71_RayTracingPipeline/main.cpp
@@ -1344,7 +1344,7 @@ class RaytracingPipelineApp final : public examples::SimpleWindowedApplication,
     m_currentImageAcquire = m_surface->acquireNextImage();
 #endif
     size_t totalScratchSize = 0;
-
+    const auto scratchOffsetAlignment = getRequiredDeviceLimits().minAccelerationStructureScratchOffsetAlignment;
 
     // build bottom level ASes
     {
@@ -1416,7 +1416,8 @@ class RaytracingPipelineApp final : public examples::SimpleWindowedApplication,
         }
 
         scratchSizes[i] = buildSizes.buildScratchSize;
-        totalScratchSize += buildSizes.buildScratchSize;
+        totalScratchSize = core::alignUp(totalScratchSize, scratchOffsetAlignment);
+        totalScratchSize += buildSizes.buildScratchSize, scratchOffsetAlignment;
 
         {
           IGPUBuffer::SCreationParams params;
@@ -1456,7 +1457,14 @@ class RaytracingPipelineApp final : public examples::SimpleWindowedApplication,
       {
         blasBuildInfos[i].dstAS = m_gpuBlasList[i].get();
         blasBuildInfos[i].scratch.buffer = scratchBuffer;
-        blasBuildInfos[i].scratch.offset = (i == 0) ? 0u : blasBuildInfos[i - 1].scratch.offset + scratchSizes[i - 1];
+        if (i == 0)
+        {
+          blasBuildInfos[i].scratch.offset = 0u;
+        } else
+        {
+          const auto unalignedOffset = blasBuildInfos[i - 1].scratch.offset + scratchSizes[i - 1];
+          blasBuildInfos[i].scratch.offset = core::alignUp(unalignedOffset, scratchOffsetAlignment);
+        }
 
         buildRangeInfos[i].primitiveCount = primitiveCounts[i];
         buildRangeInfos[i].primitiveByteOffset = 0u;

From 8cbb0dc19f3cf5c5f562ab2e4be3d0df08c954ec Mon Sep 17 00:00:00 2001
From: kevyuu <kevin.kayu@gmail.com>
Date: Fri, 24 Jan 2025 20:19:45 +0700
Subject: [PATCH 009/529] Fix alignment issue of scratch buffer

Signed-off-by: kevyuu <kevin.kayu@gmail.com>
---
 71_RayTracingPipeline/main.cpp | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/71_RayTracingPipeline/main.cpp b/71_RayTracingPipeline/main.cpp
index 2905e0427..7a35facc4 100644
--- a/71_RayTracingPipeline/main.cpp
+++ b/71_RayTracingPipeline/main.cpp
@@ -1344,7 +1344,7 @@ class RaytracingPipelineApp final : public examples::SimpleWindowedApplication,
     m_currentImageAcquire = m_surface->acquireNextImage();
 #endif
     size_t totalScratchSize = 0;
-    const auto scratchOffsetAlignment = getRequiredDeviceLimits().minAccelerationStructureScratchOffsetAlignment;
+    const auto scratchOffsetAlignment = m_device->getPhysicalDevice()->getLimits().minAccelerationStructureScratchOffsetAlignment;
 
     // build bottom level ASes
     {

From a9d5f8bcec54e5c5ea6d0eff4e2bb8a2469a981b Mon Sep 17 00:00:00 2001
From: kevyuu <kevin.kayu@gmail.com>
Date: Fri, 24 Jan 2025 20:49:06 +0700
Subject: [PATCH 010/529] Fix alignment issue of scratch buffer

Signed-off-by: kevyuu <kevin.kayu@gmail.com>
---
 71_RayTracingPipeline/main.cpp | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/71_RayTracingPipeline/main.cpp b/71_RayTracingPipeline/main.cpp
index 7a35facc4..ce0388ce9 100644
--- a/71_RayTracingPipeline/main.cpp
+++ b/71_RayTracingPipeline/main.cpp
@@ -1417,7 +1417,7 @@ class RaytracingPipelineApp final : public examples::SimpleWindowedApplication,
 
         scratchSizes[i] = buildSizes.buildScratchSize;
         totalScratchSize = core::alignUp(totalScratchSize, scratchOffsetAlignment);
-        totalScratchSize += buildSizes.buildScratchSize, scratchOffsetAlignment;
+        totalScratchSize += buildSizes.buildScratchSize;
 
         {
           IGPUBuffer::SCreationParams params;

From feedf653f12e3e04ada616b48fc7012219cabb0f Mon Sep 17 00:00:00 2001
From: kevyuu <kevin.kayu@gmail.com>
Date: Fri, 24 Jan 2025 22:43:07 +0700
Subject: [PATCH 011/529] Reduce Push Constant Size and skip compacting
 procedural geometries

Signed-off-by: kevyuu <kevin.kayu@gmail.com>
---
 71_RayTracingPipeline/app_resources/common.hlsl | 16 +++++++++-------
 .../app_resources/light_directional.rcall.hlsl  |  2 +-
 .../app_resources/light_point.rcall.hlsl        |  2 +-
 .../app_resources/light_spot.rcall.hlsl         |  4 ++--
 71_RayTracingPipeline/main.cpp                  | 17 ++++-------------
 5 files changed, 17 insertions(+), 24 deletions(-)

diff --git a/71_RayTracingPipeline/app_resources/common.hlsl b/71_RayTracingPipeline/app_resources/common.hlsl
index a35bd3fcd..0b8bb277d 100644
--- a/71_RayTracingPipeline/app_resources/common.hlsl
+++ b/71_RayTracingPipeline/app_resources/common.hlsl
@@ -75,10 +75,9 @@ struct Light
 {
     float32_t3 direction;
     float32_t3 position;
-    float32_t intensity;
-    float32_t innerCutoff;
     float32_t outerCutoff;
-    int type;
+    uint16_t type;
+
 
 #ifndef __HLSL_VERSION
     bool operator==(const Light&) const = default;
@@ -86,16 +85,19 @@ struct Light
 
 };
 
+static const float LightIntensity = 100.0f;
+
 struct SPushConstants
 {
-    Light light;
+    uint64_t proceduralGeomInfoBuffer;
+    uint64_t triangleGeomInfoBuffer;
 
     float32_t3 camPos;
+    uint32_t frameCounter;
     float32_t4x4 invMVP;
 
-    uint64_t proceduralGeomInfoBuffer;
-    uint64_t triangleGeomInfoBuffer;
-    uint32_t frameCounter;
+
+    Light light;
 };
 
 
diff --git a/71_RayTracingPipeline/app_resources/light_directional.rcall.hlsl b/71_RayTracingPipeline/app_resources/light_directional.rcall.hlsl
index d4aeca85e..1eb18be34 100644
--- a/71_RayTracingPipeline/app_resources/light_directional.rcall.hlsl
+++ b/71_RayTracingPipeline/app_resources/light_directional.rcall.hlsl
@@ -6,6 +6,6 @@
 void main(inout RayLight cLight)
 {
     cLight.outLightDir = normalize(-pc.light.direction);
-    cLight.outIntensity = 1.0;
+    cLight.outIntensity = 1;
     cLight.outLightDistance = 10000000;
 }
diff --git a/71_RayTracingPipeline/app_resources/light_point.rcall.hlsl b/71_RayTracingPipeline/app_resources/light_point.rcall.hlsl
index e82d17ec8..2265a98e7 100644
--- a/71_RayTracingPipeline/app_resources/light_point.rcall.hlsl
+++ b/71_RayTracingPipeline/app_resources/light_point.rcall.hlsl
@@ -7,7 +7,7 @@ void main(inout RayLight cLight)
 {
     float32_t3 lDir = pc.light.position - cLight.inHitPosition;
     float lightDistance = length(lDir);
-    cLight.outIntensity = pc.light.intensity / (lightDistance * lightDistance);
+    cLight.outIntensity = LightIntensity / (lightDistance * lightDistance);
     cLight.outLightDir = normalize(lDir);
     cLight.outLightDistance = lightDistance;
 }
\ No newline at end of file
diff --git a/71_RayTracingPipeline/app_resources/light_spot.rcall.hlsl b/71_RayTracingPipeline/app_resources/light_spot.rcall.hlsl
index 5dbc5a830..f1357d30b 100644
--- a/71_RayTracingPipeline/app_resources/light_spot.rcall.hlsl
+++ b/71_RayTracingPipeline/app_resources/light_spot.rcall.hlsl
@@ -7,10 +7,10 @@ void main(inout RayLight cLight)
 {
     float32_t3 lDir = pc.light.position - cLight.inHitPosition;
     cLight.outLightDistance = length(lDir);
-    cLight.outIntensity = pc.light.intensity / (cLight.outLightDistance * cLight.outLightDistance);
+    cLight.outIntensity = LightIntensity / (cLight.outLightDistance * cLight.outLightDistance);
     cLight.outLightDir = normalize(lDir);
     float theta = dot(cLight.outLightDir, normalize(-pc.light.direction));
-    float epsilon = pc.light.innerCutoff - pc.light.outerCutoff;
+    float epsilon = - pc.light.outerCutoff;
     float spotIntensity = clamp((theta - pc.light.outerCutoff) / epsilon, 0.0, 1.0);
     cLight.outIntensity *= spotIntensity;
 }
diff --git a/71_RayTracingPipeline/main.cpp b/71_RayTracingPipeline/main.cpp
index ce0388ce9..d9cad5947 100644
--- a/71_RayTracingPipeline/main.cpp
+++ b/71_RayTracingPipeline/main.cpp
@@ -507,27 +507,21 @@ class RaytracingPipelineApp final : public examples::SimpleWindowedApplication,
         ImGui::SliderFloat("zNear", &zNear, 0.1f, 100.f);
         ImGui::SliderFloat("zFar", &zFar, 110.f, 10000.f);
         Light m_oldLight = m_light;
-        ImGui::ListBox("LightType", &m_light.type, s_lightTypeNames, ELT_COUNT);
+        int light_type = m_light.type;
+        ImGui::ListBox("LightType", &light_type, s_lightTypeNames, ELT_COUNT);
+        m_light.type = static_cast<E_LIGHT_TYPE>(light_type);
         if (m_light.type == ELT_DIRECTIONAL)
         {
           ImGui::SliderFloat3("Light Direction", &m_light.direction.x, -1.f, 1.f);
         } else if (m_light.type == ELT_POINT)
         {
           ImGui::SliderFloat3("Light Position", &m_light.position.x, -20.f, 20.f);
-          ImGui::SliderFloat("Light Intensity", &m_light.intensity, 0.0f, 500.f);
         } else if (m_light.type == ELT_SPOT)
         {
           ImGui::SliderFloat3("Light Direction", &m_light.direction.x, -1.f, 1.f);
           ImGui::SliderFloat3("Light Position", &m_light.position.x, -20.f, 20.f);
-          ImGui::SliderFloat("Light Intensity", &m_light.intensity, 0.0f, 500.f);
 
-          float32_t dInnerCutoff = degrees(acos(m_light.innerCutoff));
           float32_t dOuterCutoff = degrees(acos(m_light.outerCutoff));
-          if (ImGui::SliderFloat("Light Inner Cutoff", &dInnerCutoff, 0.0f, 45.0f))
-          {
-            dInnerCutoff = dInnerCutoff > dOuterCutoff ? dOuterCutoff : dInnerCutoff;
-            m_light.innerCutoff = cos(radians(dInnerCutoff));
-          }
           if (ImGui::SliderFloat("Light Outer Cutoff", &dOuterCutoff, 0.0f, 45.0f))
           {
             m_light.outerCutoff = cos(radians(dOuterCutoff));
@@ -1219,7 +1213,6 @@ class RaytracingPipelineApp final : public examples::SimpleWindowedApplication,
         params.usage = IGPUBuffer::EUF_STORAGE_BUFFER_BIT | IGPUBuffer::EUF_TRANSFER_DST_BIT | IGPUBuffer::EUF_INLINE_UPDATE_VIA_CMDBUF | IGPUBuffer::EUF_SHADER_DEVICE_ADDRESS_BIT;
         params.size = proceduralGeoms.size() * sizeof(SProceduralGeomInfo);
         m_utils->createFilledDeviceLocalBufferOnDedMem(SIntendedSubmitInfo{ .queue = queue }, std::move(params), proceduralGeoms.data()).move_into(m_proceduralGeomInfoBuffer);
-        m_logger->log("Device address : %d", ILogger::ELL_INFO, m_proceduralGeomInfoBuffer->getDeviceAddress());
       }
 
       {
@@ -1307,7 +1300,6 @@ class RaytracingPipelineApp final : public examples::SimpleWindowedApplication,
       params.usage = IGPUBuffer::EUF_TRANSFER_DST_BIT | IGPUBuffer::EUF_INLINE_UPDATE_VIA_CMDBUF | IGPUBuffer::EUF_SHADER_DEVICE_ADDRESS_BIT | IGPUBuffer::EUF_SHADER_BINDING_TABLE_BIT;
       params.size = bufferSize;
       m_utils->createFilledDeviceLocalBufferOnDedMem(SIntendedSubmitInfo{ .queue = queue }, std::move(params), pData).move_into(raygenRegion.buffer);
-      m_logger->log("Device address : %d", ILogger::ELL_INFO, raygenRegion.buffer->getDeviceAddress());
       missRegion.buffer = core::smart_refctd_ptr(raygenRegion.buffer);
       hitRegion.buffer = core::smart_refctd_ptr(raygenRegion.buffer);
       callableRegion.buffer = core::smart_refctd_ptr(raygenRegion.buffer);
@@ -1510,6 +1502,7 @@ class RaytracingPipelineApp final : public examples::SimpleWindowedApplication,
       core::vector<smart_refctd_ptr<IGPUBottomLevelAccelerationStructure>> cleanupBlas(blasCount);
       for (uint32_t i = 0; i < blasCount; i++)
       {
+        if (asSizes[i] == 0) continue;
         cleanupBlas[i] = m_gpuBlasList[i];
         {
           IGPUBuffer::SCreationParams params;
@@ -1671,8 +1664,6 @@ class RaytracingPipelineApp final : public examples::SimpleWindowedApplication,
   Light m_light = {
     .direction = {-1.0f, -1.0f, -0.4f},
     .position = {10.0f, 15.0f, 8.0f},
-    .intensity = 100.0f,
-    .innerCutoff = 0.939692621f, // {cos(radians(20.0f))},
     .outerCutoff = 0.866025404f, // {cos(radians(30.0f))}, 
     .type = ELT_DIRECTIONAL
   };

From e369368bfe9fcf1b037aa869e7a1741909ca1ead Mon Sep 17 00:00:00 2001
From: kevyuu <kevin.kayu@gmail.com>
Date: Fri, 24 Jan 2025 22:55:22 +0700
Subject: [PATCH 012/529] Fix occlusion of procedural geometries

Signed-off-by: kevyuu <kevin.kayu@gmail.com>
---
 71_RayTracingPipeline/app_resources/raytrace.rchit.hlsl         | 2 +-
 .../app_resources/raytrace_procedural.rchit.hlsl                | 2 +-
 2 files changed, 2 insertions(+), 2 deletions(-)

diff --git a/71_RayTracingPipeline/app_resources/raytrace.rchit.hlsl b/71_RayTracingPipeline/app_resources/raytrace.rchit.hlsl
index 734491e7d..462287689 100644
--- a/71_RayTracingPipeline/app_resources/raytrace.rchit.hlsl
+++ b/71_RayTracingPipeline/app_resources/raytrace.rchit.hlsl
@@ -119,7 +119,7 @@ void main(inout ColorPayload p, in BuiltInTriangleIntersectionAttributes attribs
     const STriangleGeomInfo geom = vk::RawBufferLoad < STriangleGeomInfo > (pc.triangleGeomInfoBuffer + instID * sizeof(STriangleGeomInfo));
     const VertexData vertexData = fetchVertexData(instID, primID, geom, attribs.barycentrics);
     const float32_t3 worldPosition = mul(ObjectToWorld3x4(), float32_t4(vertexData.position, 1));
-    const float32_t3 worldNormal = mul(vertexData.normal, WorldToObject3x4()).xyz;
+    const float32_t3 worldNormal = normalize(mul(vertexData.normal, WorldToObject3x4()).xyz);
 
     RayLight cLight;
     cLight.inHitPosition = worldPosition;
diff --git a/71_RayTracingPipeline/app_resources/raytrace_procedural.rchit.hlsl b/71_RayTracingPipeline/app_resources/raytrace_procedural.rchit.hlsl
index ef3503346..dd5598105 100644
--- a/71_RayTracingPipeline/app_resources/raytrace_procedural.rchit.hlsl
+++ b/71_RayTracingPipeline/app_resources/raytrace_procedural.rchit.hlsl
@@ -44,7 +44,7 @@ void main(inout ColorPayload p, in BuiltInTriangleIntersectionAttributes attribs
         ShadowPayload shadowPayload;
         shadowPayload.isShadowed = true;
         shadowPayload.seed = p.seed;
-        TraceRay(topLevelAS, flags, 0xFF, ERT_OCCLUSION, 0, EMT_PRIMARY, rayDesc, shadowPayload);
+        TraceRay(topLevelAS, flags, 0xFF, ERT_OCCLUSION, 0, EMT_OCCLUSION, rayDesc, shadowPayload);
 
         bool isShadowed = shadowPayload.isShadowed;
         if (isShadowed)

From c2c82d49f918de9f90577156bbeb33edf2e050c1 Mon Sep 17 00:00:00 2001
From: Ali Cheraghi <alichraghi@proton.me>
Date: Fri, 24 Jan 2025 18:43:16 +0330
Subject: [PATCH 013/529] 71: cache shaders

Signed-off-by: Ali Cheraghi <alichraghi@proton.me>
Signed-off-by: kevyuu <kevin.kayu@gmail.com>
---
 71_RayTracingPipeline/main.cpp | 129 +++++++++++++++++++++++----------
 1 file changed, 90 insertions(+), 39 deletions(-)

diff --git a/71_RayTracingPipeline/main.cpp b/71_RayTracingPipeline/main.cpp
index d9cad5947..95540c0b9 100644
--- a/71_RayTracingPipeline/main.cpp
+++ b/71_RayTracingPipeline/main.cpp
@@ -101,45 +101,101 @@ class RaytracingPipelineApp final : public examples::SimpleWindowedApplication,
     if (!asset_base_t::onAppInitialized(smart_refctd_ptr(system)))
       return false;
 
+    smart_refctd_ptr<IShaderCompiler::CCache> shaderReadCache = nullptr;
+    smart_refctd_ptr<IShaderCompiler::CCache> shaderWriteCache = core::make_smart_refctd_ptr<IShaderCompiler::CCache>();
+    auto shaderCachePath = localOutputCWD / "main_pipeline_shader_cache.bin";
 
-    const auto compileShader = [&](const std::string & filePath, const std::string & header = "") -> smart_refctd_ptr<IGPUShader>
     {
-      IAssetLoader::SAssetLoadParams lparams = {};
-      lparams.logger = m_logger.get();
-      lparams.workingDirectory = "";
-      auto bundle = m_assetMgr->getAsset(filePath, lparams);
-      if (bundle.getContents().empty() || bundle.getAssetType() != IAsset::ET_SHADER)
-      {
-        m_logger->log("Shader %s not found!", ILogger::ELL_ERROR, filePath);
-        exit(-1);
-      }
+        core::smart_refctd_ptr<system::IFile> shaderReadCacheFile;
+        {
+            system::ISystem::future_t<core::smart_refctd_ptr<system::IFile>> future;
+            m_system->createFile(future, shaderCachePath.c_str(), system::IFile::ECF_READ);
+            if (future.wait())
+            {
+                future.acquire().move_into(shaderReadCacheFile);
+                if (shaderReadCacheFile)
+                {
+                    const size_t size = shaderReadCacheFile->getSize();
+                    if (size > 0ull)
+                    {
+                        std::vector<uint8_t> contents(size);
+                        system::IFile::success_t succ;
+                        shaderReadCacheFile->read(succ, contents.data(), 0, size);
+                        if (succ)
+                            shaderReadCache = IShaderCompiler::CCache::deserialize(contents);
+                    }
+                }
+            }
+            else
+                m_logger->log("Failed Openning Shader Cache File.", ILogger::ELL_ERROR);
+        }
 
-      const auto assets = bundle.getContents();
-      assert(assets.size() == 1);
-      smart_refctd_ptr<ICPUShader> sourceRaw = IAsset::castDown<ICPUShader>(assets[0]);
-      if (!sourceRaw)
-        m_logger->log("Fail to load shader source", ILogger::ELL_ERROR, filePath);
-      smart_refctd_ptr<ICPUShader> source = CHLSLCompiler::createOverridenCopy(
-        sourceRaw.get(),
-        "%s\n",
-        header.c_str()
-      );
+    }
 
-      return m_device->createShader(source.get());
-    };
+    // Load Custom Shader
+    auto loadCompileAndCreateShader = [&](const std::string& relPath, const std::string& header = "") -> smart_refctd_ptr<IGPUShader>
+        {
+            IAssetLoader::SAssetLoadParams lp = {};
+            lp.logger = m_logger.get();
+            lp.workingDirectory = ""; // virtual root
+            auto assetBundle = m_assetMgr->getAsset(relPath, lp);
+            const auto assets = assetBundle.getContents();
+            if (assets.empty())
+                return nullptr;
+
+            // lets go straight from ICPUSpecializedShader to IGPUSpecializedShader
+            auto sourceRaw = IAsset::castDown<ICPUShader>(assets[0]);
+            if (!sourceRaw)
+                return nullptr;
+
+            smart_refctd_ptr<ICPUShader> source = CHLSLCompiler::createOverridenCopy(
+                sourceRaw.get(),
+                "%s\n",
+                header.c_str()
+            );
+
+            return m_device->createShader({ source.get(), nullptr, shaderReadCache.get(), shaderWriteCache.get() });
+        };
 
-    // shader
-    const auto raygenShader = compileShader("app_resources/raytrace.rgen.hlsl");
-    const auto closestHitShader = compileShader("app_resources/raytrace.rchit.hlsl");
-    const auto proceduralClosestHitShader = compileShader("app_resources/raytrace_procedural.rchit.hlsl");
-    const auto intersectionHitShader = compileShader("app_resources/raytrace.rint.hlsl");
-    const auto anyHitShaderColorPayload = compileShader("app_resources/raytrace.rahit.hlsl", "#define USE_COLOR_PAYLOAD\n");
-    const auto anyHitShaderShadowPayload = compileShader("app_resources/raytrace.rahit.hlsl", "#define USE_SHADOW_PAYLOAD\n");
-    const auto missShader = compileShader("app_resources/raytrace.rmiss.hlsl");
-    const auto shadowMissShader = compileShader("app_resources/raytraceShadow.rmiss.hlsl");
-    const auto directionalLightCallShader = compileShader("app_resources/light_directional.rcall.hlsl");
-    const auto pointLightCallShader = compileShader("app_resources/light_point.rcall.hlsl");
-    const auto spotLightCallShader = compileShader("app_resources/light_spot.rcall.hlsl");
+    // load shaders
+    const auto raygenShader = loadCompileAndCreateShader("app_resources/raytrace.rgen.hlsl");
+    const auto closestHitShader = loadCompileAndCreateShader("app_resources/raytrace.rchit.hlsl");
+    const auto proceduralClosestHitShader = loadCompileAndCreateShader("app_resources/raytrace_procedural.rchit.hlsl");
+    const auto intersectionHitShader = loadCompileAndCreateShader("app_resources/raytrace.rint.hlsl");
+    const auto anyHitShaderColorPayload = loadCompileAndCreateShader("app_resources/raytrace.rahit.hlsl", "#define USE_COLOR_PAYLOAD\n");
+    const auto anyHitShaderShadowPayload = loadCompileAndCreateShader("app_resources/raytrace.rahit.hlsl", "#define USE_SHADOW_PAYLOAD\n");
+    const auto missShader = loadCompileAndCreateShader("app_resources/raytrace.rmiss.hlsl");
+    const auto shadowMissShader = loadCompileAndCreateShader("app_resources/raytraceShadow.rmiss.hlsl");
+    const auto directionalLightCallShader = loadCompileAndCreateShader("app_resources/light_directional.rcall.hlsl");
+    const auto pointLightCallShader = loadCompileAndCreateShader("app_resources/light_point.rcall.hlsl");
+    const auto spotLightCallShader = loadCompileAndCreateShader("app_resources/light_spot.rcall.hlsl");
+    const auto fragmentShader = loadCompileAndCreateShader("app_resources/present.frag.hlsl");
+
+    core::smart_refctd_ptr<system::IFile> shaderWriteCacheFile;
+    {
+        system::ISystem::future_t<core::smart_refctd_ptr<system::IFile>> future;
+        m_system->deleteFile(shaderCachePath); // temp solution instead of trimming, to make sure we won't have corrupted json
+        m_system->createFile(future, shaderCachePath.c_str(), system::IFile::ECF_WRITE);
+        if (future.wait())
+        {
+            future.acquire().move_into(shaderWriteCacheFile);
+            if (shaderWriteCacheFile)
+            {
+                auto serializedCache = shaderWriteCache->serialize();
+                if (shaderWriteCacheFile)
+                {
+                    system::IFile::success_t succ;
+                    shaderWriteCacheFile->write(succ, serializedCache->getPointer(), 0, serializedCache->getSize());
+                    if (!succ)
+                        m_logger->log("Failed Writing To Shader Cache File.", ILogger::ELL_ERROR);
+                }
+            }
+            else
+                m_logger->log("Failed Creating Shader Cache File.", ILogger::ELL_ERROR);
+        }
+        else
+            m_logger->log("Failed Creating Shader Cache File.", ILogger::ELL_ERROR);
+    }
 
     m_semaphore = m_device->createSemaphore(m_realFrameIx);
     if (!m_semaphore)
@@ -382,11 +438,6 @@ class RaytracingPipelineApp final : public examples::SimpleWindowedApplication,
       if (!fsTriProtoPPln)
         return logFail("Failed to create Full Screen Triangle protopipeline or load its vertex shader!");
 
-      // Load Fragment Shader
-      auto fragmentShader = compileShader("app_resources/present.frag.hlsl");
-      if (!fragmentShader)
-        return logFail("Failed to Load and Compile Fragment Shader: lumaMeterShader!");
-
       const IGPUShader::SSpecInfo fragSpec = {
         .entryPoint = "main",
         .shader = fragmentShader.get()

From 34a3fa9c925af2c7d8abeaaf500a7e79603fa9eb Mon Sep 17 00:00:00 2001
From: kevyuu <kevin.kayu@gmail.com>
Date: Fri, 24 Jan 2025 23:43:13 +0700
Subject: [PATCH 014/529] Remove unnecesary log

Signed-off-by: kevyuu <kevin.kayu@gmail.com>
---
 71_RayTracingPipeline/main.cpp | 9 ++++-----
 1 file changed, 4 insertions(+), 5 deletions(-)

diff --git a/71_RayTracingPipeline/main.cpp b/71_RayTracingPipeline/main.cpp
index 95540c0b9..09a20340f 100644
--- a/71_RayTracingPipeline/main.cpp
+++ b/71_RayTracingPipeline/main.cpp
@@ -413,7 +413,6 @@ class RaytracingPipelineApp final : public examples::SimpleWindowedApplication,
       if (!createShaderBindingTable(gQueue, m_rayTracingPipeline))
         return logFail("Could not create shader binding table");
 
-      m_logger->log("Shader binding table created", system::ILogger::ELL_INFO);
     }
 
     {
@@ -1023,7 +1022,7 @@ class RaytracingPipelineApp final : public examples::SimpleWindowedApplication,
       return logFail("Couldn't create Command Pool for geometry creation!");
 
     const auto defaultMaterial = Material{
-      .ambient = {0.1, 0.1, 0.1},
+      .ambient = {},
       .diffuse = {0.8, 0.3, 0.3},
       .specular = {0.8, 0.8, 0.8},
       .shininess = 1.0f,
@@ -1057,7 +1056,7 @@ class RaytracingPipelineApp final : public examples::SimpleWindowedApplication,
         .meta = {.type = OT_CUBE, .name = "Cube Mesh 2"},
         .data = gc->createCubeMesh(nbl::core::vector3df(1.5, 1.5, 1.5)),
         .material = {
-          .ambient = {0.1, 0.1, 0.1},
+          .ambient = {},
           .diffuse = {0.2, 0.2, 0.8},
           .specular = {0.8, 0.8, 0.8},
           .shininess = 1.0f,
@@ -1069,7 +1068,7 @@ class RaytracingPipelineApp final : public examples::SimpleWindowedApplication,
         .meta = {.type = OT_CUBE, .name = "Transparent Cube Mesh"},
         .data = gc->createCubeMesh(nbl::core::vector3df(1.5, 1.5, 1.5)),
         .material = {
-          .ambient = {0.1, 0.1, 0.1},
+          .ambient = {},
           .diffuse = {0.2, 0.8, 0.2},
           .specular = {0.8, 0.8, 0.8},
           .shininess = 1.0f,
@@ -1244,7 +1243,7 @@ class RaytracingPipelineApp final : public examples::SimpleWindowedApplication,
           .center = float32_t3((i - middle_i) * 4.0, 2, 5.0),
           .radius = 1,
           .material = {
-            .ambient = {0.1, 0.1, 0.1},
+            .ambient = {},
             .diffuse = {0.3, 0.2 * i, 0.3},
             .specular = {0.8, 0.8, 0.8},
             .shininess = 1.0f,

From ec8826816de92eb0ccb78e63157af2a721f5f236 Mon Sep 17 00:00:00 2001
From: kevyuu <kevin.kayu@gmail.com>
Date: Sat, 25 Jan 2025 16:22:04 +0700
Subject: [PATCH 015/529] Change recursion depth to 1 so the demo can be run on
 more devices.

Signed-off-by: kevyuu <kevin.kayu@gmail.com>
---
 71_RayTracingPipeline/main.cpp | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/71_RayTracingPipeline/main.cpp b/71_RayTracingPipeline/main.cpp
index 09a20340f..4245a186d 100644
--- a/71_RayTracingPipeline/main.cpp
+++ b/71_RayTracingPipeline/main.cpp
@@ -404,7 +404,7 @@ class RaytracingPipelineApp final : public examples::SimpleWindowedApplication,
       shaderGroups.callableGroups[ELT_POINT] = { .shaderIndex = RTDS_POINT_CALL };
       shaderGroups.callableGroups[ELT_SPOT] = { .shaderIndex = RTDS_SPOT_CALL };
 
-      params.cached.maxRecursionDepth = 2;
+      params.cached.maxRecursionDepth = 1;
 
       if (!m_device->createRayTracingPipelines(nullptr, { &params, 1 }, &m_rayTracingPipeline))
         return logFail("Failed to create ray tracing pipeline");

From d21c22c4b9a5785eae115b0e8e6f83df2bbfaf3b Mon Sep 17 00:00:00 2001
From: kevyuu <kevin.kayu@gmail.com>
Date: Sat, 25 Jan 2025 17:05:17 +0700
Subject: [PATCH 016/529] Temporarily remove procedural geometries to debug
 crash on amd cards.

Signed-off-by: kevyuu <kevin.kayu@gmail.com>
---
 71_RayTracingPipeline/main.cpp | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/71_RayTracingPipeline/main.cpp b/71_RayTracingPipeline/main.cpp
index 4245a186d..5961ed225 100644
--- a/71_RayTracingPipeline/main.cpp
+++ b/71_RayTracingPipeline/main.cpp
@@ -1362,8 +1362,8 @@ class RaytracingPipelineApp final : public examples::SimpleWindowedApplication,
   {
     // plus 1 blas for procedural geometry contains {{var::NumberOfProcedural}}
     // spheres. Each sphere is a primitive instead one instance or geometry
-    const auto blasCount = m_gpuTriangleGeometries.size() + 1;
-    const auto proceduralBlasIdx = blasCount - 1;
+    const auto blasCount = m_gpuTriangleGeometries.size();
+    const auto proceduralBlasIdx = m_gpuTriangleGeometries.size();
 
     IQueryPool::SCreationParams qParams{ .queryCount = static_cast<uint32_t>(blasCount), .queryType = IQueryPool::ACCELERATION_STRUCTURE_COMPACTED_SIZE };
     smart_refctd_ptr<IQueryPool> queryPool = m_device->createQueryPool(std::move(qParams));
@@ -1587,7 +1587,7 @@ class RaytracingPipelineApp final : public examples::SimpleWindowedApplication,
 
     // build top level AS
     {
-      const uint32_t instancesCount = m_gpuBlasList.size();
+      const uint32_t instancesCount = blasCount;
       core::vector<IGPUTopLevelAccelerationStructure::DeviceStaticInstance> instances(instancesCount);
       for (uint32_t i = 0; i < instancesCount; i++)
       {

From bc093c73bca79308880e586bb604692025e03f51 Mon Sep 17 00:00:00 2001
From: kevyuu <kevin.kayu@gmail.com>
Date: Sat, 25 Jan 2025 23:15:05 +0700
Subject: [PATCH 017/529] Use Nabla AABB type instead creating another aabb
 type

Signed-off-by: kevyuu <kevin.kayu@gmail.com>
---
 .../app_resources/common.hlsl                 |  6 ------
 71_RayTracingPipeline/main.cpp                | 19 +++++++++++--------
 2 files changed, 11 insertions(+), 14 deletions(-)

diff --git a/71_RayTracingPipeline/app_resources/common.hlsl b/71_RayTracingPipeline/app_resources/common.hlsl
index 0b8bb277d..8c73fada3 100644
--- a/71_RayTracingPipeline/app_resources/common.hlsl
+++ b/71_RayTracingPipeline/app_resources/common.hlsl
@@ -22,12 +22,6 @@ struct SProceduralGeomInfo
     Material material;
 };
 
-struct Aabb
-{
-    float32_t3 minimum;
-    float32_t3 maximum;
-};
-
 struct STriangleGeomInfo
 {
     uint64_t vertexBufferAddress;
diff --git a/71_RayTracingPipeline/main.cpp b/71_RayTracingPipeline/main.cpp
index 5961ed225..0e76e4c72 100644
--- a/71_RayTracingPipeline/main.cpp
+++ b/71_RayTracingPipeline/main.cpp
@@ -365,7 +365,7 @@ class RaytracingPipelineApp final : public examples::SimpleWindowedApplication,
       shaders[RTDS_SPOT_CALL] = {.shader = spotLightCallShader.get()};
 
       params.layout = pipelineLayout.get();
-      params.shaders = std::span(shaders, std::size(shaders));
+      params.shaders = std::span(shaders);
 
       auto& shaderGroups = params.cached.shaderGroups;
 
@@ -1234,6 +1234,7 @@ class RaytracingPipelineApp final : public examples::SimpleWindowedApplication,
     {
       core::vector<SProceduralGeomInfo> proceduralGeoms;
       proceduralGeoms.reserve(NumberOfProceduralGeometries);
+      using Aabb = IGPUBottomLevelAccelerationStructure::AABB_t;
       core::vector<Aabb> aabbs;
       aabbs.reserve(NumberOfProceduralGeometries);
       for (int32_t i = 0; i < NumberOfProceduralGeometries; i++)
@@ -1252,10 +1253,11 @@ class RaytracingPipelineApp final : public examples::SimpleWindowedApplication,
         };
 
         proceduralGeoms.push_back(sphere);
-        aabbs.push_back({
-          .minimum = sphere.center - sphere.radius,
-          .maximum = sphere.center + sphere.radius,
-        });
+        const auto sphereMin = sphere.center - sphere.radius;
+        const auto sphereMax = sphere.center + sphere.radius;
+        aabbs.emplace_back(
+          vector3d(sphereMin.x, sphereMin.y, sphereMin.z), 
+          vector3d(sphereMax.x, sphereMax.y, sphereMax.z));
       }
 
       {
@@ -1362,7 +1364,7 @@ class RaytracingPipelineApp final : public examples::SimpleWindowedApplication,
   {
     // plus 1 blas for procedural geometry contains {{var::NumberOfProcedural}}
     // spheres. Each sphere is a primitive instead one instance or geometry
-    const auto blasCount = m_gpuTriangleGeometries.size();
+    const auto blasCount = m_gpuTriangleGeometries.size() + 1;
     const auto proceduralBlasIdx = m_gpuTriangleGeometries.size();
 
     IQueryPool::SCreationParams qParams{ .queryCount = static_cast<uint32_t>(blasCount), .queryType = IQueryPool::ACCELERATION_STRUCTURE_COMPACTED_SIZE };
@@ -1415,8 +1417,9 @@ class RaytracingPipelineApp final : public examples::SimpleWindowedApplication,
         bool isProcedural = i == proceduralBlasIdx;
         if (isProcedural)
         {
-          aabbs.data.buffer = smart_refctd_ptr<IGPUBuffer>(m_proceduralAabbBuffer);
-          aabbs.stride = sizeof(Aabb);
+          aabbs.data.buffer = smart_refctd_ptr(m_proceduralAabbBuffer);
+          aabbs.data.offset = 0;
+          aabbs.stride = sizeof(IGPUBottomLevelAccelerationStructure::AABB_t);
           aabbs.geometryFlags = IGPUBottomLevelAccelerationStructure::GEOMETRY_FLAGS::OPAQUE_BIT; // only allow opaque for now
 
           primitiveCounts[proceduralBlasIdx] = NumberOfProceduralGeometries;

From 1dc682adea052395fc0be30044f5b8aafd2ceb1e Mon Sep 17 00:00:00 2001
From: kevyuu <kevin.kayu@gmail.com>
Date: Sun, 26 Jan 2025 11:45:10 +0700
Subject: [PATCH 018/529] Fix query compact blas size issue

Signed-off-by: kevyuu <kevin.kayu@gmail.com>
---
 71_RayTracingPipeline/main.cpp | 34 +++++++++++++++++-----------------
 1 file changed, 17 insertions(+), 17 deletions(-)

diff --git a/71_RayTracingPipeline/main.cpp b/71_RayTracingPipeline/main.cpp
index 0e76e4c72..22c745635 100644
--- a/71_RayTracingPipeline/main.cpp
+++ b/71_RayTracingPipeline/main.cpp
@@ -287,20 +287,6 @@ class RaytracingPipelineApp final : public examples::SimpleWindowedApplication,
     });
 
 
-    auto assetManager = make_smart_refctd_ptr<nbl::asset::IAssetManager>(smart_refctd_ptr(system));
-    auto* geometryCreator = assetManager->getGeometryCreator();
-
-    // create geometry objects
-    if (!createGeometries(gQueue, geometryCreator))
-      return logFail("Could not create geometries from geometry creator");
-
-    if (!createAccelerationStructures(getComputeQueue()))
-      return logFail("Could not create acceleration structures");
-
-    ISampler::SParams samplerParams = {
-      .AnisotropicFilter = 0
-    };
-    auto defaultSampler = m_device->createSampler(samplerParams);
 
     // ray trace pipeline and descriptor set layout setup
     {
@@ -415,6 +401,21 @@ class RaytracingPipelineApp final : public examples::SimpleWindowedApplication,
 
     }
 
+    auto assetManager = make_smart_refctd_ptr<nbl::asset::IAssetManager>(smart_refctd_ptr(system));
+    auto* geometryCreator = assetManager->getGeometryCreator();
+
+    // create geometry objects
+    if (!createGeometries(gQueue, geometryCreator))
+      return logFail("Could not create geometries from geometry creator");
+
+    if (!createAccelerationStructures(getComputeQueue()))
+      return logFail("Could not create acceleration structures");
+
+    ISampler::SParams samplerParams = {
+      .AnisotropicFilter = 0
+    };
+    auto defaultSampler = m_device->createSampler(samplerParams);
+
     {
       const IGPUDescriptorSetLayout::SBinding bindings[] = {
         {
@@ -1495,7 +1496,6 @@ class RaytracingPipelineApp final : public examples::SimpleWindowedApplication,
         scratchBuffer = createBuffer(params);
       }
 
-      uint32_t queryCount = 0;
       core::vector<IGPUBottomLevelAccelerationStructure::BuildRangeInfo> buildRangeInfos(blasCount);
       core::vector<IGPUBottomLevelAccelerationStructure::BuildRangeInfo*> pRangeInfos(blasCount);
       for (uint32_t i = 0; i < blasCount; i++)
@@ -1536,7 +1536,7 @@ class RaytracingPipelineApp final : public examples::SimpleWindowedApplication,
       for (uint32_t i = 0; i < blasCount; i++)
         ases[i] = m_gpuBlasList[i].get();
       if (!cmdbufBlas->writeAccelerationStructureProperties(std::span(ases), IQueryPool::ACCELERATION_STRUCTURE_COMPACTED_SIZE,
-        queryPool.get(), queryCount++))
+        queryPool.get(), 0))
         return logFail("Failed to write acceleration structure properties!");
 
       cmdbufBlas->endDebugMarker();
@@ -1549,7 +1549,7 @@ class RaytracingPipelineApp final : public examples::SimpleWindowedApplication,
     // compact blas
     {
       core::vector<size_t> asSizes(blasCount);
-      if (!m_device->getQueryPoolResults(queryPool.get(), 0, blasCount, asSizes.data(), sizeof(size_t), IQueryPool::WAIT_BIT))
+      if (!m_device->getQueryPoolResults(queryPool.get(), 0, blasCount, asSizes.data(), sizeof(size_t), bitflag(IQueryPool::WAIT_BIT) | IQueryPool::_64_BIT))
         return logFail("Could not get query pool results for AS sizes");
 
       core::vector<smart_refctd_ptr<IGPUBottomLevelAccelerationStructure>> cleanupBlas(blasCount);

From eab0f70c674f93b86ac4649805ac7baaed8d0af0 Mon Sep 17 00:00:00 2001
From: keptsecret <sorchon@gmail.com>
Date: Mon, 3 Feb 2025 14:52:11 +0700
Subject: [PATCH 019/529] copied ex 30 to new ex 31

---
 31_HLSLPathTracer/CMakeLists.txt              |   37 +
 .../app_resources/glsl/common.glsl            |  811 +++++++++++
 .../app_resources/glsl/litByRectangle.comp    |  182 +++
 .../app_resources/glsl/litBySphere.comp       |   60 +
 .../app_resources/glsl/litByTriangle.comp     |  105 ++
 .../app_resources/hlsl/present.frag.hlsl      |   19 +
 31_HLSLPathTracer/config.json.template        |   28 +
 .../include/nbl/this_example/common.hpp       |   17 +
 31_HLSLPathTracer/main.cpp                    | 1276 +++++++++++++++++
 31_HLSLPathTracer/pipeline.groovy             |   50 +
 CMakeLists.txt                                |    2 +
 11 files changed, 2587 insertions(+)
 create mode 100644 31_HLSLPathTracer/CMakeLists.txt
 create mode 100644 31_HLSLPathTracer/app_resources/glsl/common.glsl
 create mode 100644 31_HLSLPathTracer/app_resources/glsl/litByRectangle.comp
 create mode 100644 31_HLSLPathTracer/app_resources/glsl/litBySphere.comp
 create mode 100644 31_HLSLPathTracer/app_resources/glsl/litByTriangle.comp
 create mode 100644 31_HLSLPathTracer/app_resources/hlsl/present.frag.hlsl
 create mode 100644 31_HLSLPathTracer/config.json.template
 create mode 100644 31_HLSLPathTracer/include/nbl/this_example/common.hpp
 create mode 100644 31_HLSLPathTracer/main.cpp
 create mode 100644 31_HLSLPathTracer/pipeline.groovy

diff --git a/31_HLSLPathTracer/CMakeLists.txt b/31_HLSLPathTracer/CMakeLists.txt
new file mode 100644
index 000000000..07b0fd396
--- /dev/null
+++ b/31_HLSLPathTracer/CMakeLists.txt
@@ -0,0 +1,37 @@
+include(common RESULT_VARIABLE RES)
+if(NOT RES)
+        message(FATAL_ERROR "common.cmake not found. Should be in {repo_root}/cmake directory")
+endif()
+
+if(NBL_BUILD_IMGUI)
+	set(NBL_INCLUDE_SERACH_DIRECTORIES
+		"${CMAKE_CURRENT_SOURCE_DIR}/include"
+	)
+
+	list(APPEND NBL_LIBRARIES 
+		imtestengine
+		"${NBL_EXT_IMGUI_UI_LIB}"
+	)
+
+	nbl_create_executable_project("" "" "${NBL_INCLUDE_SERACH_DIRECTORIES}" "${NBL_LIBRARIES}" "${NBL_EXECUTABLE_PROJECT_CREATION_PCH_TARGET}")
+
+	if(NBL_EMBED_BUILTIN_RESOURCES)
+		set(_BR_TARGET_ ${EXECUTABLE_NAME}_builtinResourceData)
+		set(RESOURCE_DIR "app_resources")
+
+		get_filename_component(_SEARCH_DIRECTORIES_ "${CMAKE_CURRENT_SOURCE_DIR}" ABSOLUTE)
+		get_filename_component(_OUTPUT_DIRECTORY_SOURCE_ "${CMAKE_CURRENT_BINARY_DIR}/src" ABSOLUTE)
+		get_filename_component(_OUTPUT_DIRECTORY_HEADER_ "${CMAKE_CURRENT_BINARY_DIR}/include" ABSOLUTE)
+
+		file(GLOB_RECURSE BUILTIN_RESOURCE_FILES RELATIVE "${CMAKE_CURRENT_SOURCE_DIR}/${RESOURCE_DIR}" "${CMAKE_CURRENT_SOURCE_DIR}/${RESOURCE_DIR}/*")
+		foreach(RES_FILE ${BUILTIN_RESOURCE_FILES})
+			LIST_BUILTIN_RESOURCE(RESOURCES_TO_EMBED "${RES_FILE}")
+		endforeach()
+
+		ADD_CUSTOM_BUILTIN_RESOURCES(${_BR_TARGET_} RESOURCES_TO_EMBED "${_SEARCH_DIRECTORIES_}" "${RESOURCE_DIR}" "nbl::this_example::builtin" "${_OUTPUT_DIRECTORY_HEADER_}" "${_OUTPUT_DIRECTORY_SOURCE_}")
+
+		LINK_BUILTIN_RESOURCES_TO_TARGET(${EXECUTABLE_NAME} ${_BR_TARGET_})
+	endif()
+endif()
+
+
diff --git a/31_HLSLPathTracer/app_resources/glsl/common.glsl b/31_HLSLPathTracer/app_resources/glsl/common.glsl
new file mode 100644
index 000000000..2463f82cf
--- /dev/null
+++ b/31_HLSLPathTracer/app_resources/glsl/common.glsl
@@ -0,0 +1,811 @@
+// Copyright (C) 2018-2020 - DevSH Graphics Programming Sp. z O.O.
+// This file is part of the "Nabla Engine".
+// For conditions of distribution and use, see copyright notice in nabla.h
+
+// firefly and variance reduction techniques
+//#define KILL_DIFFUSE_SPECULAR_PATHS
+//#define VISUALIZE_HIGH_VARIANCE
+
+// debug
+//#define NEE_ONLY
+
+layout(set = 2, binding = 0) uniform sampler2D envMap; 
+layout(set = 2, binding = 1) uniform usamplerBuffer sampleSequence;
+layout(set = 2, binding = 2) uniform usampler2D scramblebuf;
+
+layout(set=0, binding=0, rgba16f) uniform image2D outImage;
+
+#ifndef _NBL_GLSL_WORKGROUP_SIZE_
+#define _NBL_GLSL_WORKGROUP_SIZE_ 32
+layout(local_size_x=_NBL_GLSL_WORKGROUP_SIZE_, local_size_y=_NBL_GLSL_WORKGROUP_SIZE_, local_size_z=1) in;
+#endif
+
+ivec2 getCoordinates() {
+    return ivec2(gl_GlobalInvocationID.xy);
+}
+
+vec2 getTexCoords() {
+    ivec2 imageSize = imageSize(outImage);
+    ivec2 iCoords = getCoordinates();
+    return vec2(float(iCoords.x) / imageSize.x, 1.0 - float(iCoords.y) / imageSize.y);
+}
+
+
+#include <nbl/builtin/glsl/limits/numeric.glsl>
+#include <nbl/builtin/glsl/math/constants.glsl>
+#include <nbl/builtin/glsl/utils/common.glsl>
+
+#include <nbl/builtin/glsl/sampling/box_muller_transform.glsl>
+
+layout(push_constant, row_major) uniform constants
+{
+    mat4 invMVP;
+    int sampleCount;
+    int depth;
+} PTPushConstant;
+
+#define INVALID_ID_16BIT 0xffffu
+struct Sphere
+{
+    vec3 position;
+    float radius2;
+    uint bsdfLightIDs;
+}; 
+
+Sphere Sphere_Sphere(in vec3 position, in float radius, in uint bsdfID, in uint lightID)
+{
+    Sphere sphere;
+    sphere.position = position;
+    sphere.radius2 = radius*radius;
+    sphere.bsdfLightIDs = bitfieldInsert(bsdfID,lightID,16,16);
+    return sphere;
+}
+
+// return intersection distance if found, nbl_glsl_FLT_NAN otherwise
+float Sphere_intersect(in Sphere sphere, in vec3 origin, in vec3 direction)
+{
+    vec3 relOrigin = origin-sphere.position;
+    float relOriginLen2 = dot(relOrigin,relOrigin);
+    const float radius2 = sphere.radius2;
+
+    float dirDotRelOrigin = dot(direction,relOrigin);
+    float det = radius2-relOriginLen2+dirDotRelOrigin*dirDotRelOrigin;
+
+    // do some speculative math here
+    float detsqrt = sqrt(det);
+    return -dirDotRelOrigin+(relOriginLen2>radius2 ? (-detsqrt):detsqrt);
+}
+
+vec3 Sphere_getNormal(in Sphere sphere, in vec3 position)
+{
+    const float radiusRcp = inversesqrt(sphere.radius2);
+    return (position-sphere.position)*radiusRcp;
+}
+
+float Sphere_getSolidAngle_impl(in float cosThetaMax)
+{
+    return 2.0*nbl_glsl_PI*(1.0-cosThetaMax);
+}
+float Sphere_getSolidAngle(in Sphere sphere, in vec3 origin)
+{
+    float cosThetaMax = sqrt(1.0-sphere.radius2/nbl_glsl_lengthSq(sphere.position-origin));
+    return Sphere_getSolidAngle_impl(cosThetaMax);
+}
+
+
+Sphere spheres[SPHERE_COUNT] = {
+    Sphere_Sphere(vec3(0.0,-100.5,-1.0),100.0,0u,INVALID_ID_16BIT),
+    Sphere_Sphere(vec3(2.0,0.0,-1.0),0.5,1u,INVALID_ID_16BIT),
+    Sphere_Sphere(vec3(0.0,0.0,-1.0),0.5,2u,INVALID_ID_16BIT),
+    Sphere_Sphere(vec3(-2.0,0.0,-1.0),0.5,3u,INVALID_ID_16BIT),
+    Sphere_Sphere(vec3(2.0,0.0,1.0),0.5,4u,INVALID_ID_16BIT),
+    Sphere_Sphere(vec3(0.0,0.0,1.0),0.5,4u,INVALID_ID_16BIT),
+    Sphere_Sphere(vec3(-2.0,0.0,1.0),0.5,5u,INVALID_ID_16BIT),
+    Sphere_Sphere(vec3(0.5,1.0,0.5),0.5,6u,INVALID_ID_16BIT)
+#if SPHERE_COUNT>8
+    ,Sphere_Sphere(vec3(-1.5,1.5,0.0),0.3,INVALID_ID_16BIT,0u)
+#endif
+};
+
+
+struct Triangle
+{
+    vec3 vertex0;
+    uint bsdfLightIDs;
+    vec3 vertex1;
+    uint padding0;
+    vec3 vertex2;
+    uint padding1;
+};
+
+Triangle Triangle_Triangle(in mat3 vertices, in uint bsdfID, in uint lightID)
+{
+    Triangle tri;
+    tri.vertex0 = vertices[0];
+    tri.vertex1 = vertices[1];
+    tri.vertex2 = vertices[2];
+    //
+    tri.bsdfLightIDs = bitfieldInsert(bsdfID, lightID, 16, 16);
+    return tri;
+}
+
+// return intersection distance if found, nbl_glsl_FLT_NAN otherwise
+float Triangle_intersect(in Triangle tri, in vec3 origin, in vec3 direction)
+{
+    const vec3 edges[2] = vec3[2](tri.vertex1-tri.vertex0,tri.vertex2-tri.vertex0);
+
+    const vec3 h = cross(direction,edges[1]);
+    const float a = dot(edges[0],h);
+
+    const vec3 relOrigin = origin-tri.vertex0;
+
+    const float u = dot(relOrigin,h)/a;
+
+    const vec3 q = cross(relOrigin,edges[0]);
+    const float v = dot(direction,q)/a;
+
+    const float t = dot(edges[1],q)/a;
+
+    return t>0.f&&u>=0.f&&v>=0.f&&(u+v)<=1.f ? t:nbl_glsl_FLT_NAN;
+}
+
+vec3 Triangle_getNormalTimesArea_impl(in mat2x3 edges)
+{
+    return cross(edges[0],edges[1])*0.5;
+}
+vec3 Triangle_getNormalTimesArea(in Triangle tri)
+{
+    return Triangle_getNormalTimesArea_impl(mat2x3(tri.vertex1-tri.vertex0,tri.vertex2-tri.vertex0));
+}
+
+
+
+struct Rectangle
+{
+    vec3 offset;
+    uint bsdfLightIDs;
+    vec3 edge0;
+    uint padding0;
+    vec3 edge1;
+    uint padding1;
+};
+
+Rectangle Rectangle_Rectangle(in vec3 offset, in vec3 edge0, in vec3 edge1, in uint bsdfID, in uint lightID)
+{
+    Rectangle rect;
+    rect.offset = offset;
+    rect.edge0 = edge0;
+    rect.edge1 = edge1;
+    //
+    rect.bsdfLightIDs = bitfieldInsert(bsdfID, lightID, 16, 16);
+    return rect;
+}
+
+void Rectangle_getNormalBasis(in Rectangle rect, out mat3 basis, out vec2 extents)
+{
+    extents = vec2(length(rect.edge0), length(rect.edge1));
+    basis[0] = rect.edge0/extents[0];
+    basis[1] = rect.edge1/extents[1];
+    basis[2] = normalize(cross(basis[0],basis[1]));
+}        
+
+// return intersection distance if found, nbl_glsl_FLT_NAN otherwise
+float Rectangle_intersect(in Rectangle rect, in vec3 origin, in vec3 direction)
+{
+    const vec3 h = cross(direction,rect.edge1);
+    const float a = dot(rect.edge0,h);
+
+    const vec3 relOrigin = origin-rect.offset;
+
+    const float u = dot(relOrigin,h)/a;
+
+    const vec3 q = cross(relOrigin,rect.edge0);
+    const float v = dot(direction,q)/a;
+
+    const float t = dot(rect.edge1,q)/a;
+
+    const bool intersection = t>0.f&&u>=0.f&&v>=0.f&&u<=1.f&&v<=1.f;
+    return intersection ? t:nbl_glsl_FLT_NAN;
+}
+
+vec3 Rectangle_getNormalTimesArea(in Rectangle rect)
+{
+    return cross(rect.edge0,rect.edge1);
+}
+
+
+
+#define DIFFUSE_OP 0u
+#define CONDUCTOR_OP 1u
+#define DIELECTRIC_OP 2u
+#define OP_BITS_OFFSET 0
+#define OP_BITS_SIZE 2
+struct BSDFNode
+{ 
+    uvec4 data[2];
+};
+
+uint BSDFNode_getType(in BSDFNode node)
+{
+    return bitfieldExtract(node.data[0].w,OP_BITS_OFFSET,OP_BITS_SIZE);
+}
+bool BSDFNode_isBSDF(in BSDFNode node)
+{
+    return BSDFNode_getType(node)==DIELECTRIC_OP;
+}
+bool BSDFNode_isNotDiffuse(in BSDFNode node)
+{
+    return BSDFNode_getType(node)!=DIFFUSE_OP;
+}
+float BSDFNode_getRoughness(in BSDFNode node)
+{
+    return uintBitsToFloat(node.data[1].w);
+}
+vec3 BSDFNode_getRealEta(in BSDFNode node)
+{
+    return uintBitsToFloat(node.data[0].rgb);
+}
+vec3 BSDFNode_getImaginaryEta(in BSDFNode node)
+{
+    return uintBitsToFloat(node.data[1].rgb);
+}
+mat2x3 BSDFNode_getEta(in BSDFNode node)
+{
+    return mat2x3(BSDFNode_getRealEta(node),BSDFNode_getImaginaryEta(node));
+}
+#include <nbl/builtin/glsl/bxdf/fresnel.glsl>
+vec3 BSDFNode_getReflectance(in BSDFNode node, in float VdotH)
+{
+    const vec3 albedoOrRealIoR = uintBitsToFloat(node.data[0].rgb);
+    if (BSDFNode_isNotDiffuse(node))
+        return nbl_glsl_fresnel_conductor(albedoOrRealIoR, BSDFNode_getImaginaryEta(node), VdotH);
+    else
+        return albedoOrRealIoR;
+}
+
+float BSDFNode_getNEEProb(in BSDFNode bsdf)
+{
+    const float alpha = BSDFNode_isNotDiffuse(bsdf) ? BSDFNode_getRoughness(bsdf):1.0;
+    return min(8.0*alpha,1.0);
+}
+
+#include <nbl/builtin/glsl/colorspace/EOTF.glsl>
+#include <nbl/builtin/glsl/colorspace/encodeCIEXYZ.glsl>
+float getLuma(in vec3 col)
+{
+    return dot(transpose(nbl_glsl_scRGBtoXYZ)[1],col);
+}
+
+#define BSDF_COUNT 7
+BSDFNode bsdfs[BSDF_COUNT] = {
+    {{uvec4(floatBitsToUint(vec3(0.8,0.8,0.8)),DIFFUSE_OP),floatBitsToUint(vec4(0.0,0.0,0.0,0.0))}},
+    {{uvec4(floatBitsToUint(vec3(0.8,0.4,0.4)),DIFFUSE_OP),floatBitsToUint(vec4(0.0,0.0,0.0,0.0))}},
+    {{uvec4(floatBitsToUint(vec3(0.4,0.8,0.4)),DIFFUSE_OP),floatBitsToUint(vec4(0.0,0.0,0.0,0.0))}},
+    {{uvec4(floatBitsToUint(vec3(1.02,1.02,1.3)),CONDUCTOR_OP),floatBitsToUint(vec4(1.0,1.0,2.0,0.0))}},
+    {{uvec4(floatBitsToUint(vec3(1.02,1.3,1.02)),CONDUCTOR_OP),floatBitsToUint(vec4(1.0,2.0,1.0,0.0))}},
+    {{uvec4(floatBitsToUint(vec3(1.02,1.3,1.02)),CONDUCTOR_OP),floatBitsToUint(vec4(1.0,2.0,1.0,0.15))}},
+    {{uvec4(floatBitsToUint(vec3(1.4,1.45,1.5)),DIELECTRIC_OP),floatBitsToUint(vec4(0.0,0.0,0.0,0.0625))}}
+};
+
+
+struct Light
+{
+    vec3 radiance;
+    uint objectID;
+};
+
+vec3 Light_getRadiance(in Light light)
+{
+    return light.radiance;
+}
+uint Light_getObjectID(in Light light)
+{
+    return light.objectID;
+}
+
+
+#define LIGHT_COUNT 1
+float scene_getLightChoicePdf(in Light light)
+{
+    return 1.0/float(LIGHT_COUNT);
+}
+
+
+#define LIGHT_COUNT 1
+Light lights[LIGHT_COUNT] =
+{
+    {
+        vec3(30.0,25.0,15.0),
+#ifdef POLYGON_METHOD
+        0u
+#else
+        8u
+#endif
+    }
+};
+
+
+
+#define ANY_HIT_FLAG (-2147483648)
+#define DEPTH_BITS_COUNT 8
+#define DEPTH_BITS_OFFSET (31-DEPTH_BITS_COUNT)
+struct ImmutableRay_t
+{
+    vec3 origin;
+    vec3 direction;
+#if POLYGON_METHOD==2
+    vec3 normalAtOrigin;
+    bool wasBSDFAtOrigin;
+#endif
+};
+struct MutableRay_t
+{
+    float intersectionT;
+    uint objectID;
+    /* irrelevant here
+    uint triangleID;
+    vec2 barycentrics;
+    */
+};
+struct Payload_t
+{
+    vec3 accumulation;
+    float otherTechniqueHeuristic;
+    vec3 throughput;
+    #ifdef KILL_DIFFUSE_SPECULAR_PATHS
+    bool hasDiffuse;
+    #endif
+};
+
+struct Ray_t
+{
+    ImmutableRay_t _immutable;
+    MutableRay_t _mutable;
+    Payload_t _payload;
+};
+
+
+#define INTERSECTION_ERROR_BOUND_LOG2 (-8.0)
+float getTolerance_common(in uint depth)
+{
+    float depthRcp = 1.0/float(depth);
+    return INTERSECTION_ERROR_BOUND_LOG2;// *depthRcp*depthRcp;
+}
+float getStartTolerance(in uint depth)
+{
+    return exp2(getTolerance_common(depth));
+}
+float getEndTolerance(in uint depth)
+{
+    return 1.0-exp2(getTolerance_common(depth)+1.0);
+}
+
+
+vec2 SampleSphericalMap(vec3 v)
+{
+    vec2 uv = vec2(atan(v.z, v.x), asin(v.y));
+    uv *= nbl_glsl_RECIPROCAL_PI*0.5;
+    uv += 0.5; 
+    return uv;
+}
+
+void missProgram(in ImmutableRay_t _immutable, inout Payload_t _payload)
+{
+    vec3 finalContribution = _payload.throughput; 
+    // #define USE_ENVMAP
+#ifdef USE_ENVMAP
+	vec2 uv = SampleSphericalMap(_immutable.direction);
+    finalContribution *= textureLod(envMap, uv, 0.0).rgb;
+#else
+    const vec3 kConstantEnvLightRadiance = vec3(0.15, 0.21, 0.3);
+    finalContribution *= kConstantEnvLightRadiance;
+    _payload.accumulation += finalContribution;
+#endif
+}
+
+#include <nbl/builtin/glsl/bxdf/brdf/diffuse/oren_nayar.glsl>
+#include <nbl/builtin/glsl/bxdf/brdf/specular/beckmann.glsl>
+#include <nbl/builtin/glsl/bxdf/brdf/specular/ggx.glsl>
+#include <nbl/builtin/glsl/bxdf/bsdf/diffuse/lambert.glsl>
+#include <nbl/builtin/glsl/bxdf/bsdf/specular/dielectric.glsl>
+#include <nbl/builtin/glsl/bxdf/bsdf/specular/beckmann.glsl>
+#include <nbl/builtin/glsl/bxdf/bsdf/specular/ggx.glsl>
+nbl_glsl_LightSample nbl_glsl_bsdf_cos_generate(in nbl_glsl_AnisotropicViewSurfaceInteraction interaction, in vec3 u, in BSDFNode bsdf, in float monochromeEta, out nbl_glsl_AnisotropicMicrofacetCache _cache)
+{
+    const float a = BSDFNode_getRoughness(bsdf);
+    const mat2x3 ior = BSDFNode_getEta(bsdf);
+    
+    // fresnel stuff for dielectrics
+    float orientedEta, rcpOrientedEta;
+    const bool viewerInsideMedium = nbl_glsl_getOrientedEtas(orientedEta,rcpOrientedEta,interaction.isotropic.NdotV,monochromeEta);
+
+    nbl_glsl_LightSample smpl;
+    nbl_glsl_AnisotropicMicrofacetCache dummy;
+    switch (BSDFNode_getType(bsdf))
+    {
+        case DIFFUSE_OP:
+            smpl = nbl_glsl_oren_nayar_cos_generate(interaction,u.xy,a*a);
+            break;
+        case CONDUCTOR_OP:
+            smpl = nbl_glsl_ggx_cos_generate(interaction,u.xy,a,a,_cache);
+            break;
+        default:
+            smpl = nbl_glsl_ggx_dielectric_cos_generate(interaction,u,a,a,monochromeEta,_cache);
+            break;
+    }
+    return smpl;
+}
+
+vec3 nbl_glsl_bsdf_cos_remainder_and_pdf(out float pdf, in nbl_glsl_LightSample _sample, in nbl_glsl_AnisotropicViewSurfaceInteraction interaction, in BSDFNode bsdf, in float monochromeEta, in nbl_glsl_AnisotropicMicrofacetCache _cache)
+{
+    // are V and L on opposite sides of the surface?
+    const bool transmitted = nbl_glsl_isTransmissionPath(interaction.isotropic.NdotV,_sample.NdotL);
+
+    // is the BSDF or BRDF, if it is then we make the dot products `abs` before `max(,0.0)`
+    const bool transmissive = BSDFNode_isBSDF(bsdf);
+    const float clampedNdotL = nbl_glsl_conditionalAbsOrMax(transmissive,_sample.NdotL,0.0);
+    const float clampedNdotV = nbl_glsl_conditionalAbsOrMax(transmissive,interaction.isotropic.NdotV,0.0);
+
+    vec3 remainder;
+
+    const float minimumProjVectorLen = 0.00000001;
+    if (clampedNdotV>minimumProjVectorLen && clampedNdotL>minimumProjVectorLen)
+    {
+        // fresnel stuff for conductors (but reflectance also doubles as albedo)
+        const mat2x3 ior = BSDFNode_getEta(bsdf);
+        const vec3 reflectance = BSDFNode_getReflectance(bsdf,_cache.isotropic.VdotH);
+
+        // fresnel stuff for dielectrics
+        float orientedEta, rcpOrientedEta;
+        const bool viewerInsideMedium = nbl_glsl_getOrientedEtas(orientedEta,rcpOrientedEta,interaction.isotropic.NdotV,monochromeEta);
+
+        //
+        const float VdotL = dot(interaction.isotropic.V.dir,_sample.L);
+
+        //
+        const float a = max(BSDFNode_getRoughness(bsdf),0.0001); // TODO: @Crisspl 0-roughness still doesn't work! Also Beckmann has a weird dark rim instead as fresnel!?
+        const float a2 = a*a;
+
+        // TODO: refactor into Material Compiler-esque thing
+        switch (BSDFNode_getType(bsdf))
+        {
+            case DIFFUSE_OP:
+                remainder = reflectance*nbl_glsl_oren_nayar_cos_remainder_and_pdf_wo_clamps(pdf,a*a,VdotL,clampedNdotL,clampedNdotV);
+                break;
+            case CONDUCTOR_OP:
+                remainder = nbl_glsl_ggx_cos_remainder_and_pdf_wo_clamps(pdf,nbl_glsl_ggx_trowbridge_reitz(a2,_cache.isotropic.NdotH2),clampedNdotL,_sample.NdotL2,clampedNdotV,interaction.isotropic.NdotV_squared,reflectance,a2);
+                break;
+            default:
+                remainder = vec3(nbl_glsl_ggx_dielectric_cos_remainder_and_pdf(pdf, _sample, interaction.isotropic, _cache.isotropic, monochromeEta, a*a));
+                break;
+        }
+    }
+    else
+        remainder = vec3(0.0);
+    return remainder;
+}
+
+layout (constant_id = 0) const int MAX_DEPTH_LOG2 = 4;
+layout (constant_id = 1) const int MAX_SAMPLES_LOG2 = 10;
+
+
+#include <nbl/builtin/glsl/random/xoroshiro.glsl>
+
+mat2x3 rand3d(in uint protoDimension, in uint _sample, inout nbl_glsl_xoroshiro64star_state_t scramble_state)
+{
+    mat2x3 retval;
+    uint address = bitfieldInsert(protoDimension,_sample,MAX_DEPTH_LOG2,MAX_SAMPLES_LOG2);
+    for (int i=0; i<2u; i++)
+    {
+	    uvec3 seqVal = texelFetch(sampleSequence,int(address)+i).xyz;
+	    seqVal ^= uvec3(nbl_glsl_xoroshiro64star(scramble_state),nbl_glsl_xoroshiro64star(scramble_state),nbl_glsl_xoroshiro64star(scramble_state));
+        retval[i] = vec3(seqVal)*uintBitsToFloat(0x2f800004u);
+    }
+    return retval;
+}
+
+
+void traceRay_extraShape(inout int objectID, inout float intersectionT, in vec3 origin, in vec3 direction);
+int traceRay(inout float intersectionT, in vec3 origin, in vec3 direction)
+{
+    const bool anyHit = intersectionT!=nbl_glsl_FLT_MAX;
+
+	int objectID = -1;
+	for (int i=0; i<SPHERE_COUNT; i++)
+    {
+        float t = Sphere_intersect(spheres[i],origin,direction);
+        bool closerIntersection = t>0.0 && t<intersectionT;
+
+        intersectionT = closerIntersection ? t : intersectionT;
+		objectID = closerIntersection ? i:objectID;
+        
+        // allowing early out results in a performance regression, WTF!?
+        //if (anyHit && closerIntersection)
+           //break;
+    }
+    traceRay_extraShape(objectID,intersectionT,origin,direction);
+    return objectID;
+}
+
+//
+float nbl_glsl_light_deferred_pdf(in Light light, in Ray_t ray);
+vec3 nbl_glsl_light_deferred_eval_and_prob(out float pdf, in Light light, in Ray_t ray)
+{
+    // we don't have to worry about solid angle of the light w.r.t. surface of the light because this function only ever gets called from closestHit routine, so such ray cannot be produced (because lights have no BSDFs here)
+    pdf = scene_getLightChoicePdf(light);
+    pdf *= nbl_glsl_light_deferred_pdf(light,ray);
+    return Light_getRadiance(light);
+}
+
+vec3 nbl_glsl_light_generate_and_pdf(out float pdf, out float newRayMaxT, in vec3 origin, in nbl_glsl_AnisotropicViewSurfaceInteraction interaction, in bool isBSDF, in vec3 xi, in uint objectID);
+nbl_glsl_LightSample nbl_glsl_light_generate_and_remainder_and_pdf(out vec3 remainder, out float pdf, out float newRayMaxT, in vec3 origin, in nbl_glsl_AnisotropicViewSurfaceInteraction interaction, in bool isBSDF, in vec3 xi, in uint depth)
+{
+    // normally we'd pick from set of lights, using `xi.z`
+    const Light light = lights[0];
+    
+    vec3 L = nbl_glsl_light_generate_and_pdf(pdf,newRayMaxT,origin,interaction,isBSDF,xi,Light_getObjectID(light));
+
+    newRayMaxT *= getEndTolerance(depth);
+    pdf *= scene_getLightChoicePdf(light);
+    remainder = Light_getRadiance(light)/pdf;
+    return nbl_glsl_createLightSample(L,interaction);
+}
+
+uint getBSDFLightIDAndDetermineNormal(out vec3 normal, in uint objectID, in vec3 intersection);
+bool closestHitProgram(in uint depth, in uint _sample, inout Ray_t ray, inout nbl_glsl_xoroshiro64star_state_t scramble_state)
+{
+    const MutableRay_t _mutable = ray._mutable;
+    const uint objectID = _mutable.objectID;
+
+    // interaction stuffs
+    const ImmutableRay_t _immutable = ray._immutable;
+    const vec3 intersection = _immutable.origin+_immutable.direction*_mutable.intersectionT;
+
+    uint bsdfLightIDs;
+    nbl_glsl_AnisotropicViewSurfaceInteraction interaction;
+    {
+        nbl_glsl_IsotropicViewSurfaceInteraction isotropic;
+        bsdfLightIDs = getBSDFLightIDAndDetermineNormal(isotropic.N,objectID,intersection);
+
+        isotropic.V.dir = -_immutable.direction;
+        isotropic.NdotV = dot(isotropic.V.dir,isotropic.N);
+        isotropic.NdotV_squared = isotropic.NdotV*isotropic.NdotV;
+
+        interaction = nbl_glsl_calcAnisotropicInteraction(isotropic);
+    }
+
+    //
+    vec3 throughput = ray._payload.throughput;
+
+    // add emissive and finish MIS
+    const uint lightID = bitfieldExtract(bsdfLightIDs,16,16);
+    if (lightID != INVALID_ID_16BIT) // has emissive
+    {
+        float lightPdf;
+        ray._payload.accumulation += nbl_glsl_light_deferred_eval_and_prob(lightPdf,lights[lightID],ray)*throughput/(1.0+lightPdf*lightPdf*ray._payload.otherTechniqueHeuristic);
+    }
+
+    // check if we even have a BSDF at all
+    uint bsdfID = bitfieldExtract(bsdfLightIDs, 0, 16);
+    if (bsdfID != INVALID_ID_16BIT)
+    {
+        BSDFNode bsdf = bsdfs[bsdfID];
+#ifdef KILL_DIFFUSE_SPECULAR_PATHS
+        if (BSDFNode_isNotDiffuse(bsdf))
+        {
+            if (ray._payload.hasDiffuse)
+                return true;
+        }
+        else
+            ray._payload.hasDiffuse = true;
+#endif
+
+        const bool isBSDF = BSDFNode_isBSDF(bsdf);
+        //rand
+        mat2x3 epsilon = rand3d(depth,_sample,scramble_state);
+
+        // thresholds
+        const float bsdfPdfThreshold = 0.0001;
+        const float lumaContributionThreshold = getLuma(nbl_glsl_eotf_sRGB(vec3(1.0)/255.0)); // OETF smallest perceptible value
+        const vec3 throughputCIE_Y = transpose(nbl_glsl_sRGBtoXYZ)[1]*throughput;
+        const float monochromeEta = dot(throughputCIE_Y,BSDFNode_getEta(bsdf)[0])/(throughputCIE_Y.r+throughputCIE_Y.g+throughputCIE_Y.b);
+
+        // do NEE
+        const float neeProbability = 1.0;// BSDFNode_getNEEProb(bsdf);
+        float rcpChoiceProb;
+        if (!nbl_glsl_partitionRandVariable(neeProbability,epsilon[0].z,rcpChoiceProb) && depth<2u)
+        {
+            vec3 neeContrib; float lightPdf, t;
+            nbl_glsl_LightSample nee_sample = nbl_glsl_light_generate_and_remainder_and_pdf(
+                neeContrib, lightPdf, t,
+                intersection, interaction,
+                isBSDF, epsilon[0], depth
+            );
+            // We don't allow non watertight transmitters in this renderer
+            bool validPath = nee_sample.NdotL>nbl_glsl_FLT_MIN;
+            // but if we allowed non-watertight transmitters (single water surface), it would make sense just to apply this line by itself
+            nbl_glsl_AnisotropicMicrofacetCache _cache;
+            validPath = validPath && nbl_glsl_calcAnisotropicMicrofacetCache(_cache, interaction, nee_sample, monochromeEta);
+            if (lightPdf<nbl_glsl_FLT_MAX)
+            {
+            if (any(isnan(nee_sample.L)))
+                ray._payload.accumulation += vec3(1000.f,0.f,0.f);
+            else
+            if (all(equal(vec3(69.f),nee_sample.L)))
+                ray._payload.accumulation += vec3(0.f,1000.f,0.f);
+            else
+            if (validPath)
+            {
+                float bsdfPdf;
+                neeContrib *= nbl_glsl_bsdf_cos_remainder_and_pdf(bsdfPdf,nee_sample,interaction,bsdf,monochromeEta,_cache)*throughput;
+                const float otherGenOverChoice = bsdfPdf*rcpChoiceProb;
+#ifndef NEE_ONLY
+                const float otherGenOverLightAndChoice = otherGenOverChoice/lightPdf;
+                neeContrib *= otherGenOverChoice/(1.f+otherGenOverLightAndChoice*otherGenOverLightAndChoice); // MIS weight
+#else
+                neeContrib *= otherGenOverChoice;
+#endif
+                if (bsdfPdf<nbl_glsl_FLT_MAX && getLuma(neeContrib)>lumaContributionThreshold && traceRay(t,intersection+nee_sample.L*t*getStartTolerance(depth),nee_sample.L)==-1)
+                    ray._payload.accumulation += neeContrib;
+            }}
+        }
+#if NEE_ONLY
+        return false;
+#endif
+        // sample BSDF
+        float bsdfPdf; vec3 bsdfSampleL;
+        {
+            nbl_glsl_AnisotropicMicrofacetCache _cache;
+            nbl_glsl_LightSample bsdf_sample = nbl_glsl_bsdf_cos_generate(interaction,epsilon[1],bsdf,monochromeEta,_cache);
+            // the value of the bsdf divided by the probability of the sample being generated
+            throughput *= nbl_glsl_bsdf_cos_remainder_and_pdf(bsdfPdf,bsdf_sample,interaction,bsdf,monochromeEta,_cache);
+            //
+            bsdfSampleL = bsdf_sample.L;
+        }
+        
+        // additional threshold
+        const float lumaThroughputThreshold = lumaContributionThreshold;
+        if (bsdfPdf>bsdfPdfThreshold && getLuma(throughput)>lumaThroughputThreshold)
+        {
+            ray._payload.throughput = throughput;
+            ray._payload.otherTechniqueHeuristic = neeProbability/bsdfPdf; // numerically stable, don't touch
+            ray._payload.otherTechniqueHeuristic *= ray._payload.otherTechniqueHeuristic;
+                    
+            // trace new ray
+            ray._immutable.origin = intersection+bsdfSampleL*(1.0/*kSceneSize*/)*getStartTolerance(depth);
+            ray._immutable.direction = bsdfSampleL;
+            #if POLYGON_METHOD==2
+            ray._immutable.normalAtOrigin = interaction.isotropic.N;
+            ray._immutable.wasBSDFAtOrigin = isBSDF;
+            #endif
+            return true;
+        }
+    }
+    return false;
+}
+
+void main()
+{
+    const ivec2 imageExtents = imageSize(outImage);
+    const ivec2 coords = getCoordinates();
+    vec2 texCoord = vec2(coords) / vec2(imageExtents);
+    texCoord.y = 1.0 - texCoord.y;
+
+    if (false == (all(lessThanEqual(ivec2(0),coords)) && all(greaterThan(imageExtents,coords)))) {
+        return;
+    }
+
+    if (((PTPushConstant.depth-1)>>MAX_DEPTH_LOG2)>0 || ((PTPushConstant.sampleCount-1)>>MAX_SAMPLES_LOG2)>0)
+    {
+        vec4 pixelCol = vec4(1.0,0.0,0.0,1.0);
+        imageStore(outImage, coords, pixelCol);
+        return;
+    }
+
+	nbl_glsl_xoroshiro64star_state_t scramble_start_state = texelFetch(scramblebuf,coords,0).rg;
+    const vec2 pixOffsetParam = vec2(1.0)/vec2(textureSize(scramblebuf,0));
+
+
+    const mat4 invMVP = PTPushConstant.invMVP;
+    
+    vec4 NDC = vec4(texCoord*vec2(2.0,-2.0)+vec2(-1.0,1.0),0.0,1.0);
+    vec3 camPos;
+    {
+        vec4 tmp = invMVP*NDC;
+        camPos = tmp.xyz/tmp.w;
+        NDC.z = 1.0;
+    }
+
+    vec3 color = vec3(0.0);
+    float meanLumaSquared = 0.0;
+    // TODO: if we collapse the nested for loop, then all GPUs will get `PTPushConstant.depth` factor speedup, not just NV with separate PC
+    for (int i=0; i<PTPushConstant.sampleCount; i++)
+    {
+        nbl_glsl_xoroshiro64star_state_t scramble_state = scramble_start_state;
+
+        Ray_t ray;
+        // raygen
+        {
+            ray._immutable.origin = camPos;
+
+            vec4 tmp = NDC;
+            // apply stochastic reconstruction filter
+            const float gaussianFilterCutoff = 2.5;
+            const float truncation = exp(-0.5*gaussianFilterCutoff*gaussianFilterCutoff);
+            vec2 remappedRand = rand3d(0u,i,scramble_state)[0].xy;
+            remappedRand.x *= 1.0-truncation;
+            remappedRand.x += truncation;
+            tmp.xy += pixOffsetParam*nbl_glsl_BoxMullerTransform(remappedRand,1.5);
+            // for depth of field we could do another stochastic point-pick
+            tmp = invMVP*tmp;
+            ray._immutable.direction = normalize(tmp.xyz/tmp.w-camPos);
+
+            #if POLYGON_METHOD==2
+                ray._immutable.normalAtOrigin = vec3(0.0,0.0,0.0);
+                ray._immutable.wasBSDFAtOrigin = false;
+            #endif
+
+            ray._payload.accumulation = vec3(0.0);
+            ray._payload.otherTechniqueHeuristic = 0.0; // needed for direct eye-light paths
+            ray._payload.throughput = vec3(1.0);
+            #ifdef KILL_DIFFUSE_SPECULAR_PATHS
+            ray._payload.hasDiffuse = false;
+            #endif
+        }
+
+        // bounces
+        {
+            bool hit = true; bool rayAlive = true;
+            for (int d=1; d<=PTPushConstant.depth && hit && rayAlive; d+=2)
+            {
+                ray._mutable.intersectionT = nbl_glsl_FLT_MAX;
+                ray._mutable.objectID = traceRay(ray._mutable.intersectionT,ray._immutable.origin,ray._immutable.direction);
+                hit = ray._mutable.objectID!=-1;
+                if (hit)
+                    rayAlive = closestHitProgram(d, i, ray, scramble_state);
+            }
+            // was last trace a miss?
+            if (!hit)
+                missProgram(ray._immutable,ray._payload);
+        }
+
+        vec3 accumulation = ray._payload.accumulation;
+
+        float rcpSampleSize = 1.0/float(i+1);
+        color += (accumulation-color)*rcpSampleSize;
+        
+        #ifdef VISUALIZE_HIGH_VARIANCE
+            float luma = getLuma(accumulation);
+            meanLumaSquared += (luma*luma-meanLumaSquared)*rcpSampleSize;
+        #endif
+    }
+
+    #ifdef VISUALIZE_HIGH_VARIANCE
+        float variance = getLuma(color);
+        variance *= variance;
+        variance = meanLumaSquared-variance;
+        if (variance>5.0)
+            color = vec3(1.0,0.0,0.0);
+    #endif
+
+    vec4 pixelCol = vec4(color, 1.0);
+    imageStore(outImage, coords, pixelCol);
+}
+/** TODO: Improving Rendering
+
+Now:
+- Always MIS (path correlated reuse)
+- Test MIS alpha (roughness) scheme
+
+Many Lights:
+- Path Guiding
+- Light Importance Lists/Classification
+- Spatio-Temporal Reservoir Sampling
+
+Indirect Light:
+- Bidirectional Path Tracing
+- Uniform Path Sampling / Vertex Connection and Merging / Path Space Regularization
+
+Animations:
+- A-SVGF / BMFR
+**/
\ No newline at end of file
diff --git a/31_HLSLPathTracer/app_resources/glsl/litByRectangle.comp b/31_HLSLPathTracer/app_resources/glsl/litByRectangle.comp
new file mode 100644
index 000000000..300cef559
--- /dev/null
+++ b/31_HLSLPathTracer/app_resources/glsl/litByRectangle.comp
@@ -0,0 +1,182 @@
+// Copyright (C) 2018-2020 - DevSH Graphics Programming Sp. z O.O.
+// This file is part of the "Nabla Engine".
+// For conditions of distribution and use, see copyright notice in nabla.h
+
+#version 430 core
+#extension GL_GOOGLE_include_directive : require
+
+#define SPHERE_COUNT 8
+#define POLYGON_METHOD 1 // 0 area sampling, 1 solid angle sampling, 2 approximate projected solid angle sampling
+#include "common.glsl"
+
+#define RECTANGLE_COUNT 1
+const vec3 edge0 = normalize(vec3(2,0,-1));
+const vec3 edge1 = normalize(vec3(2,-5,4));
+Rectangle rectangles[RECTANGLE_COUNT] = {
+    Rectangle_Rectangle(vec3(-3.8,0.35,1.3),edge0*7.0,edge1*0.1,INVALID_ID_16BIT,0u)
+};
+
+
+void traceRay_extraShape(inout int objectID, inout float intersectionT, in vec3 origin, in vec3 direction)
+{
+	for (int i=0; i<RECTANGLE_COUNT; i++)
+    {
+        float t = Rectangle_intersect(rectangles[i],origin,direction);
+        bool closerIntersection = t>0.0 && t<intersectionT;
+
+		objectID = closerIntersection ? (i+SPHERE_COUNT):objectID;
+        intersectionT = closerIntersection ? t:intersectionT;
+    }
+}
+
+#include <nbl/builtin/glsl/sampling/projected_spherical_triangle.glsl>
+#include <nbl/builtin/glsl/barycentric/utils.glsl>
+#include <nbl/builtin/glsl/sampling/spherical_rectangle.glsl>
+
+float nbl_glsl_light_deferred_pdf(in Light light, in Ray_t ray)
+{
+    const Rectangle rect = rectangles[Light_getObjectID(light)];
+    
+    const ImmutableRay_t _immutable = ray._immutable;
+    const vec3 L = _immutable.direction;
+#if POLYGON_METHOD==0
+    const float dist = ray._mutable.intersectionT;
+    return dist*dist/abs(dot(Rectangle_getNormalTimesArea(rect),L));
+#else
+    #ifdef TRIANGLE_REFERENCE
+        const mat3 sphericalVertices[2] = 
+        {
+            nbl_glsl_shapes_getSphericalTriangle(mat3(rect.offset,rect.offset+rect.edge0,rect.offset+rect.edge1),_immutable.origin),
+            nbl_glsl_shapes_getSphericalTriangle(mat3(rect.offset+rect.edge1,rect.offset+rect.edge0,rect.offset+rect.edge0+rect.edge1),_immutable.origin)
+        };
+        float solidAngle[2];
+        vec3 cos_vertices[2],sin_vertices[2];
+        float cos_a[2],cos_c[2],csc_b[2],csc_c[2];
+        for (uint i=0u; i<2u; i++)
+            solidAngle[i] = nbl_glsl_shapes_SolidAngleOfTriangle(sphericalVertices[i],cos_vertices[i],sin_vertices[i],cos_a[i],cos_c[i],csc_b[i],csc_c[i]);
+        const float rectSolidAngle = solidAngle[0]+solidAngle[1];
+        #if POLYGON_METHOD==1
+            return 1.f/rectSolidAngle;
+        #elif POLYGON_METHOD==2
+            // TODO: figure out what breaks for a directly visible light under MIS
+            if (rectSolidAngle > nbl_glsl_FLT_MIN)
+            {
+                const vec2 bary = nbl_glsl_barycentric_reconstructBarycentrics(L*ray._mutable.intersectionT+_immutable.origin-rect.offset,mat2x3(rect.edge0,rect.edge1));
+                const uint i = bary.x>=0.f&&bary.y>=0.f&&(bary.x+bary.y)<=1.f ? 0u:1u;
+
+                float pdf = nbl_glsl_sampling_probProjectedSphericalTriangleSample(solidAngle[i],cos_vertices[i],sin_vertices[i],cos_a[i],cos_c[i],csc_b[i],csc_c[i],sphericalVertices[i],_immutable.normalAtOrigin,_immutable.wasBSDFAtOrigin,L);
+                pdf *= solidAngle[i]/rectSolidAngle;
+                return pdf;
+            }
+            else
+                return nbl_glsl_FLT_INF;
+        #endif
+    #else
+        float pdf;
+        mat3 rectNormalBasis;
+        vec2 rectExtents;
+        Rectangle_getNormalBasis(rect, rectNormalBasis, rectExtents);
+        vec3 sphR0 = nbl_glsl_shapes_getSphericalRectangle(_immutable.origin, rect.offset, rectNormalBasis);
+        float solidAngle = nbl_glsl_shapes_SolidAngleOfRectangle(sphR0, rectExtents);
+        if (solidAngle > nbl_glsl_FLT_MIN)
+        {
+            #if POLYGON_METHOD==1
+            pdf = 1.f/solidAngle;
+            #else
+                #error
+            #endif  
+        }
+        else
+            pdf = nbl_glsl_FLT_INF;
+        return pdf;
+    #endif
+#endif
+}
+
+vec3 nbl_glsl_light_generate_and_pdf(out float pdf, out float newRayMaxT, in vec3 origin, in nbl_glsl_AnisotropicViewSurfaceInteraction interaction, in bool isBSDF, in vec3 xi, in uint objectID)
+{
+    const Rectangle rect = rectangles[objectID];
+    const vec3 N = Rectangle_getNormalTimesArea(rect);
+
+    const vec3 origin2origin = rect.offset-origin;
+#if POLYGON_METHOD==0
+    vec3 L = origin2origin+rect.edge0*xi.x+rect.edge1*xi.y; // TODO: refactor
+    
+    const float distanceSq = dot(L,L);
+    const float rcpDistance = inversesqrt(distanceSq);
+    L *= rcpDistance;
+    
+    pdf = distanceSq/abs(dot(N,L));
+    newRayMaxT = 1.0/rcpDistance;
+    return L;
+#else 
+    #ifdef TRIANGLE_REFERENCE
+        const mat3 sphericalVertices[2] = 
+        {
+            nbl_glsl_shapes_getSphericalTriangle(mat3(rect.offset,rect.offset+rect.edge0,rect.offset+rect.edge1),origin),
+            nbl_glsl_shapes_getSphericalTriangle(mat3(rect.offset+rect.edge1,rect.offset+rect.edge0,rect.offset+rect.edge0+rect.edge1),origin)
+        };
+        float solidAngle[2];
+        vec3 cos_vertices[2],sin_vertices[2];
+        float cos_a[2],cos_c[2],csc_b[2],csc_c[2];
+        for (uint i=0u; i<2u; i++)
+            solidAngle[i] = nbl_glsl_shapes_SolidAngleOfTriangle(sphericalVertices[i],cos_vertices[i],sin_vertices[i],cos_a[i],cos_c[i],csc_b[i],csc_c[i]);
+        vec3 L = vec3(0.f,0.f,0.f);
+        const float rectangleSolidAngle = solidAngle[0]+solidAngle[1];
+        if (rectangleSolidAngle > nbl_glsl_FLT_MIN)
+        {
+            float rcpTriangleChoiceProb;
+            const uint i = nbl_glsl_partitionRandVariable(solidAngle[0]/rectangleSolidAngle,xi.z,rcpTriangleChoiceProb) ? 1u:0u;
+        #if POLYGON_METHOD==1
+            L = nbl_glsl_sampling_generateSphericalTriangleSample(solidAngle[i],cos_vertices[i],sin_vertices[i],cos_a[i],cos_c[i],csc_b[i],csc_c[i],sphericalVertices[i],xi.xy);
+            pdf = 1.f/rectangleSolidAngle;
+        #elif POLYGON_METHOD==2
+            float rcpPdf;
+            L = nbl_glsl_sampling_generateProjectedSphericalTriangleSample(rcpPdf,solidAngle[i],cos_vertices[i],sin_vertices[i],cos_a[i],cos_c[i],csc_b[i],csc_c[i],sphericalVertices[i],interaction.isotropic.N,isBSDF,xi.xy);
+            pdf = 1.f/(rcpPdf*rcpTriangleChoiceProb);
+        #endif
+        }
+        else
+            pdf = nbl_glsl_FLT_INF;
+    #else
+        mat3 rectNormalBasis;
+        vec2 rectExtents;
+        Rectangle_getNormalBasis(rect, rectNormalBasis, rectExtents);
+        vec3 sphR0 = nbl_glsl_shapes_getSphericalRectangle(origin, rect.offset, rectNormalBasis);
+        vec3 L = vec3(0.f,0.f,0.f);
+        float solidAngle;
+        vec2 sphUv = nbl_glsl_sampling_generateSphericalRectangleSample(sphR0, rectExtents, xi.xy, solidAngle);
+        if (solidAngle > nbl_glsl_FLT_MIN)
+        {
+            #if POLYGON_METHOD==1
+            vec3 sph_sample = sphUv[0] * rect.edge0 + sphUv[1] * rect.edge1 + rect.offset;
+            L = normalize(sph_sample - origin);
+            pdf = 1.f/solidAngle;
+            #else
+                #error
+            #endif  
+        }
+        else
+            pdf = nbl_glsl_FLT_INF;
+    #endif
+    newRayMaxT = dot(N,origin2origin)/dot(N,L);
+    return L;
+#endif
+}
+
+
+uint getBSDFLightIDAndDetermineNormal(out vec3 normal, in uint objectID, in vec3 intersection)
+{
+    if (objectID<SPHERE_COUNT)
+    {
+        Sphere sphere = spheres[objectID];
+        normal = Sphere_getNormal(sphere,intersection);
+        return sphere.bsdfLightIDs;
+    }
+    else
+    {
+        Rectangle rect = rectangles[objectID-SPHERE_COUNT];
+        normal = normalize(Rectangle_getNormalTimesArea(rect));
+        return rect.bsdfLightIDs;
+    }
+}
\ No newline at end of file
diff --git a/31_HLSLPathTracer/app_resources/glsl/litBySphere.comp b/31_HLSLPathTracer/app_resources/glsl/litBySphere.comp
new file mode 100644
index 000000000..bd1a48575
--- /dev/null
+++ b/31_HLSLPathTracer/app_resources/glsl/litBySphere.comp
@@ -0,0 +1,60 @@
+// Copyright (C) 2018-2020 - DevSH Graphics Programming Sp. z O.O.
+// This file is part of the "Nabla Engine".
+// For conditions of distribution and use, see copyright notice in nabla.h
+
+#version 430 core
+#extension GL_GOOGLE_include_directive : require
+
+#define SPHERE_COUNT 9
+#include "common.glsl"
+
+
+void traceRay_extraShape(inout int objectID, inout float intersectionT, in vec3 origin, in vec3 direction)
+{
+}
+
+float nbl_glsl_light_deferred_pdf(in Light light, in Ray_t ray)
+{
+    const Sphere sphere = spheres[ray._mutable.objectID];
+    return 1.0/Sphere_getSolidAngle(sphere,ray._immutable.origin);
+}
+
+vec3 nbl_glsl_light_generate_and_pdf(out float pdf, out float newRayMaxT, in vec3 origin, in nbl_glsl_AnisotropicViewSurfaceInteraction interaction, in bool isBSDF, in vec3 xi, in uint objectID)
+{
+    const Sphere sphere = spheres[objectID];
+
+    vec3 Z = sphere.position-origin;
+    const float distanceSQ = dot(Z,Z);
+    const float cosThetaMax2 = 1.0-sphere.radius2/distanceSQ;
+    if (cosThetaMax2>0.0)
+    {
+        const float rcpDistance = inversesqrt(distanceSQ);
+        Z *= rcpDistance;
+    
+        const float cosThetaMax = sqrt(cosThetaMax2);
+        const float cosTheta = mix(1.0,cosThetaMax,xi.x);
+
+        vec3 L = Z*cosTheta;
+
+        const float cosTheta2 = cosTheta*cosTheta;
+        const float sinTheta = sqrt(1.0-cosTheta2);
+        float sinPhi,cosPhi;
+        nbl_glsl_sincos(2.0*nbl_glsl_PI*xi.y-nbl_glsl_PI,sinPhi,cosPhi);
+        mat2x3 XY = nbl_glsl_frisvad(Z);
+    
+        L += (XY[0]*cosPhi+XY[1]*sinPhi)*sinTheta;
+    
+        newRayMaxT = (cosTheta-sqrt(cosTheta2-cosThetaMax2))/rcpDistance;
+        pdf = 1.0/Sphere_getSolidAngle_impl(cosThetaMax);
+        return L;
+    }
+    pdf = 0.0;
+    return vec3(0.0,0.0,0.0);
+}
+
+uint getBSDFLightIDAndDetermineNormal(out vec3 normal, in uint objectID, in vec3 intersection)
+{
+    Sphere sphere = spheres[objectID];
+    normal = Sphere_getNormal(sphere,intersection);
+    return sphere.bsdfLightIDs;
+}
\ No newline at end of file
diff --git a/31_HLSLPathTracer/app_resources/glsl/litByTriangle.comp b/31_HLSLPathTracer/app_resources/glsl/litByTriangle.comp
new file mode 100644
index 000000000..ba23c82e5
--- /dev/null
+++ b/31_HLSLPathTracer/app_resources/glsl/litByTriangle.comp
@@ -0,0 +1,105 @@
+// Copyright (C) 2018-2020 - DevSH Graphics Programming Sp. z O.O.
+// This file is part of the "Nabla Engine".
+// For conditions of distribution and use, see copyright notice in nabla.h
+
+#version 430 core
+#extension GL_GOOGLE_include_directive : require
+
+#define SPHERE_COUNT 8
+#define POLYGON_METHOD 1 // 0 area sampling, 1 solid angle sampling, 2 approximate projected solid angle sampling
+#include "common.glsl"
+
+#define TRIANGLE_COUNT 1
+Triangle triangles[TRIANGLE_COUNT] = {
+    Triangle_Triangle(mat3(vec3(-1.8,0.35,0.3),vec3(-1.2,0.35,0.0),vec3(-1.5,0.8,-0.3))*10.0,INVALID_ID_16BIT,0u)
+};
+
+void traceRay_extraShape(inout int objectID, inout float intersectionT, in vec3 origin, in vec3 direction)
+{
+	for (int i=0; i<TRIANGLE_COUNT; i++)
+    {
+        float t = Triangle_intersect(triangles[i],origin,direction);
+        bool closerIntersection = t>0.0 && t<intersectionT;
+
+		objectID = closerIntersection ? (i+SPHERE_COUNT):objectID;
+        intersectionT = closerIntersection ? t:intersectionT;
+    }
+}
+
+
+#include <nbl/builtin/glsl/sampling/projected_spherical_triangle.glsl>
+float nbl_glsl_light_deferred_pdf(in Light light, in Ray_t ray)
+{
+    const Triangle tri = triangles[Light_getObjectID(light)];
+
+    const vec3 L = ray._immutable.direction;
+#if POLYGON_METHOD==0
+    const float dist = ray._mutable.intersectionT;
+    return dist*dist/abs(dot(Triangle_getNormalTimesArea(tri),L));
+#else
+    const ImmutableRay_t _immutable = ray._immutable;
+    const mat3 sphericalVertices = nbl_glsl_shapes_getSphericalTriangle(mat3(tri.vertex0,tri.vertex1,tri.vertex2),_immutable.origin);
+    #if POLYGON_METHOD==1
+        const float rcpProb = nbl_glsl_shapes_SolidAngleOfTriangle(sphericalVertices);
+        // if `rcpProb` is NAN then the triangle's solid angle was close to 0.0 
+        return rcpProb>nbl_glsl_FLT_MIN ? (1.0/rcpProb):nbl_glsl_FLT_MAX;
+    #elif POLYGON_METHOD==2
+        const float pdf = nbl_glsl_sampling_probProjectedSphericalTriangleSample(sphericalVertices,_immutable.normalAtOrigin,_immutable.wasBSDFAtOrigin,L);
+        // if `pdf` is NAN then the triangle's projected solid angle was close to 0.0, if its close to INF then the triangle was very small
+        return pdf<nbl_glsl_FLT_MAX ? pdf:0.0;
+    #endif
+#endif
+}
+
+vec3 nbl_glsl_light_generate_and_pdf(out float pdf, out float newRayMaxT, in vec3 origin, in nbl_glsl_AnisotropicViewSurfaceInteraction interaction, in bool isBSDF, in vec3 xi, in uint objectID)
+{
+    const Triangle tri = triangles[objectID];
+    
+#if POLYGON_METHOD==0
+    const mat2x3 edges = mat2x3(tri.vertex1-tri.vertex0,tri.vertex2-tri.vertex0);
+    const float sqrtU = sqrt(xi.x);
+    vec3 point = tri.vertex0+edges[0]*(1.0-sqrtU)+edges[1]*sqrtU*xi.y;
+    vec3 L = point-origin;
+    
+    const float distanceSq = dot(L,L);
+    const float rcpDistance = inversesqrt(distanceSq);
+    L *= rcpDistance;
+    
+    pdf = distanceSq/abs(dot(Triangle_getNormalTimesArea_impl(edges),L));
+    newRayMaxT = 1.0/rcpDistance;
+    return L;
+#else 
+    float rcpPdf;
+
+    const mat3 sphericalVertices = nbl_glsl_shapes_getSphericalTriangle(mat3(tri.vertex0,tri.vertex1,tri.vertex2),origin);
+#if POLYGON_METHOD==1
+    const vec3 L = nbl_glsl_sampling_generateSphericalTriangleSample(rcpPdf,sphericalVertices,xi.xy);
+#elif POLYGON_METHOD==2
+    const vec3 L = nbl_glsl_sampling_generateProjectedSphericalTriangleSample(rcpPdf,sphericalVertices,interaction.isotropic.N,isBSDF,xi.xy);
+#endif
+
+    // if `rcpProb` is NAN or negative then the triangle's solidAngle or projectedSolidAngle was close to 0.0 
+    pdf = rcpPdf>nbl_glsl_FLT_MIN ? (1.0/rcpPdf):0.0;
+
+    const vec3 N = Triangle_getNormalTimesArea(tri);
+    newRayMaxT = dot(N,tri.vertex0-origin)/dot(N,L);
+    return L;
+#endif
+}
+
+
+uint getBSDFLightIDAndDetermineNormal(out vec3 normal, in uint objectID, in vec3 intersection)
+{
+    if (objectID<SPHERE_COUNT)
+    {
+        Sphere sphere = spheres[objectID];
+        normal = Sphere_getNormal(sphere,intersection);
+        return sphere.bsdfLightIDs;
+    }
+    else
+    {
+        Triangle tri = triangles[objectID-SPHERE_COUNT];
+        normal = normalize(Triangle_getNormalTimesArea(tri));
+        return tri.bsdfLightIDs;
+    }
+}
\ No newline at end of file
diff --git a/31_HLSLPathTracer/app_resources/hlsl/present.frag.hlsl b/31_HLSLPathTracer/app_resources/hlsl/present.frag.hlsl
new file mode 100644
index 000000000..22695657c
--- /dev/null
+++ b/31_HLSLPathTracer/app_resources/hlsl/present.frag.hlsl
@@ -0,0 +1,19 @@
+// Copyright (C) 2024-2024 - DevSH Graphics Programming Sp. z O.O.
+// This file is part of the "Nabla Engine".
+// For conditions of distribution and use, see copyright notice in nabla.h
+
+#pragma wave shader_stage(fragment)
+
+// vertex shader is provided by the fullScreenTriangle extension
+#include <nbl/builtin/hlsl/ext/FullScreenTriangle/SVertexAttributes.hlsl>
+using namespace nbl::hlsl;
+using namespace ext::FullScreenTriangle;
+
+// binding 0 set 0
+[[vk::combinedImageSampler]] [[vk::binding(0, 0)]] Texture2D texture;
+[[vk::combinedImageSampler]] [[vk::binding(0, 0)]] SamplerState samplerState;
+
+[[vk::location(0)]] float32_t4 main(SVertexAttributes vxAttr) : SV_Target0
+{
+    return float32_t4(texture.Sample(samplerState, vxAttr.uv).rgb, 1.0f);
+}
\ No newline at end of file
diff --git a/31_HLSLPathTracer/config.json.template b/31_HLSLPathTracer/config.json.template
new file mode 100644
index 000000000..24adf54fb
--- /dev/null
+++ b/31_HLSLPathTracer/config.json.template
@@ -0,0 +1,28 @@
+{
+  "enableParallelBuild": true,
+  "threadsPerBuildProcess" : 2,
+  "isExecuted": false,
+  "scriptPath": "",
+  "cmake": {
+    "configurations": [ "Release", "Debug", "RelWithDebInfo" ],
+    "buildModes": [],
+    "requiredOptions": []
+  }, 
+  "profiles": [
+    {
+      "backend": "vulkan",
+      "platform": "windows",
+      "buildModes": [],
+      "runConfiguration": "Release",
+      "gpuArchitectures": []
+    }
+  ],
+  "dependencies": [],
+  "data": [
+    {
+      "dependencies": [],
+      "command": [""],
+      "outputs": []
+    }
+  ]
+}
diff --git a/31_HLSLPathTracer/include/nbl/this_example/common.hpp b/31_HLSLPathTracer/include/nbl/this_example/common.hpp
new file mode 100644
index 000000000..ff3dd8095
--- /dev/null
+++ b/31_HLSLPathTracer/include/nbl/this_example/common.hpp
@@ -0,0 +1,17 @@
+#ifndef __NBL_THIS_EXAMPLE_COMMON_H_INCLUDED__
+#define __NBL_THIS_EXAMPLE_COMMON_H_INCLUDED__
+
+#include <nabla.h>
+
+// common api
+#include "CCamera.hpp"
+#include "SimpleWindowedApplication.hpp"
+#include "nbl/application_templates/MonoAssetManagerAndBuiltinResourceApplication.hpp"
+#include "CEventCallback.hpp"
+
+// example's own headers
+#include "nbl/ui/ICursorControl.h"
+#include "nbl/ext/ImGui/ImGui.h"
+#include "imgui/imgui_internal.h"
+
+#endif // __NBL_THIS_EXAMPLE_COMMON_H_INCLUDED__
\ No newline at end of file
diff --git a/31_HLSLPathTracer/main.cpp b/31_HLSLPathTracer/main.cpp
new file mode 100644
index 000000000..73434a852
--- /dev/null
+++ b/31_HLSLPathTracer/main.cpp
@@ -0,0 +1,1276 @@
+// Copyright (C) 2018-2020 - DevSH Graphics Programming Sp. z O.O.
+// This file is part of the "Nabla Engine".
+// For conditions of distribution and use, see copyright notice in nabla.h
+
+#include "nbl/this_example/common.hpp"
+#include "nbl/asset/interchange/IImageAssetHandlerBase.h"
+#include "nbl/ext/FullScreenTriangle/FullScreenTriangle.h"
+#include "nbl/builtin/hlsl/surface_transform.h"
+
+using namespace nbl;
+using namespace core;
+using namespace hlsl;
+using namespace system;
+using namespace asset;
+using namespace ui;
+using namespace video;
+
+struct PTPushConstant {
+	matrix4SIMD invMVP;
+	int sampleCount;
+	int depth;
+};
+
+// TODO: Add a QueryPool for timestamping once its ready
+// TODO: Do buffer creation using assConv
+class ComputeShaderPathtracer final : public examples::SimpleWindowedApplication, public application_templates::MonoAssetManagerAndBuiltinResourceApplication
+{
+		using device_base_t = examples::SimpleWindowedApplication;
+		using asset_base_t = application_templates::MonoAssetManagerAndBuiltinResourceApplication;
+		using clock_t = std::chrono::steady_clock;
+
+		enum E_LIGHT_GEOMETRY : uint8_t
+		{
+			ELG_SPHERE,
+			ELG_TRIANGLE,
+			ELG_RECTANGLE,
+			ELG_COUNT
+		};
+
+		constexpr static inline uint32_t2 WindowDimensions = { 1280, 720 };
+		constexpr static inline uint32_t MaxFramesInFlight = 5;
+		constexpr static inline clock_t::duration DisplayImageDuration = std::chrono::milliseconds(900);
+		constexpr static inline uint32_t DefaultWorkGroupSize = 16u;
+		constexpr static inline uint32_t MaxDescriptorCount = 256u;
+		constexpr static inline uint32_t MaxDepthLog2 = 4u; // 5
+		constexpr static inline uint32_t MaxSamplesLog2 = 10u; // 18
+		constexpr static inline uint32_t MaxBufferDimensions = 3u << MaxDepthLog2;
+		constexpr static inline uint32_t MaxBufferSamples = 1u << MaxSamplesLog2;
+		constexpr static inline uint8_t MaxUITextureCount = 1u;
+		static inline std::string DefaultImagePathsFile = "envmap/envmap_0.exr";
+		static inline std::string OwenSamplerFilePath = "owen_sampler_buffer.bin";
+		static inline std::array<std::string, E_LIGHT_GEOMETRY::ELG_COUNT> PTShaderPaths = { "app_resources/glsl/litBySphere.comp", "app_resources/glsl/litByTriangle.comp", "app_resources/glsl/litByRectangle.comp" };
+		static inline std::string PresentShaderPath = "app_resources/hlsl/present.frag.hlsl";
+
+		const char* shaderNames[E_LIGHT_GEOMETRY::ELG_COUNT] = {
+			"ELG_SPHERE",
+			"ELG_TRIANGLE",
+			"ELG_RECTANGLE"
+		};
+
+	public:
+		inline ComputeShaderPathtracer(const path& _localInputCWD, const path& _localOutputCWD, const path& _sharedInputCWD, const path& _sharedOutputCWD)
+			: IApplicationFramework(_localInputCWD, _localOutputCWD, _sharedInputCWD, _sharedOutputCWD) {}
+
+		inline bool isComputeOnly() const override { return false; }
+
+		inline core::vector<video::SPhysicalDeviceFilter::SurfaceCompatibility> getSurfaces() const override
+		{
+			if (!m_surface)
+			{
+				{
+					auto windowCallback = core::make_smart_refctd_ptr<CEventCallback>(smart_refctd_ptr(m_inputSystem), smart_refctd_ptr(m_logger));
+					IWindow::SCreationParams params = {};
+					params.callback = core::make_smart_refctd_ptr<nbl::video::ISimpleManagedSurface::ICallback>();
+					params.width = WindowDimensions.x;
+					params.height = WindowDimensions.y;
+					params.x = 32;
+					params.y = 32;
+					params.flags = ui::IWindow::ECF_HIDDEN | IWindow::ECF_BORDERLESS | IWindow::ECF_RESIZABLE;
+					params.windowCaption = "ComputeShaderPathtracer";
+					params.callback = windowCallback;
+					const_cast<std::remove_const_t<decltype(m_window)>&>(m_window) = m_winMgr->createWindow(std::move(params));
+				}
+
+				auto surface = CSurfaceVulkanWin32::create(smart_refctd_ptr(m_api), smart_refctd_ptr_static_cast<IWindowWin32>(m_window));
+				const_cast<std::remove_const_t<decltype(m_surface)>&>(m_surface) = nbl::video::CSimpleResizeSurface<nbl::video::CDefaultSwapchainFramebuffers>::create(std::move(surface));
+			}
+
+			if (m_surface)
+				return { {m_surface->getSurface()/*,EQF_NONE*/} };
+
+			return {};
+		}
+
+		inline bool onAppInitialized(smart_refctd_ptr<ISystem>&& system) override
+		{
+			// Init systems
+			{
+				m_inputSystem = make_smart_refctd_ptr<InputSystem>(logger_opt_smart_ptr(smart_refctd_ptr(m_logger)));
+
+				// Remember to call the base class initialization!
+				if (!device_base_t::onAppInitialized(smart_refctd_ptr(system)))
+					return false;
+				if (!asset_base_t::onAppInitialized(std::move(system)))
+					return false;
+
+				m_semaphore = m_device->createSemaphore(m_realFrameIx);
+
+				if (!m_semaphore)
+					return logFail("Failed to create semaphore!");
+			}
+
+			// Create renderpass and init surface
+			nbl::video::IGPURenderpass* renderpass;
+			{
+				ISwapchain::SCreationParams swapchainParams = { .surface = smart_refctd_ptr<ISurface>(m_surface->getSurface()) };
+				if (!swapchainParams.deduceFormat(m_physicalDevice))
+					return logFail("Could not choose a Surface Format for the Swapchain!");
+
+				const static IGPURenderpass::SCreationParams::SSubpassDependency dependencies[] =
+				{
+					{
+						.srcSubpass = IGPURenderpass::SCreationParams::SSubpassDependency::External,
+						.dstSubpass = 0,
+						.memoryBarrier =
+						{
+							.srcStageMask = asset::PIPELINE_STAGE_FLAGS::COPY_BIT,
+							.srcAccessMask = asset::ACCESS_FLAGS::TRANSFER_WRITE_BIT,
+							.dstStageMask = asset::PIPELINE_STAGE_FLAGS::COLOR_ATTACHMENT_OUTPUT_BIT,
+							.dstAccessMask = asset::ACCESS_FLAGS::COLOR_ATTACHMENT_WRITE_BIT
+						}
+					},
+					{
+						.srcSubpass = 0,
+						.dstSubpass = IGPURenderpass::SCreationParams::SSubpassDependency::External,
+						.memoryBarrier =
+						{
+							.srcStageMask = asset::PIPELINE_STAGE_FLAGS::COLOR_ATTACHMENT_OUTPUT_BIT,
+							.srcAccessMask = asset::ACCESS_FLAGS::COLOR_ATTACHMENT_WRITE_BIT
+						}
+					},
+					IGPURenderpass::SCreationParams::DependenciesEnd
+				};
+
+				auto scResources = std::make_unique<CDefaultSwapchainFramebuffers>(m_device.get(), swapchainParams.surfaceFormat.format, dependencies);
+				renderpass = scResources->getRenderpass();
+
+				if (!renderpass)
+					return logFail("Failed to create Renderpass!");
+
+				auto gQueue = getGraphicsQueue();
+				if (!m_surface || !m_surface->init(gQueue, std::move(scResources), swapchainParams.sharedParams))
+					return logFail("Could not create Window & Surface or initialize the Surface!");
+			}
+
+			// image upload utils
+			{
+				m_scratchSemaphore = m_device->createSemaphore(0);
+				if (!m_scratchSemaphore)
+					return logFail("Could not create Scratch Semaphore");
+				m_scratchSemaphore->setObjectDebugName("Scratch Semaphore");
+				// we don't want to overcomplicate the example with multi-queue
+				m_intendedSubmit.queue = getGraphicsQueue();
+				// wait for nothing before upload
+				m_intendedSubmit.waitSemaphores = {};
+				m_intendedSubmit.waitSemaphores = {};
+				// fill later
+				m_intendedSubmit.scratchCommandBuffers = {};
+				m_intendedSubmit.scratchSemaphore = {
+					.semaphore = m_scratchSemaphore.get(),
+					.value = 0,
+					.stageMask = PIPELINE_STAGE_FLAGS::ALL_TRANSFER_BITS
+				};
+			}
+
+			// Create command pool and buffers
+			{
+				auto gQueue = getGraphicsQueue();
+				m_cmdPool = m_device->createCommandPool(gQueue->getFamilyIndex(), IGPUCommandPool::CREATE_FLAGS::RESET_COMMAND_BUFFER_BIT);
+				if (!m_cmdPool)
+					return logFail("Couldn't create Command Pool!");
+
+				if (!m_cmdPool->createCommandBuffers(IGPUCommandPool::BUFFER_LEVEL::PRIMARY, { m_cmdBufs.data(), MaxFramesInFlight }))
+					return logFail("Couldn't create Command Buffer!");
+			}
+
+			ISampler::SParams samplerParams = {
+				.AnisotropicFilter = 0
+			};
+			auto defaultSampler = m_device->createSampler(samplerParams);
+
+			// Create descriptors and pipeline for the pathtracer
+			{
+				auto convertDSLayoutCPU2GPU = [&](smart_refctd_ptr<ICPUDescriptorSetLayout> cpuLayout) {
+					auto converter = CAssetConverter::create({ .device = m_device.get() });
+					CAssetConverter::SInputs inputs = {};
+					inputs.readCache = converter.get();
+					inputs.logger = m_logger.get();
+					CAssetConverter::SConvertParams params = {};
+					params.utilities = m_utils.get();
+
+					std::get<CAssetConverter::SInputs::asset_span_t<ICPUDescriptorSetLayout>>(inputs.assets) = { &cpuLayout.get(),1 };
+					// don't need to assert that we don't need to provide patches since layouts are not patchable
+					//assert(true);
+					auto reservation = converter->reserve(inputs);
+					// the `.value` is just a funny way to make the `smart_refctd_ptr` copyable
+					auto gpuLayout = reservation.getGPUObjects<ICPUDescriptorSetLayout>().front().value;
+					if (!gpuLayout) {
+						m_logger->log("Failed to convert %s into an IGPUDescriptorSetLayout handle", ILogger::ELL_ERROR);
+						std::exit(-1);
+					}
+
+					return gpuLayout;
+					};
+				auto convertDSCPU2GPU = [&](smart_refctd_ptr<ICPUDescriptorSet> cpuDS) {
+					auto converter = CAssetConverter::create({ .device = m_device.get() });
+					CAssetConverter::SInputs inputs = {};
+					inputs.readCache = converter.get();
+					inputs.logger = m_logger.get();
+					CAssetConverter::SConvertParams params = {};
+					params.utilities = m_utils.get();
+
+					std::get<CAssetConverter::SInputs::asset_span_t<ICPUDescriptorSet>>(inputs.assets) = { &cpuDS.get(), 1 };
+					// don't need to assert that we don't need to provide patches since layouts are not patchable
+					//assert(true);
+					auto reservation = converter->reserve(inputs);
+					// the `.value` is just a funny way to make the `smart_refctd_ptr` copyable
+					auto gpuDS = reservation.getGPUObjects<ICPUDescriptorSet>().front().value;
+					if (!gpuDS) {
+						m_logger->log("Failed to convert %s into an IGPUDescriptorSet handle", ILogger::ELL_ERROR);
+						std::exit(-1);
+					}
+
+					return gpuDS;
+					};
+
+				std::array<ICPUDescriptorSetLayout::SBinding, 1> descriptorSet0Bindings = {};
+				std::array<ICPUDescriptorSetLayout::SBinding, 3> descriptorSet3Bindings = {};
+				std::array<IGPUDescriptorSetLayout::SBinding, 1> presentDescriptorSetBindings;
+
+				descriptorSet0Bindings[0] = {
+					.binding = 0u,
+					.type = nbl::asset::IDescriptor::E_TYPE::ET_STORAGE_IMAGE,
+					.createFlags = ICPUDescriptorSetLayout::SBinding::E_CREATE_FLAGS::ECF_NONE,
+					.stageFlags = IShader::E_SHADER_STAGE::ESS_COMPUTE,
+					.count = 1u,
+					.immutableSamplers = nullptr
+				};
+				descriptorSet3Bindings[0] = {
+					.binding = 0u,
+					.type = nbl::asset::IDescriptor::E_TYPE::ET_COMBINED_IMAGE_SAMPLER,
+					.createFlags = ICPUDescriptorSetLayout::SBinding::E_CREATE_FLAGS::ECF_NONE,
+					.stageFlags = IShader::E_SHADER_STAGE::ESS_COMPUTE,
+					.count = 1u,
+					.immutableSamplers = nullptr
+				};
+				descriptorSet3Bindings[1] = {
+					.binding = 1u,
+					.type = nbl::asset::IDescriptor::E_TYPE::ET_UNIFORM_TEXEL_BUFFER,
+					.createFlags = ICPUDescriptorSetLayout::SBinding::E_CREATE_FLAGS::ECF_NONE,
+					.stageFlags = IShader::E_SHADER_STAGE::ESS_COMPUTE,
+					.count = 1u,
+					.immutableSamplers = nullptr
+				};
+				descriptorSet3Bindings[2] = {
+					.binding = 2u,
+					.type = nbl::asset::IDescriptor::E_TYPE::ET_COMBINED_IMAGE_SAMPLER,
+					.createFlags = ICPUDescriptorSetLayout::SBinding::E_CREATE_FLAGS::ECF_NONE,
+					.stageFlags = IShader::E_SHADER_STAGE::ESS_COMPUTE,
+					.count = 1u,
+					.immutableSamplers = nullptr
+				};
+				presentDescriptorSetBindings[0] = {
+					.binding = 0u,
+					.type = nbl::asset::IDescriptor::E_TYPE::ET_COMBINED_IMAGE_SAMPLER,
+					.createFlags = ICPUDescriptorSetLayout::SBinding::E_CREATE_FLAGS::ECF_NONE,
+					.stageFlags = IShader::E_SHADER_STAGE::ESS_FRAGMENT,
+					.count = 1u,
+					.immutableSamplers = &defaultSampler
+				};
+
+				auto cpuDescriptorSetLayout0 = make_smart_refctd_ptr<ICPUDescriptorSetLayout>(descriptorSet0Bindings);
+				auto cpuDescriptorSetLayout2 = make_smart_refctd_ptr<ICPUDescriptorSetLayout>(descriptorSet3Bindings);
+
+				auto gpuDescriptorSetLayout0 = convertDSLayoutCPU2GPU(cpuDescriptorSetLayout0);
+				auto gpuDescriptorSetLayout2 = convertDSLayoutCPU2GPU(cpuDescriptorSetLayout2);
+				auto gpuPresentDescriptorSetLayout = m_device->createDescriptorSetLayout(presentDescriptorSetBindings);
+
+				auto cpuDescriptorSet0 = make_smart_refctd_ptr<ICPUDescriptorSet>(std::move(cpuDescriptorSetLayout0));
+				auto cpuDescriptorSet2 = make_smart_refctd_ptr<ICPUDescriptorSet>(std::move(cpuDescriptorSetLayout2));
+
+				m_descriptorSet0 = convertDSCPU2GPU(cpuDescriptorSet0);
+				m_descriptorSet2 = convertDSCPU2GPU(cpuDescriptorSet2);
+
+				smart_refctd_ptr<IDescriptorPool> presentDSPool;
+				{
+					const video::IGPUDescriptorSetLayout* const layouts[] = { gpuPresentDescriptorSetLayout.get() };
+					const uint32_t setCounts[] = { 1u };
+					presentDSPool = m_device->createDescriptorPoolForDSLayouts(IDescriptorPool::E_CREATE_FLAGS::ECF_NONE, layouts, setCounts);
+				}
+				m_presentDescriptorSet = presentDSPool->createDescriptorSet(gpuPresentDescriptorSetLayout);
+
+				// Create Shaders
+				auto loadAndCompileShader = [&](std::string pathToShader)
+				{
+					IAssetLoader::SAssetLoadParams lp = {};
+					lp.workingDirectory = localInputCWD;
+					auto assetBundle = m_assetMgr->getAsset(pathToShader, lp);
+					const auto assets = assetBundle.getContents();
+					if (assets.empty())
+					{
+						m_logger->log("Could not load shader: ", ILogger::ELL_ERROR, pathToShader);
+						std::exit(-1);
+					}
+
+					auto source = IAsset::castDown<ICPUShader>(assets[0]);
+					// The down-cast should not fail!
+					assert(source);
+
+					// this time we skip the use of the asset converter since the ICPUShader->IGPUShader path is quick and simple
+					auto shader = m_device->createShader(source.get());
+					if (!shader)
+					{
+						m_logger->log("Shader creationed failed: %s!", ILogger::ELL_ERROR, pathToShader);
+						std::exit(-1);
+					}
+
+					return shader;
+				};
+
+				// Create compute pipelines
+				{
+					for (int index = 0; index < E_LIGHT_GEOMETRY::ELG_COUNT; index++) {
+						auto ptShader = loadAndCompileShader(PTShaderPaths[index]);
+						const nbl::asset::SPushConstantRange pcRange = {
+							.stageFlags = IShader::E_SHADER_STAGE::ESS_COMPUTE,
+							.offset = 0,
+							.size = sizeof(PTPushConstant)
+						};
+						auto ptPipelineLayout = m_device->createPipelineLayout(
+							{ &pcRange, 1 },
+							core::smart_refctd_ptr(gpuDescriptorSetLayout0),
+							nullptr,
+							core::smart_refctd_ptr(gpuDescriptorSetLayout2),
+							nullptr
+						);
+						if (!ptPipelineLayout) {
+							return logFail("Failed to create Pathtracing pipeline layout");
+						}
+
+						IGPUComputePipeline::SCreationParams params = {};
+						params.layout = ptPipelineLayout.get();
+						params.shader.shader = ptShader.get();
+						params.shader.entryPoint = "main";
+						params.shader.entries = nullptr;
+						params.shader.requireFullSubgroups = true;
+						params.shader.requiredSubgroupSize = static_cast<IGPUShader::SSpecInfo::SUBGROUP_SIZE>(5);
+						if (!m_device->createComputePipelines(nullptr, { &params, 1 }, m_PTPipelines.data() + index)) {
+							return logFail("Failed to create compute pipeline!\n");
+						}
+					}
+				}
+
+				// Create graphics pipeline
+				{
+					auto scRes = static_cast<CDefaultSwapchainFramebuffers*>(m_surface->getSwapchainResources());
+					ext::FullScreenTriangle::ProtoPipeline fsTriProtoPPln(m_assetMgr.get(), m_device.get(), m_logger.get());
+					if (!fsTriProtoPPln)
+						return logFail("Failed to create Full Screen Triangle protopipeline or load its vertex shader!");
+
+					// Load Fragment Shader
+					auto fragmentShader = loadAndCompileShader(PresentShaderPath);
+					if (!fragmentShader)
+						return logFail("Failed to Load and Compile Fragment Shader: lumaMeterShader!");
+
+					const IGPUShader::SSpecInfo fragSpec = {
+						.entryPoint = "main",
+						.shader = fragmentShader.get()
+					};
+
+					auto presentLayout = m_device->createPipelineLayout(
+						{},
+						core::smart_refctd_ptr(gpuPresentDescriptorSetLayout),
+						nullptr,
+						nullptr,
+						nullptr
+					);
+					m_presentPipeline = fsTriProtoPPln.createPipeline(fragSpec, presentLayout.get(), scRes->getRenderpass());
+					if (!m_presentPipeline)
+						return logFail("Could not create Graphics Pipeline!");
+
+				}
+			}
+
+			// load CPUImages and convert to GPUImages
+			smart_refctd_ptr<IGPUImage> envMap, scrambleMap;
+			{
+				auto convertImgCPU2GPU = [&](std::span<ICPUImage *> cpuImgs) {
+					auto queue = getGraphicsQueue();
+					auto cmdbuf = m_cmdBufs[0].get();
+					cmdbuf->reset(IGPUCommandBuffer::RESET_FLAGS::NONE);
+					std::array<IQueue::SSubmitInfo::SCommandBufferInfo, 1> commandBufferInfo = { cmdbuf };
+					core::smart_refctd_ptr<ISemaphore> imgFillSemaphore = m_device->createSemaphore(0);
+					imgFillSemaphore->setObjectDebugName("Image Fill Semaphore");
+
+					auto converter = CAssetConverter::create({ .device = m_device.get() });
+					// We don't want to generate mip-maps for these images, to ensure that we must override the default callbacks.
+					struct SInputs final : CAssetConverter::SInputs
+					{
+						// we also need to override this to have concurrent sharing
+						inline std::span<const uint32_t> getSharedOwnershipQueueFamilies(const size_t groupCopyID, const asset::ICPUImage* buffer, const CAssetConverter::patch_t<asset::ICPUImage>& patch) const override
+						{
+							if (familyIndices.size() > 1)
+								return familyIndices;
+							return {};
+						}
+
+						inline uint8_t getMipLevelCount(const size_t groupCopyID, const ICPUImage* image, const CAssetConverter::patch_t<asset::ICPUImage>& patch) const override
+						{
+							return image->getCreationParameters().mipLevels;
+						}
+						inline uint16_t needToRecomputeMips(const size_t groupCopyID, const ICPUImage* image, const CAssetConverter::patch_t<asset::ICPUImage>& patch) const override
+						{
+							return 0b0u;
+						}
+
+						std::vector<uint32_t> familyIndices;
+					} inputs = {};
+					inputs.readCache = converter.get();
+					inputs.logger = m_logger.get();
+					{
+						const core::set<uint32_t> uniqueFamilyIndices = { queue->getFamilyIndex(), queue->getFamilyIndex() };
+						inputs.familyIndices = { uniqueFamilyIndices.begin(),uniqueFamilyIndices.end() };
+					}
+					// scratch command buffers for asset converter transfer commands
+					SIntendedSubmitInfo transfer = {
+						.queue = queue,
+						.waitSemaphores = {},
+						.prevCommandBuffers = {},
+						.scratchCommandBuffers = commandBufferInfo,
+						.scratchSemaphore = {
+							.semaphore = imgFillSemaphore.get(),
+							.value = 0,
+							// because of layout transitions
+							.stageMask = PIPELINE_STAGE_FLAGS::ALL_COMMANDS_BITS
+						}
+					};
+					// as per the `SIntendedSubmitInfo` one commandbuffer must be begun
+					cmdbuf->begin(IGPUCommandBuffer::USAGE::ONE_TIME_SUBMIT_BIT);
+					// Normally we'd have to inherit and override the `getFinalOwnerQueueFamily` callback to ensure that the
+					// compute queue becomes the owner of the buffers and images post-transfer, but in this example we use concurrent sharing
+					CAssetConverter::SConvertParams params = {};
+					params.transfer = &transfer;
+					params.utilities = m_utils.get();
+
+					std::get<CAssetConverter::SInputs::asset_span_t<ICPUImage>>(inputs.assets) = cpuImgs;
+					// assert that we don't need to provide patches
+					assert(cpuImgs[0]->getImageUsageFlags().hasFlags(ICPUImage::E_USAGE_FLAGS::EUF_SAMPLED_BIT));
+					auto reservation = converter->reserve(inputs);
+					// the `.value` is just a funny way to make the `smart_refctd_ptr` copyable
+					auto gpuImgs = reservation.getGPUObjects<ICPUImage>();
+					for (auto& gpuImg : gpuImgs) {
+						if (!gpuImg) {
+							m_logger->log("Failed to convert %s into an IGPUImage handle", ILogger::ELL_ERROR, DefaultImagePathsFile);
+							std::exit(-1);
+						}
+					}
+
+					// and launch the conversions
+					m_api->startCapture();
+					auto result = reservation.convert(params);
+					m_api->endCapture();
+					if (!result.blocking() && result.copy() != IQueue::RESULT::SUCCESS) {
+						m_logger->log("Failed to record or submit conversions", ILogger::ELL_ERROR);
+						std::exit(-1);
+					}
+
+					envMap = gpuImgs[0].value;
+					scrambleMap = gpuImgs[1].value;
+				};
+
+				smart_refctd_ptr<ICPUImage> envMapCPU, scrambleMapCPU;
+				{
+					IAssetLoader::SAssetLoadParams lp;
+					lp.workingDirectory = this->sharedInputCWD;
+					SAssetBundle bundle = m_assetMgr->getAsset(DefaultImagePathsFile, lp);
+					if (bundle.getContents().empty()) {
+						m_logger->log("Couldn't load an asset.", ILogger::ELL_ERROR);
+						std::exit(-1);
+					}
+
+					envMapCPU = IAsset::castDown<ICPUImage>(bundle.getContents()[0]);
+					if (!envMapCPU) {
+						m_logger->log("Couldn't load an asset.", ILogger::ELL_ERROR);
+						std::exit(-1);
+					}
+				};
+				{
+					asset::ICPUImage::SCreationParams info;
+					info.format = asset::E_FORMAT::EF_R32G32_UINT;
+					info.type = asset::ICPUImage::ET_2D;
+					auto extent = envMapCPU->getCreationParameters().extent;
+					info.extent.width = extent.width;
+					info.extent.height = extent.height;
+					info.extent.depth = 1u;
+					info.mipLevels = 1u;
+					info.arrayLayers = 1u;
+					info.samples = asset::ICPUImage::E_SAMPLE_COUNT_FLAGS::ESCF_1_BIT;
+					info.flags = static_cast<asset::IImage::E_CREATE_FLAGS>(0u);
+					info.usage = asset::IImage::EUF_TRANSFER_SRC_BIT | asset::IImage::EUF_SAMPLED_BIT;
+
+					scrambleMapCPU = ICPUImage::create(std::move(info));
+					const uint32_t texelFormatByteSize = getTexelOrBlockBytesize(scrambleMapCPU->getCreationParameters().format);
+					const uint32_t texelBufferSize = scrambleMapCPU->getImageDataSizeInBytes();
+					auto texelBuffer = ICPUBuffer::create({ texelBufferSize });
+
+					core::RandomSampler rng(0xbadc0ffeu);
+					auto out = reinterpret_cast<uint32_t *>(texelBuffer->getPointer());
+					for (auto index = 0u; index < texelBufferSize / 4; index++) {
+						out[index] = rng.nextSample();
+					}
+
+					auto regions = core::make_refctd_dynamic_array<core::smart_refctd_dynamic_array<ICPUImage::SBufferCopy>>(1u);
+					ICPUImage::SBufferCopy& region = regions->front();
+					region.imageSubresource.aspectMask = IImage::E_ASPECT_FLAGS::EAF_COLOR_BIT;
+					region.imageSubresource.mipLevel = 0u;
+					region.imageSubresource.baseArrayLayer = 0u;
+					region.imageSubresource.layerCount = 1u;
+					region.bufferOffset = 0u;
+					region.bufferRowLength = IImageAssetHandlerBase::calcPitchInBlocks(extent.width, texelFormatByteSize);
+					region.bufferImageHeight = 0u;
+					region.imageOffset = { 0u, 0u, 0u };
+					region.imageExtent = scrambleMapCPU->getCreationParameters().extent;
+
+					scrambleMapCPU->setBufferAndRegions(std::move(texelBuffer), regions);
+				}
+
+				std::array<ICPUImage*, 2> cpuImgs = { envMapCPU.get(), scrambleMapCPU.get()};
+				convertImgCPU2GPU(cpuImgs);
+			}
+
+			// create views for textures
+			{
+				auto createHDRIImage = [this](const asset::E_FORMAT colorFormat, const uint32_t width, const uint32_t height) -> smart_refctd_ptr<IGPUImage> {
+					IGPUImage::SCreationParams imgInfo;
+					imgInfo.format = colorFormat;
+					imgInfo.type = IGPUImage::ET_2D;
+					imgInfo.extent.width = width;
+					imgInfo.extent.height = height;
+					imgInfo.extent.depth = 1u;
+					imgInfo.mipLevels = 1u;
+					imgInfo.arrayLayers = 1u;
+					imgInfo.samples = IGPUImage::ESCF_1_BIT;
+					imgInfo.flags = static_cast<asset::IImage::E_CREATE_FLAGS>(0u);
+					imgInfo.usage = asset::IImage::EUF_STORAGE_BIT | asset::IImage::EUF_TRANSFER_DST_BIT | asset::IImage::EUF_SAMPLED_BIT;
+
+					auto image = m_device->createImage(std::move(imgInfo));
+					auto imageMemReqs = image->getMemoryReqs();
+					imageMemReqs.memoryTypeBits &= m_device->getPhysicalDevice()->getDeviceLocalMemoryTypeBits();
+					m_device->allocate(imageMemReqs, image.get());
+
+					return image;
+				};
+				auto createHDRIImageView = [this](smart_refctd_ptr<IGPUImage> img) -> smart_refctd_ptr<IGPUImageView>
+				{
+					auto format = img->getCreationParameters().format;
+					IGPUImageView::SCreationParams imgViewInfo;
+					imgViewInfo.image = std::move(img);
+					imgViewInfo.format = format;
+					imgViewInfo.viewType = IGPUImageView::ET_2D;
+					imgViewInfo.flags = static_cast<IGPUImageView::E_CREATE_FLAGS>(0u);
+					imgViewInfo.subresourceRange.aspectMask = IImage::E_ASPECT_FLAGS::EAF_COLOR_BIT;
+					imgViewInfo.subresourceRange.baseArrayLayer = 0u;
+					imgViewInfo.subresourceRange.baseMipLevel = 0u;
+					imgViewInfo.subresourceRange.layerCount = 1u;
+					imgViewInfo.subresourceRange.levelCount = 1u;
+
+					return m_device->createImageView(std::move(imgViewInfo));
+				};
+
+				auto params = envMap->getCreationParameters();
+				auto extent = params.extent;
+				envMap->setObjectDebugName("Env Map");
+				m_envMapView = createHDRIImageView(envMap);
+				m_envMapView->setObjectDebugName("Env Map View"); 
+				scrambleMap->setObjectDebugName("Scramble Map");
+				m_scrambleView = createHDRIImageView(scrambleMap);
+				m_scrambleView->setObjectDebugName("Scramble Map View");
+				auto outImg = createHDRIImage(asset::E_FORMAT::EF_R16G16B16A16_SFLOAT, WindowDimensions.x, WindowDimensions.y);
+				outImg->setObjectDebugName("Output Image");
+				m_outImgView = createHDRIImageView(outImg);
+				m_outImgView->setObjectDebugName("Output Image View");
+			}
+
+			// create sequence buffer view
+			{
+				// TODO: do this better use asset manager to get the ICPUBuffer from `.bin`
+				auto createBufferFromCacheFile = [this](
+					system::path filename,
+					size_t bufferSize,
+					void *data,
+					smart_refctd_ptr<ICPUBuffer>& buffer
+				) -> std::pair<smart_refctd_ptr<IFile>, bool>
+				{
+					ISystem::future_t<smart_refctd_ptr<nbl::system::IFile>> owenSamplerFileFuture;
+					ISystem::future_t<size_t> owenSamplerFileReadFuture;
+					size_t owenSamplerFileBytesRead;
+
+					m_system->createFile(owenSamplerFileFuture, localOutputCWD / filename, IFile::ECF_READ);
+					smart_refctd_ptr<IFile> owenSamplerFile;
+
+					if (owenSamplerFileFuture.wait())
+					{
+						owenSamplerFileFuture.acquire().move_into(owenSamplerFile);
+						if (!owenSamplerFile)
+							return { nullptr, false };
+
+						owenSamplerFile->read(owenSamplerFileReadFuture, data, 0, bufferSize);
+						if (owenSamplerFileReadFuture.wait())
+						{
+							owenSamplerFileReadFuture.acquire().move_into(owenSamplerFileBytesRead);
+
+							if (owenSamplerFileBytesRead < bufferSize)
+							{
+								buffer = asset::ICPUBuffer::create({ sizeof(uint32_t) * bufferSize });
+								return { owenSamplerFile, false };
+							}
+
+							buffer = asset::ICPUBuffer::create({ { sizeof(uint32_t) * bufferSize }, data });
+						}
+					}
+
+					return { owenSamplerFile, true };
+				};
+				auto writeBufferIntoCacheFile = [this](smart_refctd_ptr<IFile> file, size_t bufferSize, void* data)
+				{
+					ISystem::future_t<size_t> owenSamplerFileWriteFuture;
+					size_t owenSamplerFileBytesWritten;
+
+					file->write(owenSamplerFileWriteFuture, data, 0, bufferSize);
+					if (owenSamplerFileWriteFuture.wait())
+						owenSamplerFileWriteFuture.acquire().move_into(owenSamplerFileBytesWritten);
+				};
+
+				constexpr size_t bufferSize = MaxBufferDimensions * MaxBufferSamples;
+				std::array<uint32_t, bufferSize> data = {};
+				smart_refctd_ptr<ICPUBuffer> sampleSeq;
+
+				auto cacheBufferResult = createBufferFromCacheFile(sharedOutputCWD/OwenSamplerFilePath, bufferSize, data.data(), sampleSeq);
+				if (!cacheBufferResult.second)
+				{
+					core::OwenSampler sampler(MaxBufferDimensions, 0xdeadbeefu);
+
+					ICPUBuffer::SCreationParams params = {};
+					params.size = MaxBufferDimensions*MaxBufferSamples*sizeof(uint32_t);
+					sampleSeq = ICPUBuffer::create(std::move(params));
+
+					auto out = reinterpret_cast<uint32_t*>(sampleSeq->getPointer());
+					for (auto dim = 0u; dim < MaxBufferDimensions; dim++)
+						for (uint32_t i = 0; i < MaxBufferSamples; i++)
+						{
+							out[i * MaxBufferDimensions + dim] = sampler.sample(dim, i);
+						}
+					if (cacheBufferResult.first)
+						writeBufferIntoCacheFile(cacheBufferResult.first, bufferSize, out);
+				}
+
+				IGPUBuffer::SCreationParams params = {};
+				params.usage = asset::IBuffer::EUF_TRANSFER_DST_BIT | asset::IBuffer::EUF_UNIFORM_TEXEL_BUFFER_BIT;
+				params.size = sampleSeq->getSize();
+
+				// we don't want to overcomplicate the example with multi-queue
+				auto queue = getGraphicsQueue();
+				auto cmdbuf = m_cmdBufs[0].get();
+				cmdbuf->reset(IGPUCommandBuffer::RESET_FLAGS::NONE);
+				IQueue::SSubmitInfo::SCommandBufferInfo cmdbufInfo = { cmdbuf };
+				m_intendedSubmit.scratchCommandBuffers = { &cmdbufInfo, 1 };
+
+				cmdbuf->begin(IGPUCommandBuffer::USAGE::ONE_TIME_SUBMIT_BIT);
+				m_api->startCapture();
+				auto bufferFuture = m_utils->createFilledDeviceLocalBufferOnDedMem(
+					m_intendedSubmit,
+					std::move(params),
+					sampleSeq->getPointer()
+				);
+				m_api->endCapture();
+				bufferFuture.wait();
+				auto buffer = bufferFuture.get();
+
+				m_sequenceBufferView = m_device->createBufferView({ 0u, buffer->get()->getSize(), *buffer }, asset::E_FORMAT::EF_R32G32B32_UINT);
+				m_sequenceBufferView->setObjectDebugName("Sequence Buffer");
+			}
+
+			// Update Descriptors
+			{
+				ISampler::SParams samplerParams0 = {
+					ISampler::E_TEXTURE_CLAMP::ETC_CLAMP_TO_EDGE,
+					ISampler::E_TEXTURE_CLAMP::ETC_CLAMP_TO_EDGE,
+					ISampler::E_TEXTURE_CLAMP::ETC_CLAMP_TO_EDGE,
+					ISampler::ETBC_FLOAT_OPAQUE_BLACK,
+					ISampler::ETF_LINEAR,
+					ISampler::ETF_LINEAR,
+					ISampler::ESMM_LINEAR,
+					0u,
+					false,
+					ECO_ALWAYS
+				};
+				auto sampler0 = m_device->createSampler(samplerParams0);
+				ISampler::SParams samplerParams1 = {
+					ISampler::E_TEXTURE_CLAMP::ETC_CLAMP_TO_EDGE,
+					ISampler::E_TEXTURE_CLAMP::ETC_CLAMP_TO_EDGE,
+					ISampler::E_TEXTURE_CLAMP::ETC_CLAMP_TO_EDGE,
+					ISampler::ETBC_INT_OPAQUE_BLACK,
+					ISampler::ETF_NEAREST,
+					ISampler::ETF_NEAREST,
+					ISampler::ESMM_NEAREST,
+					0u,
+					false,
+					ECO_ALWAYS
+				};
+				auto sampler1 = m_device->createSampler(samplerParams1);
+
+				std::array<IGPUDescriptorSet::SDescriptorInfo, 5> writeDSInfos = {};
+				writeDSInfos[0].desc = m_outImgView;
+				writeDSInfos[0].info.image.imageLayout = IImage::LAYOUT::GENERAL;
+				writeDSInfos[1].desc = m_envMapView;
+				// ISampler::SParams samplerParams = { ISampler::ETC_CLAMP_TO_EDGE, ISampler::ETC_CLAMP_TO_EDGE, ISampler::ETC_CLAMP_TO_EDGE, ISampler::ETBC_FLOAT_OPAQUE_BLACK, ISampler::ETF_LINEAR, ISampler::ETF_LINEAR, ISampler::ESMM_LINEAR, 0u, false, ECO_ALWAYS };
+				writeDSInfos[1].info.combinedImageSampler.sampler = sampler0;
+				writeDSInfos[1].info.combinedImageSampler.imageLayout = asset::IImage::LAYOUT::READ_ONLY_OPTIMAL;
+				writeDSInfos[2].desc = m_sequenceBufferView;
+				writeDSInfos[3].desc = m_scrambleView;
+				// ISampler::SParams samplerParams = { ISampler::ETC_CLAMP_TO_EDGE, ISampler::ETC_CLAMP_TO_EDGE, ISampler::ETC_CLAMP_TO_EDGE, ISampler::ETBC_INT_OPAQUE_BLACK, ISampler::ETF_NEAREST, ISampler::ETF_NEAREST, ISampler::ESMM_NEAREST, 0u, false, ECO_ALWAYS };
+				writeDSInfos[3].info.combinedImageSampler.sampler = sampler1;
+				writeDSInfos[3].info.combinedImageSampler.imageLayout = asset::IImage::LAYOUT::READ_ONLY_OPTIMAL;
+				writeDSInfos[4].desc = m_outImgView;
+				writeDSInfos[4].info.image.imageLayout = IImage::LAYOUT::READ_ONLY_OPTIMAL;
+
+				std::array<IGPUDescriptorSet::SWriteDescriptorSet, 5> writeDescriptorSets = {};
+				writeDescriptorSets[0] = {
+					.dstSet = m_descriptorSet0.get(),
+					.binding = 0,
+					.arrayElement = 0u,
+					.count = 1u,
+					.info = &writeDSInfos[0]
+				};
+				writeDescriptorSets[1] = {
+					.dstSet = m_descriptorSet2.get(),
+					.binding = 0,
+					.arrayElement = 0u,
+					.count = 1u,
+					.info = &writeDSInfos[1]
+				};
+				writeDescriptorSets[2] = {
+					.dstSet = m_descriptorSet2.get(),
+					.binding = 1,
+					.arrayElement = 0u,
+					.count = 1u,
+					.info = &writeDSInfos[2]
+				};
+				writeDescriptorSets[3] = {
+					.dstSet = m_descriptorSet2.get(),
+					.binding = 2,
+					.arrayElement = 0u,
+					.count = 1u,
+					.info = &writeDSInfos[3]
+				};
+				writeDescriptorSets[4] = {
+					.dstSet = m_presentDescriptorSet.get(),
+					.binding = 0,
+					.arrayElement = 0u,
+					.count = 1u,
+					.info = &writeDSInfos[4]
+				};
+
+				m_device->updateDescriptorSets(writeDescriptorSets, {});
+			}
+
+			// Create ui descriptors
+			{
+				using binding_flags_t = IGPUDescriptorSetLayout::SBinding::E_CREATE_FLAGS;
+				{
+					IGPUSampler::SParams params;
+					params.AnisotropicFilter = 1u;
+					params.TextureWrapU = ISampler::E_TEXTURE_CLAMP::ETC_REPEAT;
+					params.TextureWrapV = ISampler::E_TEXTURE_CLAMP::ETC_REPEAT;
+					params.TextureWrapW = ISampler::E_TEXTURE_CLAMP::ETC_REPEAT;
+
+					m_ui.samplers.gui = m_device->createSampler(params);
+					m_ui.samplers.gui->setObjectDebugName("Nabla IMGUI UI Sampler");
+				}
+
+				std::array<core::smart_refctd_ptr<IGPUSampler>, 69u> immutableSamplers;
+				for (auto& it : immutableSamplers)
+					it = smart_refctd_ptr(m_ui.samplers.scene);
+
+				immutableSamplers[nbl::ext::imgui::UI::FontAtlasTexId] = smart_refctd_ptr(m_ui.samplers.gui);
+
+				nbl::ext::imgui::UI::SCreationParameters params;
+
+				params.resources.texturesInfo = { .setIx = 0u, .bindingIx = 0u };
+				params.resources.samplersInfo = { .setIx = 0u, .bindingIx = 1u };
+				params.assetManager = m_assetMgr;
+				params.pipelineCache = nullptr;
+				params.pipelineLayout = nbl::ext::imgui::UI::createDefaultPipelineLayout(m_utils->getLogicalDevice(), params.resources.texturesInfo, params.resources.samplersInfo, MaxUITextureCount);
+				params.renderpass = smart_refctd_ptr<IGPURenderpass>(renderpass);
+				params.streamingBuffer = nullptr;
+				params.subpassIx = 0u;
+				params.transfer = getTransferUpQueue();
+				params.utilities = m_utils;
+				{
+					m_ui.manager = ext::imgui::UI::create(std::move(params));
+
+					// note that we use default layout provided by our extension, but you are free to create your own by filling nbl::ext::imgui::UI::S_CREATION_PARAMETERS::resources
+					const auto* descriptorSetLayout = m_ui.manager->getPipeline()->getLayout()->getDescriptorSetLayout(0u);
+					const auto& params = m_ui.manager->getCreationParameters();
+
+					IDescriptorPool::SCreateInfo descriptorPoolInfo = {};
+					descriptorPoolInfo.maxDescriptorCount[static_cast<uint32_t>(asset::IDescriptor::E_TYPE::ET_SAMPLER)] = (uint32_t)nbl::ext::imgui::UI::DefaultSamplerIx::COUNT;
+					descriptorPoolInfo.maxDescriptorCount[static_cast<uint32_t>(asset::IDescriptor::E_TYPE::ET_SAMPLED_IMAGE)] = MaxUITextureCount;
+					descriptorPoolInfo.maxSets = 1u;
+					descriptorPoolInfo.flags = IDescriptorPool::E_CREATE_FLAGS::ECF_UPDATE_AFTER_BIND_BIT;
+
+					m_guiDescriptorSetPool = m_device->createDescriptorPool(std::move(descriptorPoolInfo));
+					assert(m_guiDescriptorSetPool);
+
+					m_guiDescriptorSetPool->createDescriptorSets(1u, &descriptorSetLayout, &m_ui.descriptorSet);
+					assert(m_ui.descriptorSet);
+				}
+			}
+			m_ui.manager->registerListener(
+				[this]() -> void {
+					ImGuiIO& io = ImGui::GetIO();
+
+					m_camera.setProjectionMatrix([&]()
+					{
+						static matrix4SIMD projection;
+
+						projection = matrix4SIMD::buildProjectionMatrixPerspectiveFovRH(core::radians(fov), io.DisplaySize.x / io.DisplaySize.y, zNear, zFar);
+
+						return projection;
+					}());
+
+					ImGui::SetNextWindowPos(ImVec2(1024, 100), ImGuiCond_Appearing);
+					ImGui::SetNextWindowSize(ImVec2(256, 256), ImGuiCond_Appearing);
+
+					// create a window and insert the inspector
+					ImGui::SetNextWindowPos(ImVec2(10, 10), ImGuiCond_Appearing);
+					ImGui::SetNextWindowSize(ImVec2(320, 340), ImGuiCond_Appearing);
+					ImGui::Begin("Controls");
+
+					ImGui::SameLine();
+
+					ImGui::Text("Camera");
+
+					ImGui::SliderFloat("Move speed", &moveSpeed, 0.1f, 10.f);
+					ImGui::SliderFloat("Rotate speed", &rotateSpeed, 0.1f, 10.f);
+					ImGui::SliderFloat("Fov", &fov, 20.f, 150.f);
+					ImGui::SliderFloat("zNear", &zNear, 0.1f, 100.f);
+					ImGui::SliderFloat("zFar", &zFar, 110.f, 10000.f);
+					ImGui::ListBox("Shader", &PTPipline, shaderNames, E_LIGHT_GEOMETRY::ELG_COUNT);
+					ImGui::SliderInt("SPP", &spp, 1, MaxBufferSamples);
+					ImGui::SliderInt("Depth", &depth, 1, MaxBufferDimensions / 3);
+
+					ImGui::Text("X: %f Y: %f", io.MousePos.x, io.MousePos.y);
+
+					ImGui::End();
+				}
+			);
+
+			// Set Camera
+			{
+				core::vectorSIMDf cameraPosition(0, 5, -10);
+				matrix4SIMD proj = matrix4SIMD::buildProjectionMatrixPerspectiveFovRH(
+					core::radians(60.0f),
+					WindowDimensions.x / WindowDimensions.y,
+					0.01f,
+					500.0f
+				);
+				m_camera = Camera(cameraPosition, core::vectorSIMDf(0, 0, 0), proj);
+			}
+
+			m_winMgr->setWindowSize(m_window.get(), WindowDimensions.x, WindowDimensions.y);
+			m_surface->recreateSwapchain();
+			m_winMgr->show(m_window.get());
+			m_oracle.reportBeginFrameRecord();
+			m_camera.mapKeysToWASD();
+
+			return true;
+		}
+
+		bool updateGUIDescriptorSet()
+		{
+			// texture atlas, note we don't create info & write pair for the font sampler because UI extension's is immutable and baked into DS layout
+			static std::array<IGPUDescriptorSet::SDescriptorInfo, MaxUITextureCount> descriptorInfo;
+			static IGPUDescriptorSet::SWriteDescriptorSet writes[MaxUITextureCount];
+
+			descriptorInfo[nbl::ext::imgui::UI::FontAtlasTexId].info.image.imageLayout = IImage::LAYOUT::READ_ONLY_OPTIMAL;
+			descriptorInfo[nbl::ext::imgui::UI::FontAtlasTexId].desc = smart_refctd_ptr<IGPUImageView>(m_ui.manager->getFontAtlasView());
+
+			for (uint32_t i = 0; i < descriptorInfo.size(); ++i)
+			{
+				writes[i].dstSet = m_ui.descriptorSet.get();
+				writes[i].binding = 0u;
+				writes[i].arrayElement = i;
+				writes[i].count = 1u;
+			}
+			writes[nbl::ext::imgui::UI::FontAtlasTexId].info = descriptorInfo.data() + nbl::ext::imgui::UI::FontAtlasTexId;
+
+			return m_device->updateDescriptorSets(writes, {});
+		}
+
+		inline void workLoopBody() override
+		{
+			// framesInFlight: ensuring safe execution of command buffers and acquires, `framesInFlight` only affect semaphore waits, don't use this to index your resources because it can change with swapchain recreation.
+			const uint32_t framesInFlight = core::min(MaxFramesInFlight, m_surface->getMaxAcquiresInFlight());
+			// We block for semaphores for 2 reasons here:
+				// A) Resource: Can't use resource like a command buffer BEFORE previous use is finished! [MaxFramesInFlight]
+				// B) Acquire: Can't have more acquires in flight than a certain threshold returned by swapchain or your surface helper class. [MaxAcquiresInFlight]
+			if (m_realFrameIx >= framesInFlight)
+			{
+				const ISemaphore::SWaitInfo cbDonePending[] = 
+				{
+					{
+						.semaphore = m_semaphore.get(),
+						.value = m_realFrameIx + 1 - framesInFlight
+					}
+				};
+				if (m_device->blockForSemaphores(cbDonePending) != ISemaphore::WAIT_RESULT::SUCCESS)
+					return;
+			}
+			const auto resourceIx = m_realFrameIx % MaxFramesInFlight;
+
+			m_api->startCapture();
+
+			// CPU events
+			update();
+
+			auto queue = getGraphicsQueue();
+			auto cmdbuf = m_cmdBufs[resourceIx].get();
+
+			if (!keepRunning())
+				return;
+
+			// render whole scene to offline frame buffer & submit
+			{
+				cmdbuf->reset(IGPUCommandBuffer::RESET_FLAGS::NONE);
+				// disregard surface/swapchain transformation for now
+				const auto viewProjectionMatrix = m_camera.getConcatenatedMatrix();
+				PTPushConstant pc;
+				viewProjectionMatrix.getInverseTransform(pc.invMVP);
+				pc.sampleCount = spp;
+				pc.depth = depth;
+
+				// safe to proceed
+				// upload buffer data
+				cmdbuf->beginDebugMarker("ComputeShaderPathtracer IMGUI Frame");
+				cmdbuf->begin(IGPUCommandBuffer::USAGE::ONE_TIME_SUBMIT_BIT);
+
+				// TRANSITION m_outImgView to GENERAL (because of descriptorSets0 -> ComputeShader Writes into the image)
+				{
+					const IGPUCommandBuffer::SImageMemoryBarrier<IGPUCommandBuffer::SOwnershipTransferBarrier> imgBarriers[] = {
+						{
+							.barrier = {
+								.dep = {
+									.srcStageMask = PIPELINE_STAGE_FLAGS::ALL_TRANSFER_BITS,
+									.srcAccessMask = ACCESS_FLAGS::TRANSFER_WRITE_BIT,
+									.dstStageMask = PIPELINE_STAGE_FLAGS::COMPUTE_SHADER_BIT,
+									.dstAccessMask = ACCESS_FLAGS::SHADER_WRITE_BITS
+								}
+							},
+							.image = m_outImgView->getCreationParameters().image.get(),
+							.subresourceRange = {
+								.aspectMask = IImage::EAF_COLOR_BIT,
+								.baseMipLevel = 0u,
+								.levelCount = 1u,
+								.baseArrayLayer = 0u,
+								.layerCount = 1u
+							},
+							.oldLayout = IImage::LAYOUT::UNDEFINED,
+							.newLayout = IImage::LAYOUT::GENERAL
+						}
+					};
+					cmdbuf->pipelineBarrier(E_DEPENDENCY_FLAGS::EDF_NONE, { .imgBarriers = imgBarriers });
+				}
+
+				// cube envmap handle
+				{
+					auto pipeline = m_PTPipelines[PTPipline].get();
+					cmdbuf->bindComputePipeline(pipeline);
+					cmdbuf->bindDescriptorSets(EPBP_COMPUTE, pipeline->getLayout(), 0u, 1u, &m_descriptorSet0.get());
+					cmdbuf->bindDescriptorSets(EPBP_COMPUTE, pipeline->getLayout(), 2u, 1u, &m_descriptorSet2.get());
+					cmdbuf->pushConstants(pipeline->getLayout(), IShader::E_SHADER_STAGE::ESS_COMPUTE, 0, sizeof(PTPushConstant), &pc);
+					cmdbuf->dispatch(1 + (WindowDimensions.x - 1) / DefaultWorkGroupSize, 1 + (WindowDimensions.y - 1) / DefaultWorkGroupSize, 1u);
+				}
+
+				// TRANSITION m_outImgView to READ (because of descriptorSets0 -> ComputeShader Writes into the image)
+				{
+					const IGPUCommandBuffer::SImageMemoryBarrier<IGPUCommandBuffer::SOwnershipTransferBarrier> imgBarriers[] = {
+						{
+							.barrier = {
+								.dep = {
+									.srcStageMask = PIPELINE_STAGE_FLAGS::COMPUTE_SHADER_BIT,
+									.srcAccessMask = ACCESS_FLAGS::SHADER_WRITE_BITS,
+									.dstStageMask = PIPELINE_STAGE_FLAGS::FRAGMENT_SHADER_BIT,
+									.dstAccessMask = ACCESS_FLAGS::SHADER_READ_BITS
+								}
+							},
+							.image = m_outImgView->getCreationParameters().image.get(),
+							.subresourceRange = {
+								.aspectMask = IImage::EAF_COLOR_BIT,
+								.baseMipLevel = 0u,
+								.levelCount = 1u,
+								.baseArrayLayer = 0u,
+								.layerCount = 1u
+							},
+							.oldLayout = IImage::LAYOUT::GENERAL,
+							.newLayout = IImage::LAYOUT::READ_ONLY_OPTIMAL
+						}
+					};
+					cmdbuf->pipelineBarrier(E_DEPENDENCY_FLAGS::EDF_NONE, { .imgBarriers = imgBarriers });
+				}
+
+				// TODO: tone mapping and stuff
+			}
+
+			asset::SViewport viewport;
+			{
+				viewport.minDepth = 1.f;
+				viewport.maxDepth = 0.f;
+				viewport.x = 0u;
+				viewport.y = 0u;
+				viewport.width = WindowDimensions.x;
+				viewport.height = WindowDimensions.y;
+			}
+			cmdbuf->setViewport(0u, 1u, &viewport);
+
+
+			VkRect2D defaultScisors[] = { {.offset = {(int32_t)viewport.x, (int32_t)viewport.y}, .extent = {(uint32_t)viewport.width, (uint32_t)viewport.height}} };
+			cmdbuf->setScissor(defaultScisors);
+
+			const VkRect2D currentRenderArea =
+			{
+				.offset = {0,0},
+				.extent = {m_window->getWidth(),m_window->getHeight()}
+			};
+			auto scRes = static_cast<CDefaultSwapchainFramebuffers*>(m_surface->getSwapchainResources());
+
+			// Upload m_outImg to swapchain + UI
+			{
+				const IGPUCommandBuffer::SRenderpassBeginInfo info =
+				{
+					.framebuffer = scRes->getFramebuffer(m_currentImageAcquire.imageIndex),
+					.colorClearValues = &clearColor,
+					.depthStencilClearValues = nullptr,
+					.renderArea = currentRenderArea
+				};
+				nbl::video::ISemaphore::SWaitInfo waitInfo = { .semaphore = m_semaphore.get(), .value = m_realFrameIx + 1u };
+
+				cmdbuf->beginRenderPass(info, IGPUCommandBuffer::SUBPASS_CONTENTS::INLINE);
+
+				cmdbuf->bindGraphicsPipeline(m_presentPipeline.get());
+				cmdbuf->bindDescriptorSets(EPBP_GRAPHICS, m_presentPipeline->getLayout(), 0, 1u, &m_presentDescriptorSet.get());
+				ext::FullScreenTriangle::recordDrawCall(cmdbuf);
+
+				const auto uiParams = m_ui.manager->getCreationParameters();
+				auto* uiPipeline = m_ui.manager->getPipeline();
+				cmdbuf->bindGraphicsPipeline(uiPipeline);
+				cmdbuf->bindDescriptorSets(EPBP_GRAPHICS, uiPipeline->getLayout(), uiParams.resources.texturesInfo.setIx, 1u, &m_ui.descriptorSet.get());
+				m_ui.manager->render(cmdbuf, waitInfo);
+
+				cmdbuf->endRenderPass();
+			}
+
+			cmdbuf->end();
+			{
+				const IQueue::SSubmitInfo::SSemaphoreInfo rendered[] =
+				{
+					{
+						.semaphore = m_semaphore.get(),
+						.value = ++m_realFrameIx,
+						.stageMask = PIPELINE_STAGE_FLAGS::COLOR_ATTACHMENT_OUTPUT_BIT
+					}
+				};
+				{
+					{
+						const IQueue::SSubmitInfo::SCommandBufferInfo commandBuffers[] =
+						{
+							{.cmdbuf = cmdbuf }
+						};
+
+						const IQueue::SSubmitInfo::SSemaphoreInfo acquired[] =
+						{
+							{
+								.semaphore = m_currentImageAcquire.semaphore,
+								.value = m_currentImageAcquire.acquireCount,
+								.stageMask = PIPELINE_STAGE_FLAGS::NONE
+							}
+						};
+						const IQueue::SSubmitInfo infos[] =
+						{
+							{
+								.waitSemaphores = acquired,
+								.commandBuffers = commandBuffers,
+								.signalSemaphores = rendered
+							}
+						};
+
+						updateGUIDescriptorSet();
+
+						if (queue->submit(infos) != IQueue::RESULT::SUCCESS)
+							m_realFrameIx--;
+					}
+				}
+
+				m_window->setCaption("[Nabla Engine] Computer Path Tracer");
+				m_surface->present(m_currentImageAcquire.imageIndex, rendered);
+			}
+			m_api->endCapture();
+		}
+
+		inline bool keepRunning() override
+		{
+			if (m_surface->irrecoverable())
+				return false;
+
+			return true;
+		}
+
+		inline bool onAppTerminated() override
+		{
+			return device_base_t::onAppTerminated();
+		}
+
+		inline void update()
+		{
+			m_camera.setMoveSpeed(moveSpeed);
+			m_camera.setRotateSpeed(rotateSpeed);
+
+			static std::chrono::microseconds previousEventTimestamp{};
+
+			m_inputSystem->getDefaultMouse(&mouse);
+			m_inputSystem->getDefaultKeyboard(&keyboard);
+
+			auto updatePresentationTimestamp = [&]()
+			{
+				m_currentImageAcquire = m_surface->acquireNextImage();
+
+				m_oracle.reportEndFrameRecord();
+				const auto timestamp = m_oracle.getNextPresentationTimeStamp();
+				m_oracle.reportBeginFrameRecord();
+
+				return timestamp;
+			};
+
+			const auto nextPresentationTimestamp = updatePresentationTimestamp();
+
+			struct
+			{
+				std::vector<SMouseEvent> mouse{};
+				std::vector<SKeyboardEvent> keyboard{};
+			} capturedEvents;
+
+			m_camera.beginInputProcessing(nextPresentationTimestamp);
+			{
+				mouse.consumeEvents([&](const IMouseEventChannel::range_t& events) -> void
+				{
+					m_camera.mouseProcess(events); // don't capture the events, only let camera handle them with its impl
+
+					for (const auto& e : events) // here capture
+					{
+						if (e.timeStamp < previousEventTimestamp)
+							continue;
+
+						previousEventTimestamp = e.timeStamp;
+						capturedEvents.mouse.emplace_back(e);
+
+						if (e.type == nbl::ui::SMouseEvent::EET_SCROLL)
+							gcIndex = std::clamp<uint16_t>(int16_t(gcIndex) + int16_t(core::sign(e.scrollEvent.verticalScroll)), int64_t(0), int64_t(ELG_COUNT - (uint8_t)1u));
+					}
+				}, m_logger.get());
+
+				keyboard.consumeEvents([&](const IKeyboardEventChannel::range_t& events) -> void
+				{
+					m_camera.keyboardProcess(events); // don't capture the events, only let camera handle them with its impl
+
+					for (const auto& e : events) // here capture
+					{
+						if (e.timeStamp < previousEventTimestamp)
+							continue;
+
+						previousEventTimestamp = e.timeStamp;
+						capturedEvents.keyboard.emplace_back(e);
+					}
+				}, m_logger.get());
+			}
+			m_camera.endInputProcessing(nextPresentationTimestamp);
+
+			const core::SRange<const nbl::ui::SMouseEvent> mouseEvents(capturedEvents.mouse.data(), capturedEvents.mouse.data() + capturedEvents.mouse.size());
+			const core::SRange<const nbl::ui::SKeyboardEvent> keyboardEvents(capturedEvents.keyboard.data(), capturedEvents.keyboard.data() + capturedEvents.keyboard.size());
+			const auto cursorPosition = m_window->getCursorControl()->getPosition();
+			const auto mousePosition = float32_t2(cursorPosition.x, cursorPosition.y) - float32_t2(m_window->getX(), m_window->getY());
+
+			const ext::imgui::UI::SUpdateParameters params =
+			{
+				.mousePosition = mousePosition,
+				.displaySize = { m_window->getWidth(), m_window->getHeight() },
+				.mouseEvents = mouseEvents,
+				.keyboardEvents = keyboardEvents
+			};
+
+			m_ui.manager->update(params);
+		}
+
+	private:
+		smart_refctd_ptr<IWindow> m_window;
+		smart_refctd_ptr<CSimpleResizeSurface<CDefaultSwapchainFramebuffers>> m_surface;
+
+		// gpu resources
+		smart_refctd_ptr<IGPUCommandPool> m_cmdPool;
+		std::array<smart_refctd_ptr<IGPUComputePipeline>, E_LIGHT_GEOMETRY::ELG_COUNT> m_PTPipelines;
+		smart_refctd_ptr<IGPUGraphicsPipeline> m_presentPipeline;
+		uint64_t m_realFrameIx = 0;
+		std::array<smart_refctd_ptr<IGPUCommandBuffer>, MaxFramesInFlight> m_cmdBufs;
+		ISimpleManagedSurface::SAcquireResult m_currentImageAcquire = {};
+		smart_refctd_ptr<IGPUDescriptorSet> m_descriptorSet0, m_descriptorSet2, m_presentDescriptorSet;
+
+		core::smart_refctd_ptr<IDescriptorPool> m_guiDescriptorSetPool;
+
+		// system resources
+		core::smart_refctd_ptr<InputSystem> m_inputSystem;
+		InputSystem::ChannelReader<IMouseEventChannel> mouse;
+		InputSystem::ChannelReader<IKeyboardEventChannel> keyboard;
+
+		// pathtracer resources
+		smart_refctd_ptr<IGPUImageView> m_envMapView, m_scrambleView;
+		smart_refctd_ptr<IGPUBufferView> m_sequenceBufferView;
+		smart_refctd_ptr<IGPUImageView> m_outImgView;
+
+		// sync
+		smart_refctd_ptr<ISemaphore> m_semaphore;
+
+		// image upload resources
+		smart_refctd_ptr<ISemaphore> m_scratchSemaphore;
+		SIntendedSubmitInfo m_intendedSubmit;
+
+		struct C_UI
+		{
+			nbl::core::smart_refctd_ptr<nbl::ext::imgui::UI> manager;
+
+			struct
+			{
+				core::smart_refctd_ptr<video::IGPUSampler> gui, scene;
+			} samplers;
+
+			core::smart_refctd_ptr<IGPUDescriptorSet> descriptorSet;
+		} m_ui;
+
+		Camera m_camera;
+
+		video::CDumbPresentationOracle m_oracle;
+
+		uint16_t gcIndex = {}; // note: this is dirty however since I assume only single object in scene I can leave it now, when this example is upgraded to support multiple objects this needs to be changed
+
+		float fov = 60.f, zNear = 0.1f, zFar = 10000.f, moveSpeed = 1.f, rotateSpeed = 1.f;
+		float viewWidth = 10.f;
+		float camYAngle = 165.f / 180.f * 3.14159f;
+		float camXAngle = 32.f / 180.f * 3.14159f;
+		int PTPipline = E_LIGHT_GEOMETRY::ELG_SPHERE;
+		int spp = 32;
+		int depth = 3;
+
+		bool m_firstFrame = true;
+		IGPUCommandBuffer::SClearColorValue clearColor = { .float32 = {0.f,0.f,0.f,1.f} };
+};
+
+NBL_MAIN_FUNC(ComputeShaderPathtracer)
diff --git a/31_HLSLPathTracer/pipeline.groovy b/31_HLSLPathTracer/pipeline.groovy
new file mode 100644
index 000000000..955e77cec
--- /dev/null
+++ b/31_HLSLPathTracer/pipeline.groovy
@@ -0,0 +1,50 @@
+import org.DevshGraphicsProgramming.Agent
+import org.DevshGraphicsProgramming.BuilderInfo
+import org.DevshGraphicsProgramming.IBuilder
+
+class CHLSLPathTracerBuilder extends IBuilder
+{
+	public CHLSLPathTracerBuilder(Agent _agent, _info)
+	{
+		super(_agent, _info)
+	}
+	
+	@Override
+	public boolean prepare(Map axisMapping)
+	{
+		return true
+	}
+	
+	@Override
+  	public boolean build(Map axisMapping)
+	{
+		IBuilder.CONFIGURATION config = axisMapping.get("CONFIGURATION")
+		IBuilder.BUILD_TYPE buildType = axisMapping.get("BUILD_TYPE")
+		
+		def nameOfBuildDirectory = getNameOfBuildDirectory(buildType)
+		def nameOfConfig = getNameOfConfig(config)
+		
+		agent.execute("cmake --build ${info.rootProjectPath}/${nameOfBuildDirectory}/${info.targetProjectPathRelativeToRoot} --target ${info.targetBaseName} --config ${nameOfConfig} -j12 -v")
+		
+		return true
+	}
+	
+	@Override
+  	public boolean test(Map axisMapping)
+	{
+		return true
+	}
+	
+	@Override
+	public boolean install(Map axisMapping)
+	{
+		return true
+	}
+}
+
+def create(Agent _agent, _info)
+{
+	return new CHLSLPathTracerBuilder(_agent, _info)
+}
+
+return this
diff --git a/CMakeLists.txt b/CMakeLists.txt
index 935354ed7..aa84caa6b 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -70,6 +70,8 @@ if(NBL_BUILD_EXAMPLES)
 	# Showcase compute pathtracing
 	add_subdirectory(30_ComputeShaderPathTracer EXCLUDE_FROM_ALL)
 
+	add_subdirectory(31_HLSLPathTracer EXCLUDE_FROM_ALL)
+
 	add_subdirectory(38_EXRSplit EXCLUDE_FROM_ALL)
 	# if (NBL_BUILD_MITSUBA_LOADER AND NBL_BUILD_OPTIX)
 	#	add_subdirectory(39_DenoiserTonemapper EXCLUDE_FROM_ALL)

From 85211238891bff05b749cda5f36c8b2610210666 Mon Sep 17 00:00:00 2001
From: keptsecret <sorchon@gmail.com>
Date: Mon, 3 Feb 2025 15:51:28 +0700
Subject: [PATCH 020/529] ignore events on imgui focus

---
 31_HLSLPathTracer/main.cpp | 49 ++++++++++++++++++++------------------
 1 file changed, 26 insertions(+), 23 deletions(-)

diff --git a/31_HLSLPathTracer/main.cpp b/31_HLSLPathTracer/main.cpp
index 73434a852..018468e46 100644
--- a/31_HLSLPathTracer/main.cpp
+++ b/31_HLSLPathTracer/main.cpp
@@ -1112,7 +1112,7 @@ class ComputeShaderPathtracer final : public examples::SimpleWindowedApplication
 					}
 				}
 
-				m_window->setCaption("[Nabla Engine] Computer Path Tracer");
+				m_window->setCaption("[Nabla Engine] HLSL Compute Path Tracer");
 				m_surface->present(m_currentImageAcquire.imageIndex, rendered);
 			}
 			m_api->endCapture();
@@ -1162,36 +1162,39 @@ class ComputeShaderPathtracer final : public examples::SimpleWindowedApplication
 
 			m_camera.beginInputProcessing(nextPresentationTimestamp);
 			{
+				const auto& io = ImGui::GetIO();
 				mouse.consumeEvents([&](const IMouseEventChannel::range_t& events) -> void
-				{
-					m_camera.mouseProcess(events); // don't capture the events, only let camera handle them with its impl
-
-					for (const auto& e : events) // here capture
 					{
-						if (e.timeStamp < previousEventTimestamp)
-							continue;
+						if (!io.WantCaptureMouse)
+							m_camera.mouseProcess(events); // don't capture the events, only let camera handle them with its impl
 
-						previousEventTimestamp = e.timeStamp;
-						capturedEvents.mouse.emplace_back(e);
+						for (const auto& e : events) // here capture
+						{
+							if (e.timeStamp < previousEventTimestamp)
+								continue;
 
-						if (e.type == nbl::ui::SMouseEvent::EET_SCROLL)
-							gcIndex = std::clamp<uint16_t>(int16_t(gcIndex) + int16_t(core::sign(e.scrollEvent.verticalScroll)), int64_t(0), int64_t(ELG_COUNT - (uint8_t)1u));
-					}
-				}, m_logger.get());
+							previousEventTimestamp = e.timeStamp;
+							capturedEvents.mouse.emplace_back(e);
 
+							if (e.type == nbl::ui::SMouseEvent::EET_SCROLL)
+								gcIndex = std::clamp<uint16_t>(int16_t(gcIndex) + int16_t(core::sign(e.scrollEvent.verticalScroll)), int64_t(0), int64_t(ELG_COUNT - (uint8_t)1u));
+						}
+					}, m_logger.get());
+				
 				keyboard.consumeEvents([&](const IKeyboardEventChannel::range_t& events) -> void
-				{
-					m_camera.keyboardProcess(events); // don't capture the events, only let camera handle them with its impl
-
-					for (const auto& e : events) // here capture
 					{
-						if (e.timeStamp < previousEventTimestamp)
-							continue;
+						if (!io.WantCaptureKeyboard)
+							m_camera.keyboardProcess(events); // don't capture the events, only let camera handle them with its impl
 
-						previousEventTimestamp = e.timeStamp;
-						capturedEvents.keyboard.emplace_back(e);
-					}
-				}, m_logger.get());
+						for (const auto& e : events) // here capture
+						{
+							if (e.timeStamp < previousEventTimestamp)
+								continue;
+
+							previousEventTimestamp = e.timeStamp;
+							capturedEvents.keyboard.emplace_back(e);
+						}
+					}, m_logger.get());
 			}
 			m_camera.endInputProcessing(nextPresentationTimestamp);
 

From b171724bb0db3bf6f144d6eb077e95ddea806cbd Mon Sep 17 00:00:00 2001
From: keptsecret <sorchon@gmail.com>
Date: Tue, 4 Feb 2025 14:16:06 +0700
Subject: [PATCH 021/529] initial files for pathtracer

---
 .../app_resources/hlsl/common.hlsl            | 49 ++++++++++++
 .../app_resources/hlsl/intersector.hlsl       | 27 +++++++
 .../app_resources/hlsl/material_system.hlsl   | 20 +++++
 .../hlsl/next_event_estimator.hlsl            | 20 +++++
 .../app_resources/hlsl/pathtracer.hlsl        | 32 ++++++++
 .../app_resources/hlsl/rand_gen.hlsl          | 38 +++++++++
 .../app_resources/hlsl/ray_gen.hlsl           | 80 +++++++++++++++++++
 7 files changed, 266 insertions(+)
 create mode 100644 31_HLSLPathTracer/app_resources/hlsl/common.hlsl
 create mode 100644 31_HLSLPathTracer/app_resources/hlsl/intersector.hlsl
 create mode 100644 31_HLSLPathTracer/app_resources/hlsl/material_system.hlsl
 create mode 100644 31_HLSLPathTracer/app_resources/hlsl/next_event_estimator.hlsl
 create mode 100644 31_HLSLPathTracer/app_resources/hlsl/pathtracer.hlsl
 create mode 100644 31_HLSLPathTracer/app_resources/hlsl/rand_gen.hlsl
 create mode 100644 31_HLSLPathTracer/app_resources/hlsl/ray_gen.hlsl

diff --git a/31_HLSLPathTracer/app_resources/hlsl/common.hlsl b/31_HLSLPathTracer/app_resources/hlsl/common.hlsl
new file mode 100644
index 000000000..694defc08
--- /dev/null
+++ b/31_HLSLPathTracer/app_resources/hlsl/common.hlsl
@@ -0,0 +1,49 @@
+#ifndef _NBL_HLSL_EXT_PATHTRACING_COMMON_INCLUDED_
+#define _NBL_HLSL_EXT_PATHTRACING_COMMON_INCLUDED_
+
+namespace nbl
+{
+namespace hlsl
+{
+namespace ext
+{
+
+template<typename T>
+struct Payload
+{
+    using this_t = Payload<T>;
+    using scalar_type = T;
+    using vector3_type = vector<T, 3>;
+
+    vector3_type accumulation;
+    scalar_type otherTechniqueHeuristic;
+    vector3_type throughput;
+    // #ifdef KILL_DIFFUSE_SPECULAR_PATHS
+    // bool hasDiffuse;
+    // #endif
+};
+
+template<typename T>
+struct Ray
+{
+    using this_t = Ray<T>;
+    using scalar_type = T;
+    using vector3_type = vector<T, 3>;
+
+    // immutable
+    vector3_type origin;
+    vector3_type direction;
+    // TODO: polygon method == 2 stuff
+
+    // mutable
+    scalar_type intersectionT;
+    uint32_t objectID;
+
+    Payload<T> payload;
+};
+
+}
+}
+}
+
+#endif
\ No newline at end of file
diff --git a/31_HLSLPathTracer/app_resources/hlsl/intersector.hlsl b/31_HLSLPathTracer/app_resources/hlsl/intersector.hlsl
new file mode 100644
index 000000000..5d12d6d18
--- /dev/null
+++ b/31_HLSLPathTracer/app_resources/hlsl/intersector.hlsl
@@ -0,0 +1,27 @@
+#ifndef _NBL_HLSL_EXT_INTERSECTOR_INCLUDED_
+#define _NBL_HLSL_EXT_INTERSECTOR_INCLUDED_
+
+namespace nbl
+{
+namespace hlsl
+{
+namespace ext
+{
+namespace Intersector
+{
+
+// ray query method
+
+// ray tracing pipeline method
+
+struct Procedural
+{
+    
+};
+
+}
+}
+}
+}
+
+#endif
\ No newline at end of file
diff --git a/31_HLSLPathTracer/app_resources/hlsl/material_system.hlsl b/31_HLSLPathTracer/app_resources/hlsl/material_system.hlsl
new file mode 100644
index 000000000..6f635ab68
--- /dev/null
+++ b/31_HLSLPathTracer/app_resources/hlsl/material_system.hlsl
@@ -0,0 +1,20 @@
+#ifndef _NBL_HLSL_EXT_MATERIAL_SYSTEM_INCLUDED_
+#define _NBL_HLSL_EXT_MATERIAL_SYSTEM_INCLUDED_
+
+namespace nbl
+{
+namespace hlsl
+{
+namespace ext
+{
+namespace MaterialSystem
+{
+
+
+
+}
+}
+}
+}
+
+#endif
\ No newline at end of file
diff --git a/31_HLSLPathTracer/app_resources/hlsl/next_event_estimator.hlsl b/31_HLSLPathTracer/app_resources/hlsl/next_event_estimator.hlsl
new file mode 100644
index 000000000..1afa8d12e
--- /dev/null
+++ b/31_HLSLPathTracer/app_resources/hlsl/next_event_estimator.hlsl
@@ -0,0 +1,20 @@
+#ifndef _NBL_HLSL_EXT_NEXT_EVENT_ESTIMATOR_INCLUDED_
+#define _NBL_HLSL_EXT_NEXT_EVENT_ESTIMATOR_INCLUDED_
+
+namespace nbl
+{
+namespace hlsl
+{
+namespace ext
+{
+namespace NextEventEstimator
+{
+
+
+
+}
+}
+}
+}
+
+#endif
\ No newline at end of file
diff --git a/31_HLSLPathTracer/app_resources/hlsl/pathtracer.hlsl b/31_HLSLPathTracer/app_resources/hlsl/pathtracer.hlsl
new file mode 100644
index 000000000..9d2e8c260
--- /dev/null
+++ b/31_HLSLPathTracer/app_resources/hlsl/pathtracer.hlsl
@@ -0,0 +1,32 @@
+#ifndef _NBL_HLSL_EXT_PATHTRACER_INCLUDED_
+#define _NBL_HLSL_EXT_PATHTRACER_INCLUDED_
+
+namespace nbl
+{
+namespace hlsl
+{
+namespace ext
+{
+namespace PathTracer
+{
+
+template<class RandGen, class RayGen, class Intersector, class MaterialSystem, /* class PathGuider, */ class NextEventEstimator>
+struct Unidirectional
+{
+    using this_t = Unidirectional<RandGen, RayGen, Intersector, MaterialSystem, NextEventEstimator>;
+
+    static this_t create(RandGen randGen,
+                        RayGen rayGen,
+                        Intersector intersector,
+                        MaterialSystem materialSystem,
+                        /* PathGuider pathGuider, */
+                        NextEventEstimator nee)
+    {}
+};
+
+}
+}
+}
+}
+
+#endif
\ No newline at end of file
diff --git a/31_HLSLPathTracer/app_resources/hlsl/rand_gen.hlsl b/31_HLSLPathTracer/app_resources/hlsl/rand_gen.hlsl
new file mode 100644
index 000000000..949c2064b
--- /dev/null
+++ b/31_HLSLPathTracer/app_resources/hlsl/rand_gen.hlsl
@@ -0,0 +1,38 @@
+#ifndef _NBL_HLSL_EXT_RANDGEN_INCLUDED_
+#define _NBL_HLSL_EXT_RANDGEN_INCLUDED_
+
+namespace nbl
+{
+namespace hlsl
+{
+namespace ext
+{
+namespace RandGen
+{
+
+template<typename RNG>
+struct Uniform3D
+{
+    using rng_type = RNG;
+
+    static Uniform3D<RNG> create(uint32_t2 seed)
+    {
+        Uniform3D<RNG> retval;
+        retval.rng = rng_type::construct(seed);
+        return retval;
+    }
+
+    float32_t3 operator()()
+    {
+        return float32_t3(uint32_t3(rng(), rng(), rng()));
+    }
+
+    rng_type rng;
+};
+
+}
+}
+}
+}
+
+#endif
\ No newline at end of file
diff --git a/31_HLSLPathTracer/app_resources/hlsl/ray_gen.hlsl b/31_HLSLPathTracer/app_resources/hlsl/ray_gen.hlsl
new file mode 100644
index 000000000..467ef2bd4
--- /dev/null
+++ b/31_HLSLPathTracer/app_resources/hlsl/ray_gen.hlsl
@@ -0,0 +1,80 @@
+#ifndef _NBL_HLSL_EXT_RAYGEN_INCLUDED_
+#define _NBL_HLSL_EXT_RAYGEN_INCLUDED_
+
+#include "common.hlsl"
+
+namespace nbl
+{
+namespace hlsl
+{
+namespace ext
+{
+namespace RayGen
+{
+
+template<class Ray>
+struct Basic
+{
+    using this_t = Basic<Ray>;
+    using ray_type = Ray;
+    using scalar_type = typename Ray::scalar_type;
+    using vector3_type = typename Ray::vector3_type;
+    
+    using vector2_type = vector<scalar_type, 2>;
+    using vector4_type = vector<scalar_type, 4>;
+    using matrix4x4_type = matrix<scalar_type, 4, 4>;
+
+    static this_t create(NBL_CONST_REF_ARG(vector2_type) pixOffsetParam, NBL_CONST_REF_ARG(vector3_type) camPos, NBL_CONST_REF_ARG(vector4_type) NDC, NBL_CONST_REF_ARG(matrix4x4_type) invMVP)
+    {
+        this_t retval;
+        retval.pixOffsetParam = pixOffsetParam;
+        retval.camPos = camPos;
+        retval.NDC = NDC;
+        retval.invMVP = invMVP;
+        return retval;
+    }
+
+    ray_type generate(NBL_CONST_REF_ARG(vector3_type) randVec)
+    {
+        ray_type ray;
+        ray.origin = camPos;
+
+        vector4_type tmp = NDC;
+        // apply stochastic reconstruction filter
+        const float gaussianFilterCutoff = 2.5;
+        const float truncation = nbl::hlsl::exp(-0.5 * gaussianFilterCutoff * gaussianFilterCutoff);
+        vec2 remappedRand = randVec.xy;
+        remappedRand.x *= 1.0 - truncation;
+        remappedRand.x += truncation;
+        tmp.xy += pixOffsetParam * nbl::hlsl::boxMullerTransform<scalar_type>(remappedRand, 1.5);
+        // for depth of field we could do another stochastic point-pick
+        tmp = invMVP * tmp;
+        ray.direction = nbl::hlsl::normalize(tmp.xyz / tmp.w - camPos);
+
+        // #if POLYGON_METHOD==2
+        //     ray._immutable.normalAtOrigin = vec3(0.0,0.0,0.0);
+        //     ray._immutable.wasBSDFAtOrigin = false;
+        // #endif
+
+        ray.payload.accumulation = (vector3_type)0.0;
+        ray.payload.otherTechniqueHeuristic = 0.0; // needed for direct eye-light paths
+        ray.payload.throughput = (vector3_type)1.0;
+        // #ifdef KILL_DIFFUSE_SPECULAR_PATHS
+        // ray._payload.hasDiffuse = false;
+        // #endif
+
+        return ray;
+    }
+
+    vector2_type pixOffsetParam;
+    vector3_type camPos;
+    vector4_type NDC;
+    matrix4x4_type invMVP;
+};
+
+}
+}
+}
+}
+
+#endif
\ No newline at end of file

From af35393db518deca935259cab7c414dbad44be20 Mon Sep 17 00:00:00 2001
From: keptsecret <sorchon@gmail.com>
Date: Wed, 5 Feb 2025 14:16:08 +0700
Subject: [PATCH 022/529] intersection logic

---
 .../app_resources/hlsl/common.hlsl            | 95 +++++++++++++++++++
 .../app_resources/hlsl/intersector.hlsl       | 39 +++++++-
 .../app_resources/hlsl/pathtracer.hlsl        |  9 ++
 3 files changed, 142 insertions(+), 1 deletion(-)

diff --git a/31_HLSLPathTracer/app_resources/hlsl/common.hlsl b/31_HLSLPathTracer/app_resources/hlsl/common.hlsl
index 694defc08..56a4cace7 100644
--- a/31_HLSLPathTracer/app_resources/hlsl/common.hlsl
+++ b/31_HLSLPathTracer/app_resources/hlsl/common.hlsl
@@ -1,6 +1,10 @@
 #ifndef _NBL_HLSL_EXT_PATHTRACING_COMMON_INCLUDED_
 #define _NBL_HLSL_EXT_PATHTRACING_COMMON_INCLUDED_
 
+#include <nbl/builtin/hlsl/spirv_intrinsics/core.hlsl>
+#include <nbl/builtin/hlsl/spirv_intrinsics/glsl.std.450.hlsl>
+#include <nbl/builtin/hlsl/numbers.hlsl>
+
 namespace nbl
 {
 namespace hlsl
@@ -42,6 +46,97 @@ struct Ray
     Payload<T> payload;
 };
 
+enum PTIntersectionType : uint16_t
+{
+    PIT_NONE = 0,
+    PIT_SPHERE,
+    PIT_TRIANGLE,
+    PIT_RECTANGLE
+};
+
+// TODO: check if this works for ambiguous arrays of Intersection
+// unsure if calling correct method
+struct IIntersection
+{
+    PTIntersectionType type = PIT_NONE;
+};
+
+template<PTIntersectionType shape>
+struct Intersection : IIntersection
+{
+    PTIntersectionType type = PIT_NONE;
+};
+
+template<>
+struct Intersection<PIT_SPHERE> : IIntersection
+{
+    static Intersection<PIT_SPHERE> create(NBL_CONST_REF_ARG(float32_t3) position, float32_t radius, uint32_t bsdfID, uint32_t lightID)
+    {
+        Intersection<PIT_SPHERE> retval;
+        retval.type = PIT_SPHERE;
+        retval.position = position;
+        retval.radius2 = radius * radius;
+        retval.bsdfLightIDs = spirv::bitFieldInsert<uint32_t>(bsdfID, lightID, 16, 16);
+        return retval;
+    }
+
+    // return intersection distance if found, nan otherwise
+    float intersect(NBL_CONST_REF_ARG(float32_t3) origin, NBL_CONST_REF_ARG(float32_t3) direction)
+    {
+        float32_t3 relOrigin = origin - position;
+        float relOriginLen2 = nbl::hlsl::dot(relOrigin, relOrigin);
+
+        float dirDotRelOrigin = nbl::hlsl::dot(direction, relOrigin);
+        float det = radius2 - relOriginLen2 + dirDotRelOrigin * dirDotRelOrigin;
+
+        // do some speculative math here
+        float detsqrt = nbl::hlsl::sqrt(det);
+        return -dirDotRelOrigin + (relOriginLen2 > radius2 ? (-detsqrt) : detsqrt);
+    }
+
+    float32_t3 getNormal(NBL_CONST_REF_ARG(float32_t3) hitPosition)
+    {
+        const float radiusRcp = spirv::inverseSqrt<float32_t>(radius2);
+        return (hitPosition - position) * radiusRcp;
+    }
+
+    float getSolidAngle(NBL_CONST_REF_ARG(float32_t3) origin)
+    {
+        float32_t3 dist = position - origin;
+        float cosThetaMax = nbl::hlsl::sqrt(1.0 - radius2 / nbl::hlsl::dot(dist, dist));
+        return 2.0 * numbers::pi<float> * (1.0 - cosThetaMax);
+    }
+
+    // should this be in material system?
+    float deferredPdf(Light light, Ray ray)
+    {
+        return 1.0 / getSolidAngle(ray.origin);
+    }
+
+    float generate_and_pdf()
+    {
+        // TODO
+    }
+
+    float32_t3 generate_and
+
+    float32_t3 position;
+    float32_t radius2;
+    uint32_t bsdfLightIDs;
+};
+
+template<>
+struct Intersection<PIT_RECTANGLE> : IIntersection
+{
+
+};
+
+template<>
+struct Intersection<PIT_TRIANGLE> : IIntersection
+{
+
+};
+
 }
 }
 }
diff --git a/31_HLSLPathTracer/app_resources/hlsl/intersector.hlsl b/31_HLSLPathTracer/app_resources/hlsl/intersector.hlsl
index 5d12d6d18..b2b3d0d2d 100644
--- a/31_HLSLPathTracer/app_resources/hlsl/intersector.hlsl
+++ b/31_HLSLPathTracer/app_resources/hlsl/intersector.hlsl
@@ -1,6 +1,9 @@
 #ifndef _NBL_HLSL_EXT_INTERSECTOR_INCLUDED_
 #define _NBL_HLSL_EXT_INTERSECTOR_INCLUDED_
 
+#include "common.hlsl"
+#include <nbl/builtin/hlsl/limits.hlsl>
+
 namespace nbl
 {
 namespace hlsl
@@ -11,12 +14,46 @@ namespace Intersector
 {
 
 // ray query method
+// ray query struct holds AS info
+// pass in address to vertex/index buffers?
 
 // ray tracing pipeline method
 
+// does everything in traceray in ex 30
+template<class Ray>
 struct Procedural
 {
-    
+    using scalar_type = typename Ray::scalar_type;
+    using ray_type = Ray;
+
+    static int traceRay(NBL_REF_ARG(ray_type) ray, IIntersection objects[32], int objCount)
+    {
+        const bool anyHit = ray.intersectionT != numeric_limits<scalar_type>::max;
+
+        int objectID = -1;
+        for (int i = 0; i < objCount; i++)
+        {
+            float t;
+            if (objects[i].type == PIT_SPHERE)  // we don't know what type of intersection it is so cast, there has to be a better way to do this
+            {
+                Intersection<PIT_SPHERE> sphere = (Intersection<PIT_SPHERE>)objects[i];
+                t = sphere.intersect(ray.origin, ray.direction);
+            }
+            // TODO: other types
+            
+            bool closerIntersection = t > 0.0 && t < ray.intersectionT;
+
+            ray.intersectionT = closerIntersection ? t : ray.intersectionT;
+            objectID = closerIntersection ? i : objectID;
+            
+            // allowing early out results in a performance regression, WTF!?
+            //if (anyHit && closerIntersection)
+            //break;
+        }
+        return objectID;
+    }
+
+    // TODO? traceray with vertex/index buffer
 };
 
 }
diff --git a/31_HLSLPathTracer/app_resources/hlsl/pathtracer.hlsl b/31_HLSLPathTracer/app_resources/hlsl/pathtracer.hlsl
index 9d2e8c260..f28dc621b 100644
--- a/31_HLSLPathTracer/app_resources/hlsl/pathtracer.hlsl
+++ b/31_HLSLPathTracer/app_resources/hlsl/pathtracer.hlsl
@@ -22,6 +22,15 @@ struct Unidirectional
                         /* PathGuider pathGuider, */
                         NextEventEstimator nee)
     {}
+
+    // closest hit
+
+    // Li
+    MaterialSystem::measure_t getMeasure()
+    {
+        // loop through bounces, do closest hit
+        // return ray.payload.accumulation --> color
+    }
 };
 
 }

From ab582180d9a1e735706d913deb0e078c97280d48 Mon Sep 17 00:00:00 2001
From: kevyuu <kevin.kayu@gmail.com>
Date: Wed, 5 Feb 2025 23:53:52 +0700
Subject: [PATCH 023/529] Reorder hlsl datastructure to reduce padding and make
 it more compact

---
 71_RayTracingPipeline/app_resources/common.hlsl | 9 ++++-----
 71_RayTracingPipeline/main.cpp                  | 6 +++---
 2 files changed, 7 insertions(+), 8 deletions(-)

diff --git a/71_RayTracingPipeline/app_resources/common.hlsl b/71_RayTracingPipeline/app_resources/common.hlsl
index 8c73fada3..af35cb731 100644
--- a/71_RayTracingPipeline/app_resources/common.hlsl
+++ b/71_RayTracingPipeline/app_resources/common.hlsl
@@ -17,23 +17,22 @@ struct Material
 
 struct SProceduralGeomInfo
 {
+    Material material;
     float32_t3 center;
     float32_t radius;
-    Material material;
 };
 
 struct STriangleGeomInfo
 {
+    Material material;
     uint64_t vertexBufferAddress;
     uint64_t indexBufferAddress;
 
-    uint32_t vertexStride : 29;
+    uint32_t vertexStride : 26;
+    uint32_t objType: 3;
     uint32_t indexType : 2; // 16 bit, 32 bit or none
     uint32_t smoothNormals : 1;	// flat for cube, rectangle, disk
 
-    uint32_t objType;
-
-    Material material;
 };
 
 enum E_GEOM_TYPE : uint16_t
diff --git a/71_RayTracingPipeline/main.cpp b/71_RayTracingPipeline/main.cpp
index 22c745635..d9186a9bc 100644
--- a/71_RayTracingPipeline/main.cpp
+++ b/71_RayTracingPipeline/main.cpp
@@ -1242,8 +1242,6 @@ class RaytracingPipelineApp final : public examples::SimpleWindowedApplication,
       {
         const auto middle_i = NumberOfProceduralGeometries / 2.0;
         SProceduralGeomInfo sphere = {
-          .center = float32_t3((i - middle_i) * 4.0, 2, 5.0),
-          .radius = 1,
           .material = {
             .ambient = {},
             .diffuse = {0.3, 0.2 * i, 0.3},
@@ -1251,6 +1249,8 @@ class RaytracingPipelineApp final : public examples::SimpleWindowedApplication,
             .shininess = 1.0f,
             .illum = 2
           },
+          .center = float32_t3((i - middle_i) * 4.0, 2, 5.0),
+          .radius = 1,
         };
 
         proceduralGeoms.push_back(sphere);
@@ -1415,7 +1415,7 @@ class RaytracingPipelineApp final : public examples::SimpleWindowedApplication,
       // setup blas info for triangle geometries
       for (uint32_t i = 0; i < blasCount; i++)
       {
-        bool isProcedural = i == proceduralBlasIdx;
+        const auto isProcedural = i == proceduralBlasIdx;
         if (isProcedural)
         {
           aabbs.data.buffer = smart_refctd_ptr(m_proceduralAabbBuffer);

From 6966942f6de8cc51be77b281687cbd8d922bcbac Mon Sep 17 00:00:00 2001
From: kevyuu <kevin.kayu@gmail.com>
Date: Wed, 5 Feb 2025 23:54:36 +0700
Subject: [PATCH 024/529] Adjust changes to SPhysicalDeviceLimit regarding
 shaderGroupHandleSize

---
 71_RayTracingPipeline/main.cpp | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/71_RayTracingPipeline/main.cpp b/71_RayTracingPipeline/main.cpp
index d9186a9bc..c3aadeff7 100644
--- a/71_RayTracingPipeline/main.cpp
+++ b/71_RayTracingPipeline/main.cpp
@@ -1282,7 +1282,7 @@ class RaytracingPipelineApp final : public examples::SimpleWindowedApplication,
   bool createShaderBindingTable(video::CThreadSafeQueueAdapter* queue, const smart_refctd_ptr<video::IGPURayTracingPipeline>& pipeline)
   {
     const auto& limits = m_device->getPhysicalDevice()->getLimits();
-    const auto handleSize = limits.shaderGroupHandleSize;
+    const auto handleSize = SPhysicalDeviceLimits::ShaderGroupHandleSize;
     const auto handleSizeAligned = nbl::core::alignUp(handleSize, limits.shaderGroupHandleAlignment);
 
     auto& raygenRegion = m_shaderBindingTable.raygenGroupRegion;

From ef02db2fda6fbd4e9652e67bcf946310e76a8103 Mon Sep 17 00:00:00 2001
From: kevyuu <kevin.kayu@gmail.com>
Date: Wed, 5 Feb 2025 23:55:11 +0700
Subject: [PATCH 025/529] Adjust changes to ray tracing SShaderGroupParams

---
 71_RayTracingPipeline/main.cpp | 33 ++++++++++++++++++---------------
 1 file changed, 18 insertions(+), 15 deletions(-)

diff --git a/71_RayTracingPipeline/main.cpp b/71_RayTracingPipeline/main.cpp
index c3aadeff7..218c6157d 100644
--- a/71_RayTracingPipeline/main.cpp
+++ b/71_RayTracingPipeline/main.cpp
@@ -353,42 +353,45 @@ class RaytracingPipelineApp final : public examples::SimpleWindowedApplication,
       params.layout = pipelineLayout.get();
       params.shaders = std::span(shaders);
 
-      auto& shaderGroups = params.cached.shaderGroups;
+      auto& shaderGroups = params.shaderGroups;
 
       shaderGroups.raygenGroup = { .shaderIndex = RTDS_RAYGEN };
 
-      shaderGroups.missGroups.resize(E_MISS_TYPE::EMT_COUNT, {});
-      shaderGroups.missGroups[EMT_PRIMARY] = { .shaderIndex = RTDS_MISS };
-      shaderGroups.missGroups[EMT_OCCLUSION] = { .shaderIndex = RTDS_SHADOW_MISS };
+      SGeneralShaderGroup missGroups[EMT_COUNT];
+      missGroups[EMT_PRIMARY] = { .shaderIndex = RTDS_MISS };
+      missGroups[EMT_OCCLUSION] = { .shaderIndex = RTDS_SHADOW_MISS };
+      shaderGroups.missGroups = missGroups;
 
       auto getHitGroupIndex = [](E_GEOM_TYPE geomType, E_RAY_TYPE rayType)
         {
           return geomType * ERT_COUNT + rayType;
         };
-      shaderGroups.hitGroups.resize(E_RAY_TYPE::ERT_COUNT * E_GEOM_TYPE::EGT_COUNT);
-      shaderGroups.hitGroups[getHitGroupIndex(EGT_TRIANGLES, ERT_PRIMARY)] = {
+      SHitShaderGroup hitGroups[E_RAY_TYPE::ERT_COUNT * E_GEOM_TYPE::EGT_COUNT];
+      hitGroups[getHitGroupIndex(EGT_TRIANGLES, ERT_PRIMARY)] = {
         .closestHitShaderIndex = RTDS_CLOSEST_HIT,
         .anyHitShaderIndex = RTDS_ANYHIT_COLOR,
       };
-      shaderGroups.hitGroups[getHitGroupIndex(EGT_TRIANGLES, ERT_OCCLUSION)] = {
+      hitGroups[getHitGroupIndex(EGT_TRIANGLES, ERT_OCCLUSION)] = {
         .closestHitShaderIndex = RTDS_CLOSEST_HIT,
         .anyHitShaderIndex = RTDS_ANYHIT_SHADOW,
       };
-      shaderGroups.hitGroups[getHitGroupIndex(EGT_PROCEDURAL, ERT_PRIMARY)] = {
+      hitGroups[getHitGroupIndex(EGT_PROCEDURAL, ERT_PRIMARY)] = {
         .closestHitShaderIndex = RTDS_SPHERE_CLOSEST_HIT,
         .anyHitShaderIndex = RTDS_ANYHIT_COLOR,
         .intersectionShaderIndex = RTDS_INTERSECTION,
       };
-      shaderGroups.hitGroups[getHitGroupIndex(EGT_PROCEDURAL, ERT_OCCLUSION)] = {
+      hitGroups[getHitGroupIndex(EGT_PROCEDURAL, ERT_OCCLUSION)] = {
         .closestHitShaderIndex = RTDS_CLOSEST_HIT,
         .anyHitShaderIndex = RTDS_ANYHIT_SHADOW,
         .intersectionShaderIndex = RTDS_INTERSECTION,
       };
+      shaderGroups.hitGroups = hitGroups;
 
-      shaderGroups.callableGroups.resize(ELT_COUNT);
-      shaderGroups.callableGroups[ELT_DIRECTIONAL] = { .shaderIndex = RTDS_DIRECTIONAL_CALL };
-      shaderGroups.callableGroups[ELT_POINT] = { .shaderIndex = RTDS_POINT_CALL };
-      shaderGroups.callableGroups[ELT_SPOT] = { .shaderIndex = RTDS_SPOT_CALL };
+      SGeneralShaderGroup callableGroups[ELT_COUNT];
+      callableGroups[ELT_DIRECTIONAL] = { .shaderIndex = RTDS_DIRECTIONAL_CALL };
+      callableGroups[ELT_POINT] = { .shaderIndex = RTDS_POINT_CALL };
+      callableGroups[ELT_SPOT] = { .shaderIndex = RTDS_SPOT_CALL };
+      shaderGroups.callableGroups = callableGroups;
 
       params.cached.maxRecursionDepth = 1;
 
@@ -1213,13 +1216,13 @@ class RaytracingPipelineApp final : public examples::SimpleWindowedApplication,
         const auto& gpuObject = m_gpuTriangleGeometries[i];
         const uint64_t vertexBufferAddress = gpuObject.bindings.vertex.buffer->getDeviceAddress();
         geomInfos[i] = {
+          .material = gpuObject.material,
           .vertexBufferAddress = vertexBufferAddress,
           .indexBufferAddress = gpuObject.useIndex() ? gpuObject.bindings.index.buffer->getDeviceAddress() : vertexBufferAddress,
           .vertexStride = gpuObject.vertexStride,
+          .objType = gpuObject.meta.type,
           .indexType = gpuObject.indexType,
           .smoothNormals = s_smoothNormals[gpuObject.meta.type],
-          .objType = gpuObject.meta.type,
-          .material = gpuObject.material,
         };
       }
     }

From 5a5fbfe55aa4cf062c562f19507ba30de085b7a6 Mon Sep 17 00:00:00 2001
From: keptsecret <sorchon@gmail.com>
Date: Thu, 6 Feb 2025 11:24:28 +0700
Subject: [PATCH 026/529] changes to intersection logic

---
 .../app_resources/hlsl/common.hlsl            | 115 ++++++++++++++---
 .../app_resources/hlsl/intersector.hlsl       | 120 ++++++++++++++++--
 2 files changed, 208 insertions(+), 27 deletions(-)

diff --git a/31_HLSLPathTracer/app_resources/hlsl/common.hlsl b/31_HLSLPathTracer/app_resources/hlsl/common.hlsl
index 56a4cace7..84933edfb 100644
--- a/31_HLSLPathTracer/app_resources/hlsl/common.hlsl
+++ b/31_HLSLPathTracer/app_resources/hlsl/common.hlsl
@@ -46,34 +46,22 @@ struct Ray
     Payload<T> payload;
 };
 
-enum PTIntersectionType : uint16_t
+enum ProceduralIntersectionType : uint16_t
 {
-    PIT_NONE = 0,
     PIT_SPHERE,
     PIT_TRIANGLE,
     PIT_RECTANGLE
 };
 
-// TODO: check if this works for ambiguous arrays of Intersection
-// unsure if calling correct method
-struct IIntersection
-{
-    PTIntersectionType type = PIT_NONE;
-};
-
-template<PTIntersectionType shape>
-struct Intersection : IIntersection
-{
-    PTIntersectionType type = PIT_NONE;
-};
+template<ProceduralIntersectionType type>
+struct Intersection;
 
 template<>
-struct Intersection<PIT_SPHERE> : IIntersection
+struct Intersection<PIT_SPHERE>
 {
     static Intersection<PIT_SPHERE> create(NBL_CONST_REF_ARG(float32_t3) position, float32_t radius, uint32_t bsdfID, uint32_t lightID)
     {
         Intersection<PIT_SPHERE> retval;
-        retval.type = PIT_SPHERE;
         retval.position = position;
         retval.radius2 = radius * radius;
         retval.bsdfLightIDs = spirv::bitFieldInsert<uint32_t>(bsdfID, lightID, 16, 16);
@@ -118,7 +106,7 @@ struct Intersection<PIT_SPHERE> : IIntersection
         // TODO
     }
 
-    float32_t3 generate_and
+    NBL_CONSTEXPR_STATIC_INLINE uint32_t ObjSize = 5;
 
     float32_t3 position;
     float32_t radius2;
@@ -126,15 +114,104 @@ struct Intersection<PIT_SPHERE> : IIntersection
 };
 
 template<>
-struct Intersection<PIT_RECTANGLE> : IIntersection
+struct Intersection<PIT_TRIANGLE>
 {
+    static Intersection<PIT_TRIANGLE> create(NBL_CONST_REF_ARG(float32_t3) vertex0, NBL_CONST_REF_ARG(float32_t3) vertex1, NBL_CONST_REF_ARG(float32_t3) vertex2, uint32_t bsdfID, uint32_t lightID)
+    {
+        Intersection<PIT_TRIANGLE> retval;
+        retval.vertex0 = vertex0;
+        retval.vertex1 = vertex1;
+        retval.vertex2 = vertex2;
+        retval.bsdfLightIDs = spirv::bitFieldInsert<uint32_t>(bsdfID, lightID, 16, 16);
+        return retval;
+    }
+
+    float intersect(NBL_CONST_REF_ARG(float32_t3) origin, NBL_CONST_REF_ARG(float32_t3) direction)
+    {
+        const float32_t3 edges[2] = { vertex1 - vertex0, vertex2 - vertex0 };
+
+        const float32_t3 h = nbl::hlsl::cross(direction, edges[1]);
+        const float a = nbl::hlsl::dot(edges[0], h);
+
+        const float32_t3 relOrigin = origin - vertex0;
+
+        const float u = nbl::hlsl::dot(relOrigin, h) / a;
+
+        const float32_t3 q = nbl::hlsl::cross(relOrigin, edges[0]);
+        const float v = nbl::hlsl::dot(direction, q) / a;
+
+        const float t = nbl::hlsl::dot(edges[1], q) / a;
+
+        const bool intersection = t > 0.f && u >= 0.f && v >= 0.f && (u + v) <= 1.f;
+        return intersection ? t : numeric_limits<float>::infinity;
+    }
+
+    float32_t3 getNormalTimesArea()
+    {
+        const float32_t3 edges[2] = { vertex1 - vertex0, vertex2 - vertex0 };
+        return nbl::hlsl::cross(edges[0], edges[1]) * 0.5f;
+    }
+
+    NBL_CONSTEXPR_STATIC_INLINE uint32_t ObjSize = 10;
 
+    float32_t3 vertex0;
+    float32_t3 vertex1;
+    float32_t3 vertex2;
+    uint32_t bsdfLightIDs;
 };
 
 template<>
-struct Intersection<PIT_TRIANGLE> : IIntersection
+struct Intersection<PIT_RECTANGLE>
 {
+    static Intersection<PIT_TRIANGLE> create(NBL_CONST_REF_ARG(float32_t3) offset, NBL_CONST_REF_ARG(float32_t3) edge0, NBL_CONST_REF_ARG(float32_t3) edge1, uint32_t bsdfID, uint32_t lightID)
+    {
+        Intersection<PIT_TRIANGLE> retval;
+        retval.offset = offset;
+        retval.edge0 = edge0;
+        retval.edge1 = edge1;
+        retval.bsdfLightIDs = spirv::bitFieldInsert<uint32_t>(bsdfID, lightID, 16, 16);
+        return retval;
+    }
+
+    float intersect(NBL_CONST_REF_ARG(float32_t3) origin, NBL_CONST_REF_ARG(float32_t3) direction)
+    {
+        const float32_t3 h = nbl::hlsl::cross(direction, edge1);
+        const float a = nbl::hlsl::dot(edge0, h);
+
+        const float32_t3 relOrigin = origin - offset;
+
+        const float u = nbl::hlsl::dot(relOrigin,h)/a;
+
+        const float32_t3 q = nbl::hlsl::cross(relOrigin, edge0);
+        const float v = nbl::hlsl::dot(direction, q) / a;
+
+        const float t = nbl::hlsl::dot(edge1, q) / a;
 
+        const bool intersection = t > 0.f && u >= 0.f && v >= 0.f && u <= 1.f && v <= 1.f;
+        return intersection ? t : numeric_limits<float>::infinity;
+    }
+
+    float32_t3 getNormalTimesArea()
+    {
+        return nbl::hlsl::cross(edge0, edge1);
+    }
+
+    void getNormalBasis(NBL_REF_ARG(float32_t3x3) basis, NBL_REF_ARG(float32_t2) extents)
+    {
+        extents = float32_t2(nbl::hlsl::length(edge0), nbl::hlsl::length(edge1));
+        basis[0] = edge0 / extents[0];
+        basis[1] = edge1 / extents[1];
+        basis[2] = normalize(cross(basis[0],basis[1]));
+
+        basis = nbl::hlsl::transpose<matrix3x3_type>(basis);    // TODO: double check transpose
+    }
+
+    NBL_CONSTEXPR_STATIC_INLINE uint32_t ObjSize = 10;
+
+    float32_t3 offset;
+    float32_t3 edge0;
+    float32_t3 edge1;
+    uint32_t bsdfLightIDs;
 };
 
 }
diff --git a/31_HLSLPathTracer/app_resources/hlsl/intersector.hlsl b/31_HLSLPathTracer/app_resources/hlsl/intersector.hlsl
index b2b3d0d2d..d4b87196d 100644
--- a/31_HLSLPathTracer/app_resources/hlsl/intersector.hlsl
+++ b/31_HLSLPathTracer/app_resources/hlsl/intersector.hlsl
@@ -19,27 +19,71 @@ namespace Intersector
 
 // ray tracing pipeline method
 
-// does everything in traceray in ex 30
+// procedural data store: [obj count] [intersect type] [obj1] [obj2] [...]
+
+struct IntersectData
+{
+    enum class Mode : uint32_t
+    {
+        RAY_QUERY,
+        RAY_TRACING,
+        PROCEDURAL
+    };
+
+    NBL_CONSTEXPR_STATIC_INLINE uint32_t DataSize = 128;
+
+    uint32_t mode : 1;
+    unit32_t unused : 31;   // possible space for flags
+    uint32_t data[DataSize];
+};
+
 template<class Ray>
-struct Procedural
+struct Comprehensive
 {
     using scalar_type = typename Ray::scalar_type;
     using ray_type = Ray;
 
-    static int traceRay(NBL_REF_ARG(ray_type) ray, IIntersection objects[32], int objCount)
+    static int traceProcedural(NBL_REF_ARG(ray_type) ray, NBL_REF_ARG(IntersectData) intersect)
     {
         const bool anyHit = ray.intersectionT != numeric_limits<scalar_type>::max;
+        const uint32_t objCount = intersect.data[0];
+        const ProceduralIntersectionType type = intersect.data[1];
 
         int objectID = -1;
         for (int i = 0; i < objCount; i++)
         {
             float t;
-            if (objects[i].type == PIT_SPHERE)  // we don't know what type of intersection it is so cast, there has to be a better way to do this
+            switch (type)
             {
-                Intersection<PIT_SPHERE> sphere = (Intersection<PIT_SPHERE>)objects[i];
-                t = sphere.intersect(ray.origin, ray.direction);
+                case PIT_SPHERE:
+                {
+                    float32_t3 position = float32_t3(asfloat(intersect.data[2 + i * Intersection<PIT_SPHERE>::ObjSize]), asfloat(intersect.data[2 + i * Intersection<PIT_SPHERE>::ObjSize + 1]), asfloat(intersect.data[2 + i * Intersection<PIT_SPHERE>::ObjSize + 2]));
+                    Intersection<PIT_SPHERE> sphere = Intersection<PIT_SPHERE>::create(position, asfloat(intersect.data[2 + i * Intersection<PIT_SPHERE>::ObjSize + 3]), intersect.data[2 + i * Intersection<PIT_SPHERE>::ObjSize + 4]);
+                    t = sphere.intersect(ray.origin, ray.direction);
+                }
+                break;
+                case PIT_TRIANGLE:
+                {
+                    float32_t3 vertex0 = float32_t3(asfloat(intersect.data[2 + i * Intersection<PIT_TRIANGLE>::ObjSize]), asfloat(intersect.data[2 + i * Intersection<PIT_SPHERE>::ObjSize + 1]), asfloat(intersect.data[2 + i * Intersection<PIT_TRIANGLE>::ObjSize + 2]));
+                    float32_t3 vertex1 = float32_t3(asfloat(intersect.data[2 + i * Intersection<PIT_TRIANGLE>::ObjSize + 3]), asfloat(intersect.data[2 + i * Intersection<PIT_SPHERE>::ObjSize + 4]), asfloat(intersect.data[2 + i * Intersection<PIT_TRIANGLE>::ObjSize + 5]));
+                    float32_t3 vertex2 = float32_t3(asfloat(intersect.data[2 + i * Intersection<PIT_TRIANGLE>::ObjSize + 6]), asfloat(intersect.data[2 + i * Intersection<PIT_SPHERE>::ObjSize + 7]), asfloat(intersect.data[2 + i * Intersection<PIT_TRIANGLE>::ObjSize + 8]));
+                    Intersection<PIT_TRIANGLE> tri = Intersection<PIT_TRIANGLE>::create(vertex0, vertex1, vertex2, intersect.data[2 + i * Intersection<PIT_TRIANGLE>::ObjSize + 9]);
+                    t = tri.intersect(ray.origin, ray.direction);
+                }
+                break;
+                case PIT_RECTANGLE:
+                {
+                    float32_t3 offset = float32_t3(asfloat(intersect.data[2 + i * Intersection<PIT_RECTANGLE>::ObjSize]), asfloat(intersect.data[2 + i * Intersection<PIT_RECTANGLE>::ObjSize + 1]), asfloat(intersect.data[2 + i * Intersection<PIT_RECTANGLE>::ObjSize + 2]));
+                    float32_t3 edge0 = float32_t3(asfloat(intersect.data[2 + i * Intersection<PIT_RECTANGLE>::ObjSize + 3]), asfloat(intersect.data[2 + i * Intersection<PIT_RECTANGLE>::ObjSize + 4]), asfloat(intersect.data[2 + i * Intersection<PIT_RECTANGLE>::ObjSize + 5]));
+                    float32_t3 edge1 = float32_t3(asfloat(intersect.data[2 + i * Intersection<PIT_RECTANGLE>::ObjSize + 6]), asfloat(intersect.data[2 + i * Intersection<PIT_RECTANGLE>::ObjSize + 7]), asfloat(intersect.data[2 + i * Intersection<PIT_RECTANGLE>::ObjSize + 8]));
+                    Intersection<PIT_RECTANGLE> rect = Intersection<PIT_RECTANGLE>::create(offset, edge0, edge1, intersect.data[2 + i * Intersection<PIT_RECTANGLE>::ObjSize + 9]);
+                    t = rect.intersect(ray.origin, ray.direction);
+                }
+                break;
+                default:
+                    t = numeric_limits<float>::infinity;
+                    break;
             }
-            // TODO: other types
             
             bool closerIntersection = t > 0.0 && t < ray.intersectionT;
 
@@ -53,9 +97,69 @@ struct Procedural
         return objectID;
     }
 
-    // TODO? traceray with vertex/index buffer
+    static int traceRay(NBL_REF_ARG(ray_type) ray, NBL_REF_ARG(IntersectData) intersect)
+    {
+        const IntersectData::Mode mode = intersect.mode;
+        switch (mode)
+        {
+            case IntersectData::Mode::RAY_QUERY:
+            {
+                // TODO: do ray query stuff
+            }
+            break;
+            case IntersectData::Mode::RAY_TRACING:
+            {
+                // TODO: do ray tracing stuff
+            }
+            break;
+            case IntersectData::Mode::PROCEDURAL:
+            {
+                return traceProcedural(ray, intersect);
+            }
+            break;
+            default:
+                return -1;
+        }
+    }
 };
 
+// does everything in traceray in ex 30
+// template<class Ray>
+// struct Procedural
+// {
+//     using scalar_type = typename Ray::scalar_type;
+//     using ray_type = Ray;
+
+//     static int traceRay(NBL_REF_ARG(ray_type) ray, IIntersection objects[32], int objCount)
+//     {
+//         const bool anyHit = ray.intersectionT != numeric_limits<scalar_type>::max;
+
+//         int objectID = -1;
+//         for (int i = 0; i < objCount; i++)
+//         {
+//             float t;
+//             if (objects[i].type == PIT_SPHERE)  // we don't know what type of intersection it is so cast, there has to be a better way to do this
+//             {
+//                 Intersection<PIT_SPHERE> sphere = (Intersection<PIT_SPHERE>)objects[i];
+//                 t = sphere.intersect(ray.origin, ray.direction);
+//             }
+//             // TODO: other types
+            
+//             bool closerIntersection = t > 0.0 && t < ray.intersectionT;
+
+//             ray.intersectionT = closerIntersection ? t : ray.intersectionT;
+//             objectID = closerIntersection ? i : objectID;
+            
+//             // allowing early out results in a performance regression, WTF!?
+//             //if (anyHit && closerIntersection)
+//             //break;
+//         }
+//         return objectID;
+//     }
+
+//     // TODO? traceray with vertex/index buffer
+// };
+
 }
 }
 }

From c810949a4a66eb6f0c614537404333c6394463de Mon Sep 17 00:00:00 2001
From: kevyuu <kevin.kayu@gmail.com>
Date: Thu, 6 Feb 2025 13:05:41 +0700
Subject: [PATCH 027/529] Adjust ray tracing pipeline demo to remove
 SStridedBufferRegion

---
 71_RayTracingPipeline/main.cpp | 73 ++++++++++++++++++----------------
 1 file changed, 39 insertions(+), 34 deletions(-)

diff --git a/71_RayTracingPipeline/main.cpp b/71_RayTracingPipeline/main.cpp
index 218c6157d..e95032181 100644
--- a/71_RayTracingPipeline/main.cpp
+++ b/71_RayTracingPipeline/main.cpp
@@ -26,10 +26,14 @@ class RaytracingPipelineApp final : public examples::SimpleWindowedApplication,
 
   struct ShaderBindingTable
   {
-    SStridedBufferRegion<IGPUBuffer> raygenGroupRegion;
-    SStridedBufferRegion<IGPUBuffer> hitGroupsRegion;
-    SStridedBufferRegion<IGPUBuffer> missGroupsRegion;
-    SStridedBufferRegion<IGPUBuffer> callableGroupsRegion;
+    SBufferRange<IGPUBuffer> raygenGroupRange;
+    uint32_t raygenGroupStride;
+    SBufferRange<IGPUBuffer> hitGroupsRange;
+    uint32_t hitGroupsStride;
+    SBufferRange<IGPUBuffer> missGroupsRange;
+    uint32_t missGroupsStride;
+    SBufferRange<IGPUBuffer> callableGroupsRange;
+    uint32_t callableGroupsStride;
   };
 
 
@@ -718,10 +722,11 @@ class RaytracingPipelineApp final : public examples::SimpleWindowedApplication,
       cmdbuf->bindRayTracingPipeline(m_rayTracingPipeline.get());
       cmdbuf->pushConstants(m_rayTracingPipeline->getLayout(), IShader::E_SHADER_STAGE::ESS_ALL_RAY_TRACING, 0, sizeof(SPushConstants), &pc);
       cmdbuf->bindDescriptorSets(EPBP_RAY_TRACING, m_rayTracingPipeline->getLayout(), 0, 1, &m_rayTracingDs.get());
-      cmdbuf->traceRays(m_shaderBindingTable.raygenGroupRegion,
-        m_shaderBindingTable.missGroupsRegion,
-        m_shaderBindingTable.hitGroupsRegion,
-        m_shaderBindingTable.callableGroupsRegion,
+      cmdbuf->traceRays(
+        m_shaderBindingTable.raygenGroupRange, m_shaderBindingTable.raygenGroupStride,
+        m_shaderBindingTable.missGroupsRange, m_shaderBindingTable.missGroupsStride,
+        m_shaderBindingTable.hitGroupsRange, m_shaderBindingTable.hitGroupsStride,
+        m_shaderBindingTable.callableGroupsRange, m_shaderBindingTable.callableGroupsStride,
         WIN_W, WIN_H, 1);
     }
 
@@ -1288,36 +1293,36 @@ class RaytracingPipelineApp final : public examples::SimpleWindowedApplication,
     const auto handleSize = SPhysicalDeviceLimits::ShaderGroupHandleSize;
     const auto handleSizeAligned = nbl::core::alignUp(handleSize, limits.shaderGroupHandleAlignment);
 
-    auto& raygenRegion = m_shaderBindingTable.raygenGroupRegion;
-    auto& hitRegion = m_shaderBindingTable.hitGroupsRegion;
-    auto& missRegion = m_shaderBindingTable.missGroupsRegion;
-    auto& callableRegion = m_shaderBindingTable.callableGroupsRegion;
+    auto& raygenRange = m_shaderBindingTable.raygenGroupRange;
+    auto& hitRange = m_shaderBindingTable.hitGroupsRange;
+    auto& missRange = m_shaderBindingTable.missGroupsRange;
+    auto& callableRange = m_shaderBindingTable.callableGroupsRange;
 
-    raygenRegion = {
+    raygenRange = {
       .offset = 0,
-      .stride = core::alignUp(handleSizeAligned, limits.shaderGroupBaseAlignment),
       .size = core::alignUp(handleSizeAligned, limits.shaderGroupBaseAlignment)
     };
+    m_shaderBindingTable.raygenGroupStride = core::alignUp(handleSizeAligned, limits.shaderGroupBaseAlignment);
 
-    missRegion = {
-      .offset = raygenRegion.size,
-      .stride = handleSizeAligned,
+    missRange = {
+      .offset = raygenRange.size,
       .size = core::alignUp(pipeline->getMissGroupCount() * handleSizeAligned, limits.shaderGroupBaseAlignment),
     };
+    m_shaderBindingTable.missGroupsStride = handleSizeAligned;
 
-    hitRegion = {
-      .offset = missRegion.offset + missRegion.size,
-      .stride = handleSizeAligned,
+    hitRange = {
+      .offset = missRange.offset + missRange.size,
       .size = core::alignUp(pipeline->getHitGroupCount() * handleSizeAligned, limits.shaderGroupBaseAlignment),
     };
+    m_shaderBindingTable.hitGroupsStride = handleSizeAligned;
 
-    callableRegion = {
-      .offset = hitRegion.offset + hitRegion.size,
-      .stride = handleSizeAligned,
+    callableRange = {
+      .offset = hitRange.offset + hitRange.size,
       .size = core::alignUp(pipeline->getCallableGroupCount() * handleSizeAligned, limits.shaderGroupBaseAlignment),
     };
+    m_shaderBindingTable.callableGroupsStride = handleSizeAligned;
 
-    const auto bufferSize = raygenRegion.size + missRegion.size + hitRegion.size + callableRegion.size;
+    const auto bufferSize = raygenRange.size + missRange.size + hitRange.size + callableRange.size;
 
     ICPUBuffer::SCreationParams cpuBufferParams;
     cpuBufferParams.size = bufferSize;
@@ -1328,37 +1333,37 @@ class RaytracingPipelineApp final : public examples::SimpleWindowedApplication,
     memcpy(pData, pipeline->getRaygenGroupShaderHandle().data(), handleSize);
 
     // copy miss region
-    uint8_t* pMissData = pData + missRegion.offset;
+    uint8_t* pMissData = pData + missRange.offset;
     for (int32_t missIx = 0; missIx < pipeline->getMissGroupCount(); missIx++)
     {
       memcpy(pMissData, pipeline->getMissGroupShaderHandle(missIx).data(), handleSize);
-      pMissData += missRegion.stride;
+      pMissData += m_shaderBindingTable.missGroupsStride;
     }
 
     // copy hit region
-    uint8_t* pHitData = pData + hitRegion.offset;
+    uint8_t* pHitData = pData + hitRange.offset;
     for (int32_t hitIx = 0; hitIx < pipeline->getHitGroupCount(); hitIx++)
     {
       memcpy(pHitData, pipeline->getHitGroupShaderHandle(hitIx).data(), handleSize);
-      pHitData += hitRegion.stride;
+      pHitData += m_shaderBindingTable.hitGroupsStride;
     }
 
     // copy callable region
-    uint8_t* pCallableData = pData + callableRegion.offset;
+    uint8_t* pCallableData = pData + callableRange.offset;
     for (int32_t callableIx = 0; callableIx < pipeline->getCallableGroupCount(); callableIx++)
     {
       memcpy(pCallableData, pipeline->getCallableGroupShaderHandle(callableIx).data(), handleSize);
-      pCallableData += callableRegion.stride;
+      pCallableData += m_shaderBindingTable.callableGroupsStride;
     }
 
     {
       IGPUBuffer::SCreationParams params;
       params.usage = IGPUBuffer::EUF_TRANSFER_DST_BIT | IGPUBuffer::EUF_INLINE_UPDATE_VIA_CMDBUF | IGPUBuffer::EUF_SHADER_DEVICE_ADDRESS_BIT | IGPUBuffer::EUF_SHADER_BINDING_TABLE_BIT;
       params.size = bufferSize;
-      m_utils->createFilledDeviceLocalBufferOnDedMem(SIntendedSubmitInfo{ .queue = queue }, std::move(params), pData).move_into(raygenRegion.buffer);
-      missRegion.buffer = core::smart_refctd_ptr(raygenRegion.buffer);
-      hitRegion.buffer = core::smart_refctd_ptr(raygenRegion.buffer);
-      callableRegion.buffer = core::smart_refctd_ptr(raygenRegion.buffer);
+      m_utils->createFilledDeviceLocalBufferOnDedMem(SIntendedSubmitInfo{ .queue = queue }, std::move(params), pData).move_into(raygenRange.buffer);
+      missRange.buffer = core::smart_refctd_ptr(raygenRange.buffer);
+      hitRange.buffer = core::smart_refctd_ptr(raygenRange.buffer);
+      callableRange.buffer = core::smart_refctd_ptr(raygenRange.buffer);
     }
 
     return true;

From 13251cac673b1c118c43d5f8fc24e8b9cee4492b Mon Sep 17 00:00:00 2001
From: kevyuu <kevin.kayu@gmail.com>
Date: Thu, 6 Feb 2025 16:50:06 +0700
Subject: [PATCH 028/529] Pack material into more compact representation before
 send to gpu.

---
 .../app_resources/common.hlsl                 | 72 ++++++++++++++++++-
 .../app_resources/raytrace.rahit.hlsl         |  7 +-
 .../app_resources/raytrace.rchit.hlsl         |  5 +-
 .../raytrace_procedural.rchit.hlsl            |  6 +-
 71_RayTracingPipeline/include/common.hpp      |  2 +-
 71_RayTracingPipeline/main.cpp                | 10 +--
 6 files changed, 85 insertions(+), 17 deletions(-)

diff --git a/71_RayTracingPipeline/app_resources/common.hlsl b/71_RayTracingPipeline/app_resources/common.hlsl
index af35cb731..eb66aa374 100644
--- a/71_RayTracingPipeline/app_resources/common.hlsl
+++ b/71_RayTracingPipeline/app_resources/common.hlsl
@@ -5,6 +5,38 @@
 
 NBL_CONSTEXPR uint32_t WorkgroupSize = 16;
 
+inline uint32_t packUnorm10(float32_t v)
+{
+    return trunc(v * 1023.0f + 0.5f);
+}
+
+inline float32_t unpackUnorm10(uint32_t packed)
+{
+    return float32_t(packed & 0x3ff) * (1.0f / 1023.0f);
+}
+
+inline uint32_t packUnorm18(float32_t v)
+{
+    const float maxValue = 262143;
+    return trunc(v * maxValue + 0.5f);
+}
+
+inline float32_t unpackUnorm18(uint32_t packed)
+{
+    const float maxValue = 262143;
+    return float32_t(packed & 0x3ffff) * (1.0f / maxValue);
+}
+
+inline uint32_t packUnorm3x10(float32_t3 v)
+{
+    return (packUnorm10(v.z) << 20 | (packUnorm10(v.y) << 10 | packUnorm10(v.x)));
+}
+
+inline float32_t3 unpackUnorm3x10(uint32_t packed)
+{
+    return float32_t3(unpackUnorm10(packed), unpackUnorm10(packed >> 10), unpackUnorm10(packed >> 20));
+}
+
 struct Material
 {
 	float32_t3 ambient;
@@ -15,16 +47,50 @@ struct Material
     uint32_t illum; // illumination model (see http://www.fileformat.info/format/material/)
 };
 
-struct SProceduralGeomInfo
+struct MaterialPacked
+{
+	uint32_t ambient;
+    uint32_t diffuse;
+    uint32_t specular;
+    uint32_t shininess: 18;
+    uint32_t dissolve : 10; // 1 == opaque; 0 == fully transparent
+    uint32_t illum : 4; // illumination model (see http://www.fileformat.info/format/material/)
+};
+
+inline MaterialPacked packMaterial(Material material)
+{
+    MaterialPacked packed;
+    packed.ambient = packUnorm3x10(material.ambient);      
+    packed.diffuse = packUnorm3x10(material.diffuse);
+    packed.specular = packUnorm3x10(material.specular);      
+    packed.shininess = packUnorm18(material.shininess);
+    packed.dissolve = packUnorm10(material.dissolve);
+    packed.illum = material.illum;
+    return packed;
+}
+
+inline Material unpackMaterial(MaterialPacked packed)
 {
     Material material;
+    material.ambient = unpackUnorm3x10(packed.ambient);
+    material.diffuse = unpackUnorm3x10(packed.diffuse);
+    material.specular = unpackUnorm3x10(packed.specular);
+    material.shininess = unpackUnorm18(packed.shininess);
+    material.dissolve = unpackUnorm10(packed.dissolve);
+    material.illum = packed.illum;
+    return material;
+}
+
+struct SProceduralGeomInfo
+{
+    MaterialPacked material;
     float32_t3 center;
     float32_t radius;
 };
 
 struct STriangleGeomInfo
 {
-    Material material;
+    MaterialPacked material;
     uint64_t vertexBufferAddress;
     uint64_t indexBufferAddress;
 
@@ -89,7 +155,6 @@ struct SPushConstants
     uint32_t frameCounter;
     float32_t4x4 invMVP;
 
-
     Light light;
 };
 
@@ -102,6 +167,7 @@ struct RayLight
     float32_t outIntensity;
 };
 
+
 #ifdef __HLSL_VERSION
 
 struct [raypayload] ColorPayload
diff --git a/71_RayTracingPipeline/app_resources/raytrace.rahit.hlsl b/71_RayTracingPipeline/app_resources/raytrace.rahit.hlsl
index 5db6d70fa..7eb4efbf4 100644
--- a/71_RayTracingPipeline/app_resources/raytrace.rahit.hlsl
+++ b/71_RayTracingPipeline/app_resources/raytrace.rahit.hlsl
@@ -16,13 +16,14 @@ void main(inout AnyHitPayload p, in BuiltInTriangleIntersectionAttributes attrib
 {
     const int instID = InstanceID();
     const STriangleGeomInfo geom = vk::RawBufferLoad < STriangleGeomInfo > (pc.triangleGeomInfoBuffer + instID * sizeof(STriangleGeomInfo));
+    const Material material = unpackMaterial(geom.material);
     
-    if (geom.material.illum != 4)
+    if (material.illum != 4)
         return;
 
     uint32_t seed = p.seed;
-    if (geom.material.dissolve == 0.0)
+    if (material.dissolve == 0.0)
         IgnoreHit();
-    else if (rnd(seed) > geom.material.dissolve)
+    else if (rnd(seed) > material.dissolve)
         IgnoreHit();
 }
diff --git a/71_RayTracingPipeline/app_resources/raytrace.rchit.hlsl b/71_RayTracingPipeline/app_resources/raytrace.rchit.hlsl
index 462287689..bee5429a8 100644
--- a/71_RayTracingPipeline/app_resources/raytrace.rchit.hlsl
+++ b/71_RayTracingPipeline/app_resources/raytrace.rchit.hlsl
@@ -120,12 +120,13 @@ void main(inout ColorPayload p, in BuiltInTriangleIntersectionAttributes attribs
     const VertexData vertexData = fetchVertexData(instID, primID, geom, attribs.barycentrics);
     const float32_t3 worldPosition = mul(ObjectToWorld3x4(), float32_t4(vertexData.position, 1));
     const float32_t3 worldNormal = normalize(mul(vertexData.normal, WorldToObject3x4()).xyz);
+    const Material material = unpackMaterial(geom.material);
 
     RayLight cLight;
     cLight.inHitPosition = worldPosition;
     CallShader(pc.light.type, cLight);
 
-    float32_t3 diffuse = computeDiffuse(geom.material, cLight.outLightDir, worldNormal);
+    float32_t3 diffuse = computeDiffuse(material, cLight.outLightDir, worldNormal);
     float32_t3 specular = float32_t3(0, 0, 0);
     float32_t attenuation = 1;
 
@@ -150,7 +151,7 @@ void main(inout ColorPayload p, in BuiltInTriangleIntersectionAttributes attribs
         }
         else
         {
-            specular = computeSpecular(geom.material, WorldRayDirection(), cLight.outLightDir, worldNormal);
+            specular = computeSpecular(material, WorldRayDirection(), cLight.outLightDir, worldNormal);
         }
     }
     p.hitValue = (cLight.outIntensity * attenuation * (diffuse + specular));
diff --git a/71_RayTracingPipeline/app_resources/raytrace_procedural.rchit.hlsl b/71_RayTracingPipeline/app_resources/raytrace_procedural.rchit.hlsl
index dd5598105..c056f3925 100644
--- a/71_RayTracingPipeline/app_resources/raytrace_procedural.rchit.hlsl
+++ b/71_RayTracingPipeline/app_resources/raytrace_procedural.rchit.hlsl
@@ -21,10 +21,10 @@ void main(inout ColorPayload p, in BuiltInTriangleIntersectionAttributes attribs
     CallShader(pc.light.type, cLight);
 
     // Material of the object
-    Material mat = sphere.material;
+    Material material = unpackMaterial(sphere.material);
 
     // Diffuse
-    float3 diffuse = computeDiffuse(sphere.material, cLight.outLightDir, worldNormal);
+    float3 diffuse = computeDiffuse(material, cLight.outLightDir, worldNormal);
     float3 specular = float3(0, 0, 0);
     float attenuation = 1;
 
@@ -53,7 +53,7 @@ void main(inout ColorPayload p, in BuiltInTriangleIntersectionAttributes attribs
         }
         else
         {
-            specular = computeSpecular(sphere.material, WorldRayDirection(), cLight.outLightDir, worldNormal);
+            specular = computeSpecular(material, WorldRayDirection(), cLight.outLightDir, worldNormal);
         }
     }
 
diff --git a/71_RayTracingPipeline/include/common.hpp b/71_RayTracingPipeline/include/common.hpp
index 3a8411fd2..3b66fd3e9 100644
--- a/71_RayTracingPipeline/include/common.hpp
+++ b/71_RayTracingPipeline/include/common.hpp
@@ -84,7 +84,7 @@ struct ReferenceObjectGpu
 	uint32_t vertexStride;
 	nbl::asset::E_INDEX_TYPE indexType = nbl::asset::E_INDEX_TYPE::EIT_UNKNOWN;
 	uint32_t indexCount = {};
-	Material material;
+	MaterialPacked material;
   core::matrix3x4SIMD transform;
 
 	const bool useIndex() const
diff --git a/71_RayTracingPipeline/main.cpp b/71_RayTracingPipeline/main.cpp
index e95032181..015f08a42 100644
--- a/71_RayTracingPipeline/main.cpp
+++ b/71_RayTracingPipeline/main.cpp
@@ -1064,7 +1064,7 @@ class RaytracingPipelineApp final : public examples::SimpleWindowedApplication,
       ReferenceObjectCpu {
         .meta = {.type = OT_CUBE, .name = "Cube Mesh 2"},
         .data = gc->createCubeMesh(nbl::core::vector3df(1.5, 1.5, 1.5)),
-        .material = {
+        .material = Material{
           .ambient = {},
           .diffuse = {0.2, 0.2, 0.8},
           .specular = {0.8, 0.8, 0.8},
@@ -1076,7 +1076,7 @@ class RaytracingPipelineApp final : public examples::SimpleWindowedApplication,
       ReferenceObjectCpu {
         .meta = {.type = OT_CUBE, .name = "Transparent Cube Mesh"},
         .data = gc->createCubeMesh(nbl::core::vector3df(1.5, 1.5, 1.5)),
-        .material = {
+        .material = Material{
           .ambient = {},
           .diffuse = {0.2, 0.8, 0.2},
           .specular = {0.8, 0.8, 0.8},
@@ -1211,7 +1211,7 @@ class RaytracingPipelineApp final : public examples::SimpleWindowedApplication,
           .vertexStride = cpuObject.data.inputParams.bindings[0].stride,
           .indexType = cpuObject.data.indexType,
           .indexCount = cpuObject.data.indexCount,
-          .material = cpuObject.material,
+          .material = packMaterial(cpuObject.material),
           .transform = cpuObject.transform,
           });
       }
@@ -1250,13 +1250,13 @@ class RaytracingPipelineApp final : public examples::SimpleWindowedApplication,
       {
         const auto middle_i = NumberOfProceduralGeometries / 2.0;
         SProceduralGeomInfo sphere = {
-          .material = {
+          .material = packMaterial({
             .ambient = {},
             .diffuse = {0.3, 0.2 * i, 0.3},
             .specular = {0.8, 0.8, 0.8},
             .shininess = 1.0f,
             .illum = 2
-          },
+          }),
           .center = float32_t3((i - middle_i) * 4.0, 2, 5.0),
           .radius = 1,
         };

From 85e67ad0c4012d7d8d2014489327036d89b0bf57 Mon Sep 17 00:00:00 2001
From: keptsecret <sorchon@gmail.com>
Date: Thu, 6 Feb 2025 16:51:56 +0700
Subject: [PATCH 029/529] completed material system?

---
 .../app_resources/hlsl/material_system.hlsl   | 119 ++++++++++++++++++
 .../app_resources/hlsl/pathtracer.hlsl        |   2 +-
 2 files changed, 120 insertions(+), 1 deletion(-)

diff --git a/31_HLSLPathTracer/app_resources/hlsl/material_system.hlsl b/31_HLSLPathTracer/app_resources/hlsl/material_system.hlsl
index 6f635ab68..1f13198fa 100644
--- a/31_HLSLPathTracer/app_resources/hlsl/material_system.hlsl
+++ b/31_HLSLPathTracer/app_resources/hlsl/material_system.hlsl
@@ -1,6 +1,9 @@
 #ifndef _NBL_HLSL_EXT_MATERIAL_SYSTEM_INCLUDED_
 #define _NBL_HLSL_EXT_MATERIAL_SYSTEM_INCLUDED_
 
+#include <nbl/builtin/hlsl/limits.hlsl>
+#include <nbl/builtin/hlsl/math/functions.hlsl>
+
 namespace nbl
 {
 namespace hlsl
@@ -10,7 +13,123 @@ namespace ext
 namespace MaterialSystem
 {
 
+struct Material
+{
+    enum class Type : uint32_t
+    {
+        DIFFUSE,
+        CONDUCTOR,
+        DIELECTRIC
+    };
+
+    NBL_CONSTEXPR_STATIC_INLINE uint32_t DataSize = 32;
+
+    uint32_t type : 1;
+    unit32_t unused : 31;   // possible space for flags
+    uint32_t data[DataSize];
+};
+
+template<class DiffuseBxDF, class ConductorBxDF, class DielectricBxDF>
+struct System
+{
+    using this_t = System<DiffuseBxDF, ConductorBxDF, DielectricBxDF>;
+    using scalar_type = typename DiffuseBxDF::scalar_type;      // types should be same across all 3 bxdfs
+    using vector2_type = vector<scalar_type, 2>;
+    using vector3_type = vector<scalar_type, 3>;
+    using measure_type = typename DiffuseBxDF::spectral_type;
+    using quotient_pdf_type = typename DiffuseBxDF::quotient_pdf_type;
+    using anisotropic_type = typename DiffuseBxDF::anisotropic_type;
+    using anisocache_type = typename ConductorBxDF::anisocache_type;
+    using params_t = SBxDFParams<scalar_type>;
+
+    static this_t create(NBL_CONST_REF_ARG(SBxDFCreationParams<scalar_type, measure_type>) diffuseParams, NBL_CONST_REF_ARG(SBxDFCreationParams<scalar_type, measure_type>) conductorParams, NBL_CONST_REF_ARG(SBxDFCreationParams<scalar_type, measure_type>) dielectricParams)
+    {
+        diffuseBxDF = DiffuseBxDF::create(diffuseParams);
+        conductorBxDF = DiffuseBxDF::create(conductorParams);
+        dielectricBxDF = DiffuseBxDF::create(dielectricParams);
+    }
+
+    static measure_type eval(NBL_CONST_REF_ARG(Material) material, NBL_CONST_REF_ARG(params_t) params)
+    {
+        switch(material.type)
+        {
+            case DIFFUSE:
+            {
+                return (measure_type)diffuseBxDF.eval(params);
+            }
+            break;
+            case CONDUCTOR:
+            {
+                return conductorBxDF.eval(params);
+            }
+            break;
+            case DIELECTRIC:
+            {
+                return dielectricBxDF.eval(params);
+            }
+            break;
+            default:
+                return (measure_type)0.0;
+        }
+    }
+
+    static vector3_type generate(NBL_CONST_REF_ARG(Material) material, anisotropic_type interaction, vector2_type u, NBL_REF_ARG(anisocache_type) cache)
+    {
+        switch(material.type)
+        {
+            case DIFFUSE:
+            {
+                return diffuseBxDF.generate(interaction, u);
+            }
+            break;
+            case CONDUCTOR:
+            {
+                return conductorBxDF.generate(interaction, u, cache);
+            }
+            break;
+            case DIELECTRIC:
+            {
+                return dielectricBxDF.generate(interaction, u, cache);
+            }
+            break;
+            default:
+                return (vector3_type)numeric_limits<float>::infinity;
+        }
+    }
+
+    static quotient_pdf_type quotient_and_pdf(NBL_CONST_REF_ARG(Material) material, NBL_CONST_REF_ARG(params_t) params)
+    {
+        const float minimumProjVectorLen = 0.00000001;
+        if (params.NdotV > minimumProjVectorLen && params.NdotL > minimumProjVectorLen)
+        {
+            switch(material.type)
+            {
+                case DIFFUSE:
+                {
+                    return diffuseBxDF.quotient_and_pdf(params);
+                }
+                break;
+                case CONDUCTOR:
+                {
+                    return conductorBxDF.quotient_and_pdf(params);
+                }
+                break;
+                case DIELECTRIC:
+                {
+                    return dielectricBxDF.quotient_and_pdf(params);
+                }
+                break;
+                default:
+                    return quotient_pdf_type::create((measure_type)0.0, numeric_limits<float>::infinity);
+            }
+        }
+        return quotient_pdf_type::create((measure_type)0.0, numeric_limits<float>::infinity);
+    }
 
+    DiffuseBxDF diffuseBxDF;
+    ConductorBxDF conductorBxDF;
+    DielectricBxDF dielectricBxDF;
+};
 
 }
 }
diff --git a/31_HLSLPathTracer/app_resources/hlsl/pathtracer.hlsl b/31_HLSLPathTracer/app_resources/hlsl/pathtracer.hlsl
index f28dc621b..9ca0f77e4 100644
--- a/31_HLSLPathTracer/app_resources/hlsl/pathtracer.hlsl
+++ b/31_HLSLPathTracer/app_resources/hlsl/pathtracer.hlsl
@@ -26,7 +26,7 @@ struct Unidirectional
     // closest hit
 
     // Li
-    MaterialSystem::measure_t getMeasure()
+    MaterialSystem::measure_type getMeasure()
     {
         // loop through bounces, do closest hit
         // return ray.payload.accumulation --> color

From 2c500b1e06e3e83b2a427bf0aa1ef27878467e0b Mon Sep 17 00:00:00 2001
From: keptsecret <sorchon@gmail.com>
Date: Fri, 7 Feb 2025 14:37:56 +0700
Subject: [PATCH 030/529] sphere nee stuff

---
 .../app_resources/hlsl/common.hlsl            | 62 +++++++++++++------
 .../app_resources/hlsl/intersector.hlsl       | 30 ++++-----
 2 files changed, 59 insertions(+), 33 deletions(-)

diff --git a/31_HLSLPathTracer/app_resources/hlsl/common.hlsl b/31_HLSLPathTracer/app_resources/hlsl/common.hlsl
index 84933edfb..2b627523f 100644
--- a/31_HLSLPathTracer/app_resources/hlsl/common.hlsl
+++ b/31_HLSLPathTracer/app_resources/hlsl/common.hlsl
@@ -46,22 +46,22 @@ struct Ray
     Payload<T> payload;
 };
 
-enum ProceduralIntersectionType : uint16_t
+enum ProceduralShapeType : uint16_t
 {
-    PIT_SPHERE,
-    PIT_TRIANGLE,
-    PIT_RECTANGLE
+    PST_SPHERE,
+    PST_TRIANGLE,
+    PST_RECTANGLE
 };
 
-template<ProceduralIntersectionType type>
-struct Intersection;
+template<ProceduralShapeType type>
+struct Shape;
 
 template<>
-struct Intersection<PIT_SPHERE>
+struct Shape<PST_SPHERE>
 {
-    static Intersection<PIT_SPHERE> create(NBL_CONST_REF_ARG(float32_t3) position, float32_t radius, uint32_t bsdfID, uint32_t lightID)
+    static Shape<PST_SPHERE> create(NBL_CONST_REF_ARG(float32_t3) position, float32_t radius, uint32_t bsdfID, uint32_t lightID)
     {
-        Intersection<PIT_SPHERE> retval;
+        Shape<PST_SPHERE> retval;
         retval.position = position;
         retval.radius2 = radius * radius;
         retval.bsdfLightIDs = spirv::bitFieldInsert<uint32_t>(bsdfID, lightID, 16, 16);
@@ -95,15 +95,41 @@ struct Intersection<PIT_SPHERE>
         return 2.0 * numbers::pi<float> * (1.0 - cosThetaMax);
     }
 
-    // should this be in material system?
     float deferredPdf(Light light, Ray ray)
     {
         return 1.0 / getSolidAngle(ray.origin);
     }
 
-    float generate_and_pdf()
+    template<class Aniso>
+    float generate_and_pdf(NBL_REF_ARG(float32_t) pdf, NBL_REF_ARG(float32_t) newRayMaxT, NBL_CONST_REF_ARG(float32_t3) origin, NBL_CONST_REF_ARG(Aniso) interaction, bool isBSDF, float32_t3 xi, uint32_t objectID)
     {
-        // TODO
+        float32_t3 Z = position - origin;
+        const float distanceSQ = nbl::hlsl::dot(Z,Z);
+        const float cosThetaMax2 = 1.0 - radius2 / distanceSQ;
+        if (cosThetaMax2 > 0.0)
+        {
+            const float rcpDistance = 1.0 / nbl::hlsl::sqrt(distanceSQ);
+            Z *= rcpDistance;
+        
+            const float cosThetaMax = nbl::hlsl::sqrt(cosThetaMax2);
+            const float cosTheta = nbl::hlsl::mix(1.0, cosThetaMax, xi.x);
+
+            vec3 L = Z * cosTheta;
+
+            const float cosTheta2 = cosTheta * cosTheta;
+            const float sinTheta = nbl::hlsl::sqrt(1.0 - cosTheta2);
+            float sinPhi, cosPhi;
+            math::sincos(2.0 * numbers::pi<float> * xi.y - numbers::pi<float>, sinPhi, cosPhi);
+            float32_t2x3 XY = math::frisvad<float>(Z);
+        
+            L += (XY[0] * cosPhi + XY[1] * sinPhi) * sinTheta;
+        
+            newRayMaxT = (cosTheta - nbl::hlsl::sqrt(cosTheta2 - cosThetaMax2)) / rcpDistance;
+            pdf = 1.0 / (2.0 * numbers::pi<float> * (1.0 - cosThetaMax));
+            return L;
+        }
+        pdf = 0.0;
+        return float32_t3(0.0,0.0,0.0);
     }
 
     NBL_CONSTEXPR_STATIC_INLINE uint32_t ObjSize = 5;
@@ -114,11 +140,11 @@ struct Intersection<PIT_SPHERE>
 };
 
 template<>
-struct Intersection<PIT_TRIANGLE>
+struct Shape<PST_TRIANGLE>
 {
-    static Intersection<PIT_TRIANGLE> create(NBL_CONST_REF_ARG(float32_t3) vertex0, NBL_CONST_REF_ARG(float32_t3) vertex1, NBL_CONST_REF_ARG(float32_t3) vertex2, uint32_t bsdfID, uint32_t lightID)
+    static Shape<PST_TRIANGLE> create(NBL_CONST_REF_ARG(float32_t3) vertex0, NBL_CONST_REF_ARG(float32_t3) vertex1, NBL_CONST_REF_ARG(float32_t3) vertex2, uint32_t bsdfID, uint32_t lightID)
     {
-        Intersection<PIT_TRIANGLE> retval;
+        Shape<PST_TRIANGLE> retval;
         retval.vertex0 = vertex0;
         retval.vertex1 = vertex1;
         retval.vertex2 = vertex2;
@@ -161,11 +187,11 @@ struct Intersection<PIT_TRIANGLE>
 };
 
 template<>
-struct Intersection<PIT_RECTANGLE>
+struct Shape<PST_RECTANGLE>
 {
-    static Intersection<PIT_TRIANGLE> create(NBL_CONST_REF_ARG(float32_t3) offset, NBL_CONST_REF_ARG(float32_t3) edge0, NBL_CONST_REF_ARG(float32_t3) edge1, uint32_t bsdfID, uint32_t lightID)
+    static Shape<PST_TRIANGLE> create(NBL_CONST_REF_ARG(float32_t3) offset, NBL_CONST_REF_ARG(float32_t3) edge0, NBL_CONST_REF_ARG(float32_t3) edge1, uint32_t bsdfID, uint32_t lightID)
     {
-        Intersection<PIT_TRIANGLE> retval;
+        Shape<PST_TRIANGLE> retval;
         retval.offset = offset;
         retval.edge0 = edge0;
         retval.edge1 = edge1;
diff --git a/31_HLSLPathTracer/app_resources/hlsl/intersector.hlsl b/31_HLSLPathTracer/app_resources/hlsl/intersector.hlsl
index d4b87196d..a694082fe 100644
--- a/31_HLSLPathTracer/app_resources/hlsl/intersector.hlsl
+++ b/31_HLSLPathTracer/app_resources/hlsl/intersector.hlsl
@@ -55,28 +55,28 @@ struct Comprehensive
             float t;
             switch (type)
             {
-                case PIT_SPHERE:
+                case PST_SPHERE:
                 {
-                    float32_t3 position = float32_t3(asfloat(intersect.data[2 + i * Intersection<PIT_SPHERE>::ObjSize]), asfloat(intersect.data[2 + i * Intersection<PIT_SPHERE>::ObjSize + 1]), asfloat(intersect.data[2 + i * Intersection<PIT_SPHERE>::ObjSize + 2]));
-                    Intersection<PIT_SPHERE> sphere = Intersection<PIT_SPHERE>::create(position, asfloat(intersect.data[2 + i * Intersection<PIT_SPHERE>::ObjSize + 3]), intersect.data[2 + i * Intersection<PIT_SPHERE>::ObjSize + 4]);
+                    float32_t3 position = float32_t3(asfloat(intersect.data[2 + i * Shape<PST_SPHERE>::ObjSize]), asfloat(intersect.data[2 + i * Shape<PST_SPHERE>::ObjSize + 1]), asfloat(intersect.data[2 + i * Shape<PST_SPHERE>::ObjSize + 2]));
+                    Shape<PST_SPHERE> sphere = Shape<PST_SPHERE>::create(position, asfloat(intersect.data[2 + i * Shape<PST_SPHERE>::ObjSize + 3]), intersect.data[2 + i * Shape<PST_SPHERE>::ObjSize + 4]);
                     t = sphere.intersect(ray.origin, ray.direction);
                 }
                 break;
-                case PIT_TRIANGLE:
+                case PST_TRIANGLE:
                 {
-                    float32_t3 vertex0 = float32_t3(asfloat(intersect.data[2 + i * Intersection<PIT_TRIANGLE>::ObjSize]), asfloat(intersect.data[2 + i * Intersection<PIT_SPHERE>::ObjSize + 1]), asfloat(intersect.data[2 + i * Intersection<PIT_TRIANGLE>::ObjSize + 2]));
-                    float32_t3 vertex1 = float32_t3(asfloat(intersect.data[2 + i * Intersection<PIT_TRIANGLE>::ObjSize + 3]), asfloat(intersect.data[2 + i * Intersection<PIT_SPHERE>::ObjSize + 4]), asfloat(intersect.data[2 + i * Intersection<PIT_TRIANGLE>::ObjSize + 5]));
-                    float32_t3 vertex2 = float32_t3(asfloat(intersect.data[2 + i * Intersection<PIT_TRIANGLE>::ObjSize + 6]), asfloat(intersect.data[2 + i * Intersection<PIT_SPHERE>::ObjSize + 7]), asfloat(intersect.data[2 + i * Intersection<PIT_TRIANGLE>::ObjSize + 8]));
-                    Intersection<PIT_TRIANGLE> tri = Intersection<PIT_TRIANGLE>::create(vertex0, vertex1, vertex2, intersect.data[2 + i * Intersection<PIT_TRIANGLE>::ObjSize + 9]);
+                    float32_t3 vertex0 = float32_t3(asfloat(intersect.data[2 + i * Shape<PST_TRIANGLE>::ObjSize]), asfloat(intersect.data[2 + i * Shape<PST_SPHERE>::ObjSize + 1]), asfloat(intersect.data[2 + i * Shape<PST_TRIANGLE>::ObjSize + 2]));
+                    float32_t3 vertex1 = float32_t3(asfloat(intersect.data[2 + i * Shape<PST_TRIANGLE>::ObjSize + 3]), asfloat(intersect.data[2 + i * Shape<PST_SPHERE>::ObjSize + 4]), asfloat(intersect.data[2 + i * Shape<PST_TRIANGLE>::ObjSize + 5]));
+                    float32_t3 vertex2 = float32_t3(asfloat(intersect.data[2 + i * Shape<PST_TRIANGLE>::ObjSize + 6]), asfloat(intersect.data[2 + i * Shape<PST_SPHERE>::ObjSize + 7]), asfloat(intersect.data[2 + i * Shape<PST_TRIANGLE>::ObjSize + 8]));
+                    Shape<PST_TRIANGLE> tri = Shape<PST_TRIANGLE>::create(vertex0, vertex1, vertex2, intersect.data[2 + i * Shape<PST_TRIANGLE>::ObjSize + 9]);
                     t = tri.intersect(ray.origin, ray.direction);
                 }
                 break;
-                case PIT_RECTANGLE:
+                case PST_RECTANGLE:
                 {
-                    float32_t3 offset = float32_t3(asfloat(intersect.data[2 + i * Intersection<PIT_RECTANGLE>::ObjSize]), asfloat(intersect.data[2 + i * Intersection<PIT_RECTANGLE>::ObjSize + 1]), asfloat(intersect.data[2 + i * Intersection<PIT_RECTANGLE>::ObjSize + 2]));
-                    float32_t3 edge0 = float32_t3(asfloat(intersect.data[2 + i * Intersection<PIT_RECTANGLE>::ObjSize + 3]), asfloat(intersect.data[2 + i * Intersection<PIT_RECTANGLE>::ObjSize + 4]), asfloat(intersect.data[2 + i * Intersection<PIT_RECTANGLE>::ObjSize + 5]));
-                    float32_t3 edge1 = float32_t3(asfloat(intersect.data[2 + i * Intersection<PIT_RECTANGLE>::ObjSize + 6]), asfloat(intersect.data[2 + i * Intersection<PIT_RECTANGLE>::ObjSize + 7]), asfloat(intersect.data[2 + i * Intersection<PIT_RECTANGLE>::ObjSize + 8]));
-                    Intersection<PIT_RECTANGLE> rect = Intersection<PIT_RECTANGLE>::create(offset, edge0, edge1, intersect.data[2 + i * Intersection<PIT_RECTANGLE>::ObjSize + 9]);
+                    float32_t3 offset = float32_t3(asfloat(intersect.data[2 + i * Shape<PST_RECTANGLE>::ObjSize]), asfloat(intersect.data[2 + i * Shape<PST_RECTANGLE>::ObjSize + 1]), asfloat(intersect.data[2 + i * Shape<PST_RECTANGLE>::ObjSize + 2]));
+                    float32_t3 edge0 = float32_t3(asfloat(intersect.data[2 + i * Shape<PST_RECTANGLE>::ObjSize + 3]), asfloat(intersect.data[2 + i * Shape<PST_RECTANGLE>::ObjSize + 4]), asfloat(intersect.data[2 + i * Shape<PST_RECTANGLE>::ObjSize + 5]));
+                    float32_t3 edge1 = float32_t3(asfloat(intersect.data[2 + i * Shape<PST_RECTANGLE>::ObjSize + 6]), asfloat(intersect.data[2 + i * Shape<PST_RECTANGLE>::ObjSize + 7]), asfloat(intersect.data[2 + i * Shape<PST_RECTANGLE>::ObjSize + 8]));
+                    Shape<PST_RECTANGLE> rect = Shape<PST_RECTANGLE>::create(offset, edge0, edge1, intersect.data[2 + i * Shape<PST_RECTANGLE>::ObjSize + 9]);
                     t = rect.intersect(ray.origin, ray.direction);
                 }
                 break;
@@ -138,9 +138,9 @@ struct Comprehensive
 //         for (int i = 0; i < objCount; i++)
 //         {
 //             float t;
-//             if (objects[i].type == PIT_SPHERE)  // we don't know what type of intersection it is so cast, there has to be a better way to do this
+//             if (objects[i].type == PST_SPHERE)  // we don't know what type of intersection it is so cast, there has to be a better way to do this
 //             {
-//                 Intersection<PIT_SPHERE> sphere = (Intersection<PIT_SPHERE>)objects[i];
+//                 Shape<PST_SPHERE> sphere = (Shape<PST_SPHERE>)objects[i];
 //                 t = sphere.intersect(ray.origin, ray.direction);
 //             }
 //             // TODO: other types

From e6a99165c1b153977192f9722381fc24f566c9ca Mon Sep 17 00:00:00 2001
From: keptsecret <sorchon@gmail.com>
Date: Mon, 10 Feb 2025 16:58:50 +0700
Subject: [PATCH 031/529] triangle sampling

---
 .../app_resources/hlsl/common.hlsl            | 109 +++++++++++++++++-
 1 file changed, 108 insertions(+), 1 deletion(-)

diff --git a/31_HLSLPathTracer/app_resources/hlsl/common.hlsl b/31_HLSLPathTracer/app_resources/hlsl/common.hlsl
index 2b627523f..dfc500beb 100644
--- a/31_HLSLPathTracer/app_resources/hlsl/common.hlsl
+++ b/31_HLSLPathTracer/app_resources/hlsl/common.hlsl
@@ -4,6 +4,11 @@
 #include <nbl/builtin/hlsl/spirv_intrinsics/core.hlsl>
 #include <nbl/builtin/hlsl/spirv_intrinsics/glsl.std.450.hlsl>
 #include <nbl/builtin/hlsl/numbers.hlsl>
+#include <nbl/builtin/hlsl/shapes/triangle.hlsl>
+#include <nbl/builtin/hlsl/shapes/rectangle.hlsl>
+#include <nbl/builtin/hlsl/sampling/spherical_triangle.hlsl>
+#include <nbl/builtin/hlsl/sampling/projected_spherical_triangle.hlsl>
+//#include <nbl/builtin/hlsl/shapes/rectangle.hlsl>
 
 namespace nbl
 {
@@ -53,6 +58,13 @@ enum ProceduralShapeType : uint16_t
     PST_RECTANGLE
 };
 
+enum PTPolygonMethod : uint16_t
+{
+    PPM_AREA,
+    PPM_SOLID_ANGLE,
+    PPM_APPROX_PROJECTED_SOLID_ANGLE
+};
+
 template<ProceduralShapeType type>
 struct Shape;
 
@@ -95,7 +107,7 @@ struct Shape<PST_SPHERE>
         return 2.0 * numbers::pi<float> * (1.0 - cosThetaMax);
     }
 
-    float deferredPdf(Light light, Ray ray)
+    float deferredPdf(NBL_CONST_REF_ARG(Light light), NBL_CONST_REF_ARG(Ray) ray)
     {
         return 1.0 / getSolidAngle(ray.origin);
     }
@@ -149,6 +161,7 @@ struct Shape<PST_TRIANGLE>
         retval.vertex1 = vertex1;
         retval.vertex2 = vertex2;
         retval.bsdfLightIDs = spirv::bitFieldInsert<uint32_t>(bsdfID, lightID, 16, 16);
+        retval.polygonMethod = PPM_SOLID_ANGLE;
         return retval;
     }
 
@@ -178,12 +191,104 @@ struct Shape<PST_TRIANGLE>
         return nbl::hlsl::cross(edges[0], edges[1]) * 0.5f;
     }
 
+    float deferredPdf(NBL_CONST_REF_ARG(Light light), NBL_CONST_REF_ARG(Ray) ray)
+    {
+        const float32_t3 L = ray.direction;
+        switch (polygonMethod)
+        {
+            case PPM_AREA:
+            {
+                const float dist = ray.intersectionT;
+                return dist * dist / nbl::hlsl::abs(nbl::hlsl::dot(getNormalTimesArea()), L);
+            }
+            break;
+            case PPM_SOLID_ANGLE:
+            {
+                shapes::SphericalTriangle<float> st = shapes::SphericalTriangle<float>::create(vertex0, vertex1, vertex2, ray.origin);
+                const float rcpProb = st.solidAngleOfTriangle();
+                // if `rcpProb` is NAN then the triangle's solid angle was close to 0.0 
+                return rcpProb > numeric_limits<float>::min ? (1.0 / rcpProb) : numeric_limits<float>::max;
+            }
+            break;
+            case PPM_APPROX_PROJECTED_SOLID_ANGLE:
+            {
+                shapes::SphericalTriangle<float> st = shapes::SphericalTriangle<float>::create(vertex0, vertex1, vertex2, ray.origin);
+                const float pdf = st.projectedSolidAngleOfTriangle(ray.normalAtOrigin, ray.wasBSDFAtOrigin, L);
+                // if `pdf` is NAN then the triangle's projected solid angle was close to 0.0, if its close to INF then the triangle was very small
+                return pdf < numeric_limits<float>::max ? pdf : 0.0;
+            }
+            break;
+            default:
+                return 0.0;
+        }
+    }
+
+    template<class Aniso>
+    float32_t3 generate_and_pdf(NBL_REF_ARG(float32_t) pdf, NBL_REF_ARG(float32_t) newRayMaxT, NBL_CONST_REF_ARG(float32_t3) origin, NBL_CONST_REF_ARG(Aniso) interaction, bool isBSDF, float32_t3 xi, uint32_t objectID)
+    {
+        switch(polygonMethod)
+        {
+            case PPM_AREA:
+            {
+                const float32_t3 edge0 = vertex1 - vertex0;
+                const float32_t3 edge1 = vertex2 - vertex0;
+                const float sqrtU = nbl::hlsl::sqrt(xi.x);
+                float32_t3 pnt = vertex0 + edge0 * (1.0 - sqrtU) + edge1 * sqrtU * xi.y;
+                float32_t3 L = pnt - origin;
+                
+                const float distanceSq = nbl::hlsl::dot(L,L);
+                const float rcpDistance = 1.0 / nbl::hlsl::sqrt(distanceSq);
+                L *= rcpDistance;
+                
+                pdf = distanceSq / nbl::hlsl::abs(nbl::hlsl::dot(nbl::hlsl::cross(edge0, edge1) * 0.5f, L));
+                newRayMaxT = 1.0 / rcpDistance;
+                return L;
+            }
+            break;
+            case PPM_SOLID_ANGLE:
+            {
+                float rcpPdf;
+
+                shapes::SphericalTriangle<float> st = shapes::SphericalTriangle<float>::create(vertex0, vertex1, vertex2, ray.origin);
+                sampling::SphericalTriangle<float> sst = sampling::SphericalTriangle<float>::create(st);
+
+                const float32_t3 L = sst.generate(rcpPdf, xi.xy);
+
+                pdf = rcpPdf > numeric_limits<float>::min ? (1.0 / rcpPdf) : 0.0;
+
+                const float32_t3 N = getNormalTimesArea();
+                newRayMaxT = nbl::hlsl::dot(N, vertex0 - origin) / nbl::hlsl::dot(N, L);
+                return L;
+            }
+            break;
+            case PPM_APPROX_PROJECTED_SOLID_ANGLE:
+            {
+                float rcpPdf;
+
+                shapes::SphericalTriangle<float> st = shapes::SphericalTriangle<float>::create(vertex0, vertex1, vertex2, ray.origin);
+                sampling::ProjectedSphericalTriangle<float> sst = sampling::ProjectedSphericalTriangle<float>::create(st);
+            
+                const float32_t3 L = sst.generate(rcpPdf, interaction.N, isBSDF, xi.xy);
+
+                pdf = rcpPdf > numeric_limits<float>::min ? (1.0 / rcpPdf) : 0.0;
+
+                const float32_t3 N = getNormalTimesArea();
+                newRayMaxT = nbl::hlsl::dot(N, vertex0 - origin) / nbl::hlsl::dot(N, L);
+                return L;
+            }
+            break;
+            default:
+                return (float32_t3)0.0;
+        }
+    }
+
     NBL_CONSTEXPR_STATIC_INLINE uint32_t ObjSize = 10;
 
     float32_t3 vertex0;
     float32_t3 vertex1;
     float32_t3 vertex2;
     uint32_t bsdfLightIDs;
+    PTPolygonMethod polygonMethod;
 };
 
 template<>
@@ -196,6 +301,7 @@ struct Shape<PST_RECTANGLE>
         retval.edge0 = edge0;
         retval.edge1 = edge1;
         retval.bsdfLightIDs = spirv::bitFieldInsert<uint32_t>(bsdfID, lightID, 16, 16);
+        retval.polygonMethod = PPM_SOLID_ANGLE;
         return retval;
     }
 
@@ -238,6 +344,7 @@ struct Shape<PST_RECTANGLE>
     float32_t3 edge0;
     float32_t3 edge1;
     uint32_t bsdfLightIDs;
+    PTPolygonMethod polygonMethod;
 };
 
 }

From c48b5b9015bd81230952d37667108e953c8a97f2 Mon Sep 17 00:00:00 2001
From: kevyuu <kevin.kayu@gmail.com>
Date: Tue, 11 Feb 2025 11:56:09 +0700
Subject: [PATCH 032/529] Change ray tracing implementation from recursion to
 loop based

---
 .../app_resources/common.hlsl                 | 16 ++++---
 .../app_resources/raytrace.rahit.hlsl         |  2 +-
 .../app_resources/raytrace.rchit.hlsl         | 39 ++-------------
 .../app_resources/raytrace.rgen.hlsl          | 48 +++++++++++++++++--
 .../app_resources/raytrace.rmiss.hlsl         |  5 +-
 .../app_resources/raytraceShadow.rmiss.hlsl   |  4 +-
 .../raytrace_procedural.rchit.hlsl            | 46 ++----------------
 7 files changed, 66 insertions(+), 94 deletions(-)

diff --git a/71_RayTracingPipeline/app_resources/common.hlsl b/71_RayTracingPipeline/app_resources/common.hlsl
index eb66aa374..b56155855 100644
--- a/71_RayTracingPipeline/app_resources/common.hlsl
+++ b/71_RayTracingPipeline/app_resources/common.hlsl
@@ -170,18 +170,20 @@ struct RayLight
 
 #ifdef __HLSL_VERSION
 
-struct [raypayload] ColorPayload
-{
-	float32_t3 hitValue : read(caller) : write(closesthit,miss);
-    uint32_t seed : read(closesthit,anyhit) : write(caller);
-};
-
 struct [raypayload] ShadowPayload
 {
 	bool isShadowed : read(caller) : write(caller,miss);
     uint32_t seed : read(anyhit) : write(caller);
 };
 
+struct [raypayload] HitPayload
+{
+    MaterialPacked material : read(caller) : write(closesthit);
+    float32_t3 worldNormal : read(caller) : write(closesthit);
+    float32_t rayDistance : read(caller) : write(closesthit, miss);
+    uint32_t seed : read(closesthit, anyhit) : write(caller);
+};
+
 enum ObjectType : uint32_t  // matches c++
 {
     OT_CUBE = 0,
@@ -197,6 +199,7 @@ enum ObjectType : uint32_t  // matches c++
 };
 
 static uint32_t s_offsetsToNormalBytes[OT_COUNT] = { 18, 24, 24, 20, 20, 24, 16, 12 };	// based on normals data position
+
 float32_t3 computeDiffuse(Material mat, float32_t3 light_dir, float32_t3 normal)
 {
 	// Lambertian
@@ -213,7 +216,6 @@ float32_t3 computeSpecular(Material mat, float32_t3 view_dir,
 	if (mat.illum < 2)
 		return float32_t3(0, 0, 0);
 
-	// Compute specular only if not in shadow
 	const float32_t kPi = 3.14159265;
 	const float32_t kShininess = max(mat.shininess, 4.0);
 
diff --git a/71_RayTracingPipeline/app_resources/raytrace.rahit.hlsl b/71_RayTracingPipeline/app_resources/raytrace.rahit.hlsl
index 7eb4efbf4..7df0c16ca 100644
--- a/71_RayTracingPipeline/app_resources/raytrace.rahit.hlsl
+++ b/71_RayTracingPipeline/app_resources/raytrace.rahit.hlsl
@@ -6,7 +6,7 @@
 [[vk::binding(0, 0)]] RaytracingAccelerationStructure topLevelAS;
 
 #if defined(USE_COLOR_PAYLOAD)
-using AnyHitPayload = ColorPayload;
+using AnyHitPayload = HitPayload;
 #elif defined(USE_SHADOW_PAYLOAD)
 using AnyHitPayload = ShadowPayload;
 #endif
diff --git a/71_RayTracingPipeline/app_resources/raytrace.rchit.hlsl b/71_RayTracingPipeline/app_resources/raytrace.rchit.hlsl
index bee5429a8..a0dd973e6 100644
--- a/71_RayTracingPipeline/app_resources/raytrace.rchit.hlsl
+++ b/71_RayTracingPipeline/app_resources/raytrace.rchit.hlsl
@@ -112,47 +112,16 @@ VertexData fetchVertexData(int instID, int primID, STriangleGeomInfo geom, float
 }
 
 [shader("closesthit")]
-void main(inout ColorPayload p, in BuiltInTriangleIntersectionAttributes attribs)
+void main(inout HitPayload payload, in BuiltInTriangleIntersectionAttributes attribs)
 {
     const int instID = InstanceID();
     const int primID = PrimitiveIndex();
     const STriangleGeomInfo geom = vk::RawBufferLoad < STriangleGeomInfo > (pc.triangleGeomInfoBuffer + instID * sizeof(STriangleGeomInfo));
     const VertexData vertexData = fetchVertexData(instID, primID, geom, attribs.barycentrics);
-    const float32_t3 worldPosition = mul(ObjectToWorld3x4(), float32_t4(vertexData.position, 1));
     const float32_t3 worldNormal = normalize(mul(vertexData.normal, WorldToObject3x4()).xyz);
-    const Material material = unpackMaterial(geom.material);
 
-    RayLight cLight;
-    cLight.inHitPosition = worldPosition;
-    CallShader(pc.light.type, cLight);
+    payload.material = geom.material;
+    payload.worldNormal = worldNormal;
+    payload.rayDistance = RayTCurrent();
 
-    float32_t3 diffuse = computeDiffuse(material, cLight.outLightDir, worldNormal);
-    float32_t3 specular = float32_t3(0, 0, 0);
-    float32_t attenuation = 1;
-
-    if (dot(worldNormal, cLight.outLightDir) > 0)
-    {
-        RayDesc rayDesc;
-        rayDesc.Origin = WorldRayOrigin() + WorldRayDirection() * RayTCurrent();
-        rayDesc.Direction = cLight.outLightDir;
-        rayDesc.TMin = 0.01;
-        rayDesc.TMax = cLight.outLightDistance;
-
-        uint flags = RAY_FLAG_SKIP_CLOSEST_HIT_SHADER;
-        ShadowPayload shadowPayload;
-        shadowPayload.isShadowed = true;
-        shadowPayload.seed = p.seed;
-        TraceRay(topLevelAS, flags, 0xFF, ERT_OCCLUSION, 0, EMT_OCCLUSION, rayDesc, shadowPayload);
-
-        bool isShadowed = shadowPayload.isShadowed;
-        if (isShadowed)
-        {
-            attenuation = 0.3;
-        }
-        else
-        {
-            specular = computeSpecular(material, WorldRayDirection(), cLight.outLightDir, worldNormal);
-        }
-    }
-    p.hitValue = (cLight.outIntensity * attenuation * (diffuse + specular));
 }
\ No newline at end of file
diff --git a/71_RayTracingPipeline/app_resources/raytrace.rgen.hlsl b/71_RayTracingPipeline/app_resources/raytrace.rgen.hlsl
index 43b052630..facba537c 100644
--- a/71_RayTracingPipeline/app_resources/raytrace.rgen.hlsl
+++ b/71_RayTracingPipeline/app_resources/raytrace.rgen.hlsl
@@ -41,19 +41,59 @@ void main()
         const float32_t4 tmp = mul(pc.invMVP, float32_t4(d.x, d.y, 1, 1));
         const float32_t3 targetPos = tmp.xyz / tmp.w;
 
-        float32_t3 direction = normalize(targetPos - pc.camPos);
+        const float32_t3 camDirection = normalize(targetPos - pc.camPos);
 
         RayDesc rayDesc;
         rayDesc.Origin = pc.camPos;
-        rayDesc.Direction = direction;
+        rayDesc.Direction = camDirection;
         rayDesc.TMin = 0.001;
         rayDesc.TMax = 10000.0;
         
-        ColorPayload payload;
+        HitPayload payload;
         payload.seed = seed;
         TraceRay(topLevelAS, RAY_FLAG_NONE, 0xff, ERT_PRIMARY, 0, EMT_PRIMARY, rayDesc, payload);
 
-        hitValues += payload.hitValue;
+        if (payload.rayDistance < 0)
+        {
+            hitValues += float32_t3(0.3, 0.3, 0.3);
+            continue;
+        }
+
+        const float32_t3 worldPosition = pc.camPos + (camDirection * payload.rayDistance);
+        const float32_t3 worldNormal = payload.worldNormal;
+        const Material material = unpackMaterial(payload.material);
+        RayLight cLight;
+        cLight.inHitPosition = worldPosition;
+        CallShader(pc.light.type, cLight);
+
+        const float32_t3 diffuse = computeDiffuse(material, cLight.outLightDir, worldNormal);
+        float32_t3 specular = float32_t3(0, 0, 0);
+        float32_t attenuation = 1;
+
+        if (dot(worldNormal, cLight.outLightDir) > 0)
+        {
+            RayDesc rayDesc;
+            rayDesc.Origin = worldPosition;
+            rayDesc.Direction = cLight.outLightDir;
+            rayDesc.TMin = 0.01;
+            rayDesc.TMax = 100000;
+
+            ShadowPayload shadowPayload;
+            shadowPayload.isShadowed = true;
+            shadowPayload.seed = seed;
+            TraceRay(topLevelAS, RAY_FLAG_SKIP_CLOSEST_HIT_SHADER, 0xFF, ERT_PRIMARY, 0, EMT_OCCLUSION, rayDesc, shadowPayload);
+
+            bool isShadowed = shadowPayload.isShadowed;
+            if (isShadowed)
+            {
+                attenuation = 0.3;
+            }
+            else
+            {
+                specular = computeSpecular(material, camDirection, cLight.outLightDir, worldNormal);
+            }
+        }
+        hitValues += (cLight.outIntensity * attenuation * (diffuse + specular));
     }
 
     float32_t3 hitValue = hitValues / s_sampleCount;
diff --git a/71_RayTracingPipeline/app_resources/raytrace.rmiss.hlsl b/71_RayTracingPipeline/app_resources/raytrace.rmiss.hlsl
index 70db3b0e4..602104a19 100644
--- a/71_RayTracingPipeline/app_resources/raytrace.rmiss.hlsl
+++ b/71_RayTracingPipeline/app_resources/raytrace.rmiss.hlsl
@@ -1,8 +1,7 @@
 #include "common.hlsl"
 
 [shader("miss")]
-void main(inout ColorPayload p)
+void main(inout HitPayload payload)
 {
-    p.hitValue = float32_t3(0.3, 0.3, 0.6);
-
+    payload.rayDistance = -1;
 }
diff --git a/71_RayTracingPipeline/app_resources/raytraceShadow.rmiss.hlsl b/71_RayTracingPipeline/app_resources/raytraceShadow.rmiss.hlsl
index 295e721f2..c1ea42173 100644
--- a/71_RayTracingPipeline/app_resources/raytraceShadow.rmiss.hlsl
+++ b/71_RayTracingPipeline/app_resources/raytraceShadow.rmiss.hlsl
@@ -1,7 +1,7 @@
 #include "common.hlsl"
 
 [shader("miss")]
-void main(inout ShadowPayload p)
+void main(inout ShadowPayload payload)
 {
-	p.isShadowed = false;
+	payload.isShadowed = false;
 }
diff --git a/71_RayTracingPipeline/app_resources/raytrace_procedural.rchit.hlsl b/71_RayTracingPipeline/app_resources/raytrace_procedural.rchit.hlsl
index c056f3925..227bfa092 100644
--- a/71_RayTracingPipeline/app_resources/raytrace_procedural.rchit.hlsl
+++ b/71_RayTracingPipeline/app_resources/raytrace_procedural.rchit.hlsl
@@ -5,7 +5,7 @@
 [[vk::binding(0, 0)]] RaytracingAccelerationStructure topLevelAS;
 
 [shader("closesthit")]
-void main(inout ColorPayload p, in BuiltInTriangleIntersectionAttributes attribs)
+void main(inout HitPayload payload, in BuiltInTriangleIntersectionAttributes attribs)
 {
     const int instID = InstanceID();
     const int primID = PrimitiveIndex();
@@ -16,46 +16,8 @@ void main(inout ColorPayload p, in BuiltInTriangleIntersectionAttributes attribs
     // Computing the normal at hit position
     float32_t3 worldNormal = normalize(worldPosition - sphere.center);
 
-    RayLight cLight;
-    cLight.inHitPosition = worldPosition;
-    CallShader(pc.light.type, cLight);
+    payload.material = sphere.material;
+    payload.worldNormal = worldNormal;
+    payload.rayDistance = RayTCurrent();
 
-    // Material of the object
-    Material material = unpackMaterial(sphere.material);
-
-    // Diffuse
-    float3 diffuse = computeDiffuse(material, cLight.outLightDir, worldNormal);
-    float3 specular = float3(0, 0, 0);
-    float attenuation = 1;
-
-    // Tracing shadow ray only if the light is visible from the surface
-    if (dot(worldNormal, cLight.outLightDir) > 0)
-    {
-        RayDesc rayDesc;
-        rayDesc.Origin = WorldRayOrigin() + WorldRayDirection() * RayTCurrent();
-        rayDesc.Direction = cLight.outLightDir;
-        rayDesc.TMin = 0.01;
-        rayDesc.TMax = cLight.outLightDistance;
-
-        uint flags =
-            RAY_FLAG_ACCEPT_FIRST_HIT_AND_END_SEARCH | RAY_FLAG_FORCE_OPAQUE |
-            RAY_FLAG_SKIP_CLOSEST_HIT_SHADER;
-
-        ShadowPayload shadowPayload;
-        shadowPayload.isShadowed = true;
-        shadowPayload.seed = p.seed;
-        TraceRay(topLevelAS, flags, 0xFF, ERT_OCCLUSION, 0, EMT_OCCLUSION, rayDesc, shadowPayload);
-
-        bool isShadowed = shadowPayload.isShadowed;
-        if (isShadowed)
-        {
-            attenuation = 0.3;
-        }
-        else
-        {
-            specular = computeSpecular(material, WorldRayDirection(), cLight.outLightDir, worldNormal);
-        }
-    }
-
-    p.hitValue = (cLight.outIntensity * attenuation * (diffuse + specular));
 }
\ No newline at end of file

From 6dc25eb438bed7c9a729da6520b930d571410d20 Mon Sep 17 00:00:00 2001
From: keptsecret <sorchon@gmail.com>
Date: Tue, 11 Feb 2025 14:04:11 +0700
Subject: [PATCH 033/529] rectangle sampling

---
 .../app_resources/hlsl/common.hlsl            | 102 +++++++++++++++++-
 1 file changed, 101 insertions(+), 1 deletion(-)

diff --git a/31_HLSLPathTracer/app_resources/hlsl/common.hlsl b/31_HLSLPathTracer/app_resources/hlsl/common.hlsl
index dfc500beb..9295b459b 100644
--- a/31_HLSLPathTracer/app_resources/hlsl/common.hlsl
+++ b/31_HLSLPathTracer/app_resources/hlsl/common.hlsl
@@ -51,6 +51,13 @@ struct Ray
     Payload<T> payload;
 };
 
+template<class Spectrum>
+struct Light
+{
+    Spectrum radiance;
+    uint32_t objectID;
+};
+
 enum ProceduralShapeType : uint16_t
 {
     PST_SPHERE,
@@ -191,7 +198,7 @@ struct Shape<PST_TRIANGLE>
         return nbl::hlsl::cross(edges[0], edges[1]) * 0.5f;
     }
 
-    float deferredPdf(NBL_CONST_REF_ARG(Light light), NBL_CONST_REF_ARG(Ray) ray)
+    float deferredPdf(NBL_CONST_REF_ARG(Light light), NBL_CONST_REF_ARG(Ray<float>) ray)
     {
         const float32_t3 L = ray.direction;
         switch (polygonMethod)
@@ -338,6 +345,99 @@ struct Shape<PST_RECTANGLE>
         basis = nbl::hlsl::transpose<matrix3x3_type>(basis);    // TODO: double check transpose
     }
 
+    float deferredPdf(NBL_CONST_REF_ARG(Light light), NBL_CONST_REF_ARG(Ray<float>) ray)
+    {
+        switch (polygonMethod)
+        {
+            case PPM_AREA:
+            {
+                const float dist = ray.intersectionT;
+                return dist * dist / nbl::hlsl::abs(nbl::hlsl::dot(getNormalTimesArea(), L));
+            }
+            break;
+            // #ifdef TRIANGLE_REFERENCE ?
+            case PPM_SOLID_ANGLE:
+            {
+                float pdf;
+                float32_t3x3 rectNormalBasis;
+                float32_t2 rectExtents;
+                getNormalBasis(rectNormalBasis, rectExtents);
+                shapes::SphericalRectangle<float> sphR0 = shapes::SphericalRectangle<float>::create(ray.origin, offset, rectNormalBasis);
+                float solidAngle = sphR0.solidAngleOfRectangle(rectExtents);
+                if (solidAngle > numeric_limits<float>::min)
+                    pdf = 1.f / solidAngle;
+                else
+                    pdf = numeric_limits<float>::infinity;
+                return pdf;
+            }
+            break;
+            case PPM_APPROX_PROJECTED_SOLID_ANGLE:
+            {
+                return numeric_limits<float>::infinity;
+            }
+            break;
+            default:
+                return numeric_limits<float>::infinity;
+        }
+    }
+
+    template<class Aniso>
+    float32_t3 generate_and_pdf(NBL_REF_ARG(float32_t) pdf, NBL_REF_ARG(float32_t) newRayMaxT, NBL_CONST_REF_ARG(float32_t3) origin, NBL_CONST_REF_ARG(Aniso) interaction, bool isBSDF, float32_t3 xi, uint32_t objectID)
+    {
+        const float32_t3 N = getNormalTimesArea();
+        const float32_t3 origin2origin = offset - origin;
+
+        switch (polygonMethod)
+        {
+            case PPM_AREA:
+            {
+                float32_t3 L = origin2origin + edge0 * xi.x + edge1 * xi.y;
+                const float distSq = nbl::hlsl::dot(L, L);
+                const float rcpDist = 1.0 / nbl::hlsl::sqrt(distSq);
+                L *= rcpDist;
+                pdf = distSq / nbl::hlsl::abs(nbl::hlsl::dot(N, L));
+                newRayMaxT = 1.0 / rcpDist;
+                return L;
+            }
+            break;
+            // #ifdef TRIANGLE_REFERENCE ?
+            case PPM_SOLID_ANGLE:
+            {
+                float pdf;
+                float32_t3x3 rectNormalBasis;
+                float32_t2 rectExtents;
+                getNormalBasis(rectNormalBasis, rectExtents);
+                shapes::SphericalRectangle<float> sphR0 = shapes::SphericalRectangle<float>::create(origin, offset, rectNormalBasis);
+                float32_t3 L = (float32_t3)0.0;
+                float solidAngle = sphR0.solidAngleOfRectangle(rectExtents);
+
+                sampling::SphericalRectangle<float> ssph = sampling::SphericalRectangle<float>::create(sphR0);
+                float32_t2 sphUv = ssph.generate(rectExtents, xi.xy, solidAngle);
+                if (solidAngle > numeric_limits<float>::min)
+                {
+                    float32_t3 sph_sample = sphUv[0] * edge0 + sphUv[1] * edge1 + offset;
+                    L = nbl::hlsl::normalize(sph_sample - origin);
+                    pdf = 1.f / solidAngle;
+                }
+                else
+                    pdf = numeric_limits<float>::infinity;
+
+                newRayMaxT = nbl::hlsl::dot(N, origin2origin) / nbl::hlsl::dot(N, L);
+                return L;
+            }
+            break;
+            case PPM_APPROX_PROJECTED_SOLID_ANGLE:
+            {
+                pdf = numeric_limits<float>::infinity;
+                return (float32_t3)0.0;
+            }
+            break;
+            default:
+                pdf = numeric_limits<float>::infinity;
+                return (float32_t3)0.0;
+        }
+    }
+
     NBL_CONSTEXPR_STATIC_INLINE uint32_t ObjSize = 10;
 
     float32_t3 offset;

From 73b3f9915da069a7efc8e1d6f5e617b85b06742b Mon Sep 17 00:00:00 2001
From: kevyuu <kevin.kayu@gmail.com>
Date: Tue, 11 Feb 2025 16:27:44 +0700
Subject: [PATCH 034/529] Assign Blas flag according the the primitives
 transparency

---
 71_RayTracingPipeline/app_resources/raytrace.rahit.hlsl | 2 --
 71_RayTracingPipeline/main.cpp                          | 4 +++-
 2 files changed, 3 insertions(+), 3 deletions(-)

diff --git a/71_RayTracingPipeline/app_resources/raytrace.rahit.hlsl b/71_RayTracingPipeline/app_resources/raytrace.rahit.hlsl
index 7df0c16ca..598e271a5 100644
--- a/71_RayTracingPipeline/app_resources/raytrace.rahit.hlsl
+++ b/71_RayTracingPipeline/app_resources/raytrace.rahit.hlsl
@@ -18,8 +18,6 @@ void main(inout AnyHitPayload p, in BuiltInTriangleIntersectionAttributes attrib
     const STriangleGeomInfo geom = vk::RawBufferLoad < STriangleGeomInfo > (pc.triangleGeomInfoBuffer + instID * sizeof(STriangleGeomInfo));
     const Material material = unpackMaterial(geom.material);
     
-    if (material.illum != 4)
-        return;
 
     uint32_t seed = p.seed;
     if (material.dissolve == 0.0)
diff --git a/71_RayTracingPipeline/main.cpp b/71_RayTracingPipeline/main.cpp
index 015f08a42..fb0dca14f 100644
--- a/71_RayTracingPipeline/main.cpp
+++ b/71_RayTracingPipeline/main.cpp
@@ -1451,7 +1451,9 @@ class RaytracingPipelineApp final : public examples::SimpleWindowedApplication,
           triangles[i].vertexStride = vertexStride;
           triangles[i].vertexFormat = EF_R32G32B32_SFLOAT;
           triangles[i].indexType = gpuObject.indexType;
-          triangles[i].geometryFlags = IGPUBottomLevelAccelerationStructure::GEOMETRY_FLAGS::NO_DUPLICATE_ANY_HIT_INVOCATION_BIT;
+          triangles[i].geometryFlags = gpuObject.material.illum == 4 ? 
+            IGPUBottomLevelAccelerationStructure::GEOMETRY_FLAGS::NO_DUPLICATE_ANY_HIT_INVOCATION_BIT :
+            IGPUBottomLevelAccelerationStructure::GEOMETRY_FLAGS::OPAQUE_BIT;
 
           blasBuildInfos[i].triangles = &triangles[i];
         }

From adb7bb612a19e5e4b9c932f5c7f77d9c0b26a3c6 Mon Sep 17 00:00:00 2001
From: kevyuu <kevin.kayu@gmail.com>
Date: Tue, 11 Feb 2025 16:30:09 +0700
Subject: [PATCH 035/529] Store material information in intersection shader

---
 71_RayTracingPipeline/app_resources/common.hlsl   |  6 ++++++
 .../app_resources/raytrace.rint.hlsl              | 15 +++++++--------
 .../app_resources/raytrace_procedural.rchit.hlsl  | 14 ++++----------
 3 files changed, 17 insertions(+), 18 deletions(-)

diff --git a/71_RayTracingPipeline/app_resources/common.hlsl b/71_RayTracingPipeline/app_resources/common.hlsl
index b56155855..a57fa82dd 100644
--- a/71_RayTracingPipeline/app_resources/common.hlsl
+++ b/71_RayTracingPipeline/app_resources/common.hlsl
@@ -167,6 +167,12 @@ struct RayLight
     float32_t outIntensity;
 };
 
+struct ProceduralHitAttribute
+{
+    MaterialPacked material;
+    float32_t3 center;
+};
+
 
 #ifdef __HLSL_VERSION
 
diff --git a/71_RayTracingPipeline/app_resources/raytrace.rint.hlsl b/71_RayTracingPipeline/app_resources/raytrace.rint.hlsl
index f302543b6..b9941fc59 100644
--- a/71_RayTracingPipeline/app_resources/raytrace.rint.hlsl
+++ b/71_RayTracingPipeline/app_resources/raytrace.rint.hlsl
@@ -8,11 +8,6 @@ struct Ray
     float32_t3 direction;
 };
 
-struct Attrib
-{
-    float3 HitAttribute;
-};
-
 // Ray-Sphere intersection
 // http://viclw17.github.io/2018/07/16/raytracing-ray-sphere-intersection/
 float32_t hitSphere(SProceduralGeomInfo s, Ray r)
@@ -45,10 +40,14 @@ void main()
     // Sphere data
     SProceduralGeomInfo sphere = vk::RawBufferLoad < SProceduralGeomInfo > (pc.proceduralGeomInfoBuffer + primID * sizeof(SProceduralGeomInfo));
 
-    float32_t tHit = hitSphere(sphere, ray);
+    const float32_t tHit = hitSphere(sphere, ray);
     
-    Attrib attrib;
+    ProceduralHitAttribute hitAttrib;
     // Report hit point
     if (tHit > 0)
-        ReportHit(tHit, 0, attrib);
+    {
+        hitAttrib.center = sphere.center;
+        hitAttrib.material = sphere.material;
+        ReportHit(tHit, 0, hitAttrib);
+    }
 }
\ No newline at end of file
diff --git a/71_RayTracingPipeline/app_resources/raytrace_procedural.rchit.hlsl b/71_RayTracingPipeline/app_resources/raytrace_procedural.rchit.hlsl
index 227bfa092..48495f0fc 100644
--- a/71_RayTracingPipeline/app_resources/raytrace_procedural.rchit.hlsl
+++ b/71_RayTracingPipeline/app_resources/raytrace_procedural.rchit.hlsl
@@ -5,18 +5,12 @@
 [[vk::binding(0, 0)]] RaytracingAccelerationStructure topLevelAS;
 
 [shader("closesthit")]
-void main(inout HitPayload payload, in BuiltInTriangleIntersectionAttributes attribs)
+void main(inout HitPayload payload, in ProceduralHitAttribute attrib)
 {
-    const int instID = InstanceID();
-    const int primID = PrimitiveIndex();
-    float32_t3 worldPosition = WorldRayOrigin() + WorldRayDirection() * RayTCurrent();
+    const float32_t3 worldPosition = WorldRayOrigin() + WorldRayDirection() * RayTCurrent();
+    const float32_t3 worldNormal = normalize(worldPosition - attrib.center);
 
-    SProceduralGeomInfo sphere = vk::RawBufferLoad < SProceduralGeomInfo > (pc.proceduralGeomInfoBuffer + primID * sizeof(SProceduralGeomInfo));
-
-    // Computing the normal at hit position
-    float32_t3 worldNormal = normalize(worldPosition - sphere.center);
-
-    payload.material = sphere.material;
+    payload.material = attrib.material;
     payload.worldNormal = worldNormal;
     payload.rayDistance = RayTCurrent();
 

From bc16d1bb717c3812ce3bb8e0e5253a45ba97ee43 Mon Sep 17 00:00:00 2001
From: kevyuu <kevin.kayu@gmail.com>
Date: Tue, 11 Feb 2025 16:31:00 +0700
Subject: [PATCH 036/529] Refactor shadow ray tracing implementation to use
 opacity instead of stochastic method.

---
 .../app_resources/common.hlsl                  |  3 +--
 .../app_resources/raytrace.rahit.hlsl          | 15 ++++++---------
 .../app_resources/raytrace.rgen.hlsl           | 17 ++++++-----------
 .../app_resources/raytraceShadow.rahit.hlsl    | 18 ++++++++++++++++++
 .../app_resources/raytraceShadow.rmiss.hlsl    |  2 +-
 71_RayTracingPipeline/main.cpp                 |  4 ++--
 6 files changed, 34 insertions(+), 25 deletions(-)
 create mode 100644 71_RayTracingPipeline/app_resources/raytraceShadow.rahit.hlsl

diff --git a/71_RayTracingPipeline/app_resources/common.hlsl b/71_RayTracingPipeline/app_resources/common.hlsl
index a57fa82dd..9eb7744f5 100644
--- a/71_RayTracingPipeline/app_resources/common.hlsl
+++ b/71_RayTracingPipeline/app_resources/common.hlsl
@@ -178,8 +178,7 @@ struct ProceduralHitAttribute
 
 struct [raypayload] ShadowPayload
 {
-	bool isShadowed : read(caller) : write(caller,miss);
-    uint32_t seed : read(anyhit) : write(caller);
+    float32_t attenuation : read(caller) : write(caller, miss, anyhit);
 };
 
 struct [raypayload] HitPayload
diff --git a/71_RayTracingPipeline/app_resources/raytrace.rahit.hlsl b/71_RayTracingPipeline/app_resources/raytrace.rahit.hlsl
index 598e271a5..d0f24c209 100644
--- a/71_RayTracingPipeline/app_resources/raytrace.rahit.hlsl
+++ b/71_RayTracingPipeline/app_resources/raytrace.rahit.hlsl
@@ -5,23 +5,20 @@
 
 [[vk::binding(0, 0)]] RaytracingAccelerationStructure topLevelAS;
 
-#if defined(USE_COLOR_PAYLOAD)
-using AnyHitPayload = HitPayload;
-#elif defined(USE_SHADOW_PAYLOAD)
-using AnyHitPayload = ShadowPayload;
-#endif
-
 [shader("anyhit")]
-void main(inout AnyHitPayload p, in BuiltInTriangleIntersectionAttributes attribs)
+void main(inout HitPayload payload, in BuiltInTriangleIntersectionAttributes attribs)
 {
     const int instID = InstanceID();
     const STriangleGeomInfo geom = vk::RawBufferLoad < STriangleGeomInfo > (pc.triangleGeomInfoBuffer + instID * sizeof(STriangleGeomInfo));
     const Material material = unpackMaterial(geom.material);
     
-
-    uint32_t seed = p.seed;
+    uint32_t seed = payload.seed;
     if (material.dissolve == 0.0)
+    {
         IgnoreHit();
+    }
     else if (rnd(seed) > material.dissolve)
+    {
         IgnoreHit();
+    }
 }
diff --git a/71_RayTracingPipeline/app_resources/raytrace.rgen.hlsl b/71_RayTracingPipeline/app_resources/raytrace.rgen.hlsl
index facba537c..fb4cb45b9 100644
--- a/71_RayTracingPipeline/app_resources/raytrace.rgen.hlsl
+++ b/71_RayTracingPipeline/app_resources/raytrace.rgen.hlsl
@@ -68,7 +68,7 @@ void main()
 
         const float32_t3 diffuse = computeDiffuse(material, cLight.outLightDir, worldNormal);
         float32_t3 specular = float32_t3(0, 0, 0);
-        float32_t attenuation = 1;
+        float32_t attenuation = 0;
 
         if (dot(worldNormal, cLight.outLightDir) > 0)
         {
@@ -76,21 +76,16 @@ void main()
             rayDesc.Origin = worldPosition;
             rayDesc.Direction = cLight.outLightDir;
             rayDesc.TMin = 0.01;
-            rayDesc.TMax = 100000;
+            rayDesc.TMax = cLight.outLightDistance;
 
             ShadowPayload shadowPayload;
-            shadowPayload.isShadowed = true;
-            shadowPayload.seed = seed;
-            TraceRay(topLevelAS, RAY_FLAG_SKIP_CLOSEST_HIT_SHADER, 0xFF, ERT_PRIMARY, 0, EMT_OCCLUSION, rayDesc, shadowPayload);
+            shadowPayload.attenuation = -1; // negative attenuation indicate occlusion happening. will be multiplied by -1 in miss shader.
+            TraceRay(topLevelAS, RAY_FLAG_SKIP_CLOSEST_HIT_SHADER, 0xFF, ERT_OCCLUSION, 0, EMT_OCCLUSION, rayDesc, shadowPayload);
 
-            bool isShadowed = shadowPayload.isShadowed;
-            if (isShadowed)
-            {
-                attenuation = 0.3;
-            }
-            else
+            if (shadowPayload.attenuation > 0)
             {
                 specular = computeSpecular(material, camDirection, cLight.outLightDir, worldNormal);
+                attenuation = shadowPayload.attenuation;
             }
         }
         hitValues += (cLight.outIntensity * attenuation * (diffuse + specular));
diff --git a/71_RayTracingPipeline/app_resources/raytraceShadow.rahit.hlsl b/71_RayTracingPipeline/app_resources/raytraceShadow.rahit.hlsl
new file mode 100644
index 000000000..5ac656a7b
--- /dev/null
+++ b/71_RayTracingPipeline/app_resources/raytraceShadow.rahit.hlsl
@@ -0,0 +1,18 @@
+#include "common.hlsl"
+#include "random.hlsl"
+
+[[vk::push_constant]] SPushConstants pc;
+
+[[vk::binding(0, 0)]] RaytracingAccelerationStructure topLevelAS;
+
+[shader("anyhit")]
+void main(inout ShadowPayload payload, in BuiltInTriangleIntersectionAttributes attribs)
+{
+    const int instID = InstanceID();
+    const STriangleGeomInfo geom = vk::RawBufferLoad < STriangleGeomInfo > (pc.triangleGeomInfoBuffer + instID * sizeof(STriangleGeomInfo));
+    const Material material = unpackMaterial(geom.material);
+    
+    payload.attenuation = (1 - material.dissolve) * payload.attenuation;
+    IgnoreHit();
+
+}
diff --git a/71_RayTracingPipeline/app_resources/raytraceShadow.rmiss.hlsl b/71_RayTracingPipeline/app_resources/raytraceShadow.rmiss.hlsl
index c1ea42173..287c38f55 100644
--- a/71_RayTracingPipeline/app_resources/raytraceShadow.rmiss.hlsl
+++ b/71_RayTracingPipeline/app_resources/raytraceShadow.rmiss.hlsl
@@ -3,5 +3,5 @@
 [shader("miss")]
 void main(inout ShadowPayload payload)
 {
-	payload.isShadowed = false;
+	payload.attenuation = payload.attenuation * -1;
 }
diff --git a/71_RayTracingPipeline/main.cpp b/71_RayTracingPipeline/main.cpp
index fb0dca14f..1594ee6c4 100644
--- a/71_RayTracingPipeline/main.cpp
+++ b/71_RayTracingPipeline/main.cpp
@@ -166,8 +166,8 @@ class RaytracingPipelineApp final : public examples::SimpleWindowedApplication,
     const auto closestHitShader = loadCompileAndCreateShader("app_resources/raytrace.rchit.hlsl");
     const auto proceduralClosestHitShader = loadCompileAndCreateShader("app_resources/raytrace_procedural.rchit.hlsl");
     const auto intersectionHitShader = loadCompileAndCreateShader("app_resources/raytrace.rint.hlsl");
-    const auto anyHitShaderColorPayload = loadCompileAndCreateShader("app_resources/raytrace.rahit.hlsl", "#define USE_COLOR_PAYLOAD\n");
-    const auto anyHitShaderShadowPayload = loadCompileAndCreateShader("app_resources/raytrace.rahit.hlsl", "#define USE_SHADOW_PAYLOAD\n");
+    const auto anyHitShaderColorPayload = loadCompileAndCreateShader("app_resources/raytrace.rahit.hlsl");
+    const auto anyHitShaderShadowPayload = loadCompileAndCreateShader("app_resources/raytraceShadow.rahit.hlsl");
     const auto missShader = loadCompileAndCreateShader("app_resources/raytrace.rmiss.hlsl");
     const auto shadowMissShader = loadCompileAndCreateShader("app_resources/raytraceShadow.rmiss.hlsl");
     const auto directionalLightCallShader = loadCompileAndCreateShader("app_resources/light_directional.rcall.hlsl");

From dc7db460b7d9fea4bfdb29f6d891e05c370eb9ec Mon Sep 17 00:00:00 2001
From: kevyuu <kevin.kayu@gmail.com>
Date: Tue, 11 Feb 2025 20:06:50 +0700
Subject: [PATCH 037/529] Remove unnecessary read of attenuation by miss shader

---
 71_RayTracingPipeline/app_resources/common.hlsl | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/71_RayTracingPipeline/app_resources/common.hlsl b/71_RayTracingPipeline/app_resources/common.hlsl
index 9eb7744f5..bef1cb674 100644
--- a/71_RayTracingPipeline/app_resources/common.hlsl
+++ b/71_RayTracingPipeline/app_resources/common.hlsl
@@ -178,7 +178,7 @@ struct ProceduralHitAttribute
 
 struct [raypayload] ShadowPayload
 {
-    float32_t attenuation : read(caller) : write(caller, miss, anyhit);
+    float32_t attenuation : read(caller) : write(caller, anyhit);
 };
 
 struct [raypayload] HitPayload

From 86f4f4bc84cc8f9f19974664d0731161ffd80d9c Mon Sep 17 00:00:00 2001
From: kevyuu <kevin.kayu@gmail.com>
Date: Tue, 11 Feb 2025 20:08:21 +0700
Subject: [PATCH 038/529] Remove unnecsarry topLevelAs binding declaration

---
 71_RayTracingPipeline/app_resources/raytrace.rahit.hlsl       | 2 --
 71_RayTracingPipeline/app_resources/raytrace.rchit.hlsl       | 2 --
 71_RayTracingPipeline/app_resources/raytraceShadow.rahit.hlsl | 2 --
 3 files changed, 6 deletions(-)

diff --git a/71_RayTracingPipeline/app_resources/raytrace.rahit.hlsl b/71_RayTracingPipeline/app_resources/raytrace.rahit.hlsl
index d0f24c209..e85d5b572 100644
--- a/71_RayTracingPipeline/app_resources/raytrace.rahit.hlsl
+++ b/71_RayTracingPipeline/app_resources/raytrace.rahit.hlsl
@@ -3,8 +3,6 @@
 
 [[vk::push_constant]] SPushConstants pc;
 
-[[vk::binding(0, 0)]] RaytracingAccelerationStructure topLevelAS;
-
 [shader("anyhit")]
 void main(inout HitPayload payload, in BuiltInTriangleIntersectionAttributes attribs)
 {
diff --git a/71_RayTracingPipeline/app_resources/raytrace.rchit.hlsl b/71_RayTracingPipeline/app_resources/raytrace.rchit.hlsl
index a0dd973e6..c5cf70185 100644
--- a/71_RayTracingPipeline/app_resources/raytrace.rchit.hlsl
+++ b/71_RayTracingPipeline/app_resources/raytrace.rchit.hlsl
@@ -2,8 +2,6 @@
 
 [[vk::push_constant]] SPushConstants pc;
 
-[[vk::binding(0, 0)]] RaytracingAccelerationStructure topLevelAS;
-
 float3 unpackNormals3x10(uint32_t v)
 {
     // host side changes float32_t3 to EF_A2B10G10R10_SNORM_PACK32
diff --git a/71_RayTracingPipeline/app_resources/raytraceShadow.rahit.hlsl b/71_RayTracingPipeline/app_resources/raytraceShadow.rahit.hlsl
index 5ac656a7b..3f063daba 100644
--- a/71_RayTracingPipeline/app_resources/raytraceShadow.rahit.hlsl
+++ b/71_RayTracingPipeline/app_resources/raytraceShadow.rahit.hlsl
@@ -3,8 +3,6 @@
 
 [[vk::push_constant]] SPushConstants pc;
 
-[[vk::binding(0, 0)]] RaytracingAccelerationStructure topLevelAS;
-
 [shader("anyhit")]
 void main(inout ShadowPayload payload, in BuiltInTriangleIntersectionAttributes attribs)
 {

From 96f25cad0585b88bcd506bd93db47601a9aeee01 Mon Sep 17 00:00:00 2001
From: kevyuu <kevin.kayu@gmail.com>
Date: Tue, 11 Feb 2025 20:09:11 +0700
Subject: [PATCH 039/529] Rework gpu random value generation to use pcg hash
 and xoroshiro

---
 .../app_resources/common.hlsl                 |  2 +-
 .../app_resources/random.hlsl                 | 34 ------------------
 .../app_resources/raytrace.rahit.hlsl         |  8 +----
 .../app_resources/raytrace.rgen.hlsl          | 36 ++++++++++++-------
 .../app_resources/raytraceShadow.rahit.hlsl   | 13 +++++--
 .../app_resources/raytraceShadow.rmiss.hlsl   |  1 -
 6 files changed, 36 insertions(+), 58 deletions(-)
 delete mode 100644 71_RayTracingPipeline/app_resources/random.hlsl

diff --git a/71_RayTracingPipeline/app_resources/common.hlsl b/71_RayTracingPipeline/app_resources/common.hlsl
index bef1cb674..a089b152a 100644
--- a/71_RayTracingPipeline/app_resources/common.hlsl
+++ b/71_RayTracingPipeline/app_resources/common.hlsl
@@ -186,7 +186,7 @@ struct [raypayload] HitPayload
     MaterialPacked material : read(caller) : write(closesthit);
     float32_t3 worldNormal : read(caller) : write(closesthit);
     float32_t rayDistance : read(caller) : write(closesthit, miss);
-    uint32_t seed : read(closesthit, anyhit) : write(caller);
+    float32_t dissolveThreshold : read(closesthit, anyhit) : write(caller);
 };
 
 enum ObjectType : uint32_t  // matches c++
diff --git a/71_RayTracingPipeline/app_resources/random.hlsl b/71_RayTracingPipeline/app_resources/random.hlsl
deleted file mode 100644
index e01d7ff6c..000000000
--- a/71_RayTracingPipeline/app_resources/random.hlsl
+++ /dev/null
@@ -1,34 +0,0 @@
-// Generate a random unsigned int from two unsigned int values, using 16 pairs
-// of rounds of the Tiny Encryption Algorithm. See Zafar, Olano, and Curtis,
-// "GPU Random Numbers via the Tiny Encryption Algorithm"
-uint32_t tea(uint32_t val0, uint32_t val1)
-{
-  uint32_t v0 = val0;
-  uint32_t v1 = val1;
-  uint32_t s0 = 0;
-
-  for(uint32_t n = 0; n < 16; n++)
-  {
-    s0 += 0x9e3779b9;
-    v0 += ((v1 << 4) + 0xa341316c) ^ (v1 + s0) ^ ((v1 >> 5) + 0xc8013ea4);
-    v1 += ((v0 << 4) + 0xad90777d) ^ (v0 + s0) ^ ((v0 >> 5) + 0x7e95761e);
-  }
-
-  return v0;
-}
-
-// Generate a random unsigned int in [0, 2^24) given the previous RNG state
-// using the Numerical Recipes linear congruential generator
-uint32_t lcg(inout uint32_t prev)
-{
-  uint32_t LCG_A = 1664525u;
-  uint32_t LCG_C = 1013904223u;
-  prev       = (LCG_A * prev + LCG_C);
-  return prev & 0x00FFFFFF;
-}
-
-// Generate a random float32_t in [0, 1) given the previous RNG state
-float32_t rnd(inout uint32_t prev)
-{
-  return (float32_t(lcg(prev)) / float32_t(0x01000000));
-}
diff --git a/71_RayTracingPipeline/app_resources/raytrace.rahit.hlsl b/71_RayTracingPipeline/app_resources/raytrace.rahit.hlsl
index e85d5b572..9fece3a2d 100644
--- a/71_RayTracingPipeline/app_resources/raytrace.rahit.hlsl
+++ b/71_RayTracingPipeline/app_resources/raytrace.rahit.hlsl
@@ -1,5 +1,4 @@
 #include "common.hlsl"
-#include "random.hlsl"
 
 [[vk::push_constant]] SPushConstants pc;
 
@@ -10,12 +9,7 @@ void main(inout HitPayload payload, in BuiltInTriangleIntersectionAttributes att
     const STriangleGeomInfo geom = vk::RawBufferLoad < STriangleGeomInfo > (pc.triangleGeomInfoBuffer + instID * sizeof(STriangleGeomInfo));
     const Material material = unpackMaterial(geom.material);
     
-    uint32_t seed = payload.seed;
-    if (material.dissolve == 0.0)
-    {
-        IgnoreHit();
-    }
-    else if (rnd(seed) > material.dissolve)
+    if (material.dissolve == 0.0 || material.dissolve < payload.dissolveThreshold)
     {
         IgnoreHit();
     }
diff --git a/71_RayTracingPipeline/app_resources/raytrace.rgen.hlsl b/71_RayTracingPipeline/app_resources/raytrace.rgen.hlsl
index fb4cb45b9..a493e13af 100644
--- a/71_RayTracingPipeline/app_resources/raytrace.rgen.hlsl
+++ b/71_RayTracingPipeline/app_resources/raytrace.rgen.hlsl
@@ -1,12 +1,13 @@
 #include "common.hlsl"
-#include "random.hlsl"
 
 #include "nbl/builtin/hlsl/jit/device_capabilities.hlsl"
+#include "nbl/builtin/hlsl/random/xoroshiro.hlsl"
 
 #include "nbl/builtin/hlsl/glsl_compat/core.hlsl"
 #include "nbl/builtin/hlsl/spirv_intrinsics/raytracing.hlsl"
 
 static const int32_t s_sampleCount = 10;
+static const float32_t3 s_clearColor = float32_t3(0.3, 0.3, 0.8);
 
 [[vk::push_constant]] SPushConstants pc;
 
@@ -14,9 +15,16 @@ static const int32_t s_sampleCount = 10;
 
 [[vk::binding(1, 0)]] RWTexture2D<float32_t4> colorImage;
 
-float32_t3 reinhardTonemap(float32_t3 v)
+uint32_t pcgHash(uint32_t v)
 {
-    return v / (1.0f + v);
+    uint32_t state = v * 747796405u + 2891336453u;
+    uint32_t word = ((state >> ((state >> 28u) + 4u)) ^ state) * 277803737u;
+    return (word >> 22u) ^ word;
+}
+
+float32_t nextRandomUnorm(inout nbl::hlsl::Xoroshiro64StarStar rnd)
+{
+    return float32_t(rnd()) / float32_t(0xFFFFFFFF);
 }
 
 [shader("raygeneration")]
@@ -25,13 +33,16 @@ void main()
     uint32_t3 launchID = DispatchRaysIndex();
     uint32_t3 launchSize = DispatchRaysDimensions();
     uint32_t2 coords = launchID.xy;
-    uint32_t seed = tea(launchID.y * launchSize.x + launchID.x, pc.frameCounter);
+
+    uint32_t seed1 = pcgHash(pc.frameCounter);
+    uint32_t seed2 = pcgHash(launchID.y * launchSize.x + launchID.x);
+    nbl::hlsl::Xoroshiro64StarStar rnd = nbl::hlsl::Xoroshiro64StarStar::construct(uint32_t2(seed1, seed2));
 
     float32_t3 hitValues = float32_t3(0, 0, 0);
     for (uint32_t sample_i = 0; sample_i < s_sampleCount; sample_i++)
     {
-        const float32_t r1 = rnd(seed);
-        const float32_t r2 = rnd(seed);
+        const float32_t r1 = nextRandomUnorm(rnd);
+        const float32_t r2 = nextRandomUnorm(rnd);
         const float32_t2 subpixelJitter = pc.frameCounter == 0 ? float32_t2(0.5f, 0.5f) : float32_t2(r1, r2);
 
         const float32_t2 pixelCenter = float32_t2(coords) + subpixelJitter;
@@ -50,12 +61,12 @@ void main()
         rayDesc.TMax = 10000.0;
         
         HitPayload payload;
-        payload.seed = seed;
+        payload.dissolveThreshold = nextRandomUnorm(rnd);
         TraceRay(topLevelAS, RAY_FLAG_NONE, 0xff, ERT_PRIMARY, 0, EMT_PRIMARY, rayDesc, payload);
 
         if (payload.rayDistance < 0)
         {
-            hitValues += float32_t3(0.3, 0.3, 0.3);
+            hitValues += s_clearColor;
             continue;
         }
 
@@ -68,7 +79,7 @@ void main()
 
         const float32_t3 diffuse = computeDiffuse(material, cLight.outLightDir, worldNormal);
         float32_t3 specular = float32_t3(0, 0, 0);
-        float32_t attenuation = 0;
+        float32_t attenuation = 1;
 
         if (dot(worldNormal, cLight.outLightDir) > 0)
         {
@@ -78,14 +89,15 @@ void main()
             rayDesc.TMin = 0.01;
             rayDesc.TMax = cLight.outLightDistance;
 
+            uint32_t shadowRayFlags = RAY_FLAG_ACCEPT_FIRST_HIT_AND_END_SEARCH | RAY_FLAG_FORCE_NON_OPAQUE | RAY_FLAG_SKIP_CLOSEST_HIT_SHADER;
             ShadowPayload shadowPayload;
-            shadowPayload.attenuation = -1; // negative attenuation indicate occlusion happening. will be multiplied by -1 in miss shader.
-            TraceRay(topLevelAS, RAY_FLAG_SKIP_CLOSEST_HIT_SHADER, 0xFF, ERT_OCCLUSION, 0, EMT_OCCLUSION, rayDesc, shadowPayload);
+            shadowPayload.attenuation = 1; // negative attenuation indicate occlusion happening. will be multiplied by -1 in miss shader.
+            TraceRay(topLevelAS, shadowRayFlags, 0xFF, ERT_OCCLUSION, 0, EMT_OCCLUSION, rayDesc, shadowPayload);
 
+            attenuation = shadowPayload.attenuation;
             if (shadowPayload.attenuation > 0)
             {
                 specular = computeSpecular(material, camDirection, cLight.outLightDir, worldNormal);
-                attenuation = shadowPayload.attenuation;
             }
         }
         hitValues += (cLight.outIntensity * attenuation * (diffuse + specular));
diff --git a/71_RayTracingPipeline/app_resources/raytraceShadow.rahit.hlsl b/71_RayTracingPipeline/app_resources/raytraceShadow.rahit.hlsl
index 3f063daba..15ac009e7 100644
--- a/71_RayTracingPipeline/app_resources/raytraceShadow.rahit.hlsl
+++ b/71_RayTracingPipeline/app_resources/raytraceShadow.rahit.hlsl
@@ -1,5 +1,4 @@
 #include "common.hlsl"
-#include "random.hlsl"
 
 [[vk::push_constant]] SPushConstants pc;
 
@@ -10,7 +9,15 @@ void main(inout ShadowPayload payload, in BuiltInTriangleIntersectionAttributes
     const STriangleGeomInfo geom = vk::RawBufferLoad < STriangleGeomInfo > (pc.triangleGeomInfoBuffer + instID * sizeof(STriangleGeomInfo));
     const Material material = unpackMaterial(geom.material);
     
-    payload.attenuation = (1 - material.dissolve) * payload.attenuation;
-    IgnoreHit();
+    if (material.illum != 4)
+    {
+        payload.attenuation = 0;
+        AcceptHitAndEndSearch();
+    }
+    else
+    {
+        payload.attenuation = (1 - material.dissolve) * payload.attenuation;
+        IgnoreHit();
+    }
 
 }
diff --git a/71_RayTracingPipeline/app_resources/raytraceShadow.rmiss.hlsl b/71_RayTracingPipeline/app_resources/raytraceShadow.rmiss.hlsl
index 287c38f55..aa8df4123 100644
--- a/71_RayTracingPipeline/app_resources/raytraceShadow.rmiss.hlsl
+++ b/71_RayTracingPipeline/app_resources/raytraceShadow.rmiss.hlsl
@@ -3,5 +3,4 @@
 [shader("miss")]
 void main(inout ShadowPayload payload)
 {
-	payload.attenuation = payload.attenuation * -1;
 }

From 583c0f9a9f44bd064f9341b3ef89e9579df075be Mon Sep 17 00:00:00 2001
From: kevyuu <kevin.kayu@gmail.com>
Date: Tue, 11 Feb 2025 22:38:44 +0700
Subject: [PATCH 040/529] Add trace ray indirect option

---
 71_RayTracingPipeline/main.cpp | 45 +++++++++++++++++++++++++++++-----
 1 file changed, 39 insertions(+), 6 deletions(-)

diff --git a/71_RayTracingPipeline/main.cpp b/71_RayTracingPipeline/main.cpp
index 1594ee6c4..18600f604 100644
--- a/71_RayTracingPipeline/main.cpp
+++ b/71_RayTracingPipeline/main.cpp
@@ -4,6 +4,7 @@
 
 #include "common.hpp"
 #include "nbl/ext/FullScreenTriangle/FullScreenTriangle.h"
+#include "nbl/builtin/hlsl/indirect_commands.hlsl"
 
 class RaytracingPipelineApp final : public examples::SimpleWindowedApplication, public application_templates::MonoAssetManagerAndBuiltinResourceApplication
 {
@@ -411,6 +412,9 @@ class RaytracingPipelineApp final : public examples::SimpleWindowedApplication,
     auto assetManager = make_smart_refctd_ptr<nbl::asset::IAssetManager>(smart_refctd_ptr(system));
     auto* geometryCreator = assetManager->getGeometryCreator();
 
+    if (!createIndirectBuffer(gQueue))
+      return logFail("Could not create indirect buffer");
+
     // create geometry objects
     if (!createGeometries(gQueue, geometryCreator))
       return logFail("Could not create geometries from geometry creator");
@@ -585,6 +589,7 @@ class RaytracingPipelineApp final : public examples::SimpleWindowedApplication,
             m_light.outerCutoff = cos(radians(dOuterCutoff));
           }
         }
+        ImGui::Checkbox("Use Indirect Command", &m_useIndirectCommand);
         if (m_light != m_oldLight)
         {
           m_frameAccumulationCounter = 0;
@@ -722,12 +727,26 @@ class RaytracingPipelineApp final : public examples::SimpleWindowedApplication,
       cmdbuf->bindRayTracingPipeline(m_rayTracingPipeline.get());
       cmdbuf->pushConstants(m_rayTracingPipeline->getLayout(), IShader::E_SHADER_STAGE::ESS_ALL_RAY_TRACING, 0, sizeof(SPushConstants), &pc);
       cmdbuf->bindDescriptorSets(EPBP_RAY_TRACING, m_rayTracingPipeline->getLayout(), 0, 1, &m_rayTracingDs.get());
-      cmdbuf->traceRays(
-        m_shaderBindingTable.raygenGroupRange, m_shaderBindingTable.raygenGroupStride,
-        m_shaderBindingTable.missGroupsRange, m_shaderBindingTable.missGroupsStride,
-        m_shaderBindingTable.hitGroupsRange, m_shaderBindingTable.hitGroupsStride,
-        m_shaderBindingTable.callableGroupsRange, m_shaderBindingTable.callableGroupsStride,
-        WIN_W, WIN_H, 1);
+      if (m_useIndirectCommand)
+      {
+        cmdbuf->traceRaysIndirect(
+          m_shaderBindingTable.raygenGroupRange, m_shaderBindingTable.raygenGroupStride,
+          m_shaderBindingTable.missGroupsRange, m_shaderBindingTable.missGroupsStride,
+          m_shaderBindingTable.hitGroupsRange, m_shaderBindingTable.hitGroupsStride,
+          m_shaderBindingTable.callableGroupsRange, m_shaderBindingTable.callableGroupsStride,
+          SBufferBinding<const IGPUBuffer>{
+            .offset = 0,
+            .buffer = m_indirectBuffer,
+          });
+      }else
+      {
+        cmdbuf->traceRays(
+          m_shaderBindingTable.raygenGroupRange, m_shaderBindingTable.raygenGroupStride,
+          m_shaderBindingTable.missGroupsRange, m_shaderBindingTable.missGroupsStride,
+          m_shaderBindingTable.hitGroupsRange, m_shaderBindingTable.hitGroupsStride,
+          m_shaderBindingTable.callableGroupsRange, m_shaderBindingTable.callableGroupsStride,
+          WIN_W, WIN_H, 1);
+      }
     }
 
     // pipeline barrier
@@ -1024,6 +1043,16 @@ class RaytracingPipelineApp final : public examples::SimpleWindowedApplication,
     }
   }
 
+  bool createIndirectBuffer(video::CThreadSafeQueueAdapter* queue)
+  {
+    const auto command = TraceRaysIndirectCommand_t{ WIN_W, WIN_H, 1 };
+    IGPUBuffer::SCreationParams params;
+    params.usage = IGPUBuffer::EUF_TRANSFER_DST_BIT | IGPUBuffer::EUF_INDIRECT_BUFFER_BIT | IGPUBuffer::EUF_SHADER_DEVICE_ADDRESS_BIT;
+    params.size = sizeof(TraceRaysIndirectCommand_t);
+    m_utils->createFilledDeviceLocalBufferOnDedMem(SIntendedSubmitInfo{ .queue = queue }, std::move(params), &command).move_into(m_indirectBuffer);
+    return true;
+  }
+
   bool createGeometries(video::CThreadSafeQueueAdapter* queue, const IGeometryCreator* gc)
   {
     auto pool = m_device->createCommandPool(queue->getFamilyIndex(), IGPUCommandPool::CREATE_FLAGS::RESET_COMMAND_BUFFER_BIT);
@@ -1757,6 +1786,8 @@ class RaytracingPipelineApp final : public examples::SimpleWindowedApplication,
   smart_refctd_ptr<IGPUBuffer> m_triangleGeomInfoBuffer;
   smart_refctd_ptr<IGPUBuffer> m_proceduralGeomInfoBuffer;
   smart_refctd_ptr<IGPUBuffer> m_proceduralAabbBuffer;
+  smart_refctd_ptr<IGPUBuffer> m_indirectBuffer;
+
   smart_refctd_ptr<IGPUImage> m_hdrImage;
   smart_refctd_ptr<IGPUImageView> m_hdrImageView;
 
@@ -1771,5 +1802,7 @@ class RaytracingPipelineApp final : public examples::SimpleWindowedApplication,
 
   smart_refctd_ptr<CAssetConverter> m_converter;
 
+  bool m_useIndirectCommand = false;
+
 };
 NBL_MAIN_FUNC(RaytracingPipelineApp)

From 0797b331ec2feacba4c03af7a26182994d443652 Mon Sep 17 00:00:00 2001
From: kevyuu <kevin.kayu@gmail.com>
Date: Tue, 11 Feb 2025 22:58:10 +0700
Subject: [PATCH 041/529] Use matrix changes to reset frameAccumulationCounter

---
 71_RayTracingPipeline/main.cpp | 14 +++++++++-----
 common/include/CCamera.hpp     | 14 ++------------
 2 files changed, 11 insertions(+), 17 deletions(-)

diff --git a/71_RayTracingPipeline/main.cpp b/71_RayTracingPipeline/main.cpp
index 18600f604..e471065c7 100644
--- a/71_RayTracingPipeline/main.cpp
+++ b/71_RayTracingPipeline/main.cpp
@@ -687,6 +687,11 @@ class RaytracingPipelineApp final : public examples::SimpleWindowedApplication,
     modelMatrix.setRotation(quaternion(0, 0, 0));
 
     core::matrix4SIMD modelViewProjectionMatrix = core::concatenateBFollowedByA(viewProjectionMatrix, modelMatrix);
+    if (m_cachedModelViewProjectionMatrix != modelViewProjectionMatrix)
+    {
+      m_frameAccumulationCounter = 0;
+      m_cachedModelViewProjectionMatrix = modelViewProjectionMatrix;
+    }
     core::matrix4SIMD invModelViewProjectionMatrix;
     modelViewProjectionMatrix.getInverseTransform(invModelViewProjectionMatrix);
 
@@ -903,10 +908,9 @@ class RaytracingPipelineApp final : public examples::SimpleWindowedApplication,
 
     m_camera.beginInputProcessing(nextPresentationTimestamp);
     {
-      bool camera_moved = false;
       m_mouse.consumeEvents([&](const IMouseEventChannel::range_t& events) -> void
         {
-          camera_moved |= m_camera.mouseProcess(events); // don't capture the events, only let camera handle them with its impl
+          m_camera.mouseProcess(events); // don't capture the events, only let camera handle them with its impl
 
           for (const auto& e : events) // here capture
           {
@@ -921,7 +925,7 @@ class RaytracingPipelineApp final : public examples::SimpleWindowedApplication,
 
       m_keyboard.consumeEvents([&](const IKeyboardEventChannel::range_t& events) -> void
         {
-          camera_moved |= m_camera.keyboardProcess(events); // don't capture the events, only let camera handle them with its impl
+          m_camera.keyboardProcess(events); // don't capture the events, only let camera handle them with its impl
 
           for (const auto& e : events) // here capture
           {
@@ -933,8 +937,6 @@ class RaytracingPipelineApp final : public examples::SimpleWindowedApplication,
           }
         }, m_logger.get());
 
-      if (camera_moved)
-        m_frameAccumulationCounter = 0;
     }
     m_camera.endInputProcessing(nextPresentationTimestamp);
 
@@ -1802,6 +1804,8 @@ class RaytracingPipelineApp final : public examples::SimpleWindowedApplication,
 
   smart_refctd_ptr<CAssetConverter> m_converter;
 
+
+  core::matrix4SIMD m_cachedModelViewProjectionMatrix;
   bool m_useIndirectCommand = false;
 
 };
diff --git a/common/include/CCamera.hpp b/common/include/CCamera.hpp
index d9f31a260..797602a3b 100644
--- a/common/include/CCamera.hpp
+++ b/common/include/CCamera.hpp
@@ -133,9 +133,8 @@ class Camera
 public:
 
 	// return whether camera is moved by mouse
-	bool mouseProcess(const nbl::ui::IMouseEventChannel::range_t& events)
+	void mouseProcess(const nbl::ui::IMouseEventChannel::range_t& events)
 	{
-		bool cameraMoved = false;
 		for (auto eventIt=events.begin(); eventIt!=events.end(); eventIt++)
 		{
 			auto ev = *eventIt;
@@ -181,15 +180,11 @@ class Camera
 				mat.transformVect(localTarget);
 				
 				setTarget(localTarget + pos);
-
-				cameraMoved = true;
 			}
 		}
-		return cameraMoved;
 	}
 
-	// return whether camera is moved by keyboard
-	bool keyboardProcess(const nbl::ui::IKeyboardEventChannel::range_t& events)
+	void keyboardProcess(const nbl::ui::IKeyboardEventChannel::range_t& events)
 	{
 		for(uint32_t k = 0; k < E_CAMERA_MOVE_KEYS::ECMK_COUNT; ++k)
 			perActionDt[k] = 0.0;
@@ -200,14 +195,12 @@ class Camera
 		* And If an UP event was sent It will get subtracted it from this value. (Currently Disabled Because we Need better Oracle)
 		*/
 
-		bool cameraMoved = false;
 		for(uint32_t k = 0; k < E_CAMERA_MOVE_KEYS::ECMK_COUNT; ++k) 
 			if(keysDown[k]) 
 			{
 				auto timeDiff = std::chrono::duration_cast<std::chrono::milliseconds>(nextPresentationTimeStamp - lastVirtualUpTimeStamp).count();
 				assert(timeDiff >= 0);
 				perActionDt[k] += timeDiff;
-				cameraMoved = true;
 			}
 
 		for (auto eventIt=events.begin(); eventIt!=events.end(); eventIt++)
@@ -245,11 +238,8 @@ class Camera
 					position = initialPosition;
 					target = initialTarget;
 					recomputeViewMatrix();
-					cameraMoved = true;
 				}
 		}
-
-		return cameraMoved;
 	}
 
 	void beginInputProcessing(std::chrono::microseconds _nextPresentationTimeStamp)

From 7409e4b863a06a3a5ed5a9d099babd1129bfe806 Mon Sep 17 00:00:00 2001
From: kevyuu <kevin.kayu@gmail.com>
Date: Tue, 11 Feb 2025 23:03:14 +0700
Subject: [PATCH 042/529] Remove comment in CCamera.hpp

---
 common/include/CCamera.hpp | 1 -
 1 file changed, 1 deletion(-)

diff --git a/common/include/CCamera.hpp b/common/include/CCamera.hpp
index 797602a3b..1b0fe9c0f 100644
--- a/common/include/CCamera.hpp
+++ b/common/include/CCamera.hpp
@@ -132,7 +132,6 @@ class Camera
 
 public:
 
-	// return whether camera is moved by mouse
 	void mouseProcess(const nbl::ui::IMouseEventChannel::range_t& events)
 	{
 		for (auto eventIt=events.begin(); eventIt!=events.end(); eventIt++)

From f9c3fad711cef70d39ffff683c28fde2dc2f1199 Mon Sep 17 00:00:00 2001
From: kevyuu <kevin.kayu@gmail.com>
Date: Tue, 11 Feb 2025 23:34:51 +0700
Subject: [PATCH 043/529] Remove unnecesarry calculation of vertex position

---
 .../app_resources/raytrace.rchit.hlsl         | 25 ++++---------------
 1 file changed, 5 insertions(+), 20 deletions(-)

diff --git a/71_RayTracingPipeline/app_resources/raytrace.rchit.hlsl b/71_RayTracingPipeline/app_resources/raytrace.rchit.hlsl
index c5cf70185..aedea08d2 100644
--- a/71_RayTracingPipeline/app_resources/raytrace.rchit.hlsl
+++ b/71_RayTracingPipeline/app_resources/raytrace.rchit.hlsl
@@ -11,13 +11,7 @@ float3 unpackNormals3x10(uint32_t v)
     return clamp(float3(pn) / 511.0, -1.0, 1.0);
 }
 
-struct VertexData
-{
-    float32_t3 position;
-    float32_t3 normal;
-};
-
-VertexData fetchVertexData(int instID, int primID, STriangleGeomInfo geom, float2 bary)
+float32_t3 fetchVertexNormal(int instID, int primID, STriangleGeomInfo geom, float2 bary)
 {
     uint idxOffset = primID * 3;
 
@@ -52,12 +46,7 @@ VertexData fetchVertexData(int instID, int primID, STriangleGeomInfo geom, float
             }
     }
 
-    const uint64_t vertexBufferAddress = geom.vertexBufferAddress;
-    float32_t3 p0 = vk::RawBufferLoad < float32_t3 > (vertexBufferAddress + i0 * vertexStride);
-    float32_t3 p1 = vk::RawBufferLoad < float32_t3 > (vertexBufferAddress + i1 * vertexStride);
-    float32_t3 p2 = vk::RawBufferLoad < float32_t3 > (vertexBufferAddress + i2 * vertexStride);
-
-    const uint64_t normalVertexBufferAddress = vertexBufferAddress + s_offsetsToNormalBytes[objType];
+    const uint64_t normalVertexBufferAddress = geom.vertexBufferAddress + s_offsetsToNormalBytes[objType];
     float3 n0, n1, n2;
     switch (objType)
     {
@@ -102,11 +91,7 @@ VertexData fetchVertexData(int instID, int primID, STriangleGeomInfo geom, float
 
     float3 barycentrics = float3(0.0, bary);
     barycentrics.x = 1.0 - barycentrics.y - barycentrics.z;
-
-    VertexData data;
-    data.position = barycentrics.x * p0 + barycentrics.y * p1 + barycentrics.z * p2;
-    data.normal = normalize(barycentrics.x * n0 + barycentrics.y * n1 + barycentrics.z * n2);
-    return data;
+    return normalize(barycentrics.x * n0 + barycentrics.y * n1 + barycentrics.z * n2);
 }
 
 [shader("closesthit")]
@@ -115,8 +100,8 @@ void main(inout HitPayload payload, in BuiltInTriangleIntersectionAttributes att
     const int instID = InstanceID();
     const int primID = PrimitiveIndex();
     const STriangleGeomInfo geom = vk::RawBufferLoad < STriangleGeomInfo > (pc.triangleGeomInfoBuffer + instID * sizeof(STriangleGeomInfo));
-    const VertexData vertexData = fetchVertexData(instID, primID, geom, attribs.barycentrics);
-    const float32_t3 worldNormal = normalize(mul(vertexData.normal, WorldToObject3x4()).xyz);
+    const float32_t3 vertexNormal = fetchVertexNormal(instID, primID, geom, attribs.barycentrics);
+    const float32_t3 worldNormal = normalize(mul(vertexNormal, WorldToObject3x4()).xyz);
 
     payload.material = geom.material;
     payload.worldNormal = worldNormal;

From 22bd6f970c3954049a899181085a8df06b64fbe7 Mon Sep 17 00:00:00 2001
From: keptsecret <sorchon@gmail.com>
Date: Wed, 12 Feb 2025 11:30:50 +0700
Subject: [PATCH 044/529] nee stuff

---
 .../app_resources/hlsl/common.hlsl            |  35 +++-
 .../app_resources/hlsl/intersector.hlsl       |   2 +-
 .../hlsl/next_event_estimator.hlsl            | 171 ++++++++++++++++++
 3 files changed, 203 insertions(+), 5 deletions(-)

diff --git a/31_HLSLPathTracer/app_resources/hlsl/common.hlsl b/31_HLSLPathTracer/app_resources/hlsl/common.hlsl
index 9295b459b..e5940aab0 100644
--- a/31_HLSLPathTracer/app_resources/hlsl/common.hlsl
+++ b/31_HLSLPathTracer/app_resources/hlsl/common.hlsl
@@ -54,10 +54,34 @@ struct Ray
 template<class Spectrum>
 struct Light
 {
-    Spectrum radiance;
+    using spectral_type = Spectrum;
+
+    spectral_type radiance;
     uint32_t objectID;
 };
 
+template<typename T>
+struct Tolerance
+{
+    NBL_CONSTEXPR_STATIC_INLINE float INTERSECTION_ERROR_BOUND_LOG2 = -8.0;
+
+    static T __common(uint32_t depth)
+    {
+        float depthRcp = 1.0 / float(depth);
+        return INTERSECTION_ERROR_BOUND_LOG2;
+    }
+
+    static T getStart(uint32_t depth)
+    {
+        return nbl::hlsl::exp2(__common(depth));
+    }
+
+    static T getEnd(uint32_t depth)
+    {
+        return 1.0 - nbl::hlsl::exp2(__common(depth) + 1.0);
+    }
+}
+
 enum ProceduralShapeType : uint16_t
 {
     PST_SPHERE,
@@ -114,7 +138,8 @@ struct Shape<PST_SPHERE>
         return 2.0 * numbers::pi<float> * (1.0 - cosThetaMax);
     }
 
-    float deferredPdf(NBL_CONST_REF_ARG(Light light), NBL_CONST_REF_ARG(Ray) ray)
+    template<typename Light, typename Ray>
+    float deferredPdf(NBL_CONST_REF_ARG(Light) light, NBL_CONST_REF_ARG(Ray) ray)
     {
         return 1.0 / getSolidAngle(ray.origin);
     }
@@ -198,7 +223,8 @@ struct Shape<PST_TRIANGLE>
         return nbl::hlsl::cross(edges[0], edges[1]) * 0.5f;
     }
 
-    float deferredPdf(NBL_CONST_REF_ARG(Light light), NBL_CONST_REF_ARG(Ray<float>) ray)
+    template<typename Light, typename Ray>
+    float deferredPdf(NBL_CONST_REF_ARG(Light) light, NBL_CONST_REF_ARG(Ray) ray)
     {
         const float32_t3 L = ray.direction;
         switch (polygonMethod)
@@ -345,7 +371,8 @@ struct Shape<PST_RECTANGLE>
         basis = nbl::hlsl::transpose<matrix3x3_type>(basis);    // TODO: double check transpose
     }
 
-    float deferredPdf(NBL_CONST_REF_ARG(Light light), NBL_CONST_REF_ARG(Ray<float>) ray)
+    template<typename Light, typename Ray>
+    float deferredPdf(NBL_CONST_REF_ARG(Light light), NBL_CONST_REF_ARG(Ray) ray)
     {
         switch (polygonMethod)
         {
diff --git a/31_HLSLPathTracer/app_resources/hlsl/intersector.hlsl b/31_HLSLPathTracer/app_resources/hlsl/intersector.hlsl
index a694082fe..919816019 100644
--- a/31_HLSLPathTracer/app_resources/hlsl/intersector.hlsl
+++ b/31_HLSLPathTracer/app_resources/hlsl/intersector.hlsl
@@ -47,7 +47,7 @@ struct Comprehensive
     {
         const bool anyHit = ray.intersectionT != numeric_limits<scalar_type>::max;
         const uint32_t objCount = intersect.data[0];
-        const ProceduralIntersectionType type = intersect.data[1];
+        const ProceduralShapeType type = intersect.data[1];
 
         int objectID = -1;
         for (int i = 0; i < objCount; i++)
diff --git a/31_HLSLPathTracer/app_resources/hlsl/next_event_estimator.hlsl b/31_HLSLPathTracer/app_resources/hlsl/next_event_estimator.hlsl
index 1afa8d12e..5d96ae13e 100644
--- a/31_HLSLPathTracer/app_resources/hlsl/next_event_estimator.hlsl
+++ b/31_HLSLPathTracer/app_resources/hlsl/next_event_estimator.hlsl
@@ -1,6 +1,8 @@
 #ifndef _NBL_HLSL_EXT_NEXT_EVENT_ESTIMATOR_INCLUDED_
 #define _NBL_HLSL_EXT_NEXT_EVENT_ESTIMATOR_INCLUDED_
 
+#include "common.hlsl"
+
 namespace nbl
 {
 namespace hlsl
@@ -10,7 +12,176 @@ namespace ext
 namespace NextEventEstimator
 {
 
+// procedural data store: [light count] [intersect type] [obj]
+
+struct Event
+{
+    enum class Mode : uint32_t
+    {
+        RAY_QUERY,
+        RAY_TRACING,
+        PROCEDURAL
+    };
+
+    NBL_CONSTEXPR_STATIC_INLINE uint32_t DataSize = 128;
+
+    uint32_t mode : 1;
+    unit32_t unused : 31;   // possible space for flags
+    uint32_t data[DataSize];
+};
+
+template<typename Light, typename Ray, class LightSample, class Aniso>
+struct Estimator
+{
+    using scalar_type = typename Ray::scalar_type;
+    using ray_type = Ray;
+    using light_type = Light;
+    using spectral_type = typename Light::spectral_type;
+    using interaction_type = Aniso;
+    using quotient_pdf_type = quotient_and_pdf<spectral_type, scalar_type>;
+    using sample_type = LightSample;
+
+    static spectral_type proceduralDeferredEvalAndPdf(NBL_REF_ARG(scalar_type) pdf, NBL_CONST_REF_ARG(light_type) light, NBL_CONST_REF_ARG(ray_type) ray, NBL_CONST_REF_ARG(Event) event)
+    {
+        const uint32_t lightCount = event.data[0];
+        const ProceduralShapeType type = event.data[1];
+
+        pdf = 1.0 / lightCount;
+        switch (type)
+        {
+            case PST_SPHERE:
+            {
+                float32_t3 position = float32_t3(asfloat(intersect.data[2 + Shape<PST_SPHERE>::ObjSize]), asfloat(intersect.data[2 + Shape<PST_SPHERE>::ObjSize + 1]), asfloat(intersect.data[2 + Shape<PST_SPHERE>::ObjSize + 2]));
+                Shape<PST_SPHERE> sphere = Shape<PST_SPHERE>::create(position, asfloat(intersect.data[2 + Shape<PST_SPHERE>::ObjSize + 3]), intersect.data[2 + Shape<PST_SPHERE>::ObjSize + 4]);
+                pdf *= sphere.template deferredPdf<light_type, ray_type>(light, ray);
+            }
+            break;
+            case PST_TRIANGLE:
+            {
+                float32_t3 vertex0 = float32_t3(asfloat(intersect.data[2 + Shape<PST_TRIANGLE>::ObjSize]), asfloat(intersect.data[2 + Shape<PST_SPHERE>::ObjSize + 1]), asfloat(intersect.data[2 + Shape<PST_TRIANGLE>::ObjSize + 2]));
+                float32_t3 vertex1 = float32_t3(asfloat(intersect.data[2 + Shape<PST_TRIANGLE>::ObjSize + 3]), asfloat(intersect.data[2 + Shape<PST_SPHERE>::ObjSize + 4]), asfloat(intersect.data[2 + Shape<PST_TRIANGLE>::ObjSize + 5]));
+                float32_t3 vertex2 = float32_t3(asfloat(intersect.data[2 + Shape<PST_TRIANGLE>::ObjSize + 6]), asfloat(intersect.data[2 + Shape<PST_SPHERE>::ObjSize + 7]), asfloat(intersect.data[2 + Shape<PST_TRIANGLE>::ObjSize + 8]));
+                Shape<PST_TRIANGLE> tri = Shape<PST_TRIANGLE>::create(vertex0, vertex1, vertex2, intersect.data[2 + Shape<PST_TRIANGLE>::ObjSize + 9]);
+                pdf *= tri.template deferredPdf<light_type, ray_type>(light, ray);
+            }
+            break;
+            case PST_RECTANGLE:
+            {
+                float32_t3 offset = float32_t3(asfloat(intersect.data[2 + Shape<PST_RECTANGLE>::ObjSize]), asfloat(intersect.data[2 + Shape<PST_RECTANGLE>::ObjSize + 1]), asfloat(intersect.data[2 + Shape<PST_RECTANGLE>::ObjSize + 2]));
+                float32_t3 edge0 = float32_t3(asfloat(intersect.data[2 + Shape<PST_RECTANGLE>::ObjSize + 3]), asfloat(intersect.data[2 + Shape<PST_RECTANGLE>::ObjSize + 4]), asfloat(intersect.data[2 + Shape<PST_RECTANGLE>::ObjSize + 5]));
+                float32_t3 edge1 = float32_t3(asfloat(intersect.data[2 + Shape<PST_RECTANGLE>::ObjSize + 6]), asfloat(intersect.data[2 + Shape<PST_RECTANGLE>::ObjSize + 7]), asfloat(intersect.data[2 + Shape<PST_RECTANGLE>::ObjSize + 8]));
+                Shape<PST_RECTANGLE> rect = Shape<PST_RECTANGLE>::create(offset, edge0, edge1, intersect.data[2 + Shape<PST_RECTANGLE>::ObjSize + 9]);
+                pdf *= rect.template deferredPdf<light_type, ray_type>(light, ray);
+            }
+            break;
+            default:
+                pdf = numeric_limits<float>::infinity;
+                break;
+        }
+
+        return light.radiance;
+    }
+
+    static spectral_type deferredEvalAndPdf(NBL_REF_ARG(scalar_type) pdf, NBL_CONST_REF_ARG(light_type) light, NBL_CONST_REF_ARG(ray_type) ray, NBL_CONST_REF_ARG(Event) event)
+    {
+        const Event::Mode mode = event.mode;
+        switch (mode)
+        {
+            case Event::Mode::RAY_QUERY:
+            {
+                // TODO: do ray query stuff
+            }
+            break;
+            case Event::Mode::RAY_TRACING:
+            {
+                // TODO: do ray tracing stuff
+            }
+            break;
+            case Event::Mode::PROCEDURAL:
+            {
+                return proceduralDeferredEvalAndPdf(pdf, light, ray, event);
+            }
+            break;
+            default:
+                return (spectral_type)0.0;
+        }
+    }
+
+    static sample_type procedural_generate_and_quotient_and_pdf(NBL_REF_ARG(quotient_pdf_type) quotient_pdf, NBL_REF_ARG(scalar_type) newRayMaxT, NBL_CONST_REF_ARG(vector3_type) origin, NBL_CONST_REF_ARG(interaction_type) interaction, bool isBSDF, NBL_CONST_REF_ARG(vector3_type) xi, unit32_t depth, NBL_CONST_REF_ARG(Event) event)
+    {
+        const uint32_t lightCount = event.data[0];
+        const ProceduralShapeType type = event.data[1];
+
+        sample_type L;
+        scalar_type pdf;
+        switch (type)
+        {
+            case PST_SPHERE:
+            {
+                float32_t3 position = float32_t3(asfloat(intersect.data[2 + Shape<PST_SPHERE>::ObjSize]), asfloat(intersect.data[2 + Shape<PST_SPHERE>::ObjSize + 1]), asfloat(intersect.data[2 + Shape<PST_SPHERE>::ObjSize + 2]));
+                Shape<PST_SPHERE> sphere = Shape<PST_SPHERE>::create(position, asfloat(intersect.data[2 + Shape<PST_SPHERE>::ObjSize + 3]), intersect.data[2 + Shape<PST_SPHERE>::ObjSize + 4]);
+                L = sphere.template generate_and_pdf<interaction_type>(pdf, newRayMaxT, origin, interaction, isBSDF, xi, objectID);
+            }
+            break;
+            case PST_TRIANGLE:
+            {
+                float32_t3 vertex0 = float32_t3(asfloat(intersect.data[2 + Shape<PST_TRIANGLE>::ObjSize]), asfloat(intersect.data[2 + Shape<PST_SPHERE>::ObjSize + 1]), asfloat(intersect.data[2 + Shape<PST_TRIANGLE>::ObjSize + 2]));
+                float32_t3 vertex1 = float32_t3(asfloat(intersect.data[2 + Shape<PST_TRIANGLE>::ObjSize + 3]), asfloat(intersect.data[2 + Shape<PST_SPHERE>::ObjSize + 4]), asfloat(intersect.data[2 + Shape<PST_TRIANGLE>::ObjSize + 5]));
+                float32_t3 vertex2 = float32_t3(asfloat(intersect.data[2 + Shape<PST_TRIANGLE>::ObjSize + 6]), asfloat(intersect.data[2 + Shape<PST_SPHERE>::ObjSize + 7]), asfloat(intersect.data[2 + Shape<PST_TRIANGLE>::ObjSize + 8]));
+                Shape<PST_TRIANGLE> tri = Shape<PST_TRIANGLE>::create(vertex0, vertex1, vertex2, intersect.data[2 + Shape<PST_TRIANGLE>::ObjSize + 9]);
+                L = tri.template generate_and_pdf<interaction_type>(pdf, newRayMaxT, origin, interaction, isBSDF, xi, objectID);
+            }
+            break;
+            case PST_RECTANGLE:
+            {
+                float32_t3 offset = float32_t3(asfloat(intersect.data[2 + Shape<PST_RECTANGLE>::ObjSize]), asfloat(intersect.data[2 + Shape<PST_RECTANGLE>::ObjSize + 1]), asfloat(intersect.data[2 + Shape<PST_RECTANGLE>::ObjSize + 2]));
+                float32_t3 edge0 = float32_t3(asfloat(intersect.data[2 + Shape<PST_RECTANGLE>::ObjSize + 3]), asfloat(intersect.data[2 + Shape<PST_RECTANGLE>::ObjSize + 4]), asfloat(intersect.data[2 + Shape<PST_RECTANGLE>::ObjSize + 5]));
+                float32_t3 edge1 = float32_t3(asfloat(intersect.data[2 + Shape<PST_RECTANGLE>::ObjSize + 6]), asfloat(intersect.data[2 + Shape<PST_RECTANGLE>::ObjSize + 7]), asfloat(intersect.data[2 + Shape<PST_RECTANGLE>::ObjSize + 8]));
+                Shape<PST_RECTANGLE> rect = Shape<PST_RECTANGLE>::create(offset, edge0, edge1, intersect.data[2 + Shape<PST_RECTANGLE>::ObjSize + 9]);
+                L = rect.template generate_and_pdf<interaction_type>(pdf, newRayMaxT, origin, interaction, isBSDF, xi, objectID);
+            }
+            break;
+            default:
+                pdf = numeric_limits<float>::infinity;
+                break;
+        }
+
+        newRayMaxT *= Tolerance<scalar_type>::getEnd(depth);
+        pdf *= 1.0 / lightCount;
+        spectral_type quo = light.radiance / pdf;
+        quotient_pdf = quotient_pdf_type::create(quo, pdf);
+
+        return L;
+    }
 
+    static sample_type generate_and_quotient_and_pdf(NBL_REF_ARG(quotient_pdf_type) quotient_pdf, NBL_REF_ARG(scalar_type) newRayMaxT, NBL_CONST_REF_ARG(vector3_type) origin, NBL_CONST_REF_ARG(interaction_type) interaction, bool isBSDF, NBL_CONST_REF_ARG(vector3_type) xi, unit32_t depth, NBL_CONST_REF_ARG(Event) event)
+    {
+        const Event::Mode mode = event.mode;
+        switch (mode)
+        {
+            case Event::Mode::RAY_QUERY:
+            {
+                // TODO: do ray query stuff
+            }
+            break;
+            case Event::Mode::RAY_TRACING:
+            {
+                // TODO: do ray tracing stuff
+            }
+            break;
+            case Event::Mode::PROCEDURAL:
+            {
+                return procedural_generate_and_quotient_and_pdf(newRayMaxT, origin, interaction, isBSDF, xi, depth, event);
+            }
+            break;
+            default:
+            {
+                sample_type L;
+                return L;
+            }
+        }
+    }
+};
 
 }
 }

From 0da41df6fafe255caf47cf9dc1c6a363b12a0324 Mon Sep 17 00:00:00 2001
From: kevyuu <kevin.kayu@gmail.com>
Date: Wed, 12 Feb 2025 20:04:18 +0700
Subject: [PATCH 045/529] Tidy up variable name and file name

---
 .../app_resources/common.hlsl                 | 56 ++++++-------
 .../app_resources/raytrace.rahit.hlsl         |  4 +-
 .../app_resources/raytrace.rchit.hlsl         |  2 +-
 .../app_resources/raytrace.rgen.hlsl          | 34 ++++----
 .../app_resources/raytrace.rmiss.hlsl         |  2 +-
 .../raytrace_procedural.rchit.hlsl            |  4 +-
 ....rahit.hlsl => raytrace_shadow.rahit.hlsl} | 12 +--
 ....rmiss.hlsl => raytrace_shadow.rmiss.hlsl} |  2 +-
 71_RayTracingPipeline/main.cpp                | 81 ++++++++++---------
 9 files changed, 99 insertions(+), 98 deletions(-)
 rename 71_RayTracingPipeline/app_resources/{raytraceShadow.rahit.hlsl => raytrace_shadow.rahit.hlsl} (69%)
 rename 71_RayTracingPipeline/app_resources/{raytraceShadow.rmiss.hlsl => raytrace_shadow.rmiss.hlsl} (51%)

diff --git a/71_RayTracingPipeline/app_resources/common.hlsl b/71_RayTracingPipeline/app_resources/common.hlsl
index a089b152a..5b69c4a76 100644
--- a/71_RayTracingPipeline/app_resources/common.hlsl
+++ b/71_RayTracingPipeline/app_resources/common.hlsl
@@ -4,27 +4,29 @@
 #include "nbl/builtin/hlsl/cpp_compat.hlsl"
 
 NBL_CONSTEXPR uint32_t WorkgroupSize = 16;
+NBL_CONSTEXPR uint32_t MAX_UNORM_10 = 1023;
+NBL_CONSTEXPR uint32_t MAX_UNORM_22 = 4194303;
 
 inline uint32_t packUnorm10(float32_t v)
 {
-    return trunc(v * 1023.0f + 0.5f);
+    return trunc(v * float32_t(MAX_UNORM_10) + 0.5f);
 }
 
 inline float32_t unpackUnorm10(uint32_t packed)
 {
-    return float32_t(packed & 0x3ff) * (1.0f / 1023.0f);
+    return float32_t(packed & 0x3ff) * (1.0f / float32_t(MAX_UNORM_10));
 }
 
-inline uint32_t packUnorm18(float32_t v)
+inline uint32_t packUnorm22(float32_t v)
 {
-    const float maxValue = 262143;
+    const float maxValue = float32_t(MAX_UNORM_22);
     return trunc(v * maxValue + 0.5f);
 }
 
-inline float32_t unpackUnorm18(uint32_t packed)
+inline float32_t unpackUnorm22(uint32_t packed)
 {
-    const float maxValue = 262143;
-    return float32_t(packed & 0x3ffff) * (1.0f / maxValue);
+    const float maxValue = float32_t(MAX_UNORM_22);
+    return float32_t(packed & 0x3fffff) * (1.0f / maxValue);
 }
 
 inline uint32_t packUnorm3x10(float32_t3 v)
@@ -43,8 +45,12 @@ struct Material
     float32_t3 diffuse;
     float32_t3 specular;
     float32_t shininess;
-    float32_t dissolve; // 1 == opaque; 0 == fully transparent
-    uint32_t illum; // illumination model (see http://www.fileformat.info/format/material/)
+    float32_t alpha;
+
+    bool isTransparent() NBL_CONST_MEMBER_FUNC
+    {
+        return alpha < 1.0;
+    }
 };
 
 struct MaterialPacked
@@ -52,9 +58,13 @@ struct MaterialPacked
 	uint32_t ambient;
     uint32_t diffuse;
     uint32_t specular;
-    uint32_t shininess: 18;
-    uint32_t dissolve : 10; // 1 == opaque; 0 == fully transparent
-    uint32_t illum : 4; // illumination model (see http://www.fileformat.info/format/material/)
+    uint32_t shininess: 22;
+    uint32_t alpha : 10;
+
+    bool isTransparent() NBL_CONST_MEMBER_FUNC
+    {
+        return alpha != MAX_UNORM_10;
+}
 };
 
 inline MaterialPacked packMaterial(Material material)
@@ -63,9 +73,8 @@ inline MaterialPacked packMaterial(Material material)
     packed.ambient = packUnorm3x10(material.ambient);      
     packed.diffuse = packUnorm3x10(material.diffuse);
     packed.specular = packUnorm3x10(material.specular);      
-    packed.shininess = packUnorm18(material.shininess);
-    packed.dissolve = packUnorm10(material.dissolve);
-    packed.illum = material.illum;
+    packed.shininess = packUnorm22(material.shininess);
+    packed.alpha = packUnorm10(material.alpha);
     return packed;
 }
 
@@ -75,9 +84,8 @@ inline Material unpackMaterial(MaterialPacked packed)
     material.ambient = unpackUnorm3x10(packed.ambient);
     material.diffuse = unpackUnorm3x10(packed.diffuse);
     material.specular = unpackUnorm3x10(packed.specular);
-    material.shininess = unpackUnorm18(packed.shininess);
-    material.dissolve = unpackUnorm10(packed.dissolve);
-    material.illum = packed.illum;
+    material.shininess = unpackUnorm22(packed.shininess);
+    material.alpha = unpackUnorm10(packed.alpha);
     return material;
 }
 
@@ -176,17 +184,17 @@ struct ProceduralHitAttribute
 
 #ifdef __HLSL_VERSION
 
-struct [raypayload] ShadowPayload
+struct [raypayload] OcclusionPayload
 {
     float32_t attenuation : read(caller) : write(caller, anyhit);
 };
 
-struct [raypayload] HitPayload
+struct [raypayload] PrimaryPayload
 {
     MaterialPacked material : read(caller) : write(closesthit);
     float32_t3 worldNormal : read(caller) : write(closesthit);
     float32_t rayDistance : read(caller) : write(closesthit, miss);
-    float32_t dissolveThreshold : read(closesthit, anyhit) : write(caller);
+    float32_t alphaThreshold : read(closesthit, anyhit) : write(caller);
 };
 
 enum ObjectType : uint32_t  // matches c++
@@ -207,20 +215,14 @@ static uint32_t s_offsetsToNormalBytes[OT_COUNT] = { 18, 24, 24, 20, 20, 24, 16,
 
 float32_t3 computeDiffuse(Material mat, float32_t3 light_dir, float32_t3 normal)
 {
-	// Lambertian
 	float32_t dotNL = max(dot(normal, light_dir), 0.0);
 	float32_t3 c = mat.diffuse * dotNL;
-	if (mat.illum >= 1)
-		c += mat.ambient;
 	return c;
 }
 
 float32_t3 computeSpecular(Material mat, float32_t3 view_dir, 
 	float32_t3 light_dir, float32_t3 normal)
 {
-	if (mat.illum < 2)
-		return float32_t3(0, 0, 0);
-
 	const float32_t kPi = 3.14159265;
 	const float32_t kShininess = max(mat.shininess, 4.0);
 
diff --git a/71_RayTracingPipeline/app_resources/raytrace.rahit.hlsl b/71_RayTracingPipeline/app_resources/raytrace.rahit.hlsl
index 9fece3a2d..2923e95d9 100644
--- a/71_RayTracingPipeline/app_resources/raytrace.rahit.hlsl
+++ b/71_RayTracingPipeline/app_resources/raytrace.rahit.hlsl
@@ -3,13 +3,13 @@
 [[vk::push_constant]] SPushConstants pc;
 
 [shader("anyhit")]
-void main(inout HitPayload payload, in BuiltInTriangleIntersectionAttributes attribs)
+void main(inout PrimaryPayload payload, in BuiltInTriangleIntersectionAttributes attribs)
 {
     const int instID = InstanceID();
     const STriangleGeomInfo geom = vk::RawBufferLoad < STriangleGeomInfo > (pc.triangleGeomInfoBuffer + instID * sizeof(STriangleGeomInfo));
     const Material material = unpackMaterial(geom.material);
     
-    if (material.dissolve == 0.0 || material.dissolve < payload.dissolveThreshold)
+    if (material.alpha > payload.alphaThreshold)
     {
         IgnoreHit();
     }
diff --git a/71_RayTracingPipeline/app_resources/raytrace.rchit.hlsl b/71_RayTracingPipeline/app_resources/raytrace.rchit.hlsl
index aedea08d2..fdb252cda 100644
--- a/71_RayTracingPipeline/app_resources/raytrace.rchit.hlsl
+++ b/71_RayTracingPipeline/app_resources/raytrace.rchit.hlsl
@@ -95,7 +95,7 @@ float32_t3 fetchVertexNormal(int instID, int primID, STriangleGeomInfo geom, flo
 }
 
 [shader("closesthit")]
-void main(inout HitPayload payload, in BuiltInTriangleIntersectionAttributes attribs)
+void main(inout PrimaryPayload payload, in BuiltInTriangleIntersectionAttributes attribs)
 {
     const int instID = InstanceID();
     const int primID = PrimitiveIndex();
diff --git a/71_RayTracingPipeline/app_resources/raytrace.rgen.hlsl b/71_RayTracingPipeline/app_resources/raytrace.rgen.hlsl
index a493e13af..df6a5215d 100644
--- a/71_RayTracingPipeline/app_resources/raytrace.rgen.hlsl
+++ b/71_RayTracingPipeline/app_resources/raytrace.rgen.hlsl
@@ -17,8 +17,8 @@ static const float32_t3 s_clearColor = float32_t3(0.3, 0.3, 0.8);
 
 uint32_t pcgHash(uint32_t v)
 {
-    uint32_t state = v * 747796405u + 2891336453u;
-    uint32_t word = ((state >> ((state >> 28u) + 4u)) ^ state) * 277803737u;
+    const uint32_t state = v * 747796405u + 2891336453u;
+    const uint32_t word = ((state >> ((state >> 28u) + 4u)) ^ state) * 277803737u;
     return (word >> 22u) ^ word;
 }
 
@@ -30,12 +30,12 @@ float32_t nextRandomUnorm(inout nbl::hlsl::Xoroshiro64StarStar rnd)
 [shader("raygeneration")]
 void main()
 {
-    uint32_t3 launchID = DispatchRaysIndex();
-    uint32_t3 launchSize = DispatchRaysDimensions();
-    uint32_t2 coords = launchID.xy;
+    const uint32_t3 launchID = DispatchRaysIndex();
+    const uint32_t3 launchSize = DispatchRaysDimensions();
+    const uint32_t2 coords = launchID.xy;
 
-    uint32_t seed1 = pcgHash(pc.frameCounter);
-    uint32_t seed2 = pcgHash(launchID.y * launchSize.x + launchID.x);
+    const uint32_t seed1 = pcgHash(pc.frameCounter);
+    const uint32_t seed2 = pcgHash(launchID.y * launchSize.x + launchID.x);
     nbl::hlsl::Xoroshiro64StarStar rnd = nbl::hlsl::Xoroshiro64StarStar::construct(uint32_t2(seed1, seed2));
 
     float32_t3 hitValues = float32_t3(0, 0, 0);
@@ -57,11 +57,11 @@ void main()
         RayDesc rayDesc;
         rayDesc.Origin = pc.camPos;
         rayDesc.Direction = camDirection;
-        rayDesc.TMin = 0.001;
+        rayDesc.TMin = 0.01;
         rayDesc.TMax = 10000.0;
         
-        HitPayload payload;
-        payload.dissolveThreshold = nextRandomUnorm(rnd);
+        PrimaryPayload payload;
+        payload.alphaThreshold = nextRandomUnorm(rnd);
         TraceRay(topLevelAS, RAY_FLAG_NONE, 0xff, ERT_PRIMARY, 0, EMT_PRIMARY, rayDesc, payload);
 
         if (payload.rayDistance < 0)
@@ -90,20 +90,20 @@ void main()
             rayDesc.TMax = cLight.outLightDistance;
 
             uint32_t shadowRayFlags = RAY_FLAG_ACCEPT_FIRST_HIT_AND_END_SEARCH | RAY_FLAG_FORCE_NON_OPAQUE | RAY_FLAG_SKIP_CLOSEST_HIT_SHADER;
-            ShadowPayload shadowPayload;
-            shadowPayload.attenuation = 1; // negative attenuation indicate occlusion happening. will be multiplied by -1 in miss shader.
-            TraceRay(topLevelAS, shadowRayFlags, 0xFF, ERT_OCCLUSION, 0, EMT_OCCLUSION, rayDesc, shadowPayload);
+            OcclusionPayload occlusionPayload;
+            occlusionPayload.attenuation = 1; // negative attenuation indicate occlusion happening. will be multiplied by -1 in miss shader.
+            TraceRay(topLevelAS, shadowRayFlags, 0xFF, ERT_OCCLUSION, 0, EMT_OCCLUSION, rayDesc, occlusionPayload);
 
-            attenuation = shadowPayload.attenuation;
-            if (shadowPayload.attenuation > 0)
+            attenuation = occlusionPayload.attenuation;
+            if (occlusionPayload.attenuation > 0)
             {
                 specular = computeSpecular(material, camDirection, cLight.outLightDir, worldNormal);
             }
         }
-        hitValues += (cLight.outIntensity * attenuation * (diffuse + specular));
+        hitValues += ((cLight.outIntensity * attenuation * (diffuse + specular)) + material.ambient);
     }
 
-    float32_t3 hitValue = hitValues / s_sampleCount;
+    const float32_t3 hitValue = hitValues / s_sampleCount;
 
     if (pc.frameCounter > 0)
     {
diff --git a/71_RayTracingPipeline/app_resources/raytrace.rmiss.hlsl b/71_RayTracingPipeline/app_resources/raytrace.rmiss.hlsl
index 602104a19..5ccfed470 100644
--- a/71_RayTracingPipeline/app_resources/raytrace.rmiss.hlsl
+++ b/71_RayTracingPipeline/app_resources/raytrace.rmiss.hlsl
@@ -1,7 +1,7 @@
 #include "common.hlsl"
 
 [shader("miss")]
-void main(inout HitPayload payload)
+void main(inout PrimaryPayload payload)
 {
     payload.rayDistance = -1;
 }
diff --git a/71_RayTracingPipeline/app_resources/raytrace_procedural.rchit.hlsl b/71_RayTracingPipeline/app_resources/raytrace_procedural.rchit.hlsl
index 48495f0fc..0a58ccba8 100644
--- a/71_RayTracingPipeline/app_resources/raytrace_procedural.rchit.hlsl
+++ b/71_RayTracingPipeline/app_resources/raytrace_procedural.rchit.hlsl
@@ -2,10 +2,8 @@
 
 [[vk::push_constant]] SPushConstants pc;
 
-[[vk::binding(0, 0)]] RaytracingAccelerationStructure topLevelAS;
-
 [shader("closesthit")]
-void main(inout HitPayload payload, in ProceduralHitAttribute attrib)
+void main(inout PrimaryPayload payload, in ProceduralHitAttribute attrib)
 {
     const float32_t3 worldPosition = WorldRayOrigin() + WorldRayDirection() * RayTCurrent();
     const float32_t3 worldNormal = normalize(worldPosition - attrib.center);
diff --git a/71_RayTracingPipeline/app_resources/raytraceShadow.rahit.hlsl b/71_RayTracingPipeline/app_resources/raytrace_shadow.rahit.hlsl
similarity index 69%
rename from 71_RayTracingPipeline/app_resources/raytraceShadow.rahit.hlsl
rename to 71_RayTracingPipeline/app_resources/raytrace_shadow.rahit.hlsl
index 15ac009e7..c59f7367e 100644
--- a/71_RayTracingPipeline/app_resources/raytraceShadow.rahit.hlsl
+++ b/71_RayTracingPipeline/app_resources/raytrace_shadow.rahit.hlsl
@@ -3,21 +3,21 @@
 [[vk::push_constant]] SPushConstants pc;
 
 [shader("anyhit")]
-void main(inout ShadowPayload payload, in BuiltInTriangleIntersectionAttributes attribs)
+void main(inout OcclusionPayload payload, in BuiltInTriangleIntersectionAttributes attribs)
 {
     const int instID = InstanceID();
     const STriangleGeomInfo geom = vk::RawBufferLoad < STriangleGeomInfo > (pc.triangleGeomInfoBuffer + instID * sizeof(STriangleGeomInfo));
     const Material material = unpackMaterial(geom.material);
     
-    if (material.illum != 4)
+    if (material.isTransparent())
     {
-        payload.attenuation = 0;
-        AcceptHitAndEndSearch();
+        payload.attenuation = material.alpha * payload.attenuation;
+        IgnoreHit();
     }
     else
     {
-        payload.attenuation = (1 - material.dissolve) * payload.attenuation;
-        IgnoreHit();
+        payload.attenuation = 0;
+        AcceptHitAndEndSearch();
     }
 
 }
diff --git a/71_RayTracingPipeline/app_resources/raytraceShadow.rmiss.hlsl b/71_RayTracingPipeline/app_resources/raytrace_shadow.rmiss.hlsl
similarity index 51%
rename from 71_RayTracingPipeline/app_resources/raytraceShadow.rmiss.hlsl
rename to 71_RayTracingPipeline/app_resources/raytrace_shadow.rmiss.hlsl
index aa8df4123..baad9a3e9 100644
--- a/71_RayTracingPipeline/app_resources/raytraceShadow.rmiss.hlsl
+++ b/71_RayTracingPipeline/app_resources/raytrace_shadow.rmiss.hlsl
@@ -1,6 +1,6 @@
 #include "common.hlsl"
 
 [shader("miss")]
-void main(inout ShadowPayload payload)
+void main(inout OcclusionPayload payload)
 {
 }
diff --git a/71_RayTracingPipeline/main.cpp b/71_RayTracingPipeline/main.cpp
index e471065c7..d457e37dc 100644
--- a/71_RayTracingPipeline/main.cpp
+++ b/71_RayTracingPipeline/main.cpp
@@ -138,7 +138,7 @@ class RaytracingPipelineApp final : public examples::SimpleWindowedApplication,
     }
 
     // Load Custom Shader
-    auto loadCompileAndCreateShader = [&](const std::string& relPath, const std::string& header = "") -> smart_refctd_ptr<IGPUShader>
+    auto loadCompileAndCreateShader = [&](const std::string& relPath) -> smart_refctd_ptr<IGPUShader>
         {
             IAssetLoader::SAssetLoadParams lp = {};
             lp.logger = m_logger.get();
@@ -153,13 +153,7 @@ class RaytracingPipelineApp final : public examples::SimpleWindowedApplication,
             if (!sourceRaw)
                 return nullptr;
 
-            smart_refctd_ptr<ICPUShader> source = CHLSLCompiler::createOverridenCopy(
-                sourceRaw.get(),
-                "%s\n",
-                header.c_str()
-            );
-
-            return m_device->createShader({ source.get(), nullptr, shaderReadCache.get(), shaderWriteCache.get() });
+            return m_device->createShader({ sourceRaw.get(), nullptr, shaderReadCache.get(), shaderWriteCache.get() });
         };
 
     // load shaders
@@ -168,9 +162,9 @@ class RaytracingPipelineApp final : public examples::SimpleWindowedApplication,
     const auto proceduralClosestHitShader = loadCompileAndCreateShader("app_resources/raytrace_procedural.rchit.hlsl");
     const auto intersectionHitShader = loadCompileAndCreateShader("app_resources/raytrace.rint.hlsl");
     const auto anyHitShaderColorPayload = loadCompileAndCreateShader("app_resources/raytrace.rahit.hlsl");
-    const auto anyHitShaderShadowPayload = loadCompileAndCreateShader("app_resources/raytraceShadow.rahit.hlsl");
+    const auto anyHitShaderShadowPayload = loadCompileAndCreateShader("app_resources/raytrace_shadow.rahit.hlsl");
     const auto missShader = loadCompileAndCreateShader("app_resources/raytrace.rmiss.hlsl");
-    const auto shadowMissShader = loadCompileAndCreateShader("app_resources/raytraceShadow.rmiss.hlsl");
+    const auto shadowMissShader = loadCompileAndCreateShader("app_resources/raytrace_shadow.rmiss.hlsl");
     const auto directionalLightCallShader = loadCompileAndCreateShader("app_resources/light_directional.rcall.hlsl");
     const auto pointLightCallShader = loadCompileAndCreateShader("app_resources/light_point.rcall.hlsl");
     const auto spotLightCallShader = loadCompileAndCreateShader("app_resources/light_spot.rcall.hlsl");
@@ -300,14 +294,14 @@ class RaytracingPipelineApp final : public examples::SimpleWindowedApplication,
           .binding = 0,
           .type = asset::IDescriptor::E_TYPE::ET_ACCELERATION_STRUCTURE,
           .createFlags = IGPUDescriptorSetLayout::SBinding::E_CREATE_FLAGS::ECF_NONE,
-          .stageFlags = asset::IShader::E_SHADER_STAGE::ESS_ALL_RAY_TRACING,
+          .stageFlags = asset::IShader::E_SHADER_STAGE::ESS_RAYGEN,
           .count = 1,
         },
         {
           .binding = 1,
           .type = asset::IDescriptor::E_TYPE::ET_STORAGE_IMAGE,
           .createFlags = IGPUDescriptorSetLayout::SBinding::E_CREATE_FLAGS::ECF_NONE,
-          .stageFlags = asset::IShader::E_SHADER_STAGE::ESS_ALL_RAY_TRACING,
+          .stageFlags = asset::IShader::E_SHADER_STAGE::ESS_RAYGEN,
           .count = 1,
         }
       };
@@ -333,7 +327,7 @@ class RaytracingPipelineApp final : public examples::SimpleWindowedApplication,
         RTDS_SHADOW_MISS,
         RTDS_CLOSEST_HIT,
         RTDS_SPHERE_CLOSEST_HIT,
-        RTDS_ANYHIT_COLOR,
+        RTDS_ANYHIT_PRIMARY,
         RTDS_ANYHIT_SHADOW,
         RTDS_INTERSECTION,
         RTDS_DIRECTIONAL_CALL,
@@ -348,7 +342,7 @@ class RaytracingPipelineApp final : public examples::SimpleWindowedApplication,
       shaders[RTDS_SHADOW_MISS] = {.shader = shadowMissShader.get()};
       shaders[RTDS_CLOSEST_HIT] = {.shader = closestHitShader.get()};
       shaders[RTDS_SPHERE_CLOSEST_HIT] = {.shader = proceduralClosestHitShader.get()};
-      shaders[RTDS_ANYHIT_COLOR] = {.shader = anyHitShaderColorPayload.get()};
+      shaders[RTDS_ANYHIT_PRIMARY] = {.shader = anyHitShaderColorPayload.get()};
       shaders[RTDS_ANYHIT_SHADOW] = {.shader = anyHitShaderShadowPayload.get()};
       shaders[RTDS_INTERSECTION] = {.shader = intersectionHitShader.get() };
       shaders[RTDS_DIRECTIONAL_CALL] = {.shader = directionalLightCallShader.get()};
@@ -374,19 +368,17 @@ class RaytracingPipelineApp final : public examples::SimpleWindowedApplication,
       SHitShaderGroup hitGroups[E_RAY_TYPE::ERT_COUNT * E_GEOM_TYPE::EGT_COUNT];
       hitGroups[getHitGroupIndex(EGT_TRIANGLES, ERT_PRIMARY)] = {
         .closestHitShaderIndex = RTDS_CLOSEST_HIT,
-        .anyHitShaderIndex = RTDS_ANYHIT_COLOR,
+        .anyHitShaderIndex = RTDS_ANYHIT_PRIMARY,
       };
       hitGroups[getHitGroupIndex(EGT_TRIANGLES, ERT_OCCLUSION)] = {
-        .closestHitShaderIndex = RTDS_CLOSEST_HIT,
         .anyHitShaderIndex = RTDS_ANYHIT_SHADOW,
       };
       hitGroups[getHitGroupIndex(EGT_PROCEDURAL, ERT_PRIMARY)] = {
         .closestHitShaderIndex = RTDS_SPHERE_CLOSEST_HIT,
-        .anyHitShaderIndex = RTDS_ANYHIT_COLOR,
+        .anyHitShaderIndex = RTDS_ANYHIT_PRIMARY,
         .intersectionShaderIndex = RTDS_INTERSECTION,
       };
       hitGroups[getHitGroupIndex(EGT_PROCEDURAL, ERT_OCCLUSION)] = {
-        .closestHitShaderIndex = RTDS_CLOSEST_HIT,
         .anyHitShaderIndex = RTDS_ANYHIT_SHADOW,
         .intersectionShaderIndex = RTDS_INTERSECTION,
       };
@@ -546,7 +538,11 @@ class RaytracingPipelineApp final : public examples::SimpleWindowedApplication,
         {
           static matrix4SIMD projection;
 
-          projection = matrix4SIMD::buildProjectionMatrixPerspectiveFovRH(core::radians(fov), io.DisplaySize.x / io.DisplaySize.y, zNear, zFar);
+          projection = matrix4SIMD::buildProjectionMatrixPerspectiveFovRH(
+            core::radians(m_cameraSetting.fov), 
+            io.DisplaySize.x / io.DisplaySize.y, 
+            m_cameraSetting.zNear, 
+            m_cameraSetting.zFar);
 
           return projection;
         }());
@@ -563,11 +559,11 @@ class RaytracingPipelineApp final : public examples::SimpleWindowedApplication,
 
         ImGui::Text("Camera");
 
-        ImGui::SliderFloat("Move speed", &moveSpeed, 0.1f, 10.f);
-        ImGui::SliderFloat("Rotate speed", &rotateSpeed, 0.1f, 10.f);
-        ImGui::SliderFloat("Fov", &fov, 20.f, 150.f);
-        ImGui::SliderFloat("zNear", &zNear, 0.1f, 100.f);
-        ImGui::SliderFloat("zFar", &zFar, 110.f, 10000.f);
+        ImGui::SliderFloat("Move speed", &m_cameraSetting.moveSpeed, 0.1f, 10.f);
+        ImGui::SliderFloat("Rotate speed", &m_cameraSetting.rotateSpeed, 0.1f, 10.f);
+        ImGui::SliderFloat("Fov", &m_cameraSetting.fov, 20.f, 150.f);
+        ImGui::SliderFloat("zNear", &m_cameraSetting.zNear, 0.1f, 100.f);
+        ImGui::SliderFloat("zFar", &m_cameraSetting.zFar, 110.f, 10000.f);
         Light m_oldLight = m_light;
         int light_type = m_light.type;
         ImGui::ListBox("LightType", &light_type, s_lightTypeNames, ELT_COUNT);
@@ -879,8 +875,8 @@ class RaytracingPipelineApp final : public examples::SimpleWindowedApplication,
 
   inline void update()
   {
-    m_camera.setMoveSpeed(moveSpeed);
-    m_camera.setRotateSpeed(rotateSpeed);
+    m_camera.setMoveSpeed(m_cameraSetting.moveSpeed);
+    m_camera.setRotateSpeed(m_cameraSetting.rotateSpeed);
 
     static std::chrono::microseconds previousEventTimestamp{};
 
@@ -1062,11 +1058,11 @@ class RaytracingPipelineApp final : public examples::SimpleWindowedApplication,
       return logFail("Couldn't create Command Pool for geometry creation!");
 
     const auto defaultMaterial = Material{
-      .ambient = {},
+      .ambient = {0.2, 0.1, 0.1},
       .diffuse = {0.8, 0.3, 0.3},
       .specular = {0.8, 0.8, 0.8},
       .shininess = 1.0f,
-      .illum = 2
+      .alpha = 1.0f,
     };
 
     auto getTranslationMatrix = [](float32_t x, float32_t y, float32_t z)
@@ -1096,11 +1092,10 @@ class RaytracingPipelineApp final : public examples::SimpleWindowedApplication,
         .meta = {.type = OT_CUBE, .name = "Cube Mesh 2"},
         .data = gc->createCubeMesh(nbl::core::vector3df(1.5, 1.5, 1.5)),
         .material = Material{
-          .ambient = {},
+          .ambient = {0.1, 0.1, 0.2},
           .diffuse = {0.2, 0.2, 0.8},
           .specular = {0.8, 0.8, 0.8},
           .shininess = 1.0f,
-          .illum = 2
         },
         .transform = getTranslationMatrix(-5.0f, 1.0f, 0),
       },
@@ -1108,12 +1103,11 @@ class RaytracingPipelineApp final : public examples::SimpleWindowedApplication,
         .meta = {.type = OT_CUBE, .name = "Transparent Cube Mesh"},
         .data = gc->createCubeMesh(nbl::core::vector3df(1.5, 1.5, 1.5)),
         .material = Material{
-          .ambient = {},
+          .ambient = {0.1, 0.2, 0.1},
           .diffuse = {0.2, 0.8, 0.2},
           .specular = {0.8, 0.8, 0.8},
           .shininess = 1.0f,
-          .dissolve = 0.2,
-          .illum = 4
+          .alpha = 0.8,
         },
         .transform = getTranslationMatrix(5.0f, 1.0f, 0),
       },
@@ -1282,11 +1276,10 @@ class RaytracingPipelineApp final : public examples::SimpleWindowedApplication,
         const auto middle_i = NumberOfProceduralGeometries / 2.0;
         SProceduralGeomInfo sphere = {
           .material = packMaterial({
-            .ambient = {},
+            .ambient = {0.1, 0.05 * i, 0.1},
             .diffuse = {0.3, 0.2 * i, 0.3},
             .specular = {0.8, 0.8, 0.8},
             .shininess = 1.0f,
-            .illum = 2
           }),
           .center = float32_t3((i - middle_i) * 4.0, 2, 5.0),
           .radius = 1,
@@ -1482,7 +1475,7 @@ class RaytracingPipelineApp final : public examples::SimpleWindowedApplication,
           triangles[i].vertexStride = vertexStride;
           triangles[i].vertexFormat = EF_R32G32B32_SFLOAT;
           triangles[i].indexType = gpuObject.indexType;
-          triangles[i].geometryFlags = gpuObject.material.illum == 4 ? 
+          triangles[i].geometryFlags = gpuObject.material.isTransparent() ?
             IGPUBottomLevelAccelerationStructure::GEOMETRY_FLAGS::NO_DUPLICATE_ANY_HIT_INVOCATION_BIT :
             IGPUBottomLevelAccelerationStructure::GEOMETRY_FLAGS::OPAQUE_BIT;
 
@@ -1749,10 +1742,18 @@ class RaytracingPipelineApp final : public examples::SimpleWindowedApplication,
   InputSystem::ChannelReader<IMouseEventChannel> m_mouse;
   InputSystem::ChannelReader<IKeyboardEventChannel> m_keyboard;
 
-  float fov = 60.f, zNear = 0.1f, zFar = 10000.f, moveSpeed = 1.f, rotateSpeed = 1.f;
-  float viewWidth = 10.f;
-  float camYAngle = 165.f / 180.f * 3.14159f;
-  float camXAngle = 32.f / 180.f * 3.14159f;
+  struct CameraSetting
+  {
+    float fov = 60.f;
+    float zNear = 0.1f;
+    float zFar = 10000.f;
+    float moveSpeed = 1.f;
+    float rotateSpeed = 1.f;
+    float viewWidth = 10.f;
+    float camYAngle = 165.f / 180.f * 3.14159f;
+    float camXAngle = 32.f / 180.f * 3.14159f;
+    
+  } m_cameraSetting;
   Camera m_camera = Camera(core::vectorSIMDf(0, 0, 0), core::vectorSIMDf(0, 0, 0), core::matrix4SIMD());
 
   Light m_light = {

From 3bb858b45166be7b8c4b48a3d465697c7a6aadc8 Mon Sep 17 00:00:00 2001
From: keptsecret <sorchon@gmail.com>
Date: Thu, 13 Feb 2025 17:01:20 +0700
Subject: [PATCH 046/529] scene representation, getmeasure for pt

---
 .../app_resources/hlsl/common.hlsl            |  21 +++
 .../app_resources/hlsl/intersector.hlsl       |  72 ++++++++-
 .../app_resources/hlsl/pathtracer.hlsl        | 144 ++++++++++++++++--
 3 files changed, 227 insertions(+), 10 deletions(-)

diff --git a/31_HLSLPathTracer/app_resources/hlsl/common.hlsl b/31_HLSLPathTracer/app_resources/hlsl/common.hlsl
index e5940aab0..7289d508d 100644
--- a/31_HLSLPathTracer/app_resources/hlsl/common.hlsl
+++ b/31_HLSLPathTracer/app_resources/hlsl/common.hlsl
@@ -474,6 +474,27 @@ struct Shape<PST_RECTANGLE>
     PTPolygonMethod polygonMethod;
 };
 
+struct Scene
+{
+    NBL_CONSTEXPR_STATIC_INLINE uint32_t maxSphereCount = 25;
+    NBL_CONSTEXPR_STATIC_INLINE uint32_t maxTriangleCount = 12;
+    NBL_CONSTEXPR_STATIC_INLINE uint32_t maxRectangleCount = 12;
+
+    Shape<PST_SPHERE> spheres[maxSphereCount];
+    Shape<PST_TRIANGLE> triangles[maxTriangleCount];
+    Shape<PST_RECTANGLE> rectangles[maxRectangleCount];
+
+    uint32_t sphereCount;
+    uint32_t triangleCount;
+    uint32_t rectangleCount;
+
+    Light lights[];
+    // Material materials[];
+    // + obj count for each
+
+    // AS ases;
+};
+
 }
 }
 }
diff --git a/31_HLSLPathTracer/app_resources/hlsl/intersector.hlsl b/31_HLSLPathTracer/app_resources/hlsl/intersector.hlsl
index 919816019..23706402a 100644
--- a/31_HLSLPathTracer/app_resources/hlsl/intersector.hlsl
+++ b/31_HLSLPathTracer/app_resources/hlsl/intersector.hlsl
@@ -23,6 +23,76 @@ namespace Intersector
 
 struct IntersectData
 {
+    static IntersectData encode(uint32_t mode, ProceduralShapeType type, NBL_CONST_REF_ARG(Scene) scene)
+    {
+        IntersectData retval;
+        retval.mode = mode;
+
+        uint32_t objCount = (type == PST_SPHERE) ? scene.sphereCount :
+                            (type == PST_TRIANGLE) ? scene.triangleCount :
+                            (type == PST_RECTANGLE) ? scene.rectangleCount :
+                            -1;
+        retval.data[0] = objCount;
+        retval.data[1] = type;
+        
+        switch (type)
+        {
+            case PST_SPHERE:
+            {
+                for (int i = 0; i < objCount; i++)
+                {
+                    Shape<PST_SPHERE> sphere = scene.spheres[i];
+                    retval.data[2 + i * Shape<PST_SPHERE>::ObjSize] = asuint(sphere.position.x);
+                    retval.data[2 + i * Shape<PST_SPHERE>::ObjSize + 1] = asuint(sphere.position.y);
+                    retval.data[2 + i * Shape<PST_SPHERE>::ObjSize + 2] = asuint(sphere.position.z);
+                    retval.data[2 + i * Shape<PST_SPHERE>::ObjSize + 3] = asuint(sphere.radius);
+                    retval.data[2 + i * Shape<PST_SPHERE>::ObjSize + 4] = sphere.bsdfLightIDs;
+                }
+            }
+            break;
+            case PST_TRIANGLE:
+            {
+                for (int i = 0; i < objCount; i++)
+                {
+                    Shape<PST_TRIANGLE> tri = scene.triangles[i];
+                    retval.data[2 + i * Shape<PST_TRIANGLE>::ObjSize] = asuint(tri.vertex0.x);
+                    retval.data[2 + i * Shape<PST_TRIANGLE>::ObjSize + 1] = asuint(tri.vertex0.y);
+                    retval.data[2 + i * Shape<PST_TRIANGLE>::ObjSize + 2] = asuint(tri.vertex0.z);
+                    retval.data[2 + i * Shape<PST_TRIANGLE>::ObjSize + 3] = asuint(tri.vertex1.x);
+                    retval.data[2 + i * Shape<PST_TRIANGLE>::ObjSize + 4] = asuint(tri.vertex1.y);
+                    retval.data[2 + i * Shape<PST_TRIANGLE>::ObjSize + 5] = asuint(tri.vertex1.z);
+                    retval.data[2 + i * Shape<PST_TRIANGLE>::ObjSize + 6] = asuint(tri.vertex2.x);
+                    retval.data[2 + i * Shape<PST_TRIANGLE>::ObjSize + 7] = asuint(tri.vertex2.y);
+                    retval.data[2 + i * Shape<PST_TRIANGLE>::ObjSize + 8] = asuint(tri.vertex2.z);
+                    retval.data[2 + i * Shape<PST_TRIANGLE>::ObjSize + 9] = tri.bsdfLightIDs;
+                }
+            }
+            break;
+            case PST_RECTANGLE:
+            {
+                for (int i = 0; i < objCount; i++)
+                {
+                    Shape<PST_RECTANGLE> rect = scene.rectangles[i];
+                    retval.data[2 + i * Shape<PST_RECTANGLE>::ObjSize] = asuint(rect.offset.x);
+                    retval.data[2 + i * Shape<PST_RECTANGLE>::ObjSize + 1] = asuint(rect.offset.y);
+                    retval.data[2 + i * Shape<PST_RECTANGLE>::ObjSize + 2] = asuint(rect.offset.z);
+                    retval.data[2 + i * Shape<PST_RECTANGLE>::ObjSize + 3] = asuint(rect.edge0.x);
+                    retval.data[2 + i * Shape<PST_RECTANGLE>::ObjSize + 4] = asuint(rect.edge0.y);
+                    retval.data[2 + i * Shape<PST_RECTANGLE>::ObjSize + 5] = asuint(rect.edge0.z);
+                    retval.data[2 + i * Shape<PST_RECTANGLE>::ObjSize + 6] = asuint(rect.edge1.x);
+                    retval.data[2 + i * Shape<PST_RECTANGLE>::ObjSize + 7] = asuint(rect.edge1.y);
+                    retval.data[2 + i * Shape<PST_RECTANGLE>::ObjSize + 8] = asuint(rect.edge1.z);
+                    retval.data[2 + i * Shape<PST_RECTANGLE>::ObjSize + 9] = rect.bsdfLightIDs;
+                }
+            }
+            break;
+            default:
+                // for ASes
+                break;
+        }
+        return retval;        
+    }
+
     enum class Mode : uint32_t
     {
         RAY_QUERY,
@@ -49,7 +119,7 @@ struct Comprehensive
         const uint32_t objCount = intersect.data[0];
         const ProceduralShapeType type = intersect.data[1];
 
-        int objectID = -1;
+        int objectID = ray.objectID;
         for (int i = 0; i < objCount; i++)
         {
             float t;
diff --git a/31_HLSLPathTracer/app_resources/hlsl/pathtracer.hlsl b/31_HLSLPathTracer/app_resources/hlsl/pathtracer.hlsl
index 9ca0f77e4..06950b825 100644
--- a/31_HLSLPathTracer/app_resources/hlsl/pathtracer.hlsl
+++ b/31_HLSLPathTracer/app_resources/hlsl/pathtracer.hlsl
@@ -10,27 +10,153 @@ namespace ext
 namespace PathTracer
 {
 
+template<typename BxDFCreation, typename Scalar>
+struct PathTracerCreationParams
+{
+    // rng gen
+    uint32_t2 rngState;
+
+    // ray gen
+    vector<Scalar, 2> pixOffsetParam;
+    vector<Scalar, 3> camPos;
+    vector<Scalar, 4> NDC;
+    matrix<Scalar, 4, 4> invMVP;
+
+    // mat
+    BxDFCreation diffuseParams;
+    BxDFCreation conductorParams;
+    BxDFCreation dielectricParams;
+};
+
 template<class RandGen, class RayGen, class Intersector, class MaterialSystem, /* class PathGuider, */ class NextEventEstimator>
 struct Unidirectional
 {
     using this_t = Unidirectional<RandGen, RayGen, Intersector, MaterialSystem, NextEventEstimator>;
+    using randgen_type = RandGen;
+    using raygen_type = RayGen;
+    using intersector_type = Intersector;
+    using material_system_type = MaterialSystem;
+    using nee_type = NextEventEstimator;
+
+    using scalar_type = typename MaterialSystem::scalar_type;
+    using vector3_type = vector<scalar_type, 3>;
+    using measure_type = typename MaterialSystem::measure_type;
+    using ray_type = typename RayGen::ray_type;
 
-    static this_t create(RandGen randGen,
-                        RayGen rayGen,
-                        Intersector intersector,
-                        MaterialSystem materialSystem,
-                        /* PathGuider pathGuider, */
-                        NextEventEstimator nee)
-    {}
+    // static this_t create(RandGen randGen,
+    //                     RayGen rayGen,
+    //                     Intersector intersector,
+    //                     MaterialSystem materialSystem,
+    //                     /* PathGuider pathGuider, */
+    //                     NextEventEstimator nee)
+    // {}
 
-    // closest hit
+    static this_t create(NBL_CONST_REF_ARG(PathTracerCreationParams) params)
+    {
+        this_t retval;
+        retval.randGen = randgen_type::create(params.rngState);
+        retval.rayGen = raygen_type::create(params.pixOffsetParam, params.camPos, params.NDC, params.invMVP);
+        retval.materialSystem = material_system_type::create(diffuseParams, conductorParams, dielectricParams);
+        return retval;
+    }
+
+    // TODO: get working, what is sampleSequence stuff
+    vector3_type rand3d(uint32_t protoDimension, uint32_t _sample)
+    {
+        uint32_t address = spirv::bitfieldInsert(protoDimension, _sample, MAX_DEPTH_LOG2, MAX_SAMPLES_LOG2);
+	    unit32_t3 seqVal = texelFetch(sampleSequence, int(address) + i).xyz;
+	    seqVal ^= unit32_t3(randGen(), randGen(), randGen());
+        return vector3_type(seqVal) * asfloat(0x2f800004u);
+    }
+
+    bool closestHitProgram(unit32_t depth, uint32_t _sample, NBL_REF_ARG(ray_type) ray)
+    {
+        const uint32_t objectID = ray.objectID;
+        const vector3_type intersection = ray.origin + ray.direction * ray.intersectionT;
+
+        uint32_t bsdfLightIDs;
+    }
+
+    void missProgram(NBL_REF_ARG(ray_type) ray)
+    {
+        vector3_type finalContribution = ray.payload.throughput; 
+        // #ifdef USE_ENVMAP
+        //     vec2 uv = SampleSphericalMap(_immutable.direction);
+        //     finalContribution *= textureLod(envMap, uv, 0.0).rgb;
+        // #else
+        const vector3_type kConstantEnvLightRadiance = vector3_type(0.15, 0.21, 0.3);   // TODO: match spectral_type
+        finalContribution *= kConstantEnvLightRadiance;
+        ray.payload.accumulation += finalContribution;
+        // #endif
+    }
 
     // Li
-    MaterialSystem::measure_type getMeasure()
+    measure_type getMeasure(uint32_t numSamples, uint32_t depth, NBL_CONST_REF_ARG(Scene) scene)
     {
         // loop through bounces, do closest hit
         // return ray.payload.accumulation --> color
+
+        // TODO: not hardcode this, pass value from somewhere?, where to get objects?
+        Intersector::IntersectData data;
+
+        measure_type Li = (measure_type)0.0;
+        scalar_type meanLumaSq = 0.0;
+        for (uint32_t i = 0; i < numSamples; i++)
+        {
+            vector3_type uvw = rand3d(0u, i);
+            ray_type ray = rayGen.generate(uvw);
+
+            // bounces
+            bool hit = true;
+            bool rayAlive = true;
+            for (int d = 1; d <= depth && hit && rayAlive; d += 2)
+            {
+                ray.intersectionT = numeric_limits<scalar_type>::max;
+                ray.objectID = -1;  // start with no intersect
+                
+                // prodedural shapes
+                if (scene.sphereCount > 0)
+                {
+                    data = Intersector::IntersectData::encode(Intersector::IntersectData::Mode::PROCEDURAL, PST_SPHERE, scene);
+                    ray.objectID = intersector.traceRay(ray, data);
+                }
+
+                if (scene.triangleCount > 0)
+                {
+                    data = Intersector::IntersectData::encode(Intersector::IntersectData::Mode::PROCEDURAL, PST_TRIANGLE, scene);
+                    ray.objectID = intersector.traceRay(ray, data);
+                }
+
+                if (scene.rectangleCount > 0)
+                {
+                    data = Intersector::IntersectData::encode(Intersector::IntersectData::Mode::PROCEDURAL, PST_RECTANGLE, scene);
+                    ray.objectID = intersector.traceRay(ray, data);
+                }
+
+                // TODO: trace AS
+
+                hit = ray.objectID != -1;
+                if (hit)
+                    rayAlive = closestHitProgram(d, i, ray);
+            }
+            if (!hit)
+                missProgram(ray);
+
+            spectral_type accumulation = ray.payload.accumulation;
+            scalar_type rcpSampleSize = 1.0 / (i + 1);
+            Li += (accumulation - Li) * rcpSampleSize;
+
+            // TODO: visualize high variance
+        }
+
+        return Li;
     }
+
+    randgen_type randGen;
+    raygen_type rayGen;
+    intersector_type intersector;
+    material_system_type materialSystem;
+    nee_type nee;
 };
 
 }

From f5adbf6494a28624db0b6204f34b0235a8687c3c Mon Sep 17 00:00:00 2001
From: keptsecret <sorchon@gmail.com>
Date: Fri, 14 Feb 2025 16:35:04 +0700
Subject: [PATCH 047/529] moved scene rep out, some closest hit stuff

---
 .../app_resources/hlsl/common.hlsl            |  68 +++----
 .../app_resources/hlsl/intersector.hlsl       |  84 +-------
 .../hlsl/next_event_estimator.hlsl            |   8 +-
 .../app_resources/hlsl/pathtracer.hlsl        |  60 +++++-
 .../app_resources/hlsl/scene.hlsl             | 190 ++++++++++++++++++
 5 files changed, 289 insertions(+), 121 deletions(-)
 create mode 100644 31_HLSLPathTracer/app_resources/hlsl/scene.hlsl

diff --git a/31_HLSLPathTracer/app_resources/hlsl/common.hlsl b/31_HLSLPathTracer/app_resources/hlsl/common.hlsl
index 7289d508d..00d35a2a9 100644
--- a/31_HLSLPathTracer/app_resources/hlsl/common.hlsl
+++ b/31_HLSLPathTracer/app_resources/hlsl/common.hlsl
@@ -8,7 +8,7 @@
 #include <nbl/builtin/hlsl/shapes/rectangle.hlsl>
 #include <nbl/builtin/hlsl/sampling/spherical_triangle.hlsl>
 #include <nbl/builtin/hlsl/sampling/projected_spherical_triangle.hlsl>
-//#include <nbl/builtin/hlsl/shapes/rectangle.hlsl>
+#include <nbl/builtin/hlsl/sampling/spherical_rectangle.hlsl>
 
 namespace nbl
 {
@@ -32,6 +32,20 @@ struct Payload
     // #endif
 };
 
+enum ProceduralShapeType : uint16_t
+{
+    PST_SPHERE,
+    PST_TRIANGLE,
+    PST_RECTANGLE
+};
+
+struct ObjectID
+{
+    uint32_t id;
+    uint32_t mode;
+    ProceduralShapeType shapeType;
+};
+
 template<typename T>
 struct Ray
 {
@@ -46,7 +60,7 @@ struct Ray
 
     // mutable
     scalar_type intersectionT;
-    uint32_t objectID;
+    ObjectID objectID;
 
     Payload<T> payload;
 };
@@ -56,10 +70,24 @@ struct Light
 {
     using spectral_type = Spectrum;
 
+    NBL_CONSTEXPR_STATIC_INLINE uint32_t INVALID_ID = 0xffffu;
+
     spectral_type radiance;
-    uint32_t objectID;
+    ObjectID objectID;
 };
 
+template<class Spectrum>
+struct BxDFNode
+{
+    using spectral_type = Spectrum;
+    using params_type = bxdf::SBxDFCreationParams<float, spectral_type>;
+
+    NBL_CONSTEXPR_STATIC_INLINE uint32_t INVALID_ID = 0xffffu;
+
+    params_type params;
+    ObjectID objectID;
+}
+
 template<typename T>
 struct Tolerance
 {
@@ -82,13 +110,6 @@ struct Tolerance
     }
 }
 
-enum ProceduralShapeType : uint16_t
-{
-    PST_SPHERE,
-    PST_TRIANGLE,
-    PST_RECTANGLE
-};
-
 enum PTPolygonMethod : uint16_t
 {
     PPM_AREA,
@@ -145,7 +166,7 @@ struct Shape<PST_SPHERE>
     }
 
     template<class Aniso>
-    float generate_and_pdf(NBL_REF_ARG(float32_t) pdf, NBL_REF_ARG(float32_t) newRayMaxT, NBL_CONST_REF_ARG(float32_t3) origin, NBL_CONST_REF_ARG(Aniso) interaction, bool isBSDF, float32_t3 xi, uint32_t objectID)
+    float generate_and_pdf(NBL_REF_ARG(float32_t) pdf, NBL_REF_ARG(float32_t) newRayMaxT, NBL_CONST_REF_ARG(float32_t3) origin, NBL_CONST_REF_ARG(Aniso) interaction, bool isBSDF, float32_t3 xi)
     {
         float32_t3 Z = position - origin;
         const float distanceSQ = nbl::hlsl::dot(Z,Z);
@@ -257,7 +278,7 @@ struct Shape<PST_TRIANGLE>
     }
 
     template<class Aniso>
-    float32_t3 generate_and_pdf(NBL_REF_ARG(float32_t) pdf, NBL_REF_ARG(float32_t) newRayMaxT, NBL_CONST_REF_ARG(float32_t3) origin, NBL_CONST_REF_ARG(Aniso) interaction, bool isBSDF, float32_t3 xi, uint32_t objectID)
+    float32_t3 generate_and_pdf(NBL_REF_ARG(float32_t) pdf, NBL_REF_ARG(float32_t) newRayMaxT, NBL_CONST_REF_ARG(float32_t3) origin, NBL_CONST_REF_ARG(Aniso) interaction, bool isBSDF, float32_t3 xi)
     {
         switch(polygonMethod)
         {
@@ -409,7 +430,7 @@ struct Shape<PST_RECTANGLE>
     }
 
     template<class Aniso>
-    float32_t3 generate_and_pdf(NBL_REF_ARG(float32_t) pdf, NBL_REF_ARG(float32_t) newRayMaxT, NBL_CONST_REF_ARG(float32_t3) origin, NBL_CONST_REF_ARG(Aniso) interaction, bool isBSDF, float32_t3 xi, uint32_t objectID)
+    float32_t3 generate_and_pdf(NBL_REF_ARG(float32_t) pdf, NBL_REF_ARG(float32_t) newRayMaxT, NBL_CONST_REF_ARG(float32_t3) origin, NBL_CONST_REF_ARG(Aniso) interaction, bool isBSDF, float32_t3 xi)
     {
         const float32_t3 N = getNormalTimesArea();
         const float32_t3 origin2origin = offset - origin;
@@ -474,27 +495,6 @@ struct Shape<PST_RECTANGLE>
     PTPolygonMethod polygonMethod;
 };
 
-struct Scene
-{
-    NBL_CONSTEXPR_STATIC_INLINE uint32_t maxSphereCount = 25;
-    NBL_CONSTEXPR_STATIC_INLINE uint32_t maxTriangleCount = 12;
-    NBL_CONSTEXPR_STATIC_INLINE uint32_t maxRectangleCount = 12;
-
-    Shape<PST_SPHERE> spheres[maxSphereCount];
-    Shape<PST_TRIANGLE> triangles[maxTriangleCount];
-    Shape<PST_RECTANGLE> rectangles[maxRectangleCount];
-
-    uint32_t sphereCount;
-    uint32_t triangleCount;
-    uint32_t rectangleCount;
-
-    Light lights[];
-    // Material materials[];
-    // + obj count for each
-
-    // AS ases;
-};
-
 }
 }
 }
diff --git a/31_HLSLPathTracer/app_resources/hlsl/intersector.hlsl b/31_HLSLPathTracer/app_resources/hlsl/intersector.hlsl
index 23706402a..b2d858ef6 100644
--- a/31_HLSLPathTracer/app_resources/hlsl/intersector.hlsl
+++ b/31_HLSLPathTracer/app_resources/hlsl/intersector.hlsl
@@ -23,76 +23,6 @@ namespace Intersector
 
 struct IntersectData
 {
-    static IntersectData encode(uint32_t mode, ProceduralShapeType type, NBL_CONST_REF_ARG(Scene) scene)
-    {
-        IntersectData retval;
-        retval.mode = mode;
-
-        uint32_t objCount = (type == PST_SPHERE) ? scene.sphereCount :
-                            (type == PST_TRIANGLE) ? scene.triangleCount :
-                            (type == PST_RECTANGLE) ? scene.rectangleCount :
-                            -1;
-        retval.data[0] = objCount;
-        retval.data[1] = type;
-        
-        switch (type)
-        {
-            case PST_SPHERE:
-            {
-                for (int i = 0; i < objCount; i++)
-                {
-                    Shape<PST_SPHERE> sphere = scene.spheres[i];
-                    retval.data[2 + i * Shape<PST_SPHERE>::ObjSize] = asuint(sphere.position.x);
-                    retval.data[2 + i * Shape<PST_SPHERE>::ObjSize + 1] = asuint(sphere.position.y);
-                    retval.data[2 + i * Shape<PST_SPHERE>::ObjSize + 2] = asuint(sphere.position.z);
-                    retval.data[2 + i * Shape<PST_SPHERE>::ObjSize + 3] = asuint(sphere.radius);
-                    retval.data[2 + i * Shape<PST_SPHERE>::ObjSize + 4] = sphere.bsdfLightIDs;
-                }
-            }
-            break;
-            case PST_TRIANGLE:
-            {
-                for (int i = 0; i < objCount; i++)
-                {
-                    Shape<PST_TRIANGLE> tri = scene.triangles[i];
-                    retval.data[2 + i * Shape<PST_TRIANGLE>::ObjSize] = asuint(tri.vertex0.x);
-                    retval.data[2 + i * Shape<PST_TRIANGLE>::ObjSize + 1] = asuint(tri.vertex0.y);
-                    retval.data[2 + i * Shape<PST_TRIANGLE>::ObjSize + 2] = asuint(tri.vertex0.z);
-                    retval.data[2 + i * Shape<PST_TRIANGLE>::ObjSize + 3] = asuint(tri.vertex1.x);
-                    retval.data[2 + i * Shape<PST_TRIANGLE>::ObjSize + 4] = asuint(tri.vertex1.y);
-                    retval.data[2 + i * Shape<PST_TRIANGLE>::ObjSize + 5] = asuint(tri.vertex1.z);
-                    retval.data[2 + i * Shape<PST_TRIANGLE>::ObjSize + 6] = asuint(tri.vertex2.x);
-                    retval.data[2 + i * Shape<PST_TRIANGLE>::ObjSize + 7] = asuint(tri.vertex2.y);
-                    retval.data[2 + i * Shape<PST_TRIANGLE>::ObjSize + 8] = asuint(tri.vertex2.z);
-                    retval.data[2 + i * Shape<PST_TRIANGLE>::ObjSize + 9] = tri.bsdfLightIDs;
-                }
-            }
-            break;
-            case PST_RECTANGLE:
-            {
-                for (int i = 0; i < objCount; i++)
-                {
-                    Shape<PST_RECTANGLE> rect = scene.rectangles[i];
-                    retval.data[2 + i * Shape<PST_RECTANGLE>::ObjSize] = asuint(rect.offset.x);
-                    retval.data[2 + i * Shape<PST_RECTANGLE>::ObjSize + 1] = asuint(rect.offset.y);
-                    retval.data[2 + i * Shape<PST_RECTANGLE>::ObjSize + 2] = asuint(rect.offset.z);
-                    retval.data[2 + i * Shape<PST_RECTANGLE>::ObjSize + 3] = asuint(rect.edge0.x);
-                    retval.data[2 + i * Shape<PST_RECTANGLE>::ObjSize + 4] = asuint(rect.edge0.y);
-                    retval.data[2 + i * Shape<PST_RECTANGLE>::ObjSize + 5] = asuint(rect.edge0.z);
-                    retval.data[2 + i * Shape<PST_RECTANGLE>::ObjSize + 6] = asuint(rect.edge1.x);
-                    retval.data[2 + i * Shape<PST_RECTANGLE>::ObjSize + 7] = asuint(rect.edge1.y);
-                    retval.data[2 + i * Shape<PST_RECTANGLE>::ObjSize + 8] = asuint(rect.edge1.z);
-                    retval.data[2 + i * Shape<PST_RECTANGLE>::ObjSize + 9] = rect.bsdfLightIDs;
-                }
-            }
-            break;
-            default:
-                // for ASes
-                break;
-        }
-        return retval;        
-    }
-
     enum class Mode : uint32_t
     {
         RAY_QUERY,
@@ -113,13 +43,15 @@ struct Comprehensive
     using scalar_type = typename Ray::scalar_type;
     using ray_type = Ray;
 
-    static int traceProcedural(NBL_REF_ARG(ray_type) ray, NBL_REF_ARG(IntersectData) intersect)
+    static ObjectID traceProcedural(NBL_REF_ARG(ray_type) ray, NBL_REF_ARG(IntersectData) intersect)
     {
         const bool anyHit = ray.intersectionT != numeric_limits<scalar_type>::max;
         const uint32_t objCount = intersect.data[0];
         const ProceduralShapeType type = intersect.data[1];
 
-        int objectID = ray.objectID;
+        ObjectID objectID = ray.objectID;
+        objectID.mode = IntersectData::Mode::PROCEDURAL;
+        objectID.type = type;
         for (int i = 0; i < objCount; i++)
         {
             float t;
@@ -152,13 +84,13 @@ struct Comprehensive
                 break;
                 default:
                     t = numeric_limits<float>::infinity;
-                    break;
+                break;
             }
             
             bool closerIntersection = t > 0.0 && t < ray.intersectionT;
 
             ray.intersectionT = closerIntersection ? t : ray.intersectionT;
-            objectID = closerIntersection ? i : objectID;
+            objectID.id = closerIntersection ? i : objectID.id;
             
             // allowing early out results in a performance regression, WTF!?
             //if (anyHit && closerIntersection)
@@ -167,7 +99,7 @@ struct Comprehensive
         return objectID;
     }
 
-    static int traceRay(NBL_REF_ARG(ray_type) ray, NBL_REF_ARG(IntersectData) intersect)
+    static ObjectID traceRay(NBL_REF_ARG(ray_type) ray, NBL_REF_ARG(IntersectData) intersect)
     {
         const IntersectData::Mode mode = intersect.mode;
         switch (mode)
@@ -188,7 +120,7 @@ struct Comprehensive
             }
             break;
             default:
-                return -1;
+                return ObjectID(-1, IntersectData::Mode::PROCEDURAL, PST_SPHERE);
         }
     }
 };
diff --git a/31_HLSLPathTracer/app_resources/hlsl/next_event_estimator.hlsl b/31_HLSLPathTracer/app_resources/hlsl/next_event_estimator.hlsl
index 5d96ae13e..74cf00926 100644
--- a/31_HLSLPathTracer/app_resources/hlsl/next_event_estimator.hlsl
+++ b/31_HLSLPathTracer/app_resources/hlsl/next_event_estimator.hlsl
@@ -23,7 +23,7 @@ struct Event
         PROCEDURAL
     };
 
-    NBL_CONSTEXPR_STATIC_INLINE uint32_t DataSize = 128;
+    NBL_CONSTEXPR_STATIC_INLINE uint32_t DataSize = 16;
 
     uint32_t mode : 1;
     unit32_t unused : 31;   // possible space for flags
@@ -120,7 +120,7 @@ struct Estimator
             {
                 float32_t3 position = float32_t3(asfloat(intersect.data[2 + Shape<PST_SPHERE>::ObjSize]), asfloat(intersect.data[2 + Shape<PST_SPHERE>::ObjSize + 1]), asfloat(intersect.data[2 + Shape<PST_SPHERE>::ObjSize + 2]));
                 Shape<PST_SPHERE> sphere = Shape<PST_SPHERE>::create(position, asfloat(intersect.data[2 + Shape<PST_SPHERE>::ObjSize + 3]), intersect.data[2 + Shape<PST_SPHERE>::ObjSize + 4]);
-                L = sphere.template generate_and_pdf<interaction_type>(pdf, newRayMaxT, origin, interaction, isBSDF, xi, objectID);
+                L = sphere.template generate_and_pdf<interaction_type>(pdf, newRayMaxT, origin, interaction, isBSDF, xi);
             }
             break;
             case PST_TRIANGLE:
@@ -129,7 +129,7 @@ struct Estimator
                 float32_t3 vertex1 = float32_t3(asfloat(intersect.data[2 + Shape<PST_TRIANGLE>::ObjSize + 3]), asfloat(intersect.data[2 + Shape<PST_SPHERE>::ObjSize + 4]), asfloat(intersect.data[2 + Shape<PST_TRIANGLE>::ObjSize + 5]));
                 float32_t3 vertex2 = float32_t3(asfloat(intersect.data[2 + Shape<PST_TRIANGLE>::ObjSize + 6]), asfloat(intersect.data[2 + Shape<PST_SPHERE>::ObjSize + 7]), asfloat(intersect.data[2 + Shape<PST_TRIANGLE>::ObjSize + 8]));
                 Shape<PST_TRIANGLE> tri = Shape<PST_TRIANGLE>::create(vertex0, vertex1, vertex2, intersect.data[2 + Shape<PST_TRIANGLE>::ObjSize + 9]);
-                L = tri.template generate_and_pdf<interaction_type>(pdf, newRayMaxT, origin, interaction, isBSDF, xi, objectID);
+                L = tri.template generate_and_pdf<interaction_type>(pdf, newRayMaxT, origin, interaction, isBSDF, xi);
             }
             break;
             case PST_RECTANGLE:
@@ -138,7 +138,7 @@ struct Estimator
                 float32_t3 edge0 = float32_t3(asfloat(intersect.data[2 + Shape<PST_RECTANGLE>::ObjSize + 3]), asfloat(intersect.data[2 + Shape<PST_RECTANGLE>::ObjSize + 4]), asfloat(intersect.data[2 + Shape<PST_RECTANGLE>::ObjSize + 5]));
                 float32_t3 edge1 = float32_t3(asfloat(intersect.data[2 + Shape<PST_RECTANGLE>::ObjSize + 6]), asfloat(intersect.data[2 + Shape<PST_RECTANGLE>::ObjSize + 7]), asfloat(intersect.data[2 + Shape<PST_RECTANGLE>::ObjSize + 8]));
                 Shape<PST_RECTANGLE> rect = Shape<PST_RECTANGLE>::create(offset, edge0, edge1, intersect.data[2 + Shape<PST_RECTANGLE>::ObjSize + 9]);
-                L = rect.template generate_and_pdf<interaction_type>(pdf, newRayMaxT, origin, interaction, isBSDF, xi, objectID);
+                L = rect.template generate_and_pdf<interaction_type>(pdf, newRayMaxT, origin, interaction, isBSDF, xi);
             }
             break;
             default:
diff --git a/31_HLSLPathTracer/app_resources/hlsl/pathtracer.hlsl b/31_HLSLPathTracer/app_resources/hlsl/pathtracer.hlsl
index 06950b825..80a342a86 100644
--- a/31_HLSLPathTracer/app_resources/hlsl/pathtracer.hlsl
+++ b/31_HLSLPathTracer/app_resources/hlsl/pathtracer.hlsl
@@ -42,6 +42,10 @@ struct Unidirectional
     using vector3_type = vector<scalar_type, 3>;
     using measure_type = typename MaterialSystem::measure_type;
     using ray_type = typename RayGen::ray_type;
+    using light_type = Light<measure_type>;
+    using bxdfnode_type = BxDFNode<measure_type>;
+    using anisotropic_type = typename MaterialSystem::anisotropic_type;
+    using isotropic_type = typename anisotropic_type::isotropic_type;
 
     // static this_t create(RandGen randGen,
     //                     RayGen rayGen,
@@ -69,12 +73,54 @@ struct Unidirectional
         return vector3_type(seqVal) * asfloat(0x2f800004u);
     }
 
-    bool closestHitProgram(unit32_t depth, uint32_t _sample, NBL_REF_ARG(ray_type) ray)
+    // TODO: probably will only work with procedural shapes, do the other ones
+    bool closestHitProgram(unit32_t depth, uint32_t _sample, NBL_REF_ARG(ray_type) ray, NBL_CONST_REF_ARG(Scene) scene)
     {
         const uint32_t objectID = ray.objectID;
         const vector3_type intersection = ray.origin + ray.direction * ray.intersectionT;
 
         uint32_t bsdfLightIDs;
+        anisotropic_type interaction;
+        switch (objectID.mode)
+        {
+            // TODO
+            case Intersector::IntersectData::Mode::RAY_QUERY:
+            case Intersector::IntersectData::Mode::RAY_TRACING:
+                break;
+            case Intersector::IntersectData::Mode::PROCEDURAL:
+            {
+                bsdfLightIDs = scene.getBsdfLightIDs(objectID.id);
+                vector3_type N = scene.getNormal(objectID.id)
+                N = nbl::hlsl::normalize(N);
+                typename isotropic_type::ray_dir_info_type V;
+                V.direction = nbl::hlsl::normalize(-ray.direction);
+                isotropic_type iso = isotropic_type::create(V, N);
+                interaction = anisotropic_type::create(iso);
+            }
+            break;
+            default:
+                break;
+        }
+
+        vector3_type throughput = ray.payload.throughput;
+
+        // emissive
+        const uint32_t lightID = spirv::bitfieldExtract(bsdfLightIDs, 16, 16);
+        if (lightID != light_type::INVALID_ID)
+        {
+            float pdf;
+            ray.payload.accumulation += nee.deferredEvalAndPdf(pdf, lights[lightID], ray, scene.toNextEvent(lightID)) * throughput / (1.0 + pdf * pdf * ray.payload.otherTechniqueHeuristic);
+        }
+
+        const uint32_t bsdfID = spirv::bitfieldExtract(bsdfLightIDs, 0, 16);
+        if (bsdfID == bxdfnode_type::INVALID_ID)
+            return false;
+
+        // TODO: ifdef kill diffuse specular paths
+
+        // sample lights
+
+        // sample BSDF
     }
 
     void missProgram(NBL_REF_ARG(ray_type) ray)
@@ -112,32 +158,32 @@ struct Unidirectional
             for (int d = 1; d <= depth && hit && rayAlive; d += 2)
             {
                 ray.intersectionT = numeric_limits<scalar_type>::max;
-                ray.objectID = -1;  // start with no intersect
+                ray.objectID.id = -1;  // start with no intersect
                 
                 // prodedural shapes
                 if (scene.sphereCount > 0)
                 {
-                    data = Intersector::IntersectData::encode(Intersector::IntersectData::Mode::PROCEDURAL, PST_SPHERE, scene);
+                    data = scene.toIntersectData(Intersector::IntersectData::Mode::PROCEDURAL, PST_SPHERE);
                     ray.objectID = intersector.traceRay(ray, data);
                 }
 
                 if (scene.triangleCount > 0)
                 {
-                    data = Intersector::IntersectData::encode(Intersector::IntersectData::Mode::PROCEDURAL, PST_TRIANGLE, scene);
+                    data = scene.toIntersectData(Intersector::IntersectData::Mode::PROCEDURAL, PST_TRIANGLE);
                     ray.objectID = intersector.traceRay(ray, data);
                 }
 
                 if (scene.rectangleCount > 0)
                 {
-                    data = Intersector::IntersectData::encode(Intersector::IntersectData::Mode::PROCEDURAL, PST_RECTANGLE, scene);
+                    data = scene.toIntersectData(Intersector::IntersectData::Mode::PROCEDURAL, PST_RECTANGLE);
                     ray.objectID = intersector.traceRay(ray, data);
                 }
 
                 // TODO: trace AS
 
-                hit = ray.objectID != -1;
+                hit = ray.objectID.id != -1;
                 if (hit)
-                    rayAlive = closestHitProgram(d, i, ray);
+                    rayAlive = closestHitProgram(d, i, ray, scene);
             }
             if (!hit)
                 missProgram(ray);
diff --git a/31_HLSLPathTracer/app_resources/hlsl/scene.hlsl b/31_HLSLPathTracer/app_resources/hlsl/scene.hlsl
new file mode 100644
index 000000000..ea173e1a7
--- /dev/null
+++ b/31_HLSLPathTracer/app_resources/hlsl/scene.hlsl
@@ -0,0 +1,190 @@
+#ifndef _NBL_HLSL_EXT_PATHTRACING_SCENE_INCLUDED_
+#define _NBL_HLSL_EXT_PATHTRACING_SCENE_INCLUDED_
+
+#include "common.hlsl"
+#include "material_system.hlsl"
+#include "next_event_estimator.hlsl"
+#include "intersector.hlsl"
+
+namespace nbl
+{
+namespace hlsl
+{
+namespace ext
+{
+
+struct Scene
+{
+    NBL_CONSTEXPR_STATIC_INLINE uint32_t maxSphereCount = 25;
+    NBL_CONSTEXPR_STATIC_INLINE uint32_t maxTriangleCount = 12;
+    NBL_CONSTEXPR_STATIC_INLINE uint32_t maxRectangleCount = 12;
+
+    Shape<PST_SPHERE> spheres[maxSphereCount];
+    Shape<PST_TRIANGLE> triangles[maxTriangleCount];
+    Shape<PST_RECTANGLE> rectangles[maxRectangleCount];
+
+    uint32_t sphereCount;
+    uint32_t triangleCount;
+    uint32_t rectangleCount;
+
+    NBL_CONSTEXPR_STATIC_INLINE uint32_t maxLightCount = 4;
+
+    Light lights[maxLightCount];
+    uint32_t lightCount;
+    // Material materials[];
+    // + obj count for each
+
+    // AS ases;
+
+    Intersector::IntersectData toIntersectData(uint32_t mode, ProceduralShapeType type)
+    {
+        Intersector::IntersectData retval;
+        retval.mode = mode;
+
+        uint32_t objCount = (type == PST_SPHERE) ? sphereCount :
+                            (type == PST_TRIANGLE) ? triangleCount :
+                            (type == PST_RECTANGLE) ? rectangleCount :
+                            -1;
+        retval.data[0] = objCount;
+        retval.data[1] = type;
+        
+        switch (type)
+        {
+            case PST_SPHERE:
+            {
+                for (int i = 0; i < objCount; i++)
+                {
+                    Shape<PST_SPHERE> sphere = spheres[i];
+                    retval.data[2 + i * Shape<PST_SPHERE>::ObjSize] = asuint(sphere.position.x);
+                    retval.data[2 + i * Shape<PST_SPHERE>::ObjSize + 1] = asuint(sphere.position.y);
+                    retval.data[2 + i * Shape<PST_SPHERE>::ObjSize + 2] = asuint(sphere.position.z);
+                    retval.data[2 + i * Shape<PST_SPHERE>::ObjSize + 3] = asuint(sphere.radius);
+                    retval.data[2 + i * Shape<PST_SPHERE>::ObjSize + 4] = sphere.bsdfLightIDs;
+                }
+            }
+            break;
+            case PST_TRIANGLE:
+            {
+                for (int i = 0; i < objCount; i++)
+                {
+                    Shape<PST_TRIANGLE> tri = triangles[i];
+                    retval.data[2 + i * Shape<PST_TRIANGLE>::ObjSize] = asuint(tri.vertex0.x);
+                    retval.data[2 + i * Shape<PST_TRIANGLE>::ObjSize + 1] = asuint(tri.vertex0.y);
+                    retval.data[2 + i * Shape<PST_TRIANGLE>::ObjSize + 2] = asuint(tri.vertex0.z);
+                    retval.data[2 + i * Shape<PST_TRIANGLE>::ObjSize + 3] = asuint(tri.vertex1.x);
+                    retval.data[2 + i * Shape<PST_TRIANGLE>::ObjSize + 4] = asuint(tri.vertex1.y);
+                    retval.data[2 + i * Shape<PST_TRIANGLE>::ObjSize + 5] = asuint(tri.vertex1.z);
+                    retval.data[2 + i * Shape<PST_TRIANGLE>::ObjSize + 6] = asuint(tri.vertex2.x);
+                    retval.data[2 + i * Shape<PST_TRIANGLE>::ObjSize + 7] = asuint(tri.vertex2.y);
+                    retval.data[2 + i * Shape<PST_TRIANGLE>::ObjSize + 8] = asuint(tri.vertex2.z);
+                    retval.data[2 + i * Shape<PST_TRIANGLE>::ObjSize + 9] = tri.bsdfLightIDs;
+                }
+            }
+            break;
+            case PST_RECTANGLE:
+            {
+                for (int i = 0; i < objCount; i++)
+                {
+                    Shape<PST_RECTANGLE> rect = rectangles[i];
+                    retval.data[2 + i * Shape<PST_RECTANGLE>::ObjSize] = asuint(rect.offset.x);
+                    retval.data[2 + i * Shape<PST_RECTANGLE>::ObjSize + 1] = asuint(rect.offset.y);
+                    retval.data[2 + i * Shape<PST_RECTANGLE>::ObjSize + 2] = asuint(rect.offset.z);
+                    retval.data[2 + i * Shape<PST_RECTANGLE>::ObjSize + 3] = asuint(rect.edge0.x);
+                    retval.data[2 + i * Shape<PST_RECTANGLE>::ObjSize + 4] = asuint(rect.edge0.y);
+                    retval.data[2 + i * Shape<PST_RECTANGLE>::ObjSize + 5] = asuint(rect.edge0.z);
+                    retval.data[2 + i * Shape<PST_RECTANGLE>::ObjSize + 6] = asuint(rect.edge1.x);
+                    retval.data[2 + i * Shape<PST_RECTANGLE>::ObjSize + 7] = asuint(rect.edge1.y);
+                    retval.data[2 + i * Shape<PST_RECTANGLE>::ObjSize + 8] = asuint(rect.edge1.z);
+                    retval.data[2 + i * Shape<PST_RECTANGLE>::ObjSize + 9] = rect.bsdfLightIDs;
+                }
+            }
+            break;
+            default:
+                // for ASes
+                break;
+        }
+        return retval;
+    }
+
+    NextEventEstimator::Event toNextEvent(uint32_t lightID)
+    {
+        NextEventEstimator::Event retval;
+
+        ObjectID objectID = lights[lightID].objectID;
+        retval.mode = objectID.mode;
+
+        retval.data[0] = lightCount;
+        retval.data[1] = objectID.type;
+
+        uint32_t id = objectID.id;
+        switch (type)
+        {
+            case PST_SPHERE:
+            {
+                Shape<PST_SPHERE> sphere = spheres[id];
+                retval.data[2 + Shape<PST_SPHERE>::ObjSize] = asuint(sphere.position.x);
+                retval.data[2 + Shape<PST_SPHERE>::ObjSize + 1] = asuint(sphere.position.y);
+                retval.data[2 + Shape<PST_SPHERE>::ObjSize + 2] = asuint(sphere.position.z);
+                retval.data[2 + Shape<PST_SPHERE>::ObjSize + 3] = asuint(sphere.radius);
+                retval.data[2 + Shape<PST_SPHERE>::ObjSize + 4] = sphere.bsdfLightIDs;
+            }
+            break;
+            case PST_TRIANGLE:
+            {
+                Shape<PST_TRIANGLE> tri = triangles[id];
+                retval.data[2 + Shape<PST_TRIANGLE>::ObjSize] = asuint(tri.vertex0.x);
+                retval.data[2 + Shape<PST_TRIANGLE>::ObjSize + 1] = asuint(tri.vertex0.y);
+                retval.data[2 + Shape<PST_TRIANGLE>::ObjSize + 2] = asuint(tri.vertex0.z);
+                retval.data[2 + Shape<PST_TRIANGLE>::ObjSize + 3] = asuint(tri.vertex1.x);
+                retval.data[2 + Shape<PST_TRIANGLE>::ObjSize + 4] = asuint(tri.vertex1.y);
+                retval.data[2 + Shape<PST_TRIANGLE>::ObjSize + 5] = asuint(tri.vertex1.z);
+                retval.data[2 + Shape<PST_TRIANGLE>::ObjSize + 6] = asuint(tri.vertex2.x);
+                retval.data[2 + Shape<PST_TRIANGLE>::ObjSize + 7] = asuint(tri.vertex2.y);
+                retval.data[2 + Shape<PST_TRIANGLE>::ObjSize + 8] = asuint(tri.vertex2.z);
+                retval.data[2 + Shape<PST_TRIANGLE>::ObjSize + 9] = tri.bsdfLightIDs;
+            }
+            break;
+            case PST_RECTANGLE:
+            {
+                Shape<PST_RECTANGLE> rect = rectangles[id];
+                retval.data[2 + Shape<PST_RECTANGLE>::ObjSize] = asuint(rect.offset.x);
+                retval.data[2 + Shape<PST_RECTANGLE>::ObjSize + 1] = asuint(rect.offset.y);
+                retval.data[2 + Shape<PST_RECTANGLE>::ObjSize + 2] = asuint(rect.offset.z);
+                retval.data[2 + Shape<PST_RECTANGLE>::ObjSize + 3] = asuint(rect.edge0.x);
+                retval.data[2 + Shape<PST_RECTANGLE>::ObjSize + 4] = asuint(rect.edge0.y);
+                retval.data[2 + Shape<PST_RECTANGLE>::ObjSize + 5] = asuint(rect.edge0.z);
+                retval.data[2 + Shape<PST_RECTANGLE>::ObjSize + 6] = asuint(rect.edge1.x);
+                retval.data[2 + Shape<PST_RECTANGLE>::ObjSize + 7] = asuint(rect.edge1.y);
+                retval.data[2 + Shape<PST_RECTANGLE>::ObjSize + 8] = asuint(rect.edge1.z);
+                retval.data[2 + Shape<PST_RECTANGLE>::ObjSize + 9] = rect.bsdfLightIDs;
+            }
+            break;
+            default:
+                // for ASes
+                break;
+        }
+        return retval;
+    }
+
+    // TODO: get these to work with AS types as well
+    uint32_t getBsdfLightIDs(uint32_t id)
+    {
+        return (objectID.type == PST_SPHERE) ? spheres[id].bsdfLightIDs :
+                (objectID.type == PST_TRIANGLE) ? triangles[id].bsdfLightIDs :
+                (objectID.type == PST_RECTANGLE) ? rectangles[id].bsdfLightIDs : -1;
+    }
+
+    float32_t3 getNormal(uint32_t id, NBL_CONST_REF_ARG(float32_t3) intersection)
+    {
+        return (objectID.type == PST_SPHERE) ? scene.spheres[id].getNormal(intersection) :
+                (objectID.type == PST_TRIANGLE) ? scene.triangles[id].getNormalTimesArea() :
+                (objectID.type == PST_RECTANGLE) ? scene.rectangles[id].getNormalTimesArea() :
+                (float32_t3)0.0;
+    }
+};
+
+}
+}
+}
+
+#endif

From 159d1533e8d82e3c5e82165e8b79ea67c0f23111 Mon Sep 17 00:00:00 2001
From: keptsecret <sorchon@gmail.com>
Date: Mon, 17 Feb 2025 16:58:03 +0700
Subject: [PATCH 048/529] sample light part of closest hit

---
 .../app_resources/hlsl/common.hlsl            |   1 +
 .../app_resources/hlsl/intersector.hlsl       |   2 +-
 .../app_resources/hlsl/material_system.hlsl   |   8 +-
 .../hlsl/next_event_estimator.hlsl            |   2 +-
 .../app_resources/hlsl/pathtracer.hlsl        | 132 ++++++++++++++++--
 .../app_resources/hlsl/scene.hlsl             |   7 +-
 6 files changed, 137 insertions(+), 15 deletions(-)

diff --git a/31_HLSLPathTracer/app_resources/hlsl/common.hlsl b/31_HLSLPathTracer/app_resources/hlsl/common.hlsl
index 00d35a2a9..7d29dabd4 100644
--- a/31_HLSLPathTracer/app_resources/hlsl/common.hlsl
+++ b/31_HLSLPathTracer/app_resources/hlsl/common.hlsl
@@ -84,6 +84,7 @@ struct BxDFNode
 
     NBL_CONSTEXPR_STATIC_INLINE uint32_t INVALID_ID = 0xffffu;
 
+    uint32_t materialType;
     params_type params;
     ObjectID objectID;
 }
diff --git a/31_HLSLPathTracer/app_resources/hlsl/intersector.hlsl b/31_HLSLPathTracer/app_resources/hlsl/intersector.hlsl
index b2d858ef6..60aa7143b 100644
--- a/31_HLSLPathTracer/app_resources/hlsl/intersector.hlsl
+++ b/31_HLSLPathTracer/app_resources/hlsl/intersector.hlsl
@@ -23,7 +23,7 @@ namespace Intersector
 
 struct IntersectData
 {
-    enum class Mode : uint32_t
+    enum Mode : uint32_t    // enum class?
     {
         RAY_QUERY,
         RAY_TRACING,
diff --git a/31_HLSLPathTracer/app_resources/hlsl/material_system.hlsl b/31_HLSLPathTracer/app_resources/hlsl/material_system.hlsl
index 1f13198fa..b89bfbd40 100644
--- a/31_HLSLPathTracer/app_resources/hlsl/material_system.hlsl
+++ b/31_HLSLPathTracer/app_resources/hlsl/material_system.hlsl
@@ -15,7 +15,7 @@ namespace MaterialSystem
 
 struct Material
 {
-    enum class Type : uint32_t
+    enum Type : uint32_t    // enum class?
     {
         DIFFUSE,
         CONDUCTOR,
@@ -29,7 +29,7 @@ struct Material
     uint32_t data[DataSize];
 };
 
-template<class DiffuseBxDF, class ConductorBxDF, class DielectricBxDF>
+template<class DiffuseBxDF, class ConductorBxDF, class DielectricBxDF>  // NOTE: these bxdfs should match the ones in Scene BxDFNode
 struct System
 {
     using this_t = System<DiffuseBxDF, ConductorBxDF, DielectricBxDF>;
@@ -42,6 +42,10 @@ struct System
     using anisocache_type = typename ConductorBxDF::anisocache_type;
     using params_t = SBxDFParams<scalar_type>;
 
+    using diffuse_op_type = DiffuseBxDF;
+    using conductor_op_type = ConductorBxDF;
+    using dielectric_op_type = DielectricBxDF;
+
     static this_t create(NBL_CONST_REF_ARG(SBxDFCreationParams<scalar_type, measure_type>) diffuseParams, NBL_CONST_REF_ARG(SBxDFCreationParams<scalar_type, measure_type>) conductorParams, NBL_CONST_REF_ARG(SBxDFCreationParams<scalar_type, measure_type>) dielectricParams)
     {
         diffuseBxDF = DiffuseBxDF::create(diffuseParams);
diff --git a/31_HLSLPathTracer/app_resources/hlsl/next_event_estimator.hlsl b/31_HLSLPathTracer/app_resources/hlsl/next_event_estimator.hlsl
index 74cf00926..c6380094d 100644
--- a/31_HLSLPathTracer/app_resources/hlsl/next_event_estimator.hlsl
+++ b/31_HLSLPathTracer/app_resources/hlsl/next_event_estimator.hlsl
@@ -16,7 +16,7 @@ namespace NextEventEstimator
 
 struct Event
 {
-    enum class Mode : uint32_t
+    enum Mode : uint32_t    // enum class?
     {
         RAY_QUERY,
         RAY_TRACING,
diff --git a/31_HLSLPathTracer/app_resources/hlsl/pathtracer.hlsl b/31_HLSLPathTracer/app_resources/hlsl/pathtracer.hlsl
index 80a342a86..e4638703a 100644
--- a/31_HLSLPathTracer/app_resources/hlsl/pathtracer.hlsl
+++ b/31_HLSLPathTracer/app_resources/hlsl/pathtracer.hlsl
@@ -1,6 +1,16 @@
 #ifndef _NBL_HLSL_EXT_PATHTRACER_INCLUDED_
 #define _NBL_HLSL_EXT_PATHTRACER_INCLUDED_
 
+#include <nbl/builtin/hlsl/colorspace/EOTF.hlsl>
+#include <nbl/builtin/hlsl/colorspace/encodeCIEXYZ.hlsl>
+#include <nbl/builtin/hlsl/math/functions.hlsl>
+
+#include "rand_gen.hlsl"
+#include "ray_gen.hlsl"
+#include "intersector.hlsl"
+#include "material_system.hlsl"
+#include "next_event_estimator.hlsl"
+
 namespace nbl
 {
 namespace hlsl
@@ -41,11 +51,20 @@ struct Unidirectional
     using scalar_type = typename MaterialSystem::scalar_type;
     using vector3_type = vector<scalar_type, 3>;
     using measure_type = typename MaterialSystem::measure_type;
+    using sample_type = typename NextEventEstimator::sample_type;
     using ray_type = typename RayGen::ray_type;
     using light_type = Light<measure_type>;
     using bxdfnode_type = BxDFNode<measure_type>;
     using anisotropic_type = typename MaterialSystem::anisotropic_type;
     using isotropic_type = typename anisotropic_type::isotropic_type;
+    using anisocache_type = typename MaterialSystem::anisocache_type;
+    using isocache_type = typename anisocache_type::isocache_type;
+    using quotient_pdf_type = typename NextEventEstimator::quotient_pdf_type;
+    using params_type = typename MaterialSystem::params_t;
+
+    using diffuse_op_type = typename MaterialSystem::diffuse_op_type;
+    using conductor_op_type = typename MaterialSystem::conductor_op_type;
+    using dielectric_op_type = typename MaterialSystem::dielectric_op_type;
 
     // static this_t create(RandGen randGen,
     //                     RayGen rayGen,
@@ -73,6 +92,11 @@ struct Unidirectional
         return vector3_type(seqVal) * asfloat(0x2f800004u);
     }
 
+    scalar_type getLuma(NBL_CONST_REF_ARG(vector3_type) col)
+    {
+        return nbl::hlsl::dot(nbl::hlsl::transpose(colorspace::scRGBtoXYZ)[1], col);
+    }
+
     // TODO: probably will only work with procedural shapes, do the other ones
     bool closestHitProgram(unit32_t depth, uint32_t _sample, NBL_REF_ARG(ray_type) ray, NBL_CONST_REF_ARG(Scene) scene)
     {
@@ -81,21 +105,22 @@ struct Unidirectional
 
         uint32_t bsdfLightIDs;
         anisotropic_type interaction;
+        isotropic_type iso_interaction;
         switch (objectID.mode)
         {
             // TODO
-            case Intersector::IntersectData::Mode::RAY_QUERY:
-            case Intersector::IntersectData::Mode::RAY_TRACING:
+            case ext::Intersector::IntersectData::Mode::RAY_QUERY:
+            case ext::Intersector::IntersectData::Mode::RAY_TRACING:
                 break;
-            case Intersector::IntersectData::Mode::PROCEDURAL:
+            case ext::Intersector::IntersectData::Mode::PROCEDURAL:
             {
                 bsdfLightIDs = scene.getBsdfLightIDs(objectID.id);
                 vector3_type N = scene.getNormal(objectID.id)
                 N = nbl::hlsl::normalize(N);
                 typename isotropic_type::ray_dir_info_type V;
                 V.direction = nbl::hlsl::normalize(-ray.direction);
-                isotropic_type iso = isotropic_type::create(V, N);
-                interaction = anisotropic_type::create(iso);
+                isotropic_type iso_interaction = isotropic_type::create(V, N);
+                interaction = anisotropic_type::create(iso_interaction);
             }
             break;
             default:
@@ -116,9 +141,98 @@ struct Unidirectional
         if (bsdfID == bxdfnode_type::INVALID_ID)
             return false;
 
+        BxDFNode bxdf = scene.bxdfs[bsdfID];
+
         // TODO: ifdef kill diffuse specular paths
 
+        const bool isBSDF = (bxdf.materialType == ext::MaterialSystem::Material::DIFFUSE) ? bxdf_traits<diffuse_op_type>::type == BT_BSDF :
+                            (bxdf.materialType == ext::MaterialSystem::Material::CONDUCTOR) ? bxdf_traits<conductor_op_type>::type == BT_BSDF :
+                            bxdf_traits<dielectric_op_type>::type == BT_BSDF;
+
+        vector3_type eps0 = rand3d(depth, _sample);
+        vector3_type eps1 = rand3d(depth, _sample);
+        vector3_type eps2 = rand3d(depth, _sample);
+
+        // thresholds
+        const scalar_type bsdfPdfThreshold = 0.0001;
+        const scalar_type lumaContributionThreshold = getLuma(colorspace::eotf::sRGB<vector3_type>((vector3_type)1.0 / 255.0)); // OETF smallest perceptible value
+        const vector3_type throughputCIE_Y = nbl::hlsl::transpose(colorspace::sRGBtoXYZ)[1] * throughput;   // TODO: this only works if spectral_type is dim 3
+        const scalar_type monochromeEta = nbl::hlsl::dot(throughputCIE_Y, BSDFNode_getEta(bsdf)[0]) / (throughputCIE_Y.r + throughputCIE_Y.g + throughputCIE_Y.b);  // TODO: fix getEta, what is real eta
+
         // sample lights
+        const scalar_type neeProbability = 1.0;// BSDFNode_getNEEProb(bsdf);
+        scalar_type rcpChoiceProb;
+        if (!math::partitionRandVariable(neeProbability, eps0.z, rcpChoiceProb) && depth < 2u)
+        {
+            quotient_pdf_type neeContrib_pdf;
+            scalar_type t;
+            sample_type nee_sample = nee.generate_and_quotient_and_pdf(
+                neeContrib_pdf, t,
+                intersection, interaction,
+                isBSDF, eps0, depth
+            );
+
+            // We don't allow non watertight transmitters in this renderer
+            bool validPath = nee_sample.NdotL > numeric_limits<scalar_type>::min;
+            // but if we allowed non-watertight transmitters (single water surface), it would make sense just to apply this line by itself
+            anisocache_type _cache;
+            validPath = validPath && anisocache_type::compute(_cache, interaction, nee_sample, monochromeEta);
+
+            if (neeContrib_pdf.pdf < numeric_limits<scalar_type>::max)
+            {
+                if (nbl::hlsl::any(isnan(nee_sample.L)))
+                    ray.payload.accumulation += vector3_type(1000.f, 0.f, 0.f);
+                else if (nbl::hlsl::all((vector3_type)69.f == nee_sample.L))
+                    ray.payload.accumulation += vector3_type(0.f, 1000.f, 0.f);
+                else if (validPath)
+                {
+                    ext::MaterialSystem::Material material;
+                    material.type = bxdf.materialType;
+                    params_type params;
+
+                    // TODO: does not yet account for smooth dielectric
+                    if (!isBSDF && bxdf.materialType == ext::MaterialSystem::Material::DIFFUSE)
+                    {
+                        params = params_type::template create<sample_type, isotropic_type>(nee_sample, iso_interaction, bxdf::BCM_MAX);
+                    }
+                    else if (!isBSDF && bxdf.materialType != ext::MaterialSystem::Material::DIFFUSE)
+                    {
+                        if (bxdf.params.is_aniso)
+                            params = params_type::template create<sample_type, anisotropic_type, anisocache_type>(nee_sample, interaction, _cache, bxdf::BCM_MAX);
+                        else
+                        {
+                            isocache = (iso_cache)_cache;
+                            params = params_type::template create<sample_type, isotropic_type, isocache_type>(nee_sample, iso_interaction, isocache, bxdf::BCM_MAX);
+                        }
+                    }
+                    else if (isBSDF && bxdf.materialType == ext::MaterialSystem::Material::DIFFUSE)
+                    {
+                        params = params_type::template create<sample_type, isotropic_type>(nee_sample, iso_interaction, bxdf::BCM_ABS);
+                    }
+                    else if (isBSDF && bxdf.materialType != ext::MaterialSystem::Material::DIFFUSE)
+                    {
+                        if (bxdf.params.is_aniso)
+                            params = params_type::template create<sample_type, anisotropic_type, anisocache_type>(nee_sample, interaction, _cache, bxdf::BCM_ABS);
+                        else
+                        {
+                            isocache = (iso_cache)_cache;
+                            params = params_type::template create<sample_type, isotropic_type, isocache_type>(nee_sample, iso_interaction, isocache, bxdf::BCM_ABS);
+                        }
+                    }
+
+                    quotient_pdf_type bsdf_quotient_pdf = materialSystem.quotient_and_pdf(material, params) * throughput;
+                    neeContrib_pdf.quotient *= bsdf_quotient_pdf.quotient;
+                    const scalar_type otherGenOverChoice = bsdf_quotient_pdf.pdf * rcpChoiceProb;
+                    const scalar_type otherGenOverLightAndChoice = otherGenOverChoice / bsdf_quotient_pdf.pdf;
+                    neeContrib_pdf.quotient *= otherGenOverChoice/(1.f + otherGenOverLightAndChoice * otherGenOverLightAndChoice);   // balance heuristic
+
+                    // TODO: ifdef NEE only
+
+                    if (bsdf_quotient_pdf.pdf < numeric_limits<scalar_type>::max && getLuma(neeContrib_pdf.quotient) > lumaContributionThreshold && traceRay(t,intersection+nee_sample.L*t*getStartTolerance(depth),nee_sample.L)==-1)
+                        ray._payload.accumulation += neeContrib_pdf.quotient;
+                }
+            }
+        }
 
         // sample BSDF
     }
@@ -143,7 +257,7 @@ struct Unidirectional
         // return ray.payload.accumulation --> color
 
         // TODO: not hardcode this, pass value from somewhere?, where to get objects?
-        Intersector::IntersectData data;
+        ext::Intersector::IntersectData data;
 
         measure_type Li = (measure_type)0.0;
         scalar_type meanLumaSq = 0.0;
@@ -163,19 +277,19 @@ struct Unidirectional
                 // prodedural shapes
                 if (scene.sphereCount > 0)
                 {
-                    data = scene.toIntersectData(Intersector::IntersectData::Mode::PROCEDURAL, PST_SPHERE);
+                    data = scene.toIntersectData(ext::Intersector::IntersectData::Mode::PROCEDURAL, PST_SPHERE);
                     ray.objectID = intersector.traceRay(ray, data);
                 }
 
                 if (scene.triangleCount > 0)
                 {
-                    data = scene.toIntersectData(Intersector::IntersectData::Mode::PROCEDURAL, PST_TRIANGLE);
+                    data = scene.toIntersectData(ext::Intersector::IntersectData::Mode::PROCEDURAL, PST_TRIANGLE);
                     ray.objectID = intersector.traceRay(ray, data);
                 }
 
                 if (scene.rectangleCount > 0)
                 {
-                    data = scene.toIntersectData(Intersector::IntersectData::Mode::PROCEDURAL, PST_RECTANGLE);
+                    data = scene.toIntersectData(ext::Intersector::IntersectData::Mode::PROCEDURAL, PST_RECTANGLE);
                     ray.objectID = intersector.traceRay(ray, data);
                 }
 
diff --git a/31_HLSLPathTracer/app_resources/hlsl/scene.hlsl b/31_HLSLPathTracer/app_resources/hlsl/scene.hlsl
index ea173e1a7..fe4dea8b3 100644
--- a/31_HLSLPathTracer/app_resources/hlsl/scene.hlsl
+++ b/31_HLSLPathTracer/app_resources/hlsl/scene.hlsl
@@ -31,8 +31,11 @@ struct Scene
 
     Light lights[maxLightCount];
     uint32_t lightCount;
-    // Material materials[];
-    // + obj count for each
+    
+    NBL_CONSTEXPR_STATIC_INLINE uint32_t maxBxdfCount = 16; // TODO: limit change?
+
+    BxDFNode bxdfs[maxBxdfCount];
+    uint32_t bxdfCount;
 
     // AS ases;
 

From a7350db7d7e422fa5086982b3327103c06cfbe44 Mon Sep 17 00:00:00 2001
From: keptsecret <sorchon@gmail.com>
Date: Tue, 18 Feb 2025 15:23:52 +0700
Subject: [PATCH 049/529] fix bugs, reorganize traceRay

---
 .../app_resources/hlsl/intersector.hlsl       | 32 +++++++++++++
 .../app_resources/hlsl/material_system.hlsl   | 16 +++++--
 .../app_resources/hlsl/pathtracer.hlsl        | 47 ++++++-------------
 .../app_resources/hlsl/scene.hlsl             |  8 +++-
 4 files changed, 65 insertions(+), 38 deletions(-)

diff --git a/31_HLSLPathTracer/app_resources/hlsl/intersector.hlsl b/31_HLSLPathTracer/app_resources/hlsl/intersector.hlsl
index 60aa7143b..cf2d3ae7c 100644
--- a/31_HLSLPathTracer/app_resources/hlsl/intersector.hlsl
+++ b/31_HLSLPathTracer/app_resources/hlsl/intersector.hlsl
@@ -123,6 +123,38 @@ struct Comprehensive
                 return ObjectID(-1, IntersectData::Mode::PROCEDURAL, PST_SPHERE);
         }
     }
+
+    template<typename Scene>
+    static ObjectID traceRay(NBL_REF_ARG(ray_type) ray, NBL_CONST_REF_ARG(Scene) scene)
+    {
+        IntersectData data;
+
+        ObjectID objectID;
+        objectID.id = -1;  // start with no intersect
+                
+        // prodedural shapes
+        if (scene.sphereCount > 0)
+        {
+            data = scene.toIntersectData(ext::Intersector::IntersectData::Mode::PROCEDURAL, PST_SPHERE);
+            objectID = intersector.traceRay(ray, data);
+        }
+
+        if (scene.triangleCount > 0)
+        {
+            data = scene.toIntersectData(ext::Intersector::IntersectData::Mode::PROCEDURAL, PST_TRIANGLE);
+            objectID = intersector.traceRay(ray, data);
+        }
+
+        if (scene.rectangleCount > 0)
+        {
+            data = scene.toIntersectData(ext::Intersector::IntersectData::Mode::PROCEDURAL, PST_RECTANGLE);
+            objectID = intersector.traceRay(ray, data);
+        }
+
+        // TODO: trace AS
+
+        return objectID;
+    }
 };
 
 // does everything in traceray in ex 30
diff --git a/31_HLSLPathTracer/app_resources/hlsl/material_system.hlsl b/31_HLSLPathTracer/app_resources/hlsl/material_system.hlsl
index b89bfbd40..1d5587443 100644
--- a/31_HLSLPathTracer/app_resources/hlsl/material_system.hlsl
+++ b/31_HLSLPathTracer/app_resources/hlsl/material_system.hlsl
@@ -41,6 +41,7 @@ struct System
     using anisotropic_type = typename DiffuseBxDF::anisotropic_type;
     using anisocache_type = typename ConductorBxDF::anisocache_type;
     using params_t = SBxDFParams<scalar_type>;
+    using create_params_t = SBxDFCreationParams<scalar_type, measure_type>;
 
     using diffuse_op_type = DiffuseBxDF;
     using conductor_op_type = ConductorBxDF;
@@ -53,22 +54,25 @@ struct System
         dielectricBxDF = DiffuseBxDF::create(dielectricParams);
     }
 
-    static measure_type eval(NBL_CONST_REF_ARG(Material) material, NBL_CONST_REF_ARG(params_t) params)
+    static measure_type eval(NBL_CONST_REF_ARG(Material) material, NBL_CONST_REF_ARG(create_params_t) cparams, NBL_CONST_REF_ARG(params_t) params)
     {
         switch(material.type)
         {
             case DIFFUSE:
             {
+                diffuseBxDF.init(cparams);
                 return (measure_type)diffuseBxDF.eval(params);
             }
             break;
             case CONDUCTOR:
             {
+                conductorBxDF.init(cparams);
                 return conductorBxDF.eval(params);
             }
             break;
             case DIELECTRIC:
             {
+                dielectricBxDF.init(cparams);
                 return dielectricBxDF.eval(params);
             }
             break;
@@ -77,22 +81,25 @@ struct System
         }
     }
 
-    static vector3_type generate(NBL_CONST_REF_ARG(Material) material, anisotropic_type interaction, vector2_type u, NBL_REF_ARG(anisocache_type) cache)
+    static vector3_type generate(NBL_CONST_REF_ARG(Material) material, NBL_CONST_REF_ARG(create_params_t) cparams, anisotropic_type interaction, vector2_type u, NBL_REF_ARG(anisocache_type) cache)
     {
         switch(material.type)
         {
             case DIFFUSE:
             {
+                diffuseBxDF.init(cparams);
                 return diffuseBxDF.generate(interaction, u);
             }
             break;
             case CONDUCTOR:
             {
+                conductorBxDF.init(cparams);
                 return conductorBxDF.generate(interaction, u, cache);
             }
             break;
             case DIELECTRIC:
             {
+                dielectricBxDF.init(cparams);
                 return dielectricBxDF.generate(interaction, u, cache);
             }
             break;
@@ -101,7 +108,7 @@ struct System
         }
     }
 
-    static quotient_pdf_type quotient_and_pdf(NBL_CONST_REF_ARG(Material) material, NBL_CONST_REF_ARG(params_t) params)
+    static quotient_pdf_type quotient_and_pdf(NBL_CONST_REF_ARG(Material) material, NBL_CONST_REF_ARG(create_params_t) cparams, NBL_CONST_REF_ARG(params_t) params)
     {
         const float minimumProjVectorLen = 0.00000001;
         if (params.NdotV > minimumProjVectorLen && params.NdotL > minimumProjVectorLen)
@@ -110,16 +117,19 @@ struct System
             {
                 case DIFFUSE:
                 {
+                    diffuseBxDF.init(cparams);
                     return diffuseBxDF.quotient_and_pdf(params);
                 }
                 break;
                 case CONDUCTOR:
                 {
+                    conductorBxDF.init(cparams);
                     return conductorBxDF.quotient_and_pdf(params);
                 }
                 break;
                 case DIELECTRIC:
                 {
+                    dielectricBxDF.init(cparams);
                     return dielectricBxDF.quotient_and_pdf(params);
                 }
                 break;
diff --git a/31_HLSLPathTracer/app_resources/hlsl/pathtracer.hlsl b/31_HLSLPathTracer/app_resources/hlsl/pathtracer.hlsl
index e4638703a..8d8d9a201 100644
--- a/31_HLSLPathTracer/app_resources/hlsl/pathtracer.hlsl
+++ b/31_HLSLPathTracer/app_resources/hlsl/pathtracer.hlsl
@@ -61,6 +61,7 @@ struct Unidirectional
     using isocache_type = typename anisocache_type::isocache_type;
     using quotient_pdf_type = typename NextEventEstimator::quotient_pdf_type;
     using params_type = typename MaterialSystem::params_t;
+    using scene_type = Scene<light_type, bxdfnode_type>;
 
     using diffuse_op_type = typename MaterialSystem::diffuse_op_type;
     using conductor_op_type = typename MaterialSystem::conductor_op_type;
@@ -98,7 +99,7 @@ struct Unidirectional
     }
 
     // TODO: probably will only work with procedural shapes, do the other ones
-    bool closestHitProgram(unit32_t depth, uint32_t _sample, NBL_REF_ARG(ray_type) ray, NBL_CONST_REF_ARG(Scene) scene)
+    bool closestHitProgram(unit32_t depth, uint32_t _sample, NBL_REF_ARG(ray_type) ray, NBL_CONST_REF_ARG(scene_type) scene)
     {
         const uint32_t objectID = ray.objectID;
         const vector3_type intersection = ray.origin + ray.direction * ray.intersectionT;
@@ -157,7 +158,8 @@ struct Unidirectional
         const scalar_type bsdfPdfThreshold = 0.0001;
         const scalar_type lumaContributionThreshold = getLuma(colorspace::eotf::sRGB<vector3_type>((vector3_type)1.0 / 255.0)); // OETF smallest perceptible value
         const vector3_type throughputCIE_Y = nbl::hlsl::transpose(colorspace::sRGBtoXYZ)[1] * throughput;   // TODO: this only works if spectral_type is dim 3
-        const scalar_type monochromeEta = nbl::hlsl::dot(throughputCIE_Y, BSDFNode_getEta(bsdf)[0]) / (throughputCIE_Y.r + throughputCIE_Y.g + throughputCIE_Y.b);  // TODO: fix getEta, what is real eta
+        const measure_type eta = bxdf.params.ior0 / bxdf.params.ior1;   // assume it's real, not imaginary?
+        const scalar_type monochromeEta = nbl::hlsl::dot(throughputCIE_Y, eta) / (throughputCIE_Y.r + throughputCIE_Y.g + throughputCIE_Y.b);  // TODO: imaginary eta?
 
         // sample lights
         const scalar_type neeProbability = 1.0;// BSDFNode_getNEEProb(bsdf);
@@ -177,6 +179,8 @@ struct Unidirectional
             // but if we allowed non-watertight transmitters (single water surface), it would make sense just to apply this line by itself
             anisocache_type _cache;
             validPath = validPath && anisocache_type::compute(_cache, interaction, nee_sample, monochromeEta);
+            bxdf.params.A = nbl::hlsl::max(bxdf.params.A, vector<scalar_type, 2>(0,0));
+            bxdf.params.eta = monochromeEta;
 
             if (neeContrib_pdf.pdf < numeric_limits<scalar_type>::max)
             {
@@ -220,7 +224,7 @@ struct Unidirectional
                         }
                     }
 
-                    quotient_pdf_type bsdf_quotient_pdf = materialSystem.quotient_and_pdf(material, params) * throughput;
+                    quotient_pdf_type bsdf_quotient_pdf = materialSystem.quotient_and_pdf(material, bxdf.params, params) * throughput;
                     neeContrib_pdf.quotient *= bsdf_quotient_pdf.quotient;
                     const scalar_type otherGenOverChoice = bsdf_quotient_pdf.pdf * rcpChoiceProb;
                     const scalar_type otherGenOverLightAndChoice = otherGenOverChoice / bsdf_quotient_pdf.pdf;
@@ -228,7 +232,11 @@ struct Unidirectional
 
                     // TODO: ifdef NEE only
 
-                    if (bsdf_quotient_pdf.pdf < numeric_limits<scalar_type>::max && getLuma(neeContrib_pdf.quotient) > lumaContributionThreshold && traceRay(t,intersection+nee_sample.L*t*getStartTolerance(depth),nee_sample.L)==-1)
+                    ray_type nee_ray;
+                    nee_ray.origin = intersection + nee_sample.L * t * Tolerance<scalar_type>::getStart(depth);
+                    nee_ray.direction = nee_sample.L;
+                    nee_ray.intersectionT = t;
+                    if (bsdf_quotient_pdf.pdf < numeric_limits<scalar_type>::max && getLuma(neeContrib_pdf.quotient) > lumaContributionThreshold && intersector.traceRay(nee_ray, scene).id == -1)
                         ray._payload.accumulation += neeContrib_pdf.quotient;
                 }
             }
@@ -251,14 +259,8 @@ struct Unidirectional
     }
 
     // Li
-    measure_type getMeasure(uint32_t numSamples, uint32_t depth, NBL_CONST_REF_ARG(Scene) scene)
+    measure_type getMeasure(uint32_t numSamples, uint32_t depth, NBL_CONST_REF_ARG(scene_type) scene)
     {
-        // loop through bounces, do closest hit
-        // return ray.payload.accumulation --> color
-
-        // TODO: not hardcode this, pass value from somewhere?, where to get objects?
-        ext::Intersector::IntersectData data;
-
         measure_type Li = (measure_type)0.0;
         scalar_type meanLumaSq = 0.0;
         for (uint32_t i = 0; i < numSamples; i++)
@@ -272,28 +274,7 @@ struct Unidirectional
             for (int d = 1; d <= depth && hit && rayAlive; d += 2)
             {
                 ray.intersectionT = numeric_limits<scalar_type>::max;
-                ray.objectID.id = -1;  // start with no intersect
-                
-                // prodedural shapes
-                if (scene.sphereCount > 0)
-                {
-                    data = scene.toIntersectData(ext::Intersector::IntersectData::Mode::PROCEDURAL, PST_SPHERE);
-                    ray.objectID = intersector.traceRay(ray, data);
-                }
-
-                if (scene.triangleCount > 0)
-                {
-                    data = scene.toIntersectData(ext::Intersector::IntersectData::Mode::PROCEDURAL, PST_TRIANGLE);
-                    ray.objectID = intersector.traceRay(ray, data);
-                }
-
-                if (scene.rectangleCount > 0)
-                {
-                    data = scene.toIntersectData(ext::Intersector::IntersectData::Mode::PROCEDURAL, PST_RECTANGLE);
-                    ray.objectID = intersector.traceRay(ray, data);
-                }
-
-                // TODO: trace AS
+                ray.objectID = intersector.traceRay(ray, scene);
 
                 hit = ray.objectID.id != -1;
                 if (hit)
diff --git a/31_HLSLPathTracer/app_resources/hlsl/scene.hlsl b/31_HLSLPathTracer/app_resources/hlsl/scene.hlsl
index fe4dea8b3..cbc9d153c 100644
--- a/31_HLSLPathTracer/app_resources/hlsl/scene.hlsl
+++ b/31_HLSLPathTracer/app_resources/hlsl/scene.hlsl
@@ -13,8 +13,12 @@ namespace hlsl
 namespace ext
 {
 
+template<typename Light, typename BxdfNode>
 struct Scene
 {
+    using light_type = Light;
+    using bxdfnode_type = BxdfNode;
+
     NBL_CONSTEXPR_STATIC_INLINE uint32_t maxSphereCount = 25;
     NBL_CONSTEXPR_STATIC_INLINE uint32_t maxTriangleCount = 12;
     NBL_CONSTEXPR_STATIC_INLINE uint32_t maxRectangleCount = 12;
@@ -29,12 +33,12 @@ struct Scene
 
     NBL_CONSTEXPR_STATIC_INLINE uint32_t maxLightCount = 4;
 
-    Light lights[maxLightCount];
+    light_type lights[maxLightCount];
     uint32_t lightCount;
     
     NBL_CONSTEXPR_STATIC_INLINE uint32_t maxBxdfCount = 16; // TODO: limit change?
 
-    BxDFNode bxdfs[maxBxdfCount];
+    bxdfnode_type bxdfs[maxBxdfCount];
     uint32_t bxdfCount;
 
     // AS ases;

From 8a4e0a94aab11c6eb0072ca0044db26ffe433a91 Mon Sep 17 00:00:00 2001
From: keptsecret <sorchon@gmail.com>
Date: Tue, 18 Feb 2025 16:45:20 +0700
Subject: [PATCH 050/529] sample bsdf in closest hit

---
 .../app_resources/hlsl/material_system.hlsl   |  6 +-
 .../app_resources/hlsl/pathtracer.hlsl        | 95 ++++++++++++++++---
 2 files changed, 86 insertions(+), 15 deletions(-)

diff --git a/31_HLSLPathTracer/app_resources/hlsl/material_system.hlsl b/31_HLSLPathTracer/app_resources/hlsl/material_system.hlsl
index 1d5587443..038bd578a 100644
--- a/31_HLSLPathTracer/app_resources/hlsl/material_system.hlsl
+++ b/31_HLSLPathTracer/app_resources/hlsl/material_system.hlsl
@@ -81,20 +81,20 @@ struct System
         }
     }
 
-    static vector3_type generate(NBL_CONST_REF_ARG(Material) material, NBL_CONST_REF_ARG(create_params_t) cparams, anisotropic_type interaction, vector2_type u, NBL_REF_ARG(anisocache_type) cache)
+    static vector3_type generate(NBL_CONST_REF_ARG(Material) material, NBL_CONST_REF_ARG(create_params_t) cparams, anisotropic_type interaction, NBL_CONST_REF_ARG(vector3_type) u, NBL_REF_ARG(anisocache_type) cache)
     {
         switch(material.type)
         {
             case DIFFUSE:
             {
                 diffuseBxDF.init(cparams);
-                return diffuseBxDF.generate(interaction, u);
+                return diffuseBxDF.generate(interaction, u.xy);
             }
             break;
             case CONDUCTOR:
             {
                 conductorBxDF.init(cparams);
-                return conductorBxDF.generate(interaction, u, cache);
+                return conductorBxDF.generate(interaction, u.xy, cache);
             }
             break;
             case DIELECTRIC:
diff --git a/31_HLSLPathTracer/app_resources/hlsl/pathtracer.hlsl b/31_HLSLPathTracer/app_resources/hlsl/pathtracer.hlsl
index 8d8d9a201..e20ef705b 100644
--- a/31_HLSLPathTracer/app_resources/hlsl/pathtracer.hlsl
+++ b/31_HLSLPathTracer/app_resources/hlsl/pathtracer.hlsl
@@ -61,6 +61,7 @@ struct Unidirectional
     using isocache_type = typename anisocache_type::isocache_type;
     using quotient_pdf_type = typename NextEventEstimator::quotient_pdf_type;
     using params_type = typename MaterialSystem::params_t;
+    using create_params_type = typename MaterialSystem::create_params_t;
     using scene_type = Scene<light_type, bxdfnode_type>;
 
     using diffuse_op_type = typename MaterialSystem::diffuse_op_type;
@@ -75,17 +76,17 @@ struct Unidirectional
     //                     NextEventEstimator nee)
     // {}
 
-    static this_t create(NBL_CONST_REF_ARG(PathTracerCreationParams) params)
+    static this_t create(NBL_CONST_REF_ARG(PathTracerCreationParams<create_params_type, scalar_type>) params, Buffer samplerSequence)
     {
         this_t retval;
         retval.randGen = randgen_type::create(params.rngState);
         retval.rayGen = raygen_type::create(params.pixOffsetParam, params.camPos, params.NDC, params.invMVP);
         retval.materialSystem = material_system_type::create(diffuseParams, conductorParams, dielectricParams);
+        retval.samplerSequence = samplerSequence;
         return retval;
     }
 
-    // TODO: get working, what is sampleSequence stuff
-    vector3_type rand3d(uint32_t protoDimension, uint32_t _sample)
+    vector3_type rand3d(uint32_t protoDimension, uint32_t _sample, uint32_t i)
     {
         uint32_t address = spirv::bitfieldInsert(protoDimension, _sample, MAX_DEPTH_LOG2, MAX_SAMPLES_LOG2);
 	    unit32_t3 seqVal = texelFetch(sampleSequence, int(address) + i).xyz;
@@ -150,19 +151,18 @@ struct Unidirectional
                             (bxdf.materialType == ext::MaterialSystem::Material::CONDUCTOR) ? bxdf_traits<conductor_op_type>::type == BT_BSDF :
                             bxdf_traits<dielectric_op_type>::type == BT_BSDF;
 
-        vector3_type eps0 = rand3d(depth, _sample);
-        vector3_type eps1 = rand3d(depth, _sample);
-        vector3_type eps2 = rand3d(depth, _sample);
+        vector3_type eps0 = rand3d(depth, _sample, 0u);
+        vector3_type eps1 = rand3d(depth, _sample, 1u);
 
         // thresholds
-        const scalar_type bsdfPdfThreshold = 0.0001;
+        const scalar_type bxdfPdfThreshold = 0.0001;
         const scalar_type lumaContributionThreshold = getLuma(colorspace::eotf::sRGB<vector3_type>((vector3_type)1.0 / 255.0)); // OETF smallest perceptible value
         const vector3_type throughputCIE_Y = nbl::hlsl::transpose(colorspace::sRGBtoXYZ)[1] * throughput;   // TODO: this only works if spectral_type is dim 3
         const measure_type eta = bxdf.params.ior0 / bxdf.params.ior1;   // assume it's real, not imaginary?
         const scalar_type monochromeEta = nbl::hlsl::dot(throughputCIE_Y, eta) / (throughputCIE_Y.r + throughputCIE_Y.g + throughputCIE_Y.b);  // TODO: imaginary eta?
 
         // sample lights
-        const scalar_type neeProbability = 1.0;// BSDFNode_getNEEProb(bsdf);
+        const scalar_type neeProbability = 1.0; // BSDFNode_getNEEProb(bsdf);
         scalar_type rcpChoiceProb;
         if (!math::partitionRandVariable(neeProbability, eps0.z, rcpChoiceProb) && depth < 2u)
         {
@@ -184,9 +184,9 @@ struct Unidirectional
 
             if (neeContrib_pdf.pdf < numeric_limits<scalar_type>::max)
             {
-                if (nbl::hlsl::any(isnan(nee_sample.L)))
+                if (nbl::hlsl::any(isnan(nee_sample.L.direction)))
                     ray.payload.accumulation += vector3_type(1000.f, 0.f, 0.f);
-                else if (nbl::hlsl::all((vector3_type)69.f == nee_sample.L))
+                else if (nbl::hlsl::all((vector3_type)69.f == nee_sample.L.direction))
                     ray.payload.accumulation += vector3_type(0.f, 1000.f, 0.f);
                 else if (validPath)
                 {
@@ -233,8 +233,8 @@ struct Unidirectional
                     // TODO: ifdef NEE only
 
                     ray_type nee_ray;
-                    nee_ray.origin = intersection + nee_sample.L * t * Tolerance<scalar_type>::getStart(depth);
-                    nee_ray.direction = nee_sample.L;
+                    nee_ray.origin = intersection + nee_sample.L.direction * t * Tolerance<scalar_type>::getStart(depth);
+                    nee_ray.direction = nee_sample.L.direction;
                     nee_ray.intersectionT = t;
                     if (bsdf_quotient_pdf.pdf < numeric_limits<scalar_type>::max && getLuma(neeContrib_pdf.quotient) > lumaContributionThreshold && intersector.traceRay(nee_ray, scene).id == -1)
                         ray._payload.accumulation += neeContrib_pdf.quotient;
@@ -243,6 +243,70 @@ struct Unidirectional
         }
 
         // sample BSDF
+        scalar_type bxdfPdf;
+        vector3_type bxdfSample;
+        {
+            ext::MaterialSystem::Material material;
+            material.type = bxdf.materialType;
+
+            anisocache_type _cache;
+            sample_type bsdf_sample = materialSystem.generate(material, bxdf.params, interaction, eps1, _cache);
+
+            // TODO: does not yet account for smooth dielectric
+            params_type params;            
+            if (!isBSDF && bxdf.materialType == ext::MaterialSystem::Material::DIFFUSE)
+            {
+                params = params_type::template create<sample_type, isotropic_type>(bsdf_sample, iso_interaction, bxdf::BCM_MAX);
+            }
+            else if (!isBSDF && bxdf.materialType != ext::MaterialSystem::Material::DIFFUSE)
+            {
+                if (bxdf.params.is_aniso)
+                    params = params_type::template create<sample_type, anisotropic_type, anisocache_type>(bsdf_sample, interaction, _cache, bxdf::BCM_MAX);
+                else
+                {
+                    isocache = (iso_cache)_cache;
+                    params = params_type::template create<sample_type, isotropic_type, isocache_type>(bsdf_sample, iso_interaction, isocache, bxdf::BCM_MAX);
+                }
+            }
+            else if (isBSDF && bxdf.materialType == ext::MaterialSystem::Material::DIFFUSE)
+            {
+                params = params_type::template create<sample_type, isotropic_type>(bsdf_sample, iso_interaction, bxdf::BCM_ABS);
+            }
+            else if (isBSDF && bxdf.materialType != ext::MaterialSystem::Material::DIFFUSE)
+            {
+                if (bxdf.params.is_aniso)
+                    params = params_type::template create<sample_type, anisotropic_type, anisocache_type>(bsdf_sample, interaction, _cache, bxdf::BCM_ABS);
+                else
+                {
+                    isocache = (iso_cache)_cache;
+                    params = params_type::template create<sample_type, isotropic_type, isocache_type>(bsdf_sample, iso_interaction, isocache, bxdf::BCM_ABS);
+                }
+            }
+
+            // the value of the bsdf divided by the probability of the sample being generated
+            throughput *= materialSystem.quotient_and_pdf(material, bxdf.params, params);
+            bxdfSample = bsdf_sample.L.direction;
+        }
+
+        // additional threshold
+        const float lumaThroughputThreshold = lumaContributionThreshold;
+        if (bxdfPdf > bxdfPdfThreshold && getLuma(throughput) > lumaThroughputThreshold)
+        {
+            ray.payload.throughput = throughput;
+            ray.payload.otherTechniqueHeuristic = neeProbability / bxdfPdf; // numerically stable, don't touch
+            ray.payload.otherTechniqueHeuristic *= ray.payload.otherTechniqueHeuristic;
+                    
+            // trace new ray
+            ray.origin = intersection + bsdfSampleL * (1.0/*kSceneSize*/) * Tolerance<scalar_type>::getStart(depth);
+            ray.direction = bxdfSample;
+            // #if POLYGON_METHOD==2
+            // ray._immutable.normalAtOrigin = interaction.isotropic.N;
+            // ray._immutable.wasBSDFAtOrigin = isBSDF;
+            // #endif
+            return true;
+        }
+
+        return false;
     }
 
     void missProgram(NBL_REF_ARG(ray_type) ray)
@@ -288,16 +352,23 @@ struct Unidirectional
             Li += (accumulation - Li) * rcpSampleSize;
 
             // TODO: visualize high variance
+
+            // TODO: russian roulette early exit?
         }
 
         return Li;
     }
 
+    NBL_CONSTEXPR_STATIC_INLINE uint32_t MAX_DEPTH_LOG2 = 4u;
+    NBL_CONSTEXPR_STATIC_INLINE uint32_t MAX_SAMPLES_LOG2 = 10u;
+
     randgen_type randGen;
     raygen_type rayGen;
     intersector_type intersector;
     material_system_type materialSystem;
     nee_type nee;
+
+    Buffer samplerSequence;
 };
 
 }

From 72104b8b192a447bf8bdce09b1826f4150ce1d6a Mon Sep 17 00:00:00 2001
From: keptsecret <sorchon@gmail.com>
Date: Wed, 19 Feb 2025 16:19:54 +0700
Subject: [PATCH 051/529] set up path tracer render shader

---
 .../app_resources/hlsl/common.hlsl            |  13 +-
 .../hlsl/next_event_estimator.hlsl            |   6 +-
 .../app_resources/hlsl/pathtracer.hlsl        |   6 +-
 .../app_resources/hlsl/rand_gen.hlsl          |   4 +-
 .../app_resources/hlsl/render.comp.hlsl       | 171 ++++++++++++++++++
 5 files changed, 185 insertions(+), 15 deletions(-)
 create mode 100644 31_HLSLPathTracer/app_resources/hlsl/render.comp.hlsl

diff --git a/31_HLSLPathTracer/app_resources/hlsl/common.hlsl b/31_HLSLPathTracer/app_resources/hlsl/common.hlsl
index 7d29dabd4..cc92a33ba 100644
--- a/31_HLSLPathTracer/app_resources/hlsl/common.hlsl
+++ b/31_HLSLPathTracer/app_resources/hlsl/common.hlsl
@@ -86,7 +86,6 @@ struct BxDFNode
 
     uint32_t materialType;
     params_type params;
-    ObjectID objectID;
 }
 
 template<typename T>
@@ -160,8 +159,8 @@ struct Shape<PST_SPHERE>
         return 2.0 * numbers::pi<float> * (1.0 - cosThetaMax);
     }
 
-    template<typename Light, typename Ray>
-    float deferredPdf(NBL_CONST_REF_ARG(Light) light, NBL_CONST_REF_ARG(Ray) ray)
+    template<typename Ray>
+    float deferredPdf(NBL_CONST_REF_ARG(Ray) ray)
     {
         return 1.0 / getSolidAngle(ray.origin);
     }
@@ -245,8 +244,8 @@ struct Shape<PST_TRIANGLE>
         return nbl::hlsl::cross(edges[0], edges[1]) * 0.5f;
     }
 
-    template<typename Light, typename Ray>
-    float deferredPdf(NBL_CONST_REF_ARG(Light) light, NBL_CONST_REF_ARG(Ray) ray)
+    template<typename Ray>
+    float deferredPdf(NBL_CONST_REF_ARG(Ray) ray)
     {
         const float32_t3 L = ray.direction;
         switch (polygonMethod)
@@ -393,8 +392,8 @@ struct Shape<PST_RECTANGLE>
         basis = nbl::hlsl::transpose<matrix3x3_type>(basis);    // TODO: double check transpose
     }
 
-    template<typename Light, typename Ray>
-    float deferredPdf(NBL_CONST_REF_ARG(Light light), NBL_CONST_REF_ARG(Ray) ray)
+    template<typename Ray>
+    float deferredPdf(NBL_CONST_REF_ARG(Ray) ray)
     {
         switch (polygonMethod)
         {
diff --git a/31_HLSLPathTracer/app_resources/hlsl/next_event_estimator.hlsl b/31_HLSLPathTracer/app_resources/hlsl/next_event_estimator.hlsl
index c6380094d..86c26a152 100644
--- a/31_HLSLPathTracer/app_resources/hlsl/next_event_estimator.hlsl
+++ b/31_HLSLPathTracer/app_resources/hlsl/next_event_estimator.hlsl
@@ -53,7 +53,7 @@ struct Estimator
             {
                 float32_t3 position = float32_t3(asfloat(intersect.data[2 + Shape<PST_SPHERE>::ObjSize]), asfloat(intersect.data[2 + Shape<PST_SPHERE>::ObjSize + 1]), asfloat(intersect.data[2 + Shape<PST_SPHERE>::ObjSize + 2]));
                 Shape<PST_SPHERE> sphere = Shape<PST_SPHERE>::create(position, asfloat(intersect.data[2 + Shape<PST_SPHERE>::ObjSize + 3]), intersect.data[2 + Shape<PST_SPHERE>::ObjSize + 4]);
-                pdf *= sphere.template deferredPdf<light_type, ray_type>(light, ray);
+                pdf *= sphere.template deferredPdf<ray_type>(ray);
             }
             break;
             case PST_TRIANGLE:
@@ -62,7 +62,7 @@ struct Estimator
                 float32_t3 vertex1 = float32_t3(asfloat(intersect.data[2 + Shape<PST_TRIANGLE>::ObjSize + 3]), asfloat(intersect.data[2 + Shape<PST_SPHERE>::ObjSize + 4]), asfloat(intersect.data[2 + Shape<PST_TRIANGLE>::ObjSize + 5]));
                 float32_t3 vertex2 = float32_t3(asfloat(intersect.data[2 + Shape<PST_TRIANGLE>::ObjSize + 6]), asfloat(intersect.data[2 + Shape<PST_SPHERE>::ObjSize + 7]), asfloat(intersect.data[2 + Shape<PST_TRIANGLE>::ObjSize + 8]));
                 Shape<PST_TRIANGLE> tri = Shape<PST_TRIANGLE>::create(vertex0, vertex1, vertex2, intersect.data[2 + Shape<PST_TRIANGLE>::ObjSize + 9]);
-                pdf *= tri.template deferredPdf<light_type, ray_type>(light, ray);
+                pdf *= tri.template deferredPdf<ray_type>(ray);
             }
             break;
             case PST_RECTANGLE:
@@ -71,7 +71,7 @@ struct Estimator
                 float32_t3 edge0 = float32_t3(asfloat(intersect.data[2 + Shape<PST_RECTANGLE>::ObjSize + 3]), asfloat(intersect.data[2 + Shape<PST_RECTANGLE>::ObjSize + 4]), asfloat(intersect.data[2 + Shape<PST_RECTANGLE>::ObjSize + 5]));
                 float32_t3 edge1 = float32_t3(asfloat(intersect.data[2 + Shape<PST_RECTANGLE>::ObjSize + 6]), asfloat(intersect.data[2 + Shape<PST_RECTANGLE>::ObjSize + 7]), asfloat(intersect.data[2 + Shape<PST_RECTANGLE>::ObjSize + 8]));
                 Shape<PST_RECTANGLE> rect = Shape<PST_RECTANGLE>::create(offset, edge0, edge1, intersect.data[2 + Shape<PST_RECTANGLE>::ObjSize + 9]);
-                pdf *= rect.template deferredPdf<light_type, ray_type>(light, ray);
+                pdf *= rect.template deferredPdf<ray_type>(ray);
             }
             break;
             default:
diff --git a/31_HLSLPathTracer/app_resources/hlsl/pathtracer.hlsl b/31_HLSLPathTracer/app_resources/hlsl/pathtracer.hlsl
index e20ef705b..350e5e404 100644
--- a/31_HLSLPathTracer/app_resources/hlsl/pathtracer.hlsl
+++ b/31_HLSLPathTracer/app_resources/hlsl/pathtracer.hlsl
@@ -90,7 +90,7 @@ struct Unidirectional
     {
         uint32_t address = spirv::bitfieldInsert(protoDimension, _sample, MAX_DEPTH_LOG2, MAX_SAMPLES_LOG2);
 	    unit32_t3 seqVal = texelFetch(sampleSequence, int(address) + i).xyz;
-	    seqVal ^= unit32_t3(randGen(), randGen(), randGen());
+	    seqVal ^= randGen();
         return vector3_type(seqVal) * asfloat(0x2f800004u);
     }
 
@@ -147,8 +147,8 @@ struct Unidirectional
 
         // TODO: ifdef kill diffuse specular paths
 
-        const bool isBSDF = (bxdf.materialType == ext::MaterialSystem::Material::DIFFUSE) ? bxdf_traits<diffuse_op_type>::type == BT_BSDF :
-                            (bxdf.materialType == ext::MaterialSystem::Material::CONDUCTOR) ? bxdf_traits<conductor_op_type>::type == BT_BSDF :
+        const bool isBSDF = (bxdf.materialType == ext::MaterialSystem::Material::Type::DIFFUSE) ? bxdf_traits<diffuse_op_type>::type == BT_BSDF :
+                            (bxdf.materialType == ext::MaterialSystem::Material::Type::CONDUCTOR) ? bxdf_traits<conductor_op_type>::type == BT_BSDF :
                             bxdf_traits<dielectric_op_type>::type == BT_BSDF;
 
         vector3_type eps0 = rand3d(depth, _sample, 0u);
diff --git a/31_HLSLPathTracer/app_resources/hlsl/rand_gen.hlsl b/31_HLSLPathTracer/app_resources/hlsl/rand_gen.hlsl
index 949c2064b..30125c687 100644
--- a/31_HLSLPathTracer/app_resources/hlsl/rand_gen.hlsl
+++ b/31_HLSLPathTracer/app_resources/hlsl/rand_gen.hlsl
@@ -22,9 +22,9 @@ struct Uniform3D
         return retval;
     }
 
-    float32_t3 operator()()
+    uint32_t3 operator()()
     {
-        return float32_t3(uint32_t3(rng(), rng(), rng()));
+        return uint32_t3(rng(), rng(), rng());
     }
 
     rng_type rng;
diff --git a/31_HLSLPathTracer/app_resources/hlsl/render.comp.hlsl b/31_HLSLPathTracer/app_resources/hlsl/render.comp.hlsl
new file mode 100644
index 000000000..306188fd0
--- /dev/null
+++ b/31_HLSLPathTracer/app_resources/hlsl/render.comp.hlsl
@@ -0,0 +1,171 @@
+#include "nbl/builtin/hlsl/cpp_compat.hlsl"
+#include "nbl/builtin/hlsl/glsl_compat/core.hlsl"
+#include "nbl/builtin/hlsl/random/pcg.hlsl"
+
+#include "pathtracer.hlsl"
+
+// add these defines (one at a time) using -D argument to dxc
+// #define SPHERE_LIGHT
+// #define TRIANGLE_LIGHT
+// #define RECTANGLE_LIGHT
+
+#ifdef SPHERE_LIGHT
+#define SPHERE_COUNT 9
+#define LIGHT_TYPE PST_SPHERE
+#else
+#define SPHERE_COUNT 8
+#endif
+
+using namespace nbl::hlsl;
+
+NBL_CONSTEXPR uint32_t WorkgroupSize = 32;
+
+struct SPushConstants
+{
+    float32_t4x4 invMVP;
+    int sampleCount;
+    int depth;
+};
+
+[[vk::push_constant]] SPushConstants pc;
+
+[[vk::combinedImageSampler]][[vk::binding(0, 2)]] Texture2D<float3> envMap;      // unused
+[[vk::combinedImageSampler]][[vk::binding(0, 2)]] SamplerState envSampler
+
+[[vk::binding(1, 2)]] Buffer sampleSequence;
+
+[[vk::combinedImageSampler]][[vk::binding(2, 2)]] Texture2D<uint2> scramblebuf; // unused
+[[vk::combinedImageSampler]][[vk::binding(2, 2)]] SamplerState scrambleSampler;
+
+[[vk::binding(0, 0)]] RWTexture2D<float16_t4> outImage;
+
+int32_t2 getCoordinates()
+{
+    return int32_t2(glsl::gl_GlobalInvocationID.xy);
+}
+
+float32_t2 getTexCoords()
+{
+    uint32_t width, height;
+    outImage.GetDimensions(width, height);
+    int32_t2 iCoords = getCoordinates();
+    return float32_t2(float(iCoords.x) / width, 1.0 - float(iCoords.y) / height);
+}
+
+using ray_dir_info_t = bxdf::ray_dir_info::SBasic<float>;
+using iso_interaction = bxdf::surface_interactions::SIsotropic<ray_dir_info_t>;
+using aniso_interaction = bxdf::surface_interactions::SAnisotropic<ray_dir_info_t>;
+using sample_t = bxdf::SLightSample<ray_dir_info_t>;
+using iso_cache = bxdf::SIsotropicMicrofacetCache<float>;
+using aniso_cache = bxdf::SAnisotropicMicrofacetCache<float>;
+using quotient_pdf_t = bxdf::quotient_and_pdf<float32_t3, float>;
+using spectral_t = vector<float, 3>;
+using params_t = bxdf::SBxDFParams<float>;
+using create_params_t = SBxDFCreationParams<scalar_type, measure_type>;
+
+using diffuse_bxdf_type = bxdf::reflection::SOrenNayarBxDF<sample_t, iso_interaction, aniso_interaction, spectral_t>;
+using conductor_bxdf_type = bxdf::reflection::SGGXBxDF<sample_t, iso_cache, aniso_cache, spectral_t>;
+using dielectric_bxdf_type = bxdf::transmission::SGGXDielectricBxDF<sample_t, iso_cache, aniso_cache, spectral_t>;
+
+using ray_type = ext::Ray<float>;
+using light_type = ext::Light<spectral_t>;
+using bxdfnode_type = ext::BxDFNode<spectral_t>;
+using randgen_type = ext::RandGen::Uniform3D<Xoroshiro64Star>;
+using raygen_type = ext::RayGen::Basic<ray_type>;
+using intersector_type = ext::Intersector::Comprehensive<ray_type>;
+using material_system_type = ext::MaterialSystem::System<diffuse_bxdf_type, conductor_bxdf_type, dielectric_bxdf_type>;
+using nee_type = ext::NextEventEstimator::Estimator<light_type, ray_type, sample_t, aniso_interaction>;
+using pathtracer_type = ext::PathTracer::Unidirectional<randgen_type, raygen_type, intersector_type, material_system_type, nee_type>;
+
+Shape<PST_SPHERE> spheres[SPHERE_COUNT] = {
+    Shape<PST_SPHERE>::create(float3(0.0, -100.5, -1.0), 100.0, 0u, light_type::INVALID_ID),
+    Shape<PST_SPHERE>::create(float3(2.0, 0.0, -1.0), 0.5, 1u, light_type::INVALID_ID),
+    Shape<PST_SPHERE>::create(float3(0.0, 0.0, -1.0), 0.5, 2u, light_type::INVALID_ID),
+    Shape<PST_SPHERE>::create(float3(-2.0, 0.0, -1.0), 0.5, 3u, light_type::INVALID_ID),
+    Shape<PST_SPHERE>::create(float3(2.0, 0.0, 1.0), 0.5, 4u, light_type::INVALID_ID),
+    Shape<PST_SPHERE>::create(float3(0.0, 0.0, 1.0), 0.5, 4u, light_type::INVALID_ID),
+    Shape<PST_SPHERE>::create(float3(-2.0, 0.0, 1.0), 0.5, 5u, light_type::INVALID_ID),
+    Shape<PST_SPHERE>::create(float3(0.5, 1.0, 0.5), 0.5, 6u, light_type::INVALID_ID)
+#ifdef SPHERE_LIGHT
+    ,Shape<PST_SPHERE>::create(float3(-1.5, 1.5, 0.0), 0.3, bxdfnode_type::INVALID_ID, 0u)
+#endif
+};
+
+#ifdef TRIANGLE_LIGHT
+#define LIGHT_TYPE PST_TRIANGLE
+#define TRIANGLE_COUNT 1
+Shape<PST_TRIANGLE> triangles[TRIANGLE_COUNT] = {
+    Shape<PST_TRIANGLE>::create(float3(-1.8,0.35,0.3) * 10.0, float3(-1.2,0.35,0.0) * 10.0, float3(-1.5,0.8,-0.3) * 10.0, bxdfnode_type::INVALID_ID, 0u)
+};
+#endif
+
+#ifdef RECTANGLE_LIGHT
+#define LIGHT_TYPE PST_RECTANGLE
+#define RECTANGLE_COUNT 1
+Shape<PST_RECTANGLE> rectangles[RECTANGLE_COUNT] = {
+    Shape<PST_RECTANGLE>::create(float3(-3.8,0.35,1.3), normalize(float3(2,0,-1))*7.0, normalize(float3(2,-5,4))*0.1, bxdfnode_type::INVALID_ID, 0u)
+};
+#endif
+
+#define LIGHT_COUNT 1
+light_type lights[LIGHT_COUNT] = {
+    light_type(spectral_t(30.0,25.0,15.0), ext::ObjectID(8u, ext::NextEventEstimator::Event::Mode::PROCEDURAL, LIGHT_TYPE))
+};
+
+#define BSDF_COUNT 7
+bxdfnode_type bsdfs[BSDF_COUNT] = {
+    bxdfnode_type(ext::MaterialSystem::Material::Type::DIFFUSE, create_params_t(false, float2(0,0), spectral_t(1,1,1), spectral_t(1.25,1.25,1.25))),
+    bxdfnode_type(ext::MaterialSystem::Material::Type::DIFFUSE, create_params_t(false, float2(0,0), spectral_t(1,1,1), spectral_t(1.25,2.5,2.5))),
+    bxdfnode_type(ext::MaterialSystem::Material::Type::DIFFUSE, create_params_t(false, float2(0,0), spectral_t(1,1,1), spectral_t(2.5,1.25,2.5))),
+    bxdfnode_type(ext::MaterialSystem::Material::Type::CONDUCTOR, create_params_t(false, float2(0,0), spectral_t(1,1,1), spectral_t(0.98,0.98,0.77))),
+    bxdfnode_type(ext::MaterialSystem::Material::Type::CONDUCTOR, create_params_t(false, float2(0,0), spectral_t(1,1,1), spectral_t(0.98,0.77,0.98))),
+    bxdfnode_type(ext::MaterialSystem::Material::Type::CONDUCTOR, create_params_t(false, float2(0.15,0.15), spectral_t(1,1,1), spectral_t(0.98,0.77,0.98))),
+    bxdfnode_type(ext::MaterialSystem::Material::Type::DIELECTRIC, create_params_t(false, float2(0.0625,0.0625), spectral_t(1,1,1), spectral_t(0.71,0.69,0.67)))
+};
+
+[numthreads(WorkgroupGridDim, WorkgroupGridDim, 1)]
+void main(uint32_t3 threadID : SV_DispatchThreadID)
+{
+    uint32_t width, height;
+    outImage.GetDimensions(width, height);
+    const int32_t2 coords = getCoordinates();
+    float32_t2 texCoord = float32_t2(coords) / float32_t2(width, height);
+    texCoord.y = 1.0 - texCoord.y;
+
+    if (false == (all((int32_t2)0 < coords)) && all(int32_t2(width, height) < coords)) {
+        return;
+    }
+
+    if (((pc.depth - 1) >> MAX_DEPTH_LOG2) > 0 || ((pc.sampleCount - 1) >> MAX_SAMPLES_LOG2) > 0)
+    {
+        float32_t4 pixelCol = float32_t4(1.0,0.0,0.0,1.0);
+        outImage[coords] = pixelCol;
+        return;
+    }
+
+    int flatIdx = glsl::gl_GlobalInvocationID.y * glsl::gl_NumWorkGroups.x * WorkgroupSize + glsl::gl_GlobalInvocationID.x;
+    PCG32x2 pcg = PCG32x2::construct(flatIdx);  // replaces scramblebuf?
+
+    // set up path tracer
+    const PathTracerCreationParams<create_params_t, float> ptCreateParams;
+    ptCreateParams.rngState = pcg();
+
+    uint2 scrambleDim;
+    scramblebuf.GetDimensions(scrambleDim.x, scrambleDim.y);
+    ptCreateParams.pixOffsetParam = (float2)1.0 / float2(scrambleDim);
+
+    float4 NDC = float4(texCoord * float2(2.0, -2.0) + float2(-1.0, 1.0), 0.0, 1.0);
+    {
+        vec4 tmp = mul(pc.invMVP, NDC);
+        ptCreateParams.camPos = tmp.xyz / tmp.w;
+        NDC.z = 1.0;
+    }
+ 
+    ptCreateParams.NDC = NDC;
+    ptCreateParams.invMVP = pc.invMVP;
+
+    pathtracer_type pathtracer = pathtracer_type::create(ptCreateParams, samplerSequence);
+
+    // set up scene (can do as global var?)
+    Scene<light_type, bxdfnode_type> scene;
+}

From 202c645b6a43906589457bed95154c4f98785e67 Mon Sep 17 00:00:00 2001
From: keptsecret <sorchon@gmail.com>
Date: Thu, 20 Feb 2025 11:08:58 +0700
Subject: [PATCH 052/529] finish render shader

---
 .../app_resources/hlsl/render.comp.hlsl       | 31 +++++++++++++++++--
 1 file changed, 29 insertions(+), 2 deletions(-)

diff --git a/31_HLSLPathTracer/app_resources/hlsl/render.comp.hlsl b/31_HLSLPathTracer/app_resources/hlsl/render.comp.hlsl
index 306188fd0..7beccd322 100644
--- a/31_HLSLPathTracer/app_resources/hlsl/render.comp.hlsl
+++ b/31_HLSLPathTracer/app_resources/hlsl/render.comp.hlsl
@@ -112,8 +112,8 @@ light_type lights[LIGHT_COUNT] = {
     light_type(spectral_t(30.0,25.0,15.0), ext::ObjectID(8u, ext::NextEventEstimator::Event::Mode::PROCEDURAL, LIGHT_TYPE))
 };
 
-#define BSDF_COUNT 7
-bxdfnode_type bsdfs[BSDF_COUNT] = {
+#define BXDF_COUNT 7
+bxdfnode_type bxdfs[BXDF_COUNT] = {
     bxdfnode_type(ext::MaterialSystem::Material::Type::DIFFUSE, create_params_t(false, float2(0,0), spectral_t(1,1,1), spectral_t(1.25,1.25,1.25))),
     bxdfnode_type(ext::MaterialSystem::Material::Type::DIFFUSE, create_params_t(false, float2(0,0), spectral_t(1,1,1), spectral_t(1.25,2.5,2.5))),
     bxdfnode_type(ext::MaterialSystem::Material::Type::DIFFUSE, create_params_t(false, float2(0,0), spectral_t(1,1,1), spectral_t(2.5,1.25,2.5))),
@@ -168,4 +168,31 @@ void main(uint32_t3 threadID : SV_DispatchThreadID)
 
     // set up scene (can do as global var?)
     Scene<light_type, bxdfnode_type> scene;
+    scene.sphereCount = SPHERE_COUNT;
+    for (uint32_t i = 0; i < SPHERE_COUNT; i++)
+        scene.spheres[i] = spheres[i];
+#ifdef TRIANGLE_LIGHT
+    scene.triangleCount = TRIANGLE_COUNT;
+    for (uint32_t i = 0; i < TRIANGLE_COUNT; i++)
+        scene.triangles[i] = triangles[i];
+#else
+    scene.triangleCount = 0;
+#endif
+#ifdef RECTANGLE_LIGHT
+    scene.rectangleCount = RECTANGLE_COUNT;
+    for (uint32_t i = 0; i < RECTANGLE_COUNT; i++)
+        scene.rectangles[i] = rectangles[i];
+#else
+    scene.rectangleCount = 0;
+#endif
+    scene.lightCount = LIGHT_COUNT;
+    for (uint32_t i = 0; i < LIGHT_COUNT; i++)
+        scene.lights[i] = lights[i];
+    scene.bxdfCount = BXDF_COUNT;
+    for (uint32_t i = 0; i < BXDF_COUNT; i++)
+        scene.bxdfs[i] = bxdfs[i];
+
+    float32_t3 color = pathtracer.getMeasure(pc.sampleCount, pc.depth, scene);
+    float32_t4 pixCol = float32_t4(color, 1.0);
+    outImage[coords] = pixCol;
 }

From 2f77555ce484c2f8ecb390e68fc3f4c830b23ef7 Mon Sep 17 00:00:00 2001
From: keptsecret <sorchon@gmail.com>
Date: Thu, 20 Feb 2025 16:55:07 +0700
Subject: [PATCH 053/529] hlsl path tracer initial, bug fixes

---
 .../app_resources/hlsl/common.hlsl            | 24 ++---
 .../app_resources/hlsl/intersector.hlsl       |  4 +-
 .../app_resources/hlsl/material_system.hlsl   |  3 +-
 .../hlsl/next_event_estimator.hlsl            |  2 +-
 .../app_resources/hlsl/pathtracer.hlsl        |  2 +-
 .../app_resources/hlsl/rand_gen.hlsl          |  2 +-
 .../app_resources/hlsl/ray_gen.hlsl           |  6 +-
 .../app_resources/hlsl/render.comp.hlsl       |  9 +-
 31_HLSLPathTracer/main.cpp                    | 94 ++++++++++++++++---
 9 files changed, 109 insertions(+), 37 deletions(-)

diff --git a/31_HLSLPathTracer/app_resources/hlsl/common.hlsl b/31_HLSLPathTracer/app_resources/hlsl/common.hlsl
index cc92a33ba..938e3ca22 100644
--- a/31_HLSLPathTracer/app_resources/hlsl/common.hlsl
+++ b/31_HLSLPathTracer/app_resources/hlsl/common.hlsl
@@ -86,7 +86,7 @@ struct BxDFNode
 
     uint32_t materialType;
     params_type params;
-}
+};
 
 template<typename T>
 struct Tolerance
@@ -108,7 +108,7 @@ struct Tolerance
     {
         return 1.0 - nbl::hlsl::exp2(__common(depth) + 1.0);
     }
-}
+};
 
 enum PTPolygonMethod : uint16_t
 {
@@ -166,7 +166,7 @@ struct Shape<PST_SPHERE>
     }
 
     template<class Aniso>
-    float generate_and_pdf(NBL_REF_ARG(float32_t) pdf, NBL_REF_ARG(float32_t) newRayMaxT, NBL_CONST_REF_ARG(float32_t3) origin, NBL_CONST_REF_ARG(Aniso) interaction, bool isBSDF, float32_t3 xi)
+    float32_t3 generate_and_pdf(NBL_REF_ARG(float32_t) pdf, NBL_REF_ARG(float32_t) newRayMaxT, NBL_CONST_REF_ARG(float32_t3) origin, NBL_CONST_REF_ARG(Aniso) interaction, bool isBSDF, float32_t3 xi)
     {
         float32_t3 Z = position - origin;
         const float distanceSQ = nbl::hlsl::dot(Z,Z);
@@ -179,7 +179,7 @@ struct Shape<PST_SPHERE>
             const float cosThetaMax = nbl::hlsl::sqrt(cosThetaMax2);
             const float cosTheta = nbl::hlsl::mix(1.0, cosThetaMax, xi.x);
 
-            vec3 L = Z * cosTheta;
+            float32_t3 L = Z * cosTheta;
 
             const float cosTheta2 = cosTheta * cosTheta;
             const float sinTheta = nbl::hlsl::sqrt(1.0 - cosTheta2);
@@ -253,7 +253,8 @@ struct Shape<PST_TRIANGLE>
             case PPM_AREA:
             {
                 const float dist = ray.intersectionT;
-                return dist * dist / nbl::hlsl::abs(nbl::hlsl::dot(getNormalTimesArea()), L);
+                const float32_t3 L = ray.direction;
+                return dist * dist / nbl::hlsl::abs(nbl::hlsl::dot(getNormalTimesArea(), L));
             }
             break;
             case PPM_SOLID_ANGLE:
@@ -303,7 +304,7 @@ struct Shape<PST_TRIANGLE>
             {
                 float rcpPdf;
 
-                shapes::SphericalTriangle<float> st = shapes::SphericalTriangle<float>::create(vertex0, vertex1, vertex2, ray.origin);
+                shapes::SphericalTriangle<float> st = shapes::SphericalTriangle<float>::create(vertex0, vertex1, vertex2, origin);
                 sampling::SphericalTriangle<float> sst = sampling::SphericalTriangle<float>::create(st);
 
                 const float32_t3 L = sst.generate(rcpPdf, xi.xy);
@@ -319,7 +320,7 @@ struct Shape<PST_TRIANGLE>
             {
                 float rcpPdf;
 
-                shapes::SphericalTriangle<float> st = shapes::SphericalTriangle<float>::create(vertex0, vertex1, vertex2, ray.origin);
+                shapes::SphericalTriangle<float> st = shapes::SphericalTriangle<float>::create(vertex0, vertex1, vertex2, origin);
                 sampling::ProjectedSphericalTriangle<float> sst = sampling::ProjectedSphericalTriangle<float>::create(st);
             
                 const float32_t3 L = sst.generate(rcpPdf, interaction.N, isBSDF, xi.xy);
@@ -348,9 +349,9 @@ struct Shape<PST_TRIANGLE>
 template<>
 struct Shape<PST_RECTANGLE>
 {
-    static Shape<PST_TRIANGLE> create(NBL_CONST_REF_ARG(float32_t3) offset, NBL_CONST_REF_ARG(float32_t3) edge0, NBL_CONST_REF_ARG(float32_t3) edge1, uint32_t bsdfID, uint32_t lightID)
+    static Shape<PST_RECTANGLE> create(NBL_CONST_REF_ARG(float32_t3) offset, NBL_CONST_REF_ARG(float32_t3) edge0, NBL_CONST_REF_ARG(float32_t3) edge1, uint32_t bsdfID, uint32_t lightID)
     {
-        Shape<PST_TRIANGLE> retval;
+        Shape<PST_RECTANGLE> retval;
         retval.offset = offset;
         retval.edge0 = edge0;
         retval.edge1 = edge1;
@@ -389,7 +390,7 @@ struct Shape<PST_RECTANGLE>
         basis[1] = edge1 / extents[1];
         basis[2] = normalize(cross(basis[0],basis[1]));
 
-        basis = nbl::hlsl::transpose<matrix3x3_type>(basis);    // TODO: double check transpose
+        basis = nbl::hlsl::transpose<float32_t3x3>(basis);    // TODO: double check transpose
     }
 
     template<typename Ray>
@@ -400,6 +401,7 @@ struct Shape<PST_RECTANGLE>
             case PPM_AREA:
             {
                 const float dist = ray.intersectionT;
+                const float32_t3 L = ray.direction;
                 return dist * dist / nbl::hlsl::abs(nbl::hlsl::dot(getNormalTimesArea(), L));
             }
             break;
@@ -499,4 +501,4 @@ struct Shape<PST_RECTANGLE>
 }
 }
 
-#endif
\ No newline at end of file
+#endif
diff --git a/31_HLSLPathTracer/app_resources/hlsl/intersector.hlsl b/31_HLSLPathTracer/app_resources/hlsl/intersector.hlsl
index cf2d3ae7c..5151ea9c0 100644
--- a/31_HLSLPathTracer/app_resources/hlsl/intersector.hlsl
+++ b/31_HLSLPathTracer/app_resources/hlsl/intersector.hlsl
@@ -33,7 +33,7 @@ struct IntersectData
     NBL_CONSTEXPR_STATIC_INLINE uint32_t DataSize = 128;
 
     uint32_t mode : 1;
-    unit32_t unused : 31;   // possible space for flags
+    uint32_t unused : 31;   // possible space for flags
     uint32_t data[DataSize];
 };
 
@@ -199,4 +199,4 @@ struct Comprehensive
 }
 }
 
-#endif
\ No newline at end of file
+#endif
diff --git a/31_HLSLPathTracer/app_resources/hlsl/material_system.hlsl b/31_HLSLPathTracer/app_resources/hlsl/material_system.hlsl
index 038bd578a..687c41dc0 100644
--- a/31_HLSLPathTracer/app_resources/hlsl/material_system.hlsl
+++ b/31_HLSLPathTracer/app_resources/hlsl/material_system.hlsl
@@ -3,6 +3,7 @@
 
 #include <nbl/builtin/hlsl/limits.hlsl>
 #include <nbl/builtin/hlsl/math/functions.hlsl>
+#include <nbl/builtin/hlsl/bxdf/common.hlsl>
 
 namespace nbl
 {
@@ -150,4 +151,4 @@ struct System
 }
 }
 
-#endif
\ No newline at end of file
+#endif
diff --git a/31_HLSLPathTracer/app_resources/hlsl/next_event_estimator.hlsl b/31_HLSLPathTracer/app_resources/hlsl/next_event_estimator.hlsl
index 86c26a152..5695efc0d 100644
--- a/31_HLSLPathTracer/app_resources/hlsl/next_event_estimator.hlsl
+++ b/31_HLSLPathTracer/app_resources/hlsl/next_event_estimator.hlsl
@@ -188,4 +188,4 @@ struct Estimator
 }
 }
 
-#endif
\ No newline at end of file
+#endif
diff --git a/31_HLSLPathTracer/app_resources/hlsl/pathtracer.hlsl b/31_HLSLPathTracer/app_resources/hlsl/pathtracer.hlsl
index 350e5e404..b14c9baae 100644
--- a/31_HLSLPathTracer/app_resources/hlsl/pathtracer.hlsl
+++ b/31_HLSLPathTracer/app_resources/hlsl/pathtracer.hlsl
@@ -376,4 +376,4 @@ struct Unidirectional
 }
 }
 
-#endif
\ No newline at end of file
+#endif
diff --git a/31_HLSLPathTracer/app_resources/hlsl/rand_gen.hlsl b/31_HLSLPathTracer/app_resources/hlsl/rand_gen.hlsl
index 30125c687..4f5302fea 100644
--- a/31_HLSLPathTracer/app_resources/hlsl/rand_gen.hlsl
+++ b/31_HLSLPathTracer/app_resources/hlsl/rand_gen.hlsl
@@ -35,4 +35,4 @@ struct Uniform3D
 }
 }
 
-#endif
\ No newline at end of file
+#endif
diff --git a/31_HLSLPathTracer/app_resources/hlsl/ray_gen.hlsl b/31_HLSLPathTracer/app_resources/hlsl/ray_gen.hlsl
index 467ef2bd4..dcb695fbe 100644
--- a/31_HLSLPathTracer/app_resources/hlsl/ray_gen.hlsl
+++ b/31_HLSLPathTracer/app_resources/hlsl/ray_gen.hlsl
@@ -1,6 +1,8 @@
 #ifndef _NBL_HLSL_EXT_RAYGEN_INCLUDED_
 #define _NBL_HLSL_EXT_RAYGEN_INCLUDED_
 
+#include <nbl/builtin/hlsl/sampling/box_muller_transform.hlsl>
+
 #include "common.hlsl"
 
 namespace nbl
@@ -43,7 +45,7 @@ struct Basic
         // apply stochastic reconstruction filter
         const float gaussianFilterCutoff = 2.5;
         const float truncation = nbl::hlsl::exp(-0.5 * gaussianFilterCutoff * gaussianFilterCutoff);
-        vec2 remappedRand = randVec.xy;
+        vector2_type remappedRand = randVec.xy;
         remappedRand.x *= 1.0 - truncation;
         remappedRand.x += truncation;
         tmp.xy += pixOffsetParam * nbl::hlsl::boxMullerTransform<scalar_type>(remappedRand, 1.5);
@@ -77,4 +79,4 @@ struct Basic
 }
 }
 
-#endif
\ No newline at end of file
+#endif
diff --git a/31_HLSLPathTracer/app_resources/hlsl/render.comp.hlsl b/31_HLSLPathTracer/app_resources/hlsl/render.comp.hlsl
index 7beccd322..1c8c15ec4 100644
--- a/31_HLSLPathTracer/app_resources/hlsl/render.comp.hlsl
+++ b/31_HLSLPathTracer/app_resources/hlsl/render.comp.hlsl
@@ -2,6 +2,9 @@
 #include "nbl/builtin/hlsl/glsl_compat/core.hlsl"
 #include "nbl/builtin/hlsl/random/pcg.hlsl"
 
+#include "nbl/builtin/hlsl/bxdf/reflection.hlsl"
+#include "nbl/builtin/hlsl/bxdf/transmission.hlsl"
+
 #include "pathtracer.hlsl"
 
 // add these defines (one at a time) using -D argument to dxc
@@ -30,7 +33,7 @@ struct SPushConstants
 [[vk::push_constant]] SPushConstants pc;
 
 [[vk::combinedImageSampler]][[vk::binding(0, 2)]] Texture2D<float3> envMap;      // unused
-[[vk::combinedImageSampler]][[vk::binding(0, 2)]] SamplerState envSampler
+[[vk::combinedImageSampler]][[vk::binding(0, 2)]] SamplerState envSampler;
 
 [[vk::binding(1, 2)]] Buffer sampleSequence;
 
@@ -41,7 +44,7 @@ struct SPushConstants
 
 int32_t2 getCoordinates()
 {
-    return int32_t2(glsl::gl_GlobalInvocationID.xy);
+    return int32_t2(glsl::gl_GlobalInvocationID().xy);
 }
 
 float32_t2 getTexCoords()
@@ -143,7 +146,7 @@ void main(uint32_t3 threadID : SV_DispatchThreadID)
         return;
     }
 
-    int flatIdx = glsl::gl_GlobalInvocationID.y * glsl::gl_NumWorkGroups.x * WorkgroupSize + glsl::gl_GlobalInvocationID.x;
+    int flatIdx = glsl::gl_GlobalInvocationID().y * glsl::gl_NumWorkGroups().x * WorkgroupSize + glsl::gl_GlobalInvocationID().x;
     PCG32x2 pcg = PCG32x2::construct(flatIdx);  // replaces scramblebuf?
 
     // set up path tracer
diff --git a/31_HLSLPathTracer/main.cpp b/31_HLSLPathTracer/main.cpp
index 018468e46..13aa59823 100644
--- a/31_HLSLPathTracer/main.cpp
+++ b/31_HLSLPathTracer/main.cpp
@@ -37,6 +37,14 @@ class ComputeShaderPathtracer final : public examples::SimpleWindowedApplication
 			ELG_COUNT
 		};
 
+		enum E_RENDER_MODE : uint8_t
+		{
+			ERM_GLSL,
+			ERM_HLSL,
+			ERM_CHECKERED,
+			ERM_COUNT
+		};
+
 		constexpr static inline uint32_t2 WindowDimensions = { 1280, 720 };
 		constexpr static inline uint32_t MaxFramesInFlight = 5;
 		constexpr static inline clock_t::duration DisplayImageDuration = std::chrono::milliseconds(900);
@@ -49,7 +57,9 @@ class ComputeShaderPathtracer final : public examples::SimpleWindowedApplication
 		constexpr static inline uint8_t MaxUITextureCount = 1u;
 		static inline std::string DefaultImagePathsFile = "envmap/envmap_0.exr";
 		static inline std::string OwenSamplerFilePath = "owen_sampler_buffer.bin";
-		static inline std::array<std::string, E_LIGHT_GEOMETRY::ELG_COUNT> PTShaderPaths = { "app_resources/glsl/litBySphere.comp", "app_resources/glsl/litByTriangle.comp", "app_resources/glsl/litByRectangle.comp" };
+		static inline std::array<std::string, E_LIGHT_GEOMETRY::ELG_COUNT> PTGLSLShaderPaths = { "app_resources/glsl/litBySphere.comp", "app_resources/glsl/litByTriangle.comp", "app_resources/glsl/litByRectangle.comp" };
+		static inline std::string PTHLSLShaderPath = "app_resources/hlsl/render.comp.hlsl";
+		static inline std::array<std::string, E_LIGHT_GEOMETRY::ELG_COUNT> PTHLSLShaderVariants = { "SPHERE_LIGHT", "TRIANGLE_LIGHT", "RECTANGLE_LIGHT" };
 		static inline std::string PresentShaderPath = "app_resources/hlsl/present.frag.hlsl";
 
 		const char* shaderNames[E_LIGHT_GEOMETRY::ELG_COUNT] = {
@@ -301,7 +311,7 @@ class ComputeShaderPathtracer final : public examples::SimpleWindowedApplication
 				m_presentDescriptorSet = presentDSPool->createDescriptorSet(gpuPresentDescriptorSetLayout);
 
 				// Create Shaders
-				auto loadAndCompileShader = [&](std::string pathToShader)
+				auto loadAndCompileGLSLShader = [&](const std::string& pathToShader) -> smart_refctd_ptr<IGPUShader>
 				{
 					IAssetLoader::SAssetLoadParams lp = {};
 					lp.workingDirectory = localInputCWD;
@@ -328,10 +338,46 @@ class ComputeShaderPathtracer final : public examples::SimpleWindowedApplication
 					return shader;
 				};
 
+				auto loadAndCompileHLSLShader = [&](const std::string& pathToShader, const std::string& defineMacro) -> smart_refctd_ptr<IGPUShader>
+				{
+					IAssetLoader::SAssetLoadParams lp = {};
+					lp.workingDirectory = localInputCWD;
+					auto assetBundle = m_assetMgr->getAsset(pathToShader, lp);
+					const auto assets = assetBundle.getContents();
+					if (assets.empty())
+					{
+						m_logger->log("Could not load shader: ", ILogger::ELL_ERROR, pathToShader);
+						std::exit(-1);
+					}
+
+					auto source = IAsset::castDown<ICPUShader>(assets[0]);
+					// The down-cast should not fail!
+					assert(source);
+
+					auto compiler = make_smart_refctd_ptr<asset::CHLSLCompiler>(smart_refctd_ptr(m_system));
+					CHLSLCompiler::SOptions options = {};
+					options.stage = IShader::E_SHADER_STAGE::ESS_COMPUTE;	// should be compute
+					options.targetSpirvVersion = m_device->getPhysicalDevice()->getLimits().spirvVersion;
+					options.spirvOptimizer = nullptr;
+#ifndef _NBL_DEBUG
+					ISPIRVOptimizer::E_OPTIMIZER_PASS optPasses = ISPIRVOptimizer::EOP_STRIP_DEBUG_INFO;
+					auto opt = make_smart_refctd_ptr<ISPIRVOptimizer>(std::span<ISPIRVOptimizer::E_OPTIMIZER_PASS>(&optPasses, 1));
+					options.spirvOptimizer = opt.get();
+#endif
+					options.debugInfoFlags |= IShaderCompiler::E_DEBUG_INFO_FLAGS::EDIF_SOURCE_BIT;
+					options.preprocessorOptions.sourceIdentifier = source->getFilepathHint();
+					options.preprocessorOptions.logger = m_logger.get();
+					options.preprocessorOptions.includeFinder = compiler->getDefaultIncludeFinder();
+
+					//std::string dxcOptionStr[] = { "-D" + defineMacro };
+					//options.dxcOptions = std::span(dxcOptionStr);
+
+					source = compiler->compileToSPIRV((const char*)source->getContent()->getPointer(), options);
+				};
+
 				// Create compute pipelines
 				{
 					for (int index = 0; index < E_LIGHT_GEOMETRY::ELG_COUNT; index++) {
-						auto ptShader = loadAndCompileShader(PTShaderPaths[index]);
 						const nbl::asset::SPushConstantRange pcRange = {
 							.stageFlags = IShader::E_SHADER_STAGE::ESS_COMPUTE,
 							.offset = 0,
@@ -348,15 +394,31 @@ class ComputeShaderPathtracer final : public examples::SimpleWindowedApplication
 							return logFail("Failed to create Pathtracing pipeline layout");
 						}
 
-						IGPUComputePipeline::SCreationParams params = {};
-						params.layout = ptPipelineLayout.get();
-						params.shader.shader = ptShader.get();
-						params.shader.entryPoint = "main";
-						params.shader.entries = nullptr;
-						params.shader.requireFullSubgroups = true;
-						params.shader.requiredSubgroupSize = static_cast<IGPUShader::SSpecInfo::SUBGROUP_SIZE>(5);
-						if (!m_device->createComputePipelines(nullptr, { &params, 1 }, m_PTPipelines.data() + index)) {
-							return logFail("Failed to create compute pipeline!\n");
+						{
+							auto ptShader = loadAndCompileGLSLShader(PTGLSLShaderPaths[index]);
+
+							IGPUComputePipeline::SCreationParams params = {};
+							params.layout = ptPipelineLayout.get();
+							params.shader.shader = ptShader.get();
+							params.shader.entryPoint = "main";
+							params.shader.entries = nullptr;
+							params.shader.requireFullSubgroups = true;
+							params.shader.requiredSubgroupSize = static_cast<IGPUShader::SSpecInfo::SUBGROUP_SIZE>(5);
+							if (!m_device->createComputePipelines(nullptr, { &params, 1 }, m_PTGLSLPipelines.data() + index))
+								return logFail("Failed to create GLSL compute pipeline!\n");
+						}
+						{
+							auto ptShader = loadAndCompileHLSLShader(PTHLSLShaderPath, PTHLSLShaderVariants[index]);
+
+							IGPUComputePipeline::SCreationParams params = {};
+							params.layout = ptPipelineLayout.get();
+							params.shader.shader = ptShader.get();
+							params.shader.entryPoint = "main";
+							params.shader.entries = nullptr;
+							params.shader.requireFullSubgroups = true;
+							params.shader.requiredSubgroupSize = static_cast<IGPUShader::SSpecInfo::SUBGROUP_SIZE>(5);
+							if (!m_device->createComputePipelines(nullptr, { &params, 1 }, m_PTHLSLPipelines.data() + index))
+								return logFail("Failed to create HLSL compute pipeline!\n");
 						}
 					}
 				}
@@ -369,7 +431,7 @@ class ComputeShaderPathtracer final : public examples::SimpleWindowedApplication
 						return logFail("Failed to create Full Screen Triangle protopipeline or load its vertex shader!");
 
 					// Load Fragment Shader
-					auto fragmentShader = loadAndCompileShader(PresentShaderPath);
+					auto fragmentShader = loadAndCompileGLSLShader(PresentShaderPath);
 					if (!fragmentShader)
 						return logFail("Failed to Load and Compile Fragment Shader: lumaMeterShader!");
 
@@ -985,7 +1047,7 @@ class ComputeShaderPathtracer final : public examples::SimpleWindowedApplication
 
 				// cube envmap handle
 				{
-					auto pipeline = m_PTPipelines[PTPipline].get();
+					auto pipeline = renderMode == E_RENDER_MODE::ERM_HLSL ? m_PTHLSLPipelines[PTPipline].get() : m_PTGLSLPipelines[PTPipline].get();
 					cmdbuf->bindComputePipeline(pipeline);
 					cmdbuf->bindDescriptorSets(EPBP_COMPUTE, pipeline->getLayout(), 0u, 1u, &m_descriptorSet0.get());
 					cmdbuf->bindDescriptorSets(EPBP_COMPUTE, pipeline->getLayout(), 2u, 1u, &m_descriptorSet2.get());
@@ -1220,7 +1282,8 @@ class ComputeShaderPathtracer final : public examples::SimpleWindowedApplication
 
 		// gpu resources
 		smart_refctd_ptr<IGPUCommandPool> m_cmdPool;
-		std::array<smart_refctd_ptr<IGPUComputePipeline>, E_LIGHT_GEOMETRY::ELG_COUNT> m_PTPipelines;
+		std::array<smart_refctd_ptr<IGPUComputePipeline>, E_LIGHT_GEOMETRY::ELG_COUNT> m_PTGLSLPipelines;
+		std::array<smart_refctd_ptr<IGPUComputePipeline>, E_LIGHT_GEOMETRY::ELG_COUNT> m_PTHLSLPipelines;
 		smart_refctd_ptr<IGPUGraphicsPipeline> m_presentPipeline;
 		uint64_t m_realFrameIx = 0;
 		std::array<smart_refctd_ptr<IGPUCommandBuffer>, MaxFramesInFlight> m_cmdBufs;
@@ -1269,6 +1332,7 @@ class ComputeShaderPathtracer final : public examples::SimpleWindowedApplication
 		float camYAngle = 165.f / 180.f * 3.14159f;
 		float camXAngle = 32.f / 180.f * 3.14159f;
 		int PTPipline = E_LIGHT_GEOMETRY::ELG_SPHERE;
+		int renderMode = E_RENDER_MODE::ERM_GLSL;
 		int spp = 32;
 		int depth = 3;
 

From 99aed4777c208c5acc4e66bb7ea8dc48f814c8d0 Mon Sep 17 00:00:00 2001
From: keptsecret <sorchon@gmail.com>
Date: Fri, 21 Feb 2025 14:16:11 +0700
Subject: [PATCH 054/529] fix shader bugs

---
 .../app_resources/hlsl/common.hlsl            | 30 +++++++--
 .../app_resources/hlsl/intersector.hlsl       | 33 +++++-----
 .../app_resources/hlsl/material_system.hlsl   | 40 ++++++------
 .../hlsl/next_event_estimator.hlsl            | 61 ++++++++++---------
 .../app_resources/hlsl/pathtracer.hlsl        |  9 +--
 .../app_resources/hlsl/render.comp.hlsl       |  4 ++
 .../app_resources/hlsl/scene.hlsl             | 50 +++++++--------
 31_HLSLPathTracer/main.cpp                    |  4 +-
 8 files changed, 131 insertions(+), 100 deletions(-)

diff --git a/31_HLSLPathTracer/app_resources/hlsl/common.hlsl b/31_HLSLPathTracer/app_resources/hlsl/common.hlsl
index 938e3ca22..1b0aac72f 100644
--- a/31_HLSLPathTracer/app_resources/hlsl/common.hlsl
+++ b/31_HLSLPathTracer/app_resources/hlsl/common.hlsl
@@ -123,15 +123,21 @@ struct Shape;
 template<>
 struct Shape<PST_SPHERE>
 {
-    static Shape<PST_SPHERE> create(NBL_CONST_REF_ARG(float32_t3) position, float32_t radius, uint32_t bsdfID, uint32_t lightID)
+    static Shape<PST_SPHERE> create(NBL_CONST_REF_ARG(float32_t3) position, float32_t radius, uint32_t bsdfLightIDs)
     {
         Shape<PST_SPHERE> retval;
         retval.position = position;
         retval.radius2 = radius * radius;
-        retval.bsdfLightIDs = spirv::bitFieldInsert<uint32_t>(bsdfID, lightID, 16, 16);
+        retval.bsdfLightIDs = bsdfLightIDs;
         return retval;
     }
 
+    static Shape<PST_SPHERE> create(NBL_CONST_REF_ARG(float32_t3) position, float32_t radius, uint32_t bsdfID, uint32_t lightID)
+    {
+        uint32_t bsdfLightIDs = spirv::bitFieldInsert<uint32_t>(bsdfID, lightID, 16, 16);
+        return create(position, radius, bsdfLightIDs);
+    }
+
     // return intersection distance if found, nan otherwise
     float intersect(NBL_CONST_REF_ARG(float32_t3) origin, NBL_CONST_REF_ARG(float32_t3) direction)
     {
@@ -207,17 +213,23 @@ struct Shape<PST_SPHERE>
 template<>
 struct Shape<PST_TRIANGLE>
 {
-    static Shape<PST_TRIANGLE> create(NBL_CONST_REF_ARG(float32_t3) vertex0, NBL_CONST_REF_ARG(float32_t3) vertex1, NBL_CONST_REF_ARG(float32_t3) vertex2, uint32_t bsdfID, uint32_t lightID)
+    static Shape<PST_TRIANGLE> create(NBL_CONST_REF_ARG(float32_t3) vertex0, NBL_CONST_REF_ARG(float32_t3) vertex1, NBL_CONST_REF_ARG(float32_t3) vertex2, uint32_t bsdfLightIDs)
     {
         Shape<PST_TRIANGLE> retval;
         retval.vertex0 = vertex0;
         retval.vertex1 = vertex1;
         retval.vertex2 = vertex2;
-        retval.bsdfLightIDs = spirv::bitFieldInsert<uint32_t>(bsdfID, lightID, 16, 16);
+        retval.bsdfLightIDs = bsdfLightIDs;
         retval.polygonMethod = PPM_SOLID_ANGLE;
         return retval;
     }
 
+    static Shape<PST_TRIANGLE> create(NBL_CONST_REF_ARG(float32_t3) vertex0, NBL_CONST_REF_ARG(float32_t3) vertex1, NBL_CONST_REF_ARG(float32_t3) vertex2, uint32_t bsdfID, uint32_t lightID)
+    {
+        uint32_t bsdfLightIDs = spirv::bitFieldInsert<uint32_t>(bsdfID, lightID, 16, 16);
+        return create(vertex0, vertex1, vertex2, bsdfLightIDs);
+    }
+
     float intersect(NBL_CONST_REF_ARG(float32_t3) origin, NBL_CONST_REF_ARG(float32_t3) direction)
     {
         const float32_t3 edges[2] = { vertex1 - vertex0, vertex2 - vertex0 };
@@ -349,17 +361,23 @@ struct Shape<PST_TRIANGLE>
 template<>
 struct Shape<PST_RECTANGLE>
 {
-    static Shape<PST_RECTANGLE> create(NBL_CONST_REF_ARG(float32_t3) offset, NBL_CONST_REF_ARG(float32_t3) edge0, NBL_CONST_REF_ARG(float32_t3) edge1, uint32_t bsdfID, uint32_t lightID)
+    static Shape<PST_RECTANGLE> create(NBL_CONST_REF_ARG(float32_t3) offset, NBL_CONST_REF_ARG(float32_t3) edge0, NBL_CONST_REF_ARG(float32_t3) edge1, uint32_t bsdfLightIDs)
     {
         Shape<PST_RECTANGLE> retval;
         retval.offset = offset;
         retval.edge0 = edge0;
         retval.edge1 = edge1;
-        retval.bsdfLightIDs = spirv::bitFieldInsert<uint32_t>(bsdfID, lightID, 16, 16);
+        retval.bsdfLightIDs = bsdfLightIDs;
         retval.polygonMethod = PPM_SOLID_ANGLE;
         return retval;
     }
 
+    static Shape<PST_RECTANGLE> create(NBL_CONST_REF_ARG(float32_t3) offset, NBL_CONST_REF_ARG(float32_t3) edge0, NBL_CONST_REF_ARG(float32_t3) edge1, uint32_t bsdfID, uint32_t lightID)
+    {
+        uint32_t bsdfLightIDs = spirv::bitFieldInsert<uint32_t>(bsdfID, lightID, 16, 16);
+        return create(offset, edge0, edge1, bsdfLightIDs);
+    }
+
     float intersect(NBL_CONST_REF_ARG(float32_t3) origin, NBL_CONST_REF_ARG(float32_t3) direction)
     {
         const float32_t3 h = nbl::hlsl::cross(direction, edge1);
diff --git a/31_HLSLPathTracer/app_resources/hlsl/intersector.hlsl b/31_HLSLPathTracer/app_resources/hlsl/intersector.hlsl
index 5151ea9c0..0bb6cb31c 100644
--- a/31_HLSLPathTracer/app_resources/hlsl/intersector.hlsl
+++ b/31_HLSLPathTracer/app_resources/hlsl/intersector.hlsl
@@ -41,17 +41,18 @@ template<class Ray>
 struct Comprehensive
 {
     using scalar_type = typename Ray::scalar_type;
+    using vector3_type = vector<scalar_type, 3>;
     using ray_type = Ray;
 
     static ObjectID traceProcedural(NBL_REF_ARG(ray_type) ray, NBL_REF_ARG(IntersectData) intersect)
     {
         const bool anyHit = ray.intersectionT != numeric_limits<scalar_type>::max;
         const uint32_t objCount = intersect.data[0];
-        const ProceduralShapeType type = intersect.data[1];
+        const ProceduralShapeType type = (ProceduralShapeType)intersect.data[1];
 
         ObjectID objectID = ray.objectID;
         objectID.mode = IntersectData::Mode::PROCEDURAL;
-        objectID.type = type;
+        objectID.shapeType = type;
         for (int i = 0; i < objCount; i++)
         {
             float t;
@@ -59,25 +60,25 @@ struct Comprehensive
             {
                 case PST_SPHERE:
                 {
-                    float32_t3 position = float32_t3(asfloat(intersect.data[2 + i * Shape<PST_SPHERE>::ObjSize]), asfloat(intersect.data[2 + i * Shape<PST_SPHERE>::ObjSize + 1]), asfloat(intersect.data[2 + i * Shape<PST_SPHERE>::ObjSize + 2]));
+                    vector3_type position = vector3_type(asfloat(intersect.data[2 + i * Shape<PST_SPHERE>::ObjSize]), asfloat(intersect.data[2 + i * Shape<PST_SPHERE>::ObjSize + 1]), asfloat(intersect.data[2 + i * Shape<PST_SPHERE>::ObjSize + 2]));
                     Shape<PST_SPHERE> sphere = Shape<PST_SPHERE>::create(position, asfloat(intersect.data[2 + i * Shape<PST_SPHERE>::ObjSize + 3]), intersect.data[2 + i * Shape<PST_SPHERE>::ObjSize + 4]);
                     t = sphere.intersect(ray.origin, ray.direction);
                 }
                 break;
                 case PST_TRIANGLE:
                 {
-                    float32_t3 vertex0 = float32_t3(asfloat(intersect.data[2 + i * Shape<PST_TRIANGLE>::ObjSize]), asfloat(intersect.data[2 + i * Shape<PST_SPHERE>::ObjSize + 1]), asfloat(intersect.data[2 + i * Shape<PST_TRIANGLE>::ObjSize + 2]));
-                    float32_t3 vertex1 = float32_t3(asfloat(intersect.data[2 + i * Shape<PST_TRIANGLE>::ObjSize + 3]), asfloat(intersect.data[2 + i * Shape<PST_SPHERE>::ObjSize + 4]), asfloat(intersect.data[2 + i * Shape<PST_TRIANGLE>::ObjSize + 5]));
-                    float32_t3 vertex2 = float32_t3(asfloat(intersect.data[2 + i * Shape<PST_TRIANGLE>::ObjSize + 6]), asfloat(intersect.data[2 + i * Shape<PST_SPHERE>::ObjSize + 7]), asfloat(intersect.data[2 + i * Shape<PST_TRIANGLE>::ObjSize + 8]));
+                    vector3_type vertex0 = vector3_type(asfloat(intersect.data[2 + i * Shape<PST_TRIANGLE>::ObjSize]), asfloat(intersect.data[2 + i * Shape<PST_SPHERE>::ObjSize + 1]), asfloat(intersect.data[2 + i * Shape<PST_TRIANGLE>::ObjSize + 2]));
+                    vector3_type vertex1 = vector3_type(asfloat(intersect.data[2 + i * Shape<PST_TRIANGLE>::ObjSize + 3]), asfloat(intersect.data[2 + i * Shape<PST_SPHERE>::ObjSize + 4]), asfloat(intersect.data[2 + i * Shape<PST_TRIANGLE>::ObjSize + 5]));
+                    vector3_type vertex2 = vector3_type(asfloat(intersect.data[2 + i * Shape<PST_TRIANGLE>::ObjSize + 6]), asfloat(intersect.data[2 + i * Shape<PST_SPHERE>::ObjSize + 7]), asfloat(intersect.data[2 + i * Shape<PST_TRIANGLE>::ObjSize + 8]));
                     Shape<PST_TRIANGLE> tri = Shape<PST_TRIANGLE>::create(vertex0, vertex1, vertex2, intersect.data[2 + i * Shape<PST_TRIANGLE>::ObjSize + 9]);
                     t = tri.intersect(ray.origin, ray.direction);
                 }
                 break;
                 case PST_RECTANGLE:
                 {
-                    float32_t3 offset = float32_t3(asfloat(intersect.data[2 + i * Shape<PST_RECTANGLE>::ObjSize]), asfloat(intersect.data[2 + i * Shape<PST_RECTANGLE>::ObjSize + 1]), asfloat(intersect.data[2 + i * Shape<PST_RECTANGLE>::ObjSize + 2]));
-                    float32_t3 edge0 = float32_t3(asfloat(intersect.data[2 + i * Shape<PST_RECTANGLE>::ObjSize + 3]), asfloat(intersect.data[2 + i * Shape<PST_RECTANGLE>::ObjSize + 4]), asfloat(intersect.data[2 + i * Shape<PST_RECTANGLE>::ObjSize + 5]));
-                    float32_t3 edge1 = float32_t3(asfloat(intersect.data[2 + i * Shape<PST_RECTANGLE>::ObjSize + 6]), asfloat(intersect.data[2 + i * Shape<PST_RECTANGLE>::ObjSize + 7]), asfloat(intersect.data[2 + i * Shape<PST_RECTANGLE>::ObjSize + 8]));
+                    vector3_type offset = vector3_type(asfloat(intersect.data[2 + i * Shape<PST_RECTANGLE>::ObjSize]), asfloat(intersect.data[2 + i * Shape<PST_RECTANGLE>::ObjSize + 1]), asfloat(intersect.data[2 + i * Shape<PST_RECTANGLE>::ObjSize + 2]));
+                    vector3_type edge0 = vector3_type(asfloat(intersect.data[2 + i * Shape<PST_RECTANGLE>::ObjSize + 3]), asfloat(intersect.data[2 + i * Shape<PST_RECTANGLE>::ObjSize + 4]), asfloat(intersect.data[2 + i * Shape<PST_RECTANGLE>::ObjSize + 5]));
+                    vector3_type edge1 = vector3_type(asfloat(intersect.data[2 + i * Shape<PST_RECTANGLE>::ObjSize + 6]), asfloat(intersect.data[2 + i * Shape<PST_RECTANGLE>::ObjSize + 7]), asfloat(intersect.data[2 + i * Shape<PST_RECTANGLE>::ObjSize + 8]));
                     Shape<PST_RECTANGLE> rect = Shape<PST_RECTANGLE>::create(offset, edge0, edge1, intersect.data[2 + i * Shape<PST_RECTANGLE>::ObjSize + 9]);
                     t = rect.intersect(ray.origin, ray.direction);
                 }
@@ -101,7 +102,7 @@ struct Comprehensive
 
     static ObjectID traceRay(NBL_REF_ARG(ray_type) ray, NBL_REF_ARG(IntersectData) intersect)
     {
-        const IntersectData::Mode mode = intersect.mode;
+        const IntersectData::Mode mode = (IntersectData::Mode)intersect.mode;
         switch (mode)
         {
             case IntersectData::Mode::RAY_QUERY:
@@ -120,7 +121,11 @@ struct Comprehensive
             }
             break;
             default:
-                return ObjectID(-1, IntersectData::Mode::PROCEDURAL, PST_SPHERE);
+            {
+                ObjectID objID;
+                objID.id = -1;
+                return objID;
+            }
         }
     }
 
@@ -136,19 +141,19 @@ struct Comprehensive
         if (scene.sphereCount > 0)
         {
             data = scene.toIntersectData(ext::Intersector::IntersectData::Mode::PROCEDURAL, PST_SPHERE);
-            objectID = intersector.traceRay(ray, data);
+            objectID = traceRay(ray, data);
         }
 
         if (scene.triangleCount > 0)
         {
             data = scene.toIntersectData(ext::Intersector::IntersectData::Mode::PROCEDURAL, PST_TRIANGLE);
-            objectID = intersector.traceRay(ray, data);
+            objectID = traceRay(ray, data);
         }
 
         if (scene.rectangleCount > 0)
         {
             data = scene.toIntersectData(ext::Intersector::IntersectData::Mode::PROCEDURAL, PST_RECTANGLE);
-            objectID = intersector.traceRay(ray, data);
+            objectID = traceRay(ray, data);
         }
 
         // TODO: trace AS
diff --git a/31_HLSLPathTracer/app_resources/hlsl/material_system.hlsl b/31_HLSLPathTracer/app_resources/hlsl/material_system.hlsl
index 687c41dc0..9d638c232 100644
--- a/31_HLSLPathTracer/app_resources/hlsl/material_system.hlsl
+++ b/31_HLSLPathTracer/app_resources/hlsl/material_system.hlsl
@@ -26,7 +26,7 @@ struct Material
     NBL_CONSTEXPR_STATIC_INLINE uint32_t DataSize = 32;
 
     uint32_t type : 1;
-    unit32_t unused : 31;   // possible space for flags
+    uint32_t unused : 31;   // possible space for flags
     uint32_t data[DataSize];
 };
 
@@ -41,37 +41,39 @@ struct System
     using quotient_pdf_type = typename DiffuseBxDF::quotient_pdf_type;
     using anisotropic_type = typename DiffuseBxDF::anisotropic_type;
     using anisocache_type = typename ConductorBxDF::anisocache_type;
-    using params_t = SBxDFParams<scalar_type>;
-    using create_params_t = SBxDFCreationParams<scalar_type, measure_type>;
+    using params_t = bxdf::SBxDFParams<scalar_type>;
+    using create_params_t = bxdf::SBxDFCreationParams<scalar_type, measure_type>;
 
     using diffuse_op_type = DiffuseBxDF;
     using conductor_op_type = ConductorBxDF;
     using dielectric_op_type = DielectricBxDF;
 
-    static this_t create(NBL_CONST_REF_ARG(SBxDFCreationParams<scalar_type, measure_type>) diffuseParams, NBL_CONST_REF_ARG(SBxDFCreationParams<scalar_type, measure_type>) conductorParams, NBL_CONST_REF_ARG(SBxDFCreationParams<scalar_type, measure_type>) dielectricParams)
+    static this_t create(NBL_CONST_REF_ARG(create_params_t) diffuseParams, NBL_CONST_REF_ARG(create_params_t) conductorParams, NBL_CONST_REF_ARG(create_params_t) dielectricParams)
     {
-        diffuseBxDF = DiffuseBxDF::create(diffuseParams);
-        conductorBxDF = DiffuseBxDF::create(conductorParams);
-        dielectricBxDF = DiffuseBxDF::create(dielectricParams);
+        this_t retval;
+        retval.diffuseBxDF = DiffuseBxDF::create(diffuseParams);
+        retval.conductorBxDF = DiffuseBxDF::create(conductorParams);
+        retval.dielectricBxDF = DiffuseBxDF::create(dielectricParams);
+        return retval;
     }
 
-    static measure_type eval(NBL_CONST_REF_ARG(Material) material, NBL_CONST_REF_ARG(create_params_t) cparams, NBL_CONST_REF_ARG(params_t) params)
+    measure_type eval(NBL_CONST_REF_ARG(Material) material, NBL_CONST_REF_ARG(create_params_t) cparams, NBL_CONST_REF_ARG(params_t) params)
     {
         switch(material.type)
         {
-            case DIFFUSE:
+            case Material::Type::DIFFUSE:
             {
                 diffuseBxDF.init(cparams);
                 return (measure_type)diffuseBxDF.eval(params);
             }
             break;
-            case CONDUCTOR:
+            case Material::Type::CONDUCTOR:
             {
                 conductorBxDF.init(cparams);
                 return conductorBxDF.eval(params);
             }
             break;
-            case DIELECTRIC:
+            case Material::Type::DIELECTRIC:
             {
                 dielectricBxDF.init(cparams);
                 return dielectricBxDF.eval(params);
@@ -82,23 +84,23 @@ struct System
         }
     }
 
-    static vector3_type generate(NBL_CONST_REF_ARG(Material) material, NBL_CONST_REF_ARG(create_params_t) cparams, anisotropic_type interaction, NBL_CONST_REF_ARG(vector3_type) u, NBL_REF_ARG(anisocache_type) cache)
+    vector3_type generate(NBL_CONST_REF_ARG(Material) material, NBL_CONST_REF_ARG(create_params_t) cparams, anisotropic_type interaction, NBL_CONST_REF_ARG(vector3_type) u, NBL_REF_ARG(anisocache_type) cache)
     {
         switch(material.type)
         {
-            case DIFFUSE:
+            case Material::Type::DIFFUSE:
             {
                 diffuseBxDF.init(cparams);
                 return diffuseBxDF.generate(interaction, u.xy);
             }
             break;
-            case CONDUCTOR:
+            case Material::Type::CONDUCTOR:
             {
                 conductorBxDF.init(cparams);
                 return conductorBxDF.generate(interaction, u.xy, cache);
             }
             break;
-            case DIELECTRIC:
+            case Material::Type::DIELECTRIC:
             {
                 dielectricBxDF.init(cparams);
                 return dielectricBxDF.generate(interaction, u, cache);
@@ -109,26 +111,26 @@ struct System
         }
     }
 
-    static quotient_pdf_type quotient_and_pdf(NBL_CONST_REF_ARG(Material) material, NBL_CONST_REF_ARG(create_params_t) cparams, NBL_CONST_REF_ARG(params_t) params)
+    quotient_pdf_type quotient_and_pdf(NBL_CONST_REF_ARG(Material) material, NBL_CONST_REF_ARG(create_params_t) cparams, NBL_CONST_REF_ARG(params_t) params)
     {
         const float minimumProjVectorLen = 0.00000001;
         if (params.NdotV > minimumProjVectorLen && params.NdotL > minimumProjVectorLen)
         {
             switch(material.type)
             {
-                case DIFFUSE:
+                case Material::Type::DIFFUSE:
                 {
                     diffuseBxDF.init(cparams);
                     return diffuseBxDF.quotient_and_pdf(params);
                 }
                 break;
-                case CONDUCTOR:
+                case Material::Type::CONDUCTOR:
                 {
                     conductorBxDF.init(cparams);
                     return conductorBxDF.quotient_and_pdf(params);
                 }
                 break;
-                case DIELECTRIC:
+                case Material::Type::DIELECTRIC:
                 {
                     dielectricBxDF.init(cparams);
                     return dielectricBxDF.quotient_and_pdf(params);
diff --git a/31_HLSLPathTracer/app_resources/hlsl/next_event_estimator.hlsl b/31_HLSLPathTracer/app_resources/hlsl/next_event_estimator.hlsl
index 5695efc0d..c7573fbb3 100644
--- a/31_HLSLPathTracer/app_resources/hlsl/next_event_estimator.hlsl
+++ b/31_HLSLPathTracer/app_resources/hlsl/next_event_estimator.hlsl
@@ -12,7 +12,7 @@ namespace ext
 namespace NextEventEstimator
 {
 
-// procedural data store: [light count] [intersect type] [obj]
+// procedural data store: [light count] [event type] [obj]
 
 struct Event
 {
@@ -26,7 +26,7 @@ struct Event
     NBL_CONSTEXPR_STATIC_INLINE uint32_t DataSize = 16;
 
     uint32_t mode : 1;
-    unit32_t unused : 31;   // possible space for flags
+    uint32_t unused : 31;   // possible space for flags
     uint32_t data[DataSize];
 };
 
@@ -34,43 +34,44 @@ template<typename Light, typename Ray, class LightSample, class Aniso>
 struct Estimator
 {
     using scalar_type = typename Ray::scalar_type;
+    using vector3_type = vector<scalar_type, 3>;
     using ray_type = Ray;
     using light_type = Light;
     using spectral_type = typename Light::spectral_type;
     using interaction_type = Aniso;
-    using quotient_pdf_type = quotient_and_pdf<spectral_type, scalar_type>;
+    using quotient_pdf_type = bxdf::quotient_and_pdf<spectral_type, scalar_type>;
     using sample_type = LightSample;
 
     static spectral_type proceduralDeferredEvalAndPdf(NBL_REF_ARG(scalar_type) pdf, NBL_CONST_REF_ARG(light_type) light, NBL_CONST_REF_ARG(ray_type) ray, NBL_CONST_REF_ARG(Event) event)
     {
         const uint32_t lightCount = event.data[0];
-        const ProceduralShapeType type = event.data[1];
+        const ProceduralShapeType type = (ProceduralShapeType)event.data[1];
 
         pdf = 1.0 / lightCount;
         switch (type)
         {
             case PST_SPHERE:
             {
-                float32_t3 position = float32_t3(asfloat(intersect.data[2 + Shape<PST_SPHERE>::ObjSize]), asfloat(intersect.data[2 + Shape<PST_SPHERE>::ObjSize + 1]), asfloat(intersect.data[2 + Shape<PST_SPHERE>::ObjSize + 2]));
-                Shape<PST_SPHERE> sphere = Shape<PST_SPHERE>::create(position, asfloat(intersect.data[2 + Shape<PST_SPHERE>::ObjSize + 3]), intersect.data[2 + Shape<PST_SPHERE>::ObjSize + 4]);
+                vector3_type position = vector3_type(asfloat(event.data[2]), asfloat(event.data[3]), asfloat(event.data[4]));
+                Shape<PST_SPHERE> sphere = Shape<PST_SPHERE>::create(position, asfloat(event.data[5]), event.data[6]);
                 pdf *= sphere.template deferredPdf<ray_type>(ray);
             }
             break;
             case PST_TRIANGLE:
             {
-                float32_t3 vertex0 = float32_t3(asfloat(intersect.data[2 + Shape<PST_TRIANGLE>::ObjSize]), asfloat(intersect.data[2 + Shape<PST_SPHERE>::ObjSize + 1]), asfloat(intersect.data[2 + Shape<PST_TRIANGLE>::ObjSize + 2]));
-                float32_t3 vertex1 = float32_t3(asfloat(intersect.data[2 + Shape<PST_TRIANGLE>::ObjSize + 3]), asfloat(intersect.data[2 + Shape<PST_SPHERE>::ObjSize + 4]), asfloat(intersect.data[2 + Shape<PST_TRIANGLE>::ObjSize + 5]));
-                float32_t3 vertex2 = float32_t3(asfloat(intersect.data[2 + Shape<PST_TRIANGLE>::ObjSize + 6]), asfloat(intersect.data[2 + Shape<PST_SPHERE>::ObjSize + 7]), asfloat(intersect.data[2 + Shape<PST_TRIANGLE>::ObjSize + 8]));
-                Shape<PST_TRIANGLE> tri = Shape<PST_TRIANGLE>::create(vertex0, vertex1, vertex2, intersect.data[2 + Shape<PST_TRIANGLE>::ObjSize + 9]);
+                vector3_type vertex0 = vector3_type(asfloat(event.data[2]), asfloat(event.data[3]), asfloat(event.data[4]));
+                vector3_type vertex1 = vector3_type(asfloat(event.data[5]), asfloat(event.data[6]), asfloat(event.data[7]));
+                vector3_type vertex2 = vector3_type(asfloat(event.data[8]), asfloat(event.data[9]), asfloat(event.data[10]));
+                Shape<PST_TRIANGLE> tri = Shape<PST_TRIANGLE>::create(vertex0, vertex1, vertex2, event.data[11]);
                 pdf *= tri.template deferredPdf<ray_type>(ray);
             }
             break;
             case PST_RECTANGLE:
             {
-                float32_t3 offset = float32_t3(asfloat(intersect.data[2 + Shape<PST_RECTANGLE>::ObjSize]), asfloat(intersect.data[2 + Shape<PST_RECTANGLE>::ObjSize + 1]), asfloat(intersect.data[2 + Shape<PST_RECTANGLE>::ObjSize + 2]));
-                float32_t3 edge0 = float32_t3(asfloat(intersect.data[2 + Shape<PST_RECTANGLE>::ObjSize + 3]), asfloat(intersect.data[2 + Shape<PST_RECTANGLE>::ObjSize + 4]), asfloat(intersect.data[2 + Shape<PST_RECTANGLE>::ObjSize + 5]));
-                float32_t3 edge1 = float32_t3(asfloat(intersect.data[2 + Shape<PST_RECTANGLE>::ObjSize + 6]), asfloat(intersect.data[2 + Shape<PST_RECTANGLE>::ObjSize + 7]), asfloat(intersect.data[2 + Shape<PST_RECTANGLE>::ObjSize + 8]));
-                Shape<PST_RECTANGLE> rect = Shape<PST_RECTANGLE>::create(offset, edge0, edge1, intersect.data[2 + Shape<PST_RECTANGLE>::ObjSize + 9]);
+                vector3_type offset = vector3_type(asfloat(event.data[2]), asfloat(event.data[3]), asfloat(event.data[4]));
+                vector3_type edge0 = vector3_type(asfloat(event.data[5]), asfloat(event.data[6]), asfloat(event.data[7]));
+                vector3_type edge1 = vector3_type(asfloat(event.data[8]), asfloat(event.data[9]), asfloat(event.data[10]));
+                Shape<PST_RECTANGLE> rect = Shape<PST_RECTANGLE>::create(offset, edge0, edge1, event.data[11]);
                 pdf *= rect.template deferredPdf<ray_type>(ray);
             }
             break;
@@ -84,7 +85,7 @@ struct Estimator
 
     static spectral_type deferredEvalAndPdf(NBL_REF_ARG(scalar_type) pdf, NBL_CONST_REF_ARG(light_type) light, NBL_CONST_REF_ARG(ray_type) ray, NBL_CONST_REF_ARG(Event) event)
     {
-        const Event::Mode mode = event.mode;
+        const Event::Mode mode = (Event::Mode)event.mode;
         switch (mode)
         {
             case Event::Mode::RAY_QUERY:
@@ -107,10 +108,10 @@ struct Estimator
         }
     }
 
-    static sample_type procedural_generate_and_quotient_and_pdf(NBL_REF_ARG(quotient_pdf_type) quotient_pdf, NBL_REF_ARG(scalar_type) newRayMaxT, NBL_CONST_REF_ARG(vector3_type) origin, NBL_CONST_REF_ARG(interaction_type) interaction, bool isBSDF, NBL_CONST_REF_ARG(vector3_type) xi, unit32_t depth, NBL_CONST_REF_ARG(Event) event)
+    static sample_type procedural_generate_and_quotient_and_pdf(NBL_REF_ARG(quotient_pdf_type) quotient_pdf, NBL_REF_ARG(scalar_type) newRayMaxT, NBL_CONST_REF_ARG(light_type) light, NBL_CONST_REF_ARG(vector3_type) origin, NBL_CONST_REF_ARG(interaction_type) interaction, bool isBSDF, NBL_CONST_REF_ARG(vector3_type) xi, uint32_t depth, NBL_CONST_REF_ARG(Event) event)
     {
         const uint32_t lightCount = event.data[0];
-        const ProceduralShapeType type = event.data[1];
+        const ProceduralShapeType type = (ProceduralShapeType)event.data[1];
 
         sample_type L;
         scalar_type pdf;
@@ -118,26 +119,26 @@ struct Estimator
         {
             case PST_SPHERE:
             {
-                float32_t3 position = float32_t3(asfloat(intersect.data[2 + Shape<PST_SPHERE>::ObjSize]), asfloat(intersect.data[2 + Shape<PST_SPHERE>::ObjSize + 1]), asfloat(intersect.data[2 + Shape<PST_SPHERE>::ObjSize + 2]));
-                Shape<PST_SPHERE> sphere = Shape<PST_SPHERE>::create(position, asfloat(intersect.data[2 + Shape<PST_SPHERE>::ObjSize + 3]), intersect.data[2 + Shape<PST_SPHERE>::ObjSize + 4]);
+                vector3_type position = vector3_type(asfloat(event.data[2 + Shape<PST_SPHERE>::ObjSize]), asfloat(event.data[2 + Shape<PST_SPHERE>::ObjSize + 1]), asfloat(event.data[2 + Shape<PST_SPHERE>::ObjSize + 2]));
+                Shape<PST_SPHERE> sphere = Shape<PST_SPHERE>::create(position, asfloat(event.data[2 + Shape<PST_SPHERE>::ObjSize + 3]), event.data[2 + Shape<PST_SPHERE>::ObjSize + 4]);
                 L = sphere.template generate_and_pdf<interaction_type>(pdf, newRayMaxT, origin, interaction, isBSDF, xi);
             }
             break;
             case PST_TRIANGLE:
             {
-                float32_t3 vertex0 = float32_t3(asfloat(intersect.data[2 + Shape<PST_TRIANGLE>::ObjSize]), asfloat(intersect.data[2 + Shape<PST_SPHERE>::ObjSize + 1]), asfloat(intersect.data[2 + Shape<PST_TRIANGLE>::ObjSize + 2]));
-                float32_t3 vertex1 = float32_t3(asfloat(intersect.data[2 + Shape<PST_TRIANGLE>::ObjSize + 3]), asfloat(intersect.data[2 + Shape<PST_SPHERE>::ObjSize + 4]), asfloat(intersect.data[2 + Shape<PST_TRIANGLE>::ObjSize + 5]));
-                float32_t3 vertex2 = float32_t3(asfloat(intersect.data[2 + Shape<PST_TRIANGLE>::ObjSize + 6]), asfloat(intersect.data[2 + Shape<PST_SPHERE>::ObjSize + 7]), asfloat(intersect.data[2 + Shape<PST_TRIANGLE>::ObjSize + 8]));
-                Shape<PST_TRIANGLE> tri = Shape<PST_TRIANGLE>::create(vertex0, vertex1, vertex2, intersect.data[2 + Shape<PST_TRIANGLE>::ObjSize + 9]);
+                vector3_type vertex0 = vector3_type(asfloat(event.data[2 + Shape<PST_TRIANGLE>::ObjSize]), asfloat(event.data[2 + Shape<PST_SPHERE>::ObjSize + 1]), asfloat(event.data[2 + Shape<PST_TRIANGLE>::ObjSize + 2]));
+                vector3_type vertex1 = vector3_type(asfloat(event.data[2 + Shape<PST_TRIANGLE>::ObjSize + 3]), asfloat(event.data[2 + Shape<PST_SPHERE>::ObjSize + 4]), asfloat(event.data[2 + Shape<PST_TRIANGLE>::ObjSize + 5]));
+                vector3_type vertex2 = vector3_type(asfloat(event.data[2 + Shape<PST_TRIANGLE>::ObjSize + 6]), asfloat(event.data[2 + Shape<PST_SPHERE>::ObjSize + 7]), asfloat(event.data[2 + Shape<PST_TRIANGLE>::ObjSize + 8]));
+                Shape<PST_TRIANGLE> tri = Shape<PST_TRIANGLE>::create(vertex0, vertex1, vertex2, event.data[2 + Shape<PST_TRIANGLE>::ObjSize + 9]);
                 L = tri.template generate_and_pdf<interaction_type>(pdf, newRayMaxT, origin, interaction, isBSDF, xi);
             }
             break;
             case PST_RECTANGLE:
             {
-                float32_t3 offset = float32_t3(asfloat(intersect.data[2 + Shape<PST_RECTANGLE>::ObjSize]), asfloat(intersect.data[2 + Shape<PST_RECTANGLE>::ObjSize + 1]), asfloat(intersect.data[2 + Shape<PST_RECTANGLE>::ObjSize + 2]));
-                float32_t3 edge0 = float32_t3(asfloat(intersect.data[2 + Shape<PST_RECTANGLE>::ObjSize + 3]), asfloat(intersect.data[2 + Shape<PST_RECTANGLE>::ObjSize + 4]), asfloat(intersect.data[2 + Shape<PST_RECTANGLE>::ObjSize + 5]));
-                float32_t3 edge1 = float32_t3(asfloat(intersect.data[2 + Shape<PST_RECTANGLE>::ObjSize + 6]), asfloat(intersect.data[2 + Shape<PST_RECTANGLE>::ObjSize + 7]), asfloat(intersect.data[2 + Shape<PST_RECTANGLE>::ObjSize + 8]));
-                Shape<PST_RECTANGLE> rect = Shape<PST_RECTANGLE>::create(offset, edge0, edge1, intersect.data[2 + Shape<PST_RECTANGLE>::ObjSize + 9]);
+                vector3_type offset = vector3_type(asfloat(event.data[2 + Shape<PST_RECTANGLE>::ObjSize]), asfloat(event.data[2 + Shape<PST_RECTANGLE>::ObjSize + 1]), asfloat(event.data[2 + Shape<PST_RECTANGLE>::ObjSize + 2]));
+                vector3_type edge0 = vector3_type(asfloat(event.data[2 + Shape<PST_RECTANGLE>::ObjSize + 3]), asfloat(event.data[2 + Shape<PST_RECTANGLE>::ObjSize + 4]), asfloat(event.data[2 + Shape<PST_RECTANGLE>::ObjSize + 5]));
+                vector3_type edge1 = vector3_type(asfloat(event.data[2 + Shape<PST_RECTANGLE>::ObjSize + 6]), asfloat(event.data[2 + Shape<PST_RECTANGLE>::ObjSize + 7]), asfloat(event.data[2 + Shape<PST_RECTANGLE>::ObjSize + 8]));
+                Shape<PST_RECTANGLE> rect = Shape<PST_RECTANGLE>::create(offset, edge0, edge1, event.data[2 + Shape<PST_RECTANGLE>::ObjSize + 9]);
                 L = rect.template generate_and_pdf<interaction_type>(pdf, newRayMaxT, origin, interaction, isBSDF, xi);
             }
             break;
@@ -154,9 +155,9 @@ struct Estimator
         return L;
     }
 
-    static sample_type generate_and_quotient_and_pdf(NBL_REF_ARG(quotient_pdf_type) quotient_pdf, NBL_REF_ARG(scalar_type) newRayMaxT, NBL_CONST_REF_ARG(vector3_type) origin, NBL_CONST_REF_ARG(interaction_type) interaction, bool isBSDF, NBL_CONST_REF_ARG(vector3_type) xi, unit32_t depth, NBL_CONST_REF_ARG(Event) event)
+    static sample_type generate_and_quotient_and_pdf(NBL_REF_ARG(quotient_pdf_type) quotient_pdf, NBL_REF_ARG(scalar_type) newRayMaxT, NBL_CONST_REF_ARG(light_type) light, NBL_CONST_REF_ARG(vector3_type) origin, NBL_CONST_REF_ARG(interaction_type) interaction, bool isBSDF, NBL_CONST_REF_ARG(vector3_type) xi, uint32_t depth, NBL_CONST_REF_ARG(Event) event)
     {
-        const Event::Mode mode = event.mode;
+        const Event::Mode mode = (Event::Mode)event.mode;
         switch (mode)
         {
             case Event::Mode::RAY_QUERY:
@@ -171,7 +172,7 @@ struct Estimator
             break;
             case Event::Mode::PROCEDURAL:
             {
-                return procedural_generate_and_quotient_and_pdf(newRayMaxT, origin, interaction, isBSDF, xi, depth, event);
+                return procedural_generate_and_quotient_and_pdf(quotient_pdf, newRayMaxT, light, origin, interaction, isBSDF, xi, depth, event);
             }
             break;
             default:
diff --git a/31_HLSLPathTracer/app_resources/hlsl/pathtracer.hlsl b/31_HLSLPathTracer/app_resources/hlsl/pathtracer.hlsl
index b14c9baae..a740ec388 100644
--- a/31_HLSLPathTracer/app_resources/hlsl/pathtracer.hlsl
+++ b/31_HLSLPathTracer/app_resources/hlsl/pathtracer.hlsl
@@ -10,6 +10,7 @@
 #include "intersector.hlsl"
 #include "material_system.hlsl"
 #include "next_event_estimator.hlsl"
+#include "scene.hlsl"
 
 namespace nbl
 {
@@ -170,8 +171,8 @@ struct Unidirectional
             scalar_type t;
             sample_type nee_sample = nee.generate_and_quotient_and_pdf(
                 neeContrib_pdf, t,
-                intersection, interaction,
-                isBSDF, eps0, depth
+                lights[lightID], intersection, interaction,
+                isBSDF, eps0, depth, scene.toNextEvent(lightID)
             );
 
             // We don't allow non watertight transmitters in this renderer
@@ -236,7 +237,7 @@ struct Unidirectional
                     nee_ray.origin = intersection + nee_sample.L.direction * t * Tolerance<scalar_type>::getStart(depth);
                     nee_ray.direction = nee_sample.L.direction;
                     nee_ray.intersectionT = t;
-                    if (bsdf_quotient_pdf.pdf < numeric_limits<scalar_type>::max && getLuma(neeContrib_pdf.quotient) > lumaContributionThreshold && intersector.traceRay(nee_ray, scene).id == -1)
+                    if (bsdf_quotient_pdf.pdf < numeric_limits<scalar_type>::max && getLuma(neeContrib_pdf.quotient) > lumaContributionThreshold && intersector::traceRay(nee_ray, scene).id == -1)
                         ray._payload.accumulation += neeContrib_pdf.quotient;
                 }
             }
@@ -338,7 +339,7 @@ struct Unidirectional
             for (int d = 1; d <= depth && hit && rayAlive; d += 2)
             {
                 ray.intersectionT = numeric_limits<scalar_type>::max;
-                ray.objectID = intersector.traceRay(ray, scene);
+                ray.objectID = intersector::traceRay(ray, scene);
 
                 hit = ray.objectID.id != -1;
                 if (hit)
diff --git a/31_HLSLPathTracer/app_resources/hlsl/render.comp.hlsl b/31_HLSLPathTracer/app_resources/hlsl/render.comp.hlsl
index 1c8c15ec4..f9558c3d1 100644
--- a/31_HLSLPathTracer/app_resources/hlsl/render.comp.hlsl
+++ b/31_HLSLPathTracer/app_resources/hlsl/render.comp.hlsl
@@ -167,6 +167,10 @@ void main(uint32_t3 threadID : SV_DispatchThreadID)
     ptCreateParams.NDC = NDC;
     ptCreateParams.invMVP = pc.invMVP;
 
+    ptCreateParams.diffuseParams = bxdfs[0].params;
+    ptCreateParams.conductorParams = bxdfs[3].params;
+    ptCreateParams.dielectricParams = bxdfs[6].params;
+
     pathtracer_type pathtracer = pathtracer_type::create(ptCreateParams, samplerSequence);
 
     // set up scene (can do as global var?)
diff --git a/31_HLSLPathTracer/app_resources/hlsl/scene.hlsl b/31_HLSLPathTracer/app_resources/hlsl/scene.hlsl
index cbc9d153c..fc10d906c 100644
--- a/31_HLSLPathTracer/app_resources/hlsl/scene.hlsl
+++ b/31_HLSLPathTracer/app_resources/hlsl/scene.hlsl
@@ -129,41 +129,41 @@ struct Scene
             case PST_SPHERE:
             {
                 Shape<PST_SPHERE> sphere = spheres[id];
-                retval.data[2 + Shape<PST_SPHERE>::ObjSize] = asuint(sphere.position.x);
-                retval.data[2 + Shape<PST_SPHERE>::ObjSize + 1] = asuint(sphere.position.y);
-                retval.data[2 + Shape<PST_SPHERE>::ObjSize + 2] = asuint(sphere.position.z);
-                retval.data[2 + Shape<PST_SPHERE>::ObjSize + 3] = asuint(sphere.radius);
-                retval.data[2 + Shape<PST_SPHERE>::ObjSize + 4] = sphere.bsdfLightIDs;
+                retval.data[2] = asuint(sphere.position.x);
+                retval.data[3] = asuint(sphere.position.y);
+                retval.data[4] = asuint(sphere.position.z);
+                retval.data[5] = asuint(sphere.radius);
+                retval.data[6] = sphere.bsdfLightIDs;
             }
             break;
             case PST_TRIANGLE:
             {
                 Shape<PST_TRIANGLE> tri = triangles[id];
-                retval.data[2 + Shape<PST_TRIANGLE>::ObjSize] = asuint(tri.vertex0.x);
-                retval.data[2 + Shape<PST_TRIANGLE>::ObjSize + 1] = asuint(tri.vertex0.y);
-                retval.data[2 + Shape<PST_TRIANGLE>::ObjSize + 2] = asuint(tri.vertex0.z);
-                retval.data[2 + Shape<PST_TRIANGLE>::ObjSize + 3] = asuint(tri.vertex1.x);
-                retval.data[2 + Shape<PST_TRIANGLE>::ObjSize + 4] = asuint(tri.vertex1.y);
-                retval.data[2 + Shape<PST_TRIANGLE>::ObjSize + 5] = asuint(tri.vertex1.z);
-                retval.data[2 + Shape<PST_TRIANGLE>::ObjSize + 6] = asuint(tri.vertex2.x);
-                retval.data[2 + Shape<PST_TRIANGLE>::ObjSize + 7] = asuint(tri.vertex2.y);
-                retval.data[2 + Shape<PST_TRIANGLE>::ObjSize + 8] = asuint(tri.vertex2.z);
-                retval.data[2 + Shape<PST_TRIANGLE>::ObjSize + 9] = tri.bsdfLightIDs;
+                retval.data[2] = asuint(tri.vertex0.x);
+                retval.data[3] = asuint(tri.vertex0.y);
+                retval.data[4] = asuint(tri.vertex0.z);
+                retval.data[5] = asuint(tri.vertex1.x);
+                retval.data[6] = asuint(tri.vertex1.y);
+                retval.data[7] = asuint(tri.vertex1.z);
+                retval.data[8] = asuint(tri.vertex2.x);
+                retval.data[9] = asuint(tri.vertex2.y);
+                retval.data[10] = asuint(tri.vertex2.z);
+                retval.data[11] = tri.bsdfLightIDs;
             }
             break;
             case PST_RECTANGLE:
             {
                 Shape<PST_RECTANGLE> rect = rectangles[id];
-                retval.data[2 + Shape<PST_RECTANGLE>::ObjSize] = asuint(rect.offset.x);
-                retval.data[2 + Shape<PST_RECTANGLE>::ObjSize + 1] = asuint(rect.offset.y);
-                retval.data[2 + Shape<PST_RECTANGLE>::ObjSize + 2] = asuint(rect.offset.z);
-                retval.data[2 + Shape<PST_RECTANGLE>::ObjSize + 3] = asuint(rect.edge0.x);
-                retval.data[2 + Shape<PST_RECTANGLE>::ObjSize + 4] = asuint(rect.edge0.y);
-                retval.data[2 + Shape<PST_RECTANGLE>::ObjSize + 5] = asuint(rect.edge0.z);
-                retval.data[2 + Shape<PST_RECTANGLE>::ObjSize + 6] = asuint(rect.edge1.x);
-                retval.data[2 + Shape<PST_RECTANGLE>::ObjSize + 7] = asuint(rect.edge1.y);
-                retval.data[2 + Shape<PST_RECTANGLE>::ObjSize + 8] = asuint(rect.edge1.z);
-                retval.data[2 + Shape<PST_RECTANGLE>::ObjSize + 9] = rect.bsdfLightIDs;
+                retval.data[2] = asuint(rect.offset.x);
+                retval.data[3] = asuint(rect.offset.y);
+                retval.data[4] = asuint(rect.offset.z);
+                retval.data[5] = asuint(rect.edge0.x);
+                retval.data[6] = asuint(rect.edge0.y);
+                retval.data[7] = asuint(rect.edge0.z);
+                retval.data[8] = asuint(rect.edge1.x);
+                retval.data[9] = asuint(rect.edge1.y);
+                retval.data[10] = asuint(rect.edge1.z);
+                retval.data[11] = rect.bsdfLightIDs;
             }
             break;
             default:
diff --git a/31_HLSLPathTracer/main.cpp b/31_HLSLPathTracer/main.cpp
index 13aa59823..5aff6bde7 100644
--- a/31_HLSLPathTracer/main.cpp
+++ b/31_HLSLPathTracer/main.cpp
@@ -369,8 +369,8 @@ class ComputeShaderPathtracer final : public examples::SimpleWindowedApplication
 					options.preprocessorOptions.logger = m_logger.get();
 					options.preprocessorOptions.includeFinder = compiler->getDefaultIncludeFinder();
 
-					//std::string dxcOptionStr[] = { "-D" + defineMacro };
-					//options.dxcOptions = std::span(dxcOptionStr);
+					std::string dxcOptionStr[] = { "-D" + defineMacro };
+					options.dxcOptions = std::span(dxcOptionStr);
 
 					source = compiler->compileToSPIRV((const char*)source->getContent()->getPointer(), options);
 				};

From a802a97943bd9e17187a306f8058c21d2774678b Mon Sep 17 00:00:00 2001
From: keptsecret <sorchon@gmail.com>
Date: Fri, 21 Feb 2025 16:57:15 +0700
Subject: [PATCH 055/529] bug fixes #3

---
 .../app_resources/hlsl/common.hlsl            |  7 ++++---
 .../hlsl/next_event_estimator.hlsl            | 20 +++++++++----------
 .../app_resources/hlsl/pathtracer.hlsl        | 16 +++++++--------
 .../app_resources/hlsl/render.comp.hlsl       | 10 +++++-----
 .../app_resources/hlsl/scene.hlsl             | 18 ++++++++---------
 5 files changed, 36 insertions(+), 35 deletions(-)

diff --git a/31_HLSLPathTracer/app_resources/hlsl/common.hlsl b/31_HLSLPathTracer/app_resources/hlsl/common.hlsl
index 1b0aac72f..f12b72b5d 100644
--- a/31_HLSLPathTracer/app_resources/hlsl/common.hlsl
+++ b/31_HLSLPathTracer/app_resources/hlsl/common.hlsl
@@ -3,6 +3,7 @@
 
 #include <nbl/builtin/hlsl/spirv_intrinsics/core.hlsl>
 #include <nbl/builtin/hlsl/spirv_intrinsics/glsl.std.450.hlsl>
+#include <nbl/builtin/hlsl/glsl_compat/core.hlsl>
 #include <nbl/builtin/hlsl/numbers.hlsl>
 #include <nbl/builtin/hlsl/shapes/triangle.hlsl>
 #include <nbl/builtin/hlsl/shapes/rectangle.hlsl>
@@ -134,7 +135,7 @@ struct Shape<PST_SPHERE>
 
     static Shape<PST_SPHERE> create(NBL_CONST_REF_ARG(float32_t3) position, float32_t radius, uint32_t bsdfID, uint32_t lightID)
     {
-        uint32_t bsdfLightIDs = spirv::bitFieldInsert<uint32_t>(bsdfID, lightID, 16, 16);
+        uint32_t bsdfLightIDs = glsl::bitfieldInsert<uint32_t>(bsdfID, lightID, 16, 16);
         return create(position, radius, bsdfLightIDs);
     }
 
@@ -226,7 +227,7 @@ struct Shape<PST_TRIANGLE>
 
     static Shape<PST_TRIANGLE> create(NBL_CONST_REF_ARG(float32_t3) vertex0, NBL_CONST_REF_ARG(float32_t3) vertex1, NBL_CONST_REF_ARG(float32_t3) vertex2, uint32_t bsdfID, uint32_t lightID)
     {
-        uint32_t bsdfLightIDs = spirv::bitFieldInsert<uint32_t>(bsdfID, lightID, 16, 16);
+        uint32_t bsdfLightIDs = glsl::bitfieldInsert<uint32_t>(bsdfID, lightID, 16, 16);
         return create(vertex0, vertex1, vertex2, bsdfLightIDs);
     }
 
@@ -374,7 +375,7 @@ struct Shape<PST_RECTANGLE>
 
     static Shape<PST_RECTANGLE> create(NBL_CONST_REF_ARG(float32_t3) offset, NBL_CONST_REF_ARG(float32_t3) edge0, NBL_CONST_REF_ARG(float32_t3) edge1, uint32_t bsdfID, uint32_t lightID)
     {
-        uint32_t bsdfLightIDs = spirv::bitFieldInsert<uint32_t>(bsdfID, lightID, 16, 16);
+        uint32_t bsdfLightIDs = glsl::bitfieldInsert<uint32_t>(bsdfID, lightID, 16, 16);
         return create(offset, edge0, edge1, bsdfLightIDs);
     }
 
diff --git a/31_HLSLPathTracer/app_resources/hlsl/next_event_estimator.hlsl b/31_HLSLPathTracer/app_resources/hlsl/next_event_estimator.hlsl
index c7573fbb3..32a7b7476 100644
--- a/31_HLSLPathTracer/app_resources/hlsl/next_event_estimator.hlsl
+++ b/31_HLSLPathTracer/app_resources/hlsl/next_event_estimator.hlsl
@@ -119,26 +119,26 @@ struct Estimator
         {
             case PST_SPHERE:
             {
-                vector3_type position = vector3_type(asfloat(event.data[2 + Shape<PST_SPHERE>::ObjSize]), asfloat(event.data[2 + Shape<PST_SPHERE>::ObjSize + 1]), asfloat(event.data[2 + Shape<PST_SPHERE>::ObjSize + 2]));
-                Shape<PST_SPHERE> sphere = Shape<PST_SPHERE>::create(position, asfloat(event.data[2 + Shape<PST_SPHERE>::ObjSize + 3]), event.data[2 + Shape<PST_SPHERE>::ObjSize + 4]);
+                vector3_type position = vector3_type(asfloat(event.data[2]), asfloat(event.data[3]), asfloat(event.data[4]));
+                Shape<PST_SPHERE> sphere = Shape<PST_SPHERE>::create(position, asfloat(event.data[5]), event.data[6]);
                 L = sphere.template generate_and_pdf<interaction_type>(pdf, newRayMaxT, origin, interaction, isBSDF, xi);
             }
             break;
             case PST_TRIANGLE:
             {
-                vector3_type vertex0 = vector3_type(asfloat(event.data[2 + Shape<PST_TRIANGLE>::ObjSize]), asfloat(event.data[2 + Shape<PST_SPHERE>::ObjSize + 1]), asfloat(event.data[2 + Shape<PST_TRIANGLE>::ObjSize + 2]));
-                vector3_type vertex1 = vector3_type(asfloat(event.data[2 + Shape<PST_TRIANGLE>::ObjSize + 3]), asfloat(event.data[2 + Shape<PST_SPHERE>::ObjSize + 4]), asfloat(event.data[2 + Shape<PST_TRIANGLE>::ObjSize + 5]));
-                vector3_type vertex2 = vector3_type(asfloat(event.data[2 + Shape<PST_TRIANGLE>::ObjSize + 6]), asfloat(event.data[2 + Shape<PST_SPHERE>::ObjSize + 7]), asfloat(event.data[2 + Shape<PST_TRIANGLE>::ObjSize + 8]));
-                Shape<PST_TRIANGLE> tri = Shape<PST_TRIANGLE>::create(vertex0, vertex1, vertex2, event.data[2 + Shape<PST_TRIANGLE>::ObjSize + 9]);
+                vector3_type vertex0 = vector3_type(asfloat(event.data[2]), asfloat(event.data[3]), asfloat(event.data[4]));
+                vector3_type vertex1 = vector3_type(asfloat(event.data[5]), asfloat(event.data[6]), asfloat(event.data[7]));
+                vector3_type vertex2 = vector3_type(asfloat(event.data[8]), asfloat(event.data[9]), asfloat(event.data[10]));
+                Shape<PST_TRIANGLE> tri = Shape<PST_TRIANGLE>::create(vertex0, vertex1, vertex2, event.data[11]);
                 L = tri.template generate_and_pdf<interaction_type>(pdf, newRayMaxT, origin, interaction, isBSDF, xi);
             }
             break;
             case PST_RECTANGLE:
             {
-                vector3_type offset = vector3_type(asfloat(event.data[2 + Shape<PST_RECTANGLE>::ObjSize]), asfloat(event.data[2 + Shape<PST_RECTANGLE>::ObjSize + 1]), asfloat(event.data[2 + Shape<PST_RECTANGLE>::ObjSize + 2]));
-                vector3_type edge0 = vector3_type(asfloat(event.data[2 + Shape<PST_RECTANGLE>::ObjSize + 3]), asfloat(event.data[2 + Shape<PST_RECTANGLE>::ObjSize + 4]), asfloat(event.data[2 + Shape<PST_RECTANGLE>::ObjSize + 5]));
-                vector3_type edge1 = vector3_type(asfloat(event.data[2 + Shape<PST_RECTANGLE>::ObjSize + 6]), asfloat(event.data[2 + Shape<PST_RECTANGLE>::ObjSize + 7]), asfloat(event.data[2 + Shape<PST_RECTANGLE>::ObjSize + 8]));
-                Shape<PST_RECTANGLE> rect = Shape<PST_RECTANGLE>::create(offset, edge0, edge1, event.data[2 + Shape<PST_RECTANGLE>::ObjSize + 9]);
+                vector3_type offset = vector3_type(asfloat(event.data[2]), asfloat(event.data[3]), asfloat(event.data[4]));
+                vector3_type edge0 = vector3_type(asfloat(event.data[5]), asfloat(event.data[6]), asfloat(event.data[7]));
+                vector3_type edge1 = vector3_type(asfloat(event.data[8]), asfloat(event.data[9]), asfloat(event.data[10]));
+                Shape<PST_RECTANGLE> rect = Shape<PST_RECTANGLE>::create(offset, edge0, edge1, event.data[11]);
                 L = rect.template generate_and_pdf<interaction_type>(pdf, newRayMaxT, origin, interaction, isBSDF, xi);
             }
             break;
diff --git a/31_HLSLPathTracer/app_resources/hlsl/pathtracer.hlsl b/31_HLSLPathTracer/app_resources/hlsl/pathtracer.hlsl
index a740ec388..c47f24753 100644
--- a/31_HLSLPathTracer/app_resources/hlsl/pathtracer.hlsl
+++ b/31_HLSLPathTracer/app_resources/hlsl/pathtracer.hlsl
@@ -82,15 +82,15 @@ struct Unidirectional
         this_t retval;
         retval.randGen = randgen_type::create(params.rngState);
         retval.rayGen = raygen_type::create(params.pixOffsetParam, params.camPos, params.NDC, params.invMVP);
-        retval.materialSystem = material_system_type::create(diffuseParams, conductorParams, dielectricParams);
+        retval.materialSystem = material_system_type::create(params.diffuseParams, params.conductorParams, params.dielectricParams);
         retval.samplerSequence = samplerSequence;
         return retval;
     }
 
     vector3_type rand3d(uint32_t protoDimension, uint32_t _sample, uint32_t i)
     {
-        uint32_t address = spirv::bitfieldInsert(protoDimension, _sample, MAX_DEPTH_LOG2, MAX_SAMPLES_LOG2);
-	    unit32_t3 seqVal = texelFetch(sampleSequence, int(address) + i).xyz;
+        uint32_t address = glsl::bitfieldInsert<uint32_t>(protoDimension, _sample, MAX_DEPTH_LOG2, MAX_SAMPLES_LOG2);
+	    uint32_t3 seqVal = texelFetch(sampleSequence, int(address) + i).xyz;
 	    seqVal ^= randGen();
         return vector3_type(seqVal) * asfloat(0x2f800004u);
     }
@@ -101,7 +101,7 @@ struct Unidirectional
     }
 
     // TODO: probably will only work with procedural shapes, do the other ones
-    bool closestHitProgram(unit32_t depth, uint32_t _sample, NBL_REF_ARG(ray_type) ray, NBL_CONST_REF_ARG(scene_type) scene)
+    bool closestHitProgram(uint32_t depth, uint32_t _sample, NBL_REF_ARG(ray_type) ray, NBL_CONST_REF_ARG(scene_type) scene)
     {
         const uint32_t objectID = ray.objectID;
         const vector3_type intersection = ray.origin + ray.direction * ray.intersectionT;
@@ -117,8 +117,8 @@ struct Unidirectional
                 break;
             case ext::Intersector::IntersectData::Mode::PROCEDURAL:
             {
-                bsdfLightIDs = scene.getBsdfLightIDs(objectID.id);
-                vector3_type N = scene.getNormal(objectID.id)
+                bsdfLightIDs = scene.getBsdfLightIDs(objectID);
+                vector3_type N = scene.getNormal(objectID);
                 N = nbl::hlsl::normalize(N);
                 typename isotropic_type::ray_dir_info_type V;
                 V.direction = nbl::hlsl::normalize(-ray.direction);
@@ -133,14 +133,14 @@ struct Unidirectional
         vector3_type throughput = ray.payload.throughput;
 
         // emissive
-        const uint32_t lightID = spirv::bitfieldExtract(bsdfLightIDs, 16, 16);
+        const uint32_t lightID = glsl::bitfieldExtract(bsdfLightIDs, 16, 16);
         if (lightID != light_type::INVALID_ID)
         {
             float pdf;
             ray.payload.accumulation += nee.deferredEvalAndPdf(pdf, lights[lightID], ray, scene.toNextEvent(lightID)) * throughput / (1.0 + pdf * pdf * ray.payload.otherTechniqueHeuristic);
         }
 
-        const uint32_t bsdfID = spirv::bitfieldExtract(bsdfLightIDs, 0, 16);
+        const uint32_t bsdfID = glsl::bitfieldExtract(bsdfLightIDs, 0, 16);
         if (bsdfID == bxdfnode_type::INVALID_ID)
             return false;
 
diff --git a/31_HLSLPathTracer/app_resources/hlsl/render.comp.hlsl b/31_HLSLPathTracer/app_resources/hlsl/render.comp.hlsl
index f9558c3d1..4143b973d 100644
--- a/31_HLSLPathTracer/app_resources/hlsl/render.comp.hlsl
+++ b/31_HLSLPathTracer/app_resources/hlsl/render.comp.hlsl
@@ -80,7 +80,7 @@ using material_system_type = ext::MaterialSystem::System<diffuse_bxdf_type, cond
 using nee_type = ext::NextEventEstimator::Estimator<light_type, ray_type, sample_t, aniso_interaction>;
 using pathtracer_type = ext::PathTracer::Unidirectional<randgen_type, raygen_type, intersector_type, material_system_type, nee_type>;
 
-Shape<PST_SPHERE> spheres[SPHERE_COUNT] = {
+static const Shape<PST_SPHERE> spheres[SPHERE_COUNT] = {
     Shape<PST_SPHERE>::create(float3(0.0, -100.5, -1.0), 100.0, 0u, light_type::INVALID_ID),
     Shape<PST_SPHERE>::create(float3(2.0, 0.0, -1.0), 0.5, 1u, light_type::INVALID_ID),
     Shape<PST_SPHERE>::create(float3(0.0, 0.0, -1.0), 0.5, 2u, light_type::INVALID_ID),
@@ -97,7 +97,7 @@ Shape<PST_SPHERE> spheres[SPHERE_COUNT] = {
 #ifdef TRIANGLE_LIGHT
 #define LIGHT_TYPE PST_TRIANGLE
 #define TRIANGLE_COUNT 1
-Shape<PST_TRIANGLE> triangles[TRIANGLE_COUNT] = {
+static const Shape<PST_TRIANGLE> triangles[TRIANGLE_COUNT] = {
     Shape<PST_TRIANGLE>::create(float3(-1.8,0.35,0.3) * 10.0, float3(-1.2,0.35,0.0) * 10.0, float3(-1.5,0.8,-0.3) * 10.0, bxdfnode_type::INVALID_ID, 0u)
 };
 #endif
@@ -105,18 +105,18 @@ Shape<PST_TRIANGLE> triangles[TRIANGLE_COUNT] = {
 #ifdef RECTANGLE_LIGHT
 #define LIGHT_TYPE PST_RECTANGLE
 #define RECTANGLE_COUNT 1
-Shape<PST_RECTANGLE> rectangles[RECTANGLE_COUNT] = {
+static const Shape<PST_RECTANGLE> rectangles[RECTANGLE_COUNT] = {
     Shape<PST_RECTANGLE>::create(float3(-3.8,0.35,1.3), normalize(float3(2,0,-1))*7.0, normalize(float3(2,-5,4))*0.1, bxdfnode_type::INVALID_ID, 0u)
 };
 #endif
 
 #define LIGHT_COUNT 1
-light_type lights[LIGHT_COUNT] = {
+static const light_type lights[LIGHT_COUNT] = {
     light_type(spectral_t(30.0,25.0,15.0), ext::ObjectID(8u, ext::NextEventEstimator::Event::Mode::PROCEDURAL, LIGHT_TYPE))
 };
 
 #define BXDF_COUNT 7
-bxdfnode_type bxdfs[BXDF_COUNT] = {
+static const bxdfnode_type bxdfs[BXDF_COUNT] = {
     bxdfnode_type(ext::MaterialSystem::Material::Type::DIFFUSE, create_params_t(false, float2(0,0), spectral_t(1,1,1), spectral_t(1.25,1.25,1.25))),
     bxdfnode_type(ext::MaterialSystem::Material::Type::DIFFUSE, create_params_t(false, float2(0,0), spectral_t(1,1,1), spectral_t(1.25,2.5,2.5))),
     bxdfnode_type(ext::MaterialSystem::Material::Type::DIFFUSE, create_params_t(false, float2(0,0), spectral_t(1,1,1), spectral_t(2.5,1.25,2.5))),
diff --git a/31_HLSLPathTracer/app_resources/hlsl/scene.hlsl b/31_HLSLPathTracer/app_resources/hlsl/scene.hlsl
index fc10d906c..88940c54d 100644
--- a/31_HLSLPathTracer/app_resources/hlsl/scene.hlsl
+++ b/31_HLSLPathTracer/app_resources/hlsl/scene.hlsl
@@ -65,7 +65,7 @@ struct Scene
                     retval.data[2 + i * Shape<PST_SPHERE>::ObjSize] = asuint(sphere.position.x);
                     retval.data[2 + i * Shape<PST_SPHERE>::ObjSize + 1] = asuint(sphere.position.y);
                     retval.data[2 + i * Shape<PST_SPHERE>::ObjSize + 2] = asuint(sphere.position.z);
-                    retval.data[2 + i * Shape<PST_SPHERE>::ObjSize + 3] = asuint(sphere.radius);
+                    retval.data[2 + i * Shape<PST_SPHERE>::ObjSize + 3] = asuint(sphere.radius2);
                     retval.data[2 + i * Shape<PST_SPHERE>::ObjSize + 4] = sphere.bsdfLightIDs;
                 }
             }
@@ -174,18 +174,18 @@ struct Scene
     }
 
     // TODO: get these to work with AS types as well
-    uint32_t getBsdfLightIDs(uint32_t id)
+    uint32_t getBsdfLightIDs(NBL_CONST_REF_ARG(ObjectID) objectID)
     {
-        return (objectID.type == PST_SPHERE) ? spheres[id].bsdfLightIDs :
-                (objectID.type == PST_TRIANGLE) ? triangles[id].bsdfLightIDs :
-                (objectID.type == PST_RECTANGLE) ? rectangles[id].bsdfLightIDs : -1;
+        return (objectID.type == PST_SPHERE) ? spheres[objectID.id].bsdfLightIDs :
+                (objectID.type == PST_TRIANGLE) ? triangles[objectID.id].bsdfLightIDs :
+                (objectID.type == PST_RECTANGLE) ? rectangles[objectID.id].bsdfLightIDs : -1;
     }
 
-    float32_t3 getNormal(uint32_t id, NBL_CONST_REF_ARG(float32_t3) intersection)
+    float32_t3 getNormal(NBL_CONST_REF_ARG(ObjectID) objectID, NBL_CONST_REF_ARG(float32_t3) intersection)
     {
-        return (objectID.type == PST_SPHERE) ? scene.spheres[id].getNormal(intersection) :
-                (objectID.type == PST_TRIANGLE) ? scene.triangles[id].getNormalTimesArea() :
-                (objectID.type == PST_RECTANGLE) ? scene.rectangles[id].getNormalTimesArea() :
+        return (objectID.type == PST_SPHERE) ? scene.spheres[objectID.id].getNormal(intersection) :
+                (objectID.type == PST_TRIANGLE) ? scene.triangles[objectID.id].getNormalTimesArea() :
+                (objectID.type == PST_RECTANGLE) ? scene.rectangles[objectID.id].getNormalTimesArea() :
                 (float32_t3)0.0;
     }
 };

From eed47e73c53be25cb9be67924ca0d075897b64bc Mon Sep 17 00:00:00 2001
From: keptsecret <sorchon@gmail.com>
Date: Mon, 24 Feb 2025 10:41:08 +0700
Subject: [PATCH 056/529] fix include when embed resources off

---
 31_HLSLPathTracer/app_resources/glsl/litByRectangle.comp | 2 +-
 31_HLSLPathTracer/app_resources/glsl/litBySphere.comp    | 2 +-
 31_HLSLPathTracer/app_resources/glsl/litByTriangle.comp  | 2 +-
 3 files changed, 3 insertions(+), 3 deletions(-)

diff --git a/31_HLSLPathTracer/app_resources/glsl/litByRectangle.comp b/31_HLSLPathTracer/app_resources/glsl/litByRectangle.comp
index 300cef559..d898655c4 100644
--- a/31_HLSLPathTracer/app_resources/glsl/litByRectangle.comp
+++ b/31_HLSLPathTracer/app_resources/glsl/litByRectangle.comp
@@ -7,7 +7,7 @@
 
 #define SPHERE_COUNT 8
 #define POLYGON_METHOD 1 // 0 area sampling, 1 solid angle sampling, 2 approximate projected solid angle sampling
-#include "common.glsl"
+#include "app_resources/glsl/common.glsl"
 
 #define RECTANGLE_COUNT 1
 const vec3 edge0 = normalize(vec3(2,0,-1));
diff --git a/31_HLSLPathTracer/app_resources/glsl/litBySphere.comp b/31_HLSLPathTracer/app_resources/glsl/litBySphere.comp
index bd1a48575..c8ebb9f08 100644
--- a/31_HLSLPathTracer/app_resources/glsl/litBySphere.comp
+++ b/31_HLSLPathTracer/app_resources/glsl/litBySphere.comp
@@ -6,7 +6,7 @@
 #extension GL_GOOGLE_include_directive : require
 
 #define SPHERE_COUNT 9
-#include "common.glsl"
+#include "app_resources/glsl/common.glsl"
 
 
 void traceRay_extraShape(inout int objectID, inout float intersectionT, in vec3 origin, in vec3 direction)
diff --git a/31_HLSLPathTracer/app_resources/glsl/litByTriangle.comp b/31_HLSLPathTracer/app_resources/glsl/litByTriangle.comp
index ba23c82e5..36fe522f2 100644
--- a/31_HLSLPathTracer/app_resources/glsl/litByTriangle.comp
+++ b/31_HLSLPathTracer/app_resources/glsl/litByTriangle.comp
@@ -7,7 +7,7 @@
 
 #define SPHERE_COUNT 8
 #define POLYGON_METHOD 1 // 0 area sampling, 1 solid angle sampling, 2 approximate projected solid angle sampling
-#include "common.glsl"
+#include "app_resources/glsl/common.glsl"
 
 #define TRIANGLE_COUNT 1
 Triangle triangles[TRIANGLE_COUNT] = {

From 6e26dae254d190ea66e812fa0789e958716edacc Mon Sep 17 00:00:00 2001
From: keptsecret <sorchon@gmail.com>
Date: Mon, 24 Feb 2025 17:11:42 +0700
Subject: [PATCH 057/529] fixed more bugs #4

---
 .../app_resources/hlsl/common.hlsl            |   2 +-
 .../app_resources/hlsl/pathtracer.hlsl        |  35 +--
 .../app_resources/hlsl/render.comp.hlsl       | 273 +++++++++---------
 .../app_resources/hlsl/scene.hlsl             |  18 +-
 4 files changed, 165 insertions(+), 163 deletions(-)

diff --git a/31_HLSLPathTracer/app_resources/hlsl/common.hlsl b/31_HLSLPathTracer/app_resources/hlsl/common.hlsl
index f12b72b5d..cd2310fbf 100644
--- a/31_HLSLPathTracer/app_resources/hlsl/common.hlsl
+++ b/31_HLSLPathTracer/app_resources/hlsl/common.hlsl
@@ -184,7 +184,7 @@ struct Shape<PST_SPHERE>
             Z *= rcpDistance;
         
             const float cosThetaMax = nbl::hlsl::sqrt(cosThetaMax2);
-            const float cosTheta = nbl::hlsl::mix(1.0, cosThetaMax, xi.x);
+            const float cosTheta = nbl::hlsl::mix<float>(1.0, cosThetaMax, xi.x);
 
             float32_t3 L = Z * cosTheta;
 
diff --git a/31_HLSLPathTracer/app_resources/hlsl/pathtracer.hlsl b/31_HLSLPathTracer/app_resources/hlsl/pathtracer.hlsl
index c47f24753..f1237006c 100644
--- a/31_HLSLPathTracer/app_resources/hlsl/pathtracer.hlsl
+++ b/31_HLSLPathTracer/app_resources/hlsl/pathtracer.hlsl
@@ -4,6 +4,7 @@
 #include <nbl/builtin/hlsl/colorspace/EOTF.hlsl>
 #include <nbl/builtin/hlsl/colorspace/encodeCIEXYZ.hlsl>
 #include <nbl/builtin/hlsl/math/functions.hlsl>
+#include <nbl/builtin/hlsl/bxdf/bxdf_traits.hlsl>
 
 #include "rand_gen.hlsl"
 #include "ray_gen.hlsl"
@@ -77,13 +78,13 @@ struct Unidirectional
     //                     NextEventEstimator nee)
     // {}
 
-    static this_t create(NBL_CONST_REF_ARG(PathTracerCreationParams<create_params_type, scalar_type>) params, Buffer samplerSequence)
+    static this_t create(NBL_CONST_REF_ARG(PathTracerCreationParams<create_params_type, scalar_type>) params, Buffer sampleSequence)
     {
         this_t retval;
         retval.randGen = randgen_type::create(params.rngState);
         retval.rayGen = raygen_type::create(params.pixOffsetParam, params.camPos, params.NDC, params.invMVP);
         retval.materialSystem = material_system_type::create(params.diffuseParams, params.conductorParams, params.dielectricParams);
-        retval.samplerSequence = samplerSequence;
+        retval.sampleSequence = sampleSequence;
         return retval;
     }
 
@@ -103,13 +104,14 @@ struct Unidirectional
     // TODO: probably will only work with procedural shapes, do the other ones
     bool closestHitProgram(uint32_t depth, uint32_t _sample, NBL_REF_ARG(ray_type) ray, NBL_CONST_REF_ARG(scene_type) scene)
     {
-        const uint32_t objectID = ray.objectID;
+        const ObjectID objectID = ray.objectID;
         const vector3_type intersection = ray.origin + ray.direction * ray.intersectionT;
 
         uint32_t bsdfLightIDs;
         anisotropic_type interaction;
         isotropic_type iso_interaction;
-        switch (objectID.mode)
+        ext::Intersector::IntersectData::Mode mode = (ext::Intersector::IntersectData::Mode)objectID.mode;
+        switch (mode)
         {
             // TODO
             case ext::Intersector::IntersectData::Mode::RAY_QUERY:
@@ -137,14 +139,14 @@ struct Unidirectional
         if (lightID != light_type::INVALID_ID)
         {
             float pdf;
-            ray.payload.accumulation += nee.deferredEvalAndPdf(pdf, lights[lightID], ray, scene.toNextEvent(lightID)) * throughput / (1.0 + pdf * pdf * ray.payload.otherTechniqueHeuristic);
+            ray.payload.accumulation += nee.deferredEvalAndPdf(pdf, scene.lights[lightID], ray, scene.toNextEvent(lightID)) * throughput / (1.0 + pdf * pdf * ray.payload.otherTechniqueHeuristic);
         }
 
         const uint32_t bsdfID = glsl::bitfieldExtract(bsdfLightIDs, 0, 16);
         if (bsdfID == bxdfnode_type::INVALID_ID)
             return false;
 
-        BxDFNode bxdf = scene.bxdfs[bsdfID];
+        bxdfnode_type bxdf = scene.bxdfs[bsdfID];
 
         // TODO: ifdef kill diffuse specular paths
 
@@ -171,7 +173,7 @@ struct Unidirectional
             scalar_type t;
             sample_type nee_sample = nee.generate_and_quotient_and_pdf(
                 neeContrib_pdf, t,
-                lights[lightID], intersection, interaction,
+                scene.lights[lightID], intersection, interaction,
                 isBSDF, eps0, depth, scene.toNextEvent(lightID)
             );
 
@@ -206,7 +208,7 @@ struct Unidirectional
                             params = params_type::template create<sample_type, anisotropic_type, anisocache_type>(nee_sample, interaction, _cache, bxdf::BCM_MAX);
                         else
                         {
-                            isocache = (iso_cache)_cache;
+                            isocache_type isocache = (isocache_type)_cache;
                             params = params_type::template create<sample_type, isotropic_type, isocache_type>(nee_sample, iso_interaction, isocache, bxdf::BCM_MAX);
                         }
                     }
@@ -220,7 +222,7 @@ struct Unidirectional
                             params = params_type::template create<sample_type, anisotropic_type, anisocache_type>(nee_sample, interaction, _cache, bxdf::BCM_ABS);
                         else
                         {
-                            isocache = (iso_cache)_cache;
+                            isocache_type isocache = (isocache_type)_cache;
                             params = params_type::template create<sample_type, isotropic_type, isocache_type>(nee_sample, iso_interaction, isocache, bxdf::BCM_ABS);
                         }
                     }
@@ -237,7 +239,7 @@ struct Unidirectional
                     nee_ray.origin = intersection + nee_sample.L.direction * t * Tolerance<scalar_type>::getStart(depth);
                     nee_ray.direction = nee_sample.L.direction;
                     nee_ray.intersectionT = t;
-                    if (bsdf_quotient_pdf.pdf < numeric_limits<scalar_type>::max && getLuma(neeContrib_pdf.quotient) > lumaContributionThreshold && intersector::traceRay(nee_ray, scene).id == -1)
+                    if (bsdf_quotient_pdf.pdf < numeric_limits<scalar_type>::max && getLuma(neeContrib_pdf.quotient) > lumaContributionThreshold && intersector_type::traceRay(nee_ray, scene).id == -1)
                         ray._payload.accumulation += neeContrib_pdf.quotient;
                 }
             }
@@ -265,7 +267,7 @@ struct Unidirectional
                     params = params_type::template create<sample_type, anisotropic_type, anisocache_type>(bsdf_sample, interaction, _cache, bxdf::BCM_MAX);
                 else
                 {
-                    isocache = (iso_cache)_cache;
+                    isocache_type isocache = (isocache_type)_cache;
                     params = params_type::template create<sample_type, isotropic_type, isocache_type>(bsdf_sample, iso_interaction, isocache, bxdf::BCM_MAX);
                 }
             }
@@ -279,7 +281,7 @@ struct Unidirectional
                     params = params_type::template create<sample_type, anisotropic_type, anisocache_type>(bsdf_sample, interaction, _cache, bxdf::BCM_ABS);
                 else
                 {
-                    isocache = (iso_cache)_cache;
+                    isocache_type isocache = (isocache_type)_cache;
                     params = params_type::template create<sample_type, isotropic_type, isocache_type>(bsdf_sample, iso_interaction, isocache, bxdf::BCM_ABS);
                 }
             }
@@ -298,7 +300,7 @@ struct Unidirectional
             ray.payload.otherTechniqueHeuristic *= ray.payload.otherTechniqueHeuristic;
                     
             // trace new ray
-            ray.origin = intersection + bsdfSampleL * (1.0/*kSceneSize*/) * Tolerance<scalar_type>::getStart(depth);
+            ray.origin = intersection + bxdfSample * (1.0/*kSceneSize*/) * Tolerance<scalar_type>::getStart(depth);
             ray.direction = bxdfSample;
             // #if POLYGON_METHOD==2
             // ray._immutable.normalAtOrigin = interaction.isotropic.N;
@@ -339,7 +341,7 @@ struct Unidirectional
             for (int d = 1; d <= depth && hit && rayAlive; d += 2)
             {
                 ray.intersectionT = numeric_limits<scalar_type>::max;
-                ray.objectID = intersector::traceRay(ray, scene);
+                ray.objectID = intersector_type::traceRay(ray, scene);
 
                 hit = ray.objectID.id != -1;
                 if (hit)
@@ -348,7 +350,7 @@ struct Unidirectional
             if (!hit)
                 missProgram(ray);
 
-            spectral_type accumulation = ray.payload.accumulation;
+            measure_type accumulation = ray.payload.accumulation;
             scalar_type rcpSampleSize = 1.0 / (i + 1);
             Li += (accumulation - Li) * rcpSampleSize;
 
@@ -365,11 +367,10 @@ struct Unidirectional
 
     randgen_type randGen;
     raygen_type rayGen;
-    intersector_type intersector;
     material_system_type materialSystem;
     nee_type nee;
 
-    Buffer samplerSequence;
+    Buffer sampleSequence;
 };
 
 }
diff --git a/31_HLSLPathTracer/app_resources/hlsl/render.comp.hlsl b/31_HLSLPathTracer/app_resources/hlsl/render.comp.hlsl
index 4143b973d..cc64de33c 100644
--- a/31_HLSLPathTracer/app_resources/hlsl/render.comp.hlsl
+++ b/31_HLSLPathTracer/app_resources/hlsl/render.comp.hlsl
@@ -1,6 +1,7 @@
 #include "nbl/builtin/hlsl/cpp_compat.hlsl"
 #include "nbl/builtin/hlsl/glsl_compat/core.hlsl"
 #include "nbl/builtin/hlsl/random/pcg.hlsl"
+#include "nbl/builtin/hlsl/random/xoroshiro.hlsl"
 
 #include "nbl/builtin/hlsl/bxdf/reflection.hlsl"
 #include "nbl/builtin/hlsl/bxdf/transmission.hlsl"
@@ -40,7 +41,7 @@ struct SPushConstants
 [[vk::combinedImageSampler]][[vk::binding(2, 2)]] Texture2D<uint2> scramblebuf; // unused
 [[vk::combinedImageSampler]][[vk::binding(2, 2)]] SamplerState scrambleSampler;
 
-[[vk::binding(0, 0)]] RWTexture2D<float16_t4> outImage;
+[[vk::image_format("rgba16f")]][[vk::binding(0, 0)]] RWTexture2D<float32_t4> outImage;
 
 int32_t2 getCoordinates()
 {
@@ -64,142 +65,142 @@ using aniso_cache = bxdf::SAnisotropicMicrofacetCache<float>;
 using quotient_pdf_t = bxdf::quotient_and_pdf<float32_t3, float>;
 using spectral_t = vector<float, 3>;
 using params_t = bxdf::SBxDFParams<float>;
-using create_params_t = SBxDFCreationParams<scalar_type, measure_type>;
-
-using diffuse_bxdf_type = bxdf::reflection::SOrenNayarBxDF<sample_t, iso_interaction, aniso_interaction, spectral_t>;
-using conductor_bxdf_type = bxdf::reflection::SGGXBxDF<sample_t, iso_cache, aniso_cache, spectral_t>;
-using dielectric_bxdf_type = bxdf::transmission::SGGXDielectricBxDF<sample_t, iso_cache, aniso_cache, spectral_t>;
-
-using ray_type = ext::Ray<float>;
-using light_type = ext::Light<spectral_t>;
-using bxdfnode_type = ext::BxDFNode<spectral_t>;
-using randgen_type = ext::RandGen::Uniform3D<Xoroshiro64Star>;
-using raygen_type = ext::RayGen::Basic<ray_type>;
-using intersector_type = ext::Intersector::Comprehensive<ray_type>;
-using material_system_type = ext::MaterialSystem::System<diffuse_bxdf_type, conductor_bxdf_type, dielectric_bxdf_type>;
-using nee_type = ext::NextEventEstimator::Estimator<light_type, ray_type, sample_t, aniso_interaction>;
-using pathtracer_type = ext::PathTracer::Unidirectional<randgen_type, raygen_type, intersector_type, material_system_type, nee_type>;
-
-static const Shape<PST_SPHERE> spheres[SPHERE_COUNT] = {
-    Shape<PST_SPHERE>::create(float3(0.0, -100.5, -1.0), 100.0, 0u, light_type::INVALID_ID),
-    Shape<PST_SPHERE>::create(float3(2.0, 0.0, -1.0), 0.5, 1u, light_type::INVALID_ID),
-    Shape<PST_SPHERE>::create(float3(0.0, 0.0, -1.0), 0.5, 2u, light_type::INVALID_ID),
-    Shape<PST_SPHERE>::create(float3(-2.0, 0.0, -1.0), 0.5, 3u, light_type::INVALID_ID),
-    Shape<PST_SPHERE>::create(float3(2.0, 0.0, 1.0), 0.5, 4u, light_type::INVALID_ID),
-    Shape<PST_SPHERE>::create(float3(0.0, 0.0, 1.0), 0.5, 4u, light_type::INVALID_ID),
-    Shape<PST_SPHERE>::create(float3(-2.0, 0.0, 1.0), 0.5, 5u, light_type::INVALID_ID),
-    Shape<PST_SPHERE>::create(float3(0.5, 1.0, 0.5), 0.5, 6u, light_type::INVALID_ID)
-#ifdef SPHERE_LIGHT
-    ,Shape<PST_SPHERE>::create(float3(-1.5, 1.5, 0.0), 0.3, bxdfnode_type::INVALID_ID, 0u)
-#endif
-};
-
-#ifdef TRIANGLE_LIGHT
-#define LIGHT_TYPE PST_TRIANGLE
-#define TRIANGLE_COUNT 1
-static const Shape<PST_TRIANGLE> triangles[TRIANGLE_COUNT] = {
-    Shape<PST_TRIANGLE>::create(float3(-1.8,0.35,0.3) * 10.0, float3(-1.2,0.35,0.0) * 10.0, float3(-1.5,0.8,-0.3) * 10.0, bxdfnode_type::INVALID_ID, 0u)
-};
-#endif
-
-#ifdef RECTANGLE_LIGHT
-#define LIGHT_TYPE PST_RECTANGLE
-#define RECTANGLE_COUNT 1
-static const Shape<PST_RECTANGLE> rectangles[RECTANGLE_COUNT] = {
-    Shape<PST_RECTANGLE>::create(float3(-3.8,0.35,1.3), normalize(float3(2,0,-1))*7.0, normalize(float3(2,-5,4))*0.1, bxdfnode_type::INVALID_ID, 0u)
-};
-#endif
-
-#define LIGHT_COUNT 1
-static const light_type lights[LIGHT_COUNT] = {
-    light_type(spectral_t(30.0,25.0,15.0), ext::ObjectID(8u, ext::NextEventEstimator::Event::Mode::PROCEDURAL, LIGHT_TYPE))
-};
-
-#define BXDF_COUNT 7
-static const bxdfnode_type bxdfs[BXDF_COUNT] = {
-    bxdfnode_type(ext::MaterialSystem::Material::Type::DIFFUSE, create_params_t(false, float2(0,0), spectral_t(1,1,1), spectral_t(1.25,1.25,1.25))),
-    bxdfnode_type(ext::MaterialSystem::Material::Type::DIFFUSE, create_params_t(false, float2(0,0), spectral_t(1,1,1), spectral_t(1.25,2.5,2.5))),
-    bxdfnode_type(ext::MaterialSystem::Material::Type::DIFFUSE, create_params_t(false, float2(0,0), spectral_t(1,1,1), spectral_t(2.5,1.25,2.5))),
-    bxdfnode_type(ext::MaterialSystem::Material::Type::CONDUCTOR, create_params_t(false, float2(0,0), spectral_t(1,1,1), spectral_t(0.98,0.98,0.77))),
-    bxdfnode_type(ext::MaterialSystem::Material::Type::CONDUCTOR, create_params_t(false, float2(0,0), spectral_t(1,1,1), spectral_t(0.98,0.77,0.98))),
-    bxdfnode_type(ext::MaterialSystem::Material::Type::CONDUCTOR, create_params_t(false, float2(0.15,0.15), spectral_t(1,1,1), spectral_t(0.98,0.77,0.98))),
-    bxdfnode_type(ext::MaterialSystem::Material::Type::DIELECTRIC, create_params_t(false, float2(0.0625,0.0625), spectral_t(1,1,1), spectral_t(0.71,0.69,0.67)))
-};
-
-[numthreads(WorkgroupGridDim, WorkgroupGridDim, 1)]
+using create_params_t = bxdf::SBxDFCreationParams<float, spectral_t>;
+
+// using diffuse_bxdf_type = bxdf::reflection::SOrenNayarBxDF<sample_t, iso_interaction, aniso_interaction, spectral_t>;
+// using conductor_bxdf_type = bxdf::reflection::SGGXBxDF<sample_t, iso_cache, aniso_cache, spectral_t>;
+// using dielectric_bxdf_type = bxdf::transmission::SGGXDielectricBxDF<sample_t, iso_cache, aniso_cache, spectral_t>;
+
+// using ray_type = ext::Ray<float>;
+// using light_type = ext::Light<spectral_t>;
+// using bxdfnode_type = ext::BxDFNode<spectral_t>;
+// using randgen_type = ext::RandGen::Uniform3D<Xoroshiro64Star>;
+// using raygen_type = ext::RayGen::Basic<ray_type>;
+// using intersector_type = ext::Intersector::Comprehensive<ray_type>;
+// using material_system_type = ext::MaterialSystem::System<diffuse_bxdf_type, conductor_bxdf_type, dielectric_bxdf_type>;
+// using nee_type = ext::NextEventEstimator::Estimator<light_type, ray_type, sample_t, aniso_interaction>;
+// using pathtracer_type = ext::PathTracer::Unidirectional<randgen_type, raygen_type, intersector_type, material_system_type, nee_type>;
+
+// static const Shape<PST_SPHERE> spheres[SPHERE_COUNT] = {
+//     Shape<PST_SPHERE>::create(float3(0.0, -100.5, -1.0), 100.0, 0u, light_type::INVALID_ID),
+//     Shape<PST_SPHERE>::create(float3(2.0, 0.0, -1.0), 0.5, 1u, light_type::INVALID_ID),
+//     Shape<PST_SPHERE>::create(float3(0.0, 0.0, -1.0), 0.5, 2u, light_type::INVALID_ID),
+//     Shape<PST_SPHERE>::create(float3(-2.0, 0.0, -1.0), 0.5, 3u, light_type::INVALID_ID),
+//     Shape<PST_SPHERE>::create(float3(2.0, 0.0, 1.0), 0.5, 4u, light_type::INVALID_ID),
+//     Shape<PST_SPHERE>::create(float3(0.0, 0.0, 1.0), 0.5, 4u, light_type::INVALID_ID),
+//     Shape<PST_SPHERE>::create(float3(-2.0, 0.0, 1.0), 0.5, 5u, light_type::INVALID_ID),
+//     Shape<PST_SPHERE>::create(float3(0.5, 1.0, 0.5), 0.5, 6u, light_type::INVALID_ID)
+// #ifdef SPHERE_LIGHT
+//     ,Shape<PST_SPHERE>::create(float3(-1.5, 1.5, 0.0), 0.3, bxdfnode_type::INVALID_ID, 0u)
+// #endif
+// };
+
+// #ifdef TRIANGLE_LIGHT
+// #define LIGHT_TYPE PST_TRIANGLE
+// #define TRIANGLE_COUNT 1
+// static const Shape<PST_TRIANGLE> triangles[TRIANGLE_COUNT] = {
+//     Shape<PST_TRIANGLE>::create(float3(-1.8,0.35,0.3) * 10.0, float3(-1.2,0.35,0.0) * 10.0, float3(-1.5,0.8,-0.3) * 10.0, bxdfnode_type::INVALID_ID, 0u)
+// };
+// #endif
+
+// #ifdef RECTANGLE_LIGHT
+// #define LIGHT_TYPE PST_RECTANGLE
+// #define RECTANGLE_COUNT 1
+// static const Shape<PST_RECTANGLE> rectangles[RECTANGLE_COUNT] = {
+//     Shape<PST_RECTANGLE>::create(float3(-3.8,0.35,1.3), normalize(float3(2,0,-1))*7.0, normalize(float3(2,-5,4))*0.1, bxdfnode_type::INVALID_ID, 0u)
+// };
+// #endif
+
+// #define LIGHT_COUNT 1
+// static const light_type lights[LIGHT_COUNT] = {
+//     light_type(spectral_t(30.0,25.0,15.0), ext::ObjectID(8u, ext::NextEventEstimator::Event::Mode::PROCEDURAL, LIGHT_TYPE))
+// };
+
+// #define BXDF_COUNT 7
+// static const bxdfnode_type bxdfs[BXDF_COUNT] = {
+//     bxdfnode_type(ext::MaterialSystem::Material::Type::DIFFUSE, create_params_t(false, float2(0,0), spectral_t(1,1,1), spectral_t(1.25,1.25,1.25))),
+//     bxdfnode_type(ext::MaterialSystem::Material::Type::DIFFUSE, create_params_t(false, float2(0,0), spectral_t(1,1,1), spectral_t(1.25,2.5,2.5))),
+//     bxdfnode_type(ext::MaterialSystem::Material::Type::DIFFUSE, create_params_t(false, float2(0,0), spectral_t(1,1,1), spectral_t(2.5,1.25,2.5))),
+//     bxdfnode_type(ext::MaterialSystem::Material::Type::CONDUCTOR, create_params_t(false, float2(0,0), spectral_t(1,1,1), spectral_t(0.98,0.98,0.77))),
+//     bxdfnode_type(ext::MaterialSystem::Material::Type::CONDUCTOR, create_params_t(false, float2(0,0), spectral_t(1,1,1), spectral_t(0.98,0.77,0.98))),
+//     bxdfnode_type(ext::MaterialSystem::Material::Type::CONDUCTOR, create_params_t(false, float2(0.15,0.15), spectral_t(1,1,1), spectral_t(0.98,0.77,0.98))),
+//     bxdfnode_type(ext::MaterialSystem::Material::Type::DIELECTRIC, create_params_t(false, float2(0.0625,0.0625), spectral_t(1,1,1), spectral_t(0.71,0.69,0.67)))
+// };
+
+[numthreads(WorkgroupSize, WorkgroupSize, 1)]
 void main(uint32_t3 threadID : SV_DispatchThreadID)
 {
-    uint32_t width, height;
-    outImage.GetDimensions(width, height);
-    const int32_t2 coords = getCoordinates();
-    float32_t2 texCoord = float32_t2(coords) / float32_t2(width, height);
-    texCoord.y = 1.0 - texCoord.y;
-
-    if (false == (all((int32_t2)0 < coords)) && all(int32_t2(width, height) < coords)) {
-        return;
-    }
-
-    if (((pc.depth - 1) >> MAX_DEPTH_LOG2) > 0 || ((pc.sampleCount - 1) >> MAX_SAMPLES_LOG2) > 0)
-    {
-        float32_t4 pixelCol = float32_t4(1.0,0.0,0.0,1.0);
-        outImage[coords] = pixelCol;
-        return;
-    }
-
-    int flatIdx = glsl::gl_GlobalInvocationID().y * glsl::gl_NumWorkGroups().x * WorkgroupSize + glsl::gl_GlobalInvocationID().x;
-    PCG32x2 pcg = PCG32x2::construct(flatIdx);  // replaces scramblebuf?
-
-    // set up path tracer
-    const PathTracerCreationParams<create_params_t, float> ptCreateParams;
-    ptCreateParams.rngState = pcg();
-
-    uint2 scrambleDim;
-    scramblebuf.GetDimensions(scrambleDim.x, scrambleDim.y);
-    ptCreateParams.pixOffsetParam = (float2)1.0 / float2(scrambleDim);
-
-    float4 NDC = float4(texCoord * float2(2.0, -2.0) + float2(-1.0, 1.0), 0.0, 1.0);
-    {
-        vec4 tmp = mul(pc.invMVP, NDC);
-        ptCreateParams.camPos = tmp.xyz / tmp.w;
-        NDC.z = 1.0;
-    }
+//     uint32_t width, height;
+//     outImage.GetDimensions(width, height);
+//     const int32_t2 coords = getCoordinates();
+//     float32_t2 texCoord = float32_t2(coords) / float32_t2(width, height);
+//     texCoord.y = 1.0 - texCoord.y;
+
+//     if (false == (all((int32_t2)0 < coords)) && all(int32_t2(width, height) < coords)) {
+//         return;
+//     }
+
+//     if (((pc.depth - 1) >> MAX_DEPTH_LOG2) > 0 || ((pc.sampleCount - 1) >> MAX_SAMPLES_LOG2) > 0)
+//     {
+//         float32_t4 pixelCol = float32_t4(1.0,0.0,0.0,1.0);
+//         outImage[coords] = pixelCol;
+//         return;
+//     }
+
+//     int flatIdx = glsl::gl_GlobalInvocationID().y * glsl::gl_NumWorkGroups().x * WorkgroupSize + glsl::gl_GlobalInvocationID().x;
+//     PCG32x2 pcg = PCG32x2::construct(flatIdx);  // replaces scramblebuf?
+
+//     // set up path tracer
+//     const PathTracerCreationParams<create_params_t, float> ptCreateParams;
+//     ptCreateParams.rngState = pcg();
+
+//     uint2 scrambleDim;
+//     scramblebuf.GetDimensions(scrambleDim.x, scrambleDim.y);
+//     ptCreateParams.pixOffsetParam = (float2)1.0 / float2(scrambleDim);
+
+//     float4 NDC = float4(texCoord * float2(2.0, -2.0) + float2(-1.0, 1.0), 0.0, 1.0);
+//     {
+//         vec4 tmp = mul(pc.invMVP, NDC);
+//         ptCreateParams.camPos = tmp.xyz / tmp.w;
+//         NDC.z = 1.0;
+//     }
  
-    ptCreateParams.NDC = NDC;
-    ptCreateParams.invMVP = pc.invMVP;
-
-    ptCreateParams.diffuseParams = bxdfs[0].params;
-    ptCreateParams.conductorParams = bxdfs[3].params;
-    ptCreateParams.dielectricParams = bxdfs[6].params;
-
-    pathtracer_type pathtracer = pathtracer_type::create(ptCreateParams, samplerSequence);
-
-    // set up scene (can do as global var?)
-    Scene<light_type, bxdfnode_type> scene;
-    scene.sphereCount = SPHERE_COUNT;
-    for (uint32_t i = 0; i < SPHERE_COUNT; i++)
-        scene.spheres[i] = spheres[i];
-#ifdef TRIANGLE_LIGHT
-    scene.triangleCount = TRIANGLE_COUNT;
-    for (uint32_t i = 0; i < TRIANGLE_COUNT; i++)
-        scene.triangles[i] = triangles[i];
-#else
-    scene.triangleCount = 0;
-#endif
-#ifdef RECTANGLE_LIGHT
-    scene.rectangleCount = RECTANGLE_COUNT;
-    for (uint32_t i = 0; i < RECTANGLE_COUNT; i++)
-        scene.rectangles[i] = rectangles[i];
-#else
-    scene.rectangleCount = 0;
-#endif
-    scene.lightCount = LIGHT_COUNT;
-    for (uint32_t i = 0; i < LIGHT_COUNT; i++)
-        scene.lights[i] = lights[i];
-    scene.bxdfCount = BXDF_COUNT;
-    for (uint32_t i = 0; i < BXDF_COUNT; i++)
-        scene.bxdfs[i] = bxdfs[i];
-
-    float32_t3 color = pathtracer.getMeasure(pc.sampleCount, pc.depth, scene);
-    float32_t4 pixCol = float32_t4(color, 1.0);
-    outImage[coords] = pixCol;
+//     ptCreateParams.NDC = NDC;
+//     ptCreateParams.invMVP = pc.invMVP;
+
+//     ptCreateParams.diffuseParams = bxdfs[0].params;
+//     ptCreateParams.conductorParams = bxdfs[3].params;
+//     ptCreateParams.dielectricParams = bxdfs[6].params;
+
+//     pathtracer_type pathtracer = pathtracer_type::create(ptCreateParams, sampleSequence);
+
+//     // set up scene (can do as global var?)
+//     Scene<light_type, bxdfnode_type> scene;
+//     scene.sphereCount = SPHERE_COUNT;
+//     for (uint32_t i = 0; i < SPHERE_COUNT; i++)
+//         scene.spheres[i] = spheres[i];
+// #ifdef TRIANGLE_LIGHT
+//     scene.triangleCount = TRIANGLE_COUNT;
+//     for (uint32_t i = 0; i < TRIANGLE_COUNT; i++)
+//         scene.triangles[i] = triangles[i];
+// #else
+//     scene.triangleCount = 0;
+// #endif
+// #ifdef RECTANGLE_LIGHT
+//     scene.rectangleCount = RECTANGLE_COUNT;
+//     for (uint32_t i = 0; i < RECTANGLE_COUNT; i++)
+//         scene.rectangles[i] = rectangles[i];
+// #else
+//     scene.rectangleCount = 0;
+// #endif
+//     scene.lightCount = LIGHT_COUNT;
+//     for (uint32_t i = 0; i < LIGHT_COUNT; i++)
+//         scene.lights[i] = lights[i];
+//     scene.bxdfCount = BXDF_COUNT;
+//     for (uint32_t i = 0; i < BXDF_COUNT; i++)
+//         scene.bxdfs[i] = bxdfs[i];
+
+//     float32_t3 color = pathtracer.getMeasure(pc.sampleCount, pc.depth, scene);
+//     float32_t4 pixCol = float32_t4(color, 1.0);
+//     outImage[coords] = pixCol;
 }
diff --git a/31_HLSLPathTracer/app_resources/hlsl/scene.hlsl b/31_HLSLPathTracer/app_resources/hlsl/scene.hlsl
index 88940c54d..ed0c612f1 100644
--- a/31_HLSLPathTracer/app_resources/hlsl/scene.hlsl
+++ b/31_HLSLPathTracer/app_resources/hlsl/scene.hlsl
@@ -121,10 +121,10 @@ struct Scene
         retval.mode = objectID.mode;
 
         retval.data[0] = lightCount;
-        retval.data[1] = objectID.type;
+        retval.data[1] = objectID.shapeType;
 
         uint32_t id = objectID.id;
-        switch (type)
+        switch (objectID.shapeType)
         {
             case PST_SPHERE:
             {
@@ -132,7 +132,7 @@ struct Scene
                 retval.data[2] = asuint(sphere.position.x);
                 retval.data[3] = asuint(sphere.position.y);
                 retval.data[4] = asuint(sphere.position.z);
-                retval.data[5] = asuint(sphere.radius);
+                retval.data[5] = asuint(sphere.radius2);
                 retval.data[6] = sphere.bsdfLightIDs;
             }
             break;
@@ -176,16 +176,16 @@ struct Scene
     // TODO: get these to work with AS types as well
     uint32_t getBsdfLightIDs(NBL_CONST_REF_ARG(ObjectID) objectID)
     {
-        return (objectID.type == PST_SPHERE) ? spheres[objectID.id].bsdfLightIDs :
-                (objectID.type == PST_TRIANGLE) ? triangles[objectID.id].bsdfLightIDs :
-                (objectID.type == PST_RECTANGLE) ? rectangles[objectID.id].bsdfLightIDs : -1;
+        return (objectID.shapeType == PST_SPHERE) ? spheres[objectID.id].bsdfLightIDs :
+                (objectID.shapeType == PST_TRIANGLE) ? triangles[objectID.id].bsdfLightIDs :
+                (objectID.shapeType == PST_RECTANGLE) ? rectangles[objectID.id].bsdfLightIDs : -1;
     }
 
     float32_t3 getNormal(NBL_CONST_REF_ARG(ObjectID) objectID, NBL_CONST_REF_ARG(float32_t3) intersection)
     {
-        return (objectID.type == PST_SPHERE) ? scene.spheres[objectID.id].getNormal(intersection) :
-                (objectID.type == PST_TRIANGLE) ? scene.triangles[objectID.id].getNormalTimesArea() :
-                (objectID.type == PST_RECTANGLE) ? scene.rectangles[objectID.id].getNormalTimesArea() :
+        return (objectID.shapeType == PST_SPHERE) ? spheres[objectID.id].getNormal(intersection) :
+                (objectID.shapeType == PST_TRIANGLE) ? triangles[objectID.id].getNormalTimesArea() :
+                (objectID.shapeType == PST_RECTANGLE) ? rectangles[objectID.id].getNormalTimesArea() :
                 (float32_t3)0.0;
     }
 };

From da661c08d50eb60b8e95fe4a0028aac653a10c4b Mon Sep 17 00:00:00 2001
From: keptsecret <sorchon@gmail.com>
Date: Tue, 25 Feb 2025 12:12:38 +0700
Subject: [PATCH 058/529] fix compile hlsl shader bug

---
 31_HLSLPathTracer/main.cpp | 11 ++++++++++-
 1 file changed, 10 insertions(+), 1 deletion(-)

diff --git a/31_HLSLPathTracer/main.cpp b/31_HLSLPathTracer/main.cpp
index 5aff6bde7..4a2c1110b 100644
--- a/31_HLSLPathTracer/main.cpp
+++ b/31_HLSLPathTracer/main.cpp
@@ -331,7 +331,7 @@ class ComputeShaderPathtracer final : public examples::SimpleWindowedApplication
 					auto shader = m_device->createShader(source.get());
 					if (!shader)
 					{
-						m_logger->log("Shader creationed failed: %s!", ILogger::ELL_ERROR, pathToShader);
+						m_logger->log("GLSL shader creationed failed: %s!", ILogger::ELL_ERROR, pathToShader);
 						std::exit(-1);
 					}
 
@@ -373,6 +373,15 @@ class ComputeShaderPathtracer final : public examples::SimpleWindowedApplication
 					options.dxcOptions = std::span(dxcOptionStr);
 
 					source = compiler->compileToSPIRV((const char*)source->getContent()->getPointer(), options);
+					
+					auto shader = m_device->createShader(source.get());
+					if (!shader)
+					{
+						m_logger->log("HLSL shader creationed failed: %s!", ILogger::ELL_ERROR, pathToShader);
+						std::exit(-1);
+					}
+
+					return shader;
 				};
 
 				// Create compute pipelines

From f97757bffcc28ad208a10dfb485214b8d9e1fdd1 Mon Sep 17 00:00:00 2001
From: keptsecret <sorchon@gmail.com>
Date: Tue, 25 Feb 2025 16:55:48 +0700
Subject: [PATCH 059/529] more bug fixes #5

---
 .../app_resources/hlsl/common.hlsl            |  68 +++++
 .../app_resources/hlsl/intersector.hlsl       |  42 +--
 .../app_resources/hlsl/material_system.hlsl   |   6 +-
 .../hlsl/next_event_estimator.hlsl            |  18 --
 .../app_resources/hlsl/pathtracer.hlsl        |   6 +-
 .../app_resources/hlsl/ray_gen.hlsl           |   2 +-
 .../app_resources/hlsl/render.comp.hlsl       | 266 +++++++++---------
 .../app_resources/hlsl/scene.hlsl             |   3 -
 31_HLSLPathTracer/main.cpp                    |   6 +-
 9 files changed, 222 insertions(+), 195 deletions(-)

diff --git a/31_HLSLPathTracer/app_resources/hlsl/common.hlsl b/31_HLSLPathTracer/app_resources/hlsl/common.hlsl
index cd2310fbf..a264fabd5 100644
--- a/31_HLSLPathTracer/app_resources/hlsl/common.hlsl
+++ b/31_HLSLPathTracer/app_resources/hlsl/common.hlsl
@@ -42,6 +42,15 @@ enum ProceduralShapeType : uint16_t
 
 struct ObjectID
 {
+    static ObjectID create(uint32_t id, uint32_t mode, ProceduralShapeType shapeType)
+    {
+        ObjectID retval;
+        retval.id = id;
+        retval.mode = mode;
+        retval.shapeType = shapeType;
+        return retval;
+    }
+
     uint32_t id;
     uint32_t mode;
     ProceduralShapeType shapeType;
@@ -85,6 +94,17 @@ struct BxDFNode
 
     NBL_CONSTEXPR_STATIC_INLINE uint32_t INVALID_ID = 0xffffu;
 
+    static BxDFNode<Spectrum> create(uint32_t materialType, bool isAniso, NBL_CONST_REF_ARG(float32_t2) A, NBL_CONST_REF_ARG(spectral_type) ior0, NBL_CONST_REF_ARG(spectral_type) ior1)
+    {
+        BxDFNode<Spectrum> retval;
+        retval.materialType = materialType;
+        retval.params.is_aniso = isAniso;
+        retval.params.A = A;
+        retval.params.ior0 = ior0;
+        retval.params.ior1 = ior1;
+        return retval;
+    }
+
     uint32_t materialType;
     params_type params;
 };
@@ -118,6 +138,54 @@ enum PTPolygonMethod : uint16_t
     PPM_APPROX_PROJECTED_SOLID_ANGLE
 };
 
+namespace Intersector
+{
+// ray query method
+// ray query struct holds AS info
+// pass in address to vertex/index buffers?
+
+// ray tracing pipeline method
+
+// procedural data store: [obj count] [intersect type] [obj1] [obj2] [...]
+
+struct IntersectData
+{
+    enum Mode : uint32_t    // enum class?
+    {
+        RAY_QUERY,
+        RAY_TRACING,
+        PROCEDURAL
+    };
+
+    NBL_CONSTEXPR_STATIC_INLINE uint32_t DataSize = 128;
+
+    uint32_t mode : 1;
+    uint32_t unused : 31;   // possible space for flags
+    uint32_t data[DataSize];
+};
+}
+
+namespace NextEventEstimator
+{
+// procedural data store: [light count] [event type] [obj]
+
+struct Event
+{
+    enum Mode : uint32_t    // enum class?
+    {
+        RAY_QUERY,
+        RAY_TRACING,
+        PROCEDURAL
+    };
+
+    NBL_CONSTEXPR_STATIC_INLINE uint32_t DataSize = 16;
+
+    uint32_t mode : 1;
+    uint32_t unused : 31;   // possible space for flags
+    uint32_t data[DataSize];
+};
+}
+
 template<ProceduralShapeType type>
 struct Shape;
 
diff --git a/31_HLSLPathTracer/app_resources/hlsl/intersector.hlsl b/31_HLSLPathTracer/app_resources/hlsl/intersector.hlsl
index 0bb6cb31c..880ae1169 100644
--- a/31_HLSLPathTracer/app_resources/hlsl/intersector.hlsl
+++ b/31_HLSLPathTracer/app_resources/hlsl/intersector.hlsl
@@ -2,6 +2,7 @@
 #define _NBL_HLSL_EXT_INTERSECTOR_INCLUDED_
 
 #include "common.hlsl"
+#include "scene.hlsl"
 #include <nbl/builtin/hlsl/limits.hlsl>
 
 namespace nbl
@@ -13,38 +14,18 @@ namespace ext
 namespace Intersector
 {
 
-// ray query method
-// ray query struct holds AS info
-// pass in address to vertex/index buffers?
-
-// ray tracing pipeline method
-
-// procedural data store: [obj count] [intersect type] [obj1] [obj2] [...]
-
-struct IntersectData
-{
-    enum Mode : uint32_t    // enum class?
-    {
-        RAY_QUERY,
-        RAY_TRACING,
-        PROCEDURAL
-    };
-
-    NBL_CONSTEXPR_STATIC_INLINE uint32_t DataSize = 128;
-
-    uint32_t mode : 1;
-    uint32_t unused : 31;   // possible space for flags
-    uint32_t data[DataSize];
-};
-
-template<class Ray>
+template<class Ray, typename Light, typename BxdfNode>
 struct Comprehensive
 {
     using scalar_type = typename Ray::scalar_type;
     using vector3_type = vector<scalar_type, 3>;
     using ray_type = Ray;
 
-    static ObjectID traceProcedural(NBL_REF_ARG(ray_type) ray, NBL_REF_ARG(IntersectData) intersect)
+    using light_type = Light;
+    using bxdfnode_type = BxdfNode;
+    using scene_type = Scene<light_type, bxdfnode_type>;
+
+    static ObjectID traceProcedural(NBL_REF_ARG(ray_type) ray, NBL_CONST_REF_ARG(IntersectData) intersect)
     {
         const bool anyHit = ray.intersectionT != numeric_limits<scalar_type>::max;
         const uint32_t objCount = intersect.data[0];
@@ -100,7 +81,7 @@ struct Comprehensive
         return objectID;
     }
 
-    static ObjectID traceRay(NBL_REF_ARG(ray_type) ray, NBL_REF_ARG(IntersectData) intersect)
+    static ObjectID traceRay(NBL_REF_ARG(ray_type) ray, NBL_CONST_REF_ARG(IntersectData) intersect)
     {
         const IntersectData::Mode mode = (IntersectData::Mode)intersect.mode;
         switch (mode)
@@ -122,15 +103,12 @@ struct Comprehensive
             break;
             default:
             {
-                ObjectID objID;
-                objID.id = -1;
-                return objID;
+                return ObjectID::create(-1, 0, PST_SPHERE);
             }
         }
     }
 
-    template<typename Scene>
-    static ObjectID traceRay(NBL_REF_ARG(ray_type) ray, NBL_CONST_REF_ARG(Scene) scene)
+    static ObjectID traceRay(NBL_REF_ARG(ray_type) ray, NBL_CONST_REF_ARG(scene_type) scene)
     {
         IntersectData data;
 
diff --git a/31_HLSLPathTracer/app_resources/hlsl/material_system.hlsl b/31_HLSLPathTracer/app_resources/hlsl/material_system.hlsl
index 9d638c232..16f8dcabf 100644
--- a/31_HLSLPathTracer/app_resources/hlsl/material_system.hlsl
+++ b/31_HLSLPathTracer/app_resources/hlsl/material_system.hlsl
@@ -51,9 +51,9 @@ struct System
     static this_t create(NBL_CONST_REF_ARG(create_params_t) diffuseParams, NBL_CONST_REF_ARG(create_params_t) conductorParams, NBL_CONST_REF_ARG(create_params_t) dielectricParams)
     {
         this_t retval;
-        retval.diffuseBxDF = DiffuseBxDF::create(diffuseParams);
-        retval.conductorBxDF = DiffuseBxDF::create(conductorParams);
-        retval.dielectricBxDF = DiffuseBxDF::create(dielectricParams);
+        retval.diffuseBxDF = diffuse_op_type::create(diffuseParams);
+        retval.conductorBxDF = conductor_op_type::create(conductorParams);
+        retval.dielectricBxDF = dielectric_op_type::create(dielectricParams);
         return retval;
     }
 
diff --git a/31_HLSLPathTracer/app_resources/hlsl/next_event_estimator.hlsl b/31_HLSLPathTracer/app_resources/hlsl/next_event_estimator.hlsl
index 32a7b7476..f0eeb0885 100644
--- a/31_HLSLPathTracer/app_resources/hlsl/next_event_estimator.hlsl
+++ b/31_HLSLPathTracer/app_resources/hlsl/next_event_estimator.hlsl
@@ -12,24 +12,6 @@ namespace ext
 namespace NextEventEstimator
 {
 
-// procedural data store: [light count] [event type] [obj]
-
-struct Event
-{
-    enum Mode : uint32_t    // enum class?
-    {
-        RAY_QUERY,
-        RAY_TRACING,
-        PROCEDURAL
-    };
-
-    NBL_CONSTEXPR_STATIC_INLINE uint32_t DataSize = 16;
-
-    uint32_t mode : 1;
-    uint32_t unused : 31;   // possible space for flags
-    uint32_t data[DataSize];
-};
-
 template<typename Light, typename Ray, class LightSample, class Aniso>
 struct Estimator
 {
diff --git a/31_HLSLPathTracer/app_resources/hlsl/pathtracer.hlsl b/31_HLSLPathTracer/app_resources/hlsl/pathtracer.hlsl
index f1237006c..460744940 100644
--- a/31_HLSLPathTracer/app_resources/hlsl/pathtracer.hlsl
+++ b/31_HLSLPathTracer/app_resources/hlsl/pathtracer.hlsl
@@ -91,7 +91,7 @@ struct Unidirectional
     vector3_type rand3d(uint32_t protoDimension, uint32_t _sample, uint32_t i)
     {
         uint32_t address = glsl::bitfieldInsert<uint32_t>(protoDimension, _sample, MAX_DEPTH_LOG2, MAX_SAMPLES_LOG2);
-	    uint32_t3 seqVal = texelFetch(sampleSequence, int(address) + i).xyz;
+	    uint32_t3 seqVal = sampleSequence[address + i].xyz;
 	    seqVal ^= randGen();
         return vector3_type(seqVal) * asfloat(0x2f800004u);
     }
@@ -120,7 +120,7 @@ struct Unidirectional
             case ext::Intersector::IntersectData::Mode::PROCEDURAL:
             {
                 bsdfLightIDs = scene.getBsdfLightIDs(objectID);
-                vector3_type N = scene.getNormal(objectID);
+                vector3_type N = scene.getNormal(objectID, intersection);
                 N = nbl::hlsl::normalize(N);
                 typename isotropic_type::ray_dir_info_type V;
                 V.direction = nbl::hlsl::normalize(-ray.direction);
@@ -332,7 +332,7 @@ struct Unidirectional
         scalar_type meanLumaSq = 0.0;
         for (uint32_t i = 0; i < numSamples; i++)
         {
-            vector3_type uvw = rand3d(0u, i);
+            vector3_type uvw = rand3d(0u, i, randGen.rng());    // TODO: take from scramblebuf?
             ray_type ray = rayGen.generate(uvw);
 
             // bounces
diff --git a/31_HLSLPathTracer/app_resources/hlsl/ray_gen.hlsl b/31_HLSLPathTracer/app_resources/hlsl/ray_gen.hlsl
index dcb695fbe..0759b1cd3 100644
--- a/31_HLSLPathTracer/app_resources/hlsl/ray_gen.hlsl
+++ b/31_HLSLPathTracer/app_resources/hlsl/ray_gen.hlsl
@@ -50,7 +50,7 @@ struct Basic
         remappedRand.x += truncation;
         tmp.xy += pixOffsetParam * nbl::hlsl::boxMullerTransform<scalar_type>(remappedRand, 1.5);
         // for depth of field we could do another stochastic point-pick
-        tmp = invMVP * tmp;
+        tmp = nbl::hlsl::mul(invMVP, tmp);
         ray.direction = nbl::hlsl::normalize(tmp.xyz / tmp.w - camPos);
 
         // #if POLYGON_METHOD==2
diff --git a/31_HLSLPathTracer/app_resources/hlsl/render.comp.hlsl b/31_HLSLPathTracer/app_resources/hlsl/render.comp.hlsl
index cc64de33c..5be6adf78 100644
--- a/31_HLSLPathTracer/app_resources/hlsl/render.comp.hlsl
+++ b/31_HLSLPathTracer/app_resources/hlsl/render.comp.hlsl
@@ -15,7 +15,7 @@
 
 #ifdef SPHERE_LIGHT
 #define SPHERE_COUNT 9
-#define LIGHT_TYPE PST_SPHERE
+#define LIGHT_TYPE ext::PST_SPHERE
 #else
 #define SPHERE_COUNT 8
 #endif
@@ -23,6 +23,8 @@
 using namespace nbl::hlsl;
 
 NBL_CONSTEXPR uint32_t WorkgroupSize = 32;
+NBL_CONSTEXPR uint32_t MAX_DEPTH_LOG2 = 4;
+NBL_CONSTEXPR uint32_t MAX_SAMPLES_LOG2 = 10;
 
 struct SPushConstants
 {
@@ -67,140 +69,140 @@ using spectral_t = vector<float, 3>;
 using params_t = bxdf::SBxDFParams<float>;
 using create_params_t = bxdf::SBxDFCreationParams<float, spectral_t>;
 
-// using diffuse_bxdf_type = bxdf::reflection::SOrenNayarBxDF<sample_t, iso_interaction, aniso_interaction, spectral_t>;
-// using conductor_bxdf_type = bxdf::reflection::SGGXBxDF<sample_t, iso_cache, aniso_cache, spectral_t>;
-// using dielectric_bxdf_type = bxdf::transmission::SGGXDielectricBxDF<sample_t, iso_cache, aniso_cache, spectral_t>;
-
-// using ray_type = ext::Ray<float>;
-// using light_type = ext::Light<spectral_t>;
-// using bxdfnode_type = ext::BxDFNode<spectral_t>;
-// using randgen_type = ext::RandGen::Uniform3D<Xoroshiro64Star>;
-// using raygen_type = ext::RayGen::Basic<ray_type>;
-// using intersector_type = ext::Intersector::Comprehensive<ray_type>;
-// using material_system_type = ext::MaterialSystem::System<diffuse_bxdf_type, conductor_bxdf_type, dielectric_bxdf_type>;
-// using nee_type = ext::NextEventEstimator::Estimator<light_type, ray_type, sample_t, aniso_interaction>;
-// using pathtracer_type = ext::PathTracer::Unidirectional<randgen_type, raygen_type, intersector_type, material_system_type, nee_type>;
-
-// static const Shape<PST_SPHERE> spheres[SPHERE_COUNT] = {
-//     Shape<PST_SPHERE>::create(float3(0.0, -100.5, -1.0), 100.0, 0u, light_type::INVALID_ID),
-//     Shape<PST_SPHERE>::create(float3(2.0, 0.0, -1.0), 0.5, 1u, light_type::INVALID_ID),
-//     Shape<PST_SPHERE>::create(float3(0.0, 0.0, -1.0), 0.5, 2u, light_type::INVALID_ID),
-//     Shape<PST_SPHERE>::create(float3(-2.0, 0.0, -1.0), 0.5, 3u, light_type::INVALID_ID),
-//     Shape<PST_SPHERE>::create(float3(2.0, 0.0, 1.0), 0.5, 4u, light_type::INVALID_ID),
-//     Shape<PST_SPHERE>::create(float3(0.0, 0.0, 1.0), 0.5, 4u, light_type::INVALID_ID),
-//     Shape<PST_SPHERE>::create(float3(-2.0, 0.0, 1.0), 0.5, 5u, light_type::INVALID_ID),
-//     Shape<PST_SPHERE>::create(float3(0.5, 1.0, 0.5), 0.5, 6u, light_type::INVALID_ID)
-// #ifdef SPHERE_LIGHT
-//     ,Shape<PST_SPHERE>::create(float3(-1.5, 1.5, 0.0), 0.3, bxdfnode_type::INVALID_ID, 0u)
-// #endif
-// };
-
-// #ifdef TRIANGLE_LIGHT
-// #define LIGHT_TYPE PST_TRIANGLE
-// #define TRIANGLE_COUNT 1
-// static const Shape<PST_TRIANGLE> triangles[TRIANGLE_COUNT] = {
-//     Shape<PST_TRIANGLE>::create(float3(-1.8,0.35,0.3) * 10.0, float3(-1.2,0.35,0.0) * 10.0, float3(-1.5,0.8,-0.3) * 10.0, bxdfnode_type::INVALID_ID, 0u)
-// };
-// #endif
-
-// #ifdef RECTANGLE_LIGHT
-// #define LIGHT_TYPE PST_RECTANGLE
-// #define RECTANGLE_COUNT 1
-// static const Shape<PST_RECTANGLE> rectangles[RECTANGLE_COUNT] = {
-//     Shape<PST_RECTANGLE>::create(float3(-3.8,0.35,1.3), normalize(float3(2,0,-1))*7.0, normalize(float3(2,-5,4))*0.1, bxdfnode_type::INVALID_ID, 0u)
-// };
-// #endif
-
-// #define LIGHT_COUNT 1
-// static const light_type lights[LIGHT_COUNT] = {
-//     light_type(spectral_t(30.0,25.0,15.0), ext::ObjectID(8u, ext::NextEventEstimator::Event::Mode::PROCEDURAL, LIGHT_TYPE))
-// };
-
-// #define BXDF_COUNT 7
-// static const bxdfnode_type bxdfs[BXDF_COUNT] = {
-//     bxdfnode_type(ext::MaterialSystem::Material::Type::DIFFUSE, create_params_t(false, float2(0,0), spectral_t(1,1,1), spectral_t(1.25,1.25,1.25))),
-//     bxdfnode_type(ext::MaterialSystem::Material::Type::DIFFUSE, create_params_t(false, float2(0,0), spectral_t(1,1,1), spectral_t(1.25,2.5,2.5))),
-//     bxdfnode_type(ext::MaterialSystem::Material::Type::DIFFUSE, create_params_t(false, float2(0,0), spectral_t(1,1,1), spectral_t(2.5,1.25,2.5))),
-//     bxdfnode_type(ext::MaterialSystem::Material::Type::CONDUCTOR, create_params_t(false, float2(0,0), spectral_t(1,1,1), spectral_t(0.98,0.98,0.77))),
-//     bxdfnode_type(ext::MaterialSystem::Material::Type::CONDUCTOR, create_params_t(false, float2(0,0), spectral_t(1,1,1), spectral_t(0.98,0.77,0.98))),
-//     bxdfnode_type(ext::MaterialSystem::Material::Type::CONDUCTOR, create_params_t(false, float2(0.15,0.15), spectral_t(1,1,1), spectral_t(0.98,0.77,0.98))),
-//     bxdfnode_type(ext::MaterialSystem::Material::Type::DIELECTRIC, create_params_t(false, float2(0.0625,0.0625), spectral_t(1,1,1), spectral_t(0.71,0.69,0.67)))
-// };
+using diffuse_bxdf_type = bxdf::reflection::SOrenNayarBxDF<sample_t, iso_interaction, aniso_interaction, spectral_t>;
+using conductor_bxdf_type = bxdf::reflection::SGGXBxDF<sample_t, iso_cache, aniso_cache, spectral_t>;
+using dielectric_bxdf_type = bxdf::transmission::SGGXDielectricBxDF<sample_t, iso_cache, aniso_cache, spectral_t>;
+
+using ray_type = ext::Ray<float>;
+using light_type = ext::Light<spectral_t>;
+using bxdfnode_type = ext::BxDFNode<spectral_t>;
+using randgen_type = ext::RandGen::Uniform3D<Xoroshiro64Star>;
+using raygen_type = ext::RayGen::Basic<ray_type>;
+using intersector_type = ext::Intersector::Comprehensive<ray_type, light_type, bxdfnode_type>;
+using material_system_type = ext::MaterialSystem::System<diffuse_bxdf_type, conductor_bxdf_type, dielectric_bxdf_type>;
+using nee_type = ext::NextEventEstimator::Estimator<light_type, ray_type, sample_t, aniso_interaction>;
+using pathtracer_type = ext::PathTracer::Unidirectional<randgen_type, raygen_type, intersector_type, material_system_type, nee_type>;
+
+static const ext::Shape<ext::PST_SPHERE> spheres[SPHERE_COUNT] = {
+    ext::Shape<ext::PST_SPHERE>::create(float3(0.0, -100.5, -1.0), 100.0, 0u, light_type::INVALID_ID),
+    ext::Shape<ext::PST_SPHERE>::create(float3(2.0, 0.0, -1.0), 0.5, 1u, light_type::INVALID_ID),
+    ext::Shape<ext::PST_SPHERE>::create(float3(0.0, 0.0, -1.0), 0.5, 2u, light_type::INVALID_ID),
+    ext::Shape<ext::PST_SPHERE>::create(float3(-2.0, 0.0, -1.0), 0.5, 3u, light_type::INVALID_ID),
+    ext::Shape<ext::PST_SPHERE>::create(float3(2.0, 0.0, 1.0), 0.5, 4u, light_type::INVALID_ID),
+    ext::Shape<ext::PST_SPHERE>::create(float3(0.0, 0.0, 1.0), 0.5, 4u, light_type::INVALID_ID),
+    ext::Shape<ext::PST_SPHERE>::create(float3(-2.0, 0.0, 1.0), 0.5, 5u, light_type::INVALID_ID),
+    ext::Shape<ext::PST_SPHERE>::create(float3(0.5, 1.0, 0.5), 0.5, 6u, light_type::INVALID_ID)
+#ifdef SPHERE_LIGHT
+    ,ext::Shape<ext::PST_SPHERE>::create(float3(-1.5, 1.5, 0.0), 0.3, bxdfnode_type::INVALID_ID, 0u)
+#endif
+};
+
+#ifdef TRIANGLE_LIGHT
+#define LIGHT_TYPE ext::PST_TRIANGLE
+#define TRIANGLE_COUNT 1
+static const ext::Shape<ext::PST_TRIANGLE> triangles[TRIANGLE_COUNT] = {
+    ext::Shape<ext::PST_TRIANGLE>::create(float3(-1.8,0.35,0.3) * 10.0, float3(-1.2,0.35,0.0) * 10.0, float3(-1.5,0.8,-0.3) * 10.0, bxdfnode_type::INVALID_ID, 0u)
+};
+#endif
+
+#ifdef RECTANGLE_LIGHT
+#define LIGHT_TYPE ext::PST_RECTANGLE
+#define RECTANGLE_COUNT 1
+static const ext::Shape<ext::PST_RECTANGLE> rectangles[RECTANGLE_COUNT] = {
+    ext::Shape<ext::PST_RECTANGLE>::create(float3(-3.8,0.35,1.3), normalize(float3(2,0,-1))*7.0, normalize(float3(2,-5,4))*0.1, bxdfnode_type::INVALID_ID, 0u)
+};
+#endif
+
+#define LIGHT_COUNT 1
+static const light_type lights[LIGHT_COUNT] = {
+    light_type(spectral_t(30.0,25.0,15.0), ext::ObjectID(8u, ext::NextEventEstimator::Event::Mode::PROCEDURAL, LIGHT_TYPE))
+};
+
+#define BXDF_COUNT 7
+static const bxdfnode_type bxdfs[BXDF_COUNT] = {
+    bxdfnode_type::create(ext::MaterialSystem::Material::Type::DIFFUSE, false, float2(0,0), spectral_t(1,1,1), spectral_t(1.25,1.25,1.25)),
+    bxdfnode_type::create(ext::MaterialSystem::Material::Type::DIFFUSE, false, float2(0,0), spectral_t(1,1,1), spectral_t(1.25,2.5,2.5)),
+    bxdfnode_type::create(ext::MaterialSystem::Material::Type::DIFFUSE, false, float2(0,0), spectral_t(1,1,1), spectral_t(2.5,1.25,2.5)),
+    bxdfnode_type::create(ext::MaterialSystem::Material::Type::CONDUCTOR, false, float2(0,0), spectral_t(1,1,1), spectral_t(0.98,0.98,0.77)),
+    bxdfnode_type::create(ext::MaterialSystem::Material::Type::CONDUCTOR, false, float2(0,0), spectral_t(1,1,1), spectral_t(0.98,0.77,0.98)),
+    bxdfnode_type::create(ext::MaterialSystem::Material::Type::CONDUCTOR, false, float2(0.15,0.15), spectral_t(1,1,1), spectral_t(0.98,0.77,0.98)),
+    bxdfnode_type::create(ext::MaterialSystem::Material::Type::DIELECTRIC, false, float2(0.0625,0.0625), spectral_t(1,1,1), spectral_t(0.71,0.69,0.67))
+};
 
 [numthreads(WorkgroupSize, WorkgroupSize, 1)]
 void main(uint32_t3 threadID : SV_DispatchThreadID)
 {
-//     uint32_t width, height;
-//     outImage.GetDimensions(width, height);
-//     const int32_t2 coords = getCoordinates();
-//     float32_t2 texCoord = float32_t2(coords) / float32_t2(width, height);
-//     texCoord.y = 1.0 - texCoord.y;
-
-//     if (false == (all((int32_t2)0 < coords)) && all(int32_t2(width, height) < coords)) {
-//         return;
-//     }
-
-//     if (((pc.depth - 1) >> MAX_DEPTH_LOG2) > 0 || ((pc.sampleCount - 1) >> MAX_SAMPLES_LOG2) > 0)
-//     {
-//         float32_t4 pixelCol = float32_t4(1.0,0.0,0.0,1.0);
-//         outImage[coords] = pixelCol;
-//         return;
-//     }
-
-//     int flatIdx = glsl::gl_GlobalInvocationID().y * glsl::gl_NumWorkGroups().x * WorkgroupSize + glsl::gl_GlobalInvocationID().x;
-//     PCG32x2 pcg = PCG32x2::construct(flatIdx);  // replaces scramblebuf?
-
-//     // set up path tracer
-//     const PathTracerCreationParams<create_params_t, float> ptCreateParams;
-//     ptCreateParams.rngState = pcg();
-
-//     uint2 scrambleDim;
-//     scramblebuf.GetDimensions(scrambleDim.x, scrambleDim.y);
-//     ptCreateParams.pixOffsetParam = (float2)1.0 / float2(scrambleDim);
-
-//     float4 NDC = float4(texCoord * float2(2.0, -2.0) + float2(-1.0, 1.0), 0.0, 1.0);
-//     {
-//         vec4 tmp = mul(pc.invMVP, NDC);
-//         ptCreateParams.camPos = tmp.xyz / tmp.w;
-//         NDC.z = 1.0;
-//     }
+    uint32_t width, height;
+    outImage.GetDimensions(width, height);
+    const int32_t2 coords = getCoordinates();
+    float32_t2 texCoord = float32_t2(coords) / float32_t2(width, height);
+    texCoord.y = 1.0 - texCoord.y;
+
+    if (false == (all((int32_t2)0 < coords)) && all(int32_t2(width, height) < coords)) {
+        return;
+    }
+
+    if (((pc.depth - 1) >> MAX_DEPTH_LOG2) > 0 || ((pc.sampleCount - 1) >> MAX_SAMPLES_LOG2) > 0)
+    {
+        float32_t4 pixelCol = float32_t4(1.0,0.0,0.0,1.0);
+        outImage[coords] = pixelCol;
+        return;
+    }
+
+    int flatIdx = glsl::gl_GlobalInvocationID().y * glsl::gl_NumWorkGroups().x * WorkgroupSize + glsl::gl_GlobalInvocationID().x;
+    PCG32x2 pcg = PCG32x2::construct(flatIdx);  // replaces scramblebuf?
+
+    // set up path tracer
+    ext::PathTracer::PathTracerCreationParams<create_params_t, float> ptCreateParams;
+    ptCreateParams.rngState = pcg();
+
+    uint2 scrambleDim;
+    scramblebuf.GetDimensions(scrambleDim.x, scrambleDim.y);
+    ptCreateParams.pixOffsetParam = (float2)1.0 / float2(scrambleDim);
+
+    float4 NDC = float4(texCoord * float2(2.0, -2.0) + float2(-1.0, 1.0), 0.0, 1.0);
+    {
+        float4 tmp = mul(pc.invMVP, NDC);
+        ptCreateParams.camPos = tmp.xyz / tmp.w;
+        NDC.z = 1.0;
+    }
  
-//     ptCreateParams.NDC = NDC;
-//     ptCreateParams.invMVP = pc.invMVP;
-
-//     ptCreateParams.diffuseParams = bxdfs[0].params;
-//     ptCreateParams.conductorParams = bxdfs[3].params;
-//     ptCreateParams.dielectricParams = bxdfs[6].params;
-
-//     pathtracer_type pathtracer = pathtracer_type::create(ptCreateParams, sampleSequence);
-
-//     // set up scene (can do as global var?)
-//     Scene<light_type, bxdfnode_type> scene;
-//     scene.sphereCount = SPHERE_COUNT;
-//     for (uint32_t i = 0; i < SPHERE_COUNT; i++)
-//         scene.spheres[i] = spheres[i];
-// #ifdef TRIANGLE_LIGHT
-//     scene.triangleCount = TRIANGLE_COUNT;
-//     for (uint32_t i = 0; i < TRIANGLE_COUNT; i++)
-//         scene.triangles[i] = triangles[i];
-// #else
-//     scene.triangleCount = 0;
-// #endif
-// #ifdef RECTANGLE_LIGHT
-//     scene.rectangleCount = RECTANGLE_COUNT;
-//     for (uint32_t i = 0; i < RECTANGLE_COUNT; i++)
-//         scene.rectangles[i] = rectangles[i];
-// #else
-//     scene.rectangleCount = 0;
-// #endif
-//     scene.lightCount = LIGHT_COUNT;
-//     for (uint32_t i = 0; i < LIGHT_COUNT; i++)
-//         scene.lights[i] = lights[i];
-//     scene.bxdfCount = BXDF_COUNT;
-//     for (uint32_t i = 0; i < BXDF_COUNT; i++)
-//         scene.bxdfs[i] = bxdfs[i];
-
-//     float32_t3 color = pathtracer.getMeasure(pc.sampleCount, pc.depth, scene);
-//     float32_t4 pixCol = float32_t4(color, 1.0);
-//     outImage[coords] = pixCol;
+    ptCreateParams.NDC = NDC;
+    ptCreateParams.invMVP = pc.invMVP;
+
+    ptCreateParams.diffuseParams = bxdfs[0].params;
+    ptCreateParams.conductorParams = bxdfs[3].params;
+    ptCreateParams.dielectricParams = bxdfs[6].params;
+
+    pathtracer_type pathtracer = pathtracer_type::create(ptCreateParams, sampleSequence);
+
+    // set up scene (can do as global var?)
+    ext::Scene<light_type, bxdfnode_type> scene;
+    scene.sphereCount = SPHERE_COUNT;
+    for (uint32_t i = 0; i < SPHERE_COUNT; i++)
+        scene.spheres[i] = spheres[i];
+#ifdef TRIANGLE_LIGHT
+    scene.triangleCount = TRIANGLE_COUNT;
+    for (uint32_t i = 0; i < TRIANGLE_COUNT; i++)
+        scene.triangles[i] = triangles[i];
+#else
+    scene.triangleCount = 0;
+#endif
+#ifdef RECTANGLE_LIGHT
+    scene.rectangleCount = RECTANGLE_COUNT;
+    for (uint32_t i = 0; i < RECTANGLE_COUNT; i++)
+        scene.rectangles[i] = rectangles[i];
+#else
+    scene.rectangleCount = 0;
+#endif
+    scene.lightCount = LIGHT_COUNT;
+    for (uint32_t i = 0; i < LIGHT_COUNT; i++)
+        scene.lights[i] = lights[i];
+    scene.bxdfCount = BXDF_COUNT;
+    for (uint32_t i = 0; i < BXDF_COUNT; i++)
+        scene.bxdfs[i] = bxdfs[i];
+
+    float32_t3 color = pathtracer.getMeasure(pc.sampleCount, pc.depth, scene);
+    float32_t4 pixCol = float32_t4(color, 1.0);
+    outImage[coords] = pixCol;
 }
diff --git a/31_HLSLPathTracer/app_resources/hlsl/scene.hlsl b/31_HLSLPathTracer/app_resources/hlsl/scene.hlsl
index ed0c612f1..48be039a7 100644
--- a/31_HLSLPathTracer/app_resources/hlsl/scene.hlsl
+++ b/31_HLSLPathTracer/app_resources/hlsl/scene.hlsl
@@ -2,9 +2,6 @@
 #define _NBL_HLSL_EXT_PATHTRACING_SCENE_INCLUDED_
 
 #include "common.hlsl"
-#include "material_system.hlsl"
-#include "next_event_estimator.hlsl"
-#include "intersector.hlsl"
 
 namespace nbl
 {
diff --git a/31_HLSLPathTracer/main.cpp b/31_HLSLPathTracer/main.cpp
index 4a2c1110b..4bb260b09 100644
--- a/31_HLSLPathTracer/main.cpp
+++ b/31_HLSLPathTracer/main.cpp
@@ -368,9 +368,9 @@ class ComputeShaderPathtracer final : public examples::SimpleWindowedApplication
 					options.preprocessorOptions.sourceIdentifier = source->getFilepathHint();
 					options.preprocessorOptions.logger = m_logger.get();
 					options.preprocessorOptions.includeFinder = compiler->getDefaultIncludeFinder();
-
-					std::string dxcOptionStr[] = { "-D" + defineMacro };
-					options.dxcOptions = std::span(dxcOptionStr);
+					
+					const IShaderCompiler::SMacroDefinition variantDefine = { defineMacro, "" };
+					options.preprocessorOptions.extraDefines = { &variantDefine, &variantDefine + 1 };
 
 					source = compiler->compileToSPIRV((const char*)source->getContent()->getPointer(), options);
 					

From 890c99297f59f19bfef5d7a9b64c68de4b1488f6 Mon Sep 17 00:00:00 2001
From: kevyuu <kevin.kayu@gmail.com>
Date: Tue, 25 Feb 2025 18:34:32 +0700
Subject: [PATCH 060/529] Update demo to use SShaderGroupHandle type

---
 71_RayTracingPipeline/main.cpp | 14 +++++++-------
 1 file changed, 7 insertions(+), 7 deletions(-)

diff --git a/71_RayTracingPipeline/main.cpp b/71_RayTracingPipeline/main.cpp
index d457e37dc..0c5473b73 100644
--- a/71_RayTracingPipeline/main.cpp
+++ b/71_RayTracingPipeline/main.cpp
@@ -356,7 +356,7 @@ class RaytracingPipelineApp final : public examples::SimpleWindowedApplication,
 
       shaderGroups.raygenGroup = { .shaderIndex = RTDS_RAYGEN };
 
-      SGeneralShaderGroup missGroups[EMT_COUNT];
+      IRayTracingPipelineBase::SGeneralShaderGroup missGroups[EMT_COUNT];
       missGroups[EMT_PRIMARY] = { .shaderIndex = RTDS_MISS };
       missGroups[EMT_OCCLUSION] = { .shaderIndex = RTDS_SHADOW_MISS };
       shaderGroups.missGroups = missGroups;
@@ -365,7 +365,7 @@ class RaytracingPipelineApp final : public examples::SimpleWindowedApplication,
         {
           return geomType * ERT_COUNT + rayType;
         };
-      SHitShaderGroup hitGroups[E_RAY_TYPE::ERT_COUNT * E_GEOM_TYPE::EGT_COUNT];
+      IRayTracingPipelineBase::SHitShaderGroup hitGroups[E_RAY_TYPE::ERT_COUNT * E_GEOM_TYPE::EGT_COUNT];
       hitGroups[getHitGroupIndex(EGT_TRIANGLES, ERT_PRIMARY)] = {
         .closestHitShaderIndex = RTDS_CLOSEST_HIT,
         .anyHitShaderIndex = RTDS_ANYHIT_PRIMARY,
@@ -384,7 +384,7 @@ class RaytracingPipelineApp final : public examples::SimpleWindowedApplication,
       };
       shaderGroups.hitGroups = hitGroups;
 
-      SGeneralShaderGroup callableGroups[ELT_COUNT];
+      IRayTracingPipelineBase::SGeneralShaderGroup callableGroups[ELT_COUNT];
       callableGroups[ELT_DIRECTIONAL] = { .shaderIndex = RTDS_DIRECTIONAL_CALL };
       callableGroups[ELT_POINT] = { .shaderIndex = RTDS_POINT_CALL };
       callableGroups[ELT_SPOT] = { .shaderIndex = RTDS_SPOT_CALL };
@@ -1354,13 +1354,13 @@ class RaytracingPipelineApp final : public examples::SimpleWindowedApplication,
     uint8_t* pData = reinterpret_cast<uint8_t*>(cpuBuffer->getPointer());
 
     // copy raygen region
-    memcpy(pData, pipeline->getRaygenGroupShaderHandle().data(), handleSize);
+    memcpy(pData, &pipeline->getRaygen(), handleSize);
 
     // copy miss region
     uint8_t* pMissData = pData + missRange.offset;
     for (int32_t missIx = 0; missIx < pipeline->getMissGroupCount(); missIx++)
     {
-      memcpy(pMissData, pipeline->getMissGroupShaderHandle(missIx).data(), handleSize);
+      memcpy(pMissData, &pipeline->getMiss(missIx), handleSize);
       pMissData += m_shaderBindingTable.missGroupsStride;
     }
 
@@ -1368,7 +1368,7 @@ class RaytracingPipelineApp final : public examples::SimpleWindowedApplication,
     uint8_t* pHitData = pData + hitRange.offset;
     for (int32_t hitIx = 0; hitIx < pipeline->getHitGroupCount(); hitIx++)
     {
-      memcpy(pHitData, pipeline->getHitGroupShaderHandle(hitIx).data(), handleSize);
+      memcpy(pHitData, &pipeline->getHit(hitIx), handleSize);
       pHitData += m_shaderBindingTable.hitGroupsStride;
     }
 
@@ -1376,7 +1376,7 @@ class RaytracingPipelineApp final : public examples::SimpleWindowedApplication,
     uint8_t* pCallableData = pData + callableRange.offset;
     for (int32_t callableIx = 0; callableIx < pipeline->getCallableGroupCount(); callableIx++)
     {
-      memcpy(pCallableData, pipeline->getCallableGroupShaderHandle(callableIx).data(), handleSize);
+      memcpy(pCallableData, &pipeline->getCallable(callableIx), handleSize);
       pCallableData += m_shaderBindingTable.callableGroupsStride;
     }
 

From 19ad8b03480ffbac7d4d28c4f9f7f73a06d3a841 Mon Sep 17 00:00:00 2001
From: kevyuu <kevin.kayu@gmail.com>
Date: Tue, 25 Feb 2025 23:23:26 +0700
Subject: [PATCH 061/529] Fix spot light

---
 71_RayTracingPipeline/app_resources/light_spot.rcall.hlsl | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/71_RayTracingPipeline/app_resources/light_spot.rcall.hlsl b/71_RayTracingPipeline/app_resources/light_spot.rcall.hlsl
index f1357d30b..fcb130104 100644
--- a/71_RayTracingPipeline/app_resources/light_spot.rcall.hlsl
+++ b/71_RayTracingPipeline/app_resources/light_spot.rcall.hlsl
@@ -10,7 +10,7 @@ void main(inout RayLight cLight)
     cLight.outIntensity = LightIntensity / (cLight.outLightDistance * cLight.outLightDistance);
     cLight.outLightDir = normalize(lDir);
     float theta = dot(cLight.outLightDir, normalize(-pc.light.direction));
-    float epsilon = - pc.light.outerCutoff;
+    float epsilon = 1 - pc.light.outerCutoff;
     float spotIntensity = clamp((theta - pc.light.outerCutoff) / epsilon, 0.0, 1.0);
     cLight.outIntensity *= spotIntensity;
 }

From 8e759f24d5b386291660f50af1c04efbff3eff08 Mon Sep 17 00:00:00 2001
From: keptsecret <sorchon@gmail.com>
Date: Wed, 26 Feb 2025 16:53:31 +0700
Subject: [PATCH 062/529] more bug fixes #6

---
 .../app_resources/hlsl/common.hlsl            | 28 +++++++++++++------
 .../app_resources/hlsl/intersector.hlsl       | 11 ++++----
 .../app_resources/hlsl/material_system.hlsl   |  5 ++--
 .../hlsl/next_event_estimator.hlsl            | 15 ++++++++--
 .../app_resources/hlsl/pathtracer.hlsl        | 16 ++++++-----
 .../app_resources/hlsl/render.comp.hlsl       |  4 +--
 .../app_resources/hlsl/scene.hlsl             |  4 +--
 7 files changed, 54 insertions(+), 29 deletions(-)

diff --git a/31_HLSLPathTracer/app_resources/hlsl/common.hlsl b/31_HLSLPathTracer/app_resources/hlsl/common.hlsl
index a264fabd5..913225f8b 100644
--- a/31_HLSLPathTracer/app_resources/hlsl/common.hlsl
+++ b/31_HLSLPathTracer/app_resources/hlsl/common.hlsl
@@ -66,7 +66,10 @@ struct Ray
     // immutable
     vector3_type origin;
     vector3_type direction;
+    
     // TODO: polygon method == 2 stuff
+    vector3_type normalAtOrigin;
+    bool wasBSDFAtOrigin;
 
     // mutable
     scalar_type intersectionT;
@@ -82,6 +85,14 @@ struct Light
 
     NBL_CONSTEXPR_STATIC_INLINE uint32_t INVALID_ID = 0xffffu;
 
+    static Light<spectral_type> create(NBL_CONST_REF_ARG(spectral_type) radiance, NBL_CONST_REF_ARG(ObjectID) objectID)
+    {
+        Light<spectral_type> retval;
+        retval.radiance = radiance;
+        retval.objectID = objectID;
+        return retval;
+    }
+
     spectral_type radiance;
     ObjectID objectID;
 };
@@ -250,7 +261,7 @@ struct Shape<PST_SPHERE>
         {
             const float rcpDistance = 1.0 / nbl::hlsl::sqrt(distanceSQ);
             Z *= rcpDistance;
-        
+
             const float cosThetaMax = nbl::hlsl::sqrt(cosThetaMax2);
             const float cosTheta = nbl::hlsl::mix<float>(1.0, cosThetaMax, xi.x);
 
@@ -261,9 +272,9 @@ struct Shape<PST_SPHERE>
             float sinPhi, cosPhi;
             math::sincos(2.0 * numbers::pi<float> * xi.y - numbers::pi<float>, sinPhi, cosPhi);
             float32_t2x3 XY = math::frisvad<float>(Z);
-        
+
             L += (XY[0] * cosPhi + XY[1] * sinPhi) * sinTheta;
-        
+
             newRayMaxT = (cosTheta - nbl::hlsl::sqrt(cosTheta2 - cosThetaMax2)) / rcpDistance;
             pdf = 1.0 / (2.0 * numbers::pi<float> * (1.0 - cosThetaMax));
             return L;
@@ -342,14 +353,15 @@ struct Shape<PST_TRIANGLE>
             {
                 shapes::SphericalTriangle<float> st = shapes::SphericalTriangle<float>::create(vertex0, vertex1, vertex2, ray.origin);
                 const float rcpProb = st.solidAngleOfTriangle();
-                // if `rcpProb` is NAN then the triangle's solid angle was close to 0.0 
+                // if `rcpProb` is NAN then the triangle's solid angle was close to 0.0
                 return rcpProb > numeric_limits<float>::min ? (1.0 / rcpProb) : numeric_limits<float>::max;
             }
             break;
             case PPM_APPROX_PROJECTED_SOLID_ANGLE:
             {
                 shapes::SphericalTriangle<float> st = shapes::SphericalTriangle<float>::create(vertex0, vertex1, vertex2, ray.origin);
-                const float pdf = st.projectedSolidAngleOfTriangle(ray.normalAtOrigin, ray.wasBSDFAtOrigin, L);
+                sampling::ProjectedSphericalTriangle<float> pst = sampling::ProjectedSphericalTriangle<float>::create(st);
+                const float pdf = pst.pdf(ray.normalAtOrigin, ray.wasBSDFAtOrigin, L);
                 // if `pdf` is NAN then the triangle's projected solid angle was close to 0.0, if its close to INF then the triangle was very small
                 return pdf < numeric_limits<float>::max ? pdf : 0.0;
             }
@@ -371,11 +383,11 @@ struct Shape<PST_TRIANGLE>
                 const float sqrtU = nbl::hlsl::sqrt(xi.x);
                 float32_t3 pnt = vertex0 + edge0 * (1.0 - sqrtU) + edge1 * sqrtU * xi.y;
                 float32_t3 L = pnt - origin;
-                
+
                 const float distanceSq = nbl::hlsl::dot(L,L);
                 const float rcpDistance = 1.0 / nbl::hlsl::sqrt(distanceSq);
                 L *= rcpDistance;
-                
+
                 pdf = distanceSq / nbl::hlsl::abs(nbl::hlsl::dot(nbl::hlsl::cross(edge0, edge1) * 0.5f, L));
                 newRayMaxT = 1.0 / rcpDistance;
                 return L;
@@ -403,7 +415,7 @@ struct Shape<PST_TRIANGLE>
 
                 shapes::SphericalTriangle<float> st = shapes::SphericalTriangle<float>::create(vertex0, vertex1, vertex2, origin);
                 sampling::ProjectedSphericalTriangle<float> sst = sampling::ProjectedSphericalTriangle<float>::create(st);
-            
+
                 const float32_t3 L = sst.generate(rcpPdf, interaction.N, isBSDF, xi.xy);
 
                 pdf = rcpPdf > numeric_limits<float>::min ? (1.0 / rcpPdf) : 0.0;
diff --git a/31_HLSLPathTracer/app_resources/hlsl/intersector.hlsl b/31_HLSLPathTracer/app_resources/hlsl/intersector.hlsl
index 880ae1169..525af5525 100644
--- a/31_HLSLPathTracer/app_resources/hlsl/intersector.hlsl
+++ b/31_HLSLPathTracer/app_resources/hlsl/intersector.hlsl
@@ -68,12 +68,12 @@ struct Comprehensive
                     t = numeric_limits<float>::infinity;
                 break;
             }
-            
+
             bool closerIntersection = t > 0.0 && t < ray.intersectionT;
 
             ray.intersectionT = closerIntersection ? t : ray.intersectionT;
             objectID.id = closerIntersection ? i : objectID.id;
-            
+
             // allowing early out results in a performance regression, WTF!?
             //if (anyHit && closerIntersection)
             //break;
@@ -106,6 +106,7 @@ struct Comprehensive
                 return ObjectID::create(-1, 0, PST_SPHERE);
             }
         }
+        return ObjectID::create(-1, 0, PST_SPHERE);
     }
 
     static ObjectID traceRay(NBL_REF_ARG(ray_type) ray, NBL_CONST_REF_ARG(scene_type) scene)
@@ -114,7 +115,7 @@ struct Comprehensive
 
         ObjectID objectID;
         objectID.id = -1;  // start with no intersect
-                
+
         // prodedural shapes
         if (scene.sphereCount > 0)
         {
@@ -161,12 +162,12 @@ struct Comprehensive
 //                 t = sphere.intersect(ray.origin, ray.direction);
 //             }
 //             // TODO: other types
-            
+
 //             bool closerIntersection = t > 0.0 && t < ray.intersectionT;
 
 //             ray.intersectionT = closerIntersection ? t : ray.intersectionT;
 //             objectID = closerIntersection ? i : objectID;
-            
+
 //             // allowing early out results in a performance regression, WTF!?
 //             //if (anyHit && closerIntersection)
 //             //break;
diff --git a/31_HLSLPathTracer/app_resources/hlsl/material_system.hlsl b/31_HLSLPathTracer/app_resources/hlsl/material_system.hlsl
index 16f8dcabf..1a613080f 100644
--- a/31_HLSLPathTracer/app_resources/hlsl/material_system.hlsl
+++ b/31_HLSLPathTracer/app_resources/hlsl/material_system.hlsl
@@ -38,6 +38,7 @@ struct System
     using vector2_type = vector<scalar_type, 2>;
     using vector3_type = vector<scalar_type, 3>;
     using measure_type = typename DiffuseBxDF::spectral_type;
+    using sample_type = typename DiffuseBxDF::sample_type;
     using quotient_pdf_type = typename DiffuseBxDF::quotient_pdf_type;
     using anisotropic_type = typename DiffuseBxDF::anisotropic_type;
     using anisocache_type = typename ConductorBxDF::anisocache_type;
@@ -84,7 +85,7 @@ struct System
         }
     }
 
-    vector3_type generate(NBL_CONST_REF_ARG(Material) material, NBL_CONST_REF_ARG(create_params_t) cparams, anisotropic_type interaction, NBL_CONST_REF_ARG(vector3_type) u, NBL_REF_ARG(anisocache_type) cache)
+    sample_type generate(NBL_CONST_REF_ARG(Material) material, NBL_CONST_REF_ARG(create_params_t) cparams, anisotropic_type interaction, NBL_CONST_REF_ARG(vector3_type) u, NBL_REF_ARG(anisocache_type) cache)
     {
         switch(material.type)
         {
@@ -107,7 +108,7 @@ struct System
             }
             break;
             default:
-                return (vector3_type)numeric_limits<float>::infinity;
+                return (sample_type)numeric_limits<float>::infinity;
         }
     }
 
diff --git a/31_HLSLPathTracer/app_resources/hlsl/next_event_estimator.hlsl b/31_HLSLPathTracer/app_resources/hlsl/next_event_estimator.hlsl
index f0eeb0885..15dbf3a9b 100644
--- a/31_HLSLPathTracer/app_resources/hlsl/next_event_estimator.hlsl
+++ b/31_HLSLPathTracer/app_resources/hlsl/next_event_estimator.hlsl
@@ -103,7 +103,10 @@ struct Estimator
             {
                 vector3_type position = vector3_type(asfloat(event.data[2]), asfloat(event.data[3]), asfloat(event.data[4]));
                 Shape<PST_SPHERE> sphere = Shape<PST_SPHERE>::create(position, asfloat(event.data[5]), event.data[6]);
-                L = sphere.template generate_and_pdf<interaction_type>(pdf, newRayMaxT, origin, interaction, isBSDF, xi);
+                const vector3_type sampleL = sphere.template generate_and_pdf<interaction_type>(pdf, newRayMaxT, origin, interaction, isBSDF, xi);
+                const vector3_type V = interaction.V.getDirection();
+                const scalar_type VdotL = nbl::hlsl::dot<vector3_type>(V, sampleL);
+                L = sample_type::create(sampleL,VdotL,interaction.T,interaction.B,interaction.N);
             }
             break;
             case PST_TRIANGLE:
@@ -112,7 +115,10 @@ struct Estimator
                 vector3_type vertex1 = vector3_type(asfloat(event.data[5]), asfloat(event.data[6]), asfloat(event.data[7]));
                 vector3_type vertex2 = vector3_type(asfloat(event.data[8]), asfloat(event.data[9]), asfloat(event.data[10]));
                 Shape<PST_TRIANGLE> tri = Shape<PST_TRIANGLE>::create(vertex0, vertex1, vertex2, event.data[11]);
-                L = tri.template generate_and_pdf<interaction_type>(pdf, newRayMaxT, origin, interaction, isBSDF, xi);
+                const vector3_type sampleL = tri.template generate_and_pdf<interaction_type>(pdf, newRayMaxT, origin, interaction, isBSDF, xi);
+                const vector3_type V = interaction.V.getDirection();
+                const scalar_type VdotL = nbl::hlsl::dot<vector3_type>(V, sampleL);
+                L = sample_type::create(sampleL,VdotL,interaction.T,interaction.B,interaction.N);
             }
             break;
             case PST_RECTANGLE:
@@ -121,7 +127,10 @@ struct Estimator
                 vector3_type edge0 = vector3_type(asfloat(event.data[5]), asfloat(event.data[6]), asfloat(event.data[7]));
                 vector3_type edge1 = vector3_type(asfloat(event.data[8]), asfloat(event.data[9]), asfloat(event.data[10]));
                 Shape<PST_RECTANGLE> rect = Shape<PST_RECTANGLE>::create(offset, edge0, edge1, event.data[11]);
-                L = rect.template generate_and_pdf<interaction_type>(pdf, newRayMaxT, origin, interaction, isBSDF, xi);
+                const vector3_type sampleL = rect.template generate_and_pdf<interaction_type>(pdf, newRayMaxT, origin, interaction, isBSDF, xi);
+                const vector3_type V = interaction.V.getDirection();
+                const scalar_type VdotL = nbl::hlsl::dot<vector3_type>(V, sampleL);
+                L = sample_type::create(sampleL,VdotL,interaction.T,interaction.B,interaction.N);
             }
             break;
             default:
diff --git a/31_HLSLPathTracer/app_resources/hlsl/pathtracer.hlsl b/31_HLSLPathTracer/app_resources/hlsl/pathtracer.hlsl
index 460744940..62398a58e 100644
--- a/31_HLSLPathTracer/app_resources/hlsl/pathtracer.hlsl
+++ b/31_HLSLPathTracer/app_resources/hlsl/pathtracer.hlsl
@@ -227,11 +227,12 @@ struct Unidirectional
                         }
                     }
 
-                    quotient_pdf_type bsdf_quotient_pdf = materialSystem.quotient_and_pdf(material, bxdf.params, params) * throughput;
+                    quotient_pdf_type bsdf_quotient_pdf = materialSystem.quotient_and_pdf(material, bxdf.params, params);
+                    bsdf_quotient_pdf.quotient *= throughput;
                     neeContrib_pdf.quotient *= bsdf_quotient_pdf.quotient;
                     const scalar_type otherGenOverChoice = bsdf_quotient_pdf.pdf * rcpChoiceProb;
                     const scalar_type otherGenOverLightAndChoice = otherGenOverChoice / bsdf_quotient_pdf.pdf;
-                    neeContrib_pdf.quotient *= otherGenOverChoice/(1.f + otherGenOverLightAndChoice * otherGenOverLightAndChoice);   // balance heuristic
+                    neeContrib_pdf.quotient *= otherGenOverChoice / (1.f + otherGenOverLightAndChoice * otherGenOverLightAndChoice);   // balance heuristic
 
                     // TODO: ifdef NEE only
 
@@ -240,7 +241,7 @@ struct Unidirectional
                     nee_ray.direction = nee_sample.L.direction;
                     nee_ray.intersectionT = t;
                     if (bsdf_quotient_pdf.pdf < numeric_limits<scalar_type>::max && getLuma(neeContrib_pdf.quotient) > lumaContributionThreshold && intersector_type::traceRay(nee_ray, scene).id == -1)
-                        ray._payload.accumulation += neeContrib_pdf.quotient;
+                        ray.payload.accumulation += neeContrib_pdf.quotient;
                 }
             }
         }
@@ -256,7 +257,7 @@ struct Unidirectional
             sample_type bsdf_sample = materialSystem.generate(material, bxdf.params, interaction, eps1, _cache);
 
             // TODO: does not yet account for smooth dielectric
-            params_type params;            
+            params_type params;
             if (!isBSDF && bxdf.materialType == ext::MaterialSystem::Material::DIFFUSE)
             {
                 params = params_type::template create<sample_type, isotropic_type>(bsdf_sample, iso_interaction, bxdf::BCM_MAX);
@@ -287,7 +288,8 @@ struct Unidirectional
             }
 
             // the value of the bsdf divided by the probability of the sample being generated
-            throughput *= materialSystem.quotient_and_pdf(material, bxdf.params, params);
+            quotient_pdf_type bsdf_quotient_pdf = materialSystem.quotient_and_pdf(material, bxdf.params, params);
+            throughput *= bsdf_quotient_pdf.quotient;
             bxdfSample = bsdf_sample.L.direction;
         }
 
@@ -298,7 +300,7 @@ struct Unidirectional
             ray.payload.throughput = throughput;
             ray.payload.otherTechniqueHeuristic = neeProbability / bxdfPdf; // numerically stable, don't touch
             ray.payload.otherTechniqueHeuristic *= ray.payload.otherTechniqueHeuristic;
-                    
+
             // trace new ray
             ray.origin = intersection + bxdfSample * (1.0/*kSceneSize*/) * Tolerance<scalar_type>::getStart(depth);
             ray.direction = bxdfSample;
@@ -314,7 +316,7 @@ struct Unidirectional
 
     void missProgram(NBL_REF_ARG(ray_type) ray)
     {
-        vector3_type finalContribution = ray.payload.throughput; 
+        vector3_type finalContribution = ray.payload.throughput;
         // #ifdef USE_ENVMAP
         //     vec2 uv = SampleSphericalMap(_immutable.direction);
         //     finalContribution *= textureLod(envMap, uv, 0.0).rgb;
diff --git a/31_HLSLPathTracer/app_resources/hlsl/render.comp.hlsl b/31_HLSLPathTracer/app_resources/hlsl/render.comp.hlsl
index 5be6adf78..360d085a6 100644
--- a/31_HLSLPathTracer/app_resources/hlsl/render.comp.hlsl
+++ b/31_HLSLPathTracer/app_resources/hlsl/render.comp.hlsl
@@ -115,7 +115,7 @@ static const ext::Shape<ext::PST_RECTANGLE> rectangles[RECTANGLE_COUNT] = {
 
 #define LIGHT_COUNT 1
 static const light_type lights[LIGHT_COUNT] = {
-    light_type(spectral_t(30.0,25.0,15.0), ext::ObjectID(8u, ext::NextEventEstimator::Event::Mode::PROCEDURAL, LIGHT_TYPE))
+    light_type::create(spectral_t(30.0,25.0,15.0), ext::ObjectID::create(8u, ext::NextEventEstimator::Event::Mode::PROCEDURAL, LIGHT_TYPE))
 };
 
 #define BXDF_COUNT 7
@@ -166,7 +166,7 @@ void main(uint32_t3 threadID : SV_DispatchThreadID)
         ptCreateParams.camPos = tmp.xyz / tmp.w;
         NDC.z = 1.0;
     }
- 
+
     ptCreateParams.NDC = NDC;
     ptCreateParams.invMVP = pc.invMVP;
 
diff --git a/31_HLSLPathTracer/app_resources/hlsl/scene.hlsl b/31_HLSLPathTracer/app_resources/hlsl/scene.hlsl
index 48be039a7..79b66dbfb 100644
--- a/31_HLSLPathTracer/app_resources/hlsl/scene.hlsl
+++ b/31_HLSLPathTracer/app_resources/hlsl/scene.hlsl
@@ -32,7 +32,7 @@ struct Scene
 
     light_type lights[maxLightCount];
     uint32_t lightCount;
-    
+
     NBL_CONSTEXPR_STATIC_INLINE uint32_t maxBxdfCount = 16; // TODO: limit change?
 
     bxdfnode_type bxdfs[maxBxdfCount];
@@ -51,7 +51,7 @@ struct Scene
                             -1;
         retval.data[0] = objCount;
         retval.data[1] = type;
-        
+
         switch (type)
         {
             case PST_SPHERE:

From f098335b37df0262e704b06b22dd9f37ec53f2e9 Mon Sep 17 00:00:00 2001
From: Przemek <minikers21@gmail.com>
Date: Wed, 26 Feb 2025 11:09:16 +0100
Subject: [PATCH 063/529] DTM setup

---
 62_CAD/CTriangleMesh.cpp                      |  1 +
 62_CAD/CTriangleMesh.h                        | 43 ++++++++++++
 62_CAD/DrawResourcesFiller.cpp                | 20 ++++++
 62_CAD/DrawResourcesFiller.h                  |  3 +
 62_CAD/main.cpp                               | 69 ++++++++++++++++---
 62_CAD/shaders/globals.hlsl                   | 12 +++-
 62_CAD/shaders/main_pipeline/common.hlsl      |  5 +-
 .../shaders/main_pipeline/vertex_shader.hlsl  | 25 +++++++
 8 files changed, 165 insertions(+), 13 deletions(-)
 create mode 100644 62_CAD/CTriangleMesh.cpp
 create mode 100644 62_CAD/CTriangleMesh.h

diff --git a/62_CAD/CTriangleMesh.cpp b/62_CAD/CTriangleMesh.cpp
new file mode 100644
index 000000000..5564c0a51
--- /dev/null
+++ b/62_CAD/CTriangleMesh.cpp
@@ -0,0 +1 @@
+#include "CTriangleMesh.h"
\ No newline at end of file
diff --git a/62_CAD/CTriangleMesh.h b/62_CAD/CTriangleMesh.h
new file mode 100644
index 000000000..3f39fb750
--- /dev/null
+++ b/62_CAD/CTriangleMesh.h
@@ -0,0 +1,43 @@
+#pragma once
+
+#include <nabla.h>
+#include <nbl/builtin/hlsl/cpp_compat.hlsl>
+#include "shaders/globals.hlsl"
+
+using namespace nbl;
+
+class CTriangleMesh final
+{
+public:
+	inline void setVertices(core::vector<TriangleMeshVertex>&& vertices)
+	{
+		m_vertices = std::move(vertices);
+	}
+	inline void setIndices(core::vector<uint32_t>&& indices)
+	{
+		m_indices = std::move(indices);
+	}
+
+	inline const core::vector<TriangleMeshVertex>& getVertices() const
+	{
+		return m_vertices;
+	}
+	inline const core::vector<uint32_t>& getIndices() const
+	{
+		return m_indices;
+	}
+
+	inline size_t getVtxBuffByteSize() const
+	{
+		return sizeof(decltype(m_vertices)::value_type);
+	}
+	inline size_t getIdxBuffByteSize() const
+	{
+		return sizeof(decltype(m_indices)::value_type);
+	}
+
+
+private:
+	core::vector<TriangleMeshVertex> m_vertices;
+	core::vector<uint32_t> m_indices;
+};
\ No newline at end of file
diff --git a/62_CAD/DrawResourcesFiller.cpp b/62_CAD/DrawResourcesFiller.cpp
index 7cf96d693..32f72c07b 100644
--- a/62_CAD/DrawResourcesFiller.cpp
+++ b/62_CAD/DrawResourcesFiller.cpp
@@ -218,6 +218,26 @@ void DrawResourcesFiller::drawPolyline(const CPolylineBase& polyline, uint32_t p
 	}
 }
 
+void DrawResourcesFiller::drawTriangleMesh(const CTriangleMesh& mesh, core::unordered_map<float32_t, float32_t3> heightColorMap, SIntendedSubmitInfo& intendedNextSubmit)
+{
+	ICPUBuffer::SCreationParams geometryBuffParams;
+	
+	// concatenate the index and vertex buffer into the geometry buffer
+	const size_t indexBuffSize = mesh.getIdxBuffByteSize();
+	const size_t vtxBuffSize = mesh.getVtxBuffByteSize();
+	const size_t geometryBufferSizeDataSize = indexBuffSize + vtxBuffSize;
+
+	core::vector<uint8_t> geometryBufferData(geometryBufferSizeDataSize);
+	std::memcpy(geometryBufferData.data(), mesh.getIndices().data(), indexBuffSize);
+	std::memcpy(geometryBufferData.data() + indexBuffSize, mesh.getVertices().data(), vtxBuffSize);
+
+	SBufferRange<IGPUBuffer> geometryBuffRange;
+	geometryBuffRange.offset = 0;
+	geometryBuffRange.size = geometryBufferSizeDataSize;
+	geometryBuffRange.buffer = gpuDrawBuffers.drawObjectsBuffer;
+	m_utilities->updateBufferRangeViaStagingBuffer(intendedNextSubmit, geometryBuffRange, geometryBufferData.data());
+}
+
 // TODO[Erfan]: Makes more sense if parameters are: solidColor + fillPattern + patternColor
 void DrawResourcesFiller::drawHatch(
 		const Hatch& hatch,
diff --git a/62_CAD/DrawResourcesFiller.h b/62_CAD/DrawResourcesFiller.h
index e20514651..f9ab033e9 100644
--- a/62_CAD/DrawResourcesFiller.h
+++ b/62_CAD/DrawResourcesFiller.h
@@ -1,5 +1,6 @@
 #pragma once
 #include "Polyline.h"
+#include "CTriangleMesh.h"
 #include "Hatch.h"
 #include "IndexAllocator.h"
 #include <nbl/video/utilities/SIntendedSubmitInfo.h>
@@ -76,6 +77,8 @@ struct DrawResourcesFiller
 
 	void drawPolyline(const CPolylineBase& polyline, uint32_t polylineMainObjIdx, SIntendedSubmitInfo& intendedNextSubmit);
 	
+	void drawTriangleMesh(const CTriangleMesh& mesh, core::unordered_map<float32_t, float32_t3> heightColorMap, SIntendedSubmitInfo& intendedNextSubmit);
+
 	// ! Convinience function for Hatch with MSDF Pattern and a solid background
 	void drawHatch(
 		const Hatch& hatch,
diff --git a/62_CAD/main.cpp b/62_CAD/main.cpp
index 637c88eda..a893d9b40 100644
--- a/62_CAD/main.cpp
+++ b/62_CAD/main.cpp
@@ -57,6 +57,7 @@ enum class ExampleMode
 	CASE_6, // Custom Clip Projections
 	CASE_7, // Images
 	CASE_8, // MSDF and Text
+	CASE_9, // DTM
 	CASE_COUNT
 };
 
@@ -73,7 +74,7 @@ constexpr std::array<float, (uint32_t)ExampleMode::CASE_COUNT> cameraExtents =
 	600.0,	// CASE_8
 };
 
-constexpr ExampleMode mode = ExampleMode::CASE_4;
+constexpr ExampleMode mode = ExampleMode::CASE_9;
 
 class Camera2D
 {
@@ -865,7 +866,13 @@ class ComputerAidedDesign final : public examples::SimpleWindowedApplication, pu
 				m_device->updateDescriptorSets(DescriptorUpdatesCount, descriptorUpdates, 0u, nullptr);
 			}
 
-			pipelineLayout = m_device->createPipelineLayout({}, core::smart_refctd_ptr(descriptorSetLayout0), core::smart_refctd_ptr(descriptorSetLayout1), nullptr, nullptr);
+			const asset::SPushConstantRange range = {
+						.stageFlags = IShader::E_SHADER_STAGE::ESS_VERTEX,
+						.offset = 0,
+						.size = sizeof(PushConstants)
+			};
+
+			pipelineLayout = m_device->createPipelineLayout({ &range,1 }, core::smart_refctd_ptr(descriptorSetLayout0), core::smart_refctd_ptr(descriptorSetLayout1), nullptr, nullptr);
 		}
 
 		smart_refctd_ptr<IGPUShader> mainPipelineFragmentShaders = {};
@@ -1387,18 +1394,30 @@ class ComputerAidedDesign final : public examples::SimpleWindowedApplication, pu
 		const uint32_t currentIndexCount = drawResourcesFiller.getDrawObjectCount() * 6u;
 		IGPUDescriptorSet* descriptorSets[] = { descriptorSet0.get(), descriptorSet1.get() };
 		cb->bindDescriptorSets(asset::EPBP_GRAPHICS, pipelineLayout.get(), 0u, 2u, descriptorSets);
+		if (mode == ExampleMode::CASE_9)
+		{
 
-		// TODO[Przemek]: based on our call bind index buffer you uploaded to part of the `drawResourcesFiller.gpuDrawBuffers.geometryBuffer`
-		// Vertices will be pulled based on baseBDAPointer of where you uploaded the vertex + the VertexID in the vertex shader.
-		cb->bindIndexBuffer({ .offset = 0u, .buffer = drawResourcesFiller.gpuDrawBuffers.indexBuffer.get() }, asset::EIT_32BIT);
+			// TODO[Przemek]: based on our call bind index buffer you uploaded to part of the `drawResourcesFiller.gpuDrawBuffers.geometryBuffer`
+			// Vertices will be pulled based on baseBDAPointer of where you uploaded the vertex + the VertexID in the vertex shader.
+			cb->bindIndexBuffer({ .offset = 0u, .buffer = drawResourcesFiller.gpuDrawBuffers.geometryBuffer.get() }, asset::EIT_32BIT);
 
-		// TODO[Przemek]: binding the same pipelie, no need to change.
-		cb->bindGraphicsPipeline(graphicsPipeline.get());
-		
-		// TODO[Przemek]: contour settings, height shading settings, base bda pointers will need to be pushed via pushConstants before the draw currently as it's the easiest thing to do.
+			// TODO[Przemek]: binding the same pipelie, no need to change.
+			cb->bindGraphicsPipeline(graphicsPipeline.get());
+
+			// TODO[Przemek]: contour settings, height shading settings, base bda pointers will need to be pushed via pushConstants before the draw currently as it's the easiest thing to do.
+			cb->pushConstants(graphicsPipeline->getLayout(), IGPUShader::E_SHADER_STAGE::ESS_FRAGMENT, 0, sizeof(PushConstants), &m_pushConstants);
 
-		// TODO[Przemek]: draw parameters needs to reflect the mesh involved
-		cb->drawIndexed(currentIndexCount, 1u, 0u, 0u, 0u);
+			// TODO[Przemek]: draw parameters needs to reflect the mesh involved
+			cb->drawIndexed(m_triangleMeshIndexCount, 1u, 0u, 0u, 0u);
+		}
+		else
+		{
+			cb->bindDescriptorSets(asset::EPBP_GRAPHICS, pipelineLayout.get(), 0u, 2u, descriptorSets);
+			cb->bindIndexBuffer({ .offset = 0u, .buffer = drawResourcesFiller.gpuDrawBuffers.indexBuffer.get() }, asset::EIT_32BIT);
+			cb->bindGraphicsPipeline(graphicsPipeline.get());
+			cb->drawIndexed(currentIndexCount, 1u, 0u, 0u, 0u);
+		}
+		
 		if (fragmentShaderInterlockEnabled)
 		{
 			cb->bindGraphicsPipeline(resolveAlphaGraphicsPipeline.get());
@@ -3231,6 +3250,31 @@ class ComputerAidedDesign final : public examples::SimpleWindowedApplication, pu
 			}
 
 		}
+		else if (mode == ExampleMode::CASE_9)
+		{
+			core::vector<TriangleMeshVertex> vertices = {
+				{ float32_t2(0.0f, 10.0f), 0.0f },
+				{ float32_t2(-10.0f, -10.0f), 50.0f },
+				{ float32_t2(10.0f, -10.0f), 100.0f }
+			};
+
+			core::vector<uint32_t> indices = {
+				0, 1, 2
+			};
+
+			core::unordered_map<float32_t, float32_t3> heightColorMap;
+			heightColorMap.insert({ 0.0f, {0.0f, 1.0f, 0.0f} });
+			heightColorMap.insert({ 100.0f, {0.0f, 1.0f, 0.0f} });
+
+			m_triangleMeshIndexCount = indices.size();
+			m_pushConstants.verticesBaseAddress = sizeof(uint32_t) * m_triangleMeshIndexCount;
+
+			CTriangleMesh mesh;
+			mesh.setVertices(std::move(vertices));
+			mesh.setIndices(std::move(indices));
+
+			drawResourcesFiller.drawTriangleMesh(mesh, heightColorMap, intendedNextSubmit);
+		}
 		drawResourcesFiller.finalizeAllCopiesToGPU(intendedNextSubmit);
 	}
 
@@ -3311,6 +3355,9 @@ class ComputerAidedDesign final : public examples::SimpleWindowedApplication, pu
 	#endif
 	
 	std::unique_ptr<GeoTextureRenderer> m_geoTextureRenderer;
+
+	PushConstants m_pushConstants;
+	size_t m_triangleMeshIndexCount;
 };
 
 NBL_MAIN_FUNC(ComputerAidedDesign)
diff --git a/62_CAD/shaders/globals.hlsl b/62_CAD/shaders/globals.hlsl
index 392e796f4..4719f6df8 100644
--- a/62_CAD/shaders/globals.hlsl
+++ b/62_CAD/shaders/globals.hlsl
@@ -18,7 +18,6 @@
 
 using namespace nbl::hlsl;
 
-
 // because we can't use jit/device_capabilities.hlsl in c++ code
 #ifdef __HLSL_VERSION
 using pfloat64_t = portable_float64_t<jit::device_capabilities>;
@@ -32,6 +31,11 @@ using pfloat64_t3 = nbl::hlsl::vector<float64_t, 3>;
 
 using pfloat64_t3x3 = portable_matrix_t3x3<pfloat64_t>;
 
+struct PushConstants
+{
+    uint64_t verticesBaseAddress;
+};
+
 // TODO: Compute this in a compute shader from the world counterparts
 //      because this struct includes NDC coordinates, the values will change based camera zoom and move
 //      of course we could have the clip values to be in world units and also the matrix to transform to world instead of ndc but that requires extra computations(matrix multiplications) per vertex
@@ -265,6 +269,12 @@ NBL_CONSTEXPR float InvalidStyleStretchValue = nbl::hlsl::numeric_limits<float>:
 // TODO[Przemek]: we will need something similar to LineStyles but related to heigh shading settings which is user customizable (like LineStyle stipple patterns) and requires upper_bound to figure out the color based on height value.
 // We'll discuss that later or what it will be looking like and how it's gonna get passed to our shaders.
 
+struct TriangleMeshVertex
+{
+    float32_t2 pos;
+    float32_t height;
+};
+
 // The color parameter is also used for styling non-curve objects such as text glyphs and hatches with solid color
 struct LineStyle
 {
diff --git a/62_CAD/shaders/main_pipeline/common.hlsl b/62_CAD/shaders/main_pipeline/common.hlsl
index 17c851a19..ca13db341 100644
--- a/62_CAD/shaders/main_pipeline/common.hlsl
+++ b/62_CAD/shaders/main_pipeline/common.hlsl
@@ -98,7 +98,10 @@ struct PSInput
 
     void setCurrentWorldToScreenRatio(float worldToScreen) { interp_data5.y = worldToScreen; }
     float getCurrentWorldToScreenRatio() { return interp_data5.y; }
-    
+
+    void setHeightAtMeshVertex(float height) { interp_data5.x = height; }
+    float getHeightAtMeshVertex() { return interp_data5.x; }
+
     /* LINE */
     float2 getLineStart() { return data2.xy; }
     float2 getLineEnd() { return data2.zw; }
diff --git a/62_CAD/shaders/main_pipeline/vertex_shader.hlsl b/62_CAD/shaders/main_pipeline/vertex_shader.hlsl
index bff4182f6..3f9e55605 100644
--- a/62_CAD/shaders/main_pipeline/vertex_shader.hlsl
+++ b/62_CAD/shaders/main_pipeline/vertex_shader.hlsl
@@ -7,6 +7,8 @@
 #include <nbl/builtin/hlsl/algorithm.hlsl>
 #include <nbl/builtin/hlsl/jit/device_capabilities.hlsl>
 
+[[vk::push_constant]] PushConstants pc;
+
 // TODO[Lucas]: Move these functions to builtin hlsl functions (Even the shadertoy obb and aabb ones)
 float cross2D(float2 a, float2 b)
 {
@@ -92,6 +94,28 @@ PSInput main(uint vertexID : SV_VertexID)
     // ~~Later, most likely We will require pulling all 3 vertices of the triangle, that's where you need to know which triangle you're currently on, and instead of objectID = vertexID/4 which we currently do, you will do vertexID/3 and pull all 3 of it's vertices.~~
     // Ok, brainfart, a vertex can belong to multiple triangles, I was thinking of AA but triangles share vertices, nevermind my comment above.
 
+    TriangleMeshVertex vtx = vk::RawBufferLoad<TriangleMeshVertex>(pc.verticesBaseAddress + sizeof(TriangleMeshVertex) * vertexID, 8u);
+
+    PSInput outV;
+
+    pfloat64_t2 vtxPos;
+    vtxPos.x = _static_cast<pfloat64_t>(vtx.pos.x);
+    vtxPos.y = _static_cast<pfloat64_t>(vtx.pos.y);
+
+    DrawObject drawObj = drawObjects[0];
+    MainObject mainObj = mainObjects[drawObj.mainObjIndex];
+    ClipProjectionData clipProjectionData = getClipProjectionData(mainObj);
+
+    float2 transformedPos = transformPointScreenSpace(clipProjectionData.projectionToNDC, globals.resolution, vtxPos);
+
+    outV.position.xy = transformedPos;
+    outV.position.zw = float2(0.0, 1.0);
+    outV.setHeightAtMeshVertex(vtx.height);
+
+    return outV;
+
+#if 0
+
     const uint vertexIdx = vertexID & 0x3u;
     const uint objectID = vertexID >> 2;
 
@@ -589,4 +613,5 @@ PSInput main(uint vertexID : SV_VertexID)
 
     outV.clip = float4(outV.position.x - clipProjectionData.minClipNDC.x, outV.position.y - clipProjectionData.minClipNDC.y, clipProjectionData.maxClipNDC.x - outV.position.x, clipProjectionData.maxClipNDC.y - outV.position.y);
     return outV;
+#endif
 }

From eb38ef5169ffd2508dc74ff05632394c0100cb93 Mon Sep 17 00:00:00 2001
From: kevyuu <kevin.kayu@gmail.com>
Date: Mon, 3 Mar 2025 18:21:21 +0700
Subject: [PATCH 064/529] Adjust demo to ray_tracing_pipeline_demo fixes

---
 71_RayTracingPipeline/main.cpp | 34 +++++++++++++++++-----------------
 1 file changed, 17 insertions(+), 17 deletions(-)

diff --git a/71_RayTracingPipeline/main.cpp b/71_RayTracingPipeline/main.cpp
index 0c5473b73..5793ff8d3 100644
--- a/71_RayTracingPipeline/main.cpp
+++ b/71_RayTracingPipeline/main.cpp
@@ -354,12 +354,12 @@ class RaytracingPipelineApp final : public examples::SimpleWindowedApplication,
 
       auto& shaderGroups = params.shaderGroups;
 
-      shaderGroups.raygenGroup = { .shaderIndex = RTDS_RAYGEN };
+      shaderGroups.raygen = { .index = RTDS_RAYGEN };
 
       IRayTracingPipelineBase::SGeneralShaderGroup missGroups[EMT_COUNT];
-      missGroups[EMT_PRIMARY] = { .shaderIndex = RTDS_MISS };
-      missGroups[EMT_OCCLUSION] = { .shaderIndex = RTDS_SHADOW_MISS };
-      shaderGroups.missGroups = missGroups;
+      missGroups[EMT_PRIMARY] = { .index = RTDS_MISS };
+      missGroups[EMT_OCCLUSION] = { .index = RTDS_SHADOW_MISS };
+      shaderGroups.misses = missGroups;
 
       auto getHitGroupIndex = [](E_GEOM_TYPE geomType, E_RAY_TYPE rayType)
         {
@@ -367,28 +367,28 @@ class RaytracingPipelineApp final : public examples::SimpleWindowedApplication,
         };
       IRayTracingPipelineBase::SHitShaderGroup hitGroups[E_RAY_TYPE::ERT_COUNT * E_GEOM_TYPE::EGT_COUNT];
       hitGroups[getHitGroupIndex(EGT_TRIANGLES, ERT_PRIMARY)] = {
-        .closestHitShaderIndex = RTDS_CLOSEST_HIT,
-        .anyHitShaderIndex = RTDS_ANYHIT_PRIMARY,
+        .closestHit = RTDS_CLOSEST_HIT,
+        .anyHit = RTDS_ANYHIT_PRIMARY,
       };
       hitGroups[getHitGroupIndex(EGT_TRIANGLES, ERT_OCCLUSION)] = {
-        .anyHitShaderIndex = RTDS_ANYHIT_SHADOW,
+        .anyHit = RTDS_ANYHIT_SHADOW,
       };
       hitGroups[getHitGroupIndex(EGT_PROCEDURAL, ERT_PRIMARY)] = {
-        .closestHitShaderIndex = RTDS_SPHERE_CLOSEST_HIT,
-        .anyHitShaderIndex = RTDS_ANYHIT_PRIMARY,
-        .intersectionShaderIndex = RTDS_INTERSECTION,
+        .closestHit = RTDS_SPHERE_CLOSEST_HIT,
+        .anyHit = RTDS_ANYHIT_PRIMARY,
+        .intersectionShader = RTDS_INTERSECTION,
       };
       hitGroups[getHitGroupIndex(EGT_PROCEDURAL, ERT_OCCLUSION)] = {
-        .anyHitShaderIndex = RTDS_ANYHIT_SHADOW,
-        .intersectionShaderIndex = RTDS_INTERSECTION,
+        .anyHit = RTDS_ANYHIT_SHADOW,
+        .intersectionShader = RTDS_INTERSECTION,
       };
-      shaderGroups.hitGroups = hitGroups;
+      shaderGroups.hits = hitGroups;
 
       IRayTracingPipelineBase::SGeneralShaderGroup callableGroups[ELT_COUNT];
-      callableGroups[ELT_DIRECTIONAL] = { .shaderIndex = RTDS_DIRECTIONAL_CALL };
-      callableGroups[ELT_POINT] = { .shaderIndex = RTDS_POINT_CALL };
-      callableGroups[ELT_SPOT] = { .shaderIndex = RTDS_SPOT_CALL };
-      shaderGroups.callableGroups = callableGroups;
+      callableGroups[ELT_DIRECTIONAL] = { .index = RTDS_DIRECTIONAL_CALL };
+      callableGroups[ELT_POINT] = { .index = RTDS_POINT_CALL };
+      callableGroups[ELT_SPOT] = { .index = RTDS_SPOT_CALL };
+      shaderGroups.callables = callableGroups;
 
       params.cached.maxRecursionDepth = 1;
 

From b1831d983d2f8d8df7641d44d8cde857c6977a2c Mon Sep 17 00:00:00 2001
From: keptsecret <sorchon@gmail.com>
Date: Tue, 4 Mar 2025 16:56:49 +0700
Subject: [PATCH 065/529] refactor to use new frisvad

---
 31_HLSLPathTracer/app_resources/hlsl/common.hlsl | 5 +++--
 1 file changed, 3 insertions(+), 2 deletions(-)

diff --git a/31_HLSLPathTracer/app_resources/hlsl/common.hlsl b/31_HLSLPathTracer/app_resources/hlsl/common.hlsl
index 913225f8b..2482806e2 100644
--- a/31_HLSLPathTracer/app_resources/hlsl/common.hlsl
+++ b/31_HLSLPathTracer/app_resources/hlsl/common.hlsl
@@ -271,9 +271,10 @@ struct Shape<PST_SPHERE>
             const float sinTheta = nbl::hlsl::sqrt(1.0 - cosTheta2);
             float sinPhi, cosPhi;
             math::sincos(2.0 * numbers::pi<float> * xi.y - numbers::pi<float>, sinPhi, cosPhi);
-            float32_t2x3 XY = math::frisvad<float>(Z);
+            float32_t3 X, Y;
+            math::frisvad<float32_t3>(Z, X, Y);
 
-            L += (XY[0] * cosPhi + XY[1] * sinPhi) * sinTheta;
+            L += (X * cosPhi + Y * sinPhi) * sinTheta;
 
             newRayMaxT = (cosTheta - nbl::hlsl::sqrt(cosTheta2 - cosThetaMax2)) / rcpDistance;
             pdf = 1.0 / (2.0 * numbers::pi<float> * (1.0 - cosThetaMax));

From 8058cff295589b358eba291ed22804601cc6c7bf Mon Sep 17 00:00:00 2001
From: Przemek <minikers21@gmail.com>
Date: Thu, 27 Feb 2025 12:18:59 +0100
Subject: [PATCH 066/529] Simple triangle draw

---
 62_CAD/CTriangleMesh.h                        | 26 ++++++---
 62_CAD/DrawResourcesFiller.cpp                | 54 +++++++++++++------
 62_CAD/DrawResourcesFiller.h                  |  2 +-
 62_CAD/main.cpp                               | 29 +++++-----
 62_CAD/shaders/globals.hlsl                   |  6 ++-
 .../main_pipeline/fragment_shader.hlsl        |  2 +
 .../shaders/main_pipeline/vertex_shader.hlsl  | 22 +++++---
 7 files changed, 95 insertions(+), 46 deletions(-)

diff --git a/62_CAD/CTriangleMesh.h b/62_CAD/CTriangleMesh.h
index 3f39fb750..6b5612a5c 100644
--- a/62_CAD/CTriangleMesh.h
+++ b/62_CAD/CTriangleMesh.h
@@ -9,7 +9,17 @@ using namespace nbl;
 class CTriangleMesh final
 {
 public:
-	inline void setVertices(core::vector<TriangleMeshVertex>&& vertices)
+	using index_t = uint32_t;
+	using vertex_t = TriangleMeshVertex;
+
+	struct DrawData
+	{
+		PushConstants pushConstants;
+		uint64_t indexBufferOffset;
+		uint64_t indexCount;
+	};
+
+	inline void setVertices(core::vector<vertex_t>&& vertices)
 	{
 		m_vertices = std::move(vertices);
 	}
@@ -18,7 +28,7 @@ class CTriangleMesh final
 		m_indices = std::move(indices);
 	}
 
-	inline const core::vector<TriangleMeshVertex>& getVertices() const
+	inline const core::vector<vertex_t>& getVertices() const
 	{
 		return m_vertices;
 	}
@@ -29,15 +39,19 @@ class CTriangleMesh final
 
 	inline size_t getVtxBuffByteSize() const
 	{
-		return sizeof(decltype(m_vertices)::value_type);
+		return sizeof(vertex_t) * m_vertices.size();
 	}
 	inline size_t getIdxBuffByteSize() const
 	{
-		return sizeof(decltype(m_indices)::value_type);
+		return sizeof(index_t) * m_indices.size();
+	}
+	inline size_t getIdxCnt() const
+	{
+		return m_indices.size();
 	}
 
 
 private:
-	core::vector<TriangleMeshVertex> m_vertices;
-	core::vector<uint32_t> m_indices;
+	core::vector<vertex_t> m_vertices;
+	core::vector<index_t> m_indices;
 };
\ No newline at end of file
diff --git a/62_CAD/DrawResourcesFiller.cpp b/62_CAD/DrawResourcesFiller.cpp
index 32f72c07b..291e0ad88 100644
--- a/62_CAD/DrawResourcesFiller.cpp
+++ b/62_CAD/DrawResourcesFiller.cpp
@@ -84,7 +84,7 @@ void DrawResourcesFiller::allocateGeometryBuffer(ILogicalDevice* logicalDevice,
 
 	IGPUBuffer::SCreationParams geometryCreationParams = {};
 	geometryCreationParams.size = size;
-	geometryCreationParams.usage = bitflag(IGPUBuffer::EUF_STORAGE_BUFFER_BIT) | IGPUBuffer::EUF_SHADER_DEVICE_ADDRESS_BIT | IGPUBuffer::EUF_TRANSFER_DST_BIT;
+	geometryCreationParams.usage = bitflag(IGPUBuffer::EUF_STORAGE_BUFFER_BIT) | IGPUBuffer::EUF_SHADER_DEVICE_ADDRESS_BIT | IGPUBuffer::EUF_TRANSFER_DST_BIT | IGPUBuffer::EUF_INDEX_BUFFER_BIT;
 	gpuDrawBuffers.geometryBuffer = logicalDevice->createBuffer(std::move(geometryCreationParams));
 	gpuDrawBuffers.geometryBuffer->setObjectDebugName("geometryBuffer");
 
@@ -218,24 +218,48 @@ void DrawResourcesFiller::drawPolyline(const CPolylineBase& polyline, uint32_t p
 	}
 }
 
-void DrawResourcesFiller::drawTriangleMesh(const CTriangleMesh& mesh, core::unordered_map<float32_t, float32_t3> heightColorMap, SIntendedSubmitInfo& intendedNextSubmit)
+void DrawResourcesFiller::drawTriangleMesh(const CTriangleMesh& mesh, CTriangleMesh::DrawData& drawData, SIntendedSubmitInfo& intendedNextSubmit)
 {
 	ICPUBuffer::SCreationParams geometryBuffParams;
 	
 	// concatenate the index and vertex buffer into the geometry buffer
-	const size_t indexBuffSize = mesh.getIdxBuffByteSize();
-	const size_t vtxBuffSize = mesh.getVtxBuffByteSize();
-	const size_t geometryBufferSizeDataSize = indexBuffSize + vtxBuffSize;
-
-	core::vector<uint8_t> geometryBufferData(geometryBufferSizeDataSize);
-	std::memcpy(geometryBufferData.data(), mesh.getIndices().data(), indexBuffSize);
-	std::memcpy(geometryBufferData.data() + indexBuffSize, mesh.getVertices().data(), vtxBuffSize);
-
-	SBufferRange<IGPUBuffer> geometryBuffRange;
-	geometryBuffRange.offset = 0;
-	geometryBuffRange.size = geometryBufferSizeDataSize;
-	geometryBuffRange.buffer = gpuDrawBuffers.drawObjectsBuffer;
-	m_utilities->updateBufferRangeViaStagingBuffer(intendedNextSubmit, geometryBuffRange, geometryBufferData.data());
+	const size_t indexBuffByteSize = mesh.getIdxBuffByteSize();
+	const size_t vtxBuffByteSize = mesh.getVtxBuffByteSize();
+	const size_t geometryBufferDataToAddByteSize = indexBuffByteSize + vtxBuffByteSize;
+
+	// copy into gemoetry cpu buffer insteaed
+
+	// TODO: rename, its not just points
+	const uint32_t maxGeometryBufferPoints = static_cast<uint32_t>(maxGeometryBufferSize - currentGeometryBufferSize);
+
+	// TODO: assert of geometry buffer size, do i need to check if size of objects to be added <= maxGeometryBufferPoints?
+	// TODO: auto submit instead of assert
+	assert(geometryBufferDataToAddByteSize <= maxGeometryBufferPoints);
+
+	// TODO: vertices need to be aligned to 8?
+	uint64_t vtxBufferAddress;
+	{
+		void* dst = reinterpret_cast<char*>(cpuDrawBuffers.geometryBuffer->getPointer()) + currentGeometryBufferSize;
+		void* dst1 = dst;
+
+		drawData.indexBufferOffset = currentGeometryBufferSize;
+		memcpy(dst, mesh.getIndices().data(), indexBuffByteSize);
+		currentGeometryBufferSize += indexBuffByteSize;
+
+		dst = reinterpret_cast<char*>(cpuDrawBuffers.geometryBuffer->getPointer()) + currentGeometryBufferSize;
+		drawData.pushConstants.triangleMeshVerticesBaseAddress = geometryBufferAddress + currentGeometryBufferSize;
+		memcpy(dst, mesh.getVertices().data(), vtxBuffByteSize);
+		currentGeometryBufferSize += vtxBuffByteSize;
+	}
+
+	drawData.indexCount = mesh.getIdxCnt();
+
+	// call addMainObject_SubmitIfNeeded, use its index in push constants
+
+	drawData.pushConstants.triangleMeshMainObjectIndex = addMainObject_SubmitIfNeeded(0, intendedNextSubmit);
+
+	// TODO: use this function later for auto submit
+	//submitCurrentDrawObjectsAndReset(intendedNextSubmit, 0);
 }
 
 // TODO[Erfan]: Makes more sense if parameters are: solidColor + fillPattern + patternColor
diff --git a/62_CAD/DrawResourcesFiller.h b/62_CAD/DrawResourcesFiller.h
index f9ab033e9..c3b31d32e 100644
--- a/62_CAD/DrawResourcesFiller.h
+++ b/62_CAD/DrawResourcesFiller.h
@@ -77,7 +77,7 @@ struct DrawResourcesFiller
 
 	void drawPolyline(const CPolylineBase& polyline, uint32_t polylineMainObjIdx, SIntendedSubmitInfo& intendedNextSubmit);
 	
-	void drawTriangleMesh(const CTriangleMesh& mesh, core::unordered_map<float32_t, float32_t3> heightColorMap, SIntendedSubmitInfo& intendedNextSubmit);
+	void drawTriangleMesh(const CTriangleMesh& mesh, CTriangleMesh::DrawData& pushConstants, SIntendedSubmitInfo& intendedNextSubmit);
 
 	// ! Convinience function for Hatch with MSDF Pattern and a solid background
 	void drawHatch(
diff --git a/62_CAD/main.cpp b/62_CAD/main.cpp
index a893d9b40..a14d9de55 100644
--- a/62_CAD/main.cpp
+++ b/62_CAD/main.cpp
@@ -1399,20 +1399,20 @@ class ComputerAidedDesign final : public examples::SimpleWindowedApplication, pu
 
 			// TODO[Przemek]: based on our call bind index buffer you uploaded to part of the `drawResourcesFiller.gpuDrawBuffers.geometryBuffer`
 			// Vertices will be pulled based on baseBDAPointer of where you uploaded the vertex + the VertexID in the vertex shader.
-			cb->bindIndexBuffer({ .offset = 0u, .buffer = drawResourcesFiller.gpuDrawBuffers.geometryBuffer.get() }, asset::EIT_32BIT);
+			cb->bindIndexBuffer({ .offset = m_triangleMeshDrawData.indexBufferOffset, .buffer = drawResourcesFiller.gpuDrawBuffers.geometryBuffer.get() }, asset::EIT_32BIT);
 
 			// TODO[Przemek]: binding the same pipelie, no need to change.
 			cb->bindGraphicsPipeline(graphicsPipeline.get());
 
 			// TODO[Przemek]: contour settings, height shading settings, base bda pointers will need to be pushed via pushConstants before the draw currently as it's the easiest thing to do.
-			cb->pushConstants(graphicsPipeline->getLayout(), IGPUShader::E_SHADER_STAGE::ESS_FRAGMENT, 0, sizeof(PushConstants), &m_pushConstants);
+
+			cb->pushConstants(graphicsPipeline->getLayout(), IGPUShader::E_SHADER_STAGE::ESS_VERTEX, 0, sizeof(PushConstants), &m_triangleMeshDrawData.pushConstants);
 
 			// TODO[Przemek]: draw parameters needs to reflect the mesh involved
-			cb->drawIndexed(m_triangleMeshIndexCount, 1u, 0u, 0u, 0u);
+			cb->drawIndexed(m_triangleMeshDrawData.indexCount, 1u, 0u, 0u, 0u);
 		}
 		else
 		{
-			cb->bindDescriptorSets(asset::EPBP_GRAPHICS, pipelineLayout.get(), 0u, 2u, descriptorSets);
 			cb->bindIndexBuffer({ .offset = 0u, .buffer = drawResourcesFiller.gpuDrawBuffers.indexBuffer.get() }, asset::EIT_32BIT);
 			cb->bindGraphicsPipeline(graphicsPipeline.get());
 			cb->drawIndexed(currentIndexCount, 1u, 0u, 0u, 0u);
@@ -3253,27 +3253,25 @@ class ComputerAidedDesign final : public examples::SimpleWindowedApplication, pu
 		else if (mode == ExampleMode::CASE_9)
 		{
 			core::vector<TriangleMeshVertex> vertices = {
-				{ float32_t2(0.0f, 10.0f), 0.0f },
-				{ float32_t2(-10.0f, -10.0f), 50.0f },
-				{ float32_t2(10.0f, -10.0f), 100.0f }
+				{ float32_t2(0.0f, 0.0f), 0.0f },
+				{ float32_t2(0.0f, 100.0f), 50.0f },
+				{ float32_t2(200.0f, 50.0f), 100.0f }
 			};
 
 			core::vector<uint32_t> indices = {
 				0, 1, 2
 			};
 
-			core::unordered_map<float32_t, float32_t3> heightColorMap;
-			heightColorMap.insert({ 0.0f, {0.0f, 1.0f, 0.0f} });
-			heightColorMap.insert({ 100.0f, {0.0f, 1.0f, 0.0f} });
-
-			m_triangleMeshIndexCount = indices.size();
-			m_pushConstants.verticesBaseAddress = sizeof(uint32_t) * m_triangleMeshIndexCount;
+			// TODO: height color map
+			//core::unordered_map<float32_t, float32_t3> heightColorMap;
+			//heightColorMap.insert({ 0.0f, {0.0f, 1.0f, 0.0f} });
+			//heightColorMap.insert({ 100.0f, {0.0f, 1.0f, 0.0f} });
 
 			CTriangleMesh mesh;
 			mesh.setVertices(std::move(vertices));
 			mesh.setIndices(std::move(indices));
 
-			drawResourcesFiller.drawTriangleMesh(mesh, heightColorMap, intendedNextSubmit);
+			drawResourcesFiller.drawTriangleMesh(mesh, m_triangleMeshDrawData, intendedNextSubmit);
 		}
 		drawResourcesFiller.finalizeAllCopiesToGPU(intendedNextSubmit);
 	}
@@ -3356,8 +3354,7 @@ class ComputerAidedDesign final : public examples::SimpleWindowedApplication, pu
 	
 	std::unique_ptr<GeoTextureRenderer> m_geoTextureRenderer;
 
-	PushConstants m_pushConstants;
-	size_t m_triangleMeshIndexCount;
+	CTriangleMesh::DrawData m_triangleMeshDrawData;
 };
 
 NBL_MAIN_FUNC(ComputerAidedDesign)
diff --git a/62_CAD/shaders/globals.hlsl b/62_CAD/shaders/globals.hlsl
index 4719f6df8..1902ba39e 100644
--- a/62_CAD/shaders/globals.hlsl
+++ b/62_CAD/shaders/globals.hlsl
@@ -33,7 +33,8 @@ using pfloat64_t3x3 = portable_matrix_t3x3<pfloat64_t>;
 
 struct PushConstants
 {
-    uint64_t verticesBaseAddress;
+    uint64_t triangleMeshVerticesBaseAddress;
+    uint32_t triangleMeshMainObjectIndex;
 };
 
 // TODO: Compute this in a compute shader from the world counterparts
@@ -111,7 +112,8 @@ enum class ObjectType : uint32_t
     CURVE_BOX = 2u,
     POLYLINE_CONNECTOR = 3u,
     FONT_GLYPH = 4u,
-    IMAGE = 5u
+    IMAGE = 5u,
+    TRIANGLE_MESH = 6u
 };
 
 enum class MajorAxis : uint32_t
diff --git a/62_CAD/shaders/main_pipeline/fragment_shader.hlsl b/62_CAD/shaders/main_pipeline/fragment_shader.hlsl
index e850622c3..2f21a6a0f 100644
--- a/62_CAD/shaders/main_pipeline/fragment_shader.hlsl
+++ b/62_CAD/shaders/main_pipeline/fragment_shader.hlsl
@@ -405,6 +405,8 @@ float32_t4 calculateFinalColor<true>(const uint2 fragCoord, const float localAlp
 [shader("pixel")]
 float4 fragMain(PSInput input) : SV_TARGET
 {
+    return float4(1.0f, 0.0f, 0.0f, 1.0f);
+
     float localAlpha = 0.0f;
     float3 textureColor = float3(0, 0, 0); // color sampled from a texture
 
diff --git a/62_CAD/shaders/main_pipeline/vertex_shader.hlsl b/62_CAD/shaders/main_pipeline/vertex_shader.hlsl
index 3f9e55605..a798549d5 100644
--- a/62_CAD/shaders/main_pipeline/vertex_shader.hlsl
+++ b/62_CAD/shaders/main_pipeline/vertex_shader.hlsl
@@ -94,27 +94,37 @@ PSInput main(uint vertexID : SV_VertexID)
     // ~~Later, most likely We will require pulling all 3 vertices of the triangle, that's where you need to know which triangle you're currently on, and instead of objectID = vertexID/4 which we currently do, you will do vertexID/3 and pull all 3 of it's vertices.~~
     // Ok, brainfart, a vertex can belong to multiple triangles, I was thinking of AA but triangles share vertices, nevermind my comment above.
 
-    TriangleMeshVertex vtx = vk::RawBufferLoad<TriangleMeshVertex>(pc.verticesBaseAddress + sizeof(TriangleMeshVertex) * vertexID, 8u);
-
+#define DTM
+#ifdef DTM
     PSInput outV;
 
+    // Default Initialize PS Input
+    outV.position.zw = float2(0.0, 1.0);
+    outV.data1 = uint4(0, 0, 0, 0);
+    outV.data2 = float4(0, 0, 0, 0);
+    outV.data3 = float4(0, 0, 0, 0);
+    outV.data4 = float4(0, 0, 0, 0);
+    outV.interp_data5 = float2(0, 0);
+    outV.setObjType(ObjectType::TRIANGLE_MESH);
+    outV.setMainObjectIdx(pc.triangleMeshMainObjectIndex);
+
+    TriangleMeshVertex vtx = vk::RawBufferLoad<TriangleMeshVertex>(pc.triangleMeshVerticesBaseAddress + sizeof(TriangleMeshVertex) * vertexID, 4u);
     pfloat64_t2 vtxPos;
     vtxPos.x = _static_cast<pfloat64_t>(vtx.pos.x);
     vtxPos.y = _static_cast<pfloat64_t>(vtx.pos.y);
 
-    DrawObject drawObj = drawObjects[0];
-    MainObject mainObj = mainObjects[drawObj.mainObjIndex];
+    MainObject mainObj = mainObjects[pc.triangleMeshMainObjectIndex];
     ClipProjectionData clipProjectionData = getClipProjectionData(mainObj);
 
     float2 transformedPos = transformPointScreenSpace(clipProjectionData.projectionToNDC, globals.resolution, vtxPos);
 
     outV.position.xy = transformedPos;
-    outV.position.zw = float2(0.0, 1.0);
+    outV.position = transformFromSreenSpaceToNdc(outV.position.xy, globals.resolution);
     outV.setHeightAtMeshVertex(vtx.height);
 
     return outV;
 
-#if 0
+#else
 
     const uint vertexIdx = vertexID & 0x3u;
     const uint objectID = vertexID >> 2;

From cb5662a63b6d24dbdc620aeb3606582dc51a4f9f Mon Sep 17 00:00:00 2001
From: keptsecret <sorchon@gmail.com>
Date: Fri, 7 Mar 2025 10:44:08 +0700
Subject: [PATCH 067/529] fix bugs again

---
 .../app_resources/hlsl/common.hlsl            |  4 ++--
 .../app_resources/hlsl/material_system.hlsl   | 17 ++++++++++----
 .../hlsl/next_event_estimator.hlsl            | 23 +++++++++++++------
 .../app_resources/hlsl/pathtracer.hlsl        |  7 +++---
 31_HLSLPathTracer/main.cpp                    |  6 ++---
 5 files changed, 38 insertions(+), 19 deletions(-)

diff --git a/31_HLSLPathTracer/app_resources/hlsl/common.hlsl b/31_HLSLPathTracer/app_resources/hlsl/common.hlsl
index 2482806e2..244a92107 100644
--- a/31_HLSLPathTracer/app_resources/hlsl/common.hlsl
+++ b/31_HLSLPathTracer/app_resources/hlsl/common.hlsl
@@ -66,7 +66,7 @@ struct Ray
     // immutable
     vector3_type origin;
     vector3_type direction;
-    
+
     // TODO: polygon method == 2 stuff
     vector3_type normalAtOrigin;
     bool wasBSDFAtOrigin;
@@ -417,7 +417,7 @@ struct Shape<PST_TRIANGLE>
                 shapes::SphericalTriangle<float> st = shapes::SphericalTriangle<float>::create(vertex0, vertex1, vertex2, origin);
                 sampling::ProjectedSphericalTriangle<float> sst = sampling::ProjectedSphericalTriangle<float>::create(st);
 
-                const float32_t3 L = sst.generate(rcpPdf, interaction.N, isBSDF, xi.xy);
+                const float32_t3 L = sst.generate(rcpPdf, interaction.isotropic.N, isBSDF, xi.xy);
 
                 pdf = rcpPdf > numeric_limits<float>::min ? (1.0 / rcpPdf) : 0.0;
 
diff --git a/31_HLSLPathTracer/app_resources/hlsl/material_system.hlsl b/31_HLSLPathTracer/app_resources/hlsl/material_system.hlsl
index 1a613080f..09236c85e 100644
--- a/31_HLSLPathTracer/app_resources/hlsl/material_system.hlsl
+++ b/31_HLSLPathTracer/app_resources/hlsl/material_system.hlsl
@@ -39,6 +39,7 @@ struct System
     using vector3_type = vector<scalar_type, 3>;
     using measure_type = typename DiffuseBxDF::spectral_type;
     using sample_type = typename DiffuseBxDF::sample_type;
+    using ray_dir_info_type = typename sample_type::ray_dir_info_type;
     using quotient_pdf_type = typename DiffuseBxDF::quotient_pdf_type;
     using anisotropic_type = typename DiffuseBxDF::anisotropic_type;
     using anisocache_type = typename ConductorBxDF::anisocache_type;
@@ -85,7 +86,7 @@ struct System
         }
     }
 
-    sample_type generate(NBL_CONST_REF_ARG(Material) material, NBL_CONST_REF_ARG(create_params_t) cparams, anisotropic_type interaction, NBL_CONST_REF_ARG(vector3_type) u, NBL_REF_ARG(anisocache_type) cache)
+    sample_type generate(NBL_CONST_REF_ARG(Material) material, NBL_CONST_REF_ARG(create_params_t) cparams, NBL_CONST_REF_ARG(anisotropic_type) interaction, NBL_CONST_REF_ARG(vector3_type) u, NBL_REF_ARG(anisocache_type) _cache)
     {
         switch(material.type)
         {
@@ -98,18 +99,26 @@ struct System
             case Material::Type::CONDUCTOR:
             {
                 conductorBxDF.init(cparams);
-                return conductorBxDF.generate(interaction, u.xy, cache);
+                return conductorBxDF.generate(interaction, u.xy, _cache);
             }
             break;
             case Material::Type::DIELECTRIC:
             {
                 dielectricBxDF.init(cparams);
-                return dielectricBxDF.generate(interaction, u, cache);
+                return dielectricBxDF.generate(interaction, u, _cache);
             }
             break;
             default:
-                return (sample_type)numeric_limits<float>::infinity;
+            {
+                ray_dir_info_type L;
+                L.direction = (vector3_type)0;
+                return sample_type::create(L, 0, (vector3_type)0);
+            }
         }
+
+        ray_dir_info_type L;
+        L.direction = (vector3_type)0;
+        return sample_type::create(L, 0, (vector3_type)0);
     }
 
     quotient_pdf_type quotient_and_pdf(NBL_CONST_REF_ARG(Material) material, NBL_CONST_REF_ARG(create_params_t) cparams, NBL_CONST_REF_ARG(params_t) params)
diff --git a/31_HLSLPathTracer/app_resources/hlsl/next_event_estimator.hlsl b/31_HLSLPathTracer/app_resources/hlsl/next_event_estimator.hlsl
index 15dbf3a9b..38a5fae15 100644
--- a/31_HLSLPathTracer/app_resources/hlsl/next_event_estimator.hlsl
+++ b/31_HLSLPathTracer/app_resources/hlsl/next_event_estimator.hlsl
@@ -23,6 +23,7 @@ struct Estimator
     using interaction_type = Aniso;
     using quotient_pdf_type = bxdf::quotient_and_pdf<spectral_type, scalar_type>;
     using sample_type = LightSample;
+    using ray_dir_info_type = typename sample_type::ray_dir_info_type;
 
     static spectral_type proceduralDeferredEvalAndPdf(NBL_REF_ARG(scalar_type) pdf, NBL_CONST_REF_ARG(light_type) light, NBL_CONST_REF_ARG(ray_type) ray, NBL_CONST_REF_ARG(Event) event)
     {
@@ -88,6 +89,7 @@ struct Estimator
             default:
                 return (spectral_type)0.0;
         }
+        return (spectral_type)0.0;
     }
 
     static sample_type procedural_generate_and_quotient_and_pdf(NBL_REF_ARG(quotient_pdf_type) quotient_pdf, NBL_REF_ARG(scalar_type) newRayMaxT, NBL_CONST_REF_ARG(light_type) light, NBL_CONST_REF_ARG(vector3_type) origin, NBL_CONST_REF_ARG(interaction_type) interaction, bool isBSDF, NBL_CONST_REF_ARG(vector3_type) xi, uint32_t depth, NBL_CONST_REF_ARG(Event) event)
@@ -104,9 +106,11 @@ struct Estimator
                 vector3_type position = vector3_type(asfloat(event.data[2]), asfloat(event.data[3]), asfloat(event.data[4]));
                 Shape<PST_SPHERE> sphere = Shape<PST_SPHERE>::create(position, asfloat(event.data[5]), event.data[6]);
                 const vector3_type sampleL = sphere.template generate_and_pdf<interaction_type>(pdf, newRayMaxT, origin, interaction, isBSDF, xi);
-                const vector3_type V = interaction.V.getDirection();
+                const vector3_type V = interaction.isotropic.V.getDirection();
                 const scalar_type VdotL = nbl::hlsl::dot<vector3_type>(V, sampleL);
-                L = sample_type::create(sampleL,VdotL,interaction.T,interaction.B,interaction.N);
+                ray_dir_info_type rayL;
+                rayL.direction = sampleL;
+                L = sample_type::create(rayL,VdotL,interaction.T,interaction.B,interaction.isotropic.N);
             }
             break;
             case PST_TRIANGLE:
@@ -116,9 +120,11 @@ struct Estimator
                 vector3_type vertex2 = vector3_type(asfloat(event.data[8]), asfloat(event.data[9]), asfloat(event.data[10]));
                 Shape<PST_TRIANGLE> tri = Shape<PST_TRIANGLE>::create(vertex0, vertex1, vertex2, event.data[11]);
                 const vector3_type sampleL = tri.template generate_and_pdf<interaction_type>(pdf, newRayMaxT, origin, interaction, isBSDF, xi);
-                const vector3_type V = interaction.V.getDirection();
+                const vector3_type V = interaction.isotropic.V.getDirection();
                 const scalar_type VdotL = nbl::hlsl::dot<vector3_type>(V, sampleL);
-                L = sample_type::create(sampleL,VdotL,interaction.T,interaction.B,interaction.N);
+                ray_dir_info_type rayL;
+                rayL.direction = sampleL;
+                L = sample_type::create(rayL,VdotL,interaction.T,interaction.B,interaction.isotropic.N);
             }
             break;
             case PST_RECTANGLE:
@@ -128,9 +134,11 @@ struct Estimator
                 vector3_type edge1 = vector3_type(asfloat(event.data[8]), asfloat(event.data[9]), asfloat(event.data[10]));
                 Shape<PST_RECTANGLE> rect = Shape<PST_RECTANGLE>::create(offset, edge0, edge1, event.data[11]);
                 const vector3_type sampleL = rect.template generate_and_pdf<interaction_type>(pdf, newRayMaxT, origin, interaction, isBSDF, xi);
-                const vector3_type V = interaction.V.getDirection();
+                const vector3_type V = interaction.isotropic.V.getDirection();
                 const scalar_type VdotL = nbl::hlsl::dot<vector3_type>(V, sampleL);
-                L = sample_type::create(sampleL,VdotL,interaction.T,interaction.B,interaction.N);
+                ray_dir_info_type rayL;
+                rayL.direction = sampleL;
+                L = sample_type::create(rayL,VdotL,interaction.T,interaction.B,interaction.isotropic.N);
             }
             break;
             default:
@@ -149,6 +157,7 @@ struct Estimator
     static sample_type generate_and_quotient_and_pdf(NBL_REF_ARG(quotient_pdf_type) quotient_pdf, NBL_REF_ARG(scalar_type) newRayMaxT, NBL_CONST_REF_ARG(light_type) light, NBL_CONST_REF_ARG(vector3_type) origin, NBL_CONST_REF_ARG(interaction_type) interaction, bool isBSDF, NBL_CONST_REF_ARG(vector3_type) xi, uint32_t depth, NBL_CONST_REF_ARG(Event) event)
     {
         const Event::Mode mode = (Event::Mode)event.mode;
+        sample_type L;
         switch (mode)
         {
             case Event::Mode::RAY_QUERY:
@@ -168,10 +177,10 @@ struct Estimator
             break;
             default:
             {
-                sample_type L;
                 return L;
             }
         }
+        return L;
     }
 };
 
diff --git a/31_HLSLPathTracer/app_resources/hlsl/pathtracer.hlsl b/31_HLSLPathTracer/app_resources/hlsl/pathtracer.hlsl
index 62398a58e..ba683d443 100644
--- a/31_HLSLPathTracer/app_resources/hlsl/pathtracer.hlsl
+++ b/31_HLSLPathTracer/app_resources/hlsl/pathtracer.hlsl
@@ -54,6 +54,7 @@ struct Unidirectional
     using vector3_type = vector<scalar_type, 3>;
     using measure_type = typename MaterialSystem::measure_type;
     using sample_type = typename NextEventEstimator::sample_type;
+    using ray_dir_info_type = typename sample_type::ray_dir_info_type;
     using ray_type = typename RayGen::ray_type;
     using light_type = Light<measure_type>;
     using bxdfnode_type = BxDFNode<measure_type>;
@@ -181,7 +182,7 @@ struct Unidirectional
             bool validPath = nee_sample.NdotL > numeric_limits<scalar_type>::min;
             // but if we allowed non-watertight transmitters (single water surface), it would make sense just to apply this line by itself
             anisocache_type _cache;
-            validPath = validPath && anisocache_type::compute(_cache, interaction, nee_sample, monochromeEta);
+            validPath = validPath && anisocache_type::template compute<ray_dir_info_type, ray_dir_info_type>(_cache, interaction, nee_sample, monochromeEta);
             bxdf.params.A = nbl::hlsl::max(bxdf.params.A, vector<scalar_type, 2>(0,0));
             bxdf.params.eta = monochromeEta;
 
@@ -268,7 +269,7 @@ struct Unidirectional
                     params = params_type::template create<sample_type, anisotropic_type, anisocache_type>(bsdf_sample, interaction, _cache, bxdf::BCM_MAX);
                 else
                 {
-                    isocache_type isocache = (isocache_type)_cache;
+                    isocache_type isocache = _cache.iso_cache;
                     params = params_type::template create<sample_type, isotropic_type, isocache_type>(bsdf_sample, iso_interaction, isocache, bxdf::BCM_MAX);
                 }
             }
@@ -282,7 +283,7 @@ struct Unidirectional
                     params = params_type::template create<sample_type, anisotropic_type, anisocache_type>(bsdf_sample, interaction, _cache, bxdf::BCM_ABS);
                 else
                 {
-                    isocache_type isocache = (isocache_type)_cache;
+                    isocache_type isocache = _cache.iso_cache;
                     params = params_type::template create<sample_type, isotropic_type, isocache_type>(bsdf_sample, iso_interaction, isocache, bxdf::BCM_ABS);
                 }
             }
diff --git a/31_HLSLPathTracer/main.cpp b/31_HLSLPathTracer/main.cpp
index 4bb260b09..036fcdb79 100644
--- a/31_HLSLPathTracer/main.cpp
+++ b/31_HLSLPathTracer/main.cpp
@@ -23,7 +23,7 @@ struct PTPushConstant {
 
 // TODO: Add a QueryPool for timestamping once its ready
 // TODO: Do buffer creation using assConv
-class ComputeShaderPathtracer final : public examples::SimpleWindowedApplication, public application_templates::MonoAssetManagerAndBuiltinResourceApplication
+class HLSLComputePathtracer final : public examples::SimpleWindowedApplication, public application_templates::MonoAssetManagerAndBuiltinResourceApplication
 {
 		using device_base_t = examples::SimpleWindowedApplication;
 		using asset_base_t = application_templates::MonoAssetManagerAndBuiltinResourceApplication;
@@ -69,7 +69,7 @@ class ComputeShaderPathtracer final : public examples::SimpleWindowedApplication
 		};
 
 	public:
-		inline ComputeShaderPathtracer(const path& _localInputCWD, const path& _localOutputCWD, const path& _sharedInputCWD, const path& _sharedOutputCWD)
+		inline HLSLComputePathtracer(const path& _localInputCWD, const path& _localOutputCWD, const path& _sharedInputCWD, const path& _sharedOutputCWD)
 			: IApplicationFramework(_localInputCWD, _localOutputCWD, _sharedInputCWD, _sharedOutputCWD) {}
 
 		inline bool isComputeOnly() const override { return false; }
@@ -1349,4 +1349,4 @@ class ComputeShaderPathtracer final : public examples::SimpleWindowedApplication
 		IGPUCommandBuffer::SClearColorValue clearColor = { .float32 = {0.f,0.f,0.f,1.f} };
 };
 
-NBL_MAIN_FUNC(ComputeShaderPathtracer)
+NBL_MAIN_FUNC(HLSLComputePathtracer)

From 8eaa71463bfc1cf1cda0002e67f0f67e1d3a4ba5 Mon Sep 17 00:00:00 2001
From: keptsecret <sorchon@gmail.com>
Date: Fri, 7 Mar 2025 16:58:44 +0700
Subject: [PATCH 068/529] fix intersector, no use intersectdata

---
 .../app_resources/hlsl/common.hlsl            |  82 +++++-----
 .../app_resources/hlsl/intersector.hlsl       | 105 ++++++++++---
 .../app_resources/hlsl/pathtracer.hlsl        |  18 ++-
 .../app_resources/hlsl/render.comp.hlsl       |   2 +-
 .../app_resources/hlsl/scene.hlsl             | 140 +++++++++---------
 31_HLSLPathTracer/main.cpp                    |   2 +-
 6 files changed, 207 insertions(+), 142 deletions(-)

diff --git a/31_HLSLPathTracer/app_resources/hlsl/common.hlsl b/31_HLSLPathTracer/app_resources/hlsl/common.hlsl
index 244a92107..0fd595bca 100644
--- a/31_HLSLPathTracer/app_resources/hlsl/common.hlsl
+++ b/31_HLSLPathTracer/app_resources/hlsl/common.hlsl
@@ -203,11 +203,11 @@ struct Shape;
 template<>
 struct Shape<PST_SPHERE>
 {
-    static Shape<PST_SPHERE> create(NBL_CONST_REF_ARG(float32_t3) position, float32_t radius, uint32_t bsdfLightIDs)
+    static Shape<PST_SPHERE> create(NBL_CONST_REF_ARG(float32_t3) position, float32_t radius2, uint32_t bsdfLightIDs)
     {
         Shape<PST_SPHERE> retval;
         retval.position = position;
-        retval.radius2 = radius * radius;
+        retval.radius2 = radius2;
         retval.bsdfLightIDs = bsdfLightIDs;
         return retval;
     }
@@ -215,20 +215,20 @@ struct Shape<PST_SPHERE>
     static Shape<PST_SPHERE> create(NBL_CONST_REF_ARG(float32_t3) position, float32_t radius, uint32_t bsdfID, uint32_t lightID)
     {
         uint32_t bsdfLightIDs = glsl::bitfieldInsert<uint32_t>(bsdfID, lightID, 16, 16);
-        return create(position, radius, bsdfLightIDs);
+        return create(position, radius * radius, bsdfLightIDs);
     }
 
     // return intersection distance if found, nan otherwise
     float intersect(NBL_CONST_REF_ARG(float32_t3) origin, NBL_CONST_REF_ARG(float32_t3) direction)
     {
         float32_t3 relOrigin = origin - position;
-        float relOriginLen2 = nbl::hlsl::dot(relOrigin, relOrigin);
+        float relOriginLen2 = hlsl::dot<float32_t3>(relOrigin, relOrigin);
 
-        float dirDotRelOrigin = nbl::hlsl::dot(direction, relOrigin);
+        float dirDotRelOrigin = hlsl::dot<float32_t3>(direction, relOrigin);
         float det = radius2 - relOriginLen2 + dirDotRelOrigin * dirDotRelOrigin;
 
         // do some speculative math here
-        float detsqrt = nbl::hlsl::sqrt(det);
+        float detsqrt = hlsl::sqrt<float32_t>(det);
         return -dirDotRelOrigin + (relOriginLen2 > radius2 ? (-detsqrt) : detsqrt);
     }
 
@@ -241,7 +241,7 @@ struct Shape<PST_SPHERE>
     float getSolidAngle(NBL_CONST_REF_ARG(float32_t3) origin)
     {
         float32_t3 dist = position - origin;
-        float cosThetaMax = nbl::hlsl::sqrt(1.0 - radius2 / nbl::hlsl::dot(dist, dist));
+        float cosThetaMax = hlsl::sqrt<float32_t>(1.0 - radius2 / hlsl::dot<float32_t3>(dist, dist));
         return 2.0 * numbers::pi<float> * (1.0 - cosThetaMax);
     }
 
@@ -255,28 +255,28 @@ struct Shape<PST_SPHERE>
     float32_t3 generate_and_pdf(NBL_REF_ARG(float32_t) pdf, NBL_REF_ARG(float32_t) newRayMaxT, NBL_CONST_REF_ARG(float32_t3) origin, NBL_CONST_REF_ARG(Aniso) interaction, bool isBSDF, float32_t3 xi)
     {
         float32_t3 Z = position - origin;
-        const float distanceSQ = nbl::hlsl::dot(Z,Z);
+        const float distanceSQ = hlsl::dot<float32_t3>(Z,Z);
         const float cosThetaMax2 = 1.0 - radius2 / distanceSQ;
         if (cosThetaMax2 > 0.0)
         {
-            const float rcpDistance = 1.0 / nbl::hlsl::sqrt(distanceSQ);
+            const float rcpDistance = 1.0 / hlsl::sqrt<float32_t>(distanceSQ);
             Z *= rcpDistance;
 
-            const float cosThetaMax = nbl::hlsl::sqrt(cosThetaMax2);
+            const float cosThetaMax = hlsl::sqrt<float32_t>(cosThetaMax2);
             const float cosTheta = nbl::hlsl::mix<float>(1.0, cosThetaMax, xi.x);
 
             float32_t3 L = Z * cosTheta;
 
             const float cosTheta2 = cosTheta * cosTheta;
-            const float sinTheta = nbl::hlsl::sqrt(1.0 - cosTheta2);
+            const float sinTheta = hlsl::sqrt<float32_t>(1.0 - cosTheta2);
             float sinPhi, cosPhi;
-            math::sincos(2.0 * numbers::pi<float> * xi.y - numbers::pi<float>, sinPhi, cosPhi);
+            math::sincos<float>(2.0 * numbers::pi<float> * xi.y - numbers::pi<float>, sinPhi, cosPhi);
             float32_t3 X, Y;
             math::frisvad<float32_t3>(Z, X, Y);
 
             L += (X * cosPhi + Y * sinPhi) * sinTheta;
 
-            newRayMaxT = (cosTheta - nbl::hlsl::sqrt(cosTheta2 - cosThetaMax2)) / rcpDistance;
+            newRayMaxT = (cosTheta - hlsl::sqrt<float32_t>(cosTheta2 - cosThetaMax2)) / rcpDistance;
             pdf = 1.0 / (2.0 * numbers::pi<float> * (1.0 - cosThetaMax));
             return L;
         }
@@ -315,26 +315,26 @@ struct Shape<PST_TRIANGLE>
     {
         const float32_t3 edges[2] = { vertex1 - vertex0, vertex2 - vertex0 };
 
-        const float32_t3 h = nbl::hlsl::cross(direction, edges[1]);
-        const float a = nbl::hlsl::dot(edges[0], h);
+        const float32_t3 h = hlsl::cross<float32_t3>(direction, edges[1]);
+        const float a = hlsl::dot<float32_t3>(edges[0], h);
 
         const float32_t3 relOrigin = origin - vertex0;
 
-        const float u = nbl::hlsl::dot(relOrigin, h) / a;
+        const float u = hlsl::dot<float32_t3>(relOrigin, h) / a;
 
-        const float32_t3 q = nbl::hlsl::cross(relOrigin, edges[0]);
-        const float v = nbl::hlsl::dot(direction, q) / a;
+        const float32_t3 q = hlsl::cross<float32_t3>(relOrigin, edges[0]);
+        const float v = hlsl::dot<float32_t3>(direction, q) / a;
 
-        const float t = nbl::hlsl::dot(edges[1], q) / a;
+        const float t = hlsl::dot<float32_t3>(edges[1], q) / a;
 
         const bool intersection = t > 0.f && u >= 0.f && v >= 0.f && (u + v) <= 1.f;
-        return intersection ? t : numeric_limits<float>::infinity;
+        return intersection ? t : bit_cast<float, uint32_t>(numeric_limits<float>::infinity);
     }
 
     float32_t3 getNormalTimesArea()
     {
         const float32_t3 edges[2] = { vertex1 - vertex0, vertex2 - vertex0 };
-        return nbl::hlsl::cross(edges[0], edges[1]) * 0.5f;
+        return hlsl::cross<float32_t3>(edges[0], edges[1]) * 0.5f;
     }
 
     template<typename Ray>
@@ -347,7 +347,7 @@ struct Shape<PST_TRIANGLE>
             {
                 const float dist = ray.intersectionT;
                 const float32_t3 L = ray.direction;
-                return dist * dist / nbl::hlsl::abs(nbl::hlsl::dot(getNormalTimesArea(), L));
+                return dist * dist / hlsl::abs<float32_t>(hlsl::dot<float32_t3>(getNormalTimesArea(), L));
             }
             break;
             case PPM_SOLID_ANGLE:
@@ -381,15 +381,15 @@ struct Shape<PST_TRIANGLE>
             {
                 const float32_t3 edge0 = vertex1 - vertex0;
                 const float32_t3 edge1 = vertex2 - vertex0;
-                const float sqrtU = nbl::hlsl::sqrt(xi.x);
+                const float sqrtU = hlsl::sqrt<float32_t>(xi.x);
                 float32_t3 pnt = vertex0 + edge0 * (1.0 - sqrtU) + edge1 * sqrtU * xi.y;
                 float32_t3 L = pnt - origin;
 
-                const float distanceSq = nbl::hlsl::dot(L,L);
-                const float rcpDistance = 1.0 / nbl::hlsl::sqrt(distanceSq);
+                const float distanceSq = hlsl::dot<float32_t3>(L,L);
+                const float rcpDistance = 1.0 / hlsl::sqrt<float32_t>(distanceSq);
                 L *= rcpDistance;
 
-                pdf = distanceSq / nbl::hlsl::abs(nbl::hlsl::dot(nbl::hlsl::cross(edge0, edge1) * 0.5f, L));
+                pdf = distanceSq / hlsl::abs<float32_t>(hlsl::dot<float32_t3>(hlsl::cross<float32_t3>(edge0, edge1) * 0.5f, L));
                 newRayMaxT = 1.0 / rcpDistance;
                 return L;
             }
@@ -406,7 +406,7 @@ struct Shape<PST_TRIANGLE>
                 pdf = rcpPdf > numeric_limits<float>::min ? (1.0 / rcpPdf) : 0.0;
 
                 const float32_t3 N = getNormalTimesArea();
-                newRayMaxT = nbl::hlsl::dot(N, vertex0 - origin) / nbl::hlsl::dot(N, L);
+                newRayMaxT = hlsl::dot<float32_t3>(N, vertex0 - origin) / hlsl::dot<float32_t3>(N, L);
                 return L;
             }
             break;
@@ -422,7 +422,7 @@ struct Shape<PST_TRIANGLE>
                 pdf = rcpPdf > numeric_limits<float>::min ? (1.0 / rcpPdf) : 0.0;
 
                 const float32_t3 N = getNormalTimesArea();
-                newRayMaxT = nbl::hlsl::dot(N, vertex0 - origin) / nbl::hlsl::dot(N, L);
+                newRayMaxT = hlsl::dot<float32_t3>(N, vertex0 - origin) / hlsl::dot<float32_t3>(N, L);
                 return L;
             }
             break;
@@ -462,25 +462,25 @@ struct Shape<PST_RECTANGLE>
 
     float intersect(NBL_CONST_REF_ARG(float32_t3) origin, NBL_CONST_REF_ARG(float32_t3) direction)
     {
-        const float32_t3 h = nbl::hlsl::cross(direction, edge1);
-        const float a = nbl::hlsl::dot(edge0, h);
+        const float32_t3 h = hlsl::cross<float32_t3>(direction, edge1);
+        const float a = hlsl::dot<float32_t3>(edge0, h);
 
         const float32_t3 relOrigin = origin - offset;
 
-        const float u = nbl::hlsl::dot(relOrigin,h)/a;
+        const float u = hlsl::dot<float32_t3>(relOrigin,h)/a;
 
-        const float32_t3 q = nbl::hlsl::cross(relOrigin, edge0);
-        const float v = nbl::hlsl::dot(direction, q) / a;
+        const float32_t3 q = hlsl::cross<float32_t3>(relOrigin, edge0);
+        const float v = hlsl::dot<float32_t3>(direction, q) / a;
 
-        const float t = nbl::hlsl::dot(edge1, q) / a;
+        const float t = hlsl::dot<float32_t3>(edge1, q) / a;
 
         const bool intersection = t > 0.f && u >= 0.f && v >= 0.f && u <= 1.f && v <= 1.f;
-        return intersection ? t : numeric_limits<float>::infinity;
+        return intersection ? t : bit_cast<float, uint32_t>(numeric_limits<float>::infinity);
     }
 
     float32_t3 getNormalTimesArea()
     {
-        return nbl::hlsl::cross(edge0, edge1);
+        return hlsl::cross<float32_t3>(edge0, edge1);
     }
 
     void getNormalBasis(NBL_REF_ARG(float32_t3x3) basis, NBL_REF_ARG(float32_t2) extents)
@@ -502,7 +502,7 @@ struct Shape<PST_RECTANGLE>
             {
                 const float dist = ray.intersectionT;
                 const float32_t3 L = ray.direction;
-                return dist * dist / nbl::hlsl::abs(nbl::hlsl::dot(getNormalTimesArea(), L));
+                return dist * dist / hlsl::abs<float32_t>(hlsl::dot<float32_t3>(getNormalTimesArea(), L));
             }
             break;
             // #ifdef TRIANGLE_REFERENCE ?
@@ -542,10 +542,10 @@ struct Shape<PST_RECTANGLE>
             case PPM_AREA:
             {
                 float32_t3 L = origin2origin + edge0 * xi.x + edge1 * xi.y;
-                const float distSq = nbl::hlsl::dot(L, L);
-                const float rcpDist = 1.0 / nbl::hlsl::sqrt(distSq);
+                const float distSq = hlsl::dot<float32_t3>(L, L);
+                const float rcpDist = 1.0 / hlsl::sqrt<float32_t>(distSq);
                 L *= rcpDist;
-                pdf = distSq / nbl::hlsl::abs(nbl::hlsl::dot(N, L));
+                pdf = distSq / hlsl::abs<float32_t>(hlsl::dot<float32_t3>(N, L));
                 newRayMaxT = 1.0 / rcpDist;
                 return L;
             }
@@ -572,7 +572,7 @@ struct Shape<PST_RECTANGLE>
                 else
                     pdf = numeric_limits<float>::infinity;
 
-                newRayMaxT = nbl::hlsl::dot(N, origin2origin) / nbl::hlsl::dot(N, L);
+                newRayMaxT = hlsl::dot<float32_t3>(N, origin2origin) / hlsl::dot<float32_t3>(N, L);
                 return L;
             }
             break;
diff --git a/31_HLSLPathTracer/app_resources/hlsl/intersector.hlsl b/31_HLSLPathTracer/app_resources/hlsl/intersector.hlsl
index 525af5525..68ea75dd3 100644
--- a/31_HLSLPathTracer/app_resources/hlsl/intersector.hlsl
+++ b/31_HLSLPathTracer/app_resources/hlsl/intersector.hlsl
@@ -25,6 +25,62 @@ struct Comprehensive
     using bxdfnode_type = BxdfNode;
     using scene_type = Scene<light_type, bxdfnode_type>;
 
+    static ObjectID traceRay(NBL_REF_ARG(ray_type) ray, NBL_CONST_REF_ARG(scene_type) scene)
+    {
+        ObjectID objectID;
+        objectID.id = -1;
+
+        // prodedural shapes
+        for (int i = 0; i < scene.sphereCount; i++)
+        {
+            float t = scene.spheres[i].intersect(ray.origin, ray.direction);
+
+            bool closerIntersection = t > 0.0 && t < ray.intersectionT;
+
+            if (closerIntersection)
+            {
+                ray.intersectionT = t;
+                objectID.id = i;
+                objectID.mode = IntersectData::Mode::PROCEDURAL;
+                objectID.shapeType = PST_SPHERE;
+            }
+        }
+        for (int i = 0; i < scene.triangleCount; i++)
+        {
+            float t = scene.triangles[i].intersect(ray.origin, ray.direction);
+
+            bool closerIntersection = t > 0.0 && t < ray.intersectionT;
+
+            if (closerIntersection)
+            {
+                ray.intersectionT = t;
+                objectID.id = i;
+                objectID.mode = IntersectData::Mode::PROCEDURAL;
+                objectID.shapeType = PST_TRIANGLE;
+            }
+        }
+        for (int i = 0; i < scene.rectangleCount; i++)
+        {
+            float t = scene.rectangles[i].intersect(ray.origin, ray.direction);
+
+            bool closerIntersection = t > 0.0 && t < ray.intersectionT;
+
+            if (closerIntersection)
+            {
+                ray.intersectionT = t;
+                objectID.id = i;
+                objectID.mode = IntersectData::Mode::PROCEDURAL;
+                objectID.shapeType = PST_TRIANGLE;
+            }
+        }
+
+        // TODO: trace AS
+
+        return objectID;
+    }
+
+    // note for future consideration: still need to encode to IntersectData?
+    // obsolete?
     static ObjectID traceProcedural(NBL_REF_ARG(ray_type) ray, NBL_CONST_REF_ARG(IntersectData) intersect)
     {
         const bool anyHit = ray.intersectionT != numeric_limits<scalar_type>::max;
@@ -81,6 +137,7 @@ struct Comprehensive
         return objectID;
     }
 
+    // obsolete?
     static ObjectID traceRay(NBL_REF_ARG(ray_type) ray, NBL_CONST_REF_ARG(IntersectData) intersect)
     {
         const IntersectData::Mode mode = (IntersectData::Mode)intersect.mode;
@@ -109,36 +166,36 @@ struct Comprehensive
         return ObjectID::create(-1, 0, PST_SPHERE);
     }
 
-    static ObjectID traceRay(NBL_REF_ARG(ray_type) ray, NBL_CONST_REF_ARG(scene_type) scene)
-    {
-        IntersectData data;
+    // static ObjectID traceRay(NBL_REF_ARG(ray_type) ray, NBL_CONST_REF_ARG(scene_type) scene)
+    // {
+    //     IntersectData data;
 
-        ObjectID objectID;
-        objectID.id = -1;  // start with no intersect
+    //     ObjectID objectID;
+    //     objectID.id = -1;  // start with no intersect
 
-        // prodedural shapes
-        if (scene.sphereCount > 0)
-        {
-            data = scene.toIntersectData(ext::Intersector::IntersectData::Mode::PROCEDURAL, PST_SPHERE);
-            objectID = traceRay(ray, data);
-        }
+    //     // prodedural shapes
+    //     if (scene.sphereCount > 0)
+    //     {
+    //         data = scene.toIntersectData(ext::Intersector::IntersectData::Mode::PROCEDURAL, PST_SPHERE);
+    //         objectID = traceRay(ray, data);
+    //     }
 
-        if (scene.triangleCount > 0)
-        {
-            data = scene.toIntersectData(ext::Intersector::IntersectData::Mode::PROCEDURAL, PST_TRIANGLE);
-            objectID = traceRay(ray, data);
-        }
+    //     if (scene.triangleCount > 0)
+    //     {
+    //         data = scene.toIntersectData(ext::Intersector::IntersectData::Mode::PROCEDURAL, PST_TRIANGLE);
+    //         objectID = traceRay(ray, data);
+    //     }
 
-        if (scene.rectangleCount > 0)
-        {
-            data = scene.toIntersectData(ext::Intersector::IntersectData::Mode::PROCEDURAL, PST_RECTANGLE);
-            objectID = traceRay(ray, data);
-        }
+    //     if (scene.rectangleCount > 0)
+    //     {
+    //         data = scene.toIntersectData(ext::Intersector::IntersectData::Mode::PROCEDURAL, PST_RECTANGLE);
+    //         objectID = traceRay(ray, data);
+    //     }
 
-        // TODO: trace AS
+    //     // TODO: trace AS
 
-        return objectID;
-    }
+    //     return objectID;
+    // }
 };
 
 // does everything in traceray in ex 30
diff --git a/31_HLSLPathTracer/app_resources/hlsl/pathtracer.hlsl b/31_HLSLPathTracer/app_resources/hlsl/pathtracer.hlsl
index ba683d443..6b49bc758 100644
--- a/31_HLSLPathTracer/app_resources/hlsl/pathtracer.hlsl
+++ b/31_HLSLPathTracer/app_resources/hlsl/pathtracer.hlsl
@@ -79,7 +79,7 @@ struct Unidirectional
     //                     NextEventEstimator nee)
     // {}
 
-    static this_t create(NBL_CONST_REF_ARG(PathTracerCreationParams<create_params_type, scalar_type>) params, Buffer sampleSequence)
+    static this_t create(NBL_CONST_REF_ARG(PathTracerCreationParams<create_params_type, scalar_type>) params, Buffer<uint3> sampleSequence)
     {
         this_t retval;
         retval.randGen = randgen_type::create(params.rngState);
@@ -341,15 +341,21 @@ struct Unidirectional
             // bounces
             bool hit = true;
             bool rayAlive = true;
-            for (int d = 1; d <= depth && hit && rayAlive; d += 2)
-            {
+            // TODO for (int d = 1; d <= depth && hit && rayAlive; d += 2)
+            // TODO {
                 ray.intersectionT = numeric_limits<scalar_type>::max;
+                ray.objectID.id = -1;
                 ray.objectID = intersector_type::traceRay(ray, scene);
 
                 hit = ray.objectID.id != -1;
                 if (hit)
-                    rayAlive = closestHitProgram(d, i, ray, scene);
-            }
+                {
+                    float pp = float(ray.objectID.id) / 10.0;
+                    ray.payload.accumulation = measure_type(pp, 1.0-pp, 0.3);
+                    // TODO rayAlive = closestHitProgram(1, i, ray, scene);
+                }
+
+            // TODO }
             if (!hit)
                 missProgram(ray);
 
@@ -373,7 +379,7 @@ struct Unidirectional
     material_system_type materialSystem;
     nee_type nee;
 
-    Buffer sampleSequence;
+    Buffer<uint3> sampleSequence;
 };
 
 }
diff --git a/31_HLSLPathTracer/app_resources/hlsl/render.comp.hlsl b/31_HLSLPathTracer/app_resources/hlsl/render.comp.hlsl
index 360d085a6..f23d042a8 100644
--- a/31_HLSLPathTracer/app_resources/hlsl/render.comp.hlsl
+++ b/31_HLSLPathTracer/app_resources/hlsl/render.comp.hlsl
@@ -38,7 +38,7 @@ struct SPushConstants
 [[vk::combinedImageSampler]][[vk::binding(0, 2)]] Texture2D<float3> envMap;      // unused
 [[vk::combinedImageSampler]][[vk::binding(0, 2)]] SamplerState envSampler;
 
-[[vk::binding(1, 2)]] Buffer sampleSequence;
+[[vk::binding(1, 2)]] Buffer<uint3> sampleSequence;
 
 [[vk::combinedImageSampler]][[vk::binding(2, 2)]] Texture2D<uint2> scramblebuf; // unused
 [[vk::combinedImageSampler]][[vk::binding(2, 2)]] SamplerState scrambleSampler;
diff --git a/31_HLSLPathTracer/app_resources/hlsl/scene.hlsl b/31_HLSLPathTracer/app_resources/hlsl/scene.hlsl
index 79b66dbfb..1c17e2531 100644
--- a/31_HLSLPathTracer/app_resources/hlsl/scene.hlsl
+++ b/31_HLSLPathTracer/app_resources/hlsl/scene.hlsl
@@ -40,75 +40,77 @@ struct Scene
 
     // AS ases;
 
-    Intersector::IntersectData toIntersectData(uint32_t mode, ProceduralShapeType type)
-    {
-        Intersector::IntersectData retval;
-        retval.mode = mode;
-
-        uint32_t objCount = (type == PST_SPHERE) ? sphereCount :
-                            (type == PST_TRIANGLE) ? triangleCount :
-                            (type == PST_RECTANGLE) ? rectangleCount :
-                            -1;
-        retval.data[0] = objCount;
-        retval.data[1] = type;
-
-        switch (type)
-        {
-            case PST_SPHERE:
-            {
-                for (int i = 0; i < objCount; i++)
-                {
-                    Shape<PST_SPHERE> sphere = spheres[i];
-                    retval.data[2 + i * Shape<PST_SPHERE>::ObjSize] = asuint(sphere.position.x);
-                    retval.data[2 + i * Shape<PST_SPHERE>::ObjSize + 1] = asuint(sphere.position.y);
-                    retval.data[2 + i * Shape<PST_SPHERE>::ObjSize + 2] = asuint(sphere.position.z);
-                    retval.data[2 + i * Shape<PST_SPHERE>::ObjSize + 3] = asuint(sphere.radius2);
-                    retval.data[2 + i * Shape<PST_SPHERE>::ObjSize + 4] = sphere.bsdfLightIDs;
-                }
-            }
-            break;
-            case PST_TRIANGLE:
-            {
-                for (int i = 0; i < objCount; i++)
-                {
-                    Shape<PST_TRIANGLE> tri = triangles[i];
-                    retval.data[2 + i * Shape<PST_TRIANGLE>::ObjSize] = asuint(tri.vertex0.x);
-                    retval.data[2 + i * Shape<PST_TRIANGLE>::ObjSize + 1] = asuint(tri.vertex0.y);
-                    retval.data[2 + i * Shape<PST_TRIANGLE>::ObjSize + 2] = asuint(tri.vertex0.z);
-                    retval.data[2 + i * Shape<PST_TRIANGLE>::ObjSize + 3] = asuint(tri.vertex1.x);
-                    retval.data[2 + i * Shape<PST_TRIANGLE>::ObjSize + 4] = asuint(tri.vertex1.y);
-                    retval.data[2 + i * Shape<PST_TRIANGLE>::ObjSize + 5] = asuint(tri.vertex1.z);
-                    retval.data[2 + i * Shape<PST_TRIANGLE>::ObjSize + 6] = asuint(tri.vertex2.x);
-                    retval.data[2 + i * Shape<PST_TRIANGLE>::ObjSize + 7] = asuint(tri.vertex2.y);
-                    retval.data[2 + i * Shape<PST_TRIANGLE>::ObjSize + 8] = asuint(tri.vertex2.z);
-                    retval.data[2 + i * Shape<PST_TRIANGLE>::ObjSize + 9] = tri.bsdfLightIDs;
-                }
-            }
-            break;
-            case PST_RECTANGLE:
-            {
-                for (int i = 0; i < objCount; i++)
-                {
-                    Shape<PST_RECTANGLE> rect = rectangles[i];
-                    retval.data[2 + i * Shape<PST_RECTANGLE>::ObjSize] = asuint(rect.offset.x);
-                    retval.data[2 + i * Shape<PST_RECTANGLE>::ObjSize + 1] = asuint(rect.offset.y);
-                    retval.data[2 + i * Shape<PST_RECTANGLE>::ObjSize + 2] = asuint(rect.offset.z);
-                    retval.data[2 + i * Shape<PST_RECTANGLE>::ObjSize + 3] = asuint(rect.edge0.x);
-                    retval.data[2 + i * Shape<PST_RECTANGLE>::ObjSize + 4] = asuint(rect.edge0.y);
-                    retval.data[2 + i * Shape<PST_RECTANGLE>::ObjSize + 5] = asuint(rect.edge0.z);
-                    retval.data[2 + i * Shape<PST_RECTANGLE>::ObjSize + 6] = asuint(rect.edge1.x);
-                    retval.data[2 + i * Shape<PST_RECTANGLE>::ObjSize + 7] = asuint(rect.edge1.y);
-                    retval.data[2 + i * Shape<PST_RECTANGLE>::ObjSize + 8] = asuint(rect.edge1.z);
-                    retval.data[2 + i * Shape<PST_RECTANGLE>::ObjSize + 9] = rect.bsdfLightIDs;
-                }
-            }
-            break;
-            default:
-                // for ASes
-                break;
-        }
-        return retval;
-    }
+    // obsolete?
+    // Intersector::IntersectData toIntersectData(uint32_t mode, ProceduralShapeType type)
+    // {
+    //     Intersector::IntersectData retval;
+    //     retval.mode = mode;
+
+    //     uint32_t objCount = (type == PST_SPHERE) ? sphereCount :
+    //                         (type == PST_TRIANGLE) ? triangleCount :
+    //                         (type == PST_RECTANGLE) ? rectangleCount :
+    //                         -1;
+    //     retval.data[0] = objCount;
+    //     retval.data[1] = type;
+
+    //     switch (type)
+    //     {
+    //         case PST_SPHERE:
+    //         {
+    //             for (int i = 0; i < objCount; i++)
+    //             {
+    //                 Shape<PST_SPHERE> sphere = spheres[i];
+    //                 uint32_t3 uintPos = bit_cast<uint32_t3, float32_t3>(sphere.position);
+    //                 retval.data[2 + i * Shape<PST_SPHERE>::ObjSize] = uintPos.x;
+    //                 retval.data[2 + i * Shape<PST_SPHERE>::ObjSize + 1] = uintPos.y;
+    //                 retval.data[2 + i * Shape<PST_SPHERE>::ObjSize + 2] = uintPos.z;
+    //                 retval.data[2 + i * Shape<PST_SPHERE>::ObjSize + 3] = bit_cast<uint32_t, float32_t>(sphere.radius2);
+    //                 retval.data[2 + i * Shape<PST_SPHERE>::ObjSize + 4] = sphere.bsdfLightIDs;
+    //             }
+    //         }
+    //         break;
+    //         case PST_TRIANGLE:
+    //         {
+    //             for (int i = 0; i < objCount; i++)
+    //             {
+    //                 Shape<PST_TRIANGLE> tri = triangles[i];
+    //                 retval.data[2 + i * Shape<PST_TRIANGLE>::ObjSize] = asuint(tri.vertex0.x);
+    //                 retval.data[2 + i * Shape<PST_TRIANGLE>::ObjSize + 1] = asuint(tri.vertex0.y);
+    //                 retval.data[2 + i * Shape<PST_TRIANGLE>::ObjSize + 2] = asuint(tri.vertex0.z);
+    //                 retval.data[2 + i * Shape<PST_TRIANGLE>::ObjSize + 3] = asuint(tri.vertex1.x);
+    //                 retval.data[2 + i * Shape<PST_TRIANGLE>::ObjSize + 4] = asuint(tri.vertex1.y);
+    //                 retval.data[2 + i * Shape<PST_TRIANGLE>::ObjSize + 5] = asuint(tri.vertex1.z);
+    //                 retval.data[2 + i * Shape<PST_TRIANGLE>::ObjSize + 6] = asuint(tri.vertex2.x);
+    //                 retval.data[2 + i * Shape<PST_TRIANGLE>::ObjSize + 7] = asuint(tri.vertex2.y);
+    //                 retval.data[2 + i * Shape<PST_TRIANGLE>::ObjSize + 8] = asuint(tri.vertex2.z);
+    //                 retval.data[2 + i * Shape<PST_TRIANGLE>::ObjSize + 9] = tri.bsdfLightIDs;
+    //             }
+    //         }
+    //         break;
+    //         case PST_RECTANGLE:
+    //         {
+    //             for (int i = 0; i < objCount; i++)
+    //             {
+    //                 Shape<PST_RECTANGLE> rect = rectangles[i];
+    //                 retval.data[2 + i * Shape<PST_RECTANGLE>::ObjSize] = asuint(rect.offset.x);
+    //                 retval.data[2 + i * Shape<PST_RECTANGLE>::ObjSize + 1] = asuint(rect.offset.y);
+    //                 retval.data[2 + i * Shape<PST_RECTANGLE>::ObjSize + 2] = asuint(rect.offset.z);
+    //                 retval.data[2 + i * Shape<PST_RECTANGLE>::ObjSize + 3] = asuint(rect.edge0.x);
+    //                 retval.data[2 + i * Shape<PST_RECTANGLE>::ObjSize + 4] = asuint(rect.edge0.y);
+    //                 retval.data[2 + i * Shape<PST_RECTANGLE>::ObjSize + 5] = asuint(rect.edge0.z);
+    //                 retval.data[2 + i * Shape<PST_RECTANGLE>::ObjSize + 6] = asuint(rect.edge1.x);
+    //                 retval.data[2 + i * Shape<PST_RECTANGLE>::ObjSize + 7] = asuint(rect.edge1.y);
+    //                 retval.data[2 + i * Shape<PST_RECTANGLE>::ObjSize + 8] = asuint(rect.edge1.z);
+    //                 retval.data[2 + i * Shape<PST_RECTANGLE>::ObjSize + 9] = rect.bsdfLightIDs;
+    //             }
+    //         }
+    //         break;
+    //         default:
+    //             // for ASes
+    //             break;
+    //     }
+    //     return retval;
+    // }
 
     NextEventEstimator::Event toNextEvent(uint32_t lightID)
     {
diff --git a/31_HLSLPathTracer/main.cpp b/31_HLSLPathTracer/main.cpp
index 036fcdb79..8da32083e 100644
--- a/31_HLSLPathTracer/main.cpp
+++ b/31_HLSLPathTracer/main.cpp
@@ -1341,7 +1341,7 @@ class HLSLComputePathtracer final : public examples::SimpleWindowedApplication,
 		float camYAngle = 165.f / 180.f * 3.14159f;
 		float camXAngle = 32.f / 180.f * 3.14159f;
 		int PTPipline = E_LIGHT_GEOMETRY::ELG_SPHERE;
-		int renderMode = E_RENDER_MODE::ERM_GLSL;
+		int renderMode = E_RENDER_MODE::ERM_HLSL;
 		int spp = 32;
 		int depth = 3;
 

From 2b07a421b22636acfcb432cf4dd1aef3c2f02ae0 Mon Sep 17 00:00:00 2001
From: Przemek <minikers21@gmail.com>
Date: Sat, 8 Mar 2025 14:27:40 +0100
Subject: [PATCH 069/529] Barycentrics

---
 62_CAD/shaders/main_pipeline/common.hlsl          | 13 +++++++++++++
 62_CAD/shaders/main_pipeline/fragment_shader.hlsl |  8 ++++++++
 62_CAD/shaders/main_pipeline/vertex_shader.hlsl   |  1 +
 3 files changed, 22 insertions(+)

diff --git a/62_CAD/shaders/main_pipeline/common.hlsl b/62_CAD/shaders/main_pipeline/common.hlsl
index ca13db341..a0a903a4d 100644
--- a/62_CAD/shaders/main_pipeline/common.hlsl
+++ b/62_CAD/shaders/main_pipeline/common.hlsl
@@ -74,6 +74,11 @@ struct PSInput
     [[vk::location(3)]] nointerpolation float4 data4 : COLOR4;
     // Data segments that need interpolation, mostly for hatches
     [[vk::location(5)]] float2 interp_data5 : COLOR5;
+#ifdef FRAGMENT_SHADER_INPUT
+    [[vk::location(6)]] [[vk::ext_decorate(/*spv::DecoratePerVertexKHR*/5285)]] nointerpolation float3 vertexScreenSpacePos[3] : COLOR6;
+#else
+    [[vk::location(6)]] nointerpolation float3 vertexScreenSpacePos : COLOR6;
+#endif 
     // ArcLenCalculator<float>
 
     // Set functions used in vshader, get functions used in fshader
@@ -211,6 +216,14 @@ struct PSInput
     
     void setImageUV(float2 uv) { interp_data5.xy = uv; }
     void setImageTextureId(uint32_t textureId) { data2.x = asfloat(textureId); }
+
+    /* TRIANGLE MESH */
+
+#ifndef FRAGMENT_SHADER_INPUT // vertex shader
+    void setScreenSpaceVertexPos(float3 pos) { vertexScreenSpacePos = pos; }
+#else // fragment shader
+    float3 getScreenSpaceVertexPos(uint32_t vertexIndex) { return vertexScreenSpacePos[vertexIndex]; }
+#endif 
 };
 
 // Set 0 - Scene Data and Globals, buffer bindings don't change the buffers only get updated
diff --git a/62_CAD/shaders/main_pipeline/fragment_shader.hlsl b/62_CAD/shaders/main_pipeline/fragment_shader.hlsl
index 2f21a6a0f..ab5885d3d 100644
--- a/62_CAD/shaders/main_pipeline/fragment_shader.hlsl
+++ b/62_CAD/shaders/main_pipeline/fragment_shader.hlsl
@@ -1,3 +1,4 @@
+#define FRAGMENT_SHADER_INPUT
 #include "common.hlsl"
 #include <nbl/builtin/hlsl/shapes/beziers.hlsl>
 #include <nbl/builtin/hlsl/shapes/line.hlsl>
@@ -7,6 +8,7 @@
 #include <nbl/builtin/hlsl/spirv_intrinsics/fragment_shader_pixel_interlock.hlsl>
 #include <nbl/builtin/hlsl/jit/device_capabilities.hlsl>
 #include <nbl/builtin/hlsl/text_rendering/msdf.hlsl>
+#include <nbl/builtin/hlsl/spirv_intrinsics/fragment_shader_barycentric.hlsl>
 
 template<typename float_t>
 struct DefaultClipper
@@ -405,6 +407,12 @@ float32_t4 calculateFinalColor<true>(const uint2 fragCoord, const float localAlp
 [shader("pixel")]
 float4 fragMain(PSInput input) : SV_TARGET
 {
+    float3 v0 = input.getScreenSpaceVertexPos(0);
+    float3 v1 = input.getScreenSpaceVertexPos(1);
+    float3 v2 = input.getScreenSpaceVertexPos(2);
+
+    printf("v0 = { %f, %f, %f }\nv1 = { %f, %f, %f }\nv2 = { %f, %f, %f }", v0.x, v0.y, v0.z, v1.x, v1.y, v1.z, v2.x, v2.y, v2.z);
+
     return float4(1.0f, 0.0f, 0.0f, 1.0f);
 
     float localAlpha = 0.0f;
diff --git a/62_CAD/shaders/main_pipeline/vertex_shader.hlsl b/62_CAD/shaders/main_pipeline/vertex_shader.hlsl
index a798549d5..f7abd6285 100644
--- a/62_CAD/shaders/main_pipeline/vertex_shader.hlsl
+++ b/62_CAD/shaders/main_pipeline/vertex_shader.hlsl
@@ -121,6 +121,7 @@ PSInput main(uint vertexID : SV_VertexID)
     outV.position.xy = transformedPos;
     outV.position = transformFromSreenSpaceToNdc(outV.position.xy, globals.resolution);
     outV.setHeightAtMeshVertex(vtx.height);
+    outV.setScreenSpaceVertexPos(float3(transformedPos, 1));
 
     return outV;
 

From 4d3e04698b85f31910d8d28b82925fe2b641adae Mon Sep 17 00:00:00 2001
From: keptsecret <sorchon@gmail.com>
Date: Mon, 10 Mar 2025 14:55:22 +0700
Subject: [PATCH 070/529] removed intersectdata usage, fix emissive bug

---
 .../app_resources/hlsl/common.hlsl            |  76 ++++---
 .../app_resources/hlsl/intersector.hlsl       | 207 +++++++-----------
 .../app_resources/hlsl/material_system.hlsl   |   6 +-
 .../hlsl/next_event_estimator.hlsl            |  40 ++--
 .../app_resources/hlsl/pathtracer.hlsl        |  38 ++--
 .../app_resources/hlsl/render.comp.hlsl       |   8 +-
 .../app_resources/hlsl/scene.hlsl             |  51 +++--
 7 files changed, 214 insertions(+), 212 deletions(-)

diff --git a/31_HLSLPathTracer/app_resources/hlsl/common.hlsl b/31_HLSLPathTracer/app_resources/hlsl/common.hlsl
index 0fd595bca..f67716060 100644
--- a/31_HLSLPathTracer/app_resources/hlsl/common.hlsl
+++ b/31_HLSLPathTracer/app_resources/hlsl/common.hlsl
@@ -105,17 +105,33 @@ struct BxDFNode
 
     NBL_CONSTEXPR_STATIC_INLINE uint32_t INVALID_ID = 0xffffu;
 
+    // for diffuse bxdfs
+    static BxDFNode<Spectrum> create(uint32_t materialType, bool isAniso, NBL_CONST_REF_ARG(float32_t2) A, NBL_CONST_REF_ARG(spectral_type) albedo)
+    {
+        BxDFNode<Spectrum> retval;
+        retval.albedo = albedo;
+        retval.materialType = materialType;
+        retval.params.is_aniso = isAniso;
+        retval.params.A = hlsl::max<float32_t2>(A, 1e-4);
+        retval.params.ior0 = (spectral_type)1.0;
+        retval.params.ior1 = (spectral_type)1.0;
+        return retval;
+    }
+
+    // for conductor + dielectric
     static BxDFNode<Spectrum> create(uint32_t materialType, bool isAniso, NBL_CONST_REF_ARG(float32_t2) A, NBL_CONST_REF_ARG(spectral_type) ior0, NBL_CONST_REF_ARG(spectral_type) ior1)
     {
         BxDFNode<Spectrum> retval;
+        retval.albedo = (spectral_type)1.0;
         retval.materialType = materialType;
         retval.params.is_aniso = isAniso;
-        retval.params.A = A;
+        retval.params.A = hlsl::max<float32_t2>(A, 1e-4);
         retval.params.ior0 = ior0;
         retval.params.ior1 = ior1;
         return retval;
     }
 
+    spectral_type albedo;
     uint32_t materialType;
     params_type params;
 };
@@ -149,32 +165,39 @@ enum PTPolygonMethod : uint16_t
     PPM_APPROX_PROJECTED_SOLID_ANGLE
 };
 
-namespace Intersector
-{
-// ray query method
-// ray query struct holds AS info
-// pass in address to vertex/index buffers?
+// namespace Intersector
+// {
+// // ray query method
+// // ray query struct holds AS info
+// // pass in address to vertex/index buffers?
 
-// ray tracing pipeline method
+// // ray tracing pipeline method
 
-// procedural data store: [obj count] [intersect type] [obj1] [obj2] [...]
+// // procedural data store: [obj count] [intersect type] [obj1] [obj2] [...]
 
-struct IntersectData
-{
-    enum Mode : uint32_t    // enum class?
-    {
-        RAY_QUERY,
-        RAY_TRACING,
-        PROCEDURAL
-    };
+// struct IntersectData
+// {
+//     enum Mode : uint32_t    // enum class?
+//     {
+//         RAY_QUERY,
+//         RAY_TRACING,
+//         PROCEDURAL
+//     };
 
-    NBL_CONSTEXPR_STATIC_INLINE uint32_t DataSize = 128;
+//     NBL_CONSTEXPR_STATIC_INLINE uint32_t DataSize = 128;
 
-    uint32_t mode : 1;
-    uint32_t unused : 31;   // possible space for flags
-    uint32_t data[DataSize];
+//     uint32_t mode : 2;
+//     uint32_t unused : 30;   // possible space for flags
+//     uint32_t data[DataSize];
+// };
+// }
+
+enum IntersectMode : uint32_t
+{
+    IM_RAY_QUERY,
+    IM_RAY_TRACING,
+    IM_PROCEDURAL
 };
-}
 
 namespace NextEventEstimator
 {
@@ -182,17 +205,10 @@ namespace NextEventEstimator
 
 struct Event
 {
-    enum Mode : uint32_t    // enum class?
-    {
-        RAY_QUERY,
-        RAY_TRACING,
-        PROCEDURAL
-    };
-
     NBL_CONSTEXPR_STATIC_INLINE uint32_t DataSize = 16;
 
-    uint32_t mode : 1;
-    uint32_t unused : 31;   // possible space for flags
+    uint32_t mode : 2;
+    uint32_t unused : 30;   // possible space for flags
     uint32_t data[DataSize];
 };
 }
diff --git a/31_HLSLPathTracer/app_resources/hlsl/intersector.hlsl b/31_HLSLPathTracer/app_resources/hlsl/intersector.hlsl
index 68ea75dd3..03a45f866 100644
--- a/31_HLSLPathTracer/app_resources/hlsl/intersector.hlsl
+++ b/31_HLSLPathTracer/app_resources/hlsl/intersector.hlsl
@@ -41,7 +41,7 @@ struct Comprehensive
             {
                 ray.intersectionT = t;
                 objectID.id = i;
-                objectID.mode = IntersectData::Mode::PROCEDURAL;
+                objectID.mode = IM_PROCEDURAL;
                 objectID.shapeType = PST_SPHERE;
             }
         }
@@ -55,7 +55,7 @@ struct Comprehensive
             {
                 ray.intersectionT = t;
                 objectID.id = i;
-                objectID.mode = IntersectData::Mode::PROCEDURAL;
+                objectID.mode = IM_PROCEDURAL;
                 objectID.shapeType = PST_TRIANGLE;
             }
         }
@@ -69,7 +69,7 @@ struct Comprehensive
             {
                 ray.intersectionT = t;
                 objectID.id = i;
-                objectID.mode = IntersectData::Mode::PROCEDURAL;
+                objectID.mode = IM_PROCEDURAL;
                 objectID.shapeType = PST_TRIANGLE;
             }
         }
@@ -81,90 +81,90 @@ struct Comprehensive
 
     // note for future consideration: still need to encode to IntersectData?
     // obsolete?
-    static ObjectID traceProcedural(NBL_REF_ARG(ray_type) ray, NBL_CONST_REF_ARG(IntersectData) intersect)
-    {
-        const bool anyHit = ray.intersectionT != numeric_limits<scalar_type>::max;
-        const uint32_t objCount = intersect.data[0];
-        const ProceduralShapeType type = (ProceduralShapeType)intersect.data[1];
-
-        ObjectID objectID = ray.objectID;
-        objectID.mode = IntersectData::Mode::PROCEDURAL;
-        objectID.shapeType = type;
-        for (int i = 0; i < objCount; i++)
-        {
-            float t;
-            switch (type)
-            {
-                case PST_SPHERE:
-                {
-                    vector3_type position = vector3_type(asfloat(intersect.data[2 + i * Shape<PST_SPHERE>::ObjSize]), asfloat(intersect.data[2 + i * Shape<PST_SPHERE>::ObjSize + 1]), asfloat(intersect.data[2 + i * Shape<PST_SPHERE>::ObjSize + 2]));
-                    Shape<PST_SPHERE> sphere = Shape<PST_SPHERE>::create(position, asfloat(intersect.data[2 + i * Shape<PST_SPHERE>::ObjSize + 3]), intersect.data[2 + i * Shape<PST_SPHERE>::ObjSize + 4]);
-                    t = sphere.intersect(ray.origin, ray.direction);
-                }
-                break;
-                case PST_TRIANGLE:
-                {
-                    vector3_type vertex0 = vector3_type(asfloat(intersect.data[2 + i * Shape<PST_TRIANGLE>::ObjSize]), asfloat(intersect.data[2 + i * Shape<PST_SPHERE>::ObjSize + 1]), asfloat(intersect.data[2 + i * Shape<PST_TRIANGLE>::ObjSize + 2]));
-                    vector3_type vertex1 = vector3_type(asfloat(intersect.data[2 + i * Shape<PST_TRIANGLE>::ObjSize + 3]), asfloat(intersect.data[2 + i * Shape<PST_SPHERE>::ObjSize + 4]), asfloat(intersect.data[2 + i * Shape<PST_TRIANGLE>::ObjSize + 5]));
-                    vector3_type vertex2 = vector3_type(asfloat(intersect.data[2 + i * Shape<PST_TRIANGLE>::ObjSize + 6]), asfloat(intersect.data[2 + i * Shape<PST_SPHERE>::ObjSize + 7]), asfloat(intersect.data[2 + i * Shape<PST_TRIANGLE>::ObjSize + 8]));
-                    Shape<PST_TRIANGLE> tri = Shape<PST_TRIANGLE>::create(vertex0, vertex1, vertex2, intersect.data[2 + i * Shape<PST_TRIANGLE>::ObjSize + 9]);
-                    t = tri.intersect(ray.origin, ray.direction);
-                }
-                break;
-                case PST_RECTANGLE:
-                {
-                    vector3_type offset = vector3_type(asfloat(intersect.data[2 + i * Shape<PST_RECTANGLE>::ObjSize]), asfloat(intersect.data[2 + i * Shape<PST_RECTANGLE>::ObjSize + 1]), asfloat(intersect.data[2 + i * Shape<PST_RECTANGLE>::ObjSize + 2]));
-                    vector3_type edge0 = vector3_type(asfloat(intersect.data[2 + i * Shape<PST_RECTANGLE>::ObjSize + 3]), asfloat(intersect.data[2 + i * Shape<PST_RECTANGLE>::ObjSize + 4]), asfloat(intersect.data[2 + i * Shape<PST_RECTANGLE>::ObjSize + 5]));
-                    vector3_type edge1 = vector3_type(asfloat(intersect.data[2 + i * Shape<PST_RECTANGLE>::ObjSize + 6]), asfloat(intersect.data[2 + i * Shape<PST_RECTANGLE>::ObjSize + 7]), asfloat(intersect.data[2 + i * Shape<PST_RECTANGLE>::ObjSize + 8]));
-                    Shape<PST_RECTANGLE> rect = Shape<PST_RECTANGLE>::create(offset, edge0, edge1, intersect.data[2 + i * Shape<PST_RECTANGLE>::ObjSize + 9]);
-                    t = rect.intersect(ray.origin, ray.direction);
-                }
-                break;
-                default:
-                    t = numeric_limits<float>::infinity;
-                break;
-            }
-
-            bool closerIntersection = t > 0.0 && t < ray.intersectionT;
-
-            ray.intersectionT = closerIntersection ? t : ray.intersectionT;
-            objectID.id = closerIntersection ? i : objectID.id;
-
-            // allowing early out results in a performance regression, WTF!?
-            //if (anyHit && closerIntersection)
-            //break;
-        }
-        return objectID;
-    }
+    // static ObjectID traceProcedural(NBL_REF_ARG(ray_type) ray, NBL_CONST_REF_ARG(IntersectData) intersect)
+    // {
+    //     const bool anyHit = ray.intersectionT != numeric_limits<scalar_type>::max;
+    //     const uint32_t objCount = intersect.data[0];
+    //     const ProceduralShapeType type = (ProceduralShapeType)intersect.data[1];
+
+    //     ObjectID objectID = ray.objectID;
+    //     objectID.mode = IM_PROCEDURAL;
+    //     objectID.shapeType = type;
+    //     for (int i = 0; i < objCount; i++)
+    //     {
+    //         float t;
+    //         switch (type)
+    //         {
+    //             case PST_SPHERE:
+    //             {
+    //                 vector3_type position = vector3_type(asfloat(intersect.data[2 + i * Shape<PST_SPHERE>::ObjSize]), asfloat(intersect.data[2 + i * Shape<PST_SPHERE>::ObjSize + 1]), asfloat(intersect.data[2 + i * Shape<PST_SPHERE>::ObjSize + 2]));
+    //                 Shape<PST_SPHERE> sphere = Shape<PST_SPHERE>::create(position, asfloat(intersect.data[2 + i * Shape<PST_SPHERE>::ObjSize + 3]), intersect.data[2 + i * Shape<PST_SPHERE>::ObjSize + 4]);
+    //                 t = sphere.intersect(ray.origin, ray.direction);
+    //             }
+    //             break;
+    //             case PST_TRIANGLE:
+    //             {
+    //                 vector3_type vertex0 = vector3_type(asfloat(intersect.data[2 + i * Shape<PST_TRIANGLE>::ObjSize]), asfloat(intersect.data[2 + i * Shape<PST_SPHERE>::ObjSize + 1]), asfloat(intersect.data[2 + i * Shape<PST_TRIANGLE>::ObjSize + 2]));
+    //                 vector3_type vertex1 = vector3_type(asfloat(intersect.data[2 + i * Shape<PST_TRIANGLE>::ObjSize + 3]), asfloat(intersect.data[2 + i * Shape<PST_SPHERE>::ObjSize + 4]), asfloat(intersect.data[2 + i * Shape<PST_TRIANGLE>::ObjSize + 5]));
+    //                 vector3_type vertex2 = vector3_type(asfloat(intersect.data[2 + i * Shape<PST_TRIANGLE>::ObjSize + 6]), asfloat(intersect.data[2 + i * Shape<PST_SPHERE>::ObjSize + 7]), asfloat(intersect.data[2 + i * Shape<PST_TRIANGLE>::ObjSize + 8]));
+    //                 Shape<PST_TRIANGLE> tri = Shape<PST_TRIANGLE>::create(vertex0, vertex1, vertex2, intersect.data[2 + i * Shape<PST_TRIANGLE>::ObjSize + 9]);
+    //                 t = tri.intersect(ray.origin, ray.direction);
+    //             }
+    //             break;
+    //             case PST_RECTANGLE:
+    //             {
+    //                 vector3_type offset = vector3_type(asfloat(intersect.data[2 + i * Shape<PST_RECTANGLE>::ObjSize]), asfloat(intersect.data[2 + i * Shape<PST_RECTANGLE>::ObjSize + 1]), asfloat(intersect.data[2 + i * Shape<PST_RECTANGLE>::ObjSize + 2]));
+    //                 vector3_type edge0 = vector3_type(asfloat(intersect.data[2 + i * Shape<PST_RECTANGLE>::ObjSize + 3]), asfloat(intersect.data[2 + i * Shape<PST_RECTANGLE>::ObjSize + 4]), asfloat(intersect.data[2 + i * Shape<PST_RECTANGLE>::ObjSize + 5]));
+    //                 vector3_type edge1 = vector3_type(asfloat(intersect.data[2 + i * Shape<PST_RECTANGLE>::ObjSize + 6]), asfloat(intersect.data[2 + i * Shape<PST_RECTANGLE>::ObjSize + 7]), asfloat(intersect.data[2 + i * Shape<PST_RECTANGLE>::ObjSize + 8]));
+    //                 Shape<PST_RECTANGLE> rect = Shape<PST_RECTANGLE>::create(offset, edge0, edge1, intersect.data[2 + i * Shape<PST_RECTANGLE>::ObjSize + 9]);
+    //                 t = rect.intersect(ray.origin, ray.direction);
+    //             }
+    //             break;
+    //             default:
+    //                 t = numeric_limits<float>::infinity;
+    //             break;
+    //         }
+
+    //         bool closerIntersection = t > 0.0 && t < ray.intersectionT;
+
+    //         ray.intersectionT = closerIntersection ? t : ray.intersectionT;
+    //         objectID.id = closerIntersection ? i : objectID.id;
+
+    //         // allowing early out results in a performance regression, WTF!?
+    //         //if (anyHit && closerIntersection)
+    //         //break;
+    //     }
+    //     return objectID;
+    // }
 
     // obsolete?
-    static ObjectID traceRay(NBL_REF_ARG(ray_type) ray, NBL_CONST_REF_ARG(IntersectData) intersect)
-    {
-        const IntersectData::Mode mode = (IntersectData::Mode)intersect.mode;
-        switch (mode)
-        {
-            case IntersectData::Mode::RAY_QUERY:
-            {
-                // TODO: do ray query stuff
-            }
-            break;
-            case IntersectData::Mode::RAY_TRACING:
-            {
-                // TODO: do ray tracing stuff
-            }
-            break;
-            case IntersectData::Mode::PROCEDURAL:
-            {
-                return traceProcedural(ray, intersect);
-            }
-            break;
-            default:
-            {
-                return ObjectID::create(-1, 0, PST_SPHERE);
-            }
-        }
-        return ObjectID::create(-1, 0, PST_SPHERE);
-    }
+    // static ObjectID traceRay(NBL_REF_ARG(ray_type) ray, NBL_CONST_REF_ARG(IntersectData) intersect)
+    // {
+    //     const uint32_t mode = intersect.mode;
+    //     switch (mode)
+    //     {
+    //         case IM_RAY_QUERY:
+    //         {
+    //             // TODO: do ray query stuff
+    //         }
+    //         break;
+    //         case IM_RAY_TRACING:
+    //         {
+    //             // TODO: do ray tracing stuff
+    //         }
+    //         break;
+    //         case IM_PROCEDURAL:
+    //         {
+    //             return traceProcedural(ray, intersect);
+    //         }
+    //         break;
+    //         default:
+    //         {
+    //             return ObjectID::create(-1, 0, PST_SPHERE);
+    //         }
+    //     }
+    //     return ObjectID::create(-1, 0, PST_SPHERE);
+    // }
 
     // static ObjectID traceRay(NBL_REF_ARG(ray_type) ray, NBL_CONST_REF_ARG(scene_type) scene)
     // {
@@ -198,43 +198,6 @@ struct Comprehensive
     // }
 };
 
-// does everything in traceray in ex 30
-// template<class Ray>
-// struct Procedural
-// {
-//     using scalar_type = typename Ray::scalar_type;
-//     using ray_type = Ray;
-
-//     static int traceRay(NBL_REF_ARG(ray_type) ray, IIntersection objects[32], int objCount)
-//     {
-//         const bool anyHit = ray.intersectionT != numeric_limits<scalar_type>::max;
-
-//         int objectID = -1;
-//         for (int i = 0; i < objCount; i++)
-//         {
-//             float t;
-//             if (objects[i].type == PST_SPHERE)  // we don't know what type of intersection it is so cast, there has to be a better way to do this
-//             {
-//                 Shape<PST_SPHERE> sphere = (Shape<PST_SPHERE>)objects[i];
-//                 t = sphere.intersect(ray.origin, ray.direction);
-//             }
-//             // TODO: other types
-
-//             bool closerIntersection = t > 0.0 && t < ray.intersectionT;
-
-//             ray.intersectionT = closerIntersection ? t : ray.intersectionT;
-//             objectID = closerIntersection ? i : objectID;
-
-//             // allowing early out results in a performance regression, WTF!?
-//             //if (anyHit && closerIntersection)
-//             //break;
-//         }
-//         return objectID;
-//     }
-
-//     // TODO? traceray with vertex/index buffer
-// };
-
 }
 }
 }
diff --git a/31_HLSLPathTracer/app_resources/hlsl/material_system.hlsl b/31_HLSLPathTracer/app_resources/hlsl/material_system.hlsl
index 09236c85e..7fb153791 100644
--- a/31_HLSLPathTracer/app_resources/hlsl/material_system.hlsl
+++ b/31_HLSLPathTracer/app_resources/hlsl/material_system.hlsl
@@ -25,8 +25,8 @@ struct Material
 
     NBL_CONSTEXPR_STATIC_INLINE uint32_t DataSize = 32;
 
-    uint32_t type : 1;
-    uint32_t unused : 31;   // possible space for flags
+    uint32_t type : 2;
+    uint32_t unused : 30;   // possible space for flags
     uint32_t data[DataSize];
 };
 
@@ -66,7 +66,7 @@ struct System
             case Material::Type::DIFFUSE:
             {
                 diffuseBxDF.init(cparams);
-                return (measure_type)diffuseBxDF.eval(params);
+                return cparams.albedo * (measure_type)diffuseBxDF.eval(params);
             }
             break;
             case Material::Type::CONDUCTOR:
diff --git a/31_HLSLPathTracer/app_resources/hlsl/next_event_estimator.hlsl b/31_HLSLPathTracer/app_resources/hlsl/next_event_estimator.hlsl
index 38a5fae15..949db8456 100644
--- a/31_HLSLPathTracer/app_resources/hlsl/next_event_estimator.hlsl
+++ b/31_HLSLPathTracer/app_resources/hlsl/next_event_estimator.hlsl
@@ -35,16 +35,28 @@ struct Estimator
         {
             case PST_SPHERE:
             {
-                vector3_type position = vector3_type(asfloat(event.data[2]), asfloat(event.data[3]), asfloat(event.data[4]));
-                Shape<PST_SPHERE> sphere = Shape<PST_SPHERE>::create(position, asfloat(event.data[5]), event.data[6]);
+                const vector3_type position = vector3_type(
+                    bit_cast<float32_t>(event.data[2]),
+                    bit_cast<float32_t>(event.data[3]),
+                    bit_cast<float32_t>(event.data[4]));
+                Shape<PST_SPHERE> sphere = Shape<PST_SPHERE>::create(position, bit_cast<float32_t>(event.data[5]), event.data[6]);
                 pdf *= sphere.template deferredPdf<ray_type>(ray);
             }
             break;
             case PST_TRIANGLE:
             {
-                vector3_type vertex0 = vector3_type(asfloat(event.data[2]), asfloat(event.data[3]), asfloat(event.data[4]));
-                vector3_type vertex1 = vector3_type(asfloat(event.data[5]), asfloat(event.data[6]), asfloat(event.data[7]));
-                vector3_type vertex2 = vector3_type(asfloat(event.data[8]), asfloat(event.data[9]), asfloat(event.data[10]));
+                const vector3_type vertex0 = vector3_type(
+                    bit_cast<float32_t>(event.data[2]),
+                    bit_cast<float32_t>(event.data[3]),
+                    bit_cast<float32_t>(event.data[4]));
+                const vector3_type vertex1 = vector3_type(
+                    bit_cast<float32_t>(event.data[5]), 
+                    bit_cast<float32_t>(event.data[6]),
+                    bit_cast<float32_t>(event.data[7]));
+                const vector3_type vertex2 = vector3_type(
+                    bit_cast<float32_t>(event.data[8]),
+                    bit_cast<float32_t>(event.data[9]),
+                    bit_cast<float32_t>(event.data[10]));
                 Shape<PST_TRIANGLE> tri = Shape<PST_TRIANGLE>::create(vertex0, vertex1, vertex2, event.data[11]);
                 pdf *= tri.template deferredPdf<ray_type>(ray);
             }
@@ -59,7 +71,7 @@ struct Estimator
             }
             break;
             default:
-                pdf = numeric_limits<float>::infinity;
+                pdf = bit_cast<float>(numeric_limits<float>::infinity);
                 break;
         }
 
@@ -68,20 +80,20 @@ struct Estimator
 
     static spectral_type deferredEvalAndPdf(NBL_REF_ARG(scalar_type) pdf, NBL_CONST_REF_ARG(light_type) light, NBL_CONST_REF_ARG(ray_type) ray, NBL_CONST_REF_ARG(Event) event)
     {
-        const Event::Mode mode = (Event::Mode)event.mode;
+        const uint32_t mode = event.mode;
         switch (mode)
         {
-            case Event::Mode::RAY_QUERY:
+            case IM_RAY_QUERY:
             {
                 // TODO: do ray query stuff
             }
             break;
-            case Event::Mode::RAY_TRACING:
+            case IM_RAY_TRACING:
             {
                 // TODO: do ray tracing stuff
             }
             break;
-            case Event::Mode::PROCEDURAL:
+            case IM_PROCEDURAL:
             {
                 return proceduralDeferredEvalAndPdf(pdf, light, ray, event);
             }
@@ -156,21 +168,21 @@ struct Estimator
 
     static sample_type generate_and_quotient_and_pdf(NBL_REF_ARG(quotient_pdf_type) quotient_pdf, NBL_REF_ARG(scalar_type) newRayMaxT, NBL_CONST_REF_ARG(light_type) light, NBL_CONST_REF_ARG(vector3_type) origin, NBL_CONST_REF_ARG(interaction_type) interaction, bool isBSDF, NBL_CONST_REF_ARG(vector3_type) xi, uint32_t depth, NBL_CONST_REF_ARG(Event) event)
     {
-        const Event::Mode mode = (Event::Mode)event.mode;
+        const uint32_t mode = event.mode;
         sample_type L;
         switch (mode)
         {
-            case Event::Mode::RAY_QUERY:
+            case IM_RAY_QUERY:
             {
                 // TODO: do ray query stuff
             }
             break;
-            case Event::Mode::RAY_TRACING:
+            case IM_RAY_TRACING:
             {
                 // TODO: do ray tracing stuff
             }
             break;
-            case Event::Mode::PROCEDURAL:
+            case IM_PROCEDURAL:
             {
                 return procedural_generate_and_quotient_and_pdf(quotient_pdf, newRayMaxT, light, origin, interaction, isBSDF, xi, depth, event);
             }
diff --git a/31_HLSLPathTracer/app_resources/hlsl/pathtracer.hlsl b/31_HLSLPathTracer/app_resources/hlsl/pathtracer.hlsl
index 6b49bc758..df4792a9c 100644
--- a/31_HLSLPathTracer/app_resources/hlsl/pathtracer.hlsl
+++ b/31_HLSLPathTracer/app_resources/hlsl/pathtracer.hlsl
@@ -111,14 +111,14 @@ struct Unidirectional
         uint32_t bsdfLightIDs;
         anisotropic_type interaction;
         isotropic_type iso_interaction;
-        ext::Intersector::IntersectData::Mode mode = (ext::Intersector::IntersectData::Mode)objectID.mode;
+        uint32_t mode = objectID.mode;
         switch (mode)
         {
             // TODO
-            case ext::Intersector::IntersectData::Mode::RAY_QUERY:
-            case ext::Intersector::IntersectData::Mode::RAY_TRACING:
+            case IM_RAY_QUERY:
+            case IM_RAY_TRACING:
                 break;
-            case ext::Intersector::IntersectData::Mode::PROCEDURAL:
+            case IM_PROCEDURAL:
             {
                 bsdfLightIDs = scene.getBsdfLightIDs(objectID);
                 vector3_type N = scene.getNormal(objectID, intersection);
@@ -139,10 +139,12 @@ struct Unidirectional
         const uint32_t lightID = glsl::bitfieldExtract(bsdfLightIDs, 16, 16);
         if (lightID != light_type::INVALID_ID)
         {
-            float pdf;
-            ray.payload.accumulation += nee.deferredEvalAndPdf(pdf, scene.lights[lightID], ray, scene.toNextEvent(lightID)) * throughput / (1.0 + pdf * pdf * ray.payload.otherTechniqueHeuristic);
+            float _pdf;
+            ray.payload.accumulation += nee.deferredEvalAndPdf(_pdf, scene.lights[lightID], ray, scene.toNextEvent(lightID)) * throughput / (1.0 + _pdf * _pdf * ray.payload.otherTechniqueHeuristic);
         }
 
+        return false;   // emissive only
+
         const uint32_t bsdfID = glsl::bitfieldExtract(bsdfLightIDs, 0, 16);
         if (bsdfID == bxdfnode_type::INVALID_ID)
             return false;
@@ -209,7 +211,7 @@ struct Unidirectional
                             params = params_type::template create<sample_type, anisotropic_type, anisocache_type>(nee_sample, interaction, _cache, bxdf::BCM_MAX);
                         else
                         {
-                            isocache_type isocache = (isocache_type)_cache;
+                            isocache_type isocache = _cache.iso_cache;
                             params = params_type::template create<sample_type, isotropic_type, isocache_type>(nee_sample, iso_interaction, isocache, bxdf::BCM_MAX);
                         }
                     }
@@ -223,7 +225,7 @@ struct Unidirectional
                             params = params_type::template create<sample_type, anisotropic_type, anisocache_type>(nee_sample, interaction, _cache, bxdf::BCM_ABS);
                         else
                         {
-                            isocache_type isocache = (isocache_type)_cache;
+                            isocache_type isocache = _cache.iso_cache;
                             params = params_type::template create<sample_type, isotropic_type, isocache_type>(nee_sample, iso_interaction, isocache, bxdf::BCM_ABS);
                         }
                     }
@@ -232,10 +234,11 @@ struct Unidirectional
                     bsdf_quotient_pdf.quotient *= throughput;
                     neeContrib_pdf.quotient *= bsdf_quotient_pdf.quotient;
                     const scalar_type otherGenOverChoice = bsdf_quotient_pdf.pdf * rcpChoiceProb;
-                    const scalar_type otherGenOverLightAndChoice = otherGenOverChoice / bsdf_quotient_pdf.pdf;
-                    neeContrib_pdf.quotient *= otherGenOverChoice / (1.f + otherGenOverLightAndChoice * otherGenOverLightAndChoice);   // balance heuristic
+                    // const scalar_type otherGenOverLightAndChoice = otherGenOverChoice / bsdf_quotient_pdf.pdf;
+                    // neeContrib_pdf.quotient *= otherGenOverChoice / (1.f + otherGenOverLightAndChoice * otherGenOverLightAndChoice);   // balance heuristic
 
                     // TODO: ifdef NEE only
+                    neeContrib_pdf.quotient *= otherGenOverChoice;
 
                     ray_type nee_ray;
                     nee_ray.origin = intersection + nee_sample.L.direction * t * Tolerance<scalar_type>::getStart(depth);
@@ -247,6 +250,8 @@ struct Unidirectional
             }
         }
 
+        return false;   // NEE only
+
         // sample BSDF
         scalar_type bxdfPdf;
         vector3_type bxdfSample;
@@ -341,21 +346,20 @@ struct Unidirectional
             // bounces
             bool hit = true;
             bool rayAlive = true;
-            // TODO for (int d = 1; d <= depth && hit && rayAlive; d += 2)
-            // TODO {
+            for (int d = 1; d <= depth && hit && rayAlive; d += 2)
+            {
                 ray.intersectionT = numeric_limits<scalar_type>::max;
-                ray.objectID.id = -1;
                 ray.objectID = intersector_type::traceRay(ray, scene);
 
                 hit = ray.objectID.id != -1;
                 if (hit)
                 {
-                    float pp = float(ray.objectID.id) / 10.0;
-                    ray.payload.accumulation = measure_type(pp, 1.0-pp, 0.3);
-                    // TODO rayAlive = closestHitProgram(1, i, ray, scene);
+                    // float pp = float(ray.objectID.id) / 10.0;
+                    // ray.payload.accumulation = measure_type(pp, 1.0-pp, 0.3);
+                    rayAlive = closestHitProgram(1, i, ray, scene);
                 }
 
-            // TODO }
+            }
             if (!hit)
                 missProgram(ray);
 
diff --git a/31_HLSLPathTracer/app_resources/hlsl/render.comp.hlsl b/31_HLSLPathTracer/app_resources/hlsl/render.comp.hlsl
index f23d042a8..e25961b56 100644
--- a/31_HLSLPathTracer/app_resources/hlsl/render.comp.hlsl
+++ b/31_HLSLPathTracer/app_resources/hlsl/render.comp.hlsl
@@ -115,14 +115,14 @@ static const ext::Shape<ext::PST_RECTANGLE> rectangles[RECTANGLE_COUNT] = {
 
 #define LIGHT_COUNT 1
 static const light_type lights[LIGHT_COUNT] = {
-    light_type::create(spectral_t(30.0,25.0,15.0), ext::ObjectID::create(8u, ext::NextEventEstimator::Event::Mode::PROCEDURAL, LIGHT_TYPE))
+    light_type::create(spectral_t(30.0,25.0,15.0), ext::ObjectID::create(8u, ext::IntersectMode::IM_PROCEDURAL, LIGHT_TYPE))
 };
 
 #define BXDF_COUNT 7
 static const bxdfnode_type bxdfs[BXDF_COUNT] = {
-    bxdfnode_type::create(ext::MaterialSystem::Material::Type::DIFFUSE, false, float2(0,0), spectral_t(1,1,1), spectral_t(1.25,1.25,1.25)),
-    bxdfnode_type::create(ext::MaterialSystem::Material::Type::DIFFUSE, false, float2(0,0), spectral_t(1,1,1), spectral_t(1.25,2.5,2.5)),
-    bxdfnode_type::create(ext::MaterialSystem::Material::Type::DIFFUSE, false, float2(0,0), spectral_t(1,1,1), spectral_t(2.5,1.25,2.5)),
+    bxdfnode_type::create(ext::MaterialSystem::Material::Type::DIFFUSE, false, float2(0,0), spectral_t(0.8,0.8,0.8)),
+    bxdfnode_type::create(ext::MaterialSystem::Material::Type::DIFFUSE, false, float2(0,0), spectral_t(0.8,0.4,0.4)),
+    bxdfnode_type::create(ext::MaterialSystem::Material::Type::DIFFUSE, false, float2(0,0), spectral_t(0.4,0.8,0.4)),
     bxdfnode_type::create(ext::MaterialSystem::Material::Type::CONDUCTOR, false, float2(0,0), spectral_t(1,1,1), spectral_t(0.98,0.98,0.77)),
     bxdfnode_type::create(ext::MaterialSystem::Material::Type::CONDUCTOR, false, float2(0,0), spectral_t(1,1,1), spectral_t(0.98,0.77,0.98)),
     bxdfnode_type::create(ext::MaterialSystem::Material::Type::CONDUCTOR, false, float2(0.15,0.15), spectral_t(1,1,1), spectral_t(0.98,0.77,0.98)),
diff --git a/31_HLSLPathTracer/app_resources/hlsl/scene.hlsl b/31_HLSLPathTracer/app_resources/hlsl/scene.hlsl
index 1c17e2531..5b4178ec4 100644
--- a/31_HLSLPathTracer/app_resources/hlsl/scene.hlsl
+++ b/31_HLSLPathTracer/app_resources/hlsl/scene.hlsl
@@ -128,40 +128,47 @@ struct Scene
             case PST_SPHERE:
             {
                 Shape<PST_SPHERE> sphere = spheres[id];
-                retval.data[2] = asuint(sphere.position.x);
-                retval.data[3] = asuint(sphere.position.y);
-                retval.data[4] = asuint(sphere.position.z);
-                retval.data[5] = asuint(sphere.radius2);
+                uint32_t3 position = bit_cast<uint32_t3>(sphere.position);
+                retval.data[2] = position.x;
+                retval.data[3] = position.y;
+                retval.data[4] = position.z;
+                retval.data[5] = bit_cast<uint32_t>(sphere.radius2);
                 retval.data[6] = sphere.bsdfLightIDs;
             }
             break;
             case PST_TRIANGLE:
             {
                 Shape<PST_TRIANGLE> tri = triangles[id];
-                retval.data[2] = asuint(tri.vertex0.x);
-                retval.data[3] = asuint(tri.vertex0.y);
-                retval.data[4] = asuint(tri.vertex0.z);
-                retval.data[5] = asuint(tri.vertex1.x);
-                retval.data[6] = asuint(tri.vertex1.y);
-                retval.data[7] = asuint(tri.vertex1.z);
-                retval.data[8] = asuint(tri.vertex2.x);
-                retval.data[9] = asuint(tri.vertex2.y);
-                retval.data[10] = asuint(tri.vertex2.z);
+                uint32_t3 vertex = bit_cast<uint32_t3>(tri.vertex0);
+                retval.data[2] = vertex.x;
+                retval.data[3] = vertex.y;
+                retval.data[4] = vertex.z;
+                vertex = bit_cast<uint32_t3>(tri.vertex1);
+                retval.data[5] = vertex.x;
+                retval.data[6] = vertex.y;
+                retval.data[7] = vertex.z;
+                vertex = bit_cast<uint32_t3>(tri.vertex2);
+                retval.data[8] = vertex.x;
+                retval.data[9] = vertex.y;
+                retval.data[10] = vertex.z;
                 retval.data[11] = tri.bsdfLightIDs;
             }
             break;
             case PST_RECTANGLE:
             {
                 Shape<PST_RECTANGLE> rect = rectangles[id];
-                retval.data[2] = asuint(rect.offset.x);
-                retval.data[3] = asuint(rect.offset.y);
-                retval.data[4] = asuint(rect.offset.z);
-                retval.data[5] = asuint(rect.edge0.x);
-                retval.data[6] = asuint(rect.edge0.y);
-                retval.data[7] = asuint(rect.edge0.z);
-                retval.data[8] = asuint(rect.edge1.x);
-                retval.data[9] = asuint(rect.edge1.y);
-                retval.data[10] = asuint(rect.edge1.z);
+                uint32_t3 tmp = bit_cast<uint32_t3>(rect.offset);
+                retval.data[2] = tmp.x;
+                retval.data[3] = tmp.y;
+                retval.data[4] = tmp.z;
+                tmp = bit_cast<uint32_t3>(rect.edge0);
+                retval.data[5] = tmp.x;
+                retval.data[6] = tmp.y;
+                retval.data[7] = tmp.z;
+                tmp = bit_cast<uint32_t3>(rect.edge1);
+                retval.data[8] = tmp.x;
+                retval.data[9] = tmp.y;
+                retval.data[10] = tmp.z;
                 retval.data[11] = rect.bsdfLightIDs;
             }
             break;

From e7d4670fca8009843abde9cf4fc1ddd6aedc9290 Mon Sep 17 00:00:00 2001
From: keptsecret <sorchon@gmail.com>
Date: Mon, 10 Mar 2025 17:42:58 +0700
Subject: [PATCH 071/529] fixed light sampling nee

---
 .../app_resources/glsl/common.glsl            |  2 +-
 .../app_resources/hlsl/material_system.hlsl   | 15 +++--
 .../hlsl/next_event_estimator.hlsl            | 57 ++++++++++++----
 .../app_resources/hlsl/pathtracer.hlsl        | 67 +++++++++----------
 4 files changed, 86 insertions(+), 55 deletions(-)

diff --git a/31_HLSLPathTracer/app_resources/glsl/common.glsl b/31_HLSLPathTracer/app_resources/glsl/common.glsl
index 2463f82cf..15b3662d0 100644
--- a/31_HLSLPathTracer/app_resources/glsl/common.glsl
+++ b/31_HLSLPathTracer/app_resources/glsl/common.glsl
@@ -7,7 +7,7 @@
 //#define VISUALIZE_HIGH_VARIANCE
 
 // debug
-//#define NEE_ONLY
+#define NEE_ONLY 1
 
 layout(set = 2, binding = 0) uniform sampler2D envMap; 
 layout(set = 2, binding = 1) uniform usamplerBuffer sampleSequence;
diff --git a/31_HLSLPathTracer/app_resources/hlsl/material_system.hlsl b/31_HLSLPathTracer/app_resources/hlsl/material_system.hlsl
index 7fb153791..af8d5b131 100644
--- a/31_HLSLPathTracer/app_resources/hlsl/material_system.hlsl
+++ b/31_HLSLPathTracer/app_resources/hlsl/material_system.hlsl
@@ -23,7 +23,7 @@ struct Material
         DIELECTRIC
     };
 
-    NBL_CONSTEXPR_STATIC_INLINE uint32_t DataSize = 32;
+    NBL_CONSTEXPR_STATIC_INLINE uint32_t DataSize = 1;
 
     uint32_t type : 2;
     uint32_t unused : 30;   // possible space for flags
@@ -66,7 +66,7 @@ struct System
             case Material::Type::DIFFUSE:
             {
                 diffuseBxDF.init(cparams);
-                return cparams.albedo * (measure_type)diffuseBxDF.eval(params);
+                return (measure_type)diffuseBxDF.eval(params);
             }
             break;
             case Material::Type::CONDUCTOR:
@@ -123,8 +123,13 @@ struct System
 
     quotient_pdf_type quotient_and_pdf(NBL_CONST_REF_ARG(Material) material, NBL_CONST_REF_ARG(create_params_t) cparams, NBL_CONST_REF_ARG(params_t) params)
     {
+        
+        const bool transmissive = material.type == Material::Type::DIELECTRIC;
+        const float clampedNdotV = math::conditionalAbsOrMax<float>(transmissive, params.uNdotV, 0.0);
+        const float clampedNdotL = math::conditionalAbsOrMax<float>(transmissive, params.uNdotL, 0.0);
+
         const float minimumProjVectorLen = 0.00000001;
-        if (params.NdotV > minimumProjVectorLen && params.NdotL > minimumProjVectorLen)
+        if (clampedNdotV > minimumProjVectorLen && clampedNdotL > minimumProjVectorLen)
         {
             switch(material.type)
             {
@@ -147,10 +152,10 @@ struct System
                 }
                 break;
                 default:
-                    return quotient_pdf_type::create((measure_type)0.0, numeric_limits<float>::infinity);
+                    return quotient_pdf_type::create((measure_type)0.0, 0.0);
             }
         }
-        return quotient_pdf_type::create((measure_type)0.0, numeric_limits<float>::infinity);
+        return quotient_pdf_type::create((measure_type)0.0, 0.0);
     }
 
     DiffuseBxDF diffuseBxDF;
diff --git a/31_HLSLPathTracer/app_resources/hlsl/next_event_estimator.hlsl b/31_HLSLPathTracer/app_resources/hlsl/next_event_estimator.hlsl
index 949db8456..65646b3c1 100644
--- a/31_HLSLPathTracer/app_resources/hlsl/next_event_estimator.hlsl
+++ b/31_HLSLPathTracer/app_resources/hlsl/next_event_estimator.hlsl
@@ -63,9 +63,18 @@ struct Estimator
             break;
             case PST_RECTANGLE:
             {
-                vector3_type offset = vector3_type(asfloat(event.data[2]), asfloat(event.data[3]), asfloat(event.data[4]));
-                vector3_type edge0 = vector3_type(asfloat(event.data[5]), asfloat(event.data[6]), asfloat(event.data[7]));
-                vector3_type edge1 = vector3_type(asfloat(event.data[8]), asfloat(event.data[9]), asfloat(event.data[10]));
+                const vector3_type offset = vector3_type(
+                    bit_cast<float32_t>(event.data[2]),
+                    bit_cast<float32_t>(event.data[3]),
+                    bit_cast<float32_t>(event.data[4]));
+                const vector3_type edge0 = vector3_type(
+                    bit_cast<float32_t>(event.data[5]),
+                    bit_cast<float32_t>(event.data[6]),
+                    bit_cast<float32_t>(event.data[7]));
+                const vector3_type edge1 = vector3_type(
+                    bit_cast<float32_t>(event.data[8]),
+                    bit_cast<float32_t>(event.data[9]),
+                    bit_cast<float32_t>(event.data[10]));
                 Shape<PST_RECTANGLE> rect = Shape<PST_RECTANGLE>::create(offset, edge0, edge1, event.data[11]);
                 pdf *= rect.template deferredPdf<ray_type>(ray);
             }
@@ -115,8 +124,12 @@ struct Estimator
         {
             case PST_SPHERE:
             {
-                vector3_type position = vector3_type(asfloat(event.data[2]), asfloat(event.data[3]), asfloat(event.data[4]));
-                Shape<PST_SPHERE> sphere = Shape<PST_SPHERE>::create(position, asfloat(event.data[5]), event.data[6]);
+                const vector3_type position = vector3_type(
+                    bit_cast<float32_t>(event.data[2]),
+                    bit_cast<float32_t>(event.data[3]),
+                    bit_cast<float32_t>(event.data[4]));
+                Shape<PST_SPHERE> sphere = Shape<PST_SPHERE>::create(position, bit_cast<float32_t>(event.data[5]), event.data[6]);
+
                 const vector3_type sampleL = sphere.template generate_and_pdf<interaction_type>(pdf, newRayMaxT, origin, interaction, isBSDF, xi);
                 const vector3_type V = interaction.isotropic.V.getDirection();
                 const scalar_type VdotL = nbl::hlsl::dot<vector3_type>(V, sampleL);
@@ -127,10 +140,20 @@ struct Estimator
             break;
             case PST_TRIANGLE:
             {
-                vector3_type vertex0 = vector3_type(asfloat(event.data[2]), asfloat(event.data[3]), asfloat(event.data[4]));
-                vector3_type vertex1 = vector3_type(asfloat(event.data[5]), asfloat(event.data[6]), asfloat(event.data[7]));
-                vector3_type vertex2 = vector3_type(asfloat(event.data[8]), asfloat(event.data[9]), asfloat(event.data[10]));
+                const vector3_type vertex0 = vector3_type(
+                    bit_cast<float32_t>(event.data[2]),
+                    bit_cast<float32_t>(event.data[3]),
+                    bit_cast<float32_t>(event.data[4]));
+                const vector3_type vertex1 = vector3_type(
+                    bit_cast<float32_t>(event.data[5]), 
+                    bit_cast<float32_t>(event.data[6]),
+                    bit_cast<float32_t>(event.data[7]));
+                const vector3_type vertex2 = vector3_type(
+                    bit_cast<float32_t>(event.data[8]),
+                    bit_cast<float32_t>(event.data[9]),
+                    bit_cast<float32_t>(event.data[10]));
                 Shape<PST_TRIANGLE> tri = Shape<PST_TRIANGLE>::create(vertex0, vertex1, vertex2, event.data[11]);
+
                 const vector3_type sampleL = tri.template generate_and_pdf<interaction_type>(pdf, newRayMaxT, origin, interaction, isBSDF, xi);
                 const vector3_type V = interaction.isotropic.V.getDirection();
                 const scalar_type VdotL = nbl::hlsl::dot<vector3_type>(V, sampleL);
@@ -141,10 +164,20 @@ struct Estimator
             break;
             case PST_RECTANGLE:
             {
-                vector3_type offset = vector3_type(asfloat(event.data[2]), asfloat(event.data[3]), asfloat(event.data[4]));
-                vector3_type edge0 = vector3_type(asfloat(event.data[5]), asfloat(event.data[6]), asfloat(event.data[7]));
-                vector3_type edge1 = vector3_type(asfloat(event.data[8]), asfloat(event.data[9]), asfloat(event.data[10]));
+                const vector3_type offset = vector3_type(
+                    bit_cast<float32_t>(event.data[2]),
+                    bit_cast<float32_t>(event.data[3]),
+                    bit_cast<float32_t>(event.data[4]));
+                const vector3_type edge0 = vector3_type(
+                    bit_cast<float32_t>(event.data[5]),
+                    bit_cast<float32_t>(event.data[6]),
+                    bit_cast<float32_t>(event.data[7]));
+                const vector3_type edge1 = vector3_type(
+                    bit_cast<float32_t>(event.data[8]),
+                    bit_cast<float32_t>(event.data[9]),
+                    bit_cast<float32_t>(event.data[10]));
                 Shape<PST_RECTANGLE> rect = Shape<PST_RECTANGLE>::create(offset, edge0, edge1, event.data[11]);
+
                 const vector3_type sampleL = rect.template generate_and_pdf<interaction_type>(pdf, newRayMaxT, origin, interaction, isBSDF, xi);
                 const vector3_type V = interaction.isotropic.V.getDirection();
                 const scalar_type VdotL = nbl::hlsl::dot<vector3_type>(V, sampleL);
@@ -154,7 +187,7 @@ struct Estimator
             }
             break;
             default:
-                pdf = numeric_limits<float>::infinity;
+                pdf = bit_cast<float>(numeric_limits<float>::infinity);
                 break;
         }
 
diff --git a/31_HLSLPathTracer/app_resources/hlsl/pathtracer.hlsl b/31_HLSLPathTracer/app_resources/hlsl/pathtracer.hlsl
index df4792a9c..6f1518a46 100644
--- a/31_HLSLPathTracer/app_resources/hlsl/pathtracer.hlsl
+++ b/31_HLSLPathTracer/app_resources/hlsl/pathtracer.hlsl
@@ -99,7 +99,7 @@ struct Unidirectional
 
     scalar_type getLuma(NBL_CONST_REF_ARG(vector3_type) col)
     {
-        return nbl::hlsl::dot(nbl::hlsl::transpose(colorspace::scRGBtoXYZ)[1], col);
+        return hlsl::dot<vector3_type>(hlsl::transpose(colorspace::scRGBtoXYZ)[1], col);
     }
 
     // TODO: probably will only work with procedural shapes, do the other ones
@@ -123,8 +123,8 @@ struct Unidirectional
                 bsdfLightIDs = scene.getBsdfLightIDs(objectID);
                 vector3_type N = scene.getNormal(objectID, intersection);
                 N = nbl::hlsl::normalize(N);
-                typename isotropic_type::ray_dir_info_type V;
-                V.direction = nbl::hlsl::normalize(-ray.direction);
+                ray_dir_info_type V;
+                V.direction = -ray.direction;
                 isotropic_type iso_interaction = isotropic_type::create(V, N);
                 interaction = anisotropic_type::create(iso_interaction);
             }
@@ -143,8 +143,6 @@ struct Unidirectional
             ray.payload.accumulation += nee.deferredEvalAndPdf(_pdf, scene.lights[lightID], ray, scene.toNextEvent(lightID)) * throughput / (1.0 + _pdf * _pdf * ray.payload.otherTechniqueHeuristic);
         }
 
-        return false;   // emissive only
-
         const uint32_t bsdfID = glsl::bitfieldExtract(bsdfLightIDs, 0, 16);
         if (bsdfID == bxdfnode_type::INVALID_ID)
             return false;
@@ -163,9 +161,9 @@ struct Unidirectional
         // thresholds
         const scalar_type bxdfPdfThreshold = 0.0001;
         const scalar_type lumaContributionThreshold = getLuma(colorspace::eotf::sRGB<vector3_type>((vector3_type)1.0 / 255.0)); // OETF smallest perceptible value
-        const vector3_type throughputCIE_Y = nbl::hlsl::transpose(colorspace::sRGBtoXYZ)[1] * throughput;   // TODO: this only works if spectral_type is dim 3
+        const vector3_type throughputCIE_Y = hlsl::transpose(colorspace::sRGBtoXYZ)[1] * throughput;    // TODO: this only works if spectral_type is dim 3
         const measure_type eta = bxdf.params.ior0 / bxdf.params.ior1;   // assume it's real, not imaginary?
-        const scalar_type monochromeEta = nbl::hlsl::dot(throughputCIE_Y, eta) / (throughputCIE_Y.r + throughputCIE_Y.g + throughputCIE_Y.b);  // TODO: imaginary eta?
+        const scalar_type monochromeEta = hlsl::dot<vector3_type>(throughputCIE_Y, eta) / (throughputCIE_Y.r + throughputCIE_Y.g + throughputCIE_Y.b);  // TODO: imaginary eta?
 
         // sample lights
         const scalar_type neeProbability = 1.0; // BSDFNode_getNEEProb(bsdf);
@@ -185,7 +183,6 @@ struct Unidirectional
             // but if we allowed non-watertight transmitters (single water surface), it would make sense just to apply this line by itself
             anisocache_type _cache;
             validPath = validPath && anisocache_type::template compute<ray_dir_info_type, ray_dir_info_type>(_cache, interaction, nee_sample, monochromeEta);
-            bxdf.params.A = nbl::hlsl::max(bxdf.params.A, vector<scalar_type, 2>(0,0));
             bxdf.params.eta = monochromeEta;
 
             if (neeContrib_pdf.pdf < numeric_limits<scalar_type>::max)
@@ -231,7 +228,7 @@ struct Unidirectional
                     }
 
                     quotient_pdf_type bsdf_quotient_pdf = materialSystem.quotient_and_pdf(material, bxdf.params, params);
-                    bsdf_quotient_pdf.quotient *= throughput;
+                    bsdf_quotient_pdf.quotient *= bxdf.albedo * throughput;
                     neeContrib_pdf.quotient *= bsdf_quotient_pdf.quotient;
                     const scalar_type otherGenOverChoice = bsdf_quotient_pdf.pdf * rcpChoiceProb;
                     // const scalar_type otherGenOverLightAndChoice = otherGenOverChoice / bsdf_quotient_pdf.pdf;
@@ -268,30 +265,30 @@ struct Unidirectional
             {
                 params = params_type::template create<sample_type, isotropic_type>(bsdf_sample, iso_interaction, bxdf::BCM_MAX);
             }
-            else if (!isBSDF && bxdf.materialType != ext::MaterialSystem::Material::DIFFUSE)
-            {
-                if (bxdf.params.is_aniso)
-                    params = params_type::template create<sample_type, anisotropic_type, anisocache_type>(bsdf_sample, interaction, _cache, bxdf::BCM_MAX);
-                else
-                {
-                    isocache_type isocache = _cache.iso_cache;
-                    params = params_type::template create<sample_type, isotropic_type, isocache_type>(bsdf_sample, iso_interaction, isocache, bxdf::BCM_MAX);
-                }
-            }
-            else if (isBSDF && bxdf.materialType == ext::MaterialSystem::Material::DIFFUSE)
-            {
-                params = params_type::template create<sample_type, isotropic_type>(bsdf_sample, iso_interaction, bxdf::BCM_ABS);
-            }
-            else if (isBSDF && bxdf.materialType != ext::MaterialSystem::Material::DIFFUSE)
-            {
-                if (bxdf.params.is_aniso)
-                    params = params_type::template create<sample_type, anisotropic_type, anisocache_type>(bsdf_sample, interaction, _cache, bxdf::BCM_ABS);
-                else
-                {
-                    isocache_type isocache = _cache.iso_cache;
-                    params = params_type::template create<sample_type, isotropic_type, isocache_type>(bsdf_sample, iso_interaction, isocache, bxdf::BCM_ABS);
-                }
-            }
+            // else if (!isBSDF && bxdf.materialType != ext::MaterialSystem::Material::DIFFUSE)
+            // {
+            //     if (bxdf.params.is_aniso)
+            //         params = params_type::template create<sample_type, anisotropic_type, anisocache_type>(bsdf_sample, interaction, _cache, bxdf::BCM_MAX);
+            //     else
+            //     {
+            //         isocache_type isocache = _cache.iso_cache;
+            //         params = params_type::template create<sample_type, isotropic_type, isocache_type>(bsdf_sample, iso_interaction, isocache, bxdf::BCM_MAX);
+            //     }
+            // }
+            // else if (isBSDF && bxdf.materialType == ext::MaterialSystem::Material::DIFFUSE)
+            // {
+            //     params = params_type::template create<sample_type, isotropic_type>(bsdf_sample, iso_interaction, bxdf::BCM_ABS);
+            // }
+            // else if (isBSDF && bxdf.materialType != ext::MaterialSystem::Material::DIFFUSE)
+            // {
+            //     if (bxdf.params.is_aniso)
+            //         params = params_type::template create<sample_type, anisotropic_type, anisocache_type>(bsdf_sample, interaction, _cache, bxdf::BCM_ABS);
+            //     else
+            //     {
+            //         isocache_type isocache = _cache.iso_cache;
+            //         params = params_type::template create<sample_type, isotropic_type, isocache_type>(bsdf_sample, iso_interaction, isocache, bxdf::BCM_ABS);
+            //     }
+            // }
 
             // the value of the bsdf divided by the probability of the sample being generated
             quotient_pdf_type bsdf_quotient_pdf = materialSystem.quotient_and_pdf(material, bxdf.params, params);
@@ -353,11 +350,7 @@ struct Unidirectional
 
                 hit = ray.objectID.id != -1;
                 if (hit)
-                {
-                    // float pp = float(ray.objectID.id) / 10.0;
-                    // ray.payload.accumulation = measure_type(pp, 1.0-pp, 0.3);
                     rayAlive = closestHitProgram(1, i, ray, scene);
-                }
 
             }
             if (!hit)

From 077d150bded805ce9ed10bc7711544576779ad39 Mon Sep 17 00:00:00 2001
From: keptsecret <sorchon@gmail.com>
Date: Tue, 11 Mar 2025 17:02:59 +0700
Subject: [PATCH 072/529] 1st working ver, sort of

---
 .../app_resources/glsl/common.glsl            |  2 +-
 .../app_resources/hlsl/common.hlsl            |  4 +-
 .../app_resources/hlsl/material_system.hlsl   |  7 +-
 .../app_resources/hlsl/pathtracer.hlsl        | 92 ++++++++++---------
 31_HLSLPathTracer/imgui.ini                   |  8 ++
 5 files changed, 63 insertions(+), 50 deletions(-)
 create mode 100644 31_HLSLPathTracer/imgui.ini

diff --git a/31_HLSLPathTracer/app_resources/glsl/common.glsl b/31_HLSLPathTracer/app_resources/glsl/common.glsl
index 15b3662d0..2463f82cf 100644
--- a/31_HLSLPathTracer/app_resources/glsl/common.glsl
+++ b/31_HLSLPathTracer/app_resources/glsl/common.glsl
@@ -7,7 +7,7 @@
 //#define VISUALIZE_HIGH_VARIANCE
 
 // debug
-#define NEE_ONLY 1
+//#define NEE_ONLY
 
 layout(set = 2, binding = 0) uniform sampler2D envMap; 
 layout(set = 2, binding = 1) uniform usamplerBuffer sampleSequence;
diff --git a/31_HLSLPathTracer/app_resources/hlsl/common.hlsl b/31_HLSLPathTracer/app_resources/hlsl/common.hlsl
index f67716060..ac1e0f09a 100644
--- a/31_HLSLPathTracer/app_resources/hlsl/common.hlsl
+++ b/31_HLSLPathTracer/app_resources/hlsl/common.hlsl
@@ -268,7 +268,7 @@ struct Shape<PST_SPHERE>
     }
 
     template<class Aniso>
-    float32_t3 generate_and_pdf(NBL_REF_ARG(float32_t) pdf, NBL_REF_ARG(float32_t) newRayMaxT, NBL_CONST_REF_ARG(float32_t3) origin, NBL_CONST_REF_ARG(Aniso) interaction, bool isBSDF, float32_t3 xi)
+    float32_t3 generate_and_pdf(NBL_REF_ARG(float32_t) pdf, NBL_REF_ARG(float32_t) newRayMaxT, NBL_CONST_REF_ARG(float32_t3) origin, NBL_CONST_REF_ARG(Aniso) interaction, bool isBSDF, NBL_CONST_REF_ARG(float32_t3) xi)
     {
         float32_t3 Z = position - origin;
         const float distanceSQ = hlsl::dot<float32_t3>(Z,Z);
@@ -279,7 +279,7 @@ struct Shape<PST_SPHERE>
             Z *= rcpDistance;
 
             const float cosThetaMax = hlsl::sqrt<float32_t>(cosThetaMax2);
-            const float cosTheta = nbl::hlsl::mix<float>(1.0, cosThetaMax, xi.x);
+            const float cosTheta = hlsl::mix<float>(1.0, cosThetaMax, xi.x);
 
             float32_t3 L = Z * cosTheta;
 
diff --git a/31_HLSLPathTracer/app_resources/hlsl/material_system.hlsl b/31_HLSLPathTracer/app_resources/hlsl/material_system.hlsl
index af8d5b131..0d739d9ec 100644
--- a/31_HLSLPathTracer/app_resources/hlsl/material_system.hlsl
+++ b/31_HLSLPathTracer/app_resources/hlsl/material_system.hlsl
@@ -123,13 +123,8 @@ struct System
 
     quotient_pdf_type quotient_and_pdf(NBL_CONST_REF_ARG(Material) material, NBL_CONST_REF_ARG(create_params_t) cparams, NBL_CONST_REF_ARG(params_t) params)
     {
-        
-        const bool transmissive = material.type == Material::Type::DIELECTRIC;
-        const float clampedNdotV = math::conditionalAbsOrMax<float>(transmissive, params.uNdotV, 0.0);
-        const float clampedNdotL = math::conditionalAbsOrMax<float>(transmissive, params.uNdotL, 0.0);
-
         const float minimumProjVectorLen = 0.00000001;
-        if (clampedNdotV > minimumProjVectorLen && clampedNdotL > minimumProjVectorLen)
+        if (params.NdotV > minimumProjVectorLen && params.NdotL > minimumProjVectorLen)
         {
             switch(material.type)
             {
diff --git a/31_HLSLPathTracer/app_resources/hlsl/pathtracer.hlsl b/31_HLSLPathTracer/app_resources/hlsl/pathtracer.hlsl
index 6f1518a46..6ed89de7a 100644
--- a/31_HLSLPathTracer/app_resources/hlsl/pathtracer.hlsl
+++ b/31_HLSLPathTracer/app_resources/hlsl/pathtracer.hlsl
@@ -94,7 +94,7 @@ struct Unidirectional
         uint32_t address = glsl::bitfieldInsert<uint32_t>(protoDimension, _sample, MAX_DEPTH_LOG2, MAX_SAMPLES_LOG2);
 	    uint32_t3 seqVal = sampleSequence[address + i].xyz;
 	    seqVal ^= randGen();
-        return vector3_type(seqVal) * asfloat(0x2f800004u);
+        return vector3_type(seqVal) * bit_cast<scalar_type>(0x2f800004u);
     }
 
     scalar_type getLuma(NBL_CONST_REF_ARG(vector3_type) col)
@@ -177,6 +177,7 @@ struct Unidirectional
                 scene.lights[lightID], intersection, interaction,
                 isBSDF, eps0, depth, scene.toNextEvent(lightID)
             );
+            //printf("%f %f %f\n", nee_sample.L.direction.x, nee_sample.L.direction.y, nee_sample.L.direction.z);
 
             // We don't allow non watertight transmitters in this renderer
             bool validPath = nee_sample.NdotL > numeric_limits<scalar_type>::min;
@@ -195,47 +196,51 @@ struct Unidirectional
                 {
                     ext::MaterialSystem::Material material;
                     material.type = bxdf.materialType;
-                    params_type params;
+
+                    bxdf::BxDFClampMode _clamp;
+                    _clamp = (bxdf.materialType == ext::MaterialSystem::Material::Type::DIELECTRIC) ? bxdf::BxDFClampMode::BCM_ABS : bxdf::BxDFClampMode::BCM_MAX;
+                    // example only uses isotropic bxdfs
+                    params_type params = params_type::template create<sample_type, isotropic_type, isocache_type>(nee_sample, interaction.isotropic, _cache.iso_cache, _clamp);
 
                     // TODO: does not yet account for smooth dielectric
-                    if (!isBSDF && bxdf.materialType == ext::MaterialSystem::Material::DIFFUSE)
-                    {
-                        params = params_type::template create<sample_type, isotropic_type>(nee_sample, iso_interaction, bxdf::BCM_MAX);
-                    }
-                    else if (!isBSDF && bxdf.materialType != ext::MaterialSystem::Material::DIFFUSE)
-                    {
-                        if (bxdf.params.is_aniso)
-                            params = params_type::template create<sample_type, anisotropic_type, anisocache_type>(nee_sample, interaction, _cache, bxdf::BCM_MAX);
-                        else
-                        {
-                            isocache_type isocache = _cache.iso_cache;
-                            params = params_type::template create<sample_type, isotropic_type, isocache_type>(nee_sample, iso_interaction, isocache, bxdf::BCM_MAX);
-                        }
-                    }
-                    else if (isBSDF && bxdf.materialType == ext::MaterialSystem::Material::DIFFUSE)
-                    {
-                        params = params_type::template create<sample_type, isotropic_type>(nee_sample, iso_interaction, bxdf::BCM_ABS);
-                    }
-                    else if (isBSDF && bxdf.materialType != ext::MaterialSystem::Material::DIFFUSE)
-                    {
-                        if (bxdf.params.is_aniso)
-                            params = params_type::template create<sample_type, anisotropic_type, anisocache_type>(nee_sample, interaction, _cache, bxdf::BCM_ABS);
-                        else
-                        {
-                            isocache_type isocache = _cache.iso_cache;
-                            params = params_type::template create<sample_type, isotropic_type, isocache_type>(nee_sample, iso_interaction, isocache, bxdf::BCM_ABS);
-                        }
-                    }
+                    // if (!isBSDF && bxdf.materialType == ext::MaterialSystem::Material::DIFFUSE)
+                    // {
+                    //     params = params_type::template create<sample_type, isotropic_type>(nee_sample, interaction.isotropic, bxdf::BCM_MAX);
+                    // }
+                    // else if (!isBSDF && bxdf.materialType != ext::MaterialSystem::Material::DIFFUSE)
+                    // {
+                    //     if (bxdf.params.is_aniso)
+                    //         params = params_type::template create<sample_type, anisotropic_type, anisocache_type>(nee_sample, interaction, _cache, bxdf::BCM_MAX);
+                    //     else
+                    //     {
+                    //         isocache_type isocache = _cache.iso_cache;
+                    //         params = params_type::template create<sample_type, isotropic_type, isocache_type>(nee_sample, interaction.isotropic, _cache.iso_cache, bxdf::BCM_MAX);
+                    //     }
+                    // }
+                    // else if (isBSDF && bxdf.materialType == ext::MaterialSystem::Material::DIFFUSE)
+                    // {
+                    //     params = params_type::template create<sample_type, isotropic_type>(nee_sample, interaction.isotropic, bxdf::BCM_ABS);
+                    // }
+                    // else if (isBSDF && bxdf.materialType != ext::MaterialSystem::Material::DIFFUSE)
+                    // {
+                    //     if (bxdf.params.is_aniso)
+                    //         params = params_type::template create<sample_type, anisotropic_type, anisocache_type>(nee_sample, interaction, _cache, bxdf::BCM_ABS);
+                    //     else
+                    //     {
+                    //         isocache_type isocache = _cache.iso_cache;
+                    //         params = params_type::template create<sample_type, isotropic_type, isocache_type>(nee_sample, interaction.isotropic, _cache.iso_cache, bxdf::BCM_ABS);
+                    //     }
+                    // }
 
                     quotient_pdf_type bsdf_quotient_pdf = materialSystem.quotient_and_pdf(material, bxdf.params, params);
                     bsdf_quotient_pdf.quotient *= bxdf.albedo * throughput;
                     neeContrib_pdf.quotient *= bsdf_quotient_pdf.quotient;
                     const scalar_type otherGenOverChoice = bsdf_quotient_pdf.pdf * rcpChoiceProb;
-                    // const scalar_type otherGenOverLightAndChoice = otherGenOverChoice / bsdf_quotient_pdf.pdf;
-                    // neeContrib_pdf.quotient *= otherGenOverChoice / (1.f + otherGenOverLightAndChoice * otherGenOverLightAndChoice);   // balance heuristic
+                    const scalar_type otherGenOverLightAndChoice = otherGenOverChoice / bsdf_quotient_pdf.pdf;
+                    neeContrib_pdf.quotient *= otherGenOverChoice / (1.f + otherGenOverLightAndChoice * otherGenOverLightAndChoice);   // balance heuristic
 
                     // TODO: ifdef NEE only
-                    neeContrib_pdf.quotient *= otherGenOverChoice;
+                    // neeContrib_pdf.quotient *= otherGenOverChoice;
 
                     ray_type nee_ray;
                     nee_ray.origin = intersection + nee_sample.L.direction * t * Tolerance<scalar_type>::getStart(depth);
@@ -247,7 +252,7 @@ struct Unidirectional
             }
         }
 
-        return false;   // NEE only
+        //return false;   // NEE only
 
         // sample BSDF
         scalar_type bxdfPdf;
@@ -259,12 +264,17 @@ struct Unidirectional
             anisocache_type _cache;
             sample_type bsdf_sample = materialSystem.generate(material, bxdf.params, interaction, eps1, _cache);
 
+            bxdf::BxDFClampMode _clamp;
+            _clamp = (bxdf.materialType == ext::MaterialSystem::Material::Type::DIELECTRIC) ? bxdf::BxDFClampMode::BCM_ABS : bxdf::BxDFClampMode::BCM_MAX;
+            // example only uses isotropic bxdfs
+            params_type params = params_type::template create<sample_type, isotropic_type, isocache_type>(bsdf_sample, interaction.isotropic, _cache.iso_cache, _clamp);
+
             // TODO: does not yet account for smooth dielectric
-            params_type params;
-            if (!isBSDF && bxdf.materialType == ext::MaterialSystem::Material::DIFFUSE)
-            {
-                params = params_type::template create<sample_type, isotropic_type>(bsdf_sample, iso_interaction, bxdf::BCM_MAX);
-            }
+            // params_type params;
+            // if (!isBSDF && bxdf.materialType == ext::MaterialSystem::Material::DIFFUSE)
+            // {
+            //     params = params_type::template create<sample_type, isotropic_type>(bsdf_sample, iso_interaction, bxdf::BCM_MAX);
+            // }
             // else if (!isBSDF && bxdf.materialType != ext::MaterialSystem::Material::DIFFUSE)
             // {
             //     if (bxdf.params.is_aniso)
@@ -292,7 +302,8 @@ struct Unidirectional
 
             // the value of the bsdf divided by the probability of the sample being generated
             quotient_pdf_type bsdf_quotient_pdf = materialSystem.quotient_and_pdf(material, bxdf.params, params);
-            throughput *= bsdf_quotient_pdf.quotient;
+            throughput *= bxdf.albedo * bsdf_quotient_pdf.quotient;
+            bxdfPdf = bsdf_quotient_pdf.pdf;
             bxdfSample = bsdf_sample.L.direction;
         }
 
@@ -351,7 +362,6 @@ struct Unidirectional
                 hit = ray.objectID.id != -1;
                 if (hit)
                     rayAlive = closestHitProgram(1, i, ray, scene);
-
             }
             if (!hit)
                 missProgram(ray);
diff --git a/31_HLSLPathTracer/imgui.ini b/31_HLSLPathTracer/imgui.ini
new file mode 100644
index 000000000..e60624929
--- /dev/null
+++ b/31_HLSLPathTracer/imgui.ini
@@ -0,0 +1,8 @@
+[Window][Debug##Default]
+Pos=60,60
+Size=400,400
+
+[Window][Controls]
+Pos=10,10
+Size=320,340
+

From beac3283c9680c1ff153556fbb44791b6d3c6578 Mon Sep 17 00:00:00 2001
From: kevyuu <kevin.kayu@gmail.com>
Date: Wed, 12 Mar 2025 22:31:19 +0700
Subject: [PATCH 073/529] Remove raygenGroupStride

---
 71_RayTracingPipeline/main.cpp | 4 +---
 1 file changed, 1 insertion(+), 3 deletions(-)

diff --git a/71_RayTracingPipeline/main.cpp b/71_RayTracingPipeline/main.cpp
index 5793ff8d3..e8cc6f947 100644
--- a/71_RayTracingPipeline/main.cpp
+++ b/71_RayTracingPipeline/main.cpp
@@ -28,7 +28,6 @@ class RaytracingPipelineApp final : public examples::SimpleWindowedApplication,
   struct ShaderBindingTable
   {
     SBufferRange<IGPUBuffer> raygenGroupRange;
-    uint32_t raygenGroupStride;
     SBufferRange<IGPUBuffer> hitGroupsRange;
     uint32_t hitGroupsStride;
     SBufferRange<IGPUBuffer> missGroupsRange;
@@ -742,7 +741,7 @@ class RaytracingPipelineApp final : public examples::SimpleWindowedApplication,
       }else
       {
         cmdbuf->traceRays(
-          m_shaderBindingTable.raygenGroupRange, m_shaderBindingTable.raygenGroupStride,
+          m_shaderBindingTable.raygenGroupRange,
           m_shaderBindingTable.missGroupsRange, m_shaderBindingTable.missGroupsStride,
           m_shaderBindingTable.hitGroupsRange, m_shaderBindingTable.hitGroupsStride,
           m_shaderBindingTable.callableGroupsRange, m_shaderBindingTable.callableGroupsStride,
@@ -1326,7 +1325,6 @@ class RaytracingPipelineApp final : public examples::SimpleWindowedApplication,
       .offset = 0,
       .size = core::alignUp(handleSizeAligned, limits.shaderGroupBaseAlignment)
     };
-    m_shaderBindingTable.raygenGroupStride = core::alignUp(handleSizeAligned, limits.shaderGroupBaseAlignment);
 
     missRange = {
       .offset = raygenRange.size,

From 6517442ad2e8a592e3d5778eeee2ea607d6bb51f Mon Sep 17 00:00:00 2001
From: kevyuu <kevin.kayu@gmail.com>
Date: Wed, 12 Mar 2025 22:56:38 +0700
Subject: [PATCH 074/529] Fix merge bug.

---
 71_RayTracingPipeline/main.cpp | 10 +++++-----
 1 file changed, 5 insertions(+), 5 deletions(-)

diff --git a/71_RayTracingPipeline/main.cpp b/71_RayTracingPipeline/main.cpp
index e8cc6f947..26618d2b2 100644
--- a/71_RayTracingPipeline/main.cpp
+++ b/71_RayTracingPipeline/main.cpp
@@ -482,9 +482,9 @@ class RaytracingPipelineApp final : public examples::SimpleWindowedApplication,
       {
         IGPUSampler::SParams params;
         params.AnisotropicFilter = 1u;
-        params.TextureWrapU = ISampler::ETC_REPEAT;
-        params.TextureWrapV = ISampler::ETC_REPEAT;
-        params.TextureWrapW = ISampler::ETC_REPEAT;
+        params.TextureWrapU = ETC_REPEAT;
+        params.TextureWrapV = ETC_REPEAT;
+        params.TextureWrapW = ETC_REPEAT;
 
         m_ui.samplers.gui = m_device->createSampler(params);
         m_ui.samplers.gui->setObjectDebugName("Nabla IMGUI UI Sampler");
@@ -578,10 +578,10 @@ class RaytracingPipelineApp final : public examples::SimpleWindowedApplication,
           ImGui::SliderFloat3("Light Direction", &m_light.direction.x, -1.f, 1.f);
           ImGui::SliderFloat3("Light Position", &m_light.position.x, -20.f, 20.f);
 
-          float32_t dOuterCutoff = degrees(acos(m_light.outerCutoff));
+          float32_t dOuterCutoff = hlsl::degrees(acos(m_light.outerCutoff));
           if (ImGui::SliderFloat("Light Outer Cutoff", &dOuterCutoff, 0.0f, 45.0f))
           {
-            m_light.outerCutoff = cos(radians(dOuterCutoff));
+            m_light.outerCutoff = cos(hlsl::radians(dOuterCutoff));
           }
         }
         ImGui::Checkbox("Use Indirect Command", &m_useIndirectCommand);

From 64653126f9bb5b13a8150376717662b977bdf5e4 Mon Sep 17 00:00:00 2001
From: kevyuu <kevin.kayu@gmail.com>
Date: Wed, 12 Mar 2025 23:21:32 +0700
Subject: [PATCH 075/529] Fix demo to use the new traceRayIndirect

---
 71_RayTracingPipeline/main.cpp | 25 ++++++++++++++++++++-----
 1 file changed, 20 insertions(+), 5 deletions(-)

diff --git a/71_RayTracingPipeline/main.cpp b/71_RayTracingPipeline/main.cpp
index 26618d2b2..4106a958f 100644
--- a/71_RayTracingPipeline/main.cpp
+++ b/71_RayTracingPipeline/main.cpp
@@ -730,10 +730,6 @@ class RaytracingPipelineApp final : public examples::SimpleWindowedApplication,
       if (m_useIndirectCommand)
       {
         cmdbuf->traceRaysIndirect(
-          m_shaderBindingTable.raygenGroupRange, m_shaderBindingTable.raygenGroupStride,
-          m_shaderBindingTable.missGroupsRange, m_shaderBindingTable.missGroupsStride,
-          m_shaderBindingTable.hitGroupsRange, m_shaderBindingTable.hitGroupsStride,
-          m_shaderBindingTable.callableGroupsRange, m_shaderBindingTable.callableGroupsStride,
           SBufferBinding<const IGPUBuffer>{
             .offset = 0,
             .buffer = m_indirectBuffer,
@@ -1042,7 +1038,26 @@ class RaytracingPipelineApp final : public examples::SimpleWindowedApplication,
 
   bool createIndirectBuffer(video::CThreadSafeQueueAdapter* queue)
   {
-    const auto command = TraceRaysIndirectCommand_t{ WIN_W, WIN_H, 1 };
+    const auto getBufferRangeAddress = [](const SBufferRange<IGPUBuffer>& range)
+      {
+        return range.buffer->getDeviceAddress() + range.offset;
+      };
+    const auto command = TraceRaysIndirectCommand_t{
+      .raygenShaderRecordAddress = getBufferRangeAddress(m_shaderBindingTable.raygenGroupRange),
+      .raygenShaderRecordSize = m_shaderBindingTable.raygenGroupRange.size,
+      .missShaderBindingTableAddress = getBufferRangeAddress(m_shaderBindingTable.missGroupsRange),
+      .missShaderBindingTableSize = m_shaderBindingTable.missGroupsRange.size,
+      .missShaderBindingTableStride = m_shaderBindingTable.missGroupsStride,
+      .hitShaderBindingTableAddress = getBufferRangeAddress(m_shaderBindingTable.hitGroupsRange),
+      .hitShaderBindingTableSize = m_shaderBindingTable.hitGroupsRange.size,
+      .hitShaderBindingTableStride = m_shaderBindingTable.hitGroupsStride,
+      .callableShaderBindingTableAddress = getBufferRangeAddress(m_shaderBindingTable.callableGroupsRange),
+      .callableShaderBindingTableSize = m_shaderBindingTable.callableGroupsRange.size,
+      .callableShaderBindingTableStride = m_shaderBindingTable.callableGroupsStride,
+      .width = WIN_W,
+      .height = WIN_H,
+      .depth = 1,
+    };
     IGPUBuffer::SCreationParams params;
     params.usage = IGPUBuffer::EUF_TRANSFER_DST_BIT | IGPUBuffer::EUF_INDIRECT_BUFFER_BIT | IGPUBuffer::EUF_SHADER_DEVICE_ADDRESS_BIT;
     params.size = sizeof(TraceRaysIndirectCommand_t);

From 6abb635b5d47e689e8f8ad3eb1ef35887ed53df6 Mon Sep 17 00:00:00 2001
From: keptsecret <sorchon@gmail.com>
Date: Thu, 13 Mar 2025 15:56:08 +0700
Subject: [PATCH 076/529] fixed nan and accumulation going black problem

---
 .../app_resources/hlsl/common.hlsl            | 10 ++++++-
 .../hlsl/next_event_estimator.hlsl            |  6 ++---
 .../app_resources/hlsl/pathtracer.hlsl        | 22 +++++++---------
 .../app_resources/hlsl/render.comp.hlsl       | 26 +++----------------
 .../app_resources/hlsl/render_common.hlsl     | 23 ++++++++++++++++
 31_HLSLPathTracer/main.cpp                    | 19 +++++++++-----
 6 files changed, 61 insertions(+), 45 deletions(-)
 create mode 100644 31_HLSLPathTracer/app_resources/hlsl/render_common.hlsl

diff --git a/31_HLSLPathTracer/app_resources/hlsl/common.hlsl b/31_HLSLPathTracer/app_resources/hlsl/common.hlsl
index ac1e0f09a..9e2249732 100644
--- a/31_HLSLPathTracer/app_resources/hlsl/common.hlsl
+++ b/31_HLSLPathTracer/app_resources/hlsl/common.hlsl
@@ -18,7 +18,7 @@ namespace hlsl
 namespace ext
 {
 
-template<typename T>
+template<typename T>    // TODO make type T Spectrum
 struct Payload
 {
     using this_t = Payload<T>;
@@ -85,6 +85,14 @@ struct Light
 
     NBL_CONSTEXPR_STATIC_INLINE uint32_t INVALID_ID = 0xffffu;
 
+    static Light<spectral_type> create(NBL_CONST_REF_ARG(spectral_type) radiance, uint32_t objId, uint32_t mode, ProceduralShapeType shapeType)
+    {
+        Light<spectral_type> retval;
+        retval.radiance = radiance;
+        retval.objectID = ObjectID::create(objId, mode, shapeType);
+        return retval;
+    }
+
     static Light<spectral_type> create(NBL_CONST_REF_ARG(spectral_type) radiance, NBL_CONST_REF_ARG(ObjectID) objectID)
     {
         Light<spectral_type> retval;
diff --git a/31_HLSLPathTracer/app_resources/hlsl/next_event_estimator.hlsl b/31_HLSLPathTracer/app_resources/hlsl/next_event_estimator.hlsl
index 65646b3c1..c1528216d 100644
--- a/31_HLSLPathTracer/app_resources/hlsl/next_event_estimator.hlsl
+++ b/31_HLSLPathTracer/app_resources/hlsl/next_event_estimator.hlsl
@@ -89,7 +89,7 @@ struct Estimator
 
     static spectral_type deferredEvalAndPdf(NBL_REF_ARG(scalar_type) pdf, NBL_CONST_REF_ARG(light_type) light, NBL_CONST_REF_ARG(ray_type) ray, NBL_CONST_REF_ARG(Event) event)
     {
-        const uint32_t mode = event.mode;
+        const IntersectMode mode = (IntersectMode)event.mode;
         switch (mode)
         {
             case IM_RAY_QUERY:
@@ -192,7 +192,7 @@ struct Estimator
         }
 
         newRayMaxT *= Tolerance<scalar_type>::getEnd(depth);
-        pdf *= 1.0 / lightCount;
+        pdf *= 1.0 / scalar_type(lightCount);
         spectral_type quo = light.radiance / pdf;
         quotient_pdf = quotient_pdf_type::create(quo, pdf);
 
@@ -201,7 +201,7 @@ struct Estimator
 
     static sample_type generate_and_quotient_and_pdf(NBL_REF_ARG(quotient_pdf_type) quotient_pdf, NBL_REF_ARG(scalar_type) newRayMaxT, NBL_CONST_REF_ARG(light_type) light, NBL_CONST_REF_ARG(vector3_type) origin, NBL_CONST_REF_ARG(interaction_type) interaction, bool isBSDF, NBL_CONST_REF_ARG(vector3_type) xi, uint32_t depth, NBL_CONST_REF_ARG(Event) event)
     {
-        const uint32_t mode = event.mode;
+        const IntersectMode mode = (IntersectMode)event.mode;
         sample_type L;
         switch (mode)
         {
diff --git a/31_HLSLPathTracer/app_resources/hlsl/pathtracer.hlsl b/31_HLSLPathTracer/app_resources/hlsl/pathtracer.hlsl
index 6ed89de7a..5c01db852 100644
--- a/31_HLSLPathTracer/app_resources/hlsl/pathtracer.hlsl
+++ b/31_HLSLPathTracer/app_resources/hlsl/pathtracer.hlsl
@@ -79,13 +79,12 @@ struct Unidirectional
     //                     NextEventEstimator nee)
     // {}
 
-    static this_t create(NBL_CONST_REF_ARG(PathTracerCreationParams<create_params_type, scalar_type>) params, Buffer<uint3> sampleSequence)
+    static this_t create(NBL_CONST_REF_ARG(PathTracerCreationParams<create_params_type, scalar_type>) params)
     {
         this_t retval;
         retval.randGen = randgen_type::create(params.rngState);
         retval.rayGen = raygen_type::create(params.pixOffsetParam, params.camPos, params.NDC, params.invMVP);
         retval.materialSystem = material_system_type::create(params.diffuseParams, params.conductorParams, params.dielectricParams);
-        retval.sampleSequence = sampleSequence;
         return retval;
     }
 
@@ -170,14 +169,14 @@ struct Unidirectional
         scalar_type rcpChoiceProb;
         if (!math::partitionRandVariable(neeProbability, eps0.z, rcpChoiceProb) && depth < 2u)
         {
+            uint32_t randLightID = uint32_t(float32_t(randGen().x) / numeric_limits<uint32_t>::max) * scene.lightCount;
             quotient_pdf_type neeContrib_pdf;
             scalar_type t;
             sample_type nee_sample = nee.generate_and_quotient_and_pdf(
                 neeContrib_pdf, t,
-                scene.lights[lightID], intersection, interaction,
-                isBSDF, eps0, depth, scene.toNextEvent(lightID)
+                scene.lights[randLightID], intersection, interaction,
+                isBSDF, eps0, depth, scene.toNextEvent(randLightID)
             );
-            //printf("%f %f %f\n", nee_sample.L.direction.x, nee_sample.L.direction.y, nee_sample.L.direction.z);
 
             // We don't allow non watertight transmitters in this renderer
             bool validPath = nee_sample.NdotL > numeric_limits<scalar_type>::min;
@@ -233,8 +232,7 @@ struct Unidirectional
                     // }
 
                     quotient_pdf_type bsdf_quotient_pdf = materialSystem.quotient_and_pdf(material, bxdf.params, params);
-                    bsdf_quotient_pdf.quotient *= bxdf.albedo * throughput;
-                    neeContrib_pdf.quotient *= bsdf_quotient_pdf.quotient;
+                    neeContrib_pdf.quotient *= bxdf.albedo * throughput * bsdf_quotient_pdf.quotient;
                     const scalar_type otherGenOverChoice = bsdf_quotient_pdf.pdf * rcpChoiceProb;
                     const scalar_type otherGenOverLightAndChoice = otherGenOverChoice / bsdf_quotient_pdf.pdf;
                     neeContrib_pdf.quotient *= otherGenOverChoice / (1.f + otherGenOverLightAndChoice * otherGenOverLightAndChoice);   // balance heuristic
@@ -252,7 +250,7 @@ struct Unidirectional
             }
         }
 
-        //return false;   // NEE only
+        // return false;   // NEE only
 
         // sample BSDF
         scalar_type bxdfPdf;
@@ -312,8 +310,8 @@ struct Unidirectional
         if (bxdfPdf > bxdfPdfThreshold && getLuma(throughput) > lumaThroughputThreshold)
         {
             ray.payload.throughput = throughput;
-            ray.payload.otherTechniqueHeuristic = neeProbability / bxdfPdf; // numerically stable, don't touch
-            ray.payload.otherTechniqueHeuristic *= ray.payload.otherTechniqueHeuristic;
+            scalar_type otherTechniqueHeuristic = neeProbability / bxdfPdf; // numerically stable, don't touch
+            ray.payload.otherTechniqueHeuristic = otherTechniqueHeuristic * otherTechniqueHeuristic;
 
             // trace new ray
             ray.origin = intersection + bxdfSample * (1.0/*kSceneSize*/) * Tolerance<scalar_type>::getStart(depth);
@@ -354,7 +352,7 @@ struct Unidirectional
             // bounces
             bool hit = true;
             bool rayAlive = true;
-            for (int d = 1; d <= depth && hit && rayAlive; d += 2)
+            for (int d = 1; (d <= depth) && hit && rayAlive; d += 2)
             {
                 ray.intersectionT = numeric_limits<scalar_type>::max;
                 ray.objectID = intersector_type::traceRay(ray, scene);
@@ -385,8 +383,6 @@ struct Unidirectional
     raygen_type rayGen;
     material_system_type materialSystem;
     nee_type nee;
-
-    Buffer<uint3> sampleSequence;
 };
 
 }
diff --git a/31_HLSLPathTracer/app_resources/hlsl/render.comp.hlsl b/31_HLSLPathTracer/app_resources/hlsl/render.comp.hlsl
index e25961b56..d19007dd4 100644
--- a/31_HLSLPathTracer/app_resources/hlsl/render.comp.hlsl
+++ b/31_HLSLPathTracer/app_resources/hlsl/render.comp.hlsl
@@ -6,6 +6,7 @@
 #include "nbl/builtin/hlsl/bxdf/reflection.hlsl"
 #include "nbl/builtin/hlsl/bxdf/transmission.hlsl"
 
+#include "render_common.hlsl"
 #include "pathtracer.hlsl"
 
 // add these defines (one at a time) using -D argument to dxc
@@ -26,25 +27,6 @@ NBL_CONSTEXPR uint32_t WorkgroupSize = 32;
 NBL_CONSTEXPR uint32_t MAX_DEPTH_LOG2 = 4;
 NBL_CONSTEXPR uint32_t MAX_SAMPLES_LOG2 = 10;
 
-struct SPushConstants
-{
-    float32_t4x4 invMVP;
-    int sampleCount;
-    int depth;
-};
-
-[[vk::push_constant]] SPushConstants pc;
-
-[[vk::combinedImageSampler]][[vk::binding(0, 2)]] Texture2D<float3> envMap;      // unused
-[[vk::combinedImageSampler]][[vk::binding(0, 2)]] SamplerState envSampler;
-
-[[vk::binding(1, 2)]] Buffer<uint3> sampleSequence;
-
-[[vk::combinedImageSampler]][[vk::binding(2, 2)]] Texture2D<uint2> scramblebuf; // unused
-[[vk::combinedImageSampler]][[vk::binding(2, 2)]] SamplerState scrambleSampler;
-
-[[vk::image_format("rgba16f")]][[vk::binding(0, 0)]] RWTexture2D<float32_t4> outImage;
-
 int32_t2 getCoordinates()
 {
     return int32_t2(glsl::gl_GlobalInvocationID().xy);
@@ -115,7 +97,7 @@ static const ext::Shape<ext::PST_RECTANGLE> rectangles[RECTANGLE_COUNT] = {
 
 #define LIGHT_COUNT 1
 static const light_type lights[LIGHT_COUNT] = {
-    light_type::create(spectral_t(30.0,25.0,15.0), ext::ObjectID::create(8u, ext::IntersectMode::IM_PROCEDURAL, LIGHT_TYPE))
+    light_type::create(spectral_t(30.0,25.0,15.0), 8u, ext::IntersectMode::IM_PROCEDURAL, LIGHT_TYPE)
 };
 
 #define BXDF_COUNT 7
@@ -154,7 +136,7 @@ void main(uint32_t3 threadID : SV_DispatchThreadID)
 
     // set up path tracer
     ext::PathTracer::PathTracerCreationParams<create_params_t, float> ptCreateParams;
-    ptCreateParams.rngState = pcg();
+    ptCreateParams.rngState = scramblebuf[coords].rg;
 
     uint2 scrambleDim;
     scramblebuf.GetDimensions(scrambleDim.x, scrambleDim.y);
@@ -174,7 +156,7 @@ void main(uint32_t3 threadID : SV_DispatchThreadID)
     ptCreateParams.conductorParams = bxdfs[3].params;
     ptCreateParams.dielectricParams = bxdfs[6].params;
 
-    pathtracer_type pathtracer = pathtracer_type::create(ptCreateParams, sampleSequence);
+    pathtracer_type pathtracer = pathtracer_type::create(ptCreateParams);
 
     // set up scene (can do as global var?)
     ext::Scene<light_type, bxdfnode_type> scene;
diff --git a/31_HLSLPathTracer/app_resources/hlsl/render_common.hlsl b/31_HLSLPathTracer/app_resources/hlsl/render_common.hlsl
new file mode 100644
index 000000000..5e5cf89da
--- /dev/null
+++ b/31_HLSLPathTracer/app_resources/hlsl/render_common.hlsl
@@ -0,0 +1,23 @@
+#ifndef _NBL_HLSL_PATHTRACER_RENDER_COMMON_INCLUDED_
+#define _NBL_HLSL_PATHTRACER_RENDER_COMMON_INCLUDED_
+
+struct SPushConstants
+{
+    float32_t4x4 invMVP;
+    int sampleCount;
+    int depth;
+};
+
+[[vk::push_constant]] SPushConstants pc;
+
+[[vk::combinedImageSampler]][[vk::binding(0, 2)]] Texture2D<float3> envMap;      // unused
+[[vk::combinedImageSampler]][[vk::binding(0, 2)]] SamplerState envSampler;
+
+[[vk::binding(1, 2)]] Buffer<uint3> sampleSequence;
+
+[[vk::combinedImageSampler]][[vk::binding(2, 2)]] Texture2D<uint2> scramblebuf; // unused
+[[vk::combinedImageSampler]][[vk::binding(2, 2)]] SamplerState scrambleSampler;
+
+[[vk::image_format("rgba16f")]][[vk::binding(0, 0)]] RWTexture2D<float32_t4> outImage;
+
+#endif
diff --git a/31_HLSLPathTracer/main.cpp b/31_HLSLPathTracer/main.cpp
index 8da32083e..30a0fad8d 100644
--- a/31_HLSLPathTracer/main.cpp
+++ b/31_HLSLPathTracer/main.cpp
@@ -74,6 +74,13 @@ class HLSLComputePathtracer final : public examples::SimpleWindowedApplication,
 
 		inline bool isComputeOnly() const override { return false; }
 
+		//inline video::IAPIConnection::SFeatures getAPIFeaturesToEnable() override
+		//{
+		//	auto retval = device_base_t::getAPIFeaturesToEnable();
+		//	retval.synchronizationValidation = true;
+		//	return retval;
+		//}
+
 		inline core::vector<video::SPhysicalDeviceFilter::SurfaceCompatibility> getSurfaces() const override
 		{
 			if (!m_surface)
@@ -359,11 +366,11 @@ class HLSLComputePathtracer final : public examples::SimpleWindowedApplication,
 					options.stage = IShader::E_SHADER_STAGE::ESS_COMPUTE;	// should be compute
 					options.targetSpirvVersion = m_device->getPhysicalDevice()->getLimits().spirvVersion;
 					options.spirvOptimizer = nullptr;
-#ifndef _NBL_DEBUG
-					ISPIRVOptimizer::E_OPTIMIZER_PASS optPasses = ISPIRVOptimizer::EOP_STRIP_DEBUG_INFO;
-					auto opt = make_smart_refctd_ptr<ISPIRVOptimizer>(std::span<ISPIRVOptimizer::E_OPTIMIZER_PASS>(&optPasses, 1));
-					options.spirvOptimizer = opt.get();
-#endif
+//#ifndef _NBL_DEBUG
+//					ISPIRVOptimizer::E_OPTIMIZER_PASS optPasses = ISPIRVOptimizer::EOP_STRIP_DEBUG_INFO;
+//					auto opt = make_smart_refctd_ptr<ISPIRVOptimizer>(std::span<ISPIRVOptimizer::E_OPTIMIZER_PASS>(&optPasses, 1));
+//					options.spirvOptimizer = opt.get();
+//#endif
 					options.debugInfoFlags |= IShaderCompiler::E_DEBUG_INFO_FLAGS::EDIF_SOURCE_BIT;
 					options.preprocessorOptions.sourceIdentifier = source->getFilepathHint();
 					options.preprocessorOptions.logger = m_logger.get();
@@ -1343,7 +1350,7 @@ class HLSLComputePathtracer final : public examples::SimpleWindowedApplication,
 		int PTPipline = E_LIGHT_GEOMETRY::ELG_SPHERE;
 		int renderMode = E_RENDER_MODE::ERM_HLSL;
 		int spp = 32;
-		int depth = 3;
+		int depth = 1;
 
 		bool m_firstFrame = true;
 		IGPUCommandBuffer::SClearColorValue clearColor = { .float32 = {0.f,0.f,0.f,1.f} };

From 1eee3ca8ade05d2afdd1ae3eeb1033edee372a66 Mon Sep 17 00:00:00 2001
From: keptsecret <sorchon@gmail.com>
Date: Thu, 13 Mar 2025 16:06:04 +0700
Subject: [PATCH 077/529] fixed triangle light, rectangle needs checking

---
 31_HLSLPathTracer/app_resources/hlsl/render.comp.hlsl | 8 +++++++-
 1 file changed, 7 insertions(+), 1 deletion(-)

diff --git a/31_HLSLPathTracer/app_resources/hlsl/render.comp.hlsl b/31_HLSLPathTracer/app_resources/hlsl/render.comp.hlsl
index d19007dd4..065d93b7b 100644
--- a/31_HLSLPathTracer/app_resources/hlsl/render.comp.hlsl
+++ b/31_HLSLPathTracer/app_resources/hlsl/render.comp.hlsl
@@ -97,7 +97,13 @@ static const ext::Shape<ext::PST_RECTANGLE> rectangles[RECTANGLE_COUNT] = {
 
 #define LIGHT_COUNT 1
 static const light_type lights[LIGHT_COUNT] = {
-    light_type::create(spectral_t(30.0,25.0,15.0), 8u, ext::IntersectMode::IM_PROCEDURAL, LIGHT_TYPE)
+    light_type::create(spectral_t(30.0,25.0,15.0),
+#ifdef SPHERE_LIGHT
+        8u,
+#else
+        0u,
+#endif
+        ext::IntersectMode::IM_PROCEDURAL, LIGHT_TYPE)
 };
 
 #define BXDF_COUNT 7

From 011fbfb376ef9d926b74960a05a5bfcfaf851fbf Mon Sep 17 00:00:00 2001
From: keptsecret <sorchon@gmail.com>
Date: Thu, 13 Mar 2025 16:24:12 +0700
Subject: [PATCH 078/529] simplified material data

---
 .../app_resources/hlsl/material_system.hlsl   | 61 +++++++++++--------
 .../app_resources/hlsl/pathtracer.hlsl        | 20 +++---
 .../app_resources/hlsl/render.comp.hlsl       | 14 ++---
 31_HLSLPathTracer/main.cpp                    |  2 +-
 4 files changed, 49 insertions(+), 48 deletions(-)

diff --git a/31_HLSLPathTracer/app_resources/hlsl/material_system.hlsl b/31_HLSLPathTracer/app_resources/hlsl/material_system.hlsl
index 0d739d9ec..feffee9ef 100644
--- a/31_HLSLPathTracer/app_resources/hlsl/material_system.hlsl
+++ b/31_HLSLPathTracer/app_resources/hlsl/material_system.hlsl
@@ -14,20 +14,27 @@ namespace ext
 namespace MaterialSystem
 {
 
-struct Material
-{
-    enum Type : uint32_t    // enum class?
-    {
-        DIFFUSE,
-        CONDUCTOR,
-        DIELECTRIC
-    };
+// struct Material
+// {
+//     enum Type : uint32_t    // enum class?
+//     {
+//         DIFFUSE,
+//         CONDUCTOR,
+//         DIELECTRIC
+//     };
 
-    NBL_CONSTEXPR_STATIC_INLINE uint32_t DataSize = 1;
+//     NBL_CONSTEXPR_STATIC_INLINE uint32_t DataSize = 1;
 
-    uint32_t type : 2;
-    uint32_t unused : 30;   // possible space for flags
-    uint32_t data[DataSize];
+//     uint32_t type : 2;
+//     uint32_t unused : 30;   // possible space for flags
+//     uint32_t data[DataSize];
+// };
+
+enum MaterialType : uint32_t    // enum class?
+{
+    DIFFUSE,
+    CONDUCTOR,
+    DIELECTRIC
 };
 
 template<class DiffuseBxDF, class ConductorBxDF, class DielectricBxDF>  // NOTE: these bxdfs should match the ones in Scene BxDFNode
@@ -59,23 +66,23 @@ struct System
         return retval;
     }
 
-    measure_type eval(NBL_CONST_REF_ARG(Material) material, NBL_CONST_REF_ARG(create_params_t) cparams, NBL_CONST_REF_ARG(params_t) params)
+    measure_type eval(uint32_t material, NBL_CONST_REF_ARG(create_params_t) cparams, NBL_CONST_REF_ARG(params_t) params)
     {
-        switch(material.type)
+        switch(material)
         {
-            case Material::Type::DIFFUSE:
+            case MaterialType::DIFFUSE:
             {
                 diffuseBxDF.init(cparams);
                 return (measure_type)diffuseBxDF.eval(params);
             }
             break;
-            case Material::Type::CONDUCTOR:
+            case MaterialType::CONDUCTOR:
             {
                 conductorBxDF.init(cparams);
                 return conductorBxDF.eval(params);
             }
             break;
-            case Material::Type::DIELECTRIC:
+            case MaterialType::DIELECTRIC:
             {
                 dielectricBxDF.init(cparams);
                 return dielectricBxDF.eval(params);
@@ -86,23 +93,23 @@ struct System
         }
     }
 
-    sample_type generate(NBL_CONST_REF_ARG(Material) material, NBL_CONST_REF_ARG(create_params_t) cparams, NBL_CONST_REF_ARG(anisotropic_type) interaction, NBL_CONST_REF_ARG(vector3_type) u, NBL_REF_ARG(anisocache_type) _cache)
+    sample_type generate(uint32_t material, NBL_CONST_REF_ARG(create_params_t) cparams, NBL_CONST_REF_ARG(anisotropic_type) interaction, NBL_CONST_REF_ARG(vector3_type) u, NBL_REF_ARG(anisocache_type) _cache)
     {
-        switch(material.type)
+        switch(material)
         {
-            case Material::Type::DIFFUSE:
+            case MaterialType::DIFFUSE:
             {
                 diffuseBxDF.init(cparams);
                 return diffuseBxDF.generate(interaction, u.xy);
             }
             break;
-            case Material::Type::CONDUCTOR:
+            case MaterialType::CONDUCTOR:
             {
                 conductorBxDF.init(cparams);
                 return conductorBxDF.generate(interaction, u.xy, _cache);
             }
             break;
-            case Material::Type::DIELECTRIC:
+            case MaterialType::DIELECTRIC:
             {
                 dielectricBxDF.init(cparams);
                 return dielectricBxDF.generate(interaction, u, _cache);
@@ -121,26 +128,26 @@ struct System
         return sample_type::create(L, 0, (vector3_type)0);
     }
 
-    quotient_pdf_type quotient_and_pdf(NBL_CONST_REF_ARG(Material) material, NBL_CONST_REF_ARG(create_params_t) cparams, NBL_CONST_REF_ARG(params_t) params)
+    quotient_pdf_type quotient_and_pdf(uint32_t material, NBL_CONST_REF_ARG(create_params_t) cparams, NBL_CONST_REF_ARG(params_t) params)
     {
         const float minimumProjVectorLen = 0.00000001;
         if (params.NdotV > minimumProjVectorLen && params.NdotL > minimumProjVectorLen)
         {
-            switch(material.type)
+            switch(material)
             {
-                case Material::Type::DIFFUSE:
+                case MaterialType::DIFFUSE:
                 {
                     diffuseBxDF.init(cparams);
                     return diffuseBxDF.quotient_and_pdf(params);
                 }
                 break;
-                case Material::Type::CONDUCTOR:
+                case MaterialType::CONDUCTOR:
                 {
                     conductorBxDF.init(cparams);
                     return conductorBxDF.quotient_and_pdf(params);
                 }
                 break;
-                case Material::Type::DIELECTRIC:
+                case MaterialType::DIELECTRIC:
                 {
                     dielectricBxDF.init(cparams);
                     return dielectricBxDF.quotient_and_pdf(params);
diff --git a/31_HLSLPathTracer/app_resources/hlsl/pathtracer.hlsl b/31_HLSLPathTracer/app_resources/hlsl/pathtracer.hlsl
index 5c01db852..553094e21 100644
--- a/31_HLSLPathTracer/app_resources/hlsl/pathtracer.hlsl
+++ b/31_HLSLPathTracer/app_resources/hlsl/pathtracer.hlsl
@@ -150,8 +150,8 @@ struct Unidirectional
 
         // TODO: ifdef kill diffuse specular paths
 
-        const bool isBSDF = (bxdf.materialType == ext::MaterialSystem::Material::Type::DIFFUSE) ? bxdf_traits<diffuse_op_type>::type == BT_BSDF :
-                            (bxdf.materialType == ext::MaterialSystem::Material::Type::CONDUCTOR) ? bxdf_traits<conductor_op_type>::type == BT_BSDF :
+        const bool isBSDF = (bxdf.materialType == ext::MaterialSystem::MaterialType::DIFFUSE) ? bxdf_traits<diffuse_op_type>::type == BT_BSDF :
+                            (bxdf.materialType == ext::MaterialSystem::MaterialType::CONDUCTOR) ? bxdf_traits<conductor_op_type>::type == BT_BSDF :
                             bxdf_traits<dielectric_op_type>::type == BT_BSDF;
 
         vector3_type eps0 = rand3d(depth, _sample, 0u);
@@ -193,11 +193,8 @@ struct Unidirectional
                     ray.payload.accumulation += vector3_type(0.f, 1000.f, 0.f);
                 else if (validPath)
                 {
-                    ext::MaterialSystem::Material material;
-                    material.type = bxdf.materialType;
-
                     bxdf::BxDFClampMode _clamp;
-                    _clamp = (bxdf.materialType == ext::MaterialSystem::Material::Type::DIELECTRIC) ? bxdf::BxDFClampMode::BCM_ABS : bxdf::BxDFClampMode::BCM_MAX;
+                    _clamp = (bxdf.materialType == ext::MaterialSystem::MaterialType::DIELECTRIC) ? bxdf::BxDFClampMode::BCM_ABS : bxdf::BxDFClampMode::BCM_MAX;
                     // example only uses isotropic bxdfs
                     params_type params = params_type::template create<sample_type, isotropic_type, isocache_type>(nee_sample, interaction.isotropic, _cache.iso_cache, _clamp);
 
@@ -231,7 +228,7 @@ struct Unidirectional
                     //     }
                     // }
 
-                    quotient_pdf_type bsdf_quotient_pdf = materialSystem.quotient_and_pdf(material, bxdf.params, params);
+                    quotient_pdf_type bsdf_quotient_pdf = materialSystem.quotient_and_pdf(bxdf.materialType, bxdf.params, params);
                     neeContrib_pdf.quotient *= bxdf.albedo * throughput * bsdf_quotient_pdf.quotient;
                     const scalar_type otherGenOverChoice = bsdf_quotient_pdf.pdf * rcpChoiceProb;
                     const scalar_type otherGenOverLightAndChoice = otherGenOverChoice / bsdf_quotient_pdf.pdf;
@@ -256,14 +253,11 @@ struct Unidirectional
         scalar_type bxdfPdf;
         vector3_type bxdfSample;
         {
-            ext::MaterialSystem::Material material;
-            material.type = bxdf.materialType;
-
             anisocache_type _cache;
-            sample_type bsdf_sample = materialSystem.generate(material, bxdf.params, interaction, eps1, _cache);
+            sample_type bsdf_sample = materialSystem.generate(bxdf.materialType, bxdf.params, interaction, eps1, _cache);
 
             bxdf::BxDFClampMode _clamp;
-            _clamp = (bxdf.materialType == ext::MaterialSystem::Material::Type::DIELECTRIC) ? bxdf::BxDFClampMode::BCM_ABS : bxdf::BxDFClampMode::BCM_MAX;
+            _clamp = (bxdf.materialType == ext::MaterialSystem::MaterialType::DIELECTRIC) ? bxdf::BxDFClampMode::BCM_ABS : bxdf::BxDFClampMode::BCM_MAX;
             // example only uses isotropic bxdfs
             params_type params = params_type::template create<sample_type, isotropic_type, isocache_type>(bsdf_sample, interaction.isotropic, _cache.iso_cache, _clamp);
 
@@ -299,7 +293,7 @@ struct Unidirectional
             // }
 
             // the value of the bsdf divided by the probability of the sample being generated
-            quotient_pdf_type bsdf_quotient_pdf = materialSystem.quotient_and_pdf(material, bxdf.params, params);
+            quotient_pdf_type bsdf_quotient_pdf = materialSystem.quotient_and_pdf(bxdf.materialType, bxdf.params, params);
             throughput *= bxdf.albedo * bsdf_quotient_pdf.quotient;
             bxdfPdf = bsdf_quotient_pdf.pdf;
             bxdfSample = bsdf_sample.L.direction;
diff --git a/31_HLSLPathTracer/app_resources/hlsl/render.comp.hlsl b/31_HLSLPathTracer/app_resources/hlsl/render.comp.hlsl
index 065d93b7b..5dea2d1bf 100644
--- a/31_HLSLPathTracer/app_resources/hlsl/render.comp.hlsl
+++ b/31_HLSLPathTracer/app_resources/hlsl/render.comp.hlsl
@@ -108,13 +108,13 @@ static const light_type lights[LIGHT_COUNT] = {
 
 #define BXDF_COUNT 7
 static const bxdfnode_type bxdfs[BXDF_COUNT] = {
-    bxdfnode_type::create(ext::MaterialSystem::Material::Type::DIFFUSE, false, float2(0,0), spectral_t(0.8,0.8,0.8)),
-    bxdfnode_type::create(ext::MaterialSystem::Material::Type::DIFFUSE, false, float2(0,0), spectral_t(0.8,0.4,0.4)),
-    bxdfnode_type::create(ext::MaterialSystem::Material::Type::DIFFUSE, false, float2(0,0), spectral_t(0.4,0.8,0.4)),
-    bxdfnode_type::create(ext::MaterialSystem::Material::Type::CONDUCTOR, false, float2(0,0), spectral_t(1,1,1), spectral_t(0.98,0.98,0.77)),
-    bxdfnode_type::create(ext::MaterialSystem::Material::Type::CONDUCTOR, false, float2(0,0), spectral_t(1,1,1), spectral_t(0.98,0.77,0.98)),
-    bxdfnode_type::create(ext::MaterialSystem::Material::Type::CONDUCTOR, false, float2(0.15,0.15), spectral_t(1,1,1), spectral_t(0.98,0.77,0.98)),
-    bxdfnode_type::create(ext::MaterialSystem::Material::Type::DIELECTRIC, false, float2(0.0625,0.0625), spectral_t(1,1,1), spectral_t(0.71,0.69,0.67))
+    bxdfnode_type::create(ext::MaterialSystem::MaterialType::DIFFUSE, false, float2(0,0), spectral_t(0.8,0.8,0.8)),
+    bxdfnode_type::create(ext::MaterialSystem::MaterialType::DIFFUSE, false, float2(0,0), spectral_t(0.8,0.4,0.4)),
+    bxdfnode_type::create(ext::MaterialSystem::MaterialType::DIFFUSE, false, float2(0,0), spectral_t(0.4,0.8,0.4)),
+    bxdfnode_type::create(ext::MaterialSystem::MaterialType::CONDUCTOR, false, float2(0,0), spectral_t(1,1,1), spectral_t(0.98,0.98,0.77)),
+    bxdfnode_type::create(ext::MaterialSystem::MaterialType::CONDUCTOR, false, float2(0,0), spectral_t(1,1,1), spectral_t(0.98,0.77,0.98)),
+    bxdfnode_type::create(ext::MaterialSystem::MaterialType::CONDUCTOR, false, float2(0.15,0.15), spectral_t(1,1,1), spectral_t(0.98,0.77,0.98)),
+    bxdfnode_type::create(ext::MaterialSystem::MaterialType::DIELECTRIC, false, float2(0.0625,0.0625), spectral_t(1,1,1), spectral_t(0.71,0.69,0.67))
 };
 
 [numthreads(WorkgroupSize, WorkgroupSize, 1)]
diff --git a/31_HLSLPathTracer/main.cpp b/31_HLSLPathTracer/main.cpp
index 30a0fad8d..46597d738 100644
--- a/31_HLSLPathTracer/main.cpp
+++ b/31_HLSLPathTracer/main.cpp
@@ -1350,7 +1350,7 @@ class HLSLComputePathtracer final : public examples::SimpleWindowedApplication,
 		int PTPipline = E_LIGHT_GEOMETRY::ELG_SPHERE;
 		int renderMode = E_RENDER_MODE::ERM_HLSL;
 		int spp = 32;
-		int depth = 1;
+		int depth = 3;
 
 		bool m_firstFrame = true;
 		IGPUCommandBuffer::SClearColorValue clearColor = { .float32 = {0.f,0.f,0.f,1.f} };

From 0232ee8d04c1159ec22fe9ca1e406dc462e09970 Mon Sep 17 00:00:00 2001
From: Przemek <minikers21@gmail.com>
Date: Thu, 13 Mar 2025 16:52:13 +0100
Subject: [PATCH 079/529] Drawing triangle sdf

---
 62_CAD/DrawResourcesFiller.cpp                |  2 +-
 62_CAD/main.cpp                               |  8 +++
 62_CAD/shaders/main_pipeline/common.hlsl      |  6 +-
 .../main_pipeline/fragment_shader.hlsl        | 69 ++++++++++++++++---
 4 files changed, 71 insertions(+), 14 deletions(-)

diff --git a/62_CAD/DrawResourcesFiller.cpp b/62_CAD/DrawResourcesFiller.cpp
index 291e0ad88..424569b8f 100644
--- a/62_CAD/DrawResourcesFiller.cpp
+++ b/62_CAD/DrawResourcesFiller.cpp
@@ -256,7 +256,7 @@ void DrawResourcesFiller::drawTriangleMesh(const CTriangleMesh& mesh, CTriangleM
 
 	// call addMainObject_SubmitIfNeeded, use its index in push constants
 
-	drawData.pushConstants.triangleMeshMainObjectIndex = addMainObject_SubmitIfNeeded(0, intendedNextSubmit);
+	drawData.pushConstants.triangleMeshMainObjectIndex = addMainObject_SubmitIfNeeded(InvalidStyleIdx, intendedNextSubmit);
 
 	// TODO: use this function later for auto submit
 	//submitCurrentDrawObjectsAndReset(intendedNextSubmit, 0);
diff --git a/62_CAD/main.cpp b/62_CAD/main.cpp
index a14d9de55..3aad8c4d3 100644
--- a/62_CAD/main.cpp
+++ b/62_CAD/main.cpp
@@ -1501,6 +1501,14 @@ class ComputerAidedDesign final : public examples::SimpleWindowedApplication, pu
 		retval.fragmentShaderPixelInterlock = FragmentShaderPixelInterlock;
 		return retval;
 	}
+
+	virtual video::SPhysicalDeviceLimits getRequiredDeviceLimits() const override
+	{
+		video::SPhysicalDeviceLimits retval = base_t::getRequiredDeviceLimits();
+		retval.fragmentShaderBarycentric = true;
+
+		return retval;
+	}
 		
 	virtual video::IAPIConnection::SFeatures getAPIFeaturesToEnable() override
 	{
diff --git a/62_CAD/shaders/main_pipeline/common.hlsl b/62_CAD/shaders/main_pipeline/common.hlsl
index a0a903a4d..4fd45ab5c 100644
--- a/62_CAD/shaders/main_pipeline/common.hlsl
+++ b/62_CAD/shaders/main_pipeline/common.hlsl
@@ -75,10 +75,10 @@ struct PSInput
     // Data segments that need interpolation, mostly for hatches
     [[vk::location(5)]] float2 interp_data5 : COLOR5;
 #ifdef FRAGMENT_SHADER_INPUT
-    [[vk::location(6)]] [[vk::ext_decorate(/*spv::DecoratePerVertexKHR*/5285)]] nointerpolation float3 vertexScreenSpacePos[3] : COLOR6;
+    [[vk::location(6)]] [[vk::ext_decorate(/*spv::DecoratePerVertexKHR*/5285)]] float3 vertexScreenSpacePos[3] : COLOR6;
 #else
-    [[vk::location(6)]] nointerpolation float3 vertexScreenSpacePos : COLOR6;
-#endif 
+    [[vk::location(6)]] float3 vertexScreenSpacePos : COLOR6;
+#endif
     // ArcLenCalculator<float>
 
     // Set functions used in vshader, get functions used in fshader
diff --git a/62_CAD/shaders/main_pipeline/fragment_shader.hlsl b/62_CAD/shaders/main_pipeline/fragment_shader.hlsl
index ab5885d3d..845cb36d7 100644
--- a/62_CAD/shaders/main_pipeline/fragment_shader.hlsl
+++ b/62_CAD/shaders/main_pipeline/fragment_shader.hlsl
@@ -407,22 +407,71 @@ float32_t4 calculateFinalColor<true>(const uint2 fragCoord, const float localAlp
 [shader("pixel")]
 float4 fragMain(PSInput input) : SV_TARGET
 {
-    float3 v0 = input.getScreenSpaceVertexPos(0);
-    float3 v1 = input.getScreenSpaceVertexPos(1);
-    float3 v2 = input.getScreenSpaceVertexPos(2);
-
-    printf("v0 = { %f, %f, %f }\nv1 = { %f, %f, %f }\nv2 = { %f, %f, %f }", v0.x, v0.y, v0.z, v1.x, v1.y, v1.z, v2.x, v2.y, v2.z);
-
-    return float4(1.0f, 0.0f, 0.0f, 1.0f);
-
     float localAlpha = 0.0f;
     float3 textureColor = float3(0, 0, 0); // color sampled from a texture
 
-    // TODO[Przemek]: Disable All the object rendering paths if you want.
     ObjectType objType = input.getObjType();
     const uint32_t currentMainObjectIdx = input.getMainObjectIdx();
     const MainObject mainObj = mainObjects[currentMainObjectIdx];
-    
+
+    // TRIANGLE RENDERING
+    {
+        float3 v0 = input.getScreenSpaceVertexPos(0);
+        float3 v1 = input.getScreenSpaceVertexPos(1);
+        float3 v2 = input.getScreenSpaceVertexPos(2);
+
+        float2 start;
+        float2 end;
+        const float3 baryCoord = nbl::hlsl::spirv::BaryCoordKHR;
+
+        // TODO: figure out if branching can be reduced
+        if (baryCoord.x < baryCoord.y && baryCoord.x < baryCoord.z)
+        {
+            start = v1;
+            end = v2;
+        }
+        else if (baryCoord.y < baryCoord.x && baryCoord.y < baryCoord.z)
+        {
+            start = v0;
+            end = v2;
+        }
+        else if (baryCoord.z < baryCoord.x && baryCoord.z < baryCoord.y)
+        {
+            start = v0;
+            end = v1;
+        }
+
+        float distance = nbl::hlsl::numeric_limits<float>::max;
+        const uint32_t styleIdx = mainObj.styleIdx;
+        const float thickness = 2.0f;
+        const float phaseShift = 0.0f;
+        const float stretch = 0.0f;
+        const float worldToScreenRatio = input.getCurrentWorldToScreenRatio();
+
+        nbl::hlsl::shapes::Line<float> lineSegment = nbl::hlsl::shapes::Line<float>::construct(start, end);
+        nbl::hlsl::shapes::Line<float>::ArcLengthCalculator arcLenCalc = nbl::hlsl::shapes::Line<float>::ArcLengthCalculator::construct(lineSegment);
+
+        LineStyle style = lineStyles[styleIdx];
+
+        // TODO: stipples
+        //if (!style.hasStipples() || stretch == InvalidStyleStretchValue)
+        //{
+            //distance = ClippedSignedDistance< nbl::hlsl::shapes::Line<float> >::sdf(lineSegment, input.position.xy, thickness, style.isRoadStyleFlag);
+        //}
+        //else
+        //{
+        //    LineStyleClipper clipper = LineStyleClipper::construct(lineStyles[styleIdx], lineSegment, arcLenCalc, phaseShift, stretch, worldToScreenRatio);
+        //    distance = ClippedSignedDistance<nbl::hlsl::shapes::Line<float>, LineStyleClipper>::sdf(lineSegment, input.position.xy, thickness, style.isRoadStyleFlag, clipper);
+        //}
+
+        distance = ClippedSignedDistance< nbl::hlsl::shapes::Line<float> >::sdf(lineSegment, input.position.xy, thickness, true);
+
+        localAlpha = smoothstep(+globals.antiAliasingFactor, -globals.antiAliasingFactor, distance);
+    }
+
+    textureColor = float3(1.0f, 1.0f, 1.0f);
+    return calculateFinalColor<nbl::hlsl::jit::device_capabilities::fragmentShaderPixelInterlock>(uint2(input.position.xy), localAlpha, currentMainObjectIdx, textureColor, true);
+
     // figure out local alpha with sdf
     if (objType == ObjectType::LINE || objType == ObjectType::QUAD_BEZIER || objType == ObjectType::POLYLINE_CONNECTOR)
     {

From 4d1dca47c8e95081341cbf5fb31ab64b27fa8e1b Mon Sep 17 00:00:00 2001
From: kevyuu <kevin.kayu@gmail.com>
Date: Thu, 13 Mar 2025 23:54:12 +0700
Subject: [PATCH 080/529] Small fixes on ray trace pipeline demo.

---
 71_RayTracingPipeline/app_resources/common.hlsl           | 2 +-
 71_RayTracingPipeline/app_resources/light_spot.rcall.hlsl | 2 +-
 71_RayTracingPipeline/app_resources/raytrace.rgen.hlsl    | 2 +-
 3 files changed, 3 insertions(+), 3 deletions(-)

diff --git a/71_RayTracingPipeline/app_resources/common.hlsl b/71_RayTracingPipeline/app_resources/common.hlsl
index 5b69c4a76..6c052dff1 100644
--- a/71_RayTracingPipeline/app_resources/common.hlsl
+++ b/71_RayTracingPipeline/app_resources/common.hlsl
@@ -64,7 +64,7 @@ struct MaterialPacked
     bool isTransparent() NBL_CONST_MEMBER_FUNC
     {
         return alpha != MAX_UNORM_10;
-}
+    }
 };
 
 inline MaterialPacked packMaterial(Material material)
diff --git a/71_RayTracingPipeline/app_resources/light_spot.rcall.hlsl b/71_RayTracingPipeline/app_resources/light_spot.rcall.hlsl
index fcb130104..f298e4643 100644
--- a/71_RayTracingPipeline/app_resources/light_spot.rcall.hlsl
+++ b/71_RayTracingPipeline/app_resources/light_spot.rcall.hlsl
@@ -10,7 +10,7 @@ void main(inout RayLight cLight)
     cLight.outIntensity = LightIntensity / (cLight.outLightDistance * cLight.outLightDistance);
     cLight.outLightDir = normalize(lDir);
     float theta = dot(cLight.outLightDir, normalize(-pc.light.direction));
-    float epsilon = 1 - pc.light.outerCutoff;
+    float epsilon = 1.f - pc.light.outerCutoff;
     float spotIntensity = clamp((theta - pc.light.outerCutoff) / epsilon, 0.0, 1.0);
     cLight.outIntensity *= spotIntensity;
 }
diff --git a/71_RayTracingPipeline/app_resources/raytrace.rgen.hlsl b/71_RayTracingPipeline/app_resources/raytrace.rgen.hlsl
index df6a5215d..f15b424ea 100644
--- a/71_RayTracingPipeline/app_resources/raytrace.rgen.hlsl
+++ b/71_RayTracingPipeline/app_resources/raytrace.rgen.hlsl
@@ -91,7 +91,7 @@ void main()
 
             uint32_t shadowRayFlags = RAY_FLAG_ACCEPT_FIRST_HIT_AND_END_SEARCH | RAY_FLAG_FORCE_NON_OPAQUE | RAY_FLAG_SKIP_CLOSEST_HIT_SHADER;
             OcclusionPayload occlusionPayload;
-            occlusionPayload.attenuation = 1; // negative attenuation indicate occlusion happening. will be multiplied by -1 in miss shader.
+            occlusionPayload.attenuation = 1;
             TraceRay(topLevelAS, shadowRayFlags, 0xFF, ERT_OCCLUSION, 0, EMT_OCCLUSION, rayDesc, occlusionPayload);
 
             attenuation = occlusionPayload.attenuation;

From cc84091d68ceedeb954c1867b3420e0e5119789b Mon Sep 17 00:00:00 2001
From: kevyuu <kevin.kayu@gmail.com>
Date: Thu, 13 Mar 2025 23:55:45 +0700
Subject: [PATCH 081/529] Optimize ray tracing demo occlusion tracing

---
 .../app_resources/raytrace.rgen.hlsl                |  2 +-
 .../app_resources/raytrace_shadow.rahit.hlsl        | 13 ++-----------
 .../raytrace_shadow_triangle.rchit.hlsl             |  7 +++++++
 71_RayTracingPipeline/main.cpp                      |  8 +++++++-
 4 files changed, 17 insertions(+), 13 deletions(-)
 create mode 100644 71_RayTracingPipeline/app_resources/raytrace_shadow_triangle.rchit.hlsl

diff --git a/71_RayTracingPipeline/app_resources/raytrace.rgen.hlsl b/71_RayTracingPipeline/app_resources/raytrace.rgen.hlsl
index f15b424ea..c74774880 100644
--- a/71_RayTracingPipeline/app_resources/raytrace.rgen.hlsl
+++ b/71_RayTracingPipeline/app_resources/raytrace.rgen.hlsl
@@ -89,7 +89,7 @@ void main()
             rayDesc.TMin = 0.01;
             rayDesc.TMax = cLight.outLightDistance;
 
-            uint32_t shadowRayFlags = RAY_FLAG_ACCEPT_FIRST_HIT_AND_END_SEARCH | RAY_FLAG_FORCE_NON_OPAQUE | RAY_FLAG_SKIP_CLOSEST_HIT_SHADER;
+            uint32_t shadowRayFlags = RAY_FLAG_ACCEPT_FIRST_HIT_AND_END_SEARCH;
             OcclusionPayload occlusionPayload;
             occlusionPayload.attenuation = 1;
             TraceRay(topLevelAS, shadowRayFlags, 0xFF, ERT_OCCLUSION, 0, EMT_OCCLUSION, rayDesc, occlusionPayload);
diff --git a/71_RayTracingPipeline/app_resources/raytrace_shadow.rahit.hlsl b/71_RayTracingPipeline/app_resources/raytrace_shadow.rahit.hlsl
index c59f7367e..e76f1da55 100644
--- a/71_RayTracingPipeline/app_resources/raytrace_shadow.rahit.hlsl
+++ b/71_RayTracingPipeline/app_resources/raytrace_shadow.rahit.hlsl
@@ -9,15 +9,6 @@ void main(inout OcclusionPayload payload, in BuiltInTriangleIntersectionAttribut
     const STriangleGeomInfo geom = vk::RawBufferLoad < STriangleGeomInfo > (pc.triangleGeomInfoBuffer + instID * sizeof(STriangleGeomInfo));
     const Material material = unpackMaterial(geom.material);
     
-    if (material.isTransparent())
-    {
-        payload.attenuation = material.alpha * payload.attenuation;
-        IgnoreHit();
-    }
-    else
-    {
-        payload.attenuation = 0;
-        AcceptHitAndEndSearch();
-    }
-
+    payload.attenuation = material.alpha * payload.attenuation;
+    IgnoreHit();
 }
diff --git a/71_RayTracingPipeline/app_resources/raytrace_shadow_triangle.rchit.hlsl b/71_RayTracingPipeline/app_resources/raytrace_shadow_triangle.rchit.hlsl
new file mode 100644
index 000000000..c85c7c32d
--- /dev/null
+++ b/71_RayTracingPipeline/app_resources/raytrace_shadow_triangle.rchit.hlsl
@@ -0,0 +1,7 @@
+#include "common.hlsl"
+
+[shader("closesthit")]
+void main(inout OcclusionPayload payload, in BuiltInTriangleIntersectionAttributes attribs)
+{
+    payload.attenuation = 0;
+}
diff --git a/71_RayTracingPipeline/main.cpp b/71_RayTracingPipeline/main.cpp
index 4106a958f..cb7ef1d8e 100644
--- a/71_RayTracingPipeline/main.cpp
+++ b/71_RayTracingPipeline/main.cpp
@@ -164,6 +164,7 @@ class RaytracingPipelineApp final : public examples::SimpleWindowedApplication,
     const auto anyHitShaderShadowPayload = loadCompileAndCreateShader("app_resources/raytrace_shadow.rahit.hlsl");
     const auto missShader = loadCompileAndCreateShader("app_resources/raytrace.rmiss.hlsl");
     const auto shadowMissShader = loadCompileAndCreateShader("app_resources/raytrace_shadow.rmiss.hlsl");
+    const auto shadowClosestHitShader = loadCompileAndCreateShader("app_resources/raytrace_shadow_triangle.rchit.hlsl");
     const auto directionalLightCallShader = loadCompileAndCreateShader("app_resources/light_directional.rcall.hlsl");
     const auto pointLightCallShader = loadCompileAndCreateShader("app_resources/light_point.rcall.hlsl");
     const auto spotLightCallShader = loadCompileAndCreateShader("app_resources/light_spot.rcall.hlsl");
@@ -324,6 +325,7 @@ class RaytracingPipelineApp final : public examples::SimpleWindowedApplication,
         RTDS_RAYGEN,
         RTDS_MISS,
         RTDS_SHADOW_MISS,
+        RTDS_CLOSEST_HIT_SHADOW,
         RTDS_CLOSEST_HIT,
         RTDS_SPHERE_CLOSEST_HIT,
         RTDS_ANYHIT_PRIMARY,
@@ -339,6 +341,7 @@ class RaytracingPipelineApp final : public examples::SimpleWindowedApplication,
       shaders[RTDS_RAYGEN] = {.shader = raygenShader.get()};
       shaders[RTDS_MISS] = {.shader = missShader.get()};
       shaders[RTDS_SHADOW_MISS] = {.shader = shadowMissShader.get()};
+      shaders[RTDS_CLOSEST_HIT_SHADOW] = { .shader = shadowClosestHitShader.get() };
       shaders[RTDS_CLOSEST_HIT] = {.shader = closestHitShader.get()};
       shaders[RTDS_SPHERE_CLOSEST_HIT] = {.shader = proceduralClosestHitShader.get()};
       shaders[RTDS_ANYHIT_PRIMARY] = {.shader = anyHitShaderColorPayload.get()};
@@ -350,6 +353,7 @@ class RaytracingPipelineApp final : public examples::SimpleWindowedApplication,
 
       params.layout = pipelineLayout.get();
       params.shaders = std::span(shaders);
+      params.flags = IGPURayTracingPipeline::SCreationParams::FLAGS::NO_NULL_INTERSECTION_SHADERS;
 
       auto& shaderGroups = params.shaderGroups;
 
@@ -357,7 +361,7 @@ class RaytracingPipelineApp final : public examples::SimpleWindowedApplication,
 
       IRayTracingPipelineBase::SGeneralShaderGroup missGroups[EMT_COUNT];
       missGroups[EMT_PRIMARY] = { .index = RTDS_MISS };
-      missGroups[EMT_OCCLUSION] = { .index = RTDS_SHADOW_MISS };
+      missGroups[EMT_OCCLUSION] = { .index = IGPURayTracingPipeline::SGeneralShaderGroup::Unused };
       shaderGroups.misses = missGroups;
 
       auto getHitGroupIndex = [](E_GEOM_TYPE geomType, E_RAY_TYPE rayType)
@@ -370,6 +374,7 @@ class RaytracingPipelineApp final : public examples::SimpleWindowedApplication,
         .anyHit = RTDS_ANYHIT_PRIMARY,
       };
       hitGroups[getHitGroupIndex(EGT_TRIANGLES, ERT_OCCLUSION)] = {
+        .closestHit = RTDS_CLOSEST_HIT_SHADOW,
         .anyHit = RTDS_ANYHIT_SHADOW,
       };
       hitGroups[getHitGroupIndex(EGT_PROCEDURAL, ERT_PRIMARY)] = {
@@ -378,6 +383,7 @@ class RaytracingPipelineApp final : public examples::SimpleWindowedApplication,
         .intersectionShader = RTDS_INTERSECTION,
       };
       hitGroups[getHitGroupIndex(EGT_PROCEDURAL, ERT_OCCLUSION)] = {
+        .closestHit = RTDS_CLOSEST_HIT_SHADOW,
         .anyHit = RTDS_ANYHIT_SHADOW,
         .intersectionShader = RTDS_INTERSECTION,
       };

From 63b64e3182dc395d83ada3fe95f46b5febc41d29 Mon Sep 17 00:00:00 2001
From: keptsecret <sorchon@gmail.com>
Date: Fri, 14 Mar 2025 11:14:01 +0700
Subject: [PATCH 082/529] made scene a static global var

---
 .../app_resources/hlsl/render.comp.hlsl       | 70 +++++++++----------
 .../app_resources/hlsl/scene.hlsl             | 67 +++++++++++++++---
 2 files changed, 91 insertions(+), 46 deletions(-)

diff --git a/31_HLSLPathTracer/app_resources/hlsl/render.comp.hlsl b/31_HLSLPathTracer/app_resources/hlsl/render.comp.hlsl
index 5dea2d1bf..f8cf2ae22 100644
--- a/31_HLSLPathTracer/app_resources/hlsl/render.comp.hlsl
+++ b/31_HLSLPathTracer/app_resources/hlsl/render.comp.hlsl
@@ -6,9 +6,6 @@
 #include "nbl/builtin/hlsl/bxdf/reflection.hlsl"
 #include "nbl/builtin/hlsl/bxdf/transmission.hlsl"
 
-#include "render_common.hlsl"
-#include "pathtracer.hlsl"
-
 // add these defines (one at a time) using -D argument to dxc
 // #define SPHERE_LIGHT
 // #define TRIANGLE_LIGHT
@@ -17,10 +14,33 @@
 #ifdef SPHERE_LIGHT
 #define SPHERE_COUNT 9
 #define LIGHT_TYPE ext::PST_SPHERE
-#else
+
+#define TRIANGLE_COUNT 0
+#define RECTANGLE_COUNT 0
+#endif
+
+#ifdef TRIANGLE_LIGHT
+#define TRIANGLE_COUNT 1
+#define LIGHT_TYPE ext::PST_TRIANGLE
+
+#define SPHERE_COUNT 8
+#define RECTANGLE_COUNT 0
+#endif
+
+#ifdef RECTANGLE_LIGHT
+#define RECTANGLE_COUNT 1
+#define LIGHT_TYPE ext::PST_RECTANGLE
+
 #define SPHERE_COUNT 8
+#define TRIANGLE_COUNT 0
 #endif
 
+#define LIGHT_COUNT 1
+#define BXDF_COUNT 7
+
+#include "render_common.hlsl"
+#include "pathtracer.hlsl"
+
 using namespace nbl::hlsl;
 
 NBL_CONSTEXPR uint32_t WorkgroupSize = 32;
@@ -80,22 +100,21 @@ static const ext::Shape<ext::PST_SPHERE> spheres[SPHERE_COUNT] = {
 };
 
 #ifdef TRIANGLE_LIGHT
-#define LIGHT_TYPE ext::PST_TRIANGLE
-#define TRIANGLE_COUNT 1
 static const ext::Shape<ext::PST_TRIANGLE> triangles[TRIANGLE_COUNT] = {
     ext::Shape<ext::PST_TRIANGLE>::create(float3(-1.8,0.35,0.3) * 10.0, float3(-1.2,0.35,0.0) * 10.0, float3(-1.5,0.8,-0.3) * 10.0, bxdfnode_type::INVALID_ID, 0u)
 };
+#else
+static const ext::Shape<ext::PST_TRIANGLE> triangles[1];
 #endif
 
 #ifdef RECTANGLE_LIGHT
-#define LIGHT_TYPE ext::PST_RECTANGLE
-#define RECTANGLE_COUNT 1
 static const ext::Shape<ext::PST_RECTANGLE> rectangles[RECTANGLE_COUNT] = {
     ext::Shape<ext::PST_RECTANGLE>::create(float3(-3.8,0.35,1.3), normalize(float3(2,0,-1))*7.0, normalize(float3(2,-5,4))*0.1, bxdfnode_type::INVALID_ID, 0u)
 };
+#else
+static const ext::Shape<ext::PST_RECTANGLE> rectangles[1];
 #endif
 
-#define LIGHT_COUNT 1
 static const light_type lights[LIGHT_COUNT] = {
     light_type::create(spectral_t(30.0,25.0,15.0),
 #ifdef SPHERE_LIGHT
@@ -106,7 +125,6 @@ static const light_type lights[LIGHT_COUNT] = {
         ext::IntersectMode::IM_PROCEDURAL, LIGHT_TYPE)
 };
 
-#define BXDF_COUNT 7
 static const bxdfnode_type bxdfs[BXDF_COUNT] = {
     bxdfnode_type::create(ext::MaterialSystem::MaterialType::DIFFUSE, false, float2(0,0), spectral_t(0.8,0.8,0.8)),
     bxdfnode_type::create(ext::MaterialSystem::MaterialType::DIFFUSE, false, float2(0,0), spectral_t(0.8,0.4,0.4)),
@@ -117,6 +135,12 @@ static const bxdfnode_type bxdfs[BXDF_COUNT] = {
     bxdfnode_type::create(ext::MaterialSystem::MaterialType::DIELECTRIC, false, float2(0.0625,0.0625), spectral_t(1,1,1), spectral_t(0.71,0.69,0.67))
 };
 
+static const ext::Scene<light_type, bxdfnode_type> scene = ext::Scene<light_type, bxdfnode_type>::create(
+    spheres, triangles, rectangles,
+    SPHERE_COUNT, TRIANGLE_COUNT, RECTANGLE_COUNT,
+    lights, LIGHT_COUNT, bxdfs, BXDF_COUNT
+);
+
 [numthreads(WorkgroupSize, WorkgroupSize, 1)]
 void main(uint32_t3 threadID : SV_DispatchThreadID)
 {
@@ -164,32 +188,6 @@ void main(uint32_t3 threadID : SV_DispatchThreadID)
 
     pathtracer_type pathtracer = pathtracer_type::create(ptCreateParams);
 
-    // set up scene (can do as global var?)
-    ext::Scene<light_type, bxdfnode_type> scene;
-    scene.sphereCount = SPHERE_COUNT;
-    for (uint32_t i = 0; i < SPHERE_COUNT; i++)
-        scene.spheres[i] = spheres[i];
-#ifdef TRIANGLE_LIGHT
-    scene.triangleCount = TRIANGLE_COUNT;
-    for (uint32_t i = 0; i < TRIANGLE_COUNT; i++)
-        scene.triangles[i] = triangles[i];
-#else
-    scene.triangleCount = 0;
-#endif
-#ifdef RECTANGLE_LIGHT
-    scene.rectangleCount = RECTANGLE_COUNT;
-    for (uint32_t i = 0; i < RECTANGLE_COUNT; i++)
-        scene.rectangles[i] = rectangles[i];
-#else
-    scene.rectangleCount = 0;
-#endif
-    scene.lightCount = LIGHT_COUNT;
-    for (uint32_t i = 0; i < LIGHT_COUNT; i++)
-        scene.lights[i] = lights[i];
-    scene.bxdfCount = BXDF_COUNT;
-    for (uint32_t i = 0; i < BXDF_COUNT; i++)
-        scene.bxdfs[i] = bxdfs[i];
-
     float32_t3 color = pathtracer.getMeasure(pc.sampleCount, pc.depth, scene);
     float32_t4 pixCol = float32_t4(color, 1.0);
     outImage[coords] = pixCol;
diff --git a/31_HLSLPathTracer/app_resources/hlsl/scene.hlsl b/31_HLSLPathTracer/app_resources/hlsl/scene.hlsl
index 5b4178ec4..887d20c48 100644
--- a/31_HLSLPathTracer/app_resources/hlsl/scene.hlsl
+++ b/31_HLSLPathTracer/app_resources/hlsl/scene.hlsl
@@ -15,31 +15,78 @@ struct Scene
 {
     using light_type = Light;
     using bxdfnode_type = BxdfNode;
+    using this_t = Scene<Light, BxdfNode>;
 
-    NBL_CONSTEXPR_STATIC_INLINE uint32_t maxSphereCount = 25;
-    NBL_CONSTEXPR_STATIC_INLINE uint32_t maxTriangleCount = 12;
-    NBL_CONSTEXPR_STATIC_INLINE uint32_t maxRectangleCount = 12;
+    // NBL_CONSTEXPR_STATIC_INLINE uint32_t maxSphereCount = 25;
+    // NBL_CONSTEXPR_STATIC_INLINE uint32_t maxTriangleCount = 12;
+    // NBL_CONSTEXPR_STATIC_INLINE uint32_t maxRectangleCount = 12;
 
-    Shape<PST_SPHERE> spheres[maxSphereCount];
-    Shape<PST_TRIANGLE> triangles[maxTriangleCount];
-    Shape<PST_RECTANGLE> rectangles[maxRectangleCount];
+#if SPHERE_COUNT < 1
+#define SCENE_SPHERE_COUNT 1
+#else
+#define SCENE_SPHERE_COUNT SPHERE_COUNT
+#endif
+
+#if TRIANGLE_COUNT < 1
+#define SCENE_TRIANGLE_COUNT 1
+#else
+#define SCENE_TRIANGLE_COUNT TRIANGLE_COUNT
+#endif
+
+#if RECTANGLE_COUNT < 1
+#define SCENE_RECTANGLE_COUNT 1
+#else
+#define SCENE_RECTANGLE_COUNT RECTANGLE_COUNT
+#endif
+
+    Shape<PST_SPHERE> spheres[SCENE_SPHERE_COUNT];
+    Shape<PST_TRIANGLE> triangles[SCENE_TRIANGLE_COUNT];
+    Shape<PST_RECTANGLE> rectangles[SCENE_RECTANGLE_COUNT];
 
     uint32_t sphereCount;
     uint32_t triangleCount;
     uint32_t rectangleCount;
 
-    NBL_CONSTEXPR_STATIC_INLINE uint32_t maxLightCount = 4;
+    // NBL_CONSTEXPR_STATIC_INLINE uint32_t maxLightCount = 4;
 
-    light_type lights[maxLightCount];
+    light_type lights[LIGHT_COUNT];
     uint32_t lightCount;
 
-    NBL_CONSTEXPR_STATIC_INLINE uint32_t maxBxdfCount = 16; // TODO: limit change?
+    // NBL_CONSTEXPR_STATIC_INLINE uint32_t maxBxdfCount = 16;
 
-    bxdfnode_type bxdfs[maxBxdfCount];
+    bxdfnode_type bxdfs[BXDF_COUNT];
     uint32_t bxdfCount;
 
     // AS ases;
 
+    static this_t create(
+        NBL_CONST_REF_ARG(Shape<PST_SPHERE>) spheres[SCENE_SPHERE_COUNT],
+        NBL_CONST_REF_ARG(Shape<PST_TRIANGLE>) triangles[SCENE_TRIANGLE_COUNT],
+        NBL_CONST_REF_ARG(Shape<PST_RECTANGLE>) rectangles[SCENE_RECTANGLE_COUNT],
+        uint32_t sphereCount, uint32_t triangleCount, uint32_t rectangleCount,
+        NBL_CONST_REF_ARG(light_type) lights[LIGHT_COUNT], uint32_t lightCount,
+        NBL_CONST_REF_ARG(bxdfnode_type) bxdfs[BXDF_COUNT], uint32_t bxdfCount)
+    {
+        this_t retval;
+        retval.spheres = spheres;
+        retval.triangles = triangles;
+        retval.rectangles = rectangles;
+        retval.sphereCount = sphereCount;
+        retval.triangleCount = triangleCount;
+        retval.rectangleCount = rectangleCount;
+
+        retval.lights = lights;
+        retval.lightCount = lightCount;
+
+        retval.bxdfs = bxdfs;
+        retval.bxdfCount = bxdfCount;
+        return retval;
+    }
+
+#undef SCENE_SPHERE_COUNT
+#undef SCENE_TRIANGLE_COUNT
+#undef SCENE_RECTANGLE_COUNT
+
     // obsolete?
     // Intersector::IntersectData toIntersectData(uint32_t mode, ProceduralShapeType type)
     // {

From 7bd69e96d5512998df8efd85e5cdac33e1bde18d Mon Sep 17 00:00:00 2001
From: keptsecret <sorchon@gmail.com>
Date: Fri, 14 Mar 2025 16:56:18 +0700
Subject: [PATCH 083/529] fixed most of rectangle light issues, still red
 pixels

---
 .../app_resources/hlsl/common.hlsl            | 23 +++--
 .../hlsl/next_event_estimator.hlsl            | 86 +++++++++----------
 31_HLSLPathTracer/main.cpp                    |  4 +-
 3 files changed, 56 insertions(+), 57 deletions(-)

diff --git a/31_HLSLPathTracer/app_resources/hlsl/common.hlsl b/31_HLSLPathTracer/app_resources/hlsl/common.hlsl
index 9e2249732..28261a634 100644
--- a/31_HLSLPathTracer/app_resources/hlsl/common.hlsl
+++ b/31_HLSLPathTracer/app_resources/hlsl/common.hlsl
@@ -388,7 +388,7 @@ struct Shape<PST_TRIANGLE>
                 sampling::ProjectedSphericalTriangle<float> pst = sampling::ProjectedSphericalTriangle<float>::create(st);
                 const float pdf = pst.pdf(ray.normalAtOrigin, ray.wasBSDFAtOrigin, L);
                 // if `pdf` is NAN then the triangle's projected solid angle was close to 0.0, if its close to INF then the triangle was very small
-                return pdf < numeric_limits<float>::max ? pdf : 0.0;
+                return pdf < numeric_limits<float>::max ? pdf : numeric_limits<float>::max;
             }
             break;
             default:
@@ -427,7 +427,7 @@ struct Shape<PST_TRIANGLE>
 
                 const float32_t3 L = sst.generate(rcpPdf, xi.xy);
 
-                pdf = rcpPdf > numeric_limits<float>::min ? (1.0 / rcpPdf) : 0.0;
+                pdf = rcpPdf > numeric_limits<float>::min ? (1.0 / rcpPdf) : numeric_limits<float>::max;
 
                 const float32_t3 N = getNormalTimesArea();
                 newRayMaxT = hlsl::dot<float32_t3>(N, vertex0 - origin) / hlsl::dot<float32_t3>(N, L);
@@ -443,7 +443,7 @@ struct Shape<PST_TRIANGLE>
 
                 const float32_t3 L = sst.generate(rcpPdf, interaction.isotropic.N, isBSDF, xi.xy);
 
-                pdf = rcpPdf > numeric_limits<float>::min ? (1.0 / rcpPdf) : 0.0;
+                pdf = rcpPdf > numeric_limits<float>::min ? (1.0 / rcpPdf) : numeric_limits<float>::max;
 
                 const float32_t3 N = getNormalTimesArea();
                 newRayMaxT = hlsl::dot<float32_t3>(N, vertex0 - origin) / hlsl::dot<float32_t3>(N, L);
@@ -513,8 +513,6 @@ struct Shape<PST_RECTANGLE>
         basis[0] = edge0 / extents[0];
         basis[1] = edge1 / extents[1];
         basis[2] = normalize(cross(basis[0],basis[1]));
-
-        basis = nbl::hlsl::transpose<float32_t3x3>(basis);    // TODO: double check transpose
     }
 
     template<typename Ray>
@@ -541,17 +539,18 @@ struct Shape<PST_RECTANGLE>
                 if (solidAngle > numeric_limits<float>::min)
                     pdf = 1.f / solidAngle;
                 else
-                    pdf = numeric_limits<float>::infinity;
+                    pdf = bit_cast<float>(numeric_limits<float>::infinity);
                 return pdf;
             }
             break;
             case PPM_APPROX_PROJECTED_SOLID_ANGLE:
             {
-                return numeric_limits<float>::infinity;
+                // currently broken
+                return bit_cast<float>(numeric_limits<float>::infinity);
             }
             break;
             default:
-                return numeric_limits<float>::infinity;
+                return bit_cast<float>(numeric_limits<float>::infinity);
         }
     }
 
@@ -577,7 +576,6 @@ struct Shape<PST_RECTANGLE>
             // #ifdef TRIANGLE_REFERENCE ?
             case PPM_SOLID_ANGLE:
             {
-                float pdf;
                 float32_t3x3 rectNormalBasis;
                 float32_t2 rectExtents;
                 getNormalBasis(rectNormalBasis, rectExtents);
@@ -594,7 +592,7 @@ struct Shape<PST_RECTANGLE>
                     pdf = 1.f / solidAngle;
                 }
                 else
-                    pdf = numeric_limits<float>::infinity;
+                    pdf = bit_cast<float>(numeric_limits<float>::infinity);
 
                 newRayMaxT = hlsl::dot<float32_t3>(N, origin2origin) / hlsl::dot<float32_t3>(N, L);
                 return L;
@@ -602,12 +600,13 @@ struct Shape<PST_RECTANGLE>
             break;
             case PPM_APPROX_PROJECTED_SOLID_ANGLE:
             {
-                pdf = numeric_limits<float>::infinity;
+                // currently broken
+                pdf = bit_cast<float>(numeric_limits<float>::infinity);
                 return (float32_t3)0.0;
             }
             break;
             default:
-                pdf = numeric_limits<float>::infinity;
+                pdf = bit_cast<float>(numeric_limits<float>::infinity);
                 return (float32_t3)0.0;
         }
     }
diff --git a/31_HLSLPathTracer/app_resources/hlsl/next_event_estimator.hlsl b/31_HLSLPathTracer/app_resources/hlsl/next_event_estimator.hlsl
index c1528216d..9c41f6627 100644
--- a/31_HLSLPathTracer/app_resources/hlsl/next_event_estimator.hlsl
+++ b/31_HLSLPathTracer/app_resources/hlsl/next_event_estimator.hlsl
@@ -89,28 +89,28 @@ struct Estimator
 
     static spectral_type deferredEvalAndPdf(NBL_REF_ARG(scalar_type) pdf, NBL_CONST_REF_ARG(light_type) light, NBL_CONST_REF_ARG(ray_type) ray, NBL_CONST_REF_ARG(Event) event)
     {
-        const IntersectMode mode = (IntersectMode)event.mode;
-        switch (mode)
-        {
-            case IM_RAY_QUERY:
-            {
-                // TODO: do ray query stuff
-            }
-            break;
-            case IM_RAY_TRACING:
-            {
-                // TODO: do ray tracing stuff
-            }
-            break;
-            case IM_PROCEDURAL:
-            {
+        // const IntersectMode mode = (IntersectMode)event.mode;
+        // switch (mode)
+        // {
+        //     case IM_RAY_QUERY:
+        //     {
+        //         // TODO: do ray query stuff
+        //     }
+        //     break;
+        //     case IM_RAY_TRACING:
+        //     {
+        //         // TODO: do ray tracing stuff
+        //     }
+        //     break;
+        //     case IM_PROCEDURAL:
+        //     {
                 return proceduralDeferredEvalAndPdf(pdf, light, ray, event);
-            }
-            break;
-            default:
-                return (spectral_type)0.0;
-        }
-        return (spectral_type)0.0;
+        //     }
+        //     break;
+        //     default:
+        //         return (spectral_type)0.0;
+        // }
+        // return (spectral_type)0.0;
     }
 
     static sample_type procedural_generate_and_quotient_and_pdf(NBL_REF_ARG(quotient_pdf_type) quotient_pdf, NBL_REF_ARG(scalar_type) newRayMaxT, NBL_CONST_REF_ARG(light_type) light, NBL_CONST_REF_ARG(vector3_type) origin, NBL_CONST_REF_ARG(interaction_type) interaction, bool isBSDF, NBL_CONST_REF_ARG(vector3_type) xi, uint32_t depth, NBL_CONST_REF_ARG(Event) event)
@@ -203,29 +203,29 @@ struct Estimator
     {
         const IntersectMode mode = (IntersectMode)event.mode;
         sample_type L;
-        switch (mode)
-        {
-            case IM_RAY_QUERY:
-            {
-                // TODO: do ray query stuff
-            }
-            break;
-            case IM_RAY_TRACING:
-            {
-                // TODO: do ray tracing stuff
-            }
-            break;
-            case IM_PROCEDURAL:
-            {
+        // switch (mode)
+        // {
+        //     case IM_RAY_QUERY:
+        //     {
+        //         // TODO: do ray query stuff
+        //     }
+        //     break;
+        //     case IM_RAY_TRACING:
+        //     {
+        //         // TODO: do ray tracing stuff
+        //     }
+        //     break;
+        //     case IM_PROCEDURAL:
+        //     {
                 return procedural_generate_and_quotient_and_pdf(quotient_pdf, newRayMaxT, light, origin, interaction, isBSDF, xi, depth, event);
-            }
-            break;
-            default:
-            {
-                return L;
-            }
-        }
-        return L;
+        //     }
+        //     break;
+        //     default:
+        //     {
+        //         return L;
+        //     }
+        // }
+        // return L;
     }
 };
 
diff --git a/31_HLSLPathTracer/main.cpp b/31_HLSLPathTracer/main.cpp
index 46597d738..b8e3ea044 100644
--- a/31_HLSLPathTracer/main.cpp
+++ b/31_HLSLPathTracer/main.cpp
@@ -1347,10 +1347,10 @@ class HLSLComputePathtracer final : public examples::SimpleWindowedApplication,
 		float viewWidth = 10.f;
 		float camYAngle = 165.f / 180.f * 3.14159f;
 		float camXAngle = 32.f / 180.f * 3.14159f;
-		int PTPipline = E_LIGHT_GEOMETRY::ELG_SPHERE;
+		int PTPipline = E_LIGHT_GEOMETRY::ELG_RECTANGLE;
 		int renderMode = E_RENDER_MODE::ERM_HLSL;
 		int spp = 32;
-		int depth = 3;
+		int depth = 1;
 
 		bool m_firstFrame = true;
 		IGPUCommandBuffer::SClearColorValue clearColor = { .float32 = {0.f,0.f,0.f,1.f} };

From 38d8285dc7101dece86fa5d3733b6056ff0a6266 Mon Sep 17 00:00:00 2001
From: kevyuu <kevin.kayu@gmail.com>
Date: Fri, 14 Mar 2025 21:47:50 +0700
Subject: [PATCH 084/529] Use unused shader instead of stub shader for
 occlusion ray miss shader.

---
 .../app_resources/raytrace_shadow.rmiss.hlsl                | 6 ------
 71_RayTracingPipeline/main.cpp                              | 3 ---
 2 files changed, 9 deletions(-)
 delete mode 100644 71_RayTracingPipeline/app_resources/raytrace_shadow.rmiss.hlsl

diff --git a/71_RayTracingPipeline/app_resources/raytrace_shadow.rmiss.hlsl b/71_RayTracingPipeline/app_resources/raytrace_shadow.rmiss.hlsl
deleted file mode 100644
index baad9a3e9..000000000
--- a/71_RayTracingPipeline/app_resources/raytrace_shadow.rmiss.hlsl
+++ /dev/null
@@ -1,6 +0,0 @@
-#include "common.hlsl"
-
-[shader("miss")]
-void main(inout OcclusionPayload payload)
-{
-}
diff --git a/71_RayTracingPipeline/main.cpp b/71_RayTracingPipeline/main.cpp
index cb7ef1d8e..9a85ea423 100644
--- a/71_RayTracingPipeline/main.cpp
+++ b/71_RayTracingPipeline/main.cpp
@@ -163,7 +163,6 @@ class RaytracingPipelineApp final : public examples::SimpleWindowedApplication,
     const auto anyHitShaderColorPayload = loadCompileAndCreateShader("app_resources/raytrace.rahit.hlsl");
     const auto anyHitShaderShadowPayload = loadCompileAndCreateShader("app_resources/raytrace_shadow.rahit.hlsl");
     const auto missShader = loadCompileAndCreateShader("app_resources/raytrace.rmiss.hlsl");
-    const auto shadowMissShader = loadCompileAndCreateShader("app_resources/raytrace_shadow.rmiss.hlsl");
     const auto shadowClosestHitShader = loadCompileAndCreateShader("app_resources/raytrace_shadow_triangle.rchit.hlsl");
     const auto directionalLightCallShader = loadCompileAndCreateShader("app_resources/light_directional.rcall.hlsl");
     const auto pointLightCallShader = loadCompileAndCreateShader("app_resources/light_point.rcall.hlsl");
@@ -324,7 +323,6 @@ class RaytracingPipelineApp final : public examples::SimpleWindowedApplication,
       {
         RTDS_RAYGEN,
         RTDS_MISS,
-        RTDS_SHADOW_MISS,
         RTDS_CLOSEST_HIT_SHADOW,
         RTDS_CLOSEST_HIT,
         RTDS_SPHERE_CLOSEST_HIT,
@@ -340,7 +338,6 @@ class RaytracingPipelineApp final : public examples::SimpleWindowedApplication,
       IGPUShader::SSpecInfo shaders[RTDS_COUNT];
       shaders[RTDS_RAYGEN] = {.shader = raygenShader.get()};
       shaders[RTDS_MISS] = {.shader = missShader.get()};
-      shaders[RTDS_SHADOW_MISS] = {.shader = shadowMissShader.get()};
       shaders[RTDS_CLOSEST_HIT_SHADOW] = { .shader = shadowClosestHitShader.get() };
       shaders[RTDS_CLOSEST_HIT] = {.shader = closestHitShader.get()};
       shaders[RTDS_SPHERE_CLOSEST_HIT] = {.shader = proceduralClosestHitShader.get()};

From 83d8de494f9d7e68c7ac30dde28a63c62d2cd45b Mon Sep 17 00:00:00 2001
From: Przemek <minikers21@gmail.com>
Date: Fri, 14 Mar 2025 16:28:53 +0100
Subject: [PATCH 085/529] Added dtmSettingsBuff

---
 62_CAD/CTriangleMesh.h                        |  9 ++
 62_CAD/DrawResourcesFiller.cpp                | 74 +++++++++++++-
 62_CAD/DrawResourcesFiller.h                  | 24 ++++-
 62_CAD/SingleLineText.cpp                     |  2 +-
 62_CAD/main.cpp                               | 98 +++++++++++++++----
 62_CAD/shaders/globals.hlsl                   | 17 +++-
 62_CAD/shaders/main_pipeline/common.hlsl      |  9 +-
 .../main_pipeline/fragment_shader.hlsl        | 41 ++++----
 .../shaders/main_pipeline/vertex_shader.hlsl  |  8 ++
 9 files changed, 225 insertions(+), 57 deletions(-)

diff --git a/62_CAD/CTriangleMesh.h b/62_CAD/CTriangleMesh.h
index 6b5612a5c..d71198005 100644
--- a/62_CAD/CTriangleMesh.h
+++ b/62_CAD/CTriangleMesh.h
@@ -6,6 +6,15 @@
 
 using namespace nbl;
 
+struct DTMSettingsInfo
+{
+	LineStyleInfo outlineLineStyleInfo;
+	LineStyleInfo contourLineStyleInfo;
+	// TODO: heights
+
+
+};
+
 class CTriangleMesh final
 {
 public:
diff --git a/62_CAD/DrawResourcesFiller.cpp b/62_CAD/DrawResourcesFiller.cpp
index 424569b8f..995ecfacc 100644
--- a/62_CAD/DrawResourcesFiller.cpp
+++ b/62_CAD/DrawResourcesFiller.cpp
@@ -116,6 +116,24 @@ void DrawResourcesFiller::allocateStylesBuffer(ILogicalDevice* logicalDevice, ui
 	}
 }
 
+void DrawResourcesFiller::allocateDTMSettingsBuffer(ILogicalDevice* logicalDevice, uint32_t dtmSettingsCount)
+{
+	maxDtmSettings = dtmSettingsCount;
+	size_t dtmSettingsBufferSize = dtmSettingsCount * sizeof(DTMSettings);
+	
+	IGPUBuffer::SCreationParams dtmSettingsCreationParams = {};
+	dtmSettingsCreationParams.size = dtmSettingsBufferSize;
+	dtmSettingsCreationParams.usage = IGPUBuffer::EUF_STORAGE_BUFFER_BIT | IGPUBuffer::EUF_TRANSFER_DST_BIT;
+	gpuDrawBuffers.dtmSettingsBuffer = logicalDevice->createBuffer(std::move(dtmSettingsCreationParams));
+	gpuDrawBuffers.dtmSettingsBuffer->setObjectDebugName("dtmSettingsBuffer");
+	
+	IDeviceMemoryBacked::SDeviceMemoryRequirements memReq = gpuDrawBuffers.dtmSettingsBuffer->getMemoryReqs();
+	memReq.memoryTypeBits &= logicalDevice->getPhysicalDevice()->getDeviceLocalMemoryTypeBits();
+	auto stylesBufferMem = logicalDevice->allocate(memReq, gpuDrawBuffers.dtmSettingsBuffer.get());
+	
+	cpuDrawBuffers.dtmSettingsBuffer = ICPUBuffer::create({ dtmSettingsBufferSize });
+}
+
 void DrawResourcesFiller::allocateMSDFTextures(ILogicalDevice* logicalDevice, uint32_t maxMSDFs, uint32_t2 msdfsExtent)
 {
 	msdfLRUCache = std::unique_ptr<MSDFsLRUCache>(new MSDFsLRUCache(maxMSDFs));
@@ -172,7 +190,7 @@ void DrawResourcesFiller::drawPolyline(const CPolylineBase& polyline, const Line
 
 	uint32_t styleIdx = addLineStyle_SubmitIfNeeded(lineStyleInfo, intendedNextSubmit);
 
-	uint32_t mainObjIdx = addMainObject_SubmitIfNeeded(styleIdx, intendedNextSubmit);
+	uint32_t mainObjIdx = addMainObject_SubmitIfNeeded(styleIdx, InvalidDTMSettingsIdx, intendedNextSubmit);
 
 	drawPolyline(polyline, mainObjIdx, intendedNextSubmit);
 }
@@ -218,7 +236,7 @@ void DrawResourcesFiller::drawPolyline(const CPolylineBase& polyline, uint32_t p
 	}
 }
 
-void DrawResourcesFiller::drawTriangleMesh(const CTriangleMesh& mesh, CTriangleMesh::DrawData& drawData, SIntendedSubmitInfo& intendedNextSubmit)
+void DrawResourcesFiller::drawTriangleMesh(const CTriangleMesh& mesh, CTriangleMesh::DrawData& drawData, const DTMSettingsInfo& dtmSettingsInfo, SIntendedSubmitInfo& intendedNextSubmit)
 {
 	ICPUBuffer::SCreationParams geometryBuffParams;
 	
@@ -256,7 +274,9 @@ void DrawResourcesFiller::drawTriangleMesh(const CTriangleMesh& mesh, CTriangleM
 
 	// call addMainObject_SubmitIfNeeded, use its index in push constants
 
-	drawData.pushConstants.triangleMeshMainObjectIndex = addMainObject_SubmitIfNeeded(InvalidStyleIdx, intendedNextSubmit);
+	uint32_t dtmSettingsIndex = addDTMSettings_SubmitIfNeeded(dtmSettingsInfo, intendedNextSubmit);
+
+	drawData.pushConstants.triangleMeshMainObjectIndex = addMainObject_SubmitIfNeeded(InvalidStyleIdx, InvalidDTMSettingsIdx, intendedNextSubmit);
 
 	// TODO: use this function later for auto submit
 	//submitCurrentDrawObjectsAndReset(intendedNextSubmit, 0);
@@ -304,7 +324,7 @@ void DrawResourcesFiller::drawHatch(
 	lineStyle.screenSpaceLineWidth = nbl::hlsl::bit_cast<float, uint32_t>(textureIdx);
 	const uint32_t styleIdx = addLineStyle_SubmitIfNeeded(lineStyle, intendedNextSubmit);
 
-	uint32_t mainObjIdx = addMainObject_SubmitIfNeeded(styleIdx, intendedNextSubmit);
+	uint32_t mainObjIdx = addMainObject_SubmitIfNeeded(styleIdx, InvalidDTMSettingsIdx, intendedNextSubmit);
 	uint32_t currentObjectInSection = 0u; // Object here refers to DrawObject used in vertex shader. You can think of it as a Cage.
 	while (currentObjectInSection < hatch.getHatchBoxCount())
 	{
@@ -379,10 +399,27 @@ uint32_t DrawResourcesFiller::addLineStyle_SubmitIfNeeded(const LineStyleInfo& l
 	return outLineStyleIdx;
 }
 
-uint32_t DrawResourcesFiller::addMainObject_SubmitIfNeeded(uint32_t styleIdx, SIntendedSubmitInfo& intendedNextSubmit)
+uint32_t DrawResourcesFiller::addDTMSettings_SubmitIfNeeded(const DTMSettingsInfo& dtmSettings, SIntendedSubmitInfo& intendedNextSubmit)
+{
+	uint32_t outDTMSettingIdx = addDTMSettings_Internal(dtmSettings, intendedNextSubmit);
+	if (outDTMSettingIdx == InvalidStyleIdx)
+	{
+		finalizeAllCopiesToGPU(intendedNextSubmit);
+		submitDraws(intendedNextSubmit);
+		resetGeometryCounters();
+		resetMainObjectCounters();
+		resetLineStyleCounters();
+		outDTMSettingIdx = addDTMSettings_Internal(dtmSettings, intendedNextSubmit);
+		assert(outDTMSettingIdx != InvalidDTMSettingsIdx);
+	}
+	return outDTMSettingIdx;
+}
+
+uint32_t DrawResourcesFiller::addMainObject_SubmitIfNeeded(uint32_t styleIdx, uint32_t dtmSettingsIdx, SIntendedSubmitInfo& intendedNextSubmit)
 {
 	MainObject mainObject = {};
 	mainObject.styleIdx = styleIdx;
+	mainObject.dtmSettingsIdx = dtmSettingsIdx;
 	mainObject.clipProjectionAddress = acquireCurrentClipProjectionAddress(intendedNextSubmit);
 	uint32_t outMainObjectIdx = addMainObject_Internal(mainObject);
 	if (outMainObjectIdx == InvalidMainObjectIdx)
@@ -728,6 +765,33 @@ uint32_t DrawResourcesFiller::addLineStyle_Internal(const LineStyleInfo& lineSty
 	return currentLineStylesCount++;
 }
 
+uint32_t DrawResourcesFiller::addDTMSettings_Internal(const DTMSettingsInfo& dtmSettingsInfo, SIntendedSubmitInfo& intendedNextSubmit)
+{
+	DTMSettings dtmSettings;
+
+	// TODO: this needs to be redone.. what if submit happens after that line?
+	// we need to make sure somehow that function below will not submit, we need both outline and contour styles in GPU memory
+	dtmSettings.outlineLineStyleIdx = addLineStyle_SubmitIfNeeded(dtmSettingsInfo.outlineLineStyleInfo, intendedNextSubmit);
+	dtmSettings.contourLineStyleIdx = addLineStyle_SubmitIfNeeded(dtmSettingsInfo.contourLineStyleInfo, intendedNextSubmit);
+
+	DTMSettings* settingsArray = reinterpret_cast<DTMSettings*>(cpuDrawBuffers.dtmSettingsBuffer->getPointer());
+	for (uint32_t i = 0u; i < currentDTMSettingsCount; ++i)
+	{
+		const DTMSettings& itr = settingsArray[i];
+		if (itr == dtmSettings)
+			return i;
+	}
+
+	if (currentDTMSettingsCount >= maxDtmSettings)
+		return InvalidDTMSettingsIdx;
+
+	void* dst = settingsArray + currentDTMSettingsCount;
+	memcpy(dst, &dtmSettings, sizeof(DTMSettings));
+	return currentDTMSettingsCount++;
+
+	return InvalidDTMSettingsIdx;
+}
+
 uint64_t DrawResourcesFiller::acquireCurrentClipProjectionAddress(SIntendedSubmitInfo& intendedNextSubmit)
 {
 	if (clipProjectionAddresses.empty())
diff --git a/62_CAD/DrawResourcesFiller.h b/62_CAD/DrawResourcesFiller.h
index c3b31d32e..ef7eab307 100644
--- a/62_CAD/DrawResourcesFiller.h
+++ b/62_CAD/DrawResourcesFiller.h
@@ -27,6 +27,7 @@ struct DrawBuffers
 	smart_refctd_ptr<BufferType> drawObjectsBuffer;
 	smart_refctd_ptr<BufferType> geometryBuffer;
 	smart_refctd_ptr<BufferType> lineStylesBuffer;
+	smart_refctd_ptr<BufferType> dtmSettingsBuffer;
 };
 
 // ! DrawResourcesFiller
@@ -57,6 +58,8 @@ struct DrawResourcesFiller
 	void allocateGeometryBuffer(ILogicalDevice* logicalDevice, size_t size);
 
 	void allocateStylesBuffer(ILogicalDevice* logicalDevice, uint32_t lineStylesCount);
+
+	void allocateDTMSettingsBuffer(ILogicalDevice* logicalDevice, uint32_t dtmSettingsCount);
 	
 	void allocateMSDFTextures(ILogicalDevice* logicalDevice, uint32_t maxMSDFs, uint32_t2 msdfsExtent);
 
@@ -77,7 +80,7 @@ struct DrawResourcesFiller
 
 	void drawPolyline(const CPolylineBase& polyline, uint32_t polylineMainObjIdx, SIntendedSubmitInfo& intendedNextSubmit);
 	
-	void drawTriangleMesh(const CTriangleMesh& mesh, CTriangleMesh::DrawData& pushConstants, SIntendedSubmitInfo& intendedNextSubmit);
+	void drawTriangleMesh(const CTriangleMesh& mesh, CTriangleMesh::DrawData& drawData, const DTMSettingsInfo& dtmSettings, SIntendedSubmitInfo& intendedNextSubmit);
 
 	// ! Convinience function for Hatch with MSDF Pattern and a solid background
 	void drawHatch(
@@ -145,7 +148,7 @@ struct DrawResourcesFiller
 					return false;
 			};
 		
-		uint32_t mainObjIdx = addMainObject_SubmitIfNeeded(InvalidStyleIdx, intendedNextSubmit);
+		uint32_t mainObjIdx = addMainObject_SubmitIfNeeded(InvalidStyleIdx, InvalidDTMSettingsIdx, intendedNextSubmit);
 
 		ImageObjectInfo info = {};
 		info.topLeft = topLeftPos;
@@ -194,12 +197,15 @@ struct DrawResourcesFiller
 		resetGeometryCounters();
 		resetMainObjectCounters();
 		resetLineStyleCounters();
+		resetDTMSettingsCounters();
 	}
 
 	DrawBuffers<ICPUBuffer> cpuDrawBuffers;
 	DrawBuffers<IGPUBuffer> gpuDrawBuffers;
 
 	uint32_t addLineStyle_SubmitIfNeeded(const LineStyleInfo& lineStyle, SIntendedSubmitInfo& intendedNextSubmit);
+
+	uint32_t addDTMSettings_SubmitIfNeeded(const DTMSettingsInfo& dtmSettings, SIntendedSubmitInfo& intendedNextSubmit);
 	
 	// TODO[Przemek]: Read after reading the fragment shader comments and having a basic understanding of the relationship between "mainObject" and our programmable blending resolve:
 	// Use `addMainObject_SubmitIfNeeded` to push your single mainObject you'll be using for the enitre triangle mesh (this will ensure overlaps between triangles of the same mesh is resolved correctly)
@@ -209,7 +215,7 @@ struct DrawResourcesFiller
 	// Never call this function multiple times in a row before indexing it in a drawable, because future auto-submits may invalidate mainObjects, so do them one by one, for example:
 	// Valid: addMainObject1 --> addXXX(mainObj1) ---> addMainObject2 ---> addXXX(mainObj2) ....
 	// Invalid: addMainObject1 ---> addMainObject2 ---> addXXX(mainObj1) ---> addXXX(mainObj2) ....
-	uint32_t addMainObject_SubmitIfNeeded(uint32_t styleIdx, SIntendedSubmitInfo& intendedNextSubmit);
+	uint32_t addMainObject_SubmitIfNeeded(uint32_t styleIdx, uint32_t dtmSettingsIdx, SIntendedSubmitInfo& intendedNextSubmit);
 
 	// we need to store the clip projection stack to make sure the front is always available in memory
 	void pushClipProjectionData(const ClipProjectionData& clipProjectionData);
@@ -257,6 +263,8 @@ struct DrawResourcesFiller
 
 	uint32_t addLineStyle_Internal(const LineStyleInfo& lineStyleInfo);
 
+	uint32_t addDTMSettings_Internal(const DTMSettingsInfo& dtmSettings, SIntendedSubmitInfo& intendedNextSubmit);
+
 	// Gets the current clip projection data (the top of stack) gpu addreess inside the geometryBuffer
 	// If it's been invalidated then it will request to upload again with a possible auto-submit on low geometry buffer memory.
 	uint64_t acquireCurrentClipProjectionAddress(SIntendedSubmitInfo& intendedNextSubmit);
@@ -314,6 +322,12 @@ struct DrawResourcesFiller
 		inMemLineStylesCount = 0u;
 	}
 
+	void resetDTMSettingsCounters()
+	{
+		currentDTMSettingsCount = 0u;
+		inMemDTMSettingsCount = 0u;
+	}
+
 	MainObject* getMainObject(uint32_t idx)
 	{
 		MainObject* mainObjsArray = reinterpret_cast<MainObject*>(cpuDrawBuffers.mainObjectsBuffer->getPointer());
@@ -432,6 +446,10 @@ struct DrawResourcesFiller
 	uint32_t currentLineStylesCount = 0u;
 	uint32_t maxLineStyles = 0u;
 
+	uint32_t inMemDTMSettingsCount = 0u;
+	uint32_t currentDTMSettingsCount = 0u;
+	uint32_t maxDtmSettings = 0u;
+
 	uint64_t geometryBufferAddress = 0u; // Actual BDA offset 0 of the gpu buffer
 
 	std::deque<ClipProjectionData> clipProjections; // stack of clip projectios stored so we can resubmit them if geometry buffer got reset.
diff --git a/62_CAD/SingleLineText.cpp b/62_CAD/SingleLineText.cpp
index 4b41cb628..f68f78db3 100644
--- a/62_CAD/SingleLineText.cpp
+++ b/62_CAD/SingleLineText.cpp
@@ -64,7 +64,7 @@ void SingleLineText::Draw(
 	lineStyle.screenSpaceLineWidth = tan(tiltTiltAngle);
 	lineStyle.worldSpaceLineWidth = boldInPixels;
 	const uint32_t styleIdx = drawResourcesFiller.addLineStyle_SubmitIfNeeded(lineStyle, intendedNextSubmit);
-	auto glyphObjectIdx = drawResourcesFiller.addMainObject_SubmitIfNeeded(styleIdx, intendedNextSubmit);
+	auto glyphObjectIdx = drawResourcesFiller.addMainObject_SubmitIfNeeded(styleIdx, InvalidDTMSettingsIdx, intendedNextSubmit);
 
 	for (const auto& glyphBox : m_glyphBoxes)
 	{
diff --git a/62_CAD/main.cpp b/62_CAD/main.cpp
index 3aad8c4d3..cce87e3b2 100644
--- a/62_CAD/main.cpp
+++ b/62_CAD/main.cpp
@@ -294,6 +294,7 @@ class ComputerAidedDesign final : public examples::SimpleWindowedApplication, pu
 		drawResourcesFiller.allocateMainObjectsBuffer(m_device.get(), maxObjects);
 		drawResourcesFiller.allocateDrawObjectsBuffer(m_device.get(), maxObjects * 5u);
 		drawResourcesFiller.allocateStylesBuffer(m_device.get(), 512u);
+		drawResourcesFiller.allocateDTMSettingsBuffer(m_device.get(), 512u);
 
 		// * 3 because I just assume there is on average 3x beziers per actual object (cause we approximate other curves/arcs with beziers now)
 		// + 128 ClipProjData
@@ -711,20 +712,27 @@ class ComputerAidedDesign final : public examples::SimpleWindowedApplication, pu
 				},
 				{
 					.binding = 4u,
+					.type = asset::IDescriptor::E_TYPE::ET_STORAGE_BUFFER,
+					.createFlags = IGPUDescriptorSetLayout::SBinding::E_CREATE_FLAGS::ECF_NONE,
+					.stageFlags = asset::IShader::E_SHADER_STAGE::ESS_VERTEX | asset::IShader::E_SHADER_STAGE::ESS_FRAGMENT,
+					.count = 1u,
+				},
+				{
+					.binding = 5u,
 					.type = asset::IDescriptor::E_TYPE::ET_COMBINED_IMAGE_SAMPLER,
 					.createFlags = IGPUDescriptorSetLayout::SBinding::E_CREATE_FLAGS::ECF_NONE,
 					.stageFlags = asset::IShader::E_SHADER_STAGE::ESS_FRAGMENT,
 					.count = 1u,
 				},
 				{
-					.binding = 5u,
+					.binding = 6u,
 					.type = asset::IDescriptor::E_TYPE::ET_SAMPLER,
 					.createFlags = IGPUDescriptorSetLayout::SBinding::E_CREATE_FLAGS::ECF_NONE,
 					.stageFlags = asset::IShader::E_SHADER_STAGE::ESS_FRAGMENT,
 					.count = 1u,
 				},
 				{
-					.binding = 6u,
+					.binding = 7u,
 					.type = asset::IDescriptor::E_TYPE::ET_SAMPLED_IMAGE,
 					.createFlags = bindlessTextureFlags,
 					.stageFlags = asset::IShader::E_SHADER_STAGE::ESS_FRAGMENT,
@@ -768,7 +776,7 @@ class ComputerAidedDesign final : public examples::SimpleWindowedApplication, pu
 			{
 				descriptorSet0 = descriptorPool->createDescriptorSet(smart_refctd_ptr(descriptorSetLayout0));
 				descriptorSet1 = descriptorPool->createDescriptorSet(smart_refctd_ptr(descriptorSetLayout1));
-				constexpr uint32_t DescriptorCountSet0 = 6u;
+				constexpr uint32_t DescriptorCountSet0 = 7u;
 				video::IGPUDescriptorSet::SDescriptorInfo descriptorInfosSet0[DescriptorCountSet0] = {};
 
 				// Descriptors For Set 0:
@@ -788,11 +796,15 @@ class ComputerAidedDesign final : public examples::SimpleWindowedApplication, pu
 				descriptorInfosSet0[3u].info.buffer.size = drawResourcesFiller.gpuDrawBuffers.lineStylesBuffer->getCreationParams().size;
 				descriptorInfosSet0[3u].desc = drawResourcesFiller.gpuDrawBuffers.lineStylesBuffer;
 				
-				descriptorInfosSet0[4u].info.combinedImageSampler.imageLayout = IImage::LAYOUT::READ_ONLY_OPTIMAL;
-				descriptorInfosSet0[4u].info.combinedImageSampler.sampler = msdfTextureSampler;
-				descriptorInfosSet0[4u].desc = drawResourcesFiller.getMSDFsTextureArray();
+				descriptorInfosSet0[4u].info.buffer.offset = 0u;
+				descriptorInfosSet0[4u].info.buffer.size = drawResourcesFiller.gpuDrawBuffers.dtmSettingsBuffer->getCreationParams().size;
+				descriptorInfosSet0[4u].desc = drawResourcesFiller.gpuDrawBuffers.dtmSettingsBuffer;
+
+				descriptorInfosSet0[5u].info.combinedImageSampler.imageLayout = IImage::LAYOUT::READ_ONLY_OPTIMAL;
+				descriptorInfosSet0[5u].info.combinedImageSampler.sampler = msdfTextureSampler;
+				descriptorInfosSet0[5u].desc = drawResourcesFiller.getMSDFsTextureArray();
 				
-				descriptorInfosSet0[5u].desc = msdfTextureSampler; // TODO[Erfan]: different sampler and make immutable?
+				descriptorInfosSet0[6u].desc = msdfTextureSampler; // TODO[Erfan]: different sampler and make immutable?
 				
 				// This is bindless to we write to it later.
 				// descriptorInfosSet0[6u].info.image.imageLayout = IImage::LAYOUT::READ_ONLY_OPTIMAL;
@@ -813,55 +825,67 @@ class ComputerAidedDesign final : public examples::SimpleWindowedApplication, pu
 				video::IGPUDescriptorSet::SWriteDescriptorSet descriptorUpdates[DescriptorUpdatesCount] = {};
 				
 				// Set 0 Updates:
+					// globals
 				descriptorUpdates[0u].dstSet = descriptorSet0.get();
 				descriptorUpdates[0u].binding = 0u;
 				descriptorUpdates[0u].arrayElement = 0u;
 				descriptorUpdates[0u].count = 1u;
 				descriptorUpdates[0u].info = &descriptorInfosSet0[0u];
 
+					// drawObjectsBuffer
 				descriptorUpdates[1u].dstSet = descriptorSet0.get();
 				descriptorUpdates[1u].binding = 1u;
 				descriptorUpdates[1u].arrayElement = 0u;
 				descriptorUpdates[1u].count = 1u;
 				descriptorUpdates[1u].info = &descriptorInfosSet0[1u];
 
+					// mainObjectsBuffer
 				descriptorUpdates[2u].dstSet = descriptorSet0.get();
 				descriptorUpdates[2u].binding = 2u;
 				descriptorUpdates[2u].arrayElement = 0u;
 				descriptorUpdates[2u].count = 1u;
 				descriptorUpdates[2u].info = &descriptorInfosSet0[2u];
 
+					// lineStylesBuffer
 				descriptorUpdates[3u].dstSet = descriptorSet0.get();
 				descriptorUpdates[3u].binding = 3u;
 				descriptorUpdates[3u].arrayElement = 0u;
 				descriptorUpdates[3u].count = 1u;
 				descriptorUpdates[3u].info = &descriptorInfosSet0[3u];
 				
+					// dtmSettingsBuffer
 				descriptorUpdates[4u].dstSet = descriptorSet0.get();
 				descriptorUpdates[4u].binding = 4u;
 				descriptorUpdates[4u].arrayElement = 0u;
 				descriptorUpdates[4u].count = 1u;
 				descriptorUpdates[4u].info = &descriptorInfosSet0[4u];
-				
+
+					// mdfs textures
 				descriptorUpdates[5u].dstSet = descriptorSet0.get();
 				descriptorUpdates[5u].binding = 5u;
 				descriptorUpdates[5u].arrayElement = 0u;
 				descriptorUpdates[5u].count = 1u;
 				descriptorUpdates[5u].info = &descriptorInfosSet0[5u];
-
-				// Set 1 Updates:
-				descriptorUpdates[6u].dstSet = descriptorSet1.get();
-				descriptorUpdates[6u].binding = 0u;
+				
+					// mdfs samplers	
+				descriptorUpdates[6u].dstSet = descriptorSet0.get();
+				descriptorUpdates[6u].binding = 6u;
 				descriptorUpdates[6u].arrayElement = 0u;
 				descriptorUpdates[6u].count = 1u;
-				descriptorUpdates[6u].info = &descriptorInfosSet1[0u];
+				descriptorUpdates[6u].info = &descriptorInfosSet0[6u];
 
+				// Set 1 Updates:
 				descriptorUpdates[7u].dstSet = descriptorSet1.get();
-				descriptorUpdates[7u].binding = 1u;
+				descriptorUpdates[7u].binding = 0u;
 				descriptorUpdates[7u].arrayElement = 0u;
 				descriptorUpdates[7u].count = 1u;
-				descriptorUpdates[7u].info = &descriptorInfosSet1[1u];
+				descriptorUpdates[7u].info = &descriptorInfosSet1[0u];
 
+				descriptorUpdates[8u].dstSet = descriptorSet1.get();
+				descriptorUpdates[8u].binding = 1u;
+				descriptorUpdates[8u].arrayElement = 0u;
+				descriptorUpdates[8u].count = 1u;
+				descriptorUpdates[8u].info = &descriptorInfosSet1[1u];
 
 				m_device->updateDescriptorSets(DescriptorUpdatesCount, descriptorUpdates, 0u, nullptr);
 			}
@@ -3123,7 +3147,7 @@ class ComputerAidedDesign final : public examples::SimpleWindowedApplication, pu
 					lineStyle.color = float32_t4(1.0, 1.0, 1.0, 1.0);
 					const uint32_t styleIdx = drawResourcesFiller.addLineStyle_SubmitIfNeeded(lineStyle, intendedNextSubmit);
 
-					glyphObjectIdx = drawResourcesFiller.addMainObject_SubmitIfNeeded(styleIdx, intendedNextSubmit);
+					glyphObjectIdx = drawResourcesFiller.addMainObject_SubmitIfNeeded(styleIdx, InvalidDTMSettingsIdx, intendedNextSubmit);
 				}
 
 				float64_t2 currentBaselineStart = float64_t2(0.0, 0.0);
@@ -3261,13 +3285,33 @@ class ComputerAidedDesign final : public examples::SimpleWindowedApplication, pu
 		else if (mode == ExampleMode::CASE_9)
 		{
 			core::vector<TriangleMeshVertex> vertices = {
-				{ float32_t2(0.0f, 0.0f), 0.0f },
-				{ float32_t2(0.0f, 100.0f), 50.0f },
-				{ float32_t2(200.0f, 50.0f), 100.0f }
+				{ float32_t2(-200.0f, -200.0f), 0.0f },
+				{ float32_t2(-50.0f, -200.0f), 0.0f },
+				{ float32_t2(100.0f, -200.0f), 0.0f },
+				{ float32_t2(-125.0f, -70.1f), 0.0f },
+				{ float32_t2(25.0f, -70.1f), 0.0f },
+				{ float32_t2(175.0f, -70.1f), 0.0f },
+				{ float32_t2(-200.0f, 59.8f), 0.0f },
+				{ float32_t2(-50.0f, 59.8f), 0.0f },
+				{ float32_t2(100.0f, 59.8f), 0.0f },
+				{ float32_t2(-125.0f, 189.7f), 0.0f },
+				{ float32_t2(25.0f, 189.7f), 0.0f },
+				{ float32_t2(175.0f, 189.7f), 0.0f }
 			};
 
 			core::vector<uint32_t> indices = {
-				0, 1, 2
+				0, 1, 3,
+				1, 3, 4,
+				1, 2, 4,
+				2, 4, 5,
+				3, 4, 6,
+				4, 6, 7,
+				4, 5, 7,
+				5, 7, 8,
+				6, 7, 9,
+				7, 9, 10,
+				7, 8, 10,
+				8, 10, 11
 			};
 
 			// TODO: height color map
@@ -3279,7 +3323,19 @@ class ComputerAidedDesign final : public examples::SimpleWindowedApplication, pu
 			mesh.setVertices(std::move(vertices));
 			mesh.setIndices(std::move(indices));
 
-			drawResourcesFiller.drawTriangleMesh(mesh, m_triangleMeshDrawData, intendedNextSubmit);
+			DTMSettingsInfo dtmSettingsInfo;
+
+			LineStyleInfo outlineStyle = {};
+			dtmSettingsInfo.outlineLineStyleInfo.screenSpaceLineWidth = 0.0f;
+			dtmSettingsInfo.outlineLineStyleInfo.worldSpaceLineWidth = 5.0f;
+			dtmSettingsInfo.outlineLineStyleInfo.color = float32_t4(0.0f, 0.5f, 0.5f, 1.0f);
+
+			LineStyleInfo contourStyle = {};
+			dtmSettingsInfo.contourLineStyleInfo.screenSpaceLineWidth = 0.0f;
+			dtmSettingsInfo.contourLineStyleInfo.worldSpaceLineWidth = 5.0f;
+			dtmSettingsInfo.contourLineStyleInfo.color = float32_t4(1.0f, 0.5f, 0.31f, 1.0f);
+
+			drawResourcesFiller.drawTriangleMesh(mesh, m_triangleMeshDrawData, dtmSettingsInfo, intendedNextSubmit);
 		}
 		drawResourcesFiller.finalizeAllCopiesToGPU(intendedNextSubmit);
 	}
diff --git a/62_CAD/shaders/globals.hlsl b/62_CAD/shaders/globals.hlsl
index 1902ba39e..e7029a79e 100644
--- a/62_CAD/shaders/globals.hlsl
+++ b/62_CAD/shaders/globals.hlsl
@@ -126,7 +126,7 @@ enum class MajorAxis : uint32_t
 struct MainObject
 {
     uint32_t styleIdx;
-    uint32_t pad; // do I even need this on the gpu side? it's stored in structured buffer not bda
+    uint32_t dtmSettingsIdx; // do I even need this on the gpu side? it's stored in structured buffer not bda
     uint64_t clipProjectionAddress;
 };
 
@@ -328,6 +328,13 @@ struct LineStyle
     }
 };
 
+struct DTMSettings
+{
+    uint32_t outlineLineStyleIdx; // index into line styles
+    uint32_t contourLineStyleIdx; // index into line styles
+    // TODO:
+    // ContourSettings -> min, max, interval
+};
 #ifndef __HLSL_VERSION
 inline bool operator==(const LineStyle& lhs, const LineStyle& rhs)
 {
@@ -350,12 +357,20 @@ inline bool operator==(const LineStyle& lhs, const LineStyle& rhs)
 
     return isStipplePatternArrayEqual;
 }
+
+inline bool operator==(const DTMSettings& lhs, const DTMSettings& rhs)
+{
+    return lhs.outlineLineStyleIdx == rhs.outlineLineStyleIdx &&
+        lhs.contourLineStyleIdx == rhs.contourLineStyleIdx;
+}
+
 #endif
 
 NBL_CONSTEXPR uint32_t MainObjectIdxBits = 24u; // It will be packed next to alpha in a texture
 NBL_CONSTEXPR uint32_t AlphaBits = 32u - MainObjectIdxBits;
 NBL_CONSTEXPR uint32_t MaxIndexableMainObjects = (1u << MainObjectIdxBits) - 1u;
 NBL_CONSTEXPR uint32_t InvalidStyleIdx = nbl::hlsl::numeric_limits<uint32_t>::max;
+NBL_CONSTEXPR uint32_t InvalidDTMSettingsIdx = nbl::hlsl::numeric_limits<uint32_t>::max;
 NBL_CONSTEXPR uint32_t InvalidMainObjectIdx = MaxIndexableMainObjects;
 NBL_CONSTEXPR uint64_t InvalidClipProjectionAddress = nbl::hlsl::numeric_limits<uint64_t>::max;
 NBL_CONSTEXPR uint32_t InvalidTextureIdx = nbl::hlsl::numeric_limits<uint32_t>::max;
diff --git a/62_CAD/shaders/main_pipeline/common.hlsl b/62_CAD/shaders/main_pipeline/common.hlsl
index 4fd45ab5c..dc47604ad 100644
--- a/62_CAD/shaders/main_pipeline/common.hlsl
+++ b/62_CAD/shaders/main_pipeline/common.hlsl
@@ -231,12 +231,13 @@ struct PSInput
 [[vk::binding(1, 0)]] StructuredBuffer<DrawObject> drawObjects : register(t0);
 [[vk::binding(2, 0)]] StructuredBuffer<MainObject> mainObjects : register(t1);
 [[vk::binding(3, 0)]] StructuredBuffer<LineStyle> lineStyles : register(t2);
+[[vk::binding(4, 0)]] StructuredBuffer<DTMSettings> dtmSettingsBuff : register(t3);
 
-[[vk::combinedImageSampler]][[vk::binding(4, 0)]] Texture2DArray<float3> msdfTextures : register(t3);
-[[vk::combinedImageSampler]][[vk::binding(4, 0)]] SamplerState msdfSampler : register(s3);
+[[vk::combinedImageSampler]][[vk::binding(5, 0)]] Texture2DArray<float3> msdfTextures : register(t4);
+[[vk::combinedImageSampler]][[vk::binding(5, 0)]] SamplerState msdfSampler : register(s4);
 
-[[vk::binding(5, 0)]] SamplerState textureSampler : register(s4);
-[[vk::binding(6, 0)]] Texture2D textures[128] : register(t4);
+[[vk::binding(6, 0)]] SamplerState textureSampler : register(s5);
+[[vk::binding(7, 0)]] Texture2D textures[128] : register(t5);
 
 // Set 1 - Window dependant data which has higher update frequency due to multiple windows and resize need image recreation and descriptor writes
 [[vk::binding(0, 1)]] globallycoherent RWTexture2D<uint> pseudoStencil : register(u0);
diff --git a/62_CAD/shaders/main_pipeline/fragment_shader.hlsl b/62_CAD/shaders/main_pipeline/fragment_shader.hlsl
index 845cb36d7..21d114d90 100644
--- a/62_CAD/shaders/main_pipeline/fragment_shader.hlsl
+++ b/62_CAD/shaders/main_pipeline/fragment_shader.hlsl
@@ -427,25 +427,24 @@ float4 fragMain(PSInput input) : SV_TARGET
         // TODO: figure out if branching can be reduced
         if (baryCoord.x < baryCoord.y && baryCoord.x < baryCoord.z)
         {
-            start = v1;
-            end = v2;
+            start = float2(v1.x, v1.y);
+            end = float2(v2.x, v2.y);
         }
         else if (baryCoord.y < baryCoord.x && baryCoord.y < baryCoord.z)
         {
-            start = v0;
-            end = v2;
+            start = float2(v1.x, v1.y);
+            end = float2(v2.x, v2.y);
         }
         else if (baryCoord.z < baryCoord.x && baryCoord.z < baryCoord.y)
         {
-            start = v0;
-            end = v1;
+            start = float2(v0.x, v0.y);
+            end = float2(v1.x, v1.y);
         }
 
-        float distance = nbl::hlsl::numeric_limits<float>::max;
         const uint32_t styleIdx = mainObj.styleIdx;
-        const float thickness = 2.0f;
-        const float phaseShift = 0.0f;
-        const float stretch = 0.0f;
+        const float thickness = input.getLineThickness();
+        const float phaseShift = 0.0f; // input.getCurrentPhaseShift();
+        const float stretch = 0.0f; // input.getPatternStretch();
         const float worldToScreenRatio = input.getCurrentWorldToScreenRatio();
 
         nbl::hlsl::shapes::Line<float> lineSegment = nbl::hlsl::shapes::Line<float>::construct(start, end);
@@ -453,18 +452,16 @@ float4 fragMain(PSInput input) : SV_TARGET
 
         LineStyle style = lineStyles[styleIdx];
 
-        // TODO: stipples
-        //if (!style.hasStipples() || stretch == InvalidStyleStretchValue)
-        //{
-            //distance = ClippedSignedDistance< nbl::hlsl::shapes::Line<float> >::sdf(lineSegment, input.position.xy, thickness, style.isRoadStyleFlag);
-        //}
-        //else
-        //{
-        //    LineStyleClipper clipper = LineStyleClipper::construct(lineStyles[styleIdx], lineSegment, arcLenCalc, phaseShift, stretch, worldToScreenRatio);
-        //    distance = ClippedSignedDistance<nbl::hlsl::shapes::Line<float>, LineStyleClipper>::sdf(lineSegment, input.position.xy, thickness, style.isRoadStyleFlag, clipper);
-        //}
-
-        distance = ClippedSignedDistance< nbl::hlsl::shapes::Line<float> >::sdf(lineSegment, input.position.xy, thickness, true);
+        float distance = nbl::hlsl::numeric_limits<float>::max;
+        if (!style.hasStipples() || stretch == InvalidStyleStretchValue)
+        {
+            distance = ClippedSignedDistance< nbl::hlsl::shapes::Line<float> >::sdf(lineSegment, input.position.xy, thickness, style.isRoadStyleFlag);
+        }
+        else
+        {
+            LineStyleClipper clipper = LineStyleClipper::construct(lineStyles[styleIdx], lineSegment, arcLenCalc, phaseShift, stretch, worldToScreenRatio);
+            distance = ClippedSignedDistance<nbl::hlsl::shapes::Line<float>, LineStyleClipper>::sdf(lineSegment, input.position.xy, thickness, style.isRoadStyleFlag, clipper);
+        }
 
         localAlpha = smoothstep(+globals.antiAliasingFactor, -globals.antiAliasingFactor, distance);
     }
diff --git a/62_CAD/shaders/main_pipeline/vertex_shader.hlsl b/62_CAD/shaders/main_pipeline/vertex_shader.hlsl
index f7abd6285..08418c844 100644
--- a/62_CAD/shaders/main_pipeline/vertex_shader.hlsl
+++ b/62_CAD/shaders/main_pipeline/vertex_shader.hlsl
@@ -123,6 +123,14 @@ PSInput main(uint vertexID : SV_VertexID)
     outV.setHeightAtMeshVertex(vtx.height);
     outV.setScreenSpaceVertexPos(float3(transformedPos, 1));
 
+    // TODO: line style of contour line has to be set too!
+    DTMSettings dtmSettings = dtmSettingsBuff[mainObj.dtmSettingsIdx];
+    LineStyle outlineStyle = lineStyles[dtmSettings.outlineLineStyleIdx];
+    LineStyle contourStyle = lineStyles[dtmSettings.contourLineStyleIdx];
+    const float screenSpaceLineWidth = outlineStyle.screenSpaceLineWidth + _static_cast<float>(_static_cast<pfloat64_t>(outlineStyle.worldSpaceLineWidth) * globals.screenToWorldRatio);
+    const float sdfLineThickness = screenSpaceLineWidth * 0.5f;
+    outV.setLineThickness(sdfLineThickness);
+
     return outV;
 
 #else

From d4647d588afd7420830d7b45ca6efde0825f1ecc Mon Sep 17 00:00:00 2001
From: Przemek <minikers21@gmail.com>
Date: Fri, 14 Mar 2025 17:58:53 +0100
Subject: [PATCH 086/529] Added `finalizeDTMSettingsCopiesToGPU` function

---
 62_CAD/DrawResourcesFiller.cpp                | 21 +++++++++++++++++++
 62_CAD/DrawResourcesFiller.h                  |  2 ++
 .../main_pipeline/fragment_shader.hlsl        | 13 ++++++------
 3 files changed, 30 insertions(+), 6 deletions(-)

diff --git a/62_CAD/DrawResourcesFiller.cpp b/62_CAD/DrawResourcesFiller.cpp
index 995ecfacc..44837e415 100644
--- a/62_CAD/DrawResourcesFiller.cpp
+++ b/62_CAD/DrawResourcesFiller.cpp
@@ -379,6 +379,7 @@ bool DrawResourcesFiller::finalizeAllCopiesToGPU(SIntendedSubmitInfo& intendedNe
 	success &= finalizeMainObjectCopiesToGPU(intendedNextSubmit);
 	success &= finalizeGeometryCopiesToGPU(intendedNextSubmit);
 	success &= finalizeLineStyleCopiesToGPU(intendedNextSubmit);
+	success &= finalizeDTMSettingsCopiesToGPU(intendedNextSubmit);
 	success &= finalizeTextureCopies(intendedNextSubmit);
 	return success;
 }
@@ -533,6 +534,26 @@ bool DrawResourcesFiller::finalizeLineStyleCopiesToGPU(SIntendedSubmitInfo& inte
 	return success;
 }
 
+bool DrawResourcesFiller::finalizeDTMSettingsCopiesToGPU(SIntendedSubmitInfo& intendedNextSubmit)
+{
+	bool success = true;
+	// Copy LineStyles
+	uint32_t remainingLineStyles = currentDTMSettingsCount - inMemDTMSettingsCount;
+	SBufferRange<IGPUBuffer> dtmSettingsRange = { sizeof(DTMSettings) * inMemDTMSettingsCount, sizeof(DTMSettings) * remainingLineStyles, gpuDrawBuffers.dtmSettingsBuffer };
+	if (dtmSettingsRange.size > 0u)
+	{
+		const DTMSettings* srcDTMSettingsData = reinterpret_cast<DTMSettings*>(cpuDrawBuffers.dtmSettingsBuffer->getPointer()) + inMemDTMSettingsCount;
+		if (m_utilities->updateBufferRangeViaStagingBuffer(intendedNextSubmit, dtmSettingsRange, srcDTMSettingsData))
+			inMemDTMSettingsCount = currentDTMSettingsCount;
+		else
+		{
+			// TODO: Log
+			success = false;
+		}
+	}
+	return success;
+}
+
 bool DrawResourcesFiller::finalizeTextureCopies(SIntendedSubmitInfo& intendedNextSubmit)
 {
 	msdfTextureArrayIndicesUsed.clear(); // clear msdf textures used in the frame, because the frame finished and called this function.
diff --git a/62_CAD/DrawResourcesFiller.h b/62_CAD/DrawResourcesFiller.h
index ef7eab307..98dffa90e 100644
--- a/62_CAD/DrawResourcesFiller.h
+++ b/62_CAD/DrawResourcesFiller.h
@@ -247,6 +247,8 @@ struct DrawResourcesFiller
 	bool finalizeGeometryCopiesToGPU(SIntendedSubmitInfo& intendedNextSubmit);
 
 	bool finalizeLineStyleCopiesToGPU(SIntendedSubmitInfo& intendedNextSubmit);
+
+	bool finalizeDTMSettingsCopiesToGPU(SIntendedSubmitInfo& intendedNextSubmit);
 	
 	bool finalizeCustomClipProjectionCopiesToGPU(SIntendedSubmitInfo& intendedNextSubmit);
 	
diff --git a/62_CAD/shaders/main_pipeline/fragment_shader.hlsl b/62_CAD/shaders/main_pipeline/fragment_shader.hlsl
index 21d114d90..a4176d1ef 100644
--- a/62_CAD/shaders/main_pipeline/fragment_shader.hlsl
+++ b/62_CAD/shaders/main_pipeline/fragment_shader.hlsl
@@ -441,7 +441,6 @@ float4 fragMain(PSInput input) : SV_TARGET
             end = float2(v1.x, v1.y);
         }
 
-        const uint32_t styleIdx = mainObj.styleIdx;
         const float thickness = input.getLineThickness();
         const float phaseShift = 0.0f; // input.getCurrentPhaseShift();
         const float stretch = 0.0f; // input.getPatternStretch();
@@ -450,17 +449,19 @@ float4 fragMain(PSInput input) : SV_TARGET
         nbl::hlsl::shapes::Line<float> lineSegment = nbl::hlsl::shapes::Line<float>::construct(start, end);
         nbl::hlsl::shapes::Line<float>::ArcLengthCalculator arcLenCalc = nbl::hlsl::shapes::Line<float>::ArcLengthCalculator::construct(lineSegment);
 
-        LineStyle style = lineStyles[styleIdx];
+        DTMSettings dtmSettings = dtmSettingsBuff[mainObj.dtmSettingsIdx];
+        LineStyle outlineStyle = lineStyles[dtmSettings.outlineLineStyleIdx];
+        LineStyle contourStyle = lineStyles[dtmSettings.contourLineStyleIdx];
 
         float distance = nbl::hlsl::numeric_limits<float>::max;
-        if (!style.hasStipples() || stretch == InvalidStyleStretchValue)
+        if (!outlineStyle.hasStipples() || stretch == InvalidStyleStretchValue)
         {
-            distance = ClippedSignedDistance< nbl::hlsl::shapes::Line<float> >::sdf(lineSegment, input.position.xy, thickness, style.isRoadStyleFlag);
+            distance = ClippedSignedDistance< nbl::hlsl::shapes::Line<float> >::sdf(lineSegment, input.position.xy, thickness, outlineStyle.isRoadStyleFlag);
         }
         else
         {
-            LineStyleClipper clipper = LineStyleClipper::construct(lineStyles[styleIdx], lineSegment, arcLenCalc, phaseShift, stretch, worldToScreenRatio);
-            distance = ClippedSignedDistance<nbl::hlsl::shapes::Line<float>, LineStyleClipper>::sdf(lineSegment, input.position.xy, thickness, style.isRoadStyleFlag, clipper);
+            LineStyleClipper clipper = LineStyleClipper::construct(outlineStyle, lineSegment, arcLenCalc, phaseShift, stretch, worldToScreenRatio);
+            distance = ClippedSignedDistance<nbl::hlsl::shapes::Line<float>, LineStyleClipper>::sdf(lineSegment, input.position.xy, thickness, outlineStyle.isRoadStyleFlag, clipper);
         }
 
         localAlpha = smoothstep(+globals.antiAliasingFactor, -globals.antiAliasingFactor, distance);

From ab0aa1231e1fd0eb24ade01a918f139a7c6f758a Mon Sep 17 00:00:00 2001
From: keptsecret <sorchon@gmail.com>
Date: Mon, 17 Mar 2025 13:56:17 +0700
Subject: [PATCH 087/529] fix for nan samples

---
 31_HLSLPathTracer/app_resources/hlsl/common.hlsl | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/31_HLSLPathTracer/app_resources/hlsl/common.hlsl b/31_HLSLPathTracer/app_resources/hlsl/common.hlsl
index 28261a634..d5cbbea81 100644
--- a/31_HLSLPathTracer/app_resources/hlsl/common.hlsl
+++ b/31_HLSLPathTracer/app_resources/hlsl/common.hlsl
@@ -588,7 +588,8 @@ struct Shape<PST_RECTANGLE>
                 if (solidAngle > numeric_limits<float>::min)
                 {
                     float32_t3 sph_sample = sphUv[0] * edge0 + sphUv[1] * edge1 + offset;
-                    L = nbl::hlsl::normalize(sph_sample - origin);
+                    L = sph_sample - origin;
+                    L = hlsl::mix<float32_t3>(nbl::hlsl::normalize(L), (float32_t3)0.0, hlsl::abs<float32_t3>(L) > (float32_t3)numeric_limits<float>::min); // TODO? sometimes L is vec3(0), find cause
                     pdf = 1.f / solidAngle;
                 }
                 else

From 96c7497430a12d064c932e53644b85a7b1984e1d Mon Sep 17 00:00:00 2001
From: keptsecret <sorchon@gmail.com>
Date: Mon, 17 Mar 2025 14:07:21 +0700
Subject: [PATCH 088/529] revert to intial scene settings

---
 31_HLSLPathTracer/main.cpp | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/31_HLSLPathTracer/main.cpp b/31_HLSLPathTracer/main.cpp
index b8e3ea044..46597d738 100644
--- a/31_HLSLPathTracer/main.cpp
+++ b/31_HLSLPathTracer/main.cpp
@@ -1347,10 +1347,10 @@ class HLSLComputePathtracer final : public examples::SimpleWindowedApplication,
 		float viewWidth = 10.f;
 		float camYAngle = 165.f / 180.f * 3.14159f;
 		float camXAngle = 32.f / 180.f * 3.14159f;
-		int PTPipline = E_LIGHT_GEOMETRY::ELG_RECTANGLE;
+		int PTPipline = E_LIGHT_GEOMETRY::ELG_SPHERE;
 		int renderMode = E_RENDER_MODE::ERM_HLSL;
 		int spp = 32;
-		int depth = 1;
+		int depth = 3;
 
 		bool m_firstFrame = true;
 		IGPUCommandBuffer::SClearColorValue clearColor = { .float32 = {0.f,0.f,0.f,1.f} };

From f96dfcc01a02cb3d3b80368386155e5c287f8f5c Mon Sep 17 00:00:00 2001
From: kevyuu <kevin.kayu@gmail.com>
Date: Mon, 17 Mar 2025 22:23:21 +0700
Subject: [PATCH 089/529] Use nbl::hlsl::_static_cast for converting
 MaterialPacked to Material and vice versa

---
 .../app_resources/common.hlsl                 | 64 ++++++++++++-------
 .../app_resources/raytrace.rahit.hlsl         |  2 +-
 .../app_resources/raytrace.rgen.hlsl          |  2 +-
 .../app_resources/raytrace_shadow.rahit.hlsl  |  2 +-
 71_RayTracingPipeline/main.cpp                |  4 +-
 5 files changed, 47 insertions(+), 27 deletions(-)

diff --git a/71_RayTracingPipeline/app_resources/common.hlsl b/71_RayTracingPipeline/app_resources/common.hlsl
index 6c052dff1..0b5f4b170 100644
--- a/71_RayTracingPipeline/app_resources/common.hlsl
+++ b/71_RayTracingPipeline/app_resources/common.hlsl
@@ -2,6 +2,7 @@
 #define RQG_COMMON_HLSL
 
 #include "nbl/builtin/hlsl/cpp_compat.hlsl"
+#include "nbl/builtin/hlsl/cpp_compat/basic.h"
 
 NBL_CONSTEXPR uint32_t WorkgroupSize = 16;
 NBL_CONSTEXPR uint32_t MAX_UNORM_10 = 1023;
@@ -67,28 +68,6 @@ struct MaterialPacked
     }
 };
 
-inline MaterialPacked packMaterial(Material material)
-{
-    MaterialPacked packed;
-    packed.ambient = packUnorm3x10(material.ambient);      
-    packed.diffuse = packUnorm3x10(material.diffuse);
-    packed.specular = packUnorm3x10(material.specular);      
-    packed.shininess = packUnorm22(material.shininess);
-    packed.alpha = packUnorm10(material.alpha);
-    return packed;
-}
-
-inline Material unpackMaterial(MaterialPacked packed)
-{
-    Material material;
-    material.ambient = unpackUnorm3x10(packed.ambient);
-    material.diffuse = unpackUnorm3x10(packed.diffuse);
-    material.specular = unpackUnorm3x10(packed.specular);
-    material.shininess = unpackUnorm22(packed.shininess);
-    material.alpha = unpackUnorm10(packed.alpha);
-    return material;
-}
-
 struct SProceduralGeomInfo
 {
     MaterialPacked material;
@@ -236,4 +215,45 @@ float32_t3 computeSpecular(Material mat, float32_t3 view_dir,
 }
 #endif
 
+namespace nbl
+{
+namespace hlsl
+{
+namespace impl
+{
+
+template<>
+struct static_cast_helper<Material, MaterialPacked>
+{
+    static inline Material cast(MaterialPacked packed)
+    {
+        Material material;
+        material.ambient = unpackUnorm3x10(packed.ambient);
+        material.diffuse = unpackUnorm3x10(packed.diffuse);
+        material.specular = unpackUnorm3x10(packed.specular);
+        material.shininess = unpackUnorm22(packed.shininess);
+        material.alpha = unpackUnorm10(packed.alpha);
+        return material;
+    }
+};
+
+template<>
+struct static_cast_helper<MaterialPacked, Material>
+{
+    static inline MaterialPacked cast(Material material)
+    {
+        MaterialPacked packed;
+        packed.ambient = packUnorm3x10(material.ambient);
+        packed.diffuse = packUnorm3x10(material.diffuse);
+        packed.specular = packUnorm3x10(material.specular);
+        packed.shininess = packUnorm22(material.shininess);
+        packed.alpha = packUnorm10(material.alpha);
+        return packed;
+    }
+};
+
+}
+}
+}
+
 #endif  // RQG_COMMON_HLSL
diff --git a/71_RayTracingPipeline/app_resources/raytrace.rahit.hlsl b/71_RayTracingPipeline/app_resources/raytrace.rahit.hlsl
index 2923e95d9..c499e0506 100644
--- a/71_RayTracingPipeline/app_resources/raytrace.rahit.hlsl
+++ b/71_RayTracingPipeline/app_resources/raytrace.rahit.hlsl
@@ -7,7 +7,7 @@ void main(inout PrimaryPayload payload, in BuiltInTriangleIntersectionAttributes
 {
     const int instID = InstanceID();
     const STriangleGeomInfo geom = vk::RawBufferLoad < STriangleGeomInfo > (pc.triangleGeomInfoBuffer + instID * sizeof(STriangleGeomInfo));
-    const Material material = unpackMaterial(geom.material);
+    const Material material = nbl::hlsl::_static_cast<Material>(geom.material);
     
     if (material.alpha > payload.alphaThreshold)
     {
diff --git a/71_RayTracingPipeline/app_resources/raytrace.rgen.hlsl b/71_RayTracingPipeline/app_resources/raytrace.rgen.hlsl
index c74774880..bd8f6dcba 100644
--- a/71_RayTracingPipeline/app_resources/raytrace.rgen.hlsl
+++ b/71_RayTracingPipeline/app_resources/raytrace.rgen.hlsl
@@ -72,7 +72,7 @@ void main()
 
         const float32_t3 worldPosition = pc.camPos + (camDirection * payload.rayDistance);
         const float32_t3 worldNormal = payload.worldNormal;
-        const Material material = unpackMaterial(payload.material);
+        const Material material = nbl::hlsl::_static_cast<Material>(payload.material);
         RayLight cLight;
         cLight.inHitPosition = worldPosition;
         CallShader(pc.light.type, cLight);
diff --git a/71_RayTracingPipeline/app_resources/raytrace_shadow.rahit.hlsl b/71_RayTracingPipeline/app_resources/raytrace_shadow.rahit.hlsl
index e76f1da55..88a9b79db 100644
--- a/71_RayTracingPipeline/app_resources/raytrace_shadow.rahit.hlsl
+++ b/71_RayTracingPipeline/app_resources/raytrace_shadow.rahit.hlsl
@@ -7,7 +7,7 @@ void main(inout OcclusionPayload payload, in BuiltInTriangleIntersectionAttribut
 {
     const int instID = InstanceID();
     const STriangleGeomInfo geom = vk::RawBufferLoad < STriangleGeomInfo > (pc.triangleGeomInfoBuffer + instID * sizeof(STriangleGeomInfo));
-    const Material material = unpackMaterial(geom.material);
+    const Material material = nbl::hlsl::_static_cast<Material>(geom.material);
     
     payload.attenuation = material.alpha * payload.attenuation;
     IgnoreHit();
diff --git a/71_RayTracingPipeline/main.cpp b/71_RayTracingPipeline/main.cpp
index 9a85ea423..363d3b59f 100644
--- a/71_RayTracingPipeline/main.cpp
+++ b/71_RayTracingPipeline/main.cpp
@@ -1253,7 +1253,7 @@ class RaytracingPipelineApp final : public examples::SimpleWindowedApplication,
           .vertexStride = cpuObject.data.inputParams.bindings[0].stride,
           .indexType = cpuObject.data.indexType,
           .indexCount = cpuObject.data.indexCount,
-          .material = packMaterial(cpuObject.material),
+          .material = hlsl::_static_cast<MaterialPacked>(cpuObject.material),
           .transform = cpuObject.transform,
           });
       }
@@ -1292,7 +1292,7 @@ class RaytracingPipelineApp final : public examples::SimpleWindowedApplication,
       {
         const auto middle_i = NumberOfProceduralGeometries / 2.0;
         SProceduralGeomInfo sphere = {
-          .material = packMaterial({
+          .material = hlsl::_static_cast<MaterialPacked>(Material{
             .ambient = {0.1, 0.05 * i, 0.1},
             .diffuse = {0.3, 0.2 * i, 0.3},
             .specular = {0.8, 0.8, 0.8},

From 88b3275c0e40b7b78a4e842a7fb493be572155af Mon Sep 17 00:00:00 2001
From: kevyuu <kevin.kayu@gmail.com>
Date: Mon, 17 Mar 2025 22:23:52 +0700
Subject: [PATCH 090/529] Fix create shader binding table to use the new span
 api

---
 71_RayTracingPipeline/main.cpp | 29 ++++++++++++++++++-----------
 1 file changed, 18 insertions(+), 11 deletions(-)

diff --git a/71_RayTracingPipeline/main.cpp b/71_RayTracingPipeline/main.cpp
index 363d3b59f..036acd510 100644
--- a/71_RayTracingPipeline/main.cpp
+++ b/71_RayTracingPipeline/main.cpp
@@ -3,6 +3,7 @@
 // For conditions of distribution and use, see copyright notice in nabla.h
 
 #include "common.hpp"
+#include "nbl/builtin/builtinResources.h"
 #include "nbl/ext/FullScreenTriangle/FullScreenTriangle.h"
 #include "nbl/builtin/hlsl/indirect_commands.hlsl"
 
@@ -377,12 +378,12 @@ class RaytracingPipelineApp final : public examples::SimpleWindowedApplication,
       hitGroups[getHitGroupIndex(EGT_PROCEDURAL, ERT_PRIMARY)] = {
         .closestHit = RTDS_SPHERE_CLOSEST_HIT,
         .anyHit = RTDS_ANYHIT_PRIMARY,
-        .intersectionShader = RTDS_INTERSECTION,
+        .intersection = RTDS_INTERSECTION,
       };
       hitGroups[getHitGroupIndex(EGT_PROCEDURAL, ERT_OCCLUSION)] = {
         .closestHit = RTDS_CLOSEST_HIT_SHADOW,
         .anyHit = RTDS_ANYHIT_SHADOW,
-        .intersectionShader = RTDS_INTERSECTION,
+        .intersection = RTDS_INTERSECTION,
       };
       shaderGroups.hits = hitGroups;
 
@@ -1335,9 +1336,15 @@ class RaytracingPipelineApp final : public examples::SimpleWindowedApplication,
     const auto handleSizeAligned = nbl::core::alignUp(handleSize, limits.shaderGroupHandleAlignment);
 
     auto& raygenRange = m_shaderBindingTable.raygenGroupRange;
+
     auto& hitRange = m_shaderBindingTable.hitGroupsRange;
+    const auto hitHandles = pipeline->getHitHandles();
+
     auto& missRange = m_shaderBindingTable.missGroupsRange;
+    const auto missHandles = pipeline->getMissHandles();
+
     auto& callableRange = m_shaderBindingTable.callableGroupsRange;
+    const auto callableHandles = pipeline->getCallableHandles();
 
     raygenRange = {
       .offset = 0,
@@ -1346,19 +1353,19 @@ class RaytracingPipelineApp final : public examples::SimpleWindowedApplication,
 
     missRange = {
       .offset = raygenRange.size,
-      .size = core::alignUp(pipeline->getMissGroupCount() * handleSizeAligned, limits.shaderGroupBaseAlignment),
+      .size = core::alignUp(missHandles.size() * handleSizeAligned, limits.shaderGroupBaseAlignment),
     };
     m_shaderBindingTable.missGroupsStride = handleSizeAligned;
 
     hitRange = {
       .offset = missRange.offset + missRange.size,
-      .size = core::alignUp(pipeline->getHitGroupCount() * handleSizeAligned, limits.shaderGroupBaseAlignment),
+      .size = core::alignUp(hitHandles.size() * handleSizeAligned, limits.shaderGroupBaseAlignment),
     };
     m_shaderBindingTable.hitGroupsStride = handleSizeAligned;
 
     callableRange = {
       .offset = hitRange.offset + hitRange.size,
-      .size = core::alignUp(pipeline->getCallableGroupCount() * handleSizeAligned, limits.shaderGroupBaseAlignment),
+      .size = core::alignUp(callableHandles.size() * handleSizeAligned, limits.shaderGroupBaseAlignment),
     };
     m_shaderBindingTable.callableGroupsStride = handleSizeAligned;
 
@@ -1374,25 +1381,25 @@ class RaytracingPipelineApp final : public examples::SimpleWindowedApplication,
 
     // copy miss region
     uint8_t* pMissData = pData + missRange.offset;
-    for (int32_t missIx = 0; missIx < pipeline->getMissGroupCount(); missIx++)
+    for (const auto& handle : missHandles)
     {
-      memcpy(pMissData, &pipeline->getMiss(missIx), handleSize);
+      memcpy(pMissData, &handle, handleSize);
       pMissData += m_shaderBindingTable.missGroupsStride;
     }
 
     // copy hit region
     uint8_t* pHitData = pData + hitRange.offset;
-    for (int32_t hitIx = 0; hitIx < pipeline->getHitGroupCount(); hitIx++)
+    for (const auto& handle : hitHandles)
     {
-      memcpy(pHitData, &pipeline->getHit(hitIx), handleSize);
+      memcpy(pHitData, &handle, handleSize);
       pHitData += m_shaderBindingTable.hitGroupsStride;
     }
 
     // copy callable region
     uint8_t* pCallableData = pData + callableRange.offset;
-    for (int32_t callableIx = 0; callableIx < pipeline->getCallableGroupCount(); callableIx++)
+    for (const auto& handle : callableHandles)
     {
-      memcpy(pCallableData, &pipeline->getCallable(callableIx), handleSize);
+      memcpy(pCallableData, &handle, handleSize);
       pCallableData += m_shaderBindingTable.callableGroupsStride;
     }
 

From cca8f7248bc63c4ba2fe786e9c77a594d02c99e9 Mon Sep 17 00:00:00 2001
From: kevyuu <kevin.kayu@gmail.com>
Date: Tue, 18 Mar 2025 21:10:05 +0700
Subject: [PATCH 091/529] Some optimization on ray tracing demo

---
 .../app_resources/raytrace.rgen.hlsl          | 22 +++++++------------
 1 file changed, 8 insertions(+), 14 deletions(-)

diff --git a/71_RayTracingPipeline/app_resources/raytrace.rgen.hlsl b/71_RayTracingPipeline/app_resources/raytrace.rgen.hlsl
index bd8f6dcba..fc6383dcf 100644
--- a/71_RayTracingPipeline/app_resources/raytrace.rgen.hlsl
+++ b/71_RayTracingPipeline/app_resources/raytrace.rgen.hlsl
@@ -2,6 +2,7 @@
 
 #include "nbl/builtin/hlsl/jit/device_capabilities.hlsl"
 #include "nbl/builtin/hlsl/random/xoroshiro.hlsl"
+#include "nbl/builtin/hlsl/random/pcg.hlsl"
 
 #include "nbl/builtin/hlsl/glsl_compat/core.hlsl"
 #include "nbl/builtin/hlsl/spirv_intrinsics/raytracing.hlsl"
@@ -15,13 +16,6 @@ static const float32_t3 s_clearColor = float32_t3(0.3, 0.3, 0.8);
 
 [[vk::binding(1, 0)]] RWTexture2D<float32_t4> colorImage;
 
-uint32_t pcgHash(uint32_t v)
-{
-    const uint32_t state = v * 747796405u + 2891336453u;
-    const uint32_t word = ((state >> ((state >> 28u) + 4u)) ^ state) * 277803737u;
-    return (word >> 22u) ^ word;
-}
-
 float32_t nextRandomUnorm(inout nbl::hlsl::Xoroshiro64StarStar rnd)
 {
     return float32_t(rnd()) / float32_t(0xFFFFFFFF);
@@ -34,8 +28,8 @@ void main()
     const uint32_t3 launchSize = DispatchRaysDimensions();
     const uint32_t2 coords = launchID.xy;
 
-    const uint32_t seed1 = pcgHash(pc.frameCounter);
-    const uint32_t seed2 = pcgHash(launchID.y * launchSize.x + launchID.x);
+    const uint32_t seed1 = nbl::hlsl::Pcg::construct(pc.frameCounter)();
+    const uint32_t seed2 = nbl::hlsl::Pcg::construct(launchID.y * launchSize.x + launchID.x)();
     nbl::hlsl::Xoroshiro64StarStar rnd = nbl::hlsl::Xoroshiro64StarStar::construct(uint32_t2(seed1, seed2));
 
     float32_t3 hitValues = float32_t3(0, 0, 0);
@@ -77,8 +71,6 @@ void main()
         cLight.inHitPosition = worldPosition;
         CallShader(pc.light.type, cLight);
 
-        const float32_t3 diffuse = computeDiffuse(material, cLight.outLightDir, worldNormal);
-        float32_t3 specular = float32_t3(0, 0, 0);
         float32_t attenuation = 1;
 
         if (dot(worldNormal, cLight.outLightDir) > 0)
@@ -95,12 +87,14 @@ void main()
             TraceRay(topLevelAS, shadowRayFlags, 0xFF, ERT_OCCLUSION, 0, EMT_OCCLUSION, rayDesc, occlusionPayload);
 
             attenuation = occlusionPayload.attenuation;
-            if (occlusionPayload.attenuation > 0)
+            if (occlusionPayload.attenuation > 0.0001)
             {
-                specular = computeSpecular(material, camDirection, cLight.outLightDir, worldNormal);
+                const float32_t3 diffuse = computeDiffuse(material, cLight.outLightDir, worldNormal);
+                const float32_t3 specular = computeSpecular(material, camDirection, cLight.outLightDir, worldNormal);
+                hitValues += (cLight.outIntensity * attenuation * (diffuse + specular));
             }
         }
-        hitValues += ((cLight.outIntensity * attenuation * (diffuse + specular)) + material.ambient);
+        hitValues += material.ambient;
     }
 
     const float32_t3 hitValue = hitValues / s_sampleCount;

From b483aa6b7474526bb6c89e05c1965ff43880cfde Mon Sep 17 00:00:00 2001
From: keptsecret <sorchon@gmail.com>
Date: Wed, 19 Mar 2025 10:42:37 +0700
Subject: [PATCH 092/529] better hlsl dispatch

---
 .../app_resources/hlsl/render.comp.hlsl       |  8 ++++---
 31_HLSLPathTracer/main.cpp                    | 21 +++++++++++--------
 2 files changed, 17 insertions(+), 12 deletions(-)

diff --git a/31_HLSLPathTracer/app_resources/hlsl/render.comp.hlsl b/31_HLSLPathTracer/app_resources/hlsl/render.comp.hlsl
index f8cf2ae22..b54f5721d 100644
--- a/31_HLSLPathTracer/app_resources/hlsl/render.comp.hlsl
+++ b/31_HLSLPathTracer/app_resources/hlsl/render.comp.hlsl
@@ -43,13 +43,15 @@
 
 using namespace nbl::hlsl;
 
-NBL_CONSTEXPR uint32_t WorkgroupSize = 32;
+NBL_CONSTEXPR uint32_t WorkgroupSize = 256;
 NBL_CONSTEXPR uint32_t MAX_DEPTH_LOG2 = 4;
 NBL_CONSTEXPR uint32_t MAX_SAMPLES_LOG2 = 10;
 
 int32_t2 getCoordinates()
 {
-    return int32_t2(glsl::gl_GlobalInvocationID().xy);
+    uint32_t width, height;
+    outImage.GetDimensions(width, height);
+    return int32_t2(glsl::gl_GlobalInvocationID().x % width, glsl::gl_GlobalInvocationID().x / width);
 }
 
 float32_t2 getTexCoords()
@@ -141,7 +143,7 @@ static const ext::Scene<light_type, bxdfnode_type> scene = ext::Scene<light_type
     lights, LIGHT_COUNT, bxdfs, BXDF_COUNT
 );
 
-[numthreads(WorkgroupSize, WorkgroupSize, 1)]
+[numthreads(WorkgroupSize, 1, 1)]
 void main(uint32_t3 threadID : SV_DispatchThreadID)
 {
     uint32_t width, height;
diff --git a/31_HLSLPathTracer/main.cpp b/31_HLSLPathTracer/main.cpp
index 46597d738..10889f37f 100644
--- a/31_HLSLPathTracer/main.cpp
+++ b/31_HLSLPathTracer/main.cpp
@@ -366,12 +366,12 @@ class HLSLComputePathtracer final : public examples::SimpleWindowedApplication,
 					options.stage = IShader::E_SHADER_STAGE::ESS_COMPUTE;	// should be compute
 					options.targetSpirvVersion = m_device->getPhysicalDevice()->getLimits().spirvVersion;
 					options.spirvOptimizer = nullptr;
-//#ifndef _NBL_DEBUG
-//					ISPIRVOptimizer::E_OPTIMIZER_PASS optPasses = ISPIRVOptimizer::EOP_STRIP_DEBUG_INFO;
-//					auto opt = make_smart_refctd_ptr<ISPIRVOptimizer>(std::span<ISPIRVOptimizer::E_OPTIMIZER_PASS>(&optPasses, 1));
-//					options.spirvOptimizer = opt.get();
-//#endif
-					options.debugInfoFlags |= IShaderCompiler::E_DEBUG_INFO_FLAGS::EDIF_SOURCE_BIT;
+#ifndef _NBL_DEBUG
+					ISPIRVOptimizer::E_OPTIMIZER_PASS optPasses = ISPIRVOptimizer::EOP_STRIP_DEBUG_INFO;
+					auto opt = make_smart_refctd_ptr<ISPIRVOptimizer>(std::span<ISPIRVOptimizer::E_OPTIMIZER_PASS>(&optPasses, 1));
+					options.spirvOptimizer = opt.get();
+#endif
+					options.debugInfoFlags = IShaderCompiler::E_DEBUG_INFO_FLAGS::EDIF_NONE;
 					options.preprocessorOptions.sourceIdentifier = source->getFilepathHint();
 					options.preprocessorOptions.logger = m_logger.get();
 					options.preprocessorOptions.includeFinder = compiler->getDefaultIncludeFinder();
@@ -418,8 +418,8 @@ class HLSLComputePathtracer final : public examples::SimpleWindowedApplication,
 							params.shader.shader = ptShader.get();
 							params.shader.entryPoint = "main";
 							params.shader.entries = nullptr;
-							params.shader.requireFullSubgroups = true;
-							params.shader.requiredSubgroupSize = static_cast<IGPUShader::SSpecInfo::SUBGROUP_SIZE>(5);
+							 params.shader.requireFullSubgroups = true;
+							 params.shader.requiredSubgroupSize = static_cast<IGPUShader::SSpecInfo::SUBGROUP_SIZE>(5);
 							if (!m_device->createComputePipelines(nullptr, { &params, 1 }, m_PTGLSLPipelines.data() + index))
 								return logFail("Failed to create GLSL compute pipeline!\n");
 						}
@@ -1068,7 +1068,10 @@ class HLSLComputePathtracer final : public examples::SimpleWindowedApplication,
 					cmdbuf->bindDescriptorSets(EPBP_COMPUTE, pipeline->getLayout(), 0u, 1u, &m_descriptorSet0.get());
 					cmdbuf->bindDescriptorSets(EPBP_COMPUTE, pipeline->getLayout(), 2u, 1u, &m_descriptorSet2.get());
 					cmdbuf->pushConstants(pipeline->getLayout(), IShader::E_SHADER_STAGE::ESS_COMPUTE, 0, sizeof(PTPushConstant), &pc);
-					cmdbuf->dispatch(1 + (WindowDimensions.x - 1) / DefaultWorkGroupSize, 1 + (WindowDimensions.y - 1) / DefaultWorkGroupSize, 1u);
+					if (renderMode == E_RENDER_MODE::ERM_HLSL)
+						cmdbuf->dispatch(1 + (WindowDimensions.x * WindowDimensions.y - 1) / 256u, 1u, 1u);
+					else
+						cmdbuf->dispatch(1 + (WindowDimensions.x - 1) / DefaultWorkGroupSize, 1 + (WindowDimensions.y - 1) / DefaultWorkGroupSize, 1u);
 				}
 
 				// TRANSITION m_outImgView to READ (because of descriptorSets0 -> ComputeShader Writes into the image)

From 773733d3bed7f17073ff02af29d16700767988a9 Mon Sep 17 00:00:00 2001
From: keptsecret <sorchon@gmail.com>
Date: Wed, 19 Mar 2025 15:23:06 +0700
Subject: [PATCH 093/529] refactor NEE to use templated light types and
 sampling

---
 .../app_resources/hlsl/common.hlsl            | 272 +-------
 .../hlsl/next_event_estimator.hlsl            | 579 ++++++++++++------
 .../app_resources/hlsl/pathtracer.hlsl        |  75 +--
 .../app_resources/hlsl/render.comp.hlsl       |  22 +-
 .../app_resources/hlsl/scene.hlsl             | 139 -----
 31_HLSLPathTracer/main.cpp                    |   9 +-
 6 files changed, 413 insertions(+), 683 deletions(-)

diff --git a/31_HLSLPathTracer/app_resources/hlsl/common.hlsl b/31_HLSLPathTracer/app_resources/hlsl/common.hlsl
index d5cbbea81..dea682c8b 100644
--- a/31_HLSLPathTracer/app_resources/hlsl/common.hlsl
+++ b/31_HLSLPathTracer/app_resources/hlsl/common.hlsl
@@ -35,6 +35,7 @@ struct Payload
 
 enum ProceduralShapeType : uint16_t
 {
+    PST_NONE = 0,
     PST_SPHERE,
     PST_TRIANGLE,
     PST_RECTANGLE
@@ -173,33 +174,6 @@ enum PTPolygonMethod : uint16_t
     PPM_APPROX_PROJECTED_SOLID_ANGLE
 };
 
-// namespace Intersector
-// {
-// // ray query method
-// // ray query struct holds AS info
-// // pass in address to vertex/index buffers?
-
-// // ray tracing pipeline method
-
-// // procedural data store: [obj count] [intersect type] [obj1] [obj2] [...]
-
-// struct IntersectData
-// {
-//     enum Mode : uint32_t    // enum class?
-//     {
-//         RAY_QUERY,
-//         RAY_TRACING,
-//         PROCEDURAL
-//     };
-
-//     NBL_CONSTEXPR_STATIC_INLINE uint32_t DataSize = 128;
-
-//     uint32_t mode : 2;
-//     uint32_t unused : 30;   // possible space for flags
-//     uint32_t data[DataSize];
-// };
-// }
-
 enum IntersectMode : uint32_t
 {
     IM_RAY_QUERY,
@@ -207,20 +181,6 @@ enum IntersectMode : uint32_t
     IM_PROCEDURAL
 };
 
-namespace NextEventEstimator
-{
-// procedural data store: [light count] [event type] [obj]
-
-struct Event
-{
-    NBL_CONSTEXPR_STATIC_INLINE uint32_t DataSize = 16;
-
-    uint32_t mode : 2;
-    uint32_t unused : 30;   // possible space for flags
-    uint32_t data[DataSize];
-};
-}
-
 template<ProceduralShapeType type>
 struct Shape;
 
@@ -269,45 +229,6 @@ struct Shape<PST_SPHERE>
         return 2.0 * numbers::pi<float> * (1.0 - cosThetaMax);
     }
 
-    template<typename Ray>
-    float deferredPdf(NBL_CONST_REF_ARG(Ray) ray)
-    {
-        return 1.0 / getSolidAngle(ray.origin);
-    }
-
-    template<class Aniso>
-    float32_t3 generate_and_pdf(NBL_REF_ARG(float32_t) pdf, NBL_REF_ARG(float32_t) newRayMaxT, NBL_CONST_REF_ARG(float32_t3) origin, NBL_CONST_REF_ARG(Aniso) interaction, bool isBSDF, NBL_CONST_REF_ARG(float32_t3) xi)
-    {
-        float32_t3 Z = position - origin;
-        const float distanceSQ = hlsl::dot<float32_t3>(Z,Z);
-        const float cosThetaMax2 = 1.0 - radius2 / distanceSQ;
-        if (cosThetaMax2 > 0.0)
-        {
-            const float rcpDistance = 1.0 / hlsl::sqrt<float32_t>(distanceSQ);
-            Z *= rcpDistance;
-
-            const float cosThetaMax = hlsl::sqrt<float32_t>(cosThetaMax2);
-            const float cosTheta = hlsl::mix<float>(1.0, cosThetaMax, xi.x);
-
-            float32_t3 L = Z * cosTheta;
-
-            const float cosTheta2 = cosTheta * cosTheta;
-            const float sinTheta = hlsl::sqrt<float32_t>(1.0 - cosTheta2);
-            float sinPhi, cosPhi;
-            math::sincos<float>(2.0 * numbers::pi<float> * xi.y - numbers::pi<float>, sinPhi, cosPhi);
-            float32_t3 X, Y;
-            math::frisvad<float32_t3>(Z, X, Y);
-
-            L += (X * cosPhi + Y * sinPhi) * sinTheta;
-
-            newRayMaxT = (cosTheta - hlsl::sqrt<float32_t>(cosTheta2 - cosThetaMax2)) / rcpDistance;
-            pdf = 1.0 / (2.0 * numbers::pi<float> * (1.0 - cosThetaMax));
-            return L;
-        }
-        pdf = 0.0;
-        return float32_t3(0.0,0.0,0.0);
-    }
-
     NBL_CONSTEXPR_STATIC_INLINE uint32_t ObjSize = 5;
 
     float32_t3 position;
@@ -361,100 +282,6 @@ struct Shape<PST_TRIANGLE>
         return hlsl::cross<float32_t3>(edges[0], edges[1]) * 0.5f;
     }
 
-    template<typename Ray>
-    float deferredPdf(NBL_CONST_REF_ARG(Ray) ray)
-    {
-        const float32_t3 L = ray.direction;
-        switch (polygonMethod)
-        {
-            case PPM_AREA:
-            {
-                const float dist = ray.intersectionT;
-                const float32_t3 L = ray.direction;
-                return dist * dist / hlsl::abs<float32_t>(hlsl::dot<float32_t3>(getNormalTimesArea(), L));
-            }
-            break;
-            case PPM_SOLID_ANGLE:
-            {
-                shapes::SphericalTriangle<float> st = shapes::SphericalTriangle<float>::create(vertex0, vertex1, vertex2, ray.origin);
-                const float rcpProb = st.solidAngleOfTriangle();
-                // if `rcpProb` is NAN then the triangle's solid angle was close to 0.0
-                return rcpProb > numeric_limits<float>::min ? (1.0 / rcpProb) : numeric_limits<float>::max;
-            }
-            break;
-            case PPM_APPROX_PROJECTED_SOLID_ANGLE:
-            {
-                shapes::SphericalTriangle<float> st = shapes::SphericalTriangle<float>::create(vertex0, vertex1, vertex2, ray.origin);
-                sampling::ProjectedSphericalTriangle<float> pst = sampling::ProjectedSphericalTriangle<float>::create(st);
-                const float pdf = pst.pdf(ray.normalAtOrigin, ray.wasBSDFAtOrigin, L);
-                // if `pdf` is NAN then the triangle's projected solid angle was close to 0.0, if its close to INF then the triangle was very small
-                return pdf < numeric_limits<float>::max ? pdf : numeric_limits<float>::max;
-            }
-            break;
-            default:
-                return 0.0;
-        }
-    }
-
-    template<class Aniso>
-    float32_t3 generate_and_pdf(NBL_REF_ARG(float32_t) pdf, NBL_REF_ARG(float32_t) newRayMaxT, NBL_CONST_REF_ARG(float32_t3) origin, NBL_CONST_REF_ARG(Aniso) interaction, bool isBSDF, float32_t3 xi)
-    {
-        switch(polygonMethod)
-        {
-            case PPM_AREA:
-            {
-                const float32_t3 edge0 = vertex1 - vertex0;
-                const float32_t3 edge1 = vertex2 - vertex0;
-                const float sqrtU = hlsl::sqrt<float32_t>(xi.x);
-                float32_t3 pnt = vertex0 + edge0 * (1.0 - sqrtU) + edge1 * sqrtU * xi.y;
-                float32_t3 L = pnt - origin;
-
-                const float distanceSq = hlsl::dot<float32_t3>(L,L);
-                const float rcpDistance = 1.0 / hlsl::sqrt<float32_t>(distanceSq);
-                L *= rcpDistance;
-
-                pdf = distanceSq / hlsl::abs<float32_t>(hlsl::dot<float32_t3>(hlsl::cross<float32_t3>(edge0, edge1) * 0.5f, L));
-                newRayMaxT = 1.0 / rcpDistance;
-                return L;
-            }
-            break;
-            case PPM_SOLID_ANGLE:
-            {
-                float rcpPdf;
-
-                shapes::SphericalTriangle<float> st = shapes::SphericalTriangle<float>::create(vertex0, vertex1, vertex2, origin);
-                sampling::SphericalTriangle<float> sst = sampling::SphericalTriangle<float>::create(st);
-
-                const float32_t3 L = sst.generate(rcpPdf, xi.xy);
-
-                pdf = rcpPdf > numeric_limits<float>::min ? (1.0 / rcpPdf) : numeric_limits<float>::max;
-
-                const float32_t3 N = getNormalTimesArea();
-                newRayMaxT = hlsl::dot<float32_t3>(N, vertex0 - origin) / hlsl::dot<float32_t3>(N, L);
-                return L;
-            }
-            break;
-            case PPM_APPROX_PROJECTED_SOLID_ANGLE:
-            {
-                float rcpPdf;
-
-                shapes::SphericalTriangle<float> st = shapes::SphericalTriangle<float>::create(vertex0, vertex1, vertex2, origin);
-                sampling::ProjectedSphericalTriangle<float> sst = sampling::ProjectedSphericalTriangle<float>::create(st);
-
-                const float32_t3 L = sst.generate(rcpPdf, interaction.isotropic.N, isBSDF, xi.xy);
-
-                pdf = rcpPdf > numeric_limits<float>::min ? (1.0 / rcpPdf) : numeric_limits<float>::max;
-
-                const float32_t3 N = getNormalTimesArea();
-                newRayMaxT = hlsl::dot<float32_t3>(N, vertex0 - origin) / hlsl::dot<float32_t3>(N, L);
-                return L;
-            }
-            break;
-            default:
-                return (float32_t3)0.0;
-        }
-    }
-
     NBL_CONSTEXPR_STATIC_INLINE uint32_t ObjSize = 10;
 
     float32_t3 vertex0;
@@ -515,103 +342,6 @@ struct Shape<PST_RECTANGLE>
         basis[2] = normalize(cross(basis[0],basis[1]));
     }
 
-    template<typename Ray>
-    float deferredPdf(NBL_CONST_REF_ARG(Ray) ray)
-    {
-        switch (polygonMethod)
-        {
-            case PPM_AREA:
-            {
-                const float dist = ray.intersectionT;
-                const float32_t3 L = ray.direction;
-                return dist * dist / hlsl::abs<float32_t>(hlsl::dot<float32_t3>(getNormalTimesArea(), L));
-            }
-            break;
-            // #ifdef TRIANGLE_REFERENCE ?
-            case PPM_SOLID_ANGLE:
-            {
-                float pdf;
-                float32_t3x3 rectNormalBasis;
-                float32_t2 rectExtents;
-                getNormalBasis(rectNormalBasis, rectExtents);
-                shapes::SphericalRectangle<float> sphR0 = shapes::SphericalRectangle<float>::create(ray.origin, offset, rectNormalBasis);
-                float solidAngle = sphR0.solidAngleOfRectangle(rectExtents);
-                if (solidAngle > numeric_limits<float>::min)
-                    pdf = 1.f / solidAngle;
-                else
-                    pdf = bit_cast<float>(numeric_limits<float>::infinity);
-                return pdf;
-            }
-            break;
-            case PPM_APPROX_PROJECTED_SOLID_ANGLE:
-            {
-                // currently broken
-                return bit_cast<float>(numeric_limits<float>::infinity);
-            }
-            break;
-            default:
-                return bit_cast<float>(numeric_limits<float>::infinity);
-        }
-    }
-
-    template<class Aniso>
-    float32_t3 generate_and_pdf(NBL_REF_ARG(float32_t) pdf, NBL_REF_ARG(float32_t) newRayMaxT, NBL_CONST_REF_ARG(float32_t3) origin, NBL_CONST_REF_ARG(Aniso) interaction, bool isBSDF, float32_t3 xi)
-    {
-        const float32_t3 N = getNormalTimesArea();
-        const float32_t3 origin2origin = offset - origin;
-
-        switch (polygonMethod)
-        {
-            case PPM_AREA:
-            {
-                float32_t3 L = origin2origin + edge0 * xi.x + edge1 * xi.y;
-                const float distSq = hlsl::dot<float32_t3>(L, L);
-                const float rcpDist = 1.0 / hlsl::sqrt<float32_t>(distSq);
-                L *= rcpDist;
-                pdf = distSq / hlsl::abs<float32_t>(hlsl::dot<float32_t3>(N, L));
-                newRayMaxT = 1.0 / rcpDist;
-                return L;
-            }
-            break;
-            // #ifdef TRIANGLE_REFERENCE ?
-            case PPM_SOLID_ANGLE:
-            {
-                float32_t3x3 rectNormalBasis;
-                float32_t2 rectExtents;
-                getNormalBasis(rectNormalBasis, rectExtents);
-                shapes::SphericalRectangle<float> sphR0 = shapes::SphericalRectangle<float>::create(origin, offset, rectNormalBasis);
-                float32_t3 L = (float32_t3)0.0;
-                float solidAngle = sphR0.solidAngleOfRectangle(rectExtents);
-
-                sampling::SphericalRectangle<float> ssph = sampling::SphericalRectangle<float>::create(sphR0);
-                float32_t2 sphUv = ssph.generate(rectExtents, xi.xy, solidAngle);
-                if (solidAngle > numeric_limits<float>::min)
-                {
-                    float32_t3 sph_sample = sphUv[0] * edge0 + sphUv[1] * edge1 + offset;
-                    L = sph_sample - origin;
-                    L = hlsl::mix<float32_t3>(nbl::hlsl::normalize(L), (float32_t3)0.0, hlsl::abs<float32_t3>(L) > (float32_t3)numeric_limits<float>::min); // TODO? sometimes L is vec3(0), find cause
-                    pdf = 1.f / solidAngle;
-                }
-                else
-                    pdf = bit_cast<float>(numeric_limits<float>::infinity);
-
-                newRayMaxT = hlsl::dot<float32_t3>(N, origin2origin) / hlsl::dot<float32_t3>(N, L);
-                return L;
-            }
-            break;
-            case PPM_APPROX_PROJECTED_SOLID_ANGLE:
-            {
-                // currently broken
-                pdf = bit_cast<float>(numeric_limits<float>::infinity);
-                return (float32_t3)0.0;
-            }
-            break;
-            default:
-                pdf = bit_cast<float>(numeric_limits<float>::infinity);
-                return (float32_t3)0.0;
-        }
-    }
-
     NBL_CONSTEXPR_STATIC_INLINE uint32_t ObjSize = 10;
 
     float32_t3 offset;
diff --git a/31_HLSLPathTracer/app_resources/hlsl/next_event_estimator.hlsl b/31_HLSLPathTracer/app_resources/hlsl/next_event_estimator.hlsl
index 9c41f6627..7c157aadf 100644
--- a/31_HLSLPathTracer/app_resources/hlsl/next_event_estimator.hlsl
+++ b/31_HLSLPathTracer/app_resources/hlsl/next_event_estimator.hlsl
@@ -12,220 +12,425 @@ namespace ext
 namespace NextEventEstimator
 {
 
-template<typename Light, typename Ray, class LightSample, class Aniso>
-struct Estimator
+template<ProceduralShapeType PST, PTPolygonMethod PPM>
+struct ShapeSampling;
+
+template<PTPolygonMethod PPM>
+struct ShapeSampling<PST_SPHERE, PPM>
+{
+    static ShapeSampling<PST_SPHERE, PPM> create(NBL_CONST_REF_ARG(Shape<PST_SPHERE>) sphere)
+    {
+        ShapeSampling<PST_SPHERE, PPM> retval;
+        retval.sphere = sphere;
+        return retval;
+    }
+
+    template<typename Ray>
+    float deferredPdf(NBL_CONST_REF_ARG(Ray) ray)
+    {
+        return 1.0 / sphere.getSolidAngle(ray.origin);
+    }
+
+    template<class Aniso>
+    float32_t3 generate_and_pdf(NBL_REF_ARG(float32_t) pdf, NBL_REF_ARG(float32_t) newRayMaxT, NBL_CONST_REF_ARG(float32_t3) origin, NBL_CONST_REF_ARG(Aniso) interaction, bool isBSDF, NBL_CONST_REF_ARG(float32_t3) xi)
+    {
+        float32_t3 Z = sphere.position - origin;
+        const float distanceSQ = hlsl::dot<float32_t3>(Z,Z);
+        const float cosThetaMax2 = 1.0 - sphere.radius2 / distanceSQ;
+        if (cosThetaMax2 > 0.0)
+        {
+            const float rcpDistance = 1.0 / hlsl::sqrt<float32_t>(distanceSQ);
+            Z *= rcpDistance;
+
+            const float cosThetaMax = hlsl::sqrt<float32_t>(cosThetaMax2);
+            const float cosTheta = hlsl::mix<float>(1.0, cosThetaMax, xi.x);
+
+            float32_t3 L = Z * cosTheta;
+
+            const float cosTheta2 = cosTheta * cosTheta;
+            const float sinTheta = hlsl::sqrt<float32_t>(1.0 - cosTheta2);
+            float sinPhi, cosPhi;
+            math::sincos<float>(2.0 * numbers::pi<float> * xi.y - numbers::pi<float>, sinPhi, cosPhi);
+            float32_t3 X, Y;
+            math::frisvad<float32_t3>(Z, X, Y);
+
+            L += (X * cosPhi + Y * sinPhi) * sinTheta;
+
+            newRayMaxT = (cosTheta - hlsl::sqrt<float32_t>(cosTheta2 - cosThetaMax2)) / rcpDistance;
+            pdf = 1.0 / (2.0 * numbers::pi<float> * (1.0 - cosThetaMax));
+            return L;
+        }
+        pdf = 0.0;
+        return float32_t3(0.0,0.0,0.0);
+    }
+
+    Shape<PST_SPHERE> sphere;
+};
+
+template<>
+struct ShapeSampling<PST_TRIANGLE, PPM_AREA>
+{
+    static ShapeSampling<PST_TRIANGLE, PPM_AREA> create(NBL_CONST_REF_ARG(Shape<PST_TRIANGLE>) tri)
+    {
+        ShapeSampling<PST_TRIANGLE, PPM_AREA> retval;
+        retval.tri = tri;
+        return retval;
+    }
+
+    template<typename Ray>
+    float deferredPdf(NBL_CONST_REF_ARG(Ray) ray)
+    {
+        const float dist = ray.intersectionT;
+        const float32_t3 L = ray.direction;
+        return dist * dist / hlsl::abs<float32_t>(hlsl::dot<float32_t3>(tri.getNormalTimesArea(), L));
+    }
+
+    template<class Aniso>
+    float32_t3 generate_and_pdf(NBL_REF_ARG(float32_t) pdf, NBL_REF_ARG(float32_t) newRayMaxT, NBL_CONST_REF_ARG(float32_t3) origin, NBL_CONST_REF_ARG(Aniso) interaction, bool isBSDF, NBL_CONST_REF_ARG(float32_t3) xi)
+    {
+        const float32_t3 edge0 = tri.vertex1 - tri.vertex0;
+        const float32_t3 edge1 = tri.vertex2 - tri.vertex0;
+        const float sqrtU = hlsl::sqrt<float32_t>(xi.x);
+        float32_t3 pnt = tri.vertex0 + edge0 * (1.0 - sqrtU) + edge1 * sqrtU * xi.y;
+        float32_t3 L = pnt - origin;
+
+        const float distanceSq = hlsl::dot<float32_t3>(L,L);
+        const float rcpDistance = 1.0 / hlsl::sqrt<float32_t>(distanceSq);
+        L *= rcpDistance;
+
+        pdf = distanceSq / hlsl::abs<float32_t>(hlsl::dot<float32_t3>(hlsl::cross<float32_t3>(edge0, edge1) * 0.5f, L));
+        newRayMaxT = 1.0 / rcpDistance;
+        return L;
+    }
+
+    Shape<PST_TRIANGLE> tri;
+};
+
+template<>
+struct ShapeSampling<PST_TRIANGLE, PPM_SOLID_ANGLE>
+{
+    static ShapeSampling<PST_TRIANGLE, PPM_SOLID_ANGLE> create(NBL_CONST_REF_ARG(Shape<PST_TRIANGLE>) tri)
+    {
+        ShapeSampling<PST_TRIANGLE, PPM_SOLID_ANGLE> retval;
+        retval.tri = tri;
+        return retval;
+    }
+
+    template<typename Ray>
+    float deferredPdf(NBL_CONST_REF_ARG(Ray) ray)
+    {
+        shapes::SphericalTriangle<float> st = shapes::SphericalTriangle<float>::create(tri.vertex0, tri.vertex1, tri.vertex2, ray.origin);
+        const float rcpProb = st.solidAngleOfTriangle();
+        // if `rcpProb` is NAN then the triangle's solid angle was close to 0.0
+        return rcpProb > numeric_limits<float>::min ? (1.0 / rcpProb) : numeric_limits<float>::max;
+    }
+
+    template<class Aniso>
+    float32_t3 generate_and_pdf(NBL_REF_ARG(float32_t) pdf, NBL_REF_ARG(float32_t) newRayMaxT, NBL_CONST_REF_ARG(float32_t3) origin, NBL_CONST_REF_ARG(Aniso) interaction, bool isBSDF, NBL_CONST_REF_ARG(float32_t3) xi)
+    {
+        float rcpPdf;
+        shapes::SphericalTriangle<float> st = shapes::SphericalTriangle<float>::create(tri.vertex0, tri.vertex1, tri.vertex2, origin);
+        sampling::SphericalTriangle<float> sst = sampling::SphericalTriangle<float>::create(st);
+
+        const float32_t3 L = sst.generate(rcpPdf, xi.xy);
+
+        pdf = rcpPdf > numeric_limits<float>::min ? (1.0 / rcpPdf) : numeric_limits<float>::max;
+
+        const float32_t3 N = tri.getNormalTimesArea();
+        newRayMaxT = hlsl::dot<float32_t3>(N, tri.vertex0 - origin) / hlsl::dot<float32_t3>(N, L);
+        return L;
+    }
+
+    Shape<PST_TRIANGLE> tri;
+};
+
+template<>
+struct ShapeSampling<PST_TRIANGLE, PPM_APPROX_PROJECTED_SOLID_ANGLE>
+{
+    static ShapeSampling<PST_TRIANGLE, PPM_APPROX_PROJECTED_SOLID_ANGLE> create(NBL_CONST_REF_ARG(Shape<PST_TRIANGLE>) tri)
+    {
+        ShapeSampling<PST_TRIANGLE, PPM_APPROX_PROJECTED_SOLID_ANGLE> retval;
+        retval.tri = tri;
+        return retval;
+    }
+
+    template<typename Ray>
+    float deferredPdf(NBL_CONST_REF_ARG(Ray) ray)
+    {
+        const float32_t3 L = ray.direction;
+        shapes::SphericalTriangle<float> st = shapes::SphericalTriangle<float>::create(tri.vertex0, tri.vertex1, tri.vertex2, ray.origin);
+        sampling::ProjectedSphericalTriangle<float> pst = sampling::ProjectedSphericalTriangle<float>::create(st);
+        const float pdf = pst.pdf(ray.normalAtOrigin, ray.wasBSDFAtOrigin, L);
+        // if `pdf` is NAN then the triangle's projected solid angle was close to 0.0, if its close to INF then the triangle was very small
+        return pdf < numeric_limits<float>::max ? pdf : numeric_limits<float>::max;
+    }
+
+    template<class Aniso>
+    float32_t3 generate_and_pdf(NBL_REF_ARG(float32_t) pdf, NBL_REF_ARG(float32_t) newRayMaxT, NBL_CONST_REF_ARG(float32_t3) origin, NBL_CONST_REF_ARG(Aniso) interaction, bool isBSDF, NBL_CONST_REF_ARG(float32_t3) xi)
+    {
+        float rcpPdf;
+        shapes::SphericalTriangle<float> st = shapes::SphericalTriangle<float>::create(tri.vertex0, tri.vertex1, tri.vertex2, origin);
+        sampling::ProjectedSphericalTriangle<float> sst = sampling::ProjectedSphericalTriangle<float>::create(st);
+
+        const float32_t3 L = sst.generate(rcpPdf, interaction.isotropic.N, isBSDF, xi.xy);
+
+        pdf = rcpPdf > numeric_limits<float>::min ? (1.0 / rcpPdf) : numeric_limits<float>::max;
+
+        const float32_t3 N = tri.getNormalTimesArea();
+        newRayMaxT = hlsl::dot<float32_t3>(N, tri.vertex0 - origin) / hlsl::dot<float32_t3>(N, L);
+        return L;
+    }
+
+    Shape<PST_TRIANGLE> tri;
+};
+
+template<>
+struct ShapeSampling<PST_RECTANGLE, PPM_AREA>
+{
+    static ShapeSampling<PST_RECTANGLE, PPM_AREA> create(NBL_CONST_REF_ARG(Shape<PST_RECTANGLE>) rect)
+    {
+        ShapeSampling<PST_RECTANGLE, PPM_AREA> retval;
+        retval.rect = rect;
+        return retval;
+    }
+
+    template<typename Ray>
+    float deferredPdf(NBL_CONST_REF_ARG(Ray) ray)
+    {
+        const float dist = ray.intersectionT;
+        const float32_t3 L = ray.direction;
+        return dist * dist / hlsl::abs<float32_t>(hlsl::dot<float32_t3>(rect.getNormalTimesArea(), L));
+    }
+
+    template<class Aniso>
+    float32_t3 generate_and_pdf(NBL_REF_ARG(float32_t) pdf, NBL_REF_ARG(float32_t) newRayMaxT, NBL_CONST_REF_ARG(float32_t3) origin, NBL_CONST_REF_ARG(Aniso) interaction, bool isBSDF, NBL_CONST_REF_ARG(float32_t3) xi)
+    {
+        const float32_t3 N = rect.getNormalTimesArea();
+        const float32_t3 origin2origin = rect.offset - origin;
+
+        float32_t3 L = origin2origin + rect.edge0 * xi.x + rect.edge1 * xi.y;
+        const float distSq = hlsl::dot<float32_t3>(L, L);
+        const float rcpDist = 1.0 / hlsl::sqrt<float32_t>(distSq);
+        L *= rcpDist;
+        pdf = distSq / hlsl::abs<float32_t>(hlsl::dot<float32_t3>(N, L));
+        newRayMaxT = 1.0 / rcpDist;
+        return L;
+    }
+
+    Shape<PST_RECTANGLE> rect;
+};
+
+template<>
+struct ShapeSampling<PST_RECTANGLE, PPM_SOLID_ANGLE>
+{
+    static ShapeSampling<PST_RECTANGLE, PPM_SOLID_ANGLE> create(NBL_CONST_REF_ARG(Shape<PST_RECTANGLE>) rect)
+    {
+        ShapeSampling<PST_RECTANGLE, PPM_SOLID_ANGLE> retval;
+        retval.rect = rect;
+        return retval;
+    }
+
+    template<typename Ray>
+    float deferredPdf(NBL_CONST_REF_ARG(Ray) ray)
+    {
+        float pdf;
+        float32_t3x3 rectNormalBasis;
+        float32_t2 rectExtents;
+        rect.getNormalBasis(rectNormalBasis, rectExtents);
+        shapes::SphericalRectangle<float> sphR0 = shapes::SphericalRectangle<float>::create(ray.origin, rect.offset, rectNormalBasis);
+        float solidAngle = sphR0.solidAngleOfRectangle(rectExtents);
+        if (solidAngle > numeric_limits<float>::min)
+            pdf = 1.f / solidAngle;
+        else
+            pdf = bit_cast<float>(numeric_limits<float>::infinity);
+        return pdf;
+    }
+
+    template<class Aniso>
+    float32_t3 generate_and_pdf(NBL_REF_ARG(float32_t) pdf, NBL_REF_ARG(float32_t) newRayMaxT, NBL_CONST_REF_ARG(float32_t3) origin, NBL_CONST_REF_ARG(Aniso) interaction, bool isBSDF, NBL_CONST_REF_ARG(float32_t3) xi)
+    {
+        const float32_t3 N = rect.getNormalTimesArea();
+        const float32_t3 origin2origin = rect.offset - origin;
+
+        float32_t3x3 rectNormalBasis;
+        float32_t2 rectExtents;
+        rect.getNormalBasis(rectNormalBasis, rectExtents);
+        shapes::SphericalRectangle<float> sphR0 = shapes::SphericalRectangle<float>::create(origin, rect.offset, rectNormalBasis);
+        float32_t3 L = (float32_t3)0.0;
+        float solidAngle = sphR0.solidAngleOfRectangle(rectExtents);
+
+        sampling::SphericalRectangle<float> ssph = sampling::SphericalRectangle<float>::create(sphR0);
+        float32_t2 sphUv = ssph.generate(rectExtents, xi.xy, solidAngle);
+        if (solidAngle > numeric_limits<float>::min)
+        {
+            float32_t3 sph_sample = sphUv[0] * rect.edge0 + sphUv[1] * rect.edge1 + rect.offset;
+            L = sph_sample - origin;
+            L = hlsl::mix<float32_t3>(nbl::hlsl::normalize(L), (float32_t3)0.0, hlsl::abs<float32_t3>(L) > (float32_t3)numeric_limits<float>::min); // TODO? sometimes L is vec3(0), find cause
+            pdf = 1.f / solidAngle;
+        }
+        else
+            pdf = bit_cast<float>(numeric_limits<float>::infinity);
+
+        newRayMaxT = hlsl::dot<float32_t3>(N, origin2origin) / hlsl::dot<float32_t3>(N, L);
+        return L;
+    }
+
+    Shape<PST_RECTANGLE> rect;
+};
+
+// PPM_APPROX_PROJECTED_SOLID_ANGLE not available for PST_TRIANGLE
+
+
+template<class Scene, typename Ray, class LightSample, class Aniso, IntersectMode Mode, ProceduralShapeType PST, PTPolygonMethod PPM>
+struct Estimator;
+
+template<class Scene, typename Ray, class LightSample, class Aniso, PTPolygonMethod PPM>
+struct Estimator<Scene, Ray, LightSample, Aniso, IM_PROCEDURAL, PST_SPHERE, PPM>
 {
     using scalar_type = typename Ray::scalar_type;
     using vector3_type = vector<scalar_type, 3>;
     using ray_type = Ray;
-    using light_type = Light;
-    using spectral_type = typename Light::spectral_type;
+    using scene_type = Scene;
+    using light_type = typename Scene::light_type;
+    using spectral_type = typename light_type::spectral_type;
     using interaction_type = Aniso;
     using quotient_pdf_type = bxdf::quotient_and_pdf<spectral_type, scalar_type>;
     using sample_type = LightSample;
     using ray_dir_info_type = typename sample_type::ray_dir_info_type;
 
-    static spectral_type proceduralDeferredEvalAndPdf(NBL_REF_ARG(scalar_type) pdf, NBL_CONST_REF_ARG(light_type) light, NBL_CONST_REF_ARG(ray_type) ray, NBL_CONST_REF_ARG(Event) event)
+    static spectral_type deferredEvalAndPdf(NBL_REF_ARG(scalar_type) pdf, NBL_CONST_REF_ARG(scene_type) scene, uint32_t lightID, NBL_CONST_REF_ARG(ray_type) ray)
     {
-        const uint32_t lightCount = event.data[0];
-        const ProceduralShapeType type = (ProceduralShapeType)event.data[1];
-
-        pdf = 1.0 / lightCount;
-        switch (type)
-        {
-            case PST_SPHERE:
-            {
-                const vector3_type position = vector3_type(
-                    bit_cast<float32_t>(event.data[2]),
-                    bit_cast<float32_t>(event.data[3]),
-                    bit_cast<float32_t>(event.data[4]));
-                Shape<PST_SPHERE> sphere = Shape<PST_SPHERE>::create(position, bit_cast<float32_t>(event.data[5]), event.data[6]);
-                pdf *= sphere.template deferredPdf<ray_type>(ray);
-            }
-            break;
-            case PST_TRIANGLE:
-            {
-                const vector3_type vertex0 = vector3_type(
-                    bit_cast<float32_t>(event.data[2]),
-                    bit_cast<float32_t>(event.data[3]),
-                    bit_cast<float32_t>(event.data[4]));
-                const vector3_type vertex1 = vector3_type(
-                    bit_cast<float32_t>(event.data[5]), 
-                    bit_cast<float32_t>(event.data[6]),
-                    bit_cast<float32_t>(event.data[7]));
-                const vector3_type vertex2 = vector3_type(
-                    bit_cast<float32_t>(event.data[8]),
-                    bit_cast<float32_t>(event.data[9]),
-                    bit_cast<float32_t>(event.data[10]));
-                Shape<PST_TRIANGLE> tri = Shape<PST_TRIANGLE>::create(vertex0, vertex1, vertex2, event.data[11]);
-                pdf *= tri.template deferredPdf<ray_type>(ray);
-            }
-            break;
-            case PST_RECTANGLE:
-            {
-                const vector3_type offset = vector3_type(
-                    bit_cast<float32_t>(event.data[2]),
-                    bit_cast<float32_t>(event.data[3]),
-                    bit_cast<float32_t>(event.data[4]));
-                const vector3_type edge0 = vector3_type(
-                    bit_cast<float32_t>(event.data[5]),
-                    bit_cast<float32_t>(event.data[6]),
-                    bit_cast<float32_t>(event.data[7]));
-                const vector3_type edge1 = vector3_type(
-                    bit_cast<float32_t>(event.data[8]),
-                    bit_cast<float32_t>(event.data[9]),
-                    bit_cast<float32_t>(event.data[10]));
-                Shape<PST_RECTANGLE> rect = Shape<PST_RECTANGLE>::create(offset, edge0, edge1, event.data[11]);
-                pdf *= rect.template deferredPdf<ray_type>(ray);
-            }
-            break;
-            default:
-                pdf = bit_cast<float>(numeric_limits<float>::infinity);
-                break;
-        }
+        pdf = 1.0 / scene.lightCount;
+        const light_type light = scene.lights[lightID];
+        const Shape<PST_SPHERE> sphere = scene.spheres[light.objectID.id];
+        const ShapeSampling<PST_SPHERE, PPM> sampling = ShapeSampling<PST_SPHERE, PPM>::create(sphere);
+        pdf *= sampling.template deferredPdf<ray_type>(ray);
 
         return light.radiance;
     }
 
-    static spectral_type deferredEvalAndPdf(NBL_REF_ARG(scalar_type) pdf, NBL_CONST_REF_ARG(light_type) light, NBL_CONST_REF_ARG(ray_type) ray, NBL_CONST_REF_ARG(Event) event)
-    {
-        // const IntersectMode mode = (IntersectMode)event.mode;
-        // switch (mode)
-        // {
-        //     case IM_RAY_QUERY:
-        //     {
-        //         // TODO: do ray query stuff
-        //     }
-        //     break;
-        //     case IM_RAY_TRACING:
-        //     {
-        //         // TODO: do ray tracing stuff
-        //     }
-        //     break;
-        //     case IM_PROCEDURAL:
-        //     {
-                return proceduralDeferredEvalAndPdf(pdf, light, ray, event);
-        //     }
-        //     break;
-        //     default:
-        //         return (spectral_type)0.0;
-        // }
-        // return (spectral_type)0.0;
-    }
-
-    static sample_type procedural_generate_and_quotient_and_pdf(NBL_REF_ARG(quotient_pdf_type) quotient_pdf, NBL_REF_ARG(scalar_type) newRayMaxT, NBL_CONST_REF_ARG(light_type) light, NBL_CONST_REF_ARG(vector3_type) origin, NBL_CONST_REF_ARG(interaction_type) interaction, bool isBSDF, NBL_CONST_REF_ARG(vector3_type) xi, uint32_t depth, NBL_CONST_REF_ARG(Event) event)
-    {
-        const uint32_t lightCount = event.data[0];
-        const ProceduralShapeType type = (ProceduralShapeType)event.data[1];
+    static sample_type generate_and_quotient_and_pdf(NBL_REF_ARG(quotient_pdf_type) quotient_pdf, NBL_REF_ARG(scalar_type) newRayMaxT, NBL_CONST_REF_ARG(scene_type) scene, uint32_t lightID, NBL_CONST_REF_ARG(vector3_type) origin, NBL_CONST_REF_ARG(interaction_type) interaction, bool isBSDF, NBL_CONST_REF_ARG(vector3_type) xi, uint32_t depth)
+    {
+        sample_type L;
+        scalar_type pdf;
+
+        const light_type light = scene.lights[lightID];
+        const Shape<PST_SPHERE> sphere = scene.spheres[light.objectID.id];
+        const ShapeSampling<PST_SPHERE, PPM> sampling = ShapeSampling<PST_SPHERE, PPM>::create(sphere);
+
+        const vector3_type sampleL = sampling.template generate_and_pdf<interaction_type>(pdf, newRayMaxT, origin, interaction, isBSDF, xi);
+        const vector3_type V = interaction.isotropic.V.getDirection();
+        const scalar_type VdotL = nbl::hlsl::dot<vector3_type>(V, sampleL);
+        ray_dir_info_type rayL;
+        rayL.direction = sampleL;
+        L = sample_type::create(rayL,VdotL,interaction.T,interaction.B,interaction.isotropic.N);
+
+        newRayMaxT *= Tolerance<scalar_type>::getEnd(depth);
+        pdf *= 1.0 / scalar_type(scene.lightCount);
+        spectral_type quo = light.radiance / pdf;
+        quotient_pdf = quotient_pdf_type::create(quo, pdf);
+
+        return L;
+    }
+};
+
+template<class Scene, typename Ray, class LightSample, class Aniso, PTPolygonMethod PPM>
+struct Estimator<Scene, Ray, LightSample, Aniso, IM_PROCEDURAL, PST_TRIANGLE, PPM>
+{
+    using scalar_type = typename Ray::scalar_type;
+    using vector3_type = vector<scalar_type, 3>;
+    using ray_type = Ray;
+    using scene_type = Scene;
+    using light_type = typename Scene::light_type;
+    using spectral_type = typename light_type::spectral_type;
+    using interaction_type = Aniso;
+    using quotient_pdf_type = bxdf::quotient_and_pdf<spectral_type, scalar_type>;
+    using sample_type = LightSample;
+    using ray_dir_info_type = typename sample_type::ray_dir_info_type;
+
+    static spectral_type deferredEvalAndPdf(NBL_REF_ARG(scalar_type) pdf, NBL_CONST_REF_ARG(scene_type) scene, uint32_t lightID, NBL_CONST_REF_ARG(ray_type) ray)
+    {
+        pdf = 1.0 / scene.lightCount;
+        const light_type light = scene.lights[lightID];
+        const Shape<PST_TRIANGLE> tri = scene.triangles[light.objectID.id];
+        const ShapeSampling<PST_TRIANGLE, PPM> sampling = ShapeSampling<PST_TRIANGLE, PPM>::create(tri);
+        pdf *= sampling.template deferredPdf<ray_type>(ray);
+
+        return light.radiance;
+    }
 
+    static sample_type generate_and_quotient_and_pdf(NBL_REF_ARG(quotient_pdf_type) quotient_pdf, NBL_REF_ARG(scalar_type) newRayMaxT, NBL_CONST_REF_ARG(scene_type) scene, uint32_t lightID, NBL_CONST_REF_ARG(vector3_type) origin, NBL_CONST_REF_ARG(interaction_type) interaction, bool isBSDF, NBL_CONST_REF_ARG(vector3_type) xi, uint32_t depth)
+    {
         sample_type L;
         scalar_type pdf;
-        switch (type)
-        {
-            case PST_SPHERE:
-            {
-                const vector3_type position = vector3_type(
-                    bit_cast<float32_t>(event.data[2]),
-                    bit_cast<float32_t>(event.data[3]),
-                    bit_cast<float32_t>(event.data[4]));
-                Shape<PST_SPHERE> sphere = Shape<PST_SPHERE>::create(position, bit_cast<float32_t>(event.data[5]), event.data[6]);
-
-                const vector3_type sampleL = sphere.template generate_and_pdf<interaction_type>(pdf, newRayMaxT, origin, interaction, isBSDF, xi);
-                const vector3_type V = interaction.isotropic.V.getDirection();
-                const scalar_type VdotL = nbl::hlsl::dot<vector3_type>(V, sampleL);
-                ray_dir_info_type rayL;
-                rayL.direction = sampleL;
-                L = sample_type::create(rayL,VdotL,interaction.T,interaction.B,interaction.isotropic.N);
-            }
-            break;
-            case PST_TRIANGLE:
-            {
-                const vector3_type vertex0 = vector3_type(
-                    bit_cast<float32_t>(event.data[2]),
-                    bit_cast<float32_t>(event.data[3]),
-                    bit_cast<float32_t>(event.data[4]));
-                const vector3_type vertex1 = vector3_type(
-                    bit_cast<float32_t>(event.data[5]), 
-                    bit_cast<float32_t>(event.data[6]),
-                    bit_cast<float32_t>(event.data[7]));
-                const vector3_type vertex2 = vector3_type(
-                    bit_cast<float32_t>(event.data[8]),
-                    bit_cast<float32_t>(event.data[9]),
-                    bit_cast<float32_t>(event.data[10]));
-                Shape<PST_TRIANGLE> tri = Shape<PST_TRIANGLE>::create(vertex0, vertex1, vertex2, event.data[11]);
-
-                const vector3_type sampleL = tri.template generate_and_pdf<interaction_type>(pdf, newRayMaxT, origin, interaction, isBSDF, xi);
-                const vector3_type V = interaction.isotropic.V.getDirection();
-                const scalar_type VdotL = nbl::hlsl::dot<vector3_type>(V, sampleL);
-                ray_dir_info_type rayL;
-                rayL.direction = sampleL;
-                L = sample_type::create(rayL,VdotL,interaction.T,interaction.B,interaction.isotropic.N);
-            }
-            break;
-            case PST_RECTANGLE:
-            {
-                const vector3_type offset = vector3_type(
-                    bit_cast<float32_t>(event.data[2]),
-                    bit_cast<float32_t>(event.data[3]),
-                    bit_cast<float32_t>(event.data[4]));
-                const vector3_type edge0 = vector3_type(
-                    bit_cast<float32_t>(event.data[5]),
-                    bit_cast<float32_t>(event.data[6]),
-                    bit_cast<float32_t>(event.data[7]));
-                const vector3_type edge1 = vector3_type(
-                    bit_cast<float32_t>(event.data[8]),
-                    bit_cast<float32_t>(event.data[9]),
-                    bit_cast<float32_t>(event.data[10]));
-                Shape<PST_RECTANGLE> rect = Shape<PST_RECTANGLE>::create(offset, edge0, edge1, event.data[11]);
-
-                const vector3_type sampleL = rect.template generate_and_pdf<interaction_type>(pdf, newRayMaxT, origin, interaction, isBSDF, xi);
-                const vector3_type V = interaction.isotropic.V.getDirection();
-                const scalar_type VdotL = nbl::hlsl::dot<vector3_type>(V, sampleL);
-                ray_dir_info_type rayL;
-                rayL.direction = sampleL;
-                L = sample_type::create(rayL,VdotL,interaction.T,interaction.B,interaction.isotropic.N);
-            }
-            break;
-            default:
-                pdf = bit_cast<float>(numeric_limits<float>::infinity);
-                break;
-        }
+
+        const light_type light = scene.lights[lightID];
+        const Shape<PST_TRIANGLE> tri = scene.triangles[light.objectID.id];
+        const ShapeSampling<PST_TRIANGLE, PPM> sampling = ShapeSampling<PST_TRIANGLE, PPM>::create(tri);
+
+        const vector3_type sampleL = sampling.template generate_and_pdf<interaction_type>(pdf, newRayMaxT, origin, interaction, isBSDF, xi);
+        const vector3_type V = interaction.isotropic.V.getDirection();
+        const scalar_type VdotL = nbl::hlsl::dot<vector3_type>(V, sampleL);
+        ray_dir_info_type rayL;
+        rayL.direction = sampleL;
+        L = sample_type::create(rayL,VdotL,interaction.T,interaction.B,interaction.isotropic.N);
 
         newRayMaxT *= Tolerance<scalar_type>::getEnd(depth);
-        pdf *= 1.0 / scalar_type(lightCount);
+        pdf *= 1.0 / scalar_type(scene.lightCount);
         spectral_type quo = light.radiance / pdf;
         quotient_pdf = quotient_pdf_type::create(quo, pdf);
 
         return L;
     }
+};
+
+template<typename Scene, typename Ray, class LightSample, class Aniso, PTPolygonMethod PPM>
+struct Estimator<Scene, Ray, LightSample, Aniso, IM_PROCEDURAL, PST_RECTANGLE, PPM>
+{
+    using scalar_type = typename Ray::scalar_type;
+    using vector3_type = vector<scalar_type, 3>;
+    using ray_type = Ray;
+    using scene_type = Scene;
+    using light_type = typename Scene::light_type;
+    using spectral_type = typename light_type::spectral_type;
+    using interaction_type = Aniso;
+    using quotient_pdf_type = bxdf::quotient_and_pdf<spectral_type, scalar_type>;
+    using sample_type = LightSample;
+    using ray_dir_info_type = typename sample_type::ray_dir_info_type;
+
+    static spectral_type deferredEvalAndPdf(NBL_REF_ARG(scalar_type) pdf, NBL_CONST_REF_ARG(scene_type) scene, uint32_t lightID, NBL_CONST_REF_ARG(ray_type) ray)
+    {
+        pdf = 1.0 / scene.lightCount;
+        const light_type light = scene.lights[lightID];
+        const Shape<PST_RECTANGLE> rect = scene.rectangles[light.objectID.id];
+        const ShapeSampling<PST_RECTANGLE, PPM> sampling = ShapeSampling<PST_RECTANGLE, PPM>::create(rect);
+        pdf *= sampling.template deferredPdf<ray_type>(ray);
+
+        return light.radiance;
+    }
 
-    static sample_type generate_and_quotient_and_pdf(NBL_REF_ARG(quotient_pdf_type) quotient_pdf, NBL_REF_ARG(scalar_type) newRayMaxT, NBL_CONST_REF_ARG(light_type) light, NBL_CONST_REF_ARG(vector3_type) origin, NBL_CONST_REF_ARG(interaction_type) interaction, bool isBSDF, NBL_CONST_REF_ARG(vector3_type) xi, uint32_t depth, NBL_CONST_REF_ARG(Event) event)
+    static sample_type generate_and_quotient_and_pdf(NBL_REF_ARG(quotient_pdf_type) quotient_pdf, NBL_REF_ARG(scalar_type) newRayMaxT, NBL_CONST_REF_ARG(scene_type) scene, uint32_t lightID, NBL_CONST_REF_ARG(vector3_type) origin, NBL_CONST_REF_ARG(interaction_type) interaction, bool isBSDF, NBL_CONST_REF_ARG(vector3_type) xi, uint32_t depth)
     {
-        const IntersectMode mode = (IntersectMode)event.mode;
         sample_type L;
-        // switch (mode)
-        // {
-        //     case IM_RAY_QUERY:
-        //     {
-        //         // TODO: do ray query stuff
-        //     }
-        //     break;
-        //     case IM_RAY_TRACING:
-        //     {
-        //         // TODO: do ray tracing stuff
-        //     }
-        //     break;
-        //     case IM_PROCEDURAL:
-        //     {
-                return procedural_generate_and_quotient_and_pdf(quotient_pdf, newRayMaxT, light, origin, interaction, isBSDF, xi, depth, event);
-        //     }
-        //     break;
-        //     default:
-        //     {
-        //         return L;
-        //     }
-        // }
-        // return L;
+        scalar_type pdf;
+
+        const light_type light = scene.lights[lightID];
+        const Shape<PST_RECTANGLE> rect = scene.rectangles[light.objectID.id];
+        const ShapeSampling<PST_RECTANGLE, PPM> sampling = ShapeSampling<PST_RECTANGLE, PPM>::create(rect);
+
+        const vector3_type sampleL = sampling.template generate_and_pdf<interaction_type>(pdf, newRayMaxT, origin, interaction, isBSDF, xi);
+        const vector3_type V = interaction.isotropic.V.getDirection();
+        const scalar_type VdotL = nbl::hlsl::dot<vector3_type>(V, sampleL);
+        ray_dir_info_type rayL;
+        rayL.direction = sampleL;
+        L = sample_type::create(rayL,VdotL,interaction.T,interaction.B,interaction.isotropic.N);
+
+        newRayMaxT *= Tolerance<scalar_type>::getEnd(depth);
+        pdf *= 1.0 / scalar_type(scene.lightCount);
+        spectral_type quo = light.radiance / pdf;
+        quotient_pdf = quotient_pdf_type::create(quo, pdf);
+
+        return L;
     }
 };
 
diff --git a/31_HLSLPathTracer/app_resources/hlsl/pathtracer.hlsl b/31_HLSLPathTracer/app_resources/hlsl/pathtracer.hlsl
index 553094e21..3082e599e 100644
--- a/31_HLSLPathTracer/app_resources/hlsl/pathtracer.hlsl
+++ b/31_HLSLPathTracer/app_resources/hlsl/pathtracer.hlsl
@@ -71,14 +71,6 @@ struct Unidirectional
     using conductor_op_type = typename MaterialSystem::conductor_op_type;
     using dielectric_op_type = typename MaterialSystem::dielectric_op_type;
 
-    // static this_t create(RandGen randGen,
-    //                     RayGen rayGen,
-    //                     Intersector intersector,
-    //                     MaterialSystem materialSystem,
-    //                     /* PathGuider pathGuider, */
-    //                     NextEventEstimator nee)
-    // {}
-
     static this_t create(NBL_CONST_REF_ARG(PathTracerCreationParams<create_params_type, scalar_type>) params)
     {
         this_t retval;
@@ -139,7 +131,7 @@ struct Unidirectional
         if (lightID != light_type::INVALID_ID)
         {
             float _pdf;
-            ray.payload.accumulation += nee.deferredEvalAndPdf(_pdf, scene.lights[lightID], ray, scene.toNextEvent(lightID)) * throughput / (1.0 + _pdf * _pdf * ray.payload.otherTechniqueHeuristic);
+            ray.payload.accumulation += nee.deferredEvalAndPdf(_pdf, scene, lightID, ray) * throughput / (1.0 + _pdf * _pdf * ray.payload.otherTechniqueHeuristic);
         }
 
         const uint32_t bsdfID = glsl::bitfieldExtract(bsdfLightIDs, 0, 16);
@@ -174,8 +166,8 @@ struct Unidirectional
             scalar_type t;
             sample_type nee_sample = nee.generate_and_quotient_and_pdf(
                 neeContrib_pdf, t,
-                scene.lights[randLightID], intersection, interaction,
-                isBSDF, eps0, depth, scene.toNextEvent(randLightID)
+                scene, randLightID, intersection, interaction,
+                isBSDF, eps0, depth
             );
 
             // We don't allow non watertight transmitters in this renderer
@@ -198,36 +190,6 @@ struct Unidirectional
                     // example only uses isotropic bxdfs
                     params_type params = params_type::template create<sample_type, isotropic_type, isocache_type>(nee_sample, interaction.isotropic, _cache.iso_cache, _clamp);
 
-                    // TODO: does not yet account for smooth dielectric
-                    // if (!isBSDF && bxdf.materialType == ext::MaterialSystem::Material::DIFFUSE)
-                    // {
-                    //     params = params_type::template create<sample_type, isotropic_type>(nee_sample, interaction.isotropic, bxdf::BCM_MAX);
-                    // }
-                    // else if (!isBSDF && bxdf.materialType != ext::MaterialSystem::Material::DIFFUSE)
-                    // {
-                    //     if (bxdf.params.is_aniso)
-                    //         params = params_type::template create<sample_type, anisotropic_type, anisocache_type>(nee_sample, interaction, _cache, bxdf::BCM_MAX);
-                    //     else
-                    //     {
-                    //         isocache_type isocache = _cache.iso_cache;
-                    //         params = params_type::template create<sample_type, isotropic_type, isocache_type>(nee_sample, interaction.isotropic, _cache.iso_cache, bxdf::BCM_MAX);
-                    //     }
-                    // }
-                    // else if (isBSDF && bxdf.materialType == ext::MaterialSystem::Material::DIFFUSE)
-                    // {
-                    //     params = params_type::template create<sample_type, isotropic_type>(nee_sample, interaction.isotropic, bxdf::BCM_ABS);
-                    // }
-                    // else if (isBSDF && bxdf.materialType != ext::MaterialSystem::Material::DIFFUSE)
-                    // {
-                    //     if (bxdf.params.is_aniso)
-                    //         params = params_type::template create<sample_type, anisotropic_type, anisocache_type>(nee_sample, interaction, _cache, bxdf::BCM_ABS);
-                    //     else
-                    //     {
-                    //         isocache_type isocache = _cache.iso_cache;
-                    //         params = params_type::template create<sample_type, isotropic_type, isocache_type>(nee_sample, interaction.isotropic, _cache.iso_cache, bxdf::BCM_ABS);
-                    //     }
-                    // }
-
                     quotient_pdf_type bsdf_quotient_pdf = materialSystem.quotient_and_pdf(bxdf.materialType, bxdf.params, params);
                     neeContrib_pdf.quotient *= bxdf.albedo * throughput * bsdf_quotient_pdf.quotient;
                     const scalar_type otherGenOverChoice = bsdf_quotient_pdf.pdf * rcpChoiceProb;
@@ -261,37 +223,6 @@ struct Unidirectional
             // example only uses isotropic bxdfs
             params_type params = params_type::template create<sample_type, isotropic_type, isocache_type>(bsdf_sample, interaction.isotropic, _cache.iso_cache, _clamp);
 
-            // TODO: does not yet account for smooth dielectric
-            // params_type params;
-            // if (!isBSDF && bxdf.materialType == ext::MaterialSystem::Material::DIFFUSE)
-            // {
-            //     params = params_type::template create<sample_type, isotropic_type>(bsdf_sample, iso_interaction, bxdf::BCM_MAX);
-            // }
-            // else if (!isBSDF && bxdf.materialType != ext::MaterialSystem::Material::DIFFUSE)
-            // {
-            //     if (bxdf.params.is_aniso)
-            //         params = params_type::template create<sample_type, anisotropic_type, anisocache_type>(bsdf_sample, interaction, _cache, bxdf::BCM_MAX);
-            //     else
-            //     {
-            //         isocache_type isocache = _cache.iso_cache;
-            //         params = params_type::template create<sample_type, isotropic_type, isocache_type>(bsdf_sample, iso_interaction, isocache, bxdf::BCM_MAX);
-            //     }
-            // }
-            // else if (isBSDF && bxdf.materialType == ext::MaterialSystem::Material::DIFFUSE)
-            // {
-            //     params = params_type::template create<sample_type, isotropic_type>(bsdf_sample, iso_interaction, bxdf::BCM_ABS);
-            // }
-            // else if (isBSDF && bxdf.materialType != ext::MaterialSystem::Material::DIFFUSE)
-            // {
-            //     if (bxdf.params.is_aniso)
-            //         params = params_type::template create<sample_type, anisotropic_type, anisocache_type>(bsdf_sample, interaction, _cache, bxdf::BCM_ABS);
-            //     else
-            //     {
-            //         isocache_type isocache = _cache.iso_cache;
-            //         params = params_type::template create<sample_type, isotropic_type, isocache_type>(bsdf_sample, iso_interaction, isocache, bxdf::BCM_ABS);
-            //     }
-            // }
-
             // the value of the bsdf divided by the probability of the sample being generated
             quotient_pdf_type bsdf_quotient_pdf = materialSystem.quotient_and_pdf(bxdf.materialType, bxdf.params, params);
             throughput *= bxdf.albedo * bsdf_quotient_pdf.quotient;
diff --git a/31_HLSLPathTracer/app_resources/hlsl/render.comp.hlsl b/31_HLSLPathTracer/app_resources/hlsl/render.comp.hlsl
index b54f5721d..5e8102f6f 100644
--- a/31_HLSLPathTracer/app_resources/hlsl/render.comp.hlsl
+++ b/31_HLSLPathTracer/app_resources/hlsl/render.comp.hlsl
@@ -13,24 +13,18 @@
 
 #ifdef SPHERE_LIGHT
 #define SPHERE_COUNT 9
-#define LIGHT_TYPE ext::PST_SPHERE
-
 #define TRIANGLE_COUNT 0
 #define RECTANGLE_COUNT 0
 #endif
 
 #ifdef TRIANGLE_LIGHT
 #define TRIANGLE_COUNT 1
-#define LIGHT_TYPE ext::PST_TRIANGLE
-
 #define SPHERE_COUNT 8
 #define RECTANGLE_COUNT 0
 #endif
 
 #ifdef RECTANGLE_LIGHT
 #define RECTANGLE_COUNT 1
-#define LIGHT_TYPE ext::PST_RECTANGLE
-
 #define SPHERE_COUNT 8
 #define TRIANGLE_COUNT 0
 #endif
@@ -47,6 +41,18 @@ NBL_CONSTEXPR uint32_t WorkgroupSize = 256;
 NBL_CONSTEXPR uint32_t MAX_DEPTH_LOG2 = 4;
 NBL_CONSTEXPR uint32_t MAX_SAMPLES_LOG2 = 10;
 
+#ifdef SPHERE_LIGHT
+NBL_CONSTEXPR ext::ProceduralShapeType LIGHT_TYPE = ext::PST_SPHERE;
+#endif
+#ifdef TRIANGLE_LIGHT
+NBL_CONSTEXPR ext::ProceduralShapeType LIGHT_TYPE = ext::PST_TRIANGLE;
+#endif
+#ifdef RECTANGLE_LIGHT
+NBL_CONSTEXPR ext::ProceduralShapeType LIGHT_TYPE = ext::PST_RECTANGLE;
+#endif
+
+NBL_CONSTEXPR ext::PTPolygonMethod POLYGON_METHOD = ext::PPM_SOLID_ANGLE;
+
 int32_t2 getCoordinates()
 {
     uint32_t width, height;
@@ -80,11 +86,12 @@ using dielectric_bxdf_type = bxdf::transmission::SGGXDielectricBxDF<sample_t, is
 using ray_type = ext::Ray<float>;
 using light_type = ext::Light<spectral_t>;
 using bxdfnode_type = ext::BxDFNode<spectral_t>;
+using scene_type = ext::Scene<light_type, bxdfnode_type>;
 using randgen_type = ext::RandGen::Uniform3D<Xoroshiro64Star>;
 using raygen_type = ext::RayGen::Basic<ray_type>;
 using intersector_type = ext::Intersector::Comprehensive<ray_type, light_type, bxdfnode_type>;
 using material_system_type = ext::MaterialSystem::System<diffuse_bxdf_type, conductor_bxdf_type, dielectric_bxdf_type>;
-using nee_type = ext::NextEventEstimator::Estimator<light_type, ray_type, sample_t, aniso_interaction>;
+using nee_type = ext::NextEventEstimator::Estimator<scene_type, ray_type, sample_t, aniso_interaction, ext::IntersectMode::IM_PROCEDURAL, LIGHT_TYPE, POLYGON_METHOD>;
 using pathtracer_type = ext::PathTracer::Unidirectional<randgen_type, raygen_type, intersector_type, material_system_type, nee_type>;
 
 static const ext::Shape<ext::PST_SPHERE> spheres[SPHERE_COUNT] = {
@@ -164,7 +171,6 @@ void main(uint32_t3 threadID : SV_DispatchThreadID)
     }
 
     int flatIdx = glsl::gl_GlobalInvocationID().y * glsl::gl_NumWorkGroups().x * WorkgroupSize + glsl::gl_GlobalInvocationID().x;
-    PCG32x2 pcg = PCG32x2::construct(flatIdx);  // replaces scramblebuf?
 
     // set up path tracer
     ext::PathTracer::PathTracerCreationParams<create_params_t, float> ptCreateParams;
diff --git a/31_HLSLPathTracer/app_resources/hlsl/scene.hlsl b/31_HLSLPathTracer/app_resources/hlsl/scene.hlsl
index 887d20c48..40fb01057 100644
--- a/31_HLSLPathTracer/app_resources/hlsl/scene.hlsl
+++ b/31_HLSLPathTracer/app_resources/hlsl/scene.hlsl
@@ -87,145 +87,6 @@ struct Scene
 #undef SCENE_TRIANGLE_COUNT
 #undef SCENE_RECTANGLE_COUNT
 
-    // obsolete?
-    // Intersector::IntersectData toIntersectData(uint32_t mode, ProceduralShapeType type)
-    // {
-    //     Intersector::IntersectData retval;
-    //     retval.mode = mode;
-
-    //     uint32_t objCount = (type == PST_SPHERE) ? sphereCount :
-    //                         (type == PST_TRIANGLE) ? triangleCount :
-    //                         (type == PST_RECTANGLE) ? rectangleCount :
-    //                         -1;
-    //     retval.data[0] = objCount;
-    //     retval.data[1] = type;
-
-    //     switch (type)
-    //     {
-    //         case PST_SPHERE:
-    //         {
-    //             for (int i = 0; i < objCount; i++)
-    //             {
-    //                 Shape<PST_SPHERE> sphere = spheres[i];
-    //                 uint32_t3 uintPos = bit_cast<uint32_t3, float32_t3>(sphere.position);
-    //                 retval.data[2 + i * Shape<PST_SPHERE>::ObjSize] = uintPos.x;
-    //                 retval.data[2 + i * Shape<PST_SPHERE>::ObjSize + 1] = uintPos.y;
-    //                 retval.data[2 + i * Shape<PST_SPHERE>::ObjSize + 2] = uintPos.z;
-    //                 retval.data[2 + i * Shape<PST_SPHERE>::ObjSize + 3] = bit_cast<uint32_t, float32_t>(sphere.radius2);
-    //                 retval.data[2 + i * Shape<PST_SPHERE>::ObjSize + 4] = sphere.bsdfLightIDs;
-    //             }
-    //         }
-    //         break;
-    //         case PST_TRIANGLE:
-    //         {
-    //             for (int i = 0; i < objCount; i++)
-    //             {
-    //                 Shape<PST_TRIANGLE> tri = triangles[i];
-    //                 retval.data[2 + i * Shape<PST_TRIANGLE>::ObjSize] = asuint(tri.vertex0.x);
-    //                 retval.data[2 + i * Shape<PST_TRIANGLE>::ObjSize + 1] = asuint(tri.vertex0.y);
-    //                 retval.data[2 + i * Shape<PST_TRIANGLE>::ObjSize + 2] = asuint(tri.vertex0.z);
-    //                 retval.data[2 + i * Shape<PST_TRIANGLE>::ObjSize + 3] = asuint(tri.vertex1.x);
-    //                 retval.data[2 + i * Shape<PST_TRIANGLE>::ObjSize + 4] = asuint(tri.vertex1.y);
-    //                 retval.data[2 + i * Shape<PST_TRIANGLE>::ObjSize + 5] = asuint(tri.vertex1.z);
-    //                 retval.data[2 + i * Shape<PST_TRIANGLE>::ObjSize + 6] = asuint(tri.vertex2.x);
-    //                 retval.data[2 + i * Shape<PST_TRIANGLE>::ObjSize + 7] = asuint(tri.vertex2.y);
-    //                 retval.data[2 + i * Shape<PST_TRIANGLE>::ObjSize + 8] = asuint(tri.vertex2.z);
-    //                 retval.data[2 + i * Shape<PST_TRIANGLE>::ObjSize + 9] = tri.bsdfLightIDs;
-    //             }
-    //         }
-    //         break;
-    //         case PST_RECTANGLE:
-    //         {
-    //             for (int i = 0; i < objCount; i++)
-    //             {
-    //                 Shape<PST_RECTANGLE> rect = rectangles[i];
-    //                 retval.data[2 + i * Shape<PST_RECTANGLE>::ObjSize] = asuint(rect.offset.x);
-    //                 retval.data[2 + i * Shape<PST_RECTANGLE>::ObjSize + 1] = asuint(rect.offset.y);
-    //                 retval.data[2 + i * Shape<PST_RECTANGLE>::ObjSize + 2] = asuint(rect.offset.z);
-    //                 retval.data[2 + i * Shape<PST_RECTANGLE>::ObjSize + 3] = asuint(rect.edge0.x);
-    //                 retval.data[2 + i * Shape<PST_RECTANGLE>::ObjSize + 4] = asuint(rect.edge0.y);
-    //                 retval.data[2 + i * Shape<PST_RECTANGLE>::ObjSize + 5] = asuint(rect.edge0.z);
-    //                 retval.data[2 + i * Shape<PST_RECTANGLE>::ObjSize + 6] = asuint(rect.edge1.x);
-    //                 retval.data[2 + i * Shape<PST_RECTANGLE>::ObjSize + 7] = asuint(rect.edge1.y);
-    //                 retval.data[2 + i * Shape<PST_RECTANGLE>::ObjSize + 8] = asuint(rect.edge1.z);
-    //                 retval.data[2 + i * Shape<PST_RECTANGLE>::ObjSize + 9] = rect.bsdfLightIDs;
-    //             }
-    //         }
-    //         break;
-    //         default:
-    //             // for ASes
-    //             break;
-    //     }
-    //     return retval;
-    // }
-
-    NextEventEstimator::Event toNextEvent(uint32_t lightID)
-    {
-        NextEventEstimator::Event retval;
-
-        ObjectID objectID = lights[lightID].objectID;
-        retval.mode = objectID.mode;
-
-        retval.data[0] = lightCount;
-        retval.data[1] = objectID.shapeType;
-
-        uint32_t id = objectID.id;
-        switch (objectID.shapeType)
-        {
-            case PST_SPHERE:
-            {
-                Shape<PST_SPHERE> sphere = spheres[id];
-                uint32_t3 position = bit_cast<uint32_t3>(sphere.position);
-                retval.data[2] = position.x;
-                retval.data[3] = position.y;
-                retval.data[4] = position.z;
-                retval.data[5] = bit_cast<uint32_t>(sphere.radius2);
-                retval.data[6] = sphere.bsdfLightIDs;
-            }
-            break;
-            case PST_TRIANGLE:
-            {
-                Shape<PST_TRIANGLE> tri = triangles[id];
-                uint32_t3 vertex = bit_cast<uint32_t3>(tri.vertex0);
-                retval.data[2] = vertex.x;
-                retval.data[3] = vertex.y;
-                retval.data[4] = vertex.z;
-                vertex = bit_cast<uint32_t3>(tri.vertex1);
-                retval.data[5] = vertex.x;
-                retval.data[6] = vertex.y;
-                retval.data[7] = vertex.z;
-                vertex = bit_cast<uint32_t3>(tri.vertex2);
-                retval.data[8] = vertex.x;
-                retval.data[9] = vertex.y;
-                retval.data[10] = vertex.z;
-                retval.data[11] = tri.bsdfLightIDs;
-            }
-            break;
-            case PST_RECTANGLE:
-            {
-                Shape<PST_RECTANGLE> rect = rectangles[id];
-                uint32_t3 tmp = bit_cast<uint32_t3>(rect.offset);
-                retval.data[2] = tmp.x;
-                retval.data[3] = tmp.y;
-                retval.data[4] = tmp.z;
-                tmp = bit_cast<uint32_t3>(rect.edge0);
-                retval.data[5] = tmp.x;
-                retval.data[6] = tmp.y;
-                retval.data[7] = tmp.z;
-                tmp = bit_cast<uint32_t3>(rect.edge1);
-                retval.data[8] = tmp.x;
-                retval.data[9] = tmp.y;
-                retval.data[10] = tmp.z;
-                retval.data[11] = rect.bsdfLightIDs;
-            }
-            break;
-            default:
-                // for ASes
-                break;
-        }
-        return retval;
-    }
-
     // TODO: get these to work with AS types as well
     uint32_t getBsdfLightIDs(NBL_CONST_REF_ARG(ObjectID) objectID)
     {
diff --git a/31_HLSLPathTracer/main.cpp b/31_HLSLPathTracer/main.cpp
index 10889f37f..ae9f162a4 100644
--- a/31_HLSLPathTracer/main.cpp
+++ b/31_HLSLPathTracer/main.cpp
@@ -48,7 +48,7 @@ class HLSLComputePathtracer final : public examples::SimpleWindowedApplication,
 		constexpr static inline uint32_t2 WindowDimensions = { 1280, 720 };
 		constexpr static inline uint32_t MaxFramesInFlight = 5;
 		constexpr static inline clock_t::duration DisplayImageDuration = std::chrono::milliseconds(900);
-		constexpr static inline uint32_t DefaultWorkGroupSize = 16u;
+		constexpr static inline uint32_t DefaultWorkGroupSize = 256u;
 		constexpr static inline uint32_t MaxDescriptorCount = 256u;
 		constexpr static inline uint32_t MaxDepthLog2 = 4u; // 5
 		constexpr static inline uint32_t MaxSamplesLog2 = 10u; // 18
@@ -1068,10 +1068,7 @@ class HLSLComputePathtracer final : public examples::SimpleWindowedApplication,
 					cmdbuf->bindDescriptorSets(EPBP_COMPUTE, pipeline->getLayout(), 0u, 1u, &m_descriptorSet0.get());
 					cmdbuf->bindDescriptorSets(EPBP_COMPUTE, pipeline->getLayout(), 2u, 1u, &m_descriptorSet2.get());
 					cmdbuf->pushConstants(pipeline->getLayout(), IShader::E_SHADER_STAGE::ESS_COMPUTE, 0, sizeof(PTPushConstant), &pc);
-					if (renderMode == E_RENDER_MODE::ERM_HLSL)
-						cmdbuf->dispatch(1 + (WindowDimensions.x * WindowDimensions.y - 1) / 256u, 1u, 1u);
-					else
-						cmdbuf->dispatch(1 + (WindowDimensions.x - 1) / DefaultWorkGroupSize, 1 + (WindowDimensions.y - 1) / DefaultWorkGroupSize, 1u);
+					cmdbuf->dispatch(1 + (WindowDimensions.x * WindowDimensions.y - 1) / DefaultWorkGroupSize, 1u, 1u);
 				}
 
 				// TRANSITION m_outImgView to READ (because of descriptorSets0 -> ComputeShader Writes into the image)
@@ -1351,7 +1348,7 @@ class HLSLComputePathtracer final : public examples::SimpleWindowedApplication,
 		float camYAngle = 165.f / 180.f * 3.14159f;
 		float camXAngle = 32.f / 180.f * 3.14159f;
 		int PTPipline = E_LIGHT_GEOMETRY::ELG_SPHERE;
-		int renderMode = E_RENDER_MODE::ERM_HLSL;
+		int renderMode = E_RENDER_MODE::ERM_GLSL;
 		int spp = 32;
 		int depth = 3;
 

From b889b60e4db77dbc435ea6c5baefbcba0089e01c Mon Sep 17 00:00:00 2001
From: keptsecret <sorchon@gmail.com>
Date: Wed, 19 Mar 2025 15:24:14 +0700
Subject: [PATCH 094/529] use 1D workgroup dispatch

---
 31_HLSLPathTracer/app_resources/glsl/common.glsl | 7 ++++---
 1 file changed, 4 insertions(+), 3 deletions(-)

diff --git a/31_HLSLPathTracer/app_resources/glsl/common.glsl b/31_HLSLPathTracer/app_resources/glsl/common.glsl
index 2463f82cf..b09c90824 100644
--- a/31_HLSLPathTracer/app_resources/glsl/common.glsl
+++ b/31_HLSLPathTracer/app_resources/glsl/common.glsl
@@ -16,12 +16,13 @@ layout(set = 2, binding = 2) uniform usampler2D scramblebuf;
 layout(set=0, binding=0, rgba16f) uniform image2D outImage;
 
 #ifndef _NBL_GLSL_WORKGROUP_SIZE_
-#define _NBL_GLSL_WORKGROUP_SIZE_ 32
-layout(local_size_x=_NBL_GLSL_WORKGROUP_SIZE_, local_size_y=_NBL_GLSL_WORKGROUP_SIZE_, local_size_z=1) in;
+#define _NBL_GLSL_WORKGROUP_SIZE_ 256
+layout(local_size_x=_NBL_GLSL_WORKGROUP_SIZE_, local_size_y=1, local_size_z=1) in;
 #endif
 
 ivec2 getCoordinates() {
-    return ivec2(gl_GlobalInvocationID.xy);
+    ivec2 imageSize = imageSize(outImage);
+    return ivec2(gl_GlobalInvocationID.x % imageSize.x, gl_GlobalInvocationID.x / imageSize.x);
 }
 
 vec2 getTexCoords() {

From 79ee9da780900a7977d630ef156128f7287d2222 Mon Sep 17 00:00:00 2001
From: keptsecret <sorchon@gmail.com>
Date: Wed, 19 Mar 2025 15:24:57 +0700
Subject: [PATCH 095/529] removed obsolete commented sections

---
 .../app_resources/hlsl/intersector.hlsl       | 118 ------------------
 1 file changed, 118 deletions(-)

diff --git a/31_HLSLPathTracer/app_resources/hlsl/intersector.hlsl b/31_HLSLPathTracer/app_resources/hlsl/intersector.hlsl
index 03a45f866..e59fdc2c3 100644
--- a/31_HLSLPathTracer/app_resources/hlsl/intersector.hlsl
+++ b/31_HLSLPathTracer/app_resources/hlsl/intersector.hlsl
@@ -78,124 +78,6 @@ struct Comprehensive
 
         return objectID;
     }
-
-    // note for future consideration: still need to encode to IntersectData?
-    // obsolete?
-    // static ObjectID traceProcedural(NBL_REF_ARG(ray_type) ray, NBL_CONST_REF_ARG(IntersectData) intersect)
-    // {
-    //     const bool anyHit = ray.intersectionT != numeric_limits<scalar_type>::max;
-    //     const uint32_t objCount = intersect.data[0];
-    //     const ProceduralShapeType type = (ProceduralShapeType)intersect.data[1];
-
-    //     ObjectID objectID = ray.objectID;
-    //     objectID.mode = IM_PROCEDURAL;
-    //     objectID.shapeType = type;
-    //     for (int i = 0; i < objCount; i++)
-    //     {
-    //         float t;
-    //         switch (type)
-    //         {
-    //             case PST_SPHERE:
-    //             {
-    //                 vector3_type position = vector3_type(asfloat(intersect.data[2 + i * Shape<PST_SPHERE>::ObjSize]), asfloat(intersect.data[2 + i * Shape<PST_SPHERE>::ObjSize + 1]), asfloat(intersect.data[2 + i * Shape<PST_SPHERE>::ObjSize + 2]));
-    //                 Shape<PST_SPHERE> sphere = Shape<PST_SPHERE>::create(position, asfloat(intersect.data[2 + i * Shape<PST_SPHERE>::ObjSize + 3]), intersect.data[2 + i * Shape<PST_SPHERE>::ObjSize + 4]);
-    //                 t = sphere.intersect(ray.origin, ray.direction);
-    //             }
-    //             break;
-    //             case PST_TRIANGLE:
-    //             {
-    //                 vector3_type vertex0 = vector3_type(asfloat(intersect.data[2 + i * Shape<PST_TRIANGLE>::ObjSize]), asfloat(intersect.data[2 + i * Shape<PST_SPHERE>::ObjSize + 1]), asfloat(intersect.data[2 + i * Shape<PST_TRIANGLE>::ObjSize + 2]));
-    //                 vector3_type vertex1 = vector3_type(asfloat(intersect.data[2 + i * Shape<PST_TRIANGLE>::ObjSize + 3]), asfloat(intersect.data[2 + i * Shape<PST_SPHERE>::ObjSize + 4]), asfloat(intersect.data[2 + i * Shape<PST_TRIANGLE>::ObjSize + 5]));
-    //                 vector3_type vertex2 = vector3_type(asfloat(intersect.data[2 + i * Shape<PST_TRIANGLE>::ObjSize + 6]), asfloat(intersect.data[2 + i * Shape<PST_SPHERE>::ObjSize + 7]), asfloat(intersect.data[2 + i * Shape<PST_TRIANGLE>::ObjSize + 8]));
-    //                 Shape<PST_TRIANGLE> tri = Shape<PST_TRIANGLE>::create(vertex0, vertex1, vertex2, intersect.data[2 + i * Shape<PST_TRIANGLE>::ObjSize + 9]);
-    //                 t = tri.intersect(ray.origin, ray.direction);
-    //             }
-    //             break;
-    //             case PST_RECTANGLE:
-    //             {
-    //                 vector3_type offset = vector3_type(asfloat(intersect.data[2 + i * Shape<PST_RECTANGLE>::ObjSize]), asfloat(intersect.data[2 + i * Shape<PST_RECTANGLE>::ObjSize + 1]), asfloat(intersect.data[2 + i * Shape<PST_RECTANGLE>::ObjSize + 2]));
-    //                 vector3_type edge0 = vector3_type(asfloat(intersect.data[2 + i * Shape<PST_RECTANGLE>::ObjSize + 3]), asfloat(intersect.data[2 + i * Shape<PST_RECTANGLE>::ObjSize + 4]), asfloat(intersect.data[2 + i * Shape<PST_RECTANGLE>::ObjSize + 5]));
-    //                 vector3_type edge1 = vector3_type(asfloat(intersect.data[2 + i * Shape<PST_RECTANGLE>::ObjSize + 6]), asfloat(intersect.data[2 + i * Shape<PST_RECTANGLE>::ObjSize + 7]), asfloat(intersect.data[2 + i * Shape<PST_RECTANGLE>::ObjSize + 8]));
-    //                 Shape<PST_RECTANGLE> rect = Shape<PST_RECTANGLE>::create(offset, edge0, edge1, intersect.data[2 + i * Shape<PST_RECTANGLE>::ObjSize + 9]);
-    //                 t = rect.intersect(ray.origin, ray.direction);
-    //             }
-    //             break;
-    //             default:
-    //                 t = numeric_limits<float>::infinity;
-    //             break;
-    //         }
-
-    //         bool closerIntersection = t > 0.0 && t < ray.intersectionT;
-
-    //         ray.intersectionT = closerIntersection ? t : ray.intersectionT;
-    //         objectID.id = closerIntersection ? i : objectID.id;
-
-    //         // allowing early out results in a performance regression, WTF!?
-    //         //if (anyHit && closerIntersection)
-    //         //break;
-    //     }
-    //     return objectID;
-    // }
-
-    // obsolete?
-    // static ObjectID traceRay(NBL_REF_ARG(ray_type) ray, NBL_CONST_REF_ARG(IntersectData) intersect)
-    // {
-    //     const uint32_t mode = intersect.mode;
-    //     switch (mode)
-    //     {
-    //         case IM_RAY_QUERY:
-    //         {
-    //             // TODO: do ray query stuff
-    //         }
-    //         break;
-    //         case IM_RAY_TRACING:
-    //         {
-    //             // TODO: do ray tracing stuff
-    //         }
-    //         break;
-    //         case IM_PROCEDURAL:
-    //         {
-    //             return traceProcedural(ray, intersect);
-    //         }
-    //         break;
-    //         default:
-    //         {
-    //             return ObjectID::create(-1, 0, PST_SPHERE);
-    //         }
-    //     }
-    //     return ObjectID::create(-1, 0, PST_SPHERE);
-    // }
-
-    // static ObjectID traceRay(NBL_REF_ARG(ray_type) ray, NBL_CONST_REF_ARG(scene_type) scene)
-    // {
-    //     IntersectData data;
-
-    //     ObjectID objectID;
-    //     objectID.id = -1;  // start with no intersect
-
-    //     // prodedural shapes
-    //     if (scene.sphereCount > 0)
-    //     {
-    //         data = scene.toIntersectData(ext::Intersector::IntersectData::Mode::PROCEDURAL, PST_SPHERE);
-    //         objectID = traceRay(ray, data);
-    //     }
-
-    //     if (scene.triangleCount > 0)
-    //     {
-    //         data = scene.toIntersectData(ext::Intersector::IntersectData::Mode::PROCEDURAL, PST_TRIANGLE);
-    //         objectID = traceRay(ray, data);
-    //     }
-
-    //     if (scene.rectangleCount > 0)
-    //     {
-    //         data = scene.toIntersectData(ext::Intersector::IntersectData::Mode::PROCEDURAL, PST_RECTANGLE);
-    //         objectID = traceRay(ray, data);
-    //     }
-
-    //     // TODO: trace AS
-
-    //     return objectID;
-    // }
 };
 
 }

From ca8f2ec8fa84a2bd1bfeb4348263f82d14026bca Mon Sep 17 00:00:00 2001
From: keptsecret <sorchon@gmail.com>
Date: Wed, 19 Mar 2025 16:01:15 +0700
Subject: [PATCH 096/529] some minor corrections

---
 .../app_resources/hlsl/common.hlsl            |  6 +---
 .../hlsl/next_event_estimator.hlsl            | 28 +++++++++++--------
 .../app_resources/hlsl/pathtracer.hlsl        |  9 +++---
 31_HLSLPathTracer/main.cpp                    |  2 +-
 4 files changed, 23 insertions(+), 22 deletions(-)

diff --git a/31_HLSLPathTracer/app_resources/hlsl/common.hlsl b/31_HLSLPathTracer/app_resources/hlsl/common.hlsl
index dea682c8b..2e2561345 100644
--- a/31_HLSLPathTracer/app_resources/hlsl/common.hlsl
+++ b/31_HLSLPathTracer/app_resources/hlsl/common.hlsl
@@ -68,7 +68,7 @@ struct Ray
     vector3_type origin;
     vector3_type direction;
 
-    // TODO: polygon method == 2 stuff
+    // polygon method == PPM_APPROX_PROJECTED_SOLID_ANGLE
     vector3_type normalAtOrigin;
     bool wasBSDFAtOrigin;
 
@@ -246,7 +246,6 @@ struct Shape<PST_TRIANGLE>
         retval.vertex1 = vertex1;
         retval.vertex2 = vertex2;
         retval.bsdfLightIDs = bsdfLightIDs;
-        retval.polygonMethod = PPM_SOLID_ANGLE;
         return retval;
     }
 
@@ -288,7 +287,6 @@ struct Shape<PST_TRIANGLE>
     float32_t3 vertex1;
     float32_t3 vertex2;
     uint32_t bsdfLightIDs;
-    PTPolygonMethod polygonMethod;
 };
 
 template<>
@@ -301,7 +299,6 @@ struct Shape<PST_RECTANGLE>
         retval.edge0 = edge0;
         retval.edge1 = edge1;
         retval.bsdfLightIDs = bsdfLightIDs;
-        retval.polygonMethod = PPM_SOLID_ANGLE;
         return retval;
     }
 
@@ -348,7 +345,6 @@ struct Shape<PST_RECTANGLE>
     float32_t3 edge0;
     float32_t3 edge1;
     uint32_t bsdfLightIDs;
-    PTPolygonMethod polygonMethod;
 };
 
 }
diff --git a/31_HLSLPathTracer/app_resources/hlsl/next_event_estimator.hlsl b/31_HLSLPathTracer/app_resources/hlsl/next_event_estimator.hlsl
index 7c157aadf..51c018ac5 100644
--- a/31_HLSLPathTracer/app_resources/hlsl/next_event_estimator.hlsl
+++ b/31_HLSLPathTracer/app_resources/hlsl/next_event_estimator.hlsl
@@ -298,6 +298,10 @@ struct Estimator<Scene, Ray, LightSample, Aniso, IM_PROCEDURAL, PST_SPHERE, PPM>
     using sample_type = LightSample;
     using ray_dir_info_type = typename sample_type::ray_dir_info_type;
 
+    // affected by https://github.com/microsoft/DirectXShaderCompiler/issues/7007
+    // NBL_CONSTEXPR_STATIC_INLINE PTPolygonMethod PolygonMethod = PPM;
+    enum : uint16_t { PolygonMethod = PPM };
+
     static spectral_type deferredEvalAndPdf(NBL_REF_ARG(scalar_type) pdf, NBL_CONST_REF_ARG(scene_type) scene, uint32_t lightID, NBL_CONST_REF_ARG(ray_type) ray)
     {
         pdf = 1.0 / scene.lightCount;
@@ -311,19 +315,17 @@ struct Estimator<Scene, Ray, LightSample, Aniso, IM_PROCEDURAL, PST_SPHERE, PPM>
 
     static sample_type generate_and_quotient_and_pdf(NBL_REF_ARG(quotient_pdf_type) quotient_pdf, NBL_REF_ARG(scalar_type) newRayMaxT, NBL_CONST_REF_ARG(scene_type) scene, uint32_t lightID, NBL_CONST_REF_ARG(vector3_type) origin, NBL_CONST_REF_ARG(interaction_type) interaction, bool isBSDF, NBL_CONST_REF_ARG(vector3_type) xi, uint32_t depth)
     {
-        sample_type L;
-        scalar_type pdf;
-
         const light_type light = scene.lights[lightID];
         const Shape<PST_SPHERE> sphere = scene.spheres[light.objectID.id];
         const ShapeSampling<PST_SPHERE, PPM> sampling = ShapeSampling<PST_SPHERE, PPM>::create(sphere);
 
+        scalar_type pdf;
         const vector3_type sampleL = sampling.template generate_and_pdf<interaction_type>(pdf, newRayMaxT, origin, interaction, isBSDF, xi);
         const vector3_type V = interaction.isotropic.V.getDirection();
         const scalar_type VdotL = nbl::hlsl::dot<vector3_type>(V, sampleL);
         ray_dir_info_type rayL;
         rayL.direction = sampleL;
-        L = sample_type::create(rayL,VdotL,interaction.T,interaction.B,interaction.isotropic.N);
+        sample_type L = sample_type::create(rayL,VdotL,interaction.T,interaction.B,interaction.isotropic.N);
 
         newRayMaxT *= Tolerance<scalar_type>::getEnd(depth);
         pdf *= 1.0 / scalar_type(scene.lightCount);
@@ -348,6 +350,9 @@ struct Estimator<Scene, Ray, LightSample, Aniso, IM_PROCEDURAL, PST_TRIANGLE, PP
     using sample_type = LightSample;
     using ray_dir_info_type = typename sample_type::ray_dir_info_type;
 
+    // NBL_CONSTEXPR_STATIC_INLINE PTPolygonMethod PolygonMethod = PPM;
+    enum : uint16_t { PolygonMethod = PPM };
+
     static spectral_type deferredEvalAndPdf(NBL_REF_ARG(scalar_type) pdf, NBL_CONST_REF_ARG(scene_type) scene, uint32_t lightID, NBL_CONST_REF_ARG(ray_type) ray)
     {
         pdf = 1.0 / scene.lightCount;
@@ -361,19 +366,17 @@ struct Estimator<Scene, Ray, LightSample, Aniso, IM_PROCEDURAL, PST_TRIANGLE, PP
 
     static sample_type generate_and_quotient_and_pdf(NBL_REF_ARG(quotient_pdf_type) quotient_pdf, NBL_REF_ARG(scalar_type) newRayMaxT, NBL_CONST_REF_ARG(scene_type) scene, uint32_t lightID, NBL_CONST_REF_ARG(vector3_type) origin, NBL_CONST_REF_ARG(interaction_type) interaction, bool isBSDF, NBL_CONST_REF_ARG(vector3_type) xi, uint32_t depth)
     {
-        sample_type L;
-        scalar_type pdf;
-
         const light_type light = scene.lights[lightID];
         const Shape<PST_TRIANGLE> tri = scene.triangles[light.objectID.id];
         const ShapeSampling<PST_TRIANGLE, PPM> sampling = ShapeSampling<PST_TRIANGLE, PPM>::create(tri);
 
+        scalar_type pdf;
         const vector3_type sampleL = sampling.template generate_and_pdf<interaction_type>(pdf, newRayMaxT, origin, interaction, isBSDF, xi);
         const vector3_type V = interaction.isotropic.V.getDirection();
         const scalar_type VdotL = nbl::hlsl::dot<vector3_type>(V, sampleL);
         ray_dir_info_type rayL;
         rayL.direction = sampleL;
-        L = sample_type::create(rayL,VdotL,interaction.T,interaction.B,interaction.isotropic.N);
+        sample_type L = sample_type::create(rayL,VdotL,interaction.T,interaction.B,interaction.isotropic.N);
 
         newRayMaxT *= Tolerance<scalar_type>::getEnd(depth);
         pdf *= 1.0 / scalar_type(scene.lightCount);
@@ -398,6 +401,9 @@ struct Estimator<Scene, Ray, LightSample, Aniso, IM_PROCEDURAL, PST_RECTANGLE, P
     using sample_type = LightSample;
     using ray_dir_info_type = typename sample_type::ray_dir_info_type;
 
+    // NBL_CONSTEXPR_STATIC_INLINE PTPolygonMethod PolygonMethod = PPM;
+    enum : uint16_t { PolygonMethod = PPM };
+
     static spectral_type deferredEvalAndPdf(NBL_REF_ARG(scalar_type) pdf, NBL_CONST_REF_ARG(scene_type) scene, uint32_t lightID, NBL_CONST_REF_ARG(ray_type) ray)
     {
         pdf = 1.0 / scene.lightCount;
@@ -411,19 +417,17 @@ struct Estimator<Scene, Ray, LightSample, Aniso, IM_PROCEDURAL, PST_RECTANGLE, P
 
     static sample_type generate_and_quotient_and_pdf(NBL_REF_ARG(quotient_pdf_type) quotient_pdf, NBL_REF_ARG(scalar_type) newRayMaxT, NBL_CONST_REF_ARG(scene_type) scene, uint32_t lightID, NBL_CONST_REF_ARG(vector3_type) origin, NBL_CONST_REF_ARG(interaction_type) interaction, bool isBSDF, NBL_CONST_REF_ARG(vector3_type) xi, uint32_t depth)
     {
-        sample_type L;
-        scalar_type pdf;
-
         const light_type light = scene.lights[lightID];
         const Shape<PST_RECTANGLE> rect = scene.rectangles[light.objectID.id];
         const ShapeSampling<PST_RECTANGLE, PPM> sampling = ShapeSampling<PST_RECTANGLE, PPM>::create(rect);
 
+        scalar_type pdf;
         const vector3_type sampleL = sampling.template generate_and_pdf<interaction_type>(pdf, newRayMaxT, origin, interaction, isBSDF, xi);
         const vector3_type V = interaction.isotropic.V.getDirection();
         const scalar_type VdotL = nbl::hlsl::dot<vector3_type>(V, sampleL);
         ray_dir_info_type rayL;
         rayL.direction = sampleL;
-        L = sample_type::create(rayL,VdotL,interaction.T,interaction.B,interaction.isotropic.N);
+        sample_type L = sample_type::create(rayL,VdotL,interaction.T,interaction.B,interaction.isotropic.N);
 
         newRayMaxT *= Tolerance<scalar_type>::getEnd(depth);
         pdf *= 1.0 / scalar_type(scene.lightCount);
diff --git a/31_HLSLPathTracer/app_resources/hlsl/pathtracer.hlsl b/31_HLSLPathTracer/app_resources/hlsl/pathtracer.hlsl
index 3082e599e..f5d5206dc 100644
--- a/31_HLSLPathTracer/app_resources/hlsl/pathtracer.hlsl
+++ b/31_HLSLPathTracer/app_resources/hlsl/pathtracer.hlsl
@@ -241,10 +241,11 @@ struct Unidirectional
             // trace new ray
             ray.origin = intersection + bxdfSample * (1.0/*kSceneSize*/) * Tolerance<scalar_type>::getStart(depth);
             ray.direction = bxdfSample;
-            // #if POLYGON_METHOD==2
-            // ray._immutable.normalAtOrigin = interaction.isotropic.N;
-            // ray._immutable.wasBSDFAtOrigin = isBSDF;
-            // #endif
+            if ((PTPolygonMethod)nee_type::PolygonMethod == PPM_APPROX_PROJECTED_SOLID_ANGLE)
+            {
+                ray.normalAtOrigin = interaction.isotropic.N;
+                ray.wasBSDFAtOrigin = isBSDF;
+            }
             return true;
         }
 
diff --git a/31_HLSLPathTracer/main.cpp b/31_HLSLPathTracer/main.cpp
index ae9f162a4..e3e0b7d7a 100644
--- a/31_HLSLPathTracer/main.cpp
+++ b/31_HLSLPathTracer/main.cpp
@@ -1348,7 +1348,7 @@ class HLSLComputePathtracer final : public examples::SimpleWindowedApplication,
 		float camYAngle = 165.f / 180.f * 3.14159f;
 		float camXAngle = 32.f / 180.f * 3.14159f;
 		int PTPipline = E_LIGHT_GEOMETRY::ELG_SPHERE;
-		int renderMode = E_RENDER_MODE::ERM_GLSL;
+		int renderMode = E_RENDER_MODE::ERM_HLSL;
 		int spp = 32;
 		int depth = 3;
 

From 16951a3e6b9870760437f8531c298355dcd8545b Mon Sep 17 00:00:00 2001
From: Przemek <minikers21@gmail.com>
Date: Wed, 19 Mar 2025 11:42:12 +0100
Subject: [PATCH 097/529] Stippled outline

---
 62_CAD/DrawResourcesFiller.cpp                |  5 +++--
 62_CAD/main.cpp                               |  7 +++++--
 .../main_pipeline/fragment_shader.hlsl        | 20 ++++++++++++++-----
 .../shaders/main_pipeline/vertex_shader.hlsl  |  4 ++++
 4 files changed, 27 insertions(+), 9 deletions(-)

diff --git a/62_CAD/DrawResourcesFiller.cpp b/62_CAD/DrawResourcesFiller.cpp
index 44837e415..49c81f3ff 100644
--- a/62_CAD/DrawResourcesFiller.cpp
+++ b/62_CAD/DrawResourcesFiller.cpp
@@ -276,7 +276,7 @@ void DrawResourcesFiller::drawTriangleMesh(const CTriangleMesh& mesh, CTriangleM
 
 	uint32_t dtmSettingsIndex = addDTMSettings_SubmitIfNeeded(dtmSettingsInfo, intendedNextSubmit);
 
-	drawData.pushConstants.triangleMeshMainObjectIndex = addMainObject_SubmitIfNeeded(InvalidStyleIdx, InvalidDTMSettingsIdx, intendedNextSubmit);
+	drawData.pushConstants.triangleMeshMainObjectIndex = addMainObject_SubmitIfNeeded(InvalidStyleIdx, dtmSettingsIndex, intendedNextSubmit);
 
 	// TODO: use this function later for auto submit
 	//submitCurrentDrawObjectsAndReset(intendedNextSubmit, 0);
@@ -522,7 +522,8 @@ bool DrawResourcesFiller::finalizeLineStyleCopiesToGPU(SIntendedSubmitInfo& inte
 	SBufferRange<IGPUBuffer> stylesRange = { sizeof(LineStyle) * inMemLineStylesCount, sizeof(LineStyle) * remainingLineStyles, gpuDrawBuffers.lineStylesBuffer };
 	if (stylesRange.size > 0u)
 	{
-		const LineStyle* srcLineStylesData = reinterpret_cast<LineStyle*>(cpuDrawBuffers.lineStylesBuffer->getPointer()) + inMemLineStylesCount;
+		LineStyle* srcLineStylesData = reinterpret_cast<LineStyle*>(cpuDrawBuffers.lineStylesBuffer->getPointer()) + inMemLineStylesCount;
+
 		if (m_utilities->updateBufferRangeViaStagingBuffer(intendedNextSubmit, stylesRange, srcLineStylesData))
 			inMemLineStylesCount = currentLineStylesCount;
 		else
diff --git a/62_CAD/main.cpp b/62_CAD/main.cpp
index cce87e3b2..07df1febb 100644
--- a/62_CAD/main.cpp
+++ b/62_CAD/main.cpp
@@ -3300,7 +3300,7 @@ class ComputerAidedDesign final : public examples::SimpleWindowedApplication, pu
 			};
 
 			core::vector<uint32_t> indices = {
-				0, 1, 3,
+				0, 3, 1,
 				1, 3, 4,
 				1, 2, 4,
 				2, 4, 5,
@@ -3327,8 +3327,10 @@ class ComputerAidedDesign final : public examples::SimpleWindowedApplication, pu
 
 			LineStyleInfo outlineStyle = {};
 			dtmSettingsInfo.outlineLineStyleInfo.screenSpaceLineWidth = 0.0f;
-			dtmSettingsInfo.outlineLineStyleInfo.worldSpaceLineWidth = 5.0f;
+			dtmSettingsInfo.outlineLineStyleInfo.worldSpaceLineWidth = 2.0f;
 			dtmSettingsInfo.outlineLineStyleInfo.color = float32_t4(0.0f, 0.5f, 0.5f, 1.0f);
+			std::array<double, 4> outlineStipplePattern = { 0.0f, -5.0f, 2.0f, -5.0f };
+			dtmSettingsInfo.outlineLineStyleInfo.setStipplePatternData(outlineStipplePattern);
 
 			LineStyleInfo contourStyle = {};
 			dtmSettingsInfo.contourLineStyleInfo.screenSpaceLineWidth = 0.0f;
@@ -3337,6 +3339,7 @@ class ComputerAidedDesign final : public examples::SimpleWindowedApplication, pu
 
 			drawResourcesFiller.drawTriangleMesh(mesh, m_triangleMeshDrawData, dtmSettingsInfo, intendedNextSubmit);
 		}
+
 		drawResourcesFiller.finalizeAllCopiesToGPU(intendedNextSubmit);
 	}
 
diff --git a/62_CAD/shaders/main_pipeline/fragment_shader.hlsl b/62_CAD/shaders/main_pipeline/fragment_shader.hlsl
index a4176d1ef..4449d4687 100644
--- a/62_CAD/shaders/main_pipeline/fragment_shader.hlsl
+++ b/62_CAD/shaders/main_pipeline/fragment_shader.hlsl
@@ -425,6 +425,7 @@ float4 fragMain(PSInput input) : SV_TARGET
         const float3 baryCoord = nbl::hlsl::spirv::BaryCoordKHR;
 
         // TODO: figure out if branching can be reduced
+        // finding line start and end points by excluding vertex with the lowest barycentric coordinate value
         if (baryCoord.x < baryCoord.y && baryCoord.x < baryCoord.z)
         {
             start = float2(v1.x, v1.y);
@@ -432,7 +433,7 @@ float4 fragMain(PSInput input) : SV_TARGET
         }
         else if (baryCoord.y < baryCoord.x && baryCoord.y < baryCoord.z)
         {
-            start = float2(v1.x, v1.y);
+            start = float2(v0.x, v0.y);
             end = float2(v2.x, v2.y);
         }
         else if (baryCoord.z < baryCoord.x && baryCoord.z < baryCoord.y)
@@ -441,13 +442,20 @@ float4 fragMain(PSInput input) : SV_TARGET
             end = float2(v1.x, v1.y);
         }
 
+        // long story short, in order for stipple patterns to be consistent:
+        // - point with lesser x coord should be starting point
+        // - if x coord of both points are equal then point with lesser y value should be starting point
+        if (end.x < start.x)
+            nbl::hlsl::swap(start, end);
+        else if (end.x == start.x && end.y < start.y)
+            nbl::hlsl::swap(start, end);
+
         const float thickness = input.getLineThickness();
         const float phaseShift = 0.0f; // input.getCurrentPhaseShift();
-        const float stretch = 0.0f; // input.getPatternStretch();
+        const float stretch =  1.0f;
         const float worldToScreenRatio = input.getCurrentWorldToScreenRatio();
 
         nbl::hlsl::shapes::Line<float> lineSegment = nbl::hlsl::shapes::Line<float>::construct(start, end);
-        nbl::hlsl::shapes::Line<float>::ArcLengthCalculator arcLenCalc = nbl::hlsl::shapes::Line<float>::ArcLengthCalculator::construct(lineSegment);
 
         DTMSettings dtmSettings = dtmSettingsBuff[mainObj.dtmSettingsIdx];
         LineStyle outlineStyle = lineStyles[dtmSettings.outlineLineStyleIdx];
@@ -460,14 +468,16 @@ float4 fragMain(PSInput input) : SV_TARGET
         }
         else
         {
+            nbl::hlsl::shapes::Line<float>::ArcLengthCalculator arcLenCalc = nbl::hlsl::shapes::Line<float>::ArcLengthCalculator::construct(lineSegment);
+            printf("stretch = %f, worldToScreenRatio = %f", stretch, worldToScreenRatio);
             LineStyleClipper clipper = LineStyleClipper::construct(outlineStyle, lineSegment, arcLenCalc, phaseShift, stretch, worldToScreenRatio);
             distance = ClippedSignedDistance<nbl::hlsl::shapes::Line<float>, LineStyleClipper>::sdf(lineSegment, input.position.xy, thickness, outlineStyle.isRoadStyleFlag, clipper);
         }
 
         localAlpha = smoothstep(+globals.antiAliasingFactor, -globals.antiAliasingFactor, distance);
+        textureColor = float3(outlineStyle.color.x, outlineStyle.color.y, outlineStyle.color.z);
     }
 
-    textureColor = float3(1.0f, 1.0f, 1.0f);
     return calculateFinalColor<nbl::hlsl::jit::device_capabilities::fragmentShaderPixelInterlock>(uint2(input.position.xy), localAlpha, currentMainObjectIdx, textureColor, true);
 
     // figure out local alpha with sdf
@@ -485,7 +495,6 @@ float4 fragMain(PSInput input) : SV_TARGET
             const float worldToScreenRatio = input.getCurrentWorldToScreenRatio();
 
             nbl::hlsl::shapes::Line<float> lineSegment = nbl::hlsl::shapes::Line<float>::construct(start, end);
-            nbl::hlsl::shapes::Line<float>::ArcLengthCalculator arcLenCalc = nbl::hlsl::shapes::Line<float>::ArcLengthCalculator::construct(lineSegment);
 
             LineStyle style = lineStyles[styleIdx];
 
@@ -495,6 +504,7 @@ float4 fragMain(PSInput input) : SV_TARGET
             }
             else
             {
+                nbl::hlsl::shapes::Line<float>::ArcLengthCalculator arcLenCalc = nbl::hlsl::shapes::Line<float>::ArcLengthCalculator::construct(lineSegment);
                 LineStyleClipper clipper = LineStyleClipper::construct(lineStyles[styleIdx], lineSegment, arcLenCalc, phaseShift, stretch, worldToScreenRatio);
                 distance = ClippedSignedDistance<nbl::hlsl::shapes::Line<float>, LineStyleClipper>::sdf(lineSegment, input.position.xy, thickness, style.isRoadStyleFlag, clipper);
             }
diff --git a/62_CAD/shaders/main_pipeline/vertex_shader.hlsl b/62_CAD/shaders/main_pipeline/vertex_shader.hlsl
index 08418c844..e68ad1dec 100644
--- a/62_CAD/shaders/main_pipeline/vertex_shader.hlsl
+++ b/62_CAD/shaders/main_pipeline/vertex_shader.hlsl
@@ -122,6 +122,10 @@ PSInput main(uint vertexID : SV_VertexID)
     outV.position = transformFromSreenSpaceToNdc(outV.position.xy, globals.resolution);
     outV.setHeightAtMeshVertex(vtx.height);
     outV.setScreenSpaceVertexPos(float3(transformedPos, 1));
+    outV.setCurrentWorldToScreenRatio(
+        _static_cast<float>((_static_cast<pfloat64_t>(2.0f) /
+            (clipProjectionData.projectionToNDC[0].x * _static_cast<pfloat64_t>(globals.resolution.x))))
+    );
 
     // TODO: line style of contour line has to be set too!
     DTMSettings dtmSettings = dtmSettingsBuff[mainObj.dtmSettingsIdx];

From a215f450b8432c27c5ed0a352185e22b34aa2794 Mon Sep 17 00:00:00 2001
From: Przemek <minikers21@gmail.com>
Date: Thu, 20 Mar 2025 12:36:03 +0100
Subject: [PATCH 098/529] Contour drawing setup

---
 62_CAD/main.cpp                               | 24 ++++++++---------
 62_CAD/shaders/main_pipeline/common.hlsl      |  4 +--
 .../main_pipeline/fragment_shader.hlsl        | 27 +++++++++++++++++--
 .../shaders/main_pipeline/vertex_shader.hlsl  |  2 +-
 4 files changed, 40 insertions(+), 17 deletions(-)

diff --git a/62_CAD/main.cpp b/62_CAD/main.cpp
index 07df1febb..da3c93acd 100644
--- a/62_CAD/main.cpp
+++ b/62_CAD/main.cpp
@@ -3285,18 +3285,18 @@ class ComputerAidedDesign final : public examples::SimpleWindowedApplication, pu
 		else if (mode == ExampleMode::CASE_9)
 		{
 			core::vector<TriangleMeshVertex> vertices = {
-				{ float32_t2(-200.0f, -200.0f), 0.0f },
-				{ float32_t2(-50.0f, -200.0f), 0.0f },
-				{ float32_t2(100.0f, -200.0f), 0.0f },
-				{ float32_t2(-125.0f, -70.1f), 0.0f },
-				{ float32_t2(25.0f, -70.1f), 0.0f },
-				{ float32_t2(175.0f, -70.1f), 0.0f },
-				{ float32_t2(-200.0f, 59.8f), 0.0f },
-				{ float32_t2(-50.0f, 59.8f), 0.0f },
-				{ float32_t2(100.0f, 59.8f), 0.0f },
-				{ float32_t2(-125.0f, 189.7f), 0.0f },
-				{ float32_t2(25.0f, 189.7f), 0.0f },
-				{ float32_t2(175.0f, 189.7f), 0.0f }
+				{ float32_t2(-200.0f, -200.0f), 10.0f },
+				{ float32_t2(-50.0f, -200.0f), 50.0f },
+				{ float32_t2(100.0f, -200.0f), 90.0f },
+				{ float32_t2(-125.0f, -70.1f), 10.0f },
+				{ float32_t2(25.0f, -70.1f), 50.0f },
+				{ float32_t2(175.0f, -70.1f), 90.0f },
+				{ float32_t2(-200.0f, 59.8f), 10.0f },
+				{ float32_t2(-50.0f, 59.8f), 50.0f },
+				{ float32_t2(100.0f, 59.8f), 90.0f },
+				{ float32_t2(-125.0f, 189.7f), 10.0f },
+				{ float32_t2(25.0f, 189.7f), 50.0f },
+				{ float32_t2(175.0f, 189.7f), 90.0f }
 			};
 
 			core::vector<uint32_t> indices = {
diff --git a/62_CAD/shaders/main_pipeline/common.hlsl b/62_CAD/shaders/main_pipeline/common.hlsl
index dc47604ad..73121fe36 100644
--- a/62_CAD/shaders/main_pipeline/common.hlsl
+++ b/62_CAD/shaders/main_pipeline/common.hlsl
@@ -104,8 +104,8 @@ struct PSInput
     void setCurrentWorldToScreenRatio(float worldToScreen) { interp_data5.y = worldToScreen; }
     float getCurrentWorldToScreenRatio() { return interp_data5.y; }
 
-    void setHeightAtMeshVertex(float height) { interp_data5.x = height; }
-    float getHeightAtMeshVertex() { return interp_data5.x; }
+    void setHeight(float height) { interp_data5.x = height; }
+    float getHeight() { return interp_data5.x; }
 
     /* LINE */
     float2 getLineStart() { return data2.xy; }
diff --git a/62_CAD/shaders/main_pipeline/fragment_shader.hlsl b/62_CAD/shaders/main_pipeline/fragment_shader.hlsl
index 4449d4687..5311fa48d 100644
--- a/62_CAD/shaders/main_pipeline/fragment_shader.hlsl
+++ b/62_CAD/shaders/main_pipeline/fragment_shader.hlsl
@@ -414,12 +414,31 @@ float4 fragMain(PSInput input) : SV_TARGET
     const uint32_t currentMainObjectIdx = input.getMainObjectIdx();
     const MainObject mainObj = mainObjects[currentMainObjectIdx];
 
+    
+
     // TRIANGLE RENDERING
     {
         float3 v0 = input.getScreenSpaceVertexPos(0);
         float3 v1 = input.getScreenSpaceVertexPos(1);
         float3 v2 = input.getScreenSpaceVertexPos(2);
 
+        // CONTOUR
+
+        // TODO: move to ubo or push constants
+        const float startHeight = 10.0f;
+        const float endHeight = 100.0f;
+        const float interval = 10.0f;
+        float height = input.getHeight();
+
+        // TODO: it actually can output a negative number, fix
+        int contourLineIdx = nbl::hlsl::_static_cast<int>((height - startHeight + (interval * 0.5f)) / interval);
+
+        float backgroundColor = contourLineIdx;
+        backgroundColor *= 0.1f;
+        textureColor = float3(backgroundColor, backgroundColor, backgroundColor);
+
+        // OUTLINE
+
         float2 start;
         float2 end;
         const float3 baryCoord = nbl::hlsl::spirv::BaryCoordKHR;
@@ -469,13 +488,17 @@ float4 fragMain(PSInput input) : SV_TARGET
         else
         {
             nbl::hlsl::shapes::Line<float>::ArcLengthCalculator arcLenCalc = nbl::hlsl::shapes::Line<float>::ArcLengthCalculator::construct(lineSegment);
-            printf("stretch = %f, worldToScreenRatio = %f", stretch, worldToScreenRatio);
             LineStyleClipper clipper = LineStyleClipper::construct(outlineStyle, lineSegment, arcLenCalc, phaseShift, stretch, worldToScreenRatio);
             distance = ClippedSignedDistance<nbl::hlsl::shapes::Line<float>, LineStyleClipper>::sdf(lineSegment, input.position.xy, thickness, outlineStyle.isRoadStyleFlag, clipper);
         }
 
         localAlpha = smoothstep(+globals.antiAliasingFactor, -globals.antiAliasingFactor, distance);
-        textureColor = float3(outlineStyle.color.x, outlineStyle.color.y, outlineStyle.color.z);
+
+        // TODO: remove, this is just a hack to draw background
+        if (localAlpha < 0.00001)
+            localAlpha = 1.0f;
+        else
+            textureColor = float3(outlineStyle.color.x, outlineStyle.color.y, outlineStyle.color.z);
     }
 
     return calculateFinalColor<nbl::hlsl::jit::device_capabilities::fragmentShaderPixelInterlock>(uint2(input.position.xy), localAlpha, currentMainObjectIdx, textureColor, true);
diff --git a/62_CAD/shaders/main_pipeline/vertex_shader.hlsl b/62_CAD/shaders/main_pipeline/vertex_shader.hlsl
index e68ad1dec..a1788a91e 100644
--- a/62_CAD/shaders/main_pipeline/vertex_shader.hlsl
+++ b/62_CAD/shaders/main_pipeline/vertex_shader.hlsl
@@ -120,7 +120,7 @@ PSInput main(uint vertexID : SV_VertexID)
 
     outV.position.xy = transformedPos;
     outV.position = transformFromSreenSpaceToNdc(outV.position.xy, globals.resolution);
-    outV.setHeightAtMeshVertex(vtx.height);
+    outV.setHeight(vtx.height);
     outV.setScreenSpaceVertexPos(float3(transformedPos, 1));
     outV.setCurrentWorldToScreenRatio(
         _static_cast<float>((_static_cast<pfloat64_t>(2.0f) /

From c9041343c0ca756c07aa94008f2da790d8ec5b3b Mon Sep 17 00:00:00 2001
From: kevyuu <kevin.kayu@gmail.com>
Date: Thu, 20 Mar 2025 20:34:14 +0700
Subject: [PATCH 099/529] Return materialId instead of materialPacked from
 rchit

---
 .../app_resources/common.hlsl                 | 140 +++++++++++++++++-
 .../app_resources/raytrace.rchit.hlsl         |  94 +-----------
 .../app_resources/raytrace.rgen.hlsl          |  20 ++-
 .../app_resources/raytrace.rint.hlsl          |   4 +-
 .../raytrace_procedural.rchit.hlsl            |   3 +-
 5 files changed, 155 insertions(+), 106 deletions(-)

diff --git a/71_RayTracingPipeline/app_resources/common.hlsl b/71_RayTracingPipeline/app_resources/common.hlsl
index 0b5f4b170..d64851b17 100644
--- a/71_RayTracingPipeline/app_resources/common.hlsl
+++ b/71_RayTracingPipeline/app_resources/common.hlsl
@@ -75,6 +75,7 @@ struct SProceduralGeomInfo
     float32_t radius;
 };
 
+
 struct STriangleGeomInfo
 {
     MaterialPacked material;
@@ -154,13 +155,6 @@ struct RayLight
     float32_t outIntensity;
 };
 
-struct ProceduralHitAttribute
-{
-    MaterialPacked material;
-    float32_t3 center;
-};
-
-
 #ifdef __HLSL_VERSION
 
 struct [raypayload] OcclusionPayload
@@ -168,12 +162,50 @@ struct [raypayload] OcclusionPayload
     float32_t attenuation : read(caller) : write(caller, anyhit);
 };
 
+struct MaterialId
+{
+    const static uint32_t PROCEDURAL_FLAG = (1 << 31);
+    const static uint32_t PROCEDURAL_MASK = ~PROCEDURAL_FLAG;
+
+    uint32_t data;
+
+    static MaterialId createProcedural(uint32_t index)
+    {
+        MaterialId id;
+        id.data = index | PROCEDURAL_FLAG;
+        return id;
+    }
+
+    static MaterialId createTriangle(uint32_t index)
+    {
+        MaterialId id;
+        id.data = index;
+        return id;
+    }
+
+    uint32_t getMaterialIndex()
+    {
+        return data & PROCEDURAL_MASK;
+    }
+
+    bool isHitProceduralGeom()
+    {
+        return data & PROCEDURAL_FLAG;
+    }
+};
+
 struct [raypayload] PrimaryPayload
 {
-    MaterialPacked material : read(caller) : write(closesthit);
     float32_t3 worldNormal : read(caller) : write(closesthit);
     float32_t rayDistance : read(caller) : write(closesthit, miss);
     float32_t alphaThreshold : read(closesthit, anyhit) : write(caller);
+    MaterialId materialId : read(caller) : write(closesthit);
+
+};
+
+struct ProceduralHitAttribute
+{
+    float32_t3 center;
 };
 
 enum ObjectType : uint32_t  // matches c++
@@ -213,6 +245,98 @@ float32_t3 computeSpecular(Material mat, float32_t3 view_dir,
 
 	return float32_t3(mat.specular * specular);
 }
+
+float3 unpackNormals3x10(uint32_t v)
+{
+    // host side changes float32_t3 to EF_A2B10G10R10_SNORM_PACK32
+    // follows unpacking scheme from https://github.com/KhronosGroup/SPIRV-Cross/blob/main/reference/shaders-hlsl/frag/unorm-snorm-packing.frag
+    int signedValue = int(v);
+    int3 pn = int3(signedValue << 22, signedValue << 12, signedValue << 2) >> 22;
+    return clamp(float3(pn) / 511.0, -1.0, 1.0);
+}
+
+float32_t3 fetchVertexNormal(int instID, int primID, STriangleGeomInfo geom, float2 bary)
+{
+    uint idxOffset = primID * 3;
+
+    const uint indexType = geom.indexType;
+    const uint vertexStride = geom.vertexStride;
+
+    const uint32_t objType = geom.objType;
+    const uint64_t indexBufferAddress = geom.indexBufferAddress;
+
+    uint i0, i1, i2;
+    switch (indexType)
+    {
+        case 0: // EIT_16BIT
+        {
+                i0 = uint32_t(vk::RawBufferLoad < uint16_t > (indexBufferAddress + (idxOffset + 0) * sizeof(uint16_t), 2u));
+                i1 = uint32_t(vk::RawBufferLoad < uint16_t > (indexBufferAddress + (idxOffset + 1) * sizeof(uint16_t), 2u));
+                i2 = uint32_t(vk::RawBufferLoad < uint16_t > (indexBufferAddress + (idxOffset + 2) * sizeof(uint16_t), 2u));
+            }
+            break;
+        case 1: // EIT_32BIT
+        {
+                i0 = vk::RawBufferLoad < uint32_t > (indexBufferAddress + (idxOffset + 0) * sizeof(uint32_t));
+                i1 = vk::RawBufferLoad < uint32_t > (indexBufferAddress + (idxOffset + 1) * sizeof(uint32_t));
+                i2 = vk::RawBufferLoad < uint32_t > (indexBufferAddress + (idxOffset + 2) * sizeof(uint32_t));
+            }
+            break;
+        default: // EIT_NONE
+        {
+                i0 = idxOffset;
+                i1 = idxOffset + 1;
+                i2 = idxOffset + 2;
+            }
+    }
+
+    const uint64_t normalVertexBufferAddress = geom.vertexBufferAddress + s_offsetsToNormalBytes[objType];
+    float3 n0, n1, n2;
+    switch (objType)
+    {
+        case OT_CUBE:
+        {
+                uint32_t v0 = vk::RawBufferLoad < uint32_t > (normalVertexBufferAddress + i0 * vertexStride, 2u);
+                uint32_t v1 = vk::RawBufferLoad < uint32_t > (normalVertexBufferAddress + i1 * vertexStride, 2u);
+                uint32_t v2 = vk::RawBufferLoad < uint32_t > (normalVertexBufferAddress + i2 * vertexStride, 2u);
+
+                n0 = normalize(nbl::hlsl::spirv::unpackSnorm4x8(v0).xyz);
+                n1 = normalize(nbl::hlsl::spirv::unpackSnorm4x8(v1).xyz);
+                n2 = normalize(nbl::hlsl::spirv::unpackSnorm4x8(v2).xyz);
+            }
+            break;
+        case OT_SPHERE:
+        case OT_CYLINDER:
+        case OT_ARROW:
+        case OT_CONE:
+        {
+                uint32_t v0 = vk::RawBufferLoad < uint32_t > (normalVertexBufferAddress + i0 * vertexStride);
+                uint32_t v1 = vk::RawBufferLoad < uint32_t > (normalVertexBufferAddress + i1 * vertexStride);
+                uint32_t v2 = vk::RawBufferLoad < uint32_t > (normalVertexBufferAddress + i2 * vertexStride);
+
+                n0 = normalize(unpackNormals3x10(v0));
+                n1 = normalize(unpackNormals3x10(v1));
+                n2 = normalize(unpackNormals3x10(v2));
+            }
+            break;
+        case OT_RECTANGLE:
+        case OT_DISK:
+        case OT_ICOSPHERE:
+        default:
+        {
+                n0 = normalize(vk::RawBufferLoad <
+                float3 > (normalVertexBufferAddress + i0 * vertexStride));
+                n1 = normalize(vk::RawBufferLoad <
+                float3 > (normalVertexBufferAddress + i1 * vertexStride));
+                n2 = normalize(vk::RawBufferLoad <
+                float3 > (normalVertexBufferAddress + i2 * vertexStride));
+            }
+    }
+
+    float3 barycentrics = float3(0.0, bary);
+    barycentrics.x = 1.0 - barycentrics.y - barycentrics.z;
+    return normalize(barycentrics.x * n0 + barycentrics.y * n1 + barycentrics.z * n2);
+}
 #endif
 
 namespace nbl
diff --git a/71_RayTracingPipeline/app_resources/raytrace.rchit.hlsl b/71_RayTracingPipeline/app_resources/raytrace.rchit.hlsl
index fdb252cda..cf68e52eb 100644
--- a/71_RayTracingPipeline/app_resources/raytrace.rchit.hlsl
+++ b/71_RayTracingPipeline/app_resources/raytrace.rchit.hlsl
@@ -2,97 +2,6 @@
 
 [[vk::push_constant]] SPushConstants pc;
 
-float3 unpackNormals3x10(uint32_t v)
-{
-    // host side changes float32_t3 to EF_A2B10G10R10_SNORM_PACK32
-    // follows unpacking scheme from https://github.com/KhronosGroup/SPIRV-Cross/blob/main/reference/shaders-hlsl/frag/unorm-snorm-packing.frag
-    int signedValue = int(v);
-    int3 pn = int3(signedValue << 22, signedValue << 12, signedValue << 2) >> 22;
-    return clamp(float3(pn) / 511.0, -1.0, 1.0);
-}
-
-float32_t3 fetchVertexNormal(int instID, int primID, STriangleGeomInfo geom, float2 bary)
-{
-    uint idxOffset = primID * 3;
-
-    const uint indexType = geom.indexType;
-    const uint vertexStride = geom.vertexStride;
-
-    const uint32_t objType = geom.objType;
-    const uint64_t indexBufferAddress = geom.indexBufferAddress;
-
-    uint i0, i1, i2;
-    switch (indexType)
-    {
-        case 0: // EIT_16BIT
-        {
-                i0 = uint32_t(vk::RawBufferLoad < uint16_t > (indexBufferAddress + (idxOffset + 0) * sizeof(uint16_t), 2u));
-                i1 = uint32_t(vk::RawBufferLoad < uint16_t > (indexBufferAddress + (idxOffset + 1) * sizeof(uint16_t), 2u));
-                i2 = uint32_t(vk::RawBufferLoad < uint16_t > (indexBufferAddress + (idxOffset + 2) * sizeof(uint16_t), 2u));
-            }
-            break;
-        case 1: // EIT_32BIT
-        {
-                i0 = vk::RawBufferLoad < uint32_t > (indexBufferAddress + (idxOffset + 0) * sizeof(uint32_t));
-                i1 = vk::RawBufferLoad < uint32_t > (indexBufferAddress + (idxOffset + 1) * sizeof(uint32_t));
-                i2 = vk::RawBufferLoad < uint32_t > (indexBufferAddress + (idxOffset + 2) * sizeof(uint32_t));
-            }
-            break;
-        default: // EIT_NONE
-        {
-                i0 = idxOffset;
-                i1 = idxOffset + 1;
-                i2 = idxOffset + 2;
-            }
-    }
-
-    const uint64_t normalVertexBufferAddress = geom.vertexBufferAddress + s_offsetsToNormalBytes[objType];
-    float3 n0, n1, n2;
-    switch (objType)
-    {
-        case OT_CUBE:
-        {
-                uint32_t v0 = vk::RawBufferLoad < uint32_t > (normalVertexBufferAddress + i0 * vertexStride, 2u);
-                uint32_t v1 = vk::RawBufferLoad < uint32_t > (normalVertexBufferAddress + i1 * vertexStride, 2u);
-                uint32_t v2 = vk::RawBufferLoad < uint32_t > (normalVertexBufferAddress + i2 * vertexStride, 2u);
-
-                n0 = normalize(nbl::hlsl::spirv::unpackSnorm4x8(v0).xyz);
-                n1 = normalize(nbl::hlsl::spirv::unpackSnorm4x8(v1).xyz);
-                n2 = normalize(nbl::hlsl::spirv::unpackSnorm4x8(v2).xyz);
-            }
-            break;
-        case OT_SPHERE:
-        case OT_CYLINDER:
-        case OT_ARROW:
-        case OT_CONE:
-        {
-                uint32_t v0 = vk::RawBufferLoad < uint32_t > (normalVertexBufferAddress + i0 * vertexStride);
-                uint32_t v1 = vk::RawBufferLoad < uint32_t > (normalVertexBufferAddress + i1 * vertexStride);
-                uint32_t v2 = vk::RawBufferLoad < uint32_t > (normalVertexBufferAddress + i2 * vertexStride);
-
-                n0 = normalize(unpackNormals3x10(v0));
-                n1 = normalize(unpackNormals3x10(v1));
-                n2 = normalize(unpackNormals3x10(v2));
-            }
-            break;
-        case OT_RECTANGLE:
-        case OT_DISK:
-        case OT_ICOSPHERE:
-        default:
-        {
-                n0 = normalize(vk::RawBufferLoad <
-                float3 > (normalVertexBufferAddress + i0 * vertexStride));
-                n1 = normalize(vk::RawBufferLoad <
-                float3 > (normalVertexBufferAddress + i1 * vertexStride));
-                n2 = normalize(vk::RawBufferLoad <
-                float3 > (normalVertexBufferAddress + i2 * vertexStride));
-            }
-    }
-
-    float3 barycentrics = float3(0.0, bary);
-    barycentrics.x = 1.0 - barycentrics.y - barycentrics.z;
-    return normalize(barycentrics.x * n0 + barycentrics.y * n1 + barycentrics.z * n2);
-}
 
 [shader("closesthit")]
 void main(inout PrimaryPayload payload, in BuiltInTriangleIntersectionAttributes attribs)
@@ -103,7 +12,8 @@ void main(inout PrimaryPayload payload, in BuiltInTriangleIntersectionAttributes
     const float32_t3 vertexNormal = fetchVertexNormal(instID, primID, geom, attribs.barycentrics);
     const float32_t3 worldNormal = normalize(mul(vertexNormal, WorldToObject3x4()).xyz);
 
-    payload.material = geom.material;
+    payload.materialId = MaterialId::createTriangle(instID);
+
     payload.worldNormal = worldNormal;
     payload.rayDistance = RayTCurrent();
 
diff --git a/71_RayTracingPipeline/app_resources/raytrace.rgen.hlsl b/71_RayTracingPipeline/app_resources/raytrace.rgen.hlsl
index fc6383dcf..ef84ced3e 100644
--- a/71_RayTracingPipeline/app_resources/raytrace.rgen.hlsl
+++ b/71_RayTracingPipeline/app_resources/raytrace.rgen.hlsl
@@ -58,15 +58,29 @@ void main()
         payload.alphaThreshold = nextRandomUnorm(rnd);
         TraceRay(topLevelAS, RAY_FLAG_NONE, 0xff, ERT_PRIMARY, 0, EMT_PRIMARY, rayDesc, payload);
 
-        if (payload.rayDistance < 0)
+        const float32_t rayDistance = payload.rayDistance;
+        if (rayDistance < 0)
         {
             hitValues += s_clearColor;
             continue;
         }
 
-        const float32_t3 worldPosition = pc.camPos + (camDirection * payload.rayDistance);
+        const float32_t3 worldPosition = pc.camPos + (camDirection * rayDistance);
         const float32_t3 worldNormal = payload.worldNormal;
-        const Material material = nbl::hlsl::_static_cast<Material>(payload.material);
+
+        Material material;
+        MaterialId materialId = payload.materialId;
+        // we use negative index to indicate that this is a procedural geometry
+        if (materialId.isHitProceduralGeom())
+        {
+            const MaterialPacked materialPacked = vk::RawBufferLoad<MaterialPacked>(pc.proceduralGeomInfoBuffer + materialId.getMaterialIndex() * sizeof(SProceduralGeomInfo));
+            material = nbl::hlsl::_static_cast<Material>(materialPacked);
+        }
+        else
+        {
+            const MaterialPacked materialPacked = vk::RawBufferLoad<MaterialPacked>(pc.triangleGeomInfoBuffer + materialId.getMaterialIndex() * sizeof(STriangleGeomInfo));
+            material = nbl::hlsl::_static_cast<Material>(materialPacked);
+        }
         RayLight cLight;
         cLight.inHitPosition = worldPosition;
         CallShader(pc.light.type, cLight);
diff --git a/71_RayTracingPipeline/app_resources/raytrace.rint.hlsl b/71_RayTracingPipeline/app_resources/raytrace.rint.hlsl
index b9941fc59..ab623382d 100644
--- a/71_RayTracingPipeline/app_resources/raytrace.rint.hlsl
+++ b/71_RayTracingPipeline/app_resources/raytrace.rint.hlsl
@@ -38,16 +38,16 @@ void main()
     const int primID = PrimitiveIndex();
 
     // Sphere data
-    SProceduralGeomInfo sphere = vk::RawBufferLoad < SProceduralGeomInfo > (pc.proceduralGeomInfoBuffer + primID * sizeof(SProceduralGeomInfo));
+    SProceduralGeomInfo sphere = vk::RawBufferLoad<SProceduralGeomInfo>(pc.proceduralGeomInfoBuffer + primID * sizeof(SProceduralGeomInfo));
 
     const float32_t tHit = hitSphere(sphere, ray);
     
     ProceduralHitAttribute hitAttrib;
+
     // Report hit point
     if (tHit > 0)
     {
         hitAttrib.center = sphere.center;
-        hitAttrib.material = sphere.material;
         ReportHit(tHit, 0, hitAttrib);
     }
 }
\ No newline at end of file
diff --git a/71_RayTracingPipeline/app_resources/raytrace_procedural.rchit.hlsl b/71_RayTracingPipeline/app_resources/raytrace_procedural.rchit.hlsl
index 0a58ccba8..df9ef9623 100644
--- a/71_RayTracingPipeline/app_resources/raytrace_procedural.rchit.hlsl
+++ b/71_RayTracingPipeline/app_resources/raytrace_procedural.rchit.hlsl
@@ -8,7 +8,8 @@ void main(inout PrimaryPayload payload, in ProceduralHitAttribute attrib)
     const float32_t3 worldPosition = WorldRayOrigin() + WorldRayDirection() * RayTCurrent();
     const float32_t3 worldNormal = normalize(worldPosition - attrib.center);
 
-    payload.material = attrib.material;
+    payload.materialId = MaterialId::createProcedural(PrimitiveIndex()); // we use negative value to indicate that this is procedural
+
     payload.worldNormal = worldNormal;
     payload.rayDistance = RayTCurrent();
 

From 6b3ae5402a0c2b7c85506d9c905a89ddd7257e14 Mon Sep 17 00:00:00 2001
From: kevyuu <kevin.kayu@gmail.com>
Date: Thu, 20 Mar 2025 20:38:51 +0700
Subject: [PATCH 100/529] Add ray tracing no null flags

---
 71_RayTracingPipeline/main.cpp | 5 ++++-
 1 file changed, 4 insertions(+), 1 deletion(-)

diff --git a/71_RayTracingPipeline/main.cpp b/71_RayTracingPipeline/main.cpp
index 036acd510..5fe6f8847 100644
--- a/71_RayTracingPipeline/main.cpp
+++ b/71_RayTracingPipeline/main.cpp
@@ -351,7 +351,10 @@ class RaytracingPipelineApp final : public examples::SimpleWindowedApplication,
 
       params.layout = pipelineLayout.get();
       params.shaders = std::span(shaders);
-      params.flags = IGPURayTracingPipeline::SCreationParams::FLAGS::NO_NULL_INTERSECTION_SHADERS;
+      using RayTracingFlags = IGPURayTracingPipeline::SCreationParams::FLAGS;
+      params.flags = core::bitflag(RayTracingFlags::NO_NULL_INTERSECTION_SHADERS) | 
+        RayTracingFlags::NO_NULL_ANY_HIT_SHADERS |
+        RayTracingFlags::NO_NULL_CLOSEST_HIT_SHADERS;
 
       auto& shaderGroups = params.shaderGroups;
 

From b2abf0042c60d1524ef14a93ebe384172f3de3d1 Mon Sep 17 00:00:00 2001
From: kevyuu <kevin.kayu@gmail.com>
Date: Thu, 20 Mar 2025 20:58:48 +0700
Subject: [PATCH 101/529] Add setRayTracingStackSize in the demo

---
 71_RayTracingPipeline/main.cpp | 29 ++++++++++++++++++++++++++++-
 1 file changed, 28 insertions(+), 1 deletion(-)

diff --git a/71_RayTracingPipeline/main.cpp b/71_RayTracingPipeline/main.cpp
index 5fe6f8847..1e4619b46 100644
--- a/71_RayTracingPipeline/main.cpp
+++ b/71_RayTracingPipeline/main.cpp
@@ -397,11 +397,13 @@ class RaytracingPipelineApp final : public examples::SimpleWindowedApplication,
       shaderGroups.callables = callableGroups;
 
       params.cached.maxRecursionDepth = 1;
+      params.cached.dynamicStackSize = true;
 
       if (!m_device->createRayTracingPipelines(nullptr, { &params, 1 }, &m_rayTracingPipeline))
         return logFail("Failed to create ray tracing pipeline");
-      m_logger->log("Ray Tracing Pipeline Created!", system::ILogger::ELL_INFO);
 
+      calculateRayTracingStackSize(m_rayTracingPipeline);
+      
       if (!createShaderBindingTable(gQueue, m_rayTracingPipeline))
         return logFail("Could not create shader binding table");
 
@@ -732,6 +734,7 @@ class RaytracingPipelineApp final : public examples::SimpleWindowedApplication,
       memcpy(&pc.invMVP, invModelViewProjectionMatrix.pointer(), sizeof(pc.invMVP));
 
       cmdbuf->bindRayTracingPipeline(m_rayTracingPipeline.get());
+      cmdbuf->setRayTracingPipelineStackSize(m_rayTracingStackSize);
       cmdbuf->pushConstants(m_rayTracingPipeline->getLayout(), IShader::E_SHADER_STAGE::ESS_ALL_RAY_TRACING, 0, sizeof(SPushConstants), &pc);
       cmdbuf->bindDescriptorSets(EPBP_RAY_TRACING, m_rayTracingPipeline->getLayout(), 0, 1, &m_rayTracingDs.get());
       if (m_useIndirectCommand)
@@ -1332,6 +1335,29 @@ class RaytracingPipelineApp final : public examples::SimpleWindowedApplication,
     return true;
   }
 
+  void calculateRayTracingStackSize(const smart_refctd_ptr<video::IGPURayTracingPipeline>& pipeline)
+  {
+    const auto raygenStackSize = pipeline->getRaygenStackSize();
+    auto getMaxSize = [&](auto ranges, auto valProj) -> uint16_t
+      {
+        auto maxValue = 0;
+        for (const auto& val : ranges)
+        {
+          maxValue = std::max<uint16_t>(maxValue, std::invoke(valProj, val));
+        }
+        return maxValue;
+      };
+
+    const auto closestHitStackMax = getMaxSize(pipeline->getHitStackSizes(), &IGPURayTracingPipeline::SHitGroupStackSize::closestHit);
+    const auto anyHitStackMax = getMaxSize(pipeline->getHitStackSizes(), &IGPURayTracingPipeline::SHitGroupStackSize::anyHit);
+    const auto intersectionStackMax = getMaxSize(pipeline->getHitStackSizes(), &IGPURayTracingPipeline::SHitGroupStackSize::intersection);
+    const auto missStackMax = getMaxSize(pipeline->getMissStackSizes(), std::identity{});
+    const auto callableStackMax = getMaxSize(pipeline->getCallableStackSizes(), std::identity{});
+    auto firstDepthStackSizeMax = std::max(closestHitStackMax, missStackMax);
+    firstDepthStackSizeMax = std::max<uint16_t>(firstDepthStackSizeMax, intersectionStackMax + anyHitStackMax);
+    m_rayTracingStackSize = raygenStackSize + std::max(firstDepthStackSizeMax, callableStackMax);
+  }
+
   bool createShaderBindingTable(video::CThreadSafeQueueAdapter* queue, const smart_refctd_ptr<video::IGPURayTracingPipeline>& pipeline)
   {
     const auto& limits = m_device->getPhysicalDevice()->getLimits();
@@ -1823,6 +1849,7 @@ class RaytracingPipelineApp final : public examples::SimpleWindowedApplication,
   smart_refctd_ptr<IDescriptorPool> m_rayTracingDsPool;
   smart_refctd_ptr<IGPUDescriptorSet> m_rayTracingDs;
   smart_refctd_ptr<IGPURayTracingPipeline> m_rayTracingPipeline;
+  uint64_t m_rayTracingStackSize;
   ShaderBindingTable m_shaderBindingTable;
 
   smart_refctd_ptr<IGPUDescriptorSet> m_presentDs;

From e95f09d5d20181c4107064cec08bddc689a7f399 Mon Sep 17 00:00:00 2001
From: keptsecret <sorchon@gmail.com>
Date: Fri, 21 Mar 2025 16:48:41 +0700
Subject: [PATCH 102/529] changed workgroup size to 512

---
 31_HLSLPathTracer/app_resources/glsl/common.glsl |  2 +-
 .../app_resources/hlsl/render.comp.hlsl          |  2 +-
 31_HLSLPathTracer/imgui.ini                      |  8 --------
 31_HLSLPathTracer/main.cpp                       | 16 ++++++++--------
 4 files changed, 10 insertions(+), 18 deletions(-)
 delete mode 100644 31_HLSLPathTracer/imgui.ini

diff --git a/31_HLSLPathTracer/app_resources/glsl/common.glsl b/31_HLSLPathTracer/app_resources/glsl/common.glsl
index b09c90824..9015f755d 100644
--- a/31_HLSLPathTracer/app_resources/glsl/common.glsl
+++ b/31_HLSLPathTracer/app_resources/glsl/common.glsl
@@ -16,7 +16,7 @@ layout(set = 2, binding = 2) uniform usampler2D scramblebuf;
 layout(set=0, binding=0, rgba16f) uniform image2D outImage;
 
 #ifndef _NBL_GLSL_WORKGROUP_SIZE_
-#define _NBL_GLSL_WORKGROUP_SIZE_ 256
+#define _NBL_GLSL_WORKGROUP_SIZE_ 1024
 layout(local_size_x=_NBL_GLSL_WORKGROUP_SIZE_, local_size_y=1, local_size_z=1) in;
 #endif
 
diff --git a/31_HLSLPathTracer/app_resources/hlsl/render.comp.hlsl b/31_HLSLPathTracer/app_resources/hlsl/render.comp.hlsl
index 5e8102f6f..d0c969b8b 100644
--- a/31_HLSLPathTracer/app_resources/hlsl/render.comp.hlsl
+++ b/31_HLSLPathTracer/app_resources/hlsl/render.comp.hlsl
@@ -37,7 +37,7 @@
 
 using namespace nbl::hlsl;
 
-NBL_CONSTEXPR uint32_t WorkgroupSize = 256;
+NBL_CONSTEXPR uint32_t WorkgroupSize = 1024;
 NBL_CONSTEXPR uint32_t MAX_DEPTH_LOG2 = 4;
 NBL_CONSTEXPR uint32_t MAX_SAMPLES_LOG2 = 10;
 
diff --git a/31_HLSLPathTracer/imgui.ini b/31_HLSLPathTracer/imgui.ini
deleted file mode 100644
index e60624929..000000000
--- a/31_HLSLPathTracer/imgui.ini
+++ /dev/null
@@ -1,8 +0,0 @@
-[Window][Debug##Default]
-Pos=60,60
-Size=400,400
-
-[Window][Controls]
-Pos=10,10
-Size=320,340
-
diff --git a/31_HLSLPathTracer/main.cpp b/31_HLSLPathTracer/main.cpp
index e3e0b7d7a..add980078 100644
--- a/31_HLSLPathTracer/main.cpp
+++ b/31_HLSLPathTracer/main.cpp
@@ -48,7 +48,7 @@ class HLSLComputePathtracer final : public examples::SimpleWindowedApplication,
 		constexpr static inline uint32_t2 WindowDimensions = { 1280, 720 };
 		constexpr static inline uint32_t MaxFramesInFlight = 5;
 		constexpr static inline clock_t::duration DisplayImageDuration = std::chrono::milliseconds(900);
-		constexpr static inline uint32_t DefaultWorkGroupSize = 256u;
+		constexpr static inline uint32_t DefaultWorkGroupSize = 1024u;
 		constexpr static inline uint32_t MaxDescriptorCount = 256u;
 		constexpr static inline uint32_t MaxDepthLog2 = 4u; // 5
 		constexpr static inline uint32_t MaxSamplesLog2 = 10u; // 18
@@ -366,12 +366,12 @@ class HLSLComputePathtracer final : public examples::SimpleWindowedApplication,
 					options.stage = IShader::E_SHADER_STAGE::ESS_COMPUTE;	// should be compute
 					options.targetSpirvVersion = m_device->getPhysicalDevice()->getLimits().spirvVersion;
 					options.spirvOptimizer = nullptr;
-#ifndef _NBL_DEBUG
-					ISPIRVOptimizer::E_OPTIMIZER_PASS optPasses = ISPIRVOptimizer::EOP_STRIP_DEBUG_INFO;
-					auto opt = make_smart_refctd_ptr<ISPIRVOptimizer>(std::span<ISPIRVOptimizer::E_OPTIMIZER_PASS>(&optPasses, 1));
-					options.spirvOptimizer = opt.get();
-#endif
-					options.debugInfoFlags = IShaderCompiler::E_DEBUG_INFO_FLAGS::EDIF_NONE;
+//#ifndef _NBL_DEBUG
+//					ISPIRVOptimizer::E_OPTIMIZER_PASS optPasses = ISPIRVOptimizer::EOP_STRIP_DEBUG_INFO;
+//					auto opt = make_smart_refctd_ptr<ISPIRVOptimizer>(std::span<ISPIRVOptimizer::E_OPTIMIZER_PASS>(&optPasses, 1));
+//					options.spirvOptimizer = opt.get();
+//#endif
+					options.debugInfoFlags |= IShaderCompiler::E_DEBUG_INFO_FLAGS::EDIF_LINE_BIT;
 					options.preprocessorOptions.sourceIdentifier = source->getFilepathHint();
 					options.preprocessorOptions.logger = m_logger.get();
 					options.preprocessorOptions.includeFinder = compiler->getDefaultIncludeFinder();
@@ -1348,7 +1348,7 @@ class HLSLComputePathtracer final : public examples::SimpleWindowedApplication,
 		float camYAngle = 165.f / 180.f * 3.14159f;
 		float camXAngle = 32.f / 180.f * 3.14159f;
 		int PTPipline = E_LIGHT_GEOMETRY::ELG_SPHERE;
-		int renderMode = E_RENDER_MODE::ERM_HLSL;
+		int renderMode = E_RENDER_MODE::ERM_GLSL;
 		int spp = 32;
 		int depth = 3;
 

From 56994a9d36ae0e21e54a07aa76e1e5bbe2e2d959 Mon Sep 17 00:00:00 2001
From: keptsecret <sorchon@gmail.com>
Date: Fri, 21 Mar 2025 16:51:39 +0700
Subject: [PATCH 103/529] workgroup size 512 for sure this time

---
 31_HLSLPathTracer/app_resources/glsl/common.glsl   |  2 +-
 .../app_resources/hlsl/render.comp.hlsl            |  2 +-
 31_HLSLPathTracer/main.cpp                         | 14 +++++++-------
 3 files changed, 9 insertions(+), 9 deletions(-)

diff --git a/31_HLSLPathTracer/app_resources/glsl/common.glsl b/31_HLSLPathTracer/app_resources/glsl/common.glsl
index 9015f755d..1a1594e6a 100644
--- a/31_HLSLPathTracer/app_resources/glsl/common.glsl
+++ b/31_HLSLPathTracer/app_resources/glsl/common.glsl
@@ -16,7 +16,7 @@ layout(set = 2, binding = 2) uniform usampler2D scramblebuf;
 layout(set=0, binding=0, rgba16f) uniform image2D outImage;
 
 #ifndef _NBL_GLSL_WORKGROUP_SIZE_
-#define _NBL_GLSL_WORKGROUP_SIZE_ 1024
+#define _NBL_GLSL_WORKGROUP_SIZE_ 512
 layout(local_size_x=_NBL_GLSL_WORKGROUP_SIZE_, local_size_y=1, local_size_z=1) in;
 #endif
 
diff --git a/31_HLSLPathTracer/app_resources/hlsl/render.comp.hlsl b/31_HLSLPathTracer/app_resources/hlsl/render.comp.hlsl
index d0c969b8b..b0d221a20 100644
--- a/31_HLSLPathTracer/app_resources/hlsl/render.comp.hlsl
+++ b/31_HLSLPathTracer/app_resources/hlsl/render.comp.hlsl
@@ -37,7 +37,7 @@
 
 using namespace nbl::hlsl;
 
-NBL_CONSTEXPR uint32_t WorkgroupSize = 1024;
+NBL_CONSTEXPR uint32_t WorkgroupSize = 512;
 NBL_CONSTEXPR uint32_t MAX_DEPTH_LOG2 = 4;
 NBL_CONSTEXPR uint32_t MAX_SAMPLES_LOG2 = 10;
 
diff --git a/31_HLSLPathTracer/main.cpp b/31_HLSLPathTracer/main.cpp
index add980078..8394889db 100644
--- a/31_HLSLPathTracer/main.cpp
+++ b/31_HLSLPathTracer/main.cpp
@@ -48,7 +48,7 @@ class HLSLComputePathtracer final : public examples::SimpleWindowedApplication,
 		constexpr static inline uint32_t2 WindowDimensions = { 1280, 720 };
 		constexpr static inline uint32_t MaxFramesInFlight = 5;
 		constexpr static inline clock_t::duration DisplayImageDuration = std::chrono::milliseconds(900);
-		constexpr static inline uint32_t DefaultWorkGroupSize = 1024u;
+		constexpr static inline uint32_t DefaultWorkGroupSize = 512u;
 		constexpr static inline uint32_t MaxDescriptorCount = 256u;
 		constexpr static inline uint32_t MaxDepthLog2 = 4u; // 5
 		constexpr static inline uint32_t MaxSamplesLog2 = 10u; // 18
@@ -366,11 +366,11 @@ class HLSLComputePathtracer final : public examples::SimpleWindowedApplication,
 					options.stage = IShader::E_SHADER_STAGE::ESS_COMPUTE;	// should be compute
 					options.targetSpirvVersion = m_device->getPhysicalDevice()->getLimits().spirvVersion;
 					options.spirvOptimizer = nullptr;
-//#ifndef _NBL_DEBUG
-//					ISPIRVOptimizer::E_OPTIMIZER_PASS optPasses = ISPIRVOptimizer::EOP_STRIP_DEBUG_INFO;
-//					auto opt = make_smart_refctd_ptr<ISPIRVOptimizer>(std::span<ISPIRVOptimizer::E_OPTIMIZER_PASS>(&optPasses, 1));
-//					options.spirvOptimizer = opt.get();
-//#endif
+#ifndef _NBL_DEBUG
+					ISPIRVOptimizer::E_OPTIMIZER_PASS optPasses = ISPIRVOptimizer::EOP_STRIP_DEBUG_INFO;
+					auto opt = make_smart_refctd_ptr<ISPIRVOptimizer>(std::span<ISPIRVOptimizer::E_OPTIMIZER_PASS>(&optPasses, 1));
+					options.spirvOptimizer = opt.get();
+#endif
 					options.debugInfoFlags |= IShaderCompiler::E_DEBUG_INFO_FLAGS::EDIF_LINE_BIT;
 					options.preprocessorOptions.sourceIdentifier = source->getFilepathHint();
 					options.preprocessorOptions.logger = m_logger.get();
@@ -1348,7 +1348,7 @@ class HLSLComputePathtracer final : public examples::SimpleWindowedApplication,
 		float camYAngle = 165.f / 180.f * 3.14159f;
 		float camXAngle = 32.f / 180.f * 3.14159f;
 		int PTPipline = E_LIGHT_GEOMETRY::ELG_SPHERE;
-		int renderMode = E_RENDER_MODE::ERM_GLSL;
+		int renderMode = E_RENDER_MODE::ERM_HLSL;
 		int spp = 32;
 		int depth = 3;
 

From 3cdfb4baf2df319643620a8189c277dec20cb163 Mon Sep 17 00:00:00 2001
From: keptsecret <sorchon@gmail.com>
Date: Mon, 24 Mar 2025 14:43:11 +0700
Subject: [PATCH 104/529] use morton and virtual indexing

---
 .../app_resources/glsl/common.glsl            | 199 +++++++++---------
 .../app_resources/hlsl/render.comp.hlsl       |  93 ++++----
 31_HLSLPathTracer/main.cpp                    |   3 +-
 3 files changed, 156 insertions(+), 139 deletions(-)

diff --git a/31_HLSLPathTracer/app_resources/glsl/common.glsl b/31_HLSLPathTracer/app_resources/glsl/common.glsl
index 1a1594e6a..c04ad2b11 100644
--- a/31_HLSLPathTracer/app_resources/glsl/common.glsl
+++ b/31_HLSLPathTracer/app_resources/glsl/common.glsl
@@ -9,7 +9,7 @@
 // debug
 //#define NEE_ONLY
 
-layout(set = 2, binding = 0) uniform sampler2D envMap; 
+layout(set = 2, binding = 0) uniform sampler2D envMap;
 layout(set = 2, binding = 1) uniform usamplerBuffer sampleSequence;
 layout(set = 2, binding = 2) uniform usampler2D scramblebuf;
 
@@ -35,6 +35,7 @@ vec2 getTexCoords() {
 #include <nbl/builtin/glsl/limits/numeric.glsl>
 #include <nbl/builtin/glsl/math/constants.glsl>
 #include <nbl/builtin/glsl/utils/common.glsl>
+#include <nbl/builtin/glsl/utils/morton.glsl>
 
 #include <nbl/builtin/glsl/sampling/box_muller_transform.glsl>
 
@@ -51,7 +52,7 @@ struct Sphere
     vec3 position;
     float radius2;
     uint bsdfLightIDs;
-}; 
+};
 
 Sphere Sphere_Sphere(in vec3 position, in float radius, in uint bsdfID, in uint lightID)
 {
@@ -188,7 +189,7 @@ void Rectangle_getNormalBasis(in Rectangle rect, out mat3 basis, out vec2 extent
     basis[0] = rect.edge0/extents[0];
     basis[1] = rect.edge1/extents[1];
     basis[2] = normalize(cross(basis[0],basis[1]));
-}        
+}
 
 // return intersection distance if found, nbl_glsl_FLT_NAN otherwise
 float Rectangle_intersect(in Rectangle rect, in vec3 origin, in vec3 direction)
@@ -222,7 +223,7 @@ vec3 Rectangle_getNormalTimesArea(in Rectangle rect)
 #define OP_BITS_OFFSET 0
 #define OP_BITS_SIZE 2
 struct BSDFNode
-{ 
+{
     uvec4 data[2];
 };
 
@@ -386,13 +387,13 @@ vec2 SampleSphericalMap(vec3 v)
 {
     vec2 uv = vec2(atan(v.z, v.x), asin(v.y));
     uv *= nbl_glsl_RECIPROCAL_PI*0.5;
-    uv += 0.5; 
+    uv += 0.5;
     return uv;
 }
 
 void missProgram(in ImmutableRay_t _immutable, inout Payload_t _payload)
 {
-    vec3 finalContribution = _payload.throughput; 
+    vec3 finalContribution = _payload.throughput;
     // #define USE_ENVMAP
 #ifdef USE_ENVMAP
 	vec2 uv = SampleSphericalMap(_immutable.direction);
@@ -415,7 +416,7 @@ nbl_glsl_LightSample nbl_glsl_bsdf_cos_generate(in nbl_glsl_AnisotropicViewSurfa
 {
     const float a = BSDFNode_getRoughness(bsdf);
     const mat2x3 ior = BSDFNode_getEta(bsdf);
-    
+
     // fresnel stuff for dielectrics
     float orientedEta, rcpOrientedEta;
     const bool viewerInsideMedium = nbl_glsl_getOrientedEtas(orientedEta,rcpOrientedEta,interaction.isotropic.NdotV,monochromeEta);
@@ -519,7 +520,7 @@ int traceRay(inout float intersectionT, in vec3 origin, in vec3 direction)
 
         intersectionT = closerIntersection ? t : intersectionT;
 		objectID = closerIntersection ? i:objectID;
-        
+
         // allowing early out results in a performance regression, WTF!?
         //if (anyHit && closerIntersection)
            //break;
@@ -543,7 +544,7 @@ nbl_glsl_LightSample nbl_glsl_light_generate_and_remainder_and_pdf(out vec3 rema
 {
     // normally we'd pick from set of lights, using `xi.z`
     const Light light = lights[0];
-    
+
     vec3 L = nbl_glsl_light_generate_and_pdf(pdf,newRayMaxT,origin,interaction,isBSDF,xi,Light_getObjectID(light));
 
     newRayMaxT *= getEndTolerance(depth);
@@ -663,7 +664,7 @@ bool closestHitProgram(in uint depth, in uint _sample, inout Ray_t ray, inout nb
             //
             bsdfSampleL = bsdf_sample.L;
         }
-        
+
         // additional threshold
         const float lumaThroughputThreshold = lumaContributionThreshold;
         if (bsdfPdf>bsdfPdfThreshold && getLuma(throughput)>lumaThroughputThreshold)
@@ -671,7 +672,7 @@ bool closestHitProgram(in uint depth, in uint _sample, inout Ray_t ray, inout nb
             ray._payload.throughput = throughput;
             ray._payload.otherTechniqueHeuristic = neeProbability/bsdfPdf; // numerically stable, don't touch
             ray._payload.otherTechniqueHeuristic *= ray._payload.otherTechniqueHeuristic;
-                    
+
             // trace new ray
             ray._immutable.origin = intersection+bsdfSampleL*(1.0/*kSceneSize*/)*getStartTolerance(depth);
             ray._immutable.direction = bsdfSampleL;
@@ -688,109 +689,115 @@ bool closestHitProgram(in uint depth, in uint _sample, inout Ray_t ray, inout nb
 void main()
 {
     const ivec2 imageExtents = imageSize(outImage);
-    const ivec2 coords = getCoordinates();
-    vec2 texCoord = vec2(coords) / vec2(imageExtents);
-    texCoord.y = 1.0 - texCoord.y;
-
-    if (false == (all(lessThanEqual(ivec2(0),coords)) && all(greaterThan(imageExtents,coords)))) {
-        return;
-    }
 
-    if (((PTPushConstant.depth-1)>>MAX_DEPTH_LOG2)>0 || ((PTPushConstant.sampleCount-1)>>MAX_SAMPLES_LOG2)>0)
+    uint virtualThreadIndex;
+    for (uint virtualThreadBase = gl_WorkGroupID.x * _NBL_GLSL_WORKGROUP_SIZE_; virtualThreadBase < 1920*1080; virtualThreadBase += gl_NumWorkGroups.x * _NBL_GLSL_WORKGROUP_SIZE_) // not sure why 1280*720 doesn't cover entire window
     {
-        vec4 pixelCol = vec4(1.0,0.0,0.0,1.0);
-        imageStore(outImage, coords, pixelCol);
-        return;
-    }
-
-	nbl_glsl_xoroshiro64star_state_t scramble_start_state = texelFetch(scramblebuf,coords,0).rg;
-    const vec2 pixOffsetParam = vec2(1.0)/vec2(textureSize(scramblebuf,0));
+        virtualThreadIndex = virtualThreadBase + gl_LocalInvocationIndex.x;
+        const ivec2 coords = ivec2(nbl_glsl_morton_decode2d32b(virtualThreadIndex));    // getCoordinates();
+        vec2 texCoord = vec2(coords) / vec2(imageExtents);
+        texCoord.y = 1.0 - texCoord.y;
 
+        if (false == (all(lessThanEqual(ivec2(0),coords)) && all(greaterThan(imageExtents,coords)))) {
+            continue;
+        }
 
-    const mat4 invMVP = PTPushConstant.invMVP;
-    
-    vec4 NDC = vec4(texCoord*vec2(2.0,-2.0)+vec2(-1.0,1.0),0.0,1.0);
-    vec3 camPos;
-    {
-        vec4 tmp = invMVP*NDC;
-        camPos = tmp.xyz/tmp.w;
-        NDC.z = 1.0;
-    }
+        if (((PTPushConstant.depth-1)>>MAX_DEPTH_LOG2)>0 || ((PTPushConstant.sampleCount-1)>>MAX_SAMPLES_LOG2)>0)
+        {
+            vec4 pixelCol = vec4(1.0,0.0,0.0,1.0);
+            imageStore(outImage, coords, pixelCol);
+            continue;
+        }
 
-    vec3 color = vec3(0.0);
-    float meanLumaSquared = 0.0;
-    // TODO: if we collapse the nested for loop, then all GPUs will get `PTPushConstant.depth` factor speedup, not just NV with separate PC
-    for (int i=0; i<PTPushConstant.sampleCount; i++)
-    {
-        nbl_glsl_xoroshiro64star_state_t scramble_state = scramble_start_state;
+        nbl_glsl_xoroshiro64star_state_t scramble_start_state = texelFetch(scramblebuf,coords,0).rg;
+        const vec2 pixOffsetParam = vec2(1.0)/vec2(textureSize(scramblebuf,0));
 
-        Ray_t ray;
-        // raygen
-        {
-            ray._immutable.origin = camPos;
-
-            vec4 tmp = NDC;
-            // apply stochastic reconstruction filter
-            const float gaussianFilterCutoff = 2.5;
-            const float truncation = exp(-0.5*gaussianFilterCutoff*gaussianFilterCutoff);
-            vec2 remappedRand = rand3d(0u,i,scramble_state)[0].xy;
-            remappedRand.x *= 1.0-truncation;
-            remappedRand.x += truncation;
-            tmp.xy += pixOffsetParam*nbl_glsl_BoxMullerTransform(remappedRand,1.5);
-            // for depth of field we could do another stochastic point-pick
-            tmp = invMVP*tmp;
-            ray._immutable.direction = normalize(tmp.xyz/tmp.w-camPos);
 
-            #if POLYGON_METHOD==2
-                ray._immutable.normalAtOrigin = vec3(0.0,0.0,0.0);
-                ray._immutable.wasBSDFAtOrigin = false;
-            #endif
+        const mat4 invMVP = PTPushConstant.invMVP;
 
-            ray._payload.accumulation = vec3(0.0);
-            ray._payload.otherTechniqueHeuristic = 0.0; // needed for direct eye-light paths
-            ray._payload.throughput = vec3(1.0);
-            #ifdef KILL_DIFFUSE_SPECULAR_PATHS
-            ray._payload.hasDiffuse = false;
-            #endif
+        vec4 NDC = vec4(texCoord*vec2(2.0,-2.0)+vec2(-1.0,1.0),0.0,1.0);
+        vec3 camPos;
+        {
+            vec4 tmp = invMVP*NDC;
+            camPos = tmp.xyz/tmp.w;
+            NDC.z = 1.0;
         }
 
-        // bounces
+        vec3 color = vec3(0.0);
+        float meanLumaSquared = 0.0;
+        // TODO: if we collapse the nested for loop, then all GPUs will get `PTPushConstant.depth` factor speedup, not just NV with separate PC
+        for (int i=0; i<PTPushConstant.sampleCount; i++)
         {
-            bool hit = true; bool rayAlive = true;
-            for (int d=1; d<=PTPushConstant.depth && hit && rayAlive; d+=2)
+            nbl_glsl_xoroshiro64star_state_t scramble_state = scramble_start_state;
+
+            Ray_t ray;
+            // raygen
             {
-                ray._mutable.intersectionT = nbl_glsl_FLT_MAX;
-                ray._mutable.objectID = traceRay(ray._mutable.intersectionT,ray._immutable.origin,ray._immutable.direction);
-                hit = ray._mutable.objectID!=-1;
-                if (hit)
-                    rayAlive = closestHitProgram(d, i, ray, scramble_state);
+                ray._immutable.origin = camPos;
+
+                vec4 tmp = NDC;
+                // apply stochastic reconstruction filter
+                const float gaussianFilterCutoff = 2.5;
+                const float truncation = exp(-0.5*gaussianFilterCutoff*gaussianFilterCutoff);
+                vec2 remappedRand = rand3d(0u,i,scramble_state)[0].xy;
+                remappedRand.x *= 1.0-truncation;
+                remappedRand.x += truncation;
+                tmp.xy += pixOffsetParam*nbl_glsl_BoxMullerTransform(remappedRand,1.5);
+                // for depth of field we could do another stochastic point-pick
+                tmp = invMVP*tmp;
+                ray._immutable.direction = normalize(tmp.xyz/tmp.w-camPos);
+
+                #if POLYGON_METHOD==2
+                    ray._immutable.normalAtOrigin = vec3(0.0,0.0,0.0);
+                    ray._immutable.wasBSDFAtOrigin = false;
+                #endif
+
+                ray._payload.accumulation = vec3(0.0);
+                ray._payload.otherTechniqueHeuristic = 0.0; // needed for direct eye-light paths
+                ray._payload.throughput = vec3(1.0);
+                #ifdef KILL_DIFFUSE_SPECULAR_PATHS
+                ray._payload.hasDiffuse = false;
+                #endif
+            }
+
+            // bounces
+            {
+                bool hit = true; bool rayAlive = true;
+                for (int d=1; d<=PTPushConstant.depth && hit && rayAlive; d+=2)
+                {
+                    ray._mutable.intersectionT = nbl_glsl_FLT_MAX;
+                    ray._mutable.objectID = traceRay(ray._mutable.intersectionT,ray._immutable.origin,ray._immutable.direction);
+                    hit = ray._mutable.objectID!=-1;
+                    if (hit)
+                        rayAlive = closestHitProgram(d, i, ray, scramble_state);
+                }
+                // was last trace a miss?
+                if (!hit)
+                    missProgram(ray._immutable,ray._payload);
             }
-            // was last trace a miss?
-            if (!hit)
-                missProgram(ray._immutable,ray._payload);
-        }
 
-        vec3 accumulation = ray._payload.accumulation;
+            vec3 accumulation = ray._payload.accumulation;
+
+            float rcpSampleSize = 1.0/float(i+1);
+            color += (accumulation-color)*rcpSampleSize;
+
+            #ifdef VISUALIZE_HIGH_VARIANCE
+                float luma = getLuma(accumulation);
+                meanLumaSquared += (luma*luma-meanLumaSquared)*rcpSampleSize;
+            #endif
+        }
 
-        float rcpSampleSize = 1.0/float(i+1);
-        color += (accumulation-color)*rcpSampleSize;
-        
         #ifdef VISUALIZE_HIGH_VARIANCE
-            float luma = getLuma(accumulation);
-            meanLumaSquared += (luma*luma-meanLumaSquared)*rcpSampleSize;
+            float variance = getLuma(color);
+            variance *= variance;
+            variance = meanLumaSquared-variance;
+            if (variance>5.0)
+                color = vec3(1.0,0.0,0.0);
         #endif
-    }
 
-    #ifdef VISUALIZE_HIGH_VARIANCE
-        float variance = getLuma(color);
-        variance *= variance;
-        variance = meanLumaSquared-variance;
-        if (variance>5.0)
-            color = vec3(1.0,0.0,0.0);
-    #endif
-
-    vec4 pixelCol = vec4(color, 1.0);
-    imageStore(outImage, coords, pixelCol);
+        vec4 pixelCol = vec4(color, 1.0);
+        imageStore(outImage, coords, pixelCol);
+    }
 }
 /** TODO: Improving Rendering
 
diff --git a/31_HLSLPathTracer/app_resources/hlsl/render.comp.hlsl b/31_HLSLPathTracer/app_resources/hlsl/render.comp.hlsl
index b0d221a20..ed7e4a85e 100644
--- a/31_HLSLPathTracer/app_resources/hlsl/render.comp.hlsl
+++ b/31_HLSLPathTracer/app_resources/hlsl/render.comp.hlsl
@@ -2,6 +2,7 @@
 #include "nbl/builtin/hlsl/glsl_compat/core.hlsl"
 #include "nbl/builtin/hlsl/random/pcg.hlsl"
 #include "nbl/builtin/hlsl/random/xoroshiro.hlsl"
+#include "nbl/builtin/hlsl/math/morton.hlsl"
 
 #include "nbl/builtin/hlsl/bxdf/reflection.hlsl"
 #include "nbl/builtin/hlsl/bxdf/transmission.hlsl"
@@ -35,7 +36,8 @@
 #include "render_common.hlsl"
 #include "pathtracer.hlsl"
 
-using namespace nbl::hlsl;
+using namespace nbl;
+using namespace hlsl;
 
 NBL_CONSTEXPR uint32_t WorkgroupSize = 512;
 NBL_CONSTEXPR uint32_t MAX_DEPTH_LOG2 = 4;
@@ -155,48 +157,55 @@ void main(uint32_t3 threadID : SV_DispatchThreadID)
 {
     uint32_t width, height;
     outImage.GetDimensions(width, height);
-    const int32_t2 coords = getCoordinates();
-    float32_t2 texCoord = float32_t2(coords) / float32_t2(width, height);
-    texCoord.y = 1.0 - texCoord.y;
 
-    if (false == (all((int32_t2)0 < coords)) && all(int32_t2(width, height) < coords)) {
-        return;
-    }
-
-    if (((pc.depth - 1) >> MAX_DEPTH_LOG2) > 0 || ((pc.sampleCount - 1) >> MAX_SAMPLES_LOG2) > 0)
-    {
-        float32_t4 pixelCol = float32_t4(1.0,0.0,0.0,1.0);
-        outImage[coords] = pixelCol;
-        return;
-    }
-
-    int flatIdx = glsl::gl_GlobalInvocationID().y * glsl::gl_NumWorkGroups().x * WorkgroupSize + glsl::gl_GlobalInvocationID().x;
-
-    // set up path tracer
-    ext::PathTracer::PathTracerCreationParams<create_params_t, float> ptCreateParams;
-    ptCreateParams.rngState = scramblebuf[coords].rg;
-
-    uint2 scrambleDim;
-    scramblebuf.GetDimensions(scrambleDim.x, scrambleDim.y);
-    ptCreateParams.pixOffsetParam = (float2)1.0 / float2(scrambleDim);
-
-    float4 NDC = float4(texCoord * float2(2.0, -2.0) + float2(-1.0, 1.0), 0.0, 1.0);
+    uint32_t virtualThreadIndex;
+    [loop]
+    for (uint32_t virtualThreadBase = glsl::gl_WorkGroupID().x * WorkgroupSize; virtualThreadBase < 1920*1080; virtualThreadBase += glsl::gl_NumWorkGroups().x * WorkgroupSize) // not sure why 1280*720 doesn't cover entire window
     {
-        float4 tmp = mul(pc.invMVP, NDC);
-        ptCreateParams.camPos = tmp.xyz / tmp.w;
-        NDC.z = 1.0;
+        virtualThreadIndex = virtualThreadBase + glsl::gl_LocalInvocationIndex().x;
+        const int32_t2 coords = (int32_t2)math::Morton<uint32_t>::decode2d(virtualThreadIndex);   // getCoordinates();
+        float32_t2 texCoord = float32_t2(coords) / float32_t2(width, height);
+        texCoord.y = 1.0 - texCoord.y;
+
+        if (false == (hlsl::all((int32_t2)0 < coords)) && hlsl::all(int32_t2(width, height) < coords)) {
+            continue;
+        }
+
+        if (((pc.depth - 1) >> MAX_DEPTH_LOG2) > 0 || ((pc.sampleCount - 1) >> MAX_SAMPLES_LOG2) > 0)
+        {
+            float32_t4 pixelCol = float32_t4(1.0,0.0,0.0,1.0);
+            outImage[coords] = pixelCol;
+            continue;
+        }
+
+        int flatIdx = glsl::gl_GlobalInvocationID().y * glsl::gl_NumWorkGroups().x * WorkgroupSize + glsl::gl_GlobalInvocationID().x;
+
+        // set up path tracer
+        ext::PathTracer::PathTracerCreationParams<create_params_t, float> ptCreateParams;
+        ptCreateParams.rngState = scramblebuf[coords].rg;
+
+        uint2 scrambleDim;
+        scramblebuf.GetDimensions(scrambleDim.x, scrambleDim.y);
+        ptCreateParams.pixOffsetParam = (float2)1.0 / float2(scrambleDim);
+
+        float4 NDC = float4(texCoord * float2(2.0, -2.0) + float2(-1.0, 1.0), 0.0, 1.0);
+        {
+            float4 tmp = hlsl::mul(pc.invMVP, NDC);
+            ptCreateParams.camPos = tmp.xyz / tmp.w;
+            NDC.z = 1.0;
+        }
+
+        ptCreateParams.NDC = NDC;
+        ptCreateParams.invMVP = pc.invMVP;
+
+        ptCreateParams.diffuseParams = bxdfs[0].params;
+        ptCreateParams.conductorParams = bxdfs[3].params;
+        ptCreateParams.dielectricParams = bxdfs[6].params;
+
+        pathtracer_type pathtracer = pathtracer_type::create(ptCreateParams);
+
+        float32_t3 color = pathtracer.getMeasure(pc.sampleCount, pc.depth, scene);
+        float32_t4 pixCol = float32_t4(color, 1.0);
+        outImage[coords] = pixCol;
     }
-
-    ptCreateParams.NDC = NDC;
-    ptCreateParams.invMVP = pc.invMVP;
-
-    ptCreateParams.diffuseParams = bxdfs[0].params;
-    ptCreateParams.conductorParams = bxdfs[3].params;
-    ptCreateParams.dielectricParams = bxdfs[6].params;
-
-    pathtracer_type pathtracer = pathtracer_type::create(ptCreateParams);
-
-    float32_t3 color = pathtracer.getMeasure(pc.sampleCount, pc.depth, scene);
-    float32_t4 pixCol = float32_t4(color, 1.0);
-    outImage[coords] = pixCol;
 }
diff --git a/31_HLSLPathTracer/main.cpp b/31_HLSLPathTracer/main.cpp
index 8394889db..db1e198c5 100644
--- a/31_HLSLPathTracer/main.cpp
+++ b/31_HLSLPathTracer/main.cpp
@@ -1068,7 +1068,8 @@ class HLSLComputePathtracer final : public examples::SimpleWindowedApplication,
 					cmdbuf->bindDescriptorSets(EPBP_COMPUTE, pipeline->getLayout(), 0u, 1u, &m_descriptorSet0.get());
 					cmdbuf->bindDescriptorSets(EPBP_COMPUTE, pipeline->getLayout(), 2u, 1u, &m_descriptorSet2.get());
 					cmdbuf->pushConstants(pipeline->getLayout(), IShader::E_SHADER_STAGE::ESS_COMPUTE, 0, sizeof(PTPushConstant), &pc);
-					cmdbuf->dispatch(1 + (WindowDimensions.x * WindowDimensions.y - 1) / DefaultWorkGroupSize, 1u, 1u);
+					uint32_t dispatchSize = m_physicalDevice->getLimits().computeOptimalPersistentWorkgroupDispatchSize(WindowDimensions.x * WindowDimensions.y, DefaultWorkGroupSize);
+					cmdbuf->dispatch(dispatchSize, 1u, 1u);
 				}
 
 				// TRANSITION m_outImgView to READ (because of descriptorSets0 -> ComputeShader Writes into the image)

From 5f93cec878eafcd03a0af1b3d1e4a136deb9bade Mon Sep 17 00:00:00 2001
From: keptsecret <sorchon@gmail.com>
Date: Mon, 24 Mar 2025 15:32:09 +0700
Subject: [PATCH 105/529] reverted virtual index, fix hlsl colors

---
 .../app_resources/glsl/common.glsl            | 173 +++++++++---------
 .../app_resources/hlsl/render.comp.hlsl       |  96 +++++-----
 31_HLSLPathTracer/main.cpp                    |   3 +-
 3 files changed, 128 insertions(+), 144 deletions(-)

diff --git a/31_HLSLPathTracer/app_resources/glsl/common.glsl b/31_HLSLPathTracer/app_resources/glsl/common.glsl
index c04ad2b11..6c2b5f42f 100644
--- a/31_HLSLPathTracer/app_resources/glsl/common.glsl
+++ b/31_HLSLPathTracer/app_resources/glsl/common.glsl
@@ -35,7 +35,6 @@ vec2 getTexCoords() {
 #include <nbl/builtin/glsl/limits/numeric.glsl>
 #include <nbl/builtin/glsl/math/constants.glsl>
 #include <nbl/builtin/glsl/utils/common.glsl>
-#include <nbl/builtin/glsl/utils/morton.glsl>
 
 #include <nbl/builtin/glsl/sampling/box_muller_transform.glsl>
 
@@ -689,115 +688,109 @@ bool closestHitProgram(in uint depth, in uint _sample, inout Ray_t ray, inout nb
 void main()
 {
     const ivec2 imageExtents = imageSize(outImage);
+    const ivec2 coords = getCoordinates();
+    vec2 texCoord = vec2(coords) / vec2(imageExtents);
+    texCoord.y = 1.0 - texCoord.y;
 
-    uint virtualThreadIndex;
-    for (uint virtualThreadBase = gl_WorkGroupID.x * _NBL_GLSL_WORKGROUP_SIZE_; virtualThreadBase < 1920*1080; virtualThreadBase += gl_NumWorkGroups.x * _NBL_GLSL_WORKGROUP_SIZE_) // not sure why 1280*720 doesn't cover entire window
+    if (false == (all(lessThanEqual(ivec2(0),coords)) && all(greaterThan(imageExtents,coords)))) {
+        return;
+    }
+
+    if (((PTPushConstant.depth-1)>>MAX_DEPTH_LOG2)>0 || ((PTPushConstant.sampleCount-1)>>MAX_SAMPLES_LOG2)>0)
     {
-        virtualThreadIndex = virtualThreadBase + gl_LocalInvocationIndex.x;
-        const ivec2 coords = ivec2(nbl_glsl_morton_decode2d32b(virtualThreadIndex));    // getCoordinates();
-        vec2 texCoord = vec2(coords) / vec2(imageExtents);
-        texCoord.y = 1.0 - texCoord.y;
+        vec4 pixelCol = vec4(1.0,0.0,0.0,1.0);
+        imageStore(outImage, coords, pixelCol);
+        return;
+    }
 
-        if (false == (all(lessThanEqual(ivec2(0),coords)) && all(greaterThan(imageExtents,coords)))) {
-            continue;
-        }
+    nbl_glsl_xoroshiro64star_state_t scramble_start_state = texelFetch(scramblebuf,coords,0).rg;
+    const vec2 pixOffsetParam = vec2(1.0)/vec2(textureSize(scramblebuf,0));
 
-        if (((PTPushConstant.depth-1)>>MAX_DEPTH_LOG2)>0 || ((PTPushConstant.sampleCount-1)>>MAX_SAMPLES_LOG2)>0)
-        {
-            vec4 pixelCol = vec4(1.0,0.0,0.0,1.0);
-            imageStore(outImage, coords, pixelCol);
-            continue;
-        }
 
-        nbl_glsl_xoroshiro64star_state_t scramble_start_state = texelFetch(scramblebuf,coords,0).rg;
-        const vec2 pixOffsetParam = vec2(1.0)/vec2(textureSize(scramblebuf,0));
+    const mat4 invMVP = PTPushConstant.invMVP;
 
+    vec4 NDC = vec4(texCoord*vec2(2.0,-2.0)+vec2(-1.0,1.0),0.0,1.0);
+    vec3 camPos;
+    {
+        vec4 tmp = invMVP*NDC;
+        camPos = tmp.xyz/tmp.w;
+        NDC.z = 1.0;
+    }
 
-        const mat4 invMVP = PTPushConstant.invMVP;
+    vec3 color = vec3(0.0);
+    float meanLumaSquared = 0.0;
+    // TODO: if we collapse the nested for loop, then all GPUs will get `PTPushConstant.depth` factor speedup, not just NV with separate PC
+    for (int i=0; i<PTPushConstant.sampleCount; i++)
+    {
+        nbl_glsl_xoroshiro64star_state_t scramble_state = scramble_start_state;
 
-        vec4 NDC = vec4(texCoord*vec2(2.0,-2.0)+vec2(-1.0,1.0),0.0,1.0);
-        vec3 camPos;
+        Ray_t ray;
+        // raygen
         {
-            vec4 tmp = invMVP*NDC;
-            camPos = tmp.xyz/tmp.w;
-            NDC.z = 1.0;
-        }
+            ray._immutable.origin = camPos;
+
+            vec4 tmp = NDC;
+            // apply stochastic reconstruction filter
+            const float gaussianFilterCutoff = 2.5;
+            const float truncation = exp(-0.5*gaussianFilterCutoff*gaussianFilterCutoff);
+            vec2 remappedRand = rand3d(0u,i,scramble_state)[0].xy;
+            remappedRand.x *= 1.0-truncation;
+            remappedRand.x += truncation;
+            tmp.xy += pixOffsetParam*nbl_glsl_BoxMullerTransform(remappedRand,1.5);
+            // for depth of field we could do another stochastic point-pick
+            tmp = invMVP*tmp;
+            ray._immutable.direction = normalize(tmp.xyz/tmp.w-camPos);
 
-        vec3 color = vec3(0.0);
-        float meanLumaSquared = 0.0;
-        // TODO: if we collapse the nested for loop, then all GPUs will get `PTPushConstant.depth` factor speedup, not just NV with separate PC
-        for (int i=0; i<PTPushConstant.sampleCount; i++)
-        {
-            nbl_glsl_xoroshiro64star_state_t scramble_state = scramble_start_state;
+            #if POLYGON_METHOD==2
+                ray._immutable.normalAtOrigin = vec3(0.0,0.0,0.0);
+                ray._immutable.wasBSDFAtOrigin = false;
+            #endif
 
-            Ray_t ray;
-            // raygen
-            {
-                ray._immutable.origin = camPos;
-
-                vec4 tmp = NDC;
-                // apply stochastic reconstruction filter
-                const float gaussianFilterCutoff = 2.5;
-                const float truncation = exp(-0.5*gaussianFilterCutoff*gaussianFilterCutoff);
-                vec2 remappedRand = rand3d(0u,i,scramble_state)[0].xy;
-                remappedRand.x *= 1.0-truncation;
-                remappedRand.x += truncation;
-                tmp.xy += pixOffsetParam*nbl_glsl_BoxMullerTransform(remappedRand,1.5);
-                // for depth of field we could do another stochastic point-pick
-                tmp = invMVP*tmp;
-                ray._immutable.direction = normalize(tmp.xyz/tmp.w-camPos);
-
-                #if POLYGON_METHOD==2
-                    ray._immutable.normalAtOrigin = vec3(0.0,0.0,0.0);
-                    ray._immutable.wasBSDFAtOrigin = false;
-                #endif
-
-                ray._payload.accumulation = vec3(0.0);
-                ray._payload.otherTechniqueHeuristic = 0.0; // needed for direct eye-light paths
-                ray._payload.throughput = vec3(1.0);
-                #ifdef KILL_DIFFUSE_SPECULAR_PATHS
-                ray._payload.hasDiffuse = false;
-                #endif
-            }
+            ray._payload.accumulation = vec3(0.0);
+            ray._payload.otherTechniqueHeuristic = 0.0; // needed for direct eye-light paths
+            ray._payload.throughput = vec3(1.0);
+            #ifdef KILL_DIFFUSE_SPECULAR_PATHS
+            ray._payload.hasDiffuse = false;
+            #endif
+        }
 
-            // bounces
+        // bounces
+        {
+            bool hit = true; bool rayAlive = true;
+            for (int d=1; d<=PTPushConstant.depth && hit && rayAlive; d+=2)
             {
-                bool hit = true; bool rayAlive = true;
-                for (int d=1; d<=PTPushConstant.depth && hit && rayAlive; d+=2)
-                {
-                    ray._mutable.intersectionT = nbl_glsl_FLT_MAX;
-                    ray._mutable.objectID = traceRay(ray._mutable.intersectionT,ray._immutable.origin,ray._immutable.direction);
-                    hit = ray._mutable.objectID!=-1;
-                    if (hit)
-                        rayAlive = closestHitProgram(d, i, ray, scramble_state);
-                }
-                // was last trace a miss?
-                if (!hit)
-                    missProgram(ray._immutable,ray._payload);
+                ray._mutable.intersectionT = nbl_glsl_FLT_MAX;
+                ray._mutable.objectID = traceRay(ray._mutable.intersectionT,ray._immutable.origin,ray._immutable.direction);
+                hit = ray._mutable.objectID!=-1;
+                if (hit)
+                    rayAlive = closestHitProgram(d, i, ray, scramble_state);
             }
+            // was last trace a miss?
+            if (!hit)
+                missProgram(ray._immutable,ray._payload);
+        }
 
-            vec3 accumulation = ray._payload.accumulation;
-
-            float rcpSampleSize = 1.0/float(i+1);
-            color += (accumulation-color)*rcpSampleSize;
+        vec3 accumulation = ray._payload.accumulation;
 
-            #ifdef VISUALIZE_HIGH_VARIANCE
-                float luma = getLuma(accumulation);
-                meanLumaSquared += (luma*luma-meanLumaSquared)*rcpSampleSize;
-            #endif
-        }
+        float rcpSampleSize = 1.0/float(i+1);
+        color += (accumulation-color)*rcpSampleSize;
 
         #ifdef VISUALIZE_HIGH_VARIANCE
-            float variance = getLuma(color);
-            variance *= variance;
-            variance = meanLumaSquared-variance;
-            if (variance>5.0)
-                color = vec3(1.0,0.0,0.0);
+            float luma = getLuma(accumulation);
+            meanLumaSquared += (luma*luma-meanLumaSquared)*rcpSampleSize;
         #endif
-
-        vec4 pixelCol = vec4(color, 1.0);
-        imageStore(outImage, coords, pixelCol);
     }
+
+    #ifdef VISUALIZE_HIGH_VARIANCE
+        float variance = getLuma(color);
+        variance *= variance;
+        variance = meanLumaSquared-variance;
+        if (variance>5.0)
+            color = vec3(1.0,0.0,0.0);
+    #endif
+
+    vec4 pixelCol = vec4(color, 1.0);
+    imageStore(outImage, coords, pixelCol);
 }
 /** TODO: Improving Rendering
 
diff --git a/31_HLSLPathTracer/app_resources/hlsl/render.comp.hlsl b/31_HLSLPathTracer/app_resources/hlsl/render.comp.hlsl
index ed7e4a85e..b187a1b33 100644
--- a/31_HLSLPathTracer/app_resources/hlsl/render.comp.hlsl
+++ b/31_HLSLPathTracer/app_resources/hlsl/render.comp.hlsl
@@ -2,7 +2,6 @@
 #include "nbl/builtin/hlsl/glsl_compat/core.hlsl"
 #include "nbl/builtin/hlsl/random/pcg.hlsl"
 #include "nbl/builtin/hlsl/random/xoroshiro.hlsl"
-#include "nbl/builtin/hlsl/math/morton.hlsl"
 
 #include "nbl/builtin/hlsl/bxdf/reflection.hlsl"
 #include "nbl/builtin/hlsl/bxdf/transmission.hlsl"
@@ -140,9 +139,9 @@ static const bxdfnode_type bxdfs[BXDF_COUNT] = {
     bxdfnode_type::create(ext::MaterialSystem::MaterialType::DIFFUSE, false, float2(0,0), spectral_t(0.8,0.8,0.8)),
     bxdfnode_type::create(ext::MaterialSystem::MaterialType::DIFFUSE, false, float2(0,0), spectral_t(0.8,0.4,0.4)),
     bxdfnode_type::create(ext::MaterialSystem::MaterialType::DIFFUSE, false, float2(0,0), spectral_t(0.4,0.8,0.4)),
-    bxdfnode_type::create(ext::MaterialSystem::MaterialType::CONDUCTOR, false, float2(0,0), spectral_t(1,1,1), spectral_t(0.98,0.98,0.77)),
-    bxdfnode_type::create(ext::MaterialSystem::MaterialType::CONDUCTOR, false, float2(0,0), spectral_t(1,1,1), spectral_t(0.98,0.77,0.98)),
-    bxdfnode_type::create(ext::MaterialSystem::MaterialType::CONDUCTOR, false, float2(0.15,0.15), spectral_t(1,1,1), spectral_t(0.98,0.77,0.98)),
+    bxdfnode_type::create(ext::MaterialSystem::MaterialType::CONDUCTOR, false, float2(0,0), spectral_t(1.02,1.02,1.3), spectral_t(1.0,1.0,2.0)),
+    bxdfnode_type::create(ext::MaterialSystem::MaterialType::CONDUCTOR, false, float2(0,0), spectral_t(1.02,1.3,1.02), spectral_t(1.0,2.0,1.0)),
+    bxdfnode_type::create(ext::MaterialSystem::MaterialType::CONDUCTOR, false, float2(0.15,0.15), spectral_t(1.02,1.3,1.02), spectral_t(1.0,2.0,1.0)),
     bxdfnode_type::create(ext::MaterialSystem::MaterialType::DIELECTRIC, false, float2(0.0625,0.0625), spectral_t(1,1,1), spectral_t(0.71,0.69,0.67))
 };
 
@@ -157,55 +156,48 @@ void main(uint32_t3 threadID : SV_DispatchThreadID)
 {
     uint32_t width, height;
     outImage.GetDimensions(width, height);
+    const int32_t2 coords = getCoordinates();
+    float32_t2 texCoord = float32_t2(coords) / float32_t2(width, height);
+    texCoord.y = 1.0 - texCoord.y;
 
-    uint32_t virtualThreadIndex;
-    [loop]
-    for (uint32_t virtualThreadBase = glsl::gl_WorkGroupID().x * WorkgroupSize; virtualThreadBase < 1920*1080; virtualThreadBase += glsl::gl_NumWorkGroups().x * WorkgroupSize) // not sure why 1280*720 doesn't cover entire window
+    if (false == (all((int32_t2)0 < coords)) && all(int32_t2(width, height) < coords)) {
+        return;
+    }
+
+    if (((pc.depth - 1) >> MAX_DEPTH_LOG2) > 0 || ((pc.sampleCount - 1) >> MAX_SAMPLES_LOG2) > 0)
+    {
+        float32_t4 pixelCol = float32_t4(1.0,0.0,0.0,1.0);
+        outImage[coords] = pixelCol;
+        return;
+    }
+
+    int flatIdx = glsl::gl_GlobalInvocationID().y * glsl::gl_NumWorkGroups().x * WorkgroupSize + glsl::gl_GlobalInvocationID().x;
+
+    // set up path tracer
+    ext::PathTracer::PathTracerCreationParams<create_params_t, float> ptCreateParams;
+    ptCreateParams.rngState = scramblebuf[coords].rg;
+
+    uint2 scrambleDim;
+    scramblebuf.GetDimensions(scrambleDim.x, scrambleDim.y);
+    ptCreateParams.pixOffsetParam = (float2)1.0 / float2(scrambleDim);
+
+    float4 NDC = float4(texCoord * float2(2.0, -2.0) + float2(-1.0, 1.0), 0.0, 1.0);
     {
-        virtualThreadIndex = virtualThreadBase + glsl::gl_LocalInvocationIndex().x;
-        const int32_t2 coords = (int32_t2)math::Morton<uint32_t>::decode2d(virtualThreadIndex);   // getCoordinates();
-        float32_t2 texCoord = float32_t2(coords) / float32_t2(width, height);
-        texCoord.y = 1.0 - texCoord.y;
-
-        if (false == (hlsl::all((int32_t2)0 < coords)) && hlsl::all(int32_t2(width, height) < coords)) {
-            continue;
-        }
-
-        if (((pc.depth - 1) >> MAX_DEPTH_LOG2) > 0 || ((pc.sampleCount - 1) >> MAX_SAMPLES_LOG2) > 0)
-        {
-            float32_t4 pixelCol = float32_t4(1.0,0.0,0.0,1.0);
-            outImage[coords] = pixelCol;
-            continue;
-        }
-
-        int flatIdx = glsl::gl_GlobalInvocationID().y * glsl::gl_NumWorkGroups().x * WorkgroupSize + glsl::gl_GlobalInvocationID().x;
-
-        // set up path tracer
-        ext::PathTracer::PathTracerCreationParams<create_params_t, float> ptCreateParams;
-        ptCreateParams.rngState = scramblebuf[coords].rg;
-
-        uint2 scrambleDim;
-        scramblebuf.GetDimensions(scrambleDim.x, scrambleDim.y);
-        ptCreateParams.pixOffsetParam = (float2)1.0 / float2(scrambleDim);
-
-        float4 NDC = float4(texCoord * float2(2.0, -2.0) + float2(-1.0, 1.0), 0.0, 1.0);
-        {
-            float4 tmp = hlsl::mul(pc.invMVP, NDC);
-            ptCreateParams.camPos = tmp.xyz / tmp.w;
-            NDC.z = 1.0;
-        }
-
-        ptCreateParams.NDC = NDC;
-        ptCreateParams.invMVP = pc.invMVP;
-
-        ptCreateParams.diffuseParams = bxdfs[0].params;
-        ptCreateParams.conductorParams = bxdfs[3].params;
-        ptCreateParams.dielectricParams = bxdfs[6].params;
-
-        pathtracer_type pathtracer = pathtracer_type::create(ptCreateParams);
-
-        float32_t3 color = pathtracer.getMeasure(pc.sampleCount, pc.depth, scene);
-        float32_t4 pixCol = float32_t4(color, 1.0);
-        outImage[coords] = pixCol;
+        float4 tmp = mul(pc.invMVP, NDC);
+        ptCreateParams.camPos = tmp.xyz / tmp.w;
+        NDC.z = 1.0;
     }
+
+    ptCreateParams.NDC = NDC;
+    ptCreateParams.invMVP = pc.invMVP;
+
+    ptCreateParams.diffuseParams = bxdfs[0].params;
+    ptCreateParams.conductorParams = bxdfs[3].params;
+    ptCreateParams.dielectricParams = bxdfs[6].params;
+
+    pathtracer_type pathtracer = pathtracer_type::create(ptCreateParams);
+
+    float32_t3 color = pathtracer.getMeasure(pc.sampleCount, pc.depth, scene);
+    float32_t4 pixCol = float32_t4(color, 1.0);
+    outImage[coords] = pixCol;
 }
diff --git a/31_HLSLPathTracer/main.cpp b/31_HLSLPathTracer/main.cpp
index db1e198c5..8394889db 100644
--- a/31_HLSLPathTracer/main.cpp
+++ b/31_HLSLPathTracer/main.cpp
@@ -1068,8 +1068,7 @@ class HLSLComputePathtracer final : public examples::SimpleWindowedApplication,
 					cmdbuf->bindDescriptorSets(EPBP_COMPUTE, pipeline->getLayout(), 0u, 1u, &m_descriptorSet0.get());
 					cmdbuf->bindDescriptorSets(EPBP_COMPUTE, pipeline->getLayout(), 2u, 1u, &m_descriptorSet2.get());
 					cmdbuf->pushConstants(pipeline->getLayout(), IShader::E_SHADER_STAGE::ESS_COMPUTE, 0, sizeof(PTPushConstant), &pc);
-					uint32_t dispatchSize = m_physicalDevice->getLimits().computeOptimalPersistentWorkgroupDispatchSize(WindowDimensions.x * WindowDimensions.y, DefaultWorkGroupSize);
-					cmdbuf->dispatch(dispatchSize, 1u, 1u);
+					cmdbuf->dispatch(1 + (WindowDimensions.x * WindowDimensions.y - 1) / DefaultWorkGroupSize, 1u, 1u);
 				}
 
 				// TRANSITION m_outImgView to READ (because of descriptorSets0 -> ComputeShader Writes into the image)

From 78de4f546a100b78ce6998f4cd49099b604176fa Mon Sep 17 00:00:00 2001
From: keptsecret <sorchon@gmail.com>
Date: Tue, 25 Mar 2025 15:45:07 +0700
Subject: [PATCH 106/529] fixed some bugs for cpp compat

---
 31_HLSLPathTracer/app_resources/hlsl/common.hlsl | 8 +++++---
 1 file changed, 5 insertions(+), 3 deletions(-)

diff --git a/31_HLSLPathTracer/app_resources/hlsl/common.hlsl b/31_HLSLPathTracer/app_resources/hlsl/common.hlsl
index 2e2561345..31bcca26a 100644
--- a/31_HLSLPathTracer/app_resources/hlsl/common.hlsl
+++ b/31_HLSLPathTracer/app_resources/hlsl/common.hlsl
@@ -10,6 +10,8 @@
 #include <nbl/builtin/hlsl/sampling/spherical_triangle.hlsl>
 #include <nbl/builtin/hlsl/sampling/projected_spherical_triangle.hlsl>
 #include <nbl/builtin/hlsl/sampling/spherical_rectangle.hlsl>
+#include <nbl/builtin/hlsl/bxdf/common.hlsl>
+#include <nbl/builtin/hlsl/spirv_intrinsics/glsl.std.450.hlsl>
 
 namespace nbl
 {
@@ -121,7 +123,7 @@ struct BxDFNode
         retval.albedo = albedo;
         retval.materialType = materialType;
         retval.params.is_aniso = isAniso;
-        retval.params.A = hlsl::max<float32_t2>(A, 1e-4);
+        retval.params.A = hlsl::max<float32_t2>(A, (float32_t2)1e-4);
         retval.params.ior0 = (spectral_type)1.0;
         retval.params.ior1 = (spectral_type)1.0;
         return retval;
@@ -134,7 +136,7 @@ struct BxDFNode
         retval.albedo = (spectral_type)1.0;
         retval.materialType = materialType;
         retval.params.is_aniso = isAniso;
-        retval.params.A = hlsl::max<float32_t2>(A, 1e-4);
+        retval.params.A = hlsl::max<float32_t2>(A, (float32_t2)1e-4);
         retval.params.ior0 = ior0;
         retval.params.ior1 = ior1;
         return retval;
@@ -218,7 +220,7 @@ struct Shape<PST_SPHERE>
 
     float32_t3 getNormal(NBL_CONST_REF_ARG(float32_t3) hitPosition)
     {
-        const float radiusRcp = spirv::inverseSqrt<float32_t>(radius2);
+        const float radiusRcp = hlsl::rsqrt<float32_t>(radius2);
         return (hitPosition - position) * radiusRcp;
     }
 

From 8900f9cf5b2f4c8d424aabbaf5237a371957e21f Mon Sep 17 00:00:00 2001
From: Przemek <minikers21@gmail.com>
Date: Wed, 26 Mar 2025 11:18:25 +0100
Subject: [PATCH 107/529] Implemented height shading

---
 62_CAD/CTriangleMesh.h                        |  52 ++-
 62_CAD/DrawResourcesFiller.cpp                |  25 +-
 62_CAD/main.cpp                               |  69 +++-
 62_CAD/shaders/globals.hlsl                   |  33 +-
 62_CAD/shaders/main_pipeline/common.hlsl      |  12 +-
 .../main_pipeline/fragment_shader.hlsl        | 299 ++++++++++++++----
 .../shaders/main_pipeline/vertex_shader.hlsl  |  11 +-
 7 files changed, 422 insertions(+), 79 deletions(-)

diff --git a/62_CAD/CTriangleMesh.h b/62_CAD/CTriangleMesh.h
index d71198005..a6a86472e 100644
--- a/62_CAD/CTriangleMesh.h
+++ b/62_CAD/CTriangleMesh.h
@@ -8,11 +8,61 @@ using namespace nbl;
 
 struct DTMSettingsInfo
 {
+	enum E_HEIGHT_SHADING_MODE
+	{
+		DISCRETE_VARIABLE_LENGTH_INTERVALS,
+		DISCRETE_FIXED_LENGTH_INTERVALS,
+		CONTINOUS_INTERVALS
+	};
+
 	LineStyleInfo outlineLineStyleInfo;
 	LineStyleInfo contourLineStyleInfo;
-	// TODO: heights
+	
+	float contourLinesStartHeight;
+	float contourLinesEndHeight;
+	float contourLinesHeightInterval;
+
+	float minShadingHeight;
+	float maxShadingHeight;
+	float intervalWidth;
+	E_HEIGHT_SHADING_MODE heightShadingMode;
+
+	void addHeightColorMapEntry(uint32_t height, float32_t3 color)
+	{
+		heightColorSet.emplace(height, color);
+	}
+
+	bool fillShaderDTMSettingsHeightColorMap(DTMSettings& dtmSettings) const
+	{
+		const uint32_t mapSize = heightColorSet.size();
+		if (mapSize > DTMSettings::HeightColorMapMaxEntries)
+			return false;
+		dtmSettings.heightColorEntryCount = mapSize;
+
+		int index = 0;
+		for (auto it = heightColorSet.begin(); it != heightColorSet.end(); ++it)
+		{
+			dtmSettings.heightColorMapHeights[index] = it->height;
+			dtmSettings.heightColorMapColors[index] = it->color;
+			++index;
+		}
+
+		return true;
+	}
 
+private:
+	struct HeightColor
+	{
+		uint32_t height;
+		float32_t3 color;
+
+		bool operator<(const HeightColor& other) const
+		{
+			return height < other.height;
+		}
+	};
 
+	std::set<HeightColor> heightColorSet;
 };
 
 class CTriangleMesh final
diff --git a/62_CAD/DrawResourcesFiller.cpp b/62_CAD/DrawResourcesFiller.cpp
index 49c81f3ff..0611f5900 100644
--- a/62_CAD/DrawResourcesFiller.cpp
+++ b/62_CAD/DrawResourcesFiller.cpp
@@ -790,12 +790,34 @@ uint32_t DrawResourcesFiller::addLineStyle_Internal(const LineStyleInfo& lineSty
 uint32_t DrawResourcesFiller::addDTMSettings_Internal(const DTMSettingsInfo& dtmSettingsInfo, SIntendedSubmitInfo& intendedNextSubmit)
 {
 	DTMSettings dtmSettings;
+	dtmSettings.contourLinesStartHeight = dtmSettingsInfo.contourLinesStartHeight;
+	dtmSettings.contourLinesEndHeight = dtmSettingsInfo.contourLinesEndHeight;
+	dtmSettings.contourLinesHeightInterval = dtmSettingsInfo.contourLinesHeightInterval;
 
 	// TODO: this needs to be redone.. what if submit happens after that line?
 	// we need to make sure somehow that function below will not submit, we need both outline and contour styles in GPU memory
 	dtmSettings.outlineLineStyleIdx = addLineStyle_SubmitIfNeeded(dtmSettingsInfo.outlineLineStyleInfo, intendedNextSubmit);
 	dtmSettings.contourLineStyleIdx = addLineStyle_SubmitIfNeeded(dtmSettingsInfo.contourLineStyleInfo, intendedNextSubmit);
 
+	dtmSettings.minShadingHeight = dtmSettingsInfo.minShadingHeight;
+	dtmSettings.maxShadingHeight = dtmSettingsInfo.maxShadingHeight;
+	switch (dtmSettingsInfo.heightShadingMode)
+	{
+	case DTMSettingsInfo::E_HEIGHT_SHADING_MODE::DISCRETE_VARIABLE_LENGTH_INTERVALS:
+		dtmSettings.intervalWidth = std::numeric_limits<float>::infinity();
+		break;
+	case DTMSettingsInfo::E_HEIGHT_SHADING_MODE::DISCRETE_FIXED_LENGTH_INTERVALS:
+		dtmSettings.intervalWidth = dtmSettingsInfo.intervalWidth;
+		break;
+	case DTMSettingsInfo::E_HEIGHT_SHADING_MODE::CONTINOUS_INTERVALS:
+		dtmSettings.intervalWidth = 0.0f;
+		break;
+	}
+	_NBL_DEBUG_BREAK_IF(!dtmSettingsInfo.fillShaderDTMSettingsHeightColorMap(dtmSettings));
+
+	if (currentDTMSettingsCount >= maxDtmSettings)
+		return InvalidDTMSettingsIdx;
+
 	DTMSettings* settingsArray = reinterpret_cast<DTMSettings*>(cpuDrawBuffers.dtmSettingsBuffer->getPointer());
 	for (uint32_t i = 0u; i < currentDTMSettingsCount; ++i)
 	{
@@ -804,9 +826,6 @@ uint32_t DrawResourcesFiller::addDTMSettings_Internal(const DTMSettingsInfo& dtm
 			return i;
 	}
 
-	if (currentDTMSettingsCount >= maxDtmSettings)
-		return InvalidDTMSettingsIdx;
-
 	void* dst = settingsArray + currentDTMSettingsCount;
 	memcpy(dst, &dtmSettings, sizeof(DTMSettings));
 	return currentDTMSettingsCount++;
diff --git a/62_CAD/main.cpp b/62_CAD/main.cpp
index da3c93acd..53ec24dca 100644
--- a/62_CAD/main.cpp
+++ b/62_CAD/main.cpp
@@ -72,6 +72,7 @@ constexpr std::array<float, (uint32_t)ExampleMode::CASE_COUNT> cameraExtents =
 	10.0,	// CASE_6
 	10.0,	// CASE_7
 	600.0,	// CASE_8
+	600.0	// CASE_9
 };
 
 constexpr ExampleMode mode = ExampleMode::CASE_9;
@@ -3284,7 +3285,7 @@ class ComputerAidedDesign final : public examples::SimpleWindowedApplication, pu
 		}
 		else if (mode == ExampleMode::CASE_9)
 		{
-			core::vector<TriangleMeshVertex> vertices = {
+			/*core::vector<TriangleMeshVertex> vertices = {
 				{ float32_t2(-200.0f, -200.0f), 10.0f },
 				{ float32_t2(-50.0f, -200.0f), 50.0f },
 				{ float32_t2(100.0f, -200.0f), 90.0f },
@@ -3312,30 +3313,80 @@ class ComputerAidedDesign final : public examples::SimpleWindowedApplication, pu
 				7, 9, 10,
 				7, 8, 10,
 				8, 10, 11
+			};*/
+
+			core::vector<TriangleMeshVertex> vertices = {
+				{ float32_t2(0.0f, 0.0f), 100.0f },
+				{ float32_t2(-200.0f, -200.0f), 10.0f },
+				{ float32_t2(200.0f, -200.0f), 10.0f },
+				{ float32_t2(200.0f, 200.0f), -20.0f },
+				{ float32_t2(-200.0f, 200.0f), 10.0f },
 			};
 
-			// TODO: height color map
-			//core::unordered_map<float32_t, float32_t3> heightColorMap;
-			//heightColorMap.insert({ 0.0f, {0.0f, 1.0f, 0.0f} });
-			//heightColorMap.insert({ 100.0f, {0.0f, 1.0f, 0.0f} });
+			core::vector<uint32_t> indices = {
+				0, 1, 2,
+				0, 2, 3,
+				0, 3, 4,
+				0, 4, 1
+			};
 
 			CTriangleMesh mesh;
 			mesh.setVertices(std::move(vertices));
 			mesh.setIndices(std::move(indices));
 
 			DTMSettingsInfo dtmSettingsInfo;
+			dtmSettingsInfo.contourLinesStartHeight = 20;
+			dtmSettingsInfo.contourLinesEndHeight = 90;
+			dtmSettingsInfo.contourLinesHeightInterval = 10;
 
 			LineStyleInfo outlineStyle = {};
 			dtmSettingsInfo.outlineLineStyleInfo.screenSpaceLineWidth = 0.0f;
-			dtmSettingsInfo.outlineLineStyleInfo.worldSpaceLineWidth = 2.0f;
-			dtmSettingsInfo.outlineLineStyleInfo.color = float32_t4(0.0f, 0.5f, 0.5f, 1.0f);
+			dtmSettingsInfo.outlineLineStyleInfo.worldSpaceLineWidth = 3.0f;
+			dtmSettingsInfo.outlineLineStyleInfo.color = float32_t4(0.0f, 0.39f, 0.0f, 1.0f);
 			std::array<double, 4> outlineStipplePattern = { 0.0f, -5.0f, 2.0f, -5.0f };
 			dtmSettingsInfo.outlineLineStyleInfo.setStipplePatternData(outlineStipplePattern);
 
 			LineStyleInfo contourStyle = {};
 			dtmSettingsInfo.contourLineStyleInfo.screenSpaceLineWidth = 0.0f;
-			dtmSettingsInfo.contourLineStyleInfo.worldSpaceLineWidth = 5.0f;
-			dtmSettingsInfo.contourLineStyleInfo.color = float32_t4(1.0f, 0.5f, 0.31f, 1.0f);
+			dtmSettingsInfo.contourLineStyleInfo.worldSpaceLineWidth = 1.0f;
+			dtmSettingsInfo.contourLineStyleInfo.color = float32_t4(0.0f, 0.0f, 1.0f, 1.0f);
+			std::array<double, 4> contourStipplePattern = { 0.0f, -5.0f, 10.0f, -5.0f };
+			dtmSettingsInfo.contourLineStyleInfo.setStipplePatternData(contourStipplePattern);
+
+			//DTMSettingsInfo::E_HEIGHT_SHADING_MODE shadingModeExample = DTMSettingsInfo::E_HEIGHT_SHADING_MODE::DISCRETE_VARIABLE_LENGTH_INTERVALS;
+			//DTMSettingsInfo::E_HEIGHT_SHADING_MODE shadingModeExample = DTMSettingsInfo::E_HEIGHT_SHADING_MODE::DISCRETE_FIXED_LENGTH_INTERVALS;
+			DTMSettingsInfo::E_HEIGHT_SHADING_MODE shadingModeExample = DTMSettingsInfo::E_HEIGHT_SHADING_MODE::CONTINOUS_INTERVALS;
+
+			// DISCRETE_VARIABLE_LENGTH_INTERVALS
+
+			switch (shadingModeExample)
+			{
+				case DTMSettingsInfo::E_HEIGHT_SHADING_MODE::DISCRETE_VARIABLE_LENGTH_INTERVALS:
+				{
+					dtmSettingsInfo.minShadingHeight = 20.0f;
+					dtmSettingsInfo.maxShadingHeight = 70.0f;
+					dtmSettingsInfo.heightShadingMode = DTMSettingsInfo::E_HEIGHT_SHADING_MODE::DISCRETE_VARIABLE_LENGTH_INTERVALS;
+					dtmSettingsInfo.addHeightColorMapEntry(30, float32_t3(0.5f, 1.0f, 1.0f));
+					dtmSettingsInfo.addHeightColorMapEntry(45, float32_t3(0.0f, 1.0f, 0.0f));
+					dtmSettingsInfo.addHeightColorMapEntry(60, float32_t3(1.0f, 1.0f, 0.0f));
+					dtmSettingsInfo.addHeightColorMapEntry(80, float32_t3(1.0f, 0.0f, 0.0f));
+					break;
+				}
+				case DTMSettingsInfo::E_HEIGHT_SHADING_MODE::DISCRETE_FIXED_LENGTH_INTERVALS:
+				{
+					break;
+				}
+				case DTMSettingsInfo::E_HEIGHT_SHADING_MODE::CONTINOUS_INTERVALS:
+				{
+					dtmSettingsInfo.minShadingHeight = -10.0f;
+					dtmSettingsInfo.maxShadingHeight = 100.0f;
+					dtmSettingsInfo.heightShadingMode = DTMSettingsInfo::E_HEIGHT_SHADING_MODE::CONTINOUS_INTERVALS;
+					dtmSettingsInfo.addHeightColorMapEntry(20, float32_t3(0.0f, 1.0f, 0.0f));
+					dtmSettingsInfo.addHeightColorMapEntry(50, float32_t3(1.0f, 1.0f, 0.0f));
+					dtmSettingsInfo.addHeightColorMapEntry(80, float32_t3(1.0f, 0.0f, 0.0f));
+					break;
+				}
+			}
 
 			drawResourcesFiller.drawTriangleMesh(mesh, m_triangleMeshDrawData, dtmSettingsInfo, intendedNextSubmit);
 		}
diff --git a/62_CAD/shaders/globals.hlsl b/62_CAD/shaders/globals.hlsl
index e7029a79e..d718ee76a 100644
--- a/62_CAD/shaders/globals.hlsl
+++ b/62_CAD/shaders/globals.hlsl
@@ -330,10 +330,39 @@ struct LineStyle
 
 struct DTMSettings
 {
+    const static uint32_t HeightColorMapMaxEntries = 16u;
     uint32_t outlineLineStyleIdx; // index into line styles
     uint32_t contourLineStyleIdx; // index into line styles
-    // TODO:
-    // ContourSettings -> min, max, interval
+    
+    // contour lines
+    float contourLinesStartHeight;
+    float contourLinesEndHeight;
+    float contourLinesHeightInterval;
+
+    // height-color map
+    float minShadingHeight;
+    float maxShadingHeight;
+    float intervalWidth;
+    uint32_t heightColorEntryCount;
+    float heightColorMapHeights[HeightColorMapMaxEntries];
+    float32_t3 heightColorMapColors[HeightColorMapMaxEntries];
+
+    enum E_HEIGHT_SHADING_MODE
+    {
+        DISCRETE_VARIABLE_LENGTH_INTERVALS,
+        DISCRETE_FIXED_LENGTH_INTERVALS,
+        CONTINOUS_INTERVALS
+    };
+
+    E_HEIGHT_SHADING_MODE determineHeightShadingMode()
+    {
+        if (nbl::hlsl::isinf(intervalWidth))
+            return DISCRETE_VARIABLE_LENGTH_INTERVALS;
+        if (intervalWidth == 0.0f)
+            return CONTINOUS_INTERVALS;
+
+        return DISCRETE_FIXED_LENGTH_INTERVALS;
+    }
 };
 #ifndef __HLSL_VERSION
 inline bool operator==(const LineStyle& lhs, const LineStyle& rhs)
diff --git a/62_CAD/shaders/main_pipeline/common.hlsl b/62_CAD/shaders/main_pipeline/common.hlsl
index 73121fe36..b2fcda9c2 100644
--- a/62_CAD/shaders/main_pipeline/common.hlsl
+++ b/62_CAD/shaders/main_pipeline/common.hlsl
@@ -104,9 +104,6 @@ struct PSInput
     void setCurrentWorldToScreenRatio(float worldToScreen) { interp_data5.y = worldToScreen; }
     float getCurrentWorldToScreenRatio() { return interp_data5.y; }
 
-    void setHeight(float height) { interp_data5.x = height; }
-    float getHeight() { return interp_data5.x; }
-
     /* LINE */
     float2 getLineStart() { return data2.xy; }
     float2 getLineEnd() { return data2.zw; }
@@ -219,6 +216,15 @@ struct PSInput
 
     /* TRIANGLE MESH */
 
+    float getOutlineThickness() { return asfloat(data1.z); }
+    float getContourLineThickness() { return asfloat(data1.w); }
+
+    void setOutlineThickness(float lineThickness) { data1.z = asuint(lineThickness); }
+    void setContourLineThickness(float stretch) { data1.w = asuint(stretch); }
+
+    void setHeight(float height) { interp_data5.x = height; }
+    float getHeight() { return interp_data5.x; }
+
 #ifndef FRAGMENT_SHADER_INPUT // vertex shader
     void setScreenSpaceVertexPos(float3 pos) { vertexScreenSpacePos = pos; }
 #else // fragment shader
diff --git a/62_CAD/shaders/main_pipeline/fragment_shader.hlsl b/62_CAD/shaders/main_pipeline/fragment_shader.hlsl
index 5311fa48d..2173ae50f 100644
--- a/62_CAD/shaders/main_pipeline/fragment_shader.hlsl
+++ b/62_CAD/shaders/main_pipeline/fragment_shader.hlsl
@@ -414,91 +414,276 @@ float4 fragMain(PSInput input) : SV_TARGET
     const uint32_t currentMainObjectIdx = input.getMainObjectIdx();
     const MainObject mainObj = mainObjects[currentMainObjectIdx];
 
-    
-
     // TRIANGLE RENDERING
     {
-        float3 v0 = input.getScreenSpaceVertexPos(0);
-        float3 v1 = input.getScreenSpaceVertexPos(1);
-        float3 v2 = input.getScreenSpaceVertexPos(2);
+        const float outlineThickness = input.getOutlineThickness();
+        const float contourThickness = input.getContourLineThickness();
+        const float phaseShift = 0.0f; // input.getCurrentPhaseShift();
+        const float stretch = 1.0f; // TODO: figure out what is it for
+        const float worldToScreenRatio = input.getCurrentWorldToScreenRatio();
+
+        DTMSettings dtmSettings = dtmSettingsBuff[mainObj.dtmSettingsIdx];
+        LineStyle outlineStyle = lineStyles[dtmSettings.outlineLineStyleIdx];
+        LineStyle contourStyle = lineStyles[dtmSettings.contourLineStyleIdx];
+
+        float3 v[3];
+        v[0] = input.getScreenSpaceVertexPos(0);
+        v[1] = input.getScreenSpaceVertexPos(1);
+        v[2] = input.getScreenSpaceVertexPos(2);
+
+        const float3 baryCoord = nbl::hlsl::spirv::BaryCoordKHR;
+
+        // indices of points constructing every edge
+        uint2 edgePoints[3];
+        edgePoints[0] = uint2(0, 1);
+        edgePoints[1] = uint2(1, 2);
+        edgePoints[2] = uint2(2, 0);
+
+        // index of vertex opposing an edge, needed for calculation of triangle heights
+        uint opposingVertexIdx[3];
+        opposingVertexIdx[0] = 2;
+        opposingVertexIdx[1] = 0;
+        opposingVertexIdx[2] = 1;
+
+        float height = input.getHeight();
+
+        // HEIGHT SHADING
+        const bool isHeightBetweenMinAndMax = height >= dtmSettings.minShadingHeight && height <= dtmSettings.maxShadingHeight;
+        const bool isHeightColorMapNotEmpty = dtmSettings.heightColorEntryCount;
+        if (isHeightColorMapNotEmpty && isHeightBetweenMinAndMax)
+        {
+            DTMSettings::E_HEIGHT_SHADING_MODE mode = dtmSettings.determineHeightShadingMode();
+            switch (mode)
+            {
+                case DTMSettings::E_HEIGHT_SHADING_MODE::DISCRETE_VARIABLE_LENGTH_INTERVALS:
+                {
+                    const uint32_t heightMapSize = dtmSettings.heightColorEntryCount;
+                    for (int i = 0; i < heightMapSize; ++i)
+                    {
+                        if (dtmSettings.heightColorMapHeights[i] > height)
+                        {
+                            textureColor = dtmSettings.heightColorMapColors[i];
+                            break;
+                        }
+                    }
+
+                    localAlpha = 1.0f;
+                    break;
+                }
+                case DTMSettings::E_HEIGHT_SHADING_MODE::DISCRETE_FIXED_LENGTH_INTERVALS:
+                {
+                    /*const uint32_t heightMapSize = dtmSettings.heightColorEntryCount;
+                    uint32_t upperBoundHeightIndex = nbl::hlsl::numeric_limits<uint32_t>::max;
+                    uint32_t lowerBoundHeightIndex;
+                    // TODO: binary search
+                    for (int i = 0; i < heightMapSize; ++i)
+                    {
+                        if (dtmSettings.heightColorMapHeights[i] > height)
+                        {
+                            upperBoundHeightIndex = i;
+                            lowerBoundHeightIndex = i == 0 ? 0 : i - 1;
+                            break;
+                        }
+                    }
+
+                    if (upperBoundHeightIndex != nbl::hlsl::numeric_limits<uint32_t>::max)
+                    {
+                        float upperBoundHeight = dtmSettings.heightColorMapColors[upperBoundHeightIndex];
+                        float lowerBoundHeight = dtmSettings.heightColorMapColors[upperBoundHeightIndex];
+
+
+                        float3 upperBoundColor = dtmSettings.heightColorMapColors[upperBoundHeightIndex];
+                        float3 lowerBoundColor = dtmSettings.heightColorMapColors[lowerBoundHeightIndex];
+
+                        localAlpha = 1.0f;
+                    }*/
+
+                    break;
+                }
+                case DTMSettings::E_HEIGHT_SHADING_MODE::CONTINOUS_INTERVALS:
+                {
+
+                    const uint32_t heightMapSize = dtmSettings.heightColorEntryCount;
+                    uint32_t upperBoundHeightIndex = nbl::hlsl::numeric_limits<uint32_t>::max;
+                    uint32_t lowerBoundHeightIndex;
+                    // TODO: binary search
+                    for (int i = 0; i < heightMapSize; ++i)
+                    {
+                        if (dtmSettings.heightColorMapHeights[i] > height)
+                        {
+                            upperBoundHeightIndex = i;
+                            lowerBoundHeightIndex = i;
+                            if (i != 0)
+                                --lowerBoundHeightIndex;
+
+                            break;
+                        }
+                    }
+                    if (upperBoundHeightIndex == nbl::hlsl::numeric_limits<uint32_t>::max)
+                    {
+                        upperBoundHeightIndex = heightMapSize - 1;
+                        lowerBoundHeightIndex = upperBoundHeightIndex;
+                        if (upperBoundHeightIndex != 0)
+                            --lowerBoundHeightIndex;
+                    }
+
+                    if (upperBoundHeightIndex != nbl::hlsl::numeric_limits<uint32_t>::max)
+                    {
+                        float upperBoundHeight = dtmSettings.heightColorMapHeights[upperBoundHeightIndex];
+                        float lowerBoundHeight = dtmSettings.heightColorMapHeights[lowerBoundHeightIndex];
+
+                        float3 upperBoundColor = dtmSettings.heightColorMapColors[upperBoundHeightIndex];
+                        float3 lowerBoundColor = dtmSettings.heightColorMapColors[lowerBoundHeightIndex];
+
+                        float interpolationVal;
+                        if (upperBoundHeightIndex == 0)
+                            interpolationVal = 1.0f;
+                        else
+                            interpolationVal = (height - lowerBoundHeight) / (upperBoundHeight - lowerBoundHeight);
+
+                        printf("idx = %i, t = %f, up = %f, lo = %f", upperBoundHeightIndex, interpolationVal, upperBoundHeight, lowerBoundHeight);
+
+                        textureColor = lerp(lowerBoundColor, upperBoundColor, interpolationVal);
+
+                        localAlpha = 1.0f;
+                    }
+
+                    break;
+                }
+            }
+        }
 
         // CONTOUR
 
         // TODO: move to ubo or push constants
-        const float startHeight = 10.0f;
-        const float endHeight = 100.0f;
-        const float interval = 10.0f;
-        float height = input.getHeight();
+        const float startHeight = dtmSettings.contourLinesStartHeight;
+        const float endHeight = dtmSettings.contourLinesEndHeight;
+        const float interval = dtmSettings.contourLinesHeightInterval;
+
+        // TODO: can be precomputed
+        const int maxContourLineIdx = (endHeight - startHeight + 1) / interval;
 
         // TODO: it actually can output a negative number, fix
         int contourLineIdx = nbl::hlsl::_static_cast<int>((height - startHeight + (interval * 0.5f)) / interval);
+        contourLineIdx = clamp(contourLineIdx, 0, maxContourLineIdx);
+        float contourLineHeight = startHeight + interval * contourLineIdx;
+
+        int contourLinePointsIdx = 0;
+        float2 contourLinePoints[2];
+        // TODO: case where heights we are looking for are on all three vertices
+        for (int i = 0; i < 3; ++i)
+        {
+            if (contourLinePointsIdx == 3)
+                break;
 
-        float backgroundColor = contourLineIdx;
-        backgroundColor *= 0.1f;
-        textureColor = float3(backgroundColor, backgroundColor, backgroundColor);
+            const uint2 currentEdgePoints = edgePoints[i];
+            float3 p0 = v[currentEdgePoints[0]];
+            float3 p1 = v[currentEdgePoints[1]];
 
-        // OUTLINE
+            if (p1.z < p0.z)
+                nbl::hlsl::swap(p0, p1);
 
-        float2 start;
-        float2 end;
-        const float3 baryCoord = nbl::hlsl::spirv::BaryCoordKHR;
+            float minHeight = p0.z;
+            float maxHeight = p1.z;
 
-        // TODO: figure out if branching can be reduced
-        // finding line start and end points by excluding vertex with the lowest barycentric coordinate value
-        if (baryCoord.x < baryCoord.y && baryCoord.x < baryCoord.z)
-        {
-            start = float2(v1.x, v1.y);
-            end = float2(v2.x, v2.y);
-        }
-        else if (baryCoord.y < baryCoord.x && baryCoord.y < baryCoord.z)
-        {
-            start = float2(v0.x, v0.y);
-            end = float2(v2.x, v2.y);
+            if (height >= minHeight && height <= maxHeight)
+            {
+                float2 edge = float2(p1.x, p1.y) - float2(p0.x, p0.y);
+                float scale = (contourLineHeight - minHeight) / (maxHeight - minHeight);
+
+                contourLinePoints[contourLinePointsIdx] = scale * edge + float2(p0.x, p0.y);
+                ++contourLinePointsIdx;
+            }
         }
-        else if (baryCoord.z < baryCoord.x && baryCoord.z < baryCoord.y)
+
         {
-            start = float2(v0.x, v0.y);
-            end = float2(v1.x, v1.y);
+            nbl::hlsl::shapes::Line<float> lineSegment = nbl::hlsl::shapes::Line<float>::construct(contourLinePoints[0], contourLinePoints[1]);
+
+            float distance = nbl::hlsl::numeric_limits<float>::max;
+            if (!contourStyle.hasStipples() || stretch == InvalidStyleStretchValue)
+            {
+                distance = ClippedSignedDistance< nbl::hlsl::shapes::Line<float> >::sdf(lineSegment, input.position.xy, contourThickness, contourStyle.isRoadStyleFlag);
+            }
+            else
+            {
+                nbl::hlsl::shapes::Line<float>::ArcLengthCalculator arcLenCalc = nbl::hlsl::shapes::Line<float>::ArcLengthCalculator::construct(lineSegment);
+                LineStyleClipper clipper = LineStyleClipper::construct(contourStyle, lineSegment, arcLenCalc, phaseShift, stretch, worldToScreenRatio);
+                distance = ClippedSignedDistance<nbl::hlsl::shapes::Line<float>, LineStyleClipper>::sdf(lineSegment, input.position.xy, contourThickness, contourStyle.isRoadStyleFlag, clipper);
+            }
+
+            float contourLocalAlpha = smoothstep(+globals.antiAliasingFactor, -globals.antiAliasingFactor, distance);
+            textureColor = lerp(textureColor, contourStyle.color.rgb, contourLocalAlpha);
+            localAlpha = max(localAlpha, contourLocalAlpha);
         }
 
-        // long story short, in order for stipple patterns to be consistent:
-        // - point with lesser x coord should be starting point
-        // - if x coord of both points are equal then point with lesser y value should be starting point
-        if (end.x < start.x)
-            nbl::hlsl::swap(start, end);
-        else if (end.x == start.x && end.y < start.y)
-            nbl::hlsl::swap(start, end);
+        
 
-        const float thickness = input.getLineThickness();
-        const float phaseShift = 0.0f; // input.getCurrentPhaseShift();
-        const float stretch =  1.0f;
-        const float worldToScreenRatio = input.getCurrentWorldToScreenRatio();
+        // OUTLINE
 
-        nbl::hlsl::shapes::Line<float> lineSegment = nbl::hlsl::shapes::Line<float>::construct(start, end);
+        // find sdf of every edge
+        float triangleAreaTimesTwo;
+        {
+            float3 AB = v[0] - v[1];
+            float3 AC = v[0] - v[2];
+            AB.z = 0.0f;
+            AC.z = 0.0f;
 
-        DTMSettings dtmSettings = dtmSettingsBuff[mainObj.dtmSettingsIdx];
-        LineStyle outlineStyle = lineStyles[dtmSettings.outlineLineStyleIdx];
-        LineStyle contourStyle = lineStyles[dtmSettings.contourLineStyleIdx];
+            // TODO: figure out if there is a faster solution
+            triangleAreaTimesTwo = length(cross(AB, AC));
+        }
 
-        float distance = nbl::hlsl::numeric_limits<float>::max;
+        // calculate sdf of every edge as it wasn't stippled
+        float distances[3];
+        for (int i = 0; i < 3; ++i)
+        {
+            const uint2 currentEdgePoints = edgePoints[i];
+            float3 A = v[currentEdgePoints[0]];
+            float3 B = v[currentEdgePoints[1]];
+            float3 AB = B - A;
+            float ABLen = length(AB);
+
+            distances[i] = (triangleAreaTimesTwo / ABLen) * baryCoord[opposingVertexIdx[i]];
+        }
+
+        float minDistance = nbl::hlsl::numeric_limits<float>::max;
         if (!outlineStyle.hasStipples() || stretch == InvalidStyleStretchValue)
         {
-            distance = ClippedSignedDistance< nbl::hlsl::shapes::Line<float> >::sdf(lineSegment, input.position.xy, thickness, outlineStyle.isRoadStyleFlag);
+            for (uint i = 0; i < 3; ++i)
+                distances[i] -= outlineThickness;
+
+            minDistance = min(distances[0], min(distances[1], distances[2]));
         }
         else
         {
-            nbl::hlsl::shapes::Line<float>::ArcLengthCalculator arcLenCalc = nbl::hlsl::shapes::Line<float>::ArcLengthCalculator::construct(lineSegment);
-            LineStyleClipper clipper = LineStyleClipper::construct(outlineStyle, lineSegment, arcLenCalc, phaseShift, stretch, worldToScreenRatio);
-            distance = ClippedSignedDistance<nbl::hlsl::shapes::Line<float>, LineStyleClipper>::sdf(lineSegment, input.position.xy, thickness, outlineStyle.isRoadStyleFlag, clipper);
-        }
+            for (int i = 0; i < 3; ++i)
+            {
+                if (distances[i] > outlineThickness)
+                    continue;
 
-        localAlpha = smoothstep(+globals.antiAliasingFactor, -globals.antiAliasingFactor, distance);
+                const uint2 currentEdgePoints = edgePoints[i];
+                float3 p0 = v[currentEdgePoints[0]];
+                float3 p1 = v[currentEdgePoints[1]];
 
-        // TODO: remove, this is just a hack to draw background
-        if (localAlpha < 0.00001)
-            localAlpha = 1.0f;
-        else
-            textureColor = float3(outlineStyle.color.x, outlineStyle.color.y, outlineStyle.color.z);
+                if (p1.x < p0.x)
+                    nbl::hlsl::swap(p0, p1);
+                else if (p1.x == p0.x && p1.y < p0.y)
+                    nbl::hlsl::swap(p0, p1);
+
+                nbl::hlsl::shapes::Line<float> lineSegment = nbl::hlsl::shapes::Line<float>::construct(float2(p0.x, p0.y), float2(p1.x, p1.y));
+
+                float distance = nbl::hlsl::numeric_limits<float>::max;
+                nbl::hlsl::shapes::Line<float>::ArcLengthCalculator arcLenCalc = nbl::hlsl::shapes::Line<float>::ArcLengthCalculator::construct(lineSegment);
+                LineStyleClipper clipper = LineStyleClipper::construct(outlineStyle, lineSegment, arcLenCalc, phaseShift, stretch, worldToScreenRatio);
+                distance = ClippedSignedDistance<nbl::hlsl::shapes::Line<float>, LineStyleClipper>::sdf(lineSegment, input.position.xy, outlineThickness, outlineStyle.isRoadStyleFlag, clipper);
+
+                minDistance = min(minDistance, distance);
+            }
+
+        }
+
+        float outlineLocalAlpha = smoothstep(+globals.antiAliasingFactor, -globals.antiAliasingFactor, minDistance);
+        textureColor = lerp(textureColor, outlineStyle.color.rgb, outlineLocalAlpha);
+        localAlpha = max(localAlpha, outlineLocalAlpha);
     }
 
     return calculateFinalColor<nbl::hlsl::jit::device_capabilities::fragmentShaderPixelInterlock>(uint2(input.position.xy), localAlpha, currentMainObjectIdx, textureColor, true);
diff --git a/62_CAD/shaders/main_pipeline/vertex_shader.hlsl b/62_CAD/shaders/main_pipeline/vertex_shader.hlsl
index a1788a91e..2853d9a52 100644
--- a/62_CAD/shaders/main_pipeline/vertex_shader.hlsl
+++ b/62_CAD/shaders/main_pipeline/vertex_shader.hlsl
@@ -121,7 +121,7 @@ PSInput main(uint vertexID : SV_VertexID)
     outV.position.xy = transformedPos;
     outV.position = transformFromSreenSpaceToNdc(outV.position.xy, globals.resolution);
     outV.setHeight(vtx.height);
-    outV.setScreenSpaceVertexPos(float3(transformedPos, 1));
+    outV.setScreenSpaceVertexPos(float3(transformedPos, vtx.height));
     outV.setCurrentWorldToScreenRatio(
         _static_cast<float>((_static_cast<pfloat64_t>(2.0f) /
             (clipProjectionData.projectionToNDC[0].x * _static_cast<pfloat64_t>(globals.resolution.x))))
@@ -131,9 +131,12 @@ PSInput main(uint vertexID : SV_VertexID)
     DTMSettings dtmSettings = dtmSettingsBuff[mainObj.dtmSettingsIdx];
     LineStyle outlineStyle = lineStyles[dtmSettings.outlineLineStyleIdx];
     LineStyle contourStyle = lineStyles[dtmSettings.contourLineStyleIdx];
-    const float screenSpaceLineWidth = outlineStyle.screenSpaceLineWidth + _static_cast<float>(_static_cast<pfloat64_t>(outlineStyle.worldSpaceLineWidth) * globals.screenToWorldRatio);
-    const float sdfLineThickness = screenSpaceLineWidth * 0.5f;
-    outV.setLineThickness(sdfLineThickness);
+    const float screenSpaceOutlineWidth = outlineStyle.screenSpaceLineWidth + _static_cast<float>(_static_cast<pfloat64_t>(outlineStyle.worldSpaceLineWidth) * globals.screenToWorldRatio);
+    const float sdfOutlineThickness = screenSpaceOutlineWidth * 0.5f;
+    const float screenSpaceContourLineWidth = contourStyle.screenSpaceLineWidth + _static_cast<float>(_static_cast<pfloat64_t>(contourStyle.worldSpaceLineWidth) * globals.screenToWorldRatio);
+    const float sdfContourLineThickness = screenSpaceContourLineWidth * 0.5f;
+    outV.setOutlineThickness(sdfOutlineThickness);
+    outV.setContourLineThickness(sdfContourLineThickness);
 
     return outV;
 

From 5a87097970d77f93ff85a1242f3f1c55623c38f3 Mon Sep 17 00:00:00 2001
From: Przemek <minikers21@gmail.com>
Date: Wed, 26 Mar 2025 13:30:28 +0100
Subject: [PATCH 108/529] Fixes

---
 62_CAD/CTriangleMesh.h                        |   6 +-
 62_CAD/DrawResourcesFiller.cpp                |   3 -
 62_CAD/main.cpp                               |  49 +++---
 62_CAD/shaders/globals.hlsl                   |   2 -
 .../main_pipeline/fragment_shader.hlsl        | 146 ++++++++----------
 5 files changed, 94 insertions(+), 112 deletions(-)

diff --git a/62_CAD/CTriangleMesh.h b/62_CAD/CTriangleMesh.h
index a6a86472e..6711011ea 100644
--- a/62_CAD/CTriangleMesh.h
+++ b/62_CAD/CTriangleMesh.h
@@ -22,12 +22,10 @@ struct DTMSettingsInfo
 	float contourLinesEndHeight;
 	float contourLinesHeightInterval;
 
-	float minShadingHeight;
-	float maxShadingHeight;
 	float intervalWidth;
 	E_HEIGHT_SHADING_MODE heightShadingMode;
 
-	void addHeightColorMapEntry(uint32_t height, float32_t3 color)
+	void addHeightColorMapEntry(float height, float32_t3 color)
 	{
 		heightColorSet.emplace(height, color);
 	}
@@ -53,7 +51,7 @@ struct DTMSettingsInfo
 private:
 	struct HeightColor
 	{
-		uint32_t height;
+		float height;
 		float32_t3 color;
 
 		bool operator<(const HeightColor& other) const
diff --git a/62_CAD/DrawResourcesFiller.cpp b/62_CAD/DrawResourcesFiller.cpp
index 0611f5900..5e0c85260 100644
--- a/62_CAD/DrawResourcesFiller.cpp
+++ b/62_CAD/DrawResourcesFiller.cpp
@@ -798,9 +798,6 @@ uint32_t DrawResourcesFiller::addDTMSettings_Internal(const DTMSettingsInfo& dtm
 	// we need to make sure somehow that function below will not submit, we need both outline and contour styles in GPU memory
 	dtmSettings.outlineLineStyleIdx = addLineStyle_SubmitIfNeeded(dtmSettingsInfo.outlineLineStyleInfo, intendedNextSubmit);
 	dtmSettings.contourLineStyleIdx = addLineStyle_SubmitIfNeeded(dtmSettingsInfo.contourLineStyleInfo, intendedNextSubmit);
-
-	dtmSettings.minShadingHeight = dtmSettingsInfo.minShadingHeight;
-	dtmSettings.maxShadingHeight = dtmSettingsInfo.maxShadingHeight;
 	switch (dtmSettingsInfo.heightShadingMode)
 	{
 	case DTMSettingsInfo::E_HEIGHT_SHADING_MODE::DISCRETE_VARIABLE_LENGTH_INTERVALS:
diff --git a/62_CAD/main.cpp b/62_CAD/main.cpp
index 53ec24dca..ba2bf0da0 100644
--- a/62_CAD/main.cpp
+++ b/62_CAD/main.cpp
@@ -644,6 +644,8 @@ class ComputerAidedDesign final : public examples::SimpleWindowedApplication, pu
 	double m_timeElapsed = 0.0;
 	std::chrono::steady_clock::time_point lastTime;
 	uint32_t m_hatchDebugStep = 0u;
+	DTMSettingsInfo::E_HEIGHT_SHADING_MODE m_shadingModeExample = DTMSettingsInfo::E_HEIGHT_SHADING_MODE::DISCRETE_VARIABLE_LENGTH_INTERVALS;
+
 
 	inline bool onAppInitialized(smart_refctd_ptr<ISystem>&& system) override
 	{
@@ -1161,6 +1163,18 @@ class ComputerAidedDesign final : public examples::SimpleWindowedApplication, pu
 					{
 						m_hatchDebugStep--;
 					}
+					if (ev.action == nbl::ui::SKeyboardEvent::E_KEY_ACTION::ECA_PRESSED && ev.keyCode == nbl::ui::E_KEY_CODE::EKC_1)
+					{
+						m_shadingModeExample = DTMSettingsInfo::E_HEIGHT_SHADING_MODE::DISCRETE_VARIABLE_LENGTH_INTERVALS;
+					}
+					if (ev.action == nbl::ui::SKeyboardEvent::E_KEY_ACTION::ECA_PRESSED && ev.keyCode == nbl::ui::E_KEY_CODE::EKC_2)
+					{
+						m_shadingModeExample = DTMSettingsInfo::E_HEIGHT_SHADING_MODE::DISCRETE_FIXED_LENGTH_INTERVALS;
+					}
+					if (ev.action == nbl::ui::SKeyboardEvent::E_KEY_ACTION::ECA_PRESSED && ev.keyCode == nbl::ui::E_KEY_CODE::EKC_3)
+					{
+						m_shadingModeExample = DTMSettingsInfo::E_HEIGHT_SHADING_MODE::CONTINOUS_INTERVALS;
+					}
 				}
 			}
 		, m_logger.get());
@@ -3353,37 +3367,36 @@ class ComputerAidedDesign final : public examples::SimpleWindowedApplication, pu
 			std::array<double, 4> contourStipplePattern = { 0.0f, -5.0f, 10.0f, -5.0f };
 			dtmSettingsInfo.contourLineStyleInfo.setStipplePatternData(contourStipplePattern);
 
-			//DTMSettingsInfo::E_HEIGHT_SHADING_MODE shadingModeExample = DTMSettingsInfo::E_HEIGHT_SHADING_MODE::DISCRETE_VARIABLE_LENGTH_INTERVALS;
-			//DTMSettingsInfo::E_HEIGHT_SHADING_MODE shadingModeExample = DTMSettingsInfo::E_HEIGHT_SHADING_MODE::DISCRETE_FIXED_LENGTH_INTERVALS;
-			DTMSettingsInfo::E_HEIGHT_SHADING_MODE shadingModeExample = DTMSettingsInfo::E_HEIGHT_SHADING_MODE::CONTINOUS_INTERVALS;
-
-			// DISCRETE_VARIABLE_LENGTH_INTERVALS
-
-			switch (shadingModeExample)
+			// PRESS 1, 2, 3 TO SWITCH HEIGHT SHADING MODE
+			// 1 - DISCRETE_VARIABLE_LENGTH_INTERVALS
+			// 2 - DISCRETE_FIXED_LENGTH_INTERVALS
+			// 3 - CONTINOUS_INTERVALS
+			switch (m_shadingModeExample)
 			{
 				case DTMSettingsInfo::E_HEIGHT_SHADING_MODE::DISCRETE_VARIABLE_LENGTH_INTERVALS:
 				{
-					dtmSettingsInfo.minShadingHeight = 20.0f;
-					dtmSettingsInfo.maxShadingHeight = 70.0f;
 					dtmSettingsInfo.heightShadingMode = DTMSettingsInfo::E_HEIGHT_SHADING_MODE::DISCRETE_VARIABLE_LENGTH_INTERVALS;
-					dtmSettingsInfo.addHeightColorMapEntry(30, float32_t3(0.5f, 1.0f, 1.0f));
-					dtmSettingsInfo.addHeightColorMapEntry(45, float32_t3(0.0f, 1.0f, 0.0f));
-					dtmSettingsInfo.addHeightColorMapEntry(60, float32_t3(1.0f, 1.0f, 0.0f));
-					dtmSettingsInfo.addHeightColorMapEntry(80, float32_t3(1.0f, 0.0f, 0.0f));
+					dtmSettingsInfo.addHeightColorMapEntry(20.0f, float32_t3(0.5f, 1.0f, 1.0f));
+					dtmSettingsInfo.addHeightColorMapEntry(25.0f, float32_t3(0.0f, 1.0f, 0.0f));
+					dtmSettingsInfo.addHeightColorMapEntry(70.0f, float32_t3(1.0f, 1.0f, 0.0f));
+					dtmSettingsInfo.addHeightColorMapEntry(80.0f, float32_t3(1.0f, 0.0f, 0.0f));
 					break;
 				}
 				case DTMSettingsInfo::E_HEIGHT_SHADING_MODE::DISCRETE_FIXED_LENGTH_INTERVALS:
 				{
+					dtmSettingsInfo.intervalWidth = 8.0f;
+					dtmSettingsInfo.heightShadingMode = DTMSettingsInfo::E_HEIGHT_SHADING_MODE::DISCRETE_FIXED_LENGTH_INTERVALS;
+					dtmSettingsInfo.addHeightColorMapEntry(0.0f, float32_t3(0.0f, 1.0f, 0.0f));
+					dtmSettingsInfo.addHeightColorMapEntry(50.0f, float32_t3(1.0f, 1.0f, 0.0f));
+					dtmSettingsInfo.addHeightColorMapEntry(100.0f, float32_t3(1.0f, 0.0f, 0.0f));
 					break;
 				}
 				case DTMSettingsInfo::E_HEIGHT_SHADING_MODE::CONTINOUS_INTERVALS:
 				{
-					dtmSettingsInfo.minShadingHeight = -10.0f;
-					dtmSettingsInfo.maxShadingHeight = 100.0f;
 					dtmSettingsInfo.heightShadingMode = DTMSettingsInfo::E_HEIGHT_SHADING_MODE::CONTINOUS_INTERVALS;
-					dtmSettingsInfo.addHeightColorMapEntry(20, float32_t3(0.0f, 1.0f, 0.0f));
-					dtmSettingsInfo.addHeightColorMapEntry(50, float32_t3(1.0f, 1.0f, 0.0f));
-					dtmSettingsInfo.addHeightColorMapEntry(80, float32_t3(1.0f, 0.0f, 0.0f));
+					dtmSettingsInfo.addHeightColorMapEntry(-10.0f, float32_t3(0.0f, 1.0f, 0.0f));
+					dtmSettingsInfo.addHeightColorMapEntry(30.0f, float32_t3(1.0f, 1.0f, 0.0f));
+					dtmSettingsInfo.addHeightColorMapEntry(90.0f, float32_t3(1.0f, 0.0f, 0.0f));
 					break;
 				}
 			}
diff --git a/62_CAD/shaders/globals.hlsl b/62_CAD/shaders/globals.hlsl
index d718ee76a..7a05819ac 100644
--- a/62_CAD/shaders/globals.hlsl
+++ b/62_CAD/shaders/globals.hlsl
@@ -340,8 +340,6 @@ struct DTMSettings
     float contourLinesHeightInterval;
 
     // height-color map
-    float minShadingHeight;
-    float maxShadingHeight;
     float intervalWidth;
     uint32_t heightColorEntryCount;
     float heightColorMapHeights[HeightColorMapMaxEntries];
diff --git a/62_CAD/shaders/main_pipeline/fragment_shader.hlsl b/62_CAD/shaders/main_pipeline/fragment_shader.hlsl
index 2173ae50f..ad8a35c82 100644
--- a/62_CAD/shaders/main_pipeline/fragment_shader.hlsl
+++ b/62_CAD/shaders/main_pipeline/fragment_shader.hlsl
@@ -444,112 +444,88 @@ float4 fragMain(PSInput input) : SV_TARGET
         opposingVertexIdx[0] = 2;
         opposingVertexIdx[1] = 0;
         opposingVertexIdx[2] = 1;
-
+        
         float height = input.getHeight();
 
         // HEIGHT SHADING
-        const bool isHeightBetweenMinAndMax = height >= dtmSettings.minShadingHeight && height <= dtmSettings.maxShadingHeight;
-        const bool isHeightColorMapNotEmpty = dtmSettings.heightColorEntryCount;
+        const uint32_t heightMapSize = dtmSettings.heightColorEntryCount;
+        float minShadingHeight = dtmSettings.heightColorMapHeights[0];
+        float maxShadingHeight = dtmSettings.heightColorMapHeights[heightMapSize - 1];
+
+        printf("min = %f, max = %f", minShadingHeight, maxShadingHeight);
+
+        const bool isHeightBetweenMinAndMax = height >= minShadingHeight && height <= maxShadingHeight;
+        const bool isHeightColorMapNotEmpty = heightMapSize > 0;
         if (isHeightColorMapNotEmpty && isHeightBetweenMinAndMax)
         {
             DTMSettings::E_HEIGHT_SHADING_MODE mode = dtmSettings.determineHeightShadingMode();
-            switch (mode)
+
+            if(mode == DTMSettings::E_HEIGHT_SHADING_MODE::DISCRETE_VARIABLE_LENGTH_INTERVALS)
             {
-                case DTMSettings::E_HEIGHT_SHADING_MODE::DISCRETE_VARIABLE_LENGTH_INTERVALS:
+                for (int i = 0; i < heightMapSize; ++i)
                 {
-                    const uint32_t heightMapSize = dtmSettings.heightColorEntryCount;
-                    for (int i = 0; i < heightMapSize; ++i)
+                    if (dtmSettings.heightColorMapHeights[i] > height)
                     {
-                        if (dtmSettings.heightColorMapHeights[i] > height)
-                        {
-                            textureColor = dtmSettings.heightColorMapColors[i];
-                            break;
-                        }
+                        textureColor = dtmSettings.heightColorMapColors[i];
+                        break;
                     }
-
-                    localAlpha = 1.0f;
-                    break;
                 }
-                case DTMSettings::E_HEIGHT_SHADING_MODE::DISCRETE_FIXED_LENGTH_INTERVALS:
-                {
-                    /*const uint32_t heightMapSize = dtmSettings.heightColorEntryCount;
-                    uint32_t upperBoundHeightIndex = nbl::hlsl::numeric_limits<uint32_t>::max;
-                    uint32_t lowerBoundHeightIndex;
-                    // TODO: binary search
-                    for (int i = 0; i < heightMapSize; ++i)
-                    {
-                        if (dtmSettings.heightColorMapHeights[i] > height)
-                        {
-                            upperBoundHeightIndex = i;
-                            lowerBoundHeightIndex = i == 0 ? 0 : i - 1;
-                            break;
-                        }
-                    }
-
-                    if (upperBoundHeightIndex != nbl::hlsl::numeric_limits<uint32_t>::max)
-                    {
-                        float upperBoundHeight = dtmSettings.heightColorMapColors[upperBoundHeightIndex];
-                        float lowerBoundHeight = dtmSettings.heightColorMapColors[upperBoundHeightIndex];
-
-
-                        float3 upperBoundColor = dtmSettings.heightColorMapColors[upperBoundHeightIndex];
-                        float3 lowerBoundColor = dtmSettings.heightColorMapColors[lowerBoundHeightIndex];
 
-                        localAlpha = 1.0f;
-                    }*/
-
-                    break;
+                localAlpha = 1.0f;
+            }
+            else
+            {
+                float heightTmp;
+                if (mode == DTMSettings::E_HEIGHT_SHADING_MODE::DISCRETE_FIXED_LENGTH_INTERVALS)
+                {
+                    float interval = dtmSettings.intervalWidth;
+                    int sectionIndex = int((height - minShadingHeight) / interval);
+                    heightTmp = minShadingHeight + float(sectionIndex) * interval;
                 }
-                case DTMSettings::E_HEIGHT_SHADING_MODE::CONTINOUS_INTERVALS:
+                else if (mode == DTMSettings::E_HEIGHT_SHADING_MODE::CONTINOUS_INTERVALS)
                 {
+                    heightTmp = height;
+                }
 
-                    const uint32_t heightMapSize = dtmSettings.heightColorEntryCount;
-                    uint32_t upperBoundHeightIndex = nbl::hlsl::numeric_limits<uint32_t>::max;
-                    uint32_t lowerBoundHeightIndex;
-                    // TODO: binary search
-                    for (int i = 0; i < heightMapSize; ++i)
-                    {
-                        if (dtmSettings.heightColorMapHeights[i] > height)
-                        {
-                            upperBoundHeightIndex = i;
-                            lowerBoundHeightIndex = i;
-                            if (i != 0)
-                                --lowerBoundHeightIndex;
 
-                            break;
-                        }
-                    }
-                    if (upperBoundHeightIndex == nbl::hlsl::numeric_limits<uint32_t>::max)
+                const uint32_t heightMapSize = dtmSettings.heightColorEntryCount;
+                uint32_t upperBoundHeightIndex = nbl::hlsl::numeric_limits<uint32_t>::max;
+                uint32_t lowerBoundHeightIndex;
+                // TODO: binary search
+                for (int i = 0; i < heightMapSize; ++i)
+                {
+                    if (dtmSettings.heightColorMapHeights[i] > heightTmp)
                     {
-                        upperBoundHeightIndex = heightMapSize - 1;
-                        lowerBoundHeightIndex = upperBoundHeightIndex;
-                        if (upperBoundHeightIndex != 0)
+                        upperBoundHeightIndex = i;
+                        lowerBoundHeightIndex = i;
+                        if (i != 0)
                             --lowerBoundHeightIndex;
-                    }
-
-                    if (upperBoundHeightIndex != nbl::hlsl::numeric_limits<uint32_t>::max)
-                    {
-                        float upperBoundHeight = dtmSettings.heightColorMapHeights[upperBoundHeightIndex];
-                        float lowerBoundHeight = dtmSettings.heightColorMapHeights[lowerBoundHeightIndex];
-
-                        float3 upperBoundColor = dtmSettings.heightColorMapColors[upperBoundHeightIndex];
-                        float3 lowerBoundColor = dtmSettings.heightColorMapColors[lowerBoundHeightIndex];
-
-                        float interpolationVal;
-                        if (upperBoundHeightIndex == 0)
-                            interpolationVal = 1.0f;
-                        else
-                            interpolationVal = (height - lowerBoundHeight) / (upperBoundHeight - lowerBoundHeight);
 
-                        printf("idx = %i, t = %f, up = %f, lo = %f", upperBoundHeightIndex, interpolationVal, upperBoundHeight, lowerBoundHeight);
-
-                        textureColor = lerp(lowerBoundColor, upperBoundColor, interpolationVal);
-
-                        localAlpha = 1.0f;
+                        break;
                     }
-
-                    break;
                 }
+                if (upperBoundHeightIndex == nbl::hlsl::numeric_limits<uint32_t>::max)
+                {
+                    upperBoundHeightIndex = heightMapSize - 1;
+                    lowerBoundHeightIndex = upperBoundHeightIndex;
+                    if (upperBoundHeightIndex != 0)
+                        --lowerBoundHeightIndex;
+                }
+
+                float upperBoundHeight = dtmSettings.heightColorMapHeights[upperBoundHeightIndex];
+                float lowerBoundHeight = dtmSettings.heightColorMapHeights[lowerBoundHeightIndex];
+                
+                float3 upperBoundColor = dtmSettings.heightColorMapColors[upperBoundHeightIndex];
+                float3 lowerBoundColor = dtmSettings.heightColorMapColors[lowerBoundHeightIndex];
+                
+                float interpolationVal;
+                if (upperBoundHeightIndex == 0)
+                    interpolationVal = 1.0f;
+                else
+                    interpolationVal = (heightTmp - lowerBoundHeight) / (upperBoundHeight - lowerBoundHeight);
+                
+                textureColor = lerp(lowerBoundColor, upperBoundColor, interpolationVal);
+                localAlpha = 1.0f;
             }
         }
 

From 3237e4b656aa5c7bb80ad08f24f67a1f1d35d3b1 Mon Sep 17 00:00:00 2001
From: Przemek <minikers21@gmail.com>
Date: Wed, 26 Mar 2025 15:30:48 +0100
Subject: [PATCH 109/529] Implemented transparent height shading

---
 62_CAD/CTriangleMesh.h                        |  4 +--
 62_CAD/main.cpp                               | 25 ++++++-------
 62_CAD/shaders/globals.hlsl                   |  2 +-
 .../main_pipeline/fragment_shader.hlsl        | 35 +++++++++----------
 4 files changed, 32 insertions(+), 34 deletions(-)

diff --git a/62_CAD/CTriangleMesh.h b/62_CAD/CTriangleMesh.h
index 6711011ea..34fc243f7 100644
--- a/62_CAD/CTriangleMesh.h
+++ b/62_CAD/CTriangleMesh.h
@@ -25,7 +25,7 @@ struct DTMSettingsInfo
 	float intervalWidth;
 	E_HEIGHT_SHADING_MODE heightShadingMode;
 
-	void addHeightColorMapEntry(float height, float32_t3 color)
+	void addHeightColorMapEntry(float height, float32_t4 color)
 	{
 		heightColorSet.emplace(height, color);
 	}
@@ -52,7 +52,7 @@ struct DTMSettingsInfo
 	struct HeightColor
 	{
 		float height;
-		float32_t3 color;
+		float32_t4 color;
 
 		bool operator<(const HeightColor& other) const
 		{
diff --git a/62_CAD/main.cpp b/62_CAD/main.cpp
index ba2bf0da0..bfd346022 100644
--- a/62_CAD/main.cpp
+++ b/62_CAD/main.cpp
@@ -3356,14 +3356,14 @@ class ComputerAidedDesign final : public examples::SimpleWindowedApplication, pu
 			LineStyleInfo outlineStyle = {};
 			dtmSettingsInfo.outlineLineStyleInfo.screenSpaceLineWidth = 0.0f;
 			dtmSettingsInfo.outlineLineStyleInfo.worldSpaceLineWidth = 3.0f;
-			dtmSettingsInfo.outlineLineStyleInfo.color = float32_t4(0.0f, 0.39f, 0.0f, 1.0f);
+			dtmSettingsInfo.outlineLineStyleInfo.color = float32_t4(0.0f, 0.39f, 0.0f, 0.5f);
 			std::array<double, 4> outlineStipplePattern = { 0.0f, -5.0f, 2.0f, -5.0f };
 			dtmSettingsInfo.outlineLineStyleInfo.setStipplePatternData(outlineStipplePattern);
 
 			LineStyleInfo contourStyle = {};
 			dtmSettingsInfo.contourLineStyleInfo.screenSpaceLineWidth = 0.0f;
 			dtmSettingsInfo.contourLineStyleInfo.worldSpaceLineWidth = 1.0f;
-			dtmSettingsInfo.contourLineStyleInfo.color = float32_t4(0.0f, 0.0f, 1.0f, 1.0f);
+			dtmSettingsInfo.contourLineStyleInfo.color = float32_t4(0.0f, 0.0f, 1.0f, 0.7f);
 			std::array<double, 4> contourStipplePattern = { 0.0f, -5.0f, 10.0f, -5.0f };
 			dtmSettingsInfo.contourLineStyleInfo.setStipplePatternData(contourStipplePattern);
 
@@ -3376,27 +3376,28 @@ class ComputerAidedDesign final : public examples::SimpleWindowedApplication, pu
 				case DTMSettingsInfo::E_HEIGHT_SHADING_MODE::DISCRETE_VARIABLE_LENGTH_INTERVALS:
 				{
 					dtmSettingsInfo.heightShadingMode = DTMSettingsInfo::E_HEIGHT_SHADING_MODE::DISCRETE_VARIABLE_LENGTH_INTERVALS;
-					dtmSettingsInfo.addHeightColorMapEntry(20.0f, float32_t3(0.5f, 1.0f, 1.0f));
-					dtmSettingsInfo.addHeightColorMapEntry(25.0f, float32_t3(0.0f, 1.0f, 0.0f));
-					dtmSettingsInfo.addHeightColorMapEntry(70.0f, float32_t3(1.0f, 1.0f, 0.0f));
-					dtmSettingsInfo.addHeightColorMapEntry(80.0f, float32_t3(1.0f, 0.0f, 0.0f));
+					dtmSettingsInfo.addHeightColorMapEntry(20.0f, float32_t4(0.5f, 1.0f, 1.0f, 1.0f));
+					dtmSettingsInfo.addHeightColorMapEntry(25.0f, float32_t4(0.0f, 1.0f, 0.0f, 1.0f));
+					dtmSettingsInfo.addHeightColorMapEntry(70.0f, float32_t4(1.0f, 1.0f, 0.0f, 1.0f));
+					dtmSettingsInfo.addHeightColorMapEntry(80.0f, float32_t4(1.0f, 0.0f, 0.0f, 1.0f));
 					break;
 				}
 				case DTMSettingsInfo::E_HEIGHT_SHADING_MODE::DISCRETE_FIXED_LENGTH_INTERVALS:
 				{
 					dtmSettingsInfo.intervalWidth = 8.0f;
 					dtmSettingsInfo.heightShadingMode = DTMSettingsInfo::E_HEIGHT_SHADING_MODE::DISCRETE_FIXED_LENGTH_INTERVALS;
-					dtmSettingsInfo.addHeightColorMapEntry(0.0f, float32_t3(0.0f, 1.0f, 0.0f));
-					dtmSettingsInfo.addHeightColorMapEntry(50.0f, float32_t3(1.0f, 1.0f, 0.0f));
-					dtmSettingsInfo.addHeightColorMapEntry(100.0f, float32_t3(1.0f, 0.0f, 0.0f));
+					float animatedAlpha = (std::cos(m_timeElapsed * 0.0003) + 1.0) * 0.5;
+					dtmSettingsInfo.addHeightColorMapEntry(0.0f, float32_t4(0.0f, 1.0f, 0.0f, animatedAlpha));
+					dtmSettingsInfo.addHeightColorMapEntry(50.0f, float32_t4(1.0f, 1.0f, 0.0f, animatedAlpha));
+					dtmSettingsInfo.addHeightColorMapEntry(100.0f, float32_t4(1.0f, 0.0f, 0.0f, animatedAlpha));
 					break;
 				}
 				case DTMSettingsInfo::E_HEIGHT_SHADING_MODE::CONTINOUS_INTERVALS:
 				{
 					dtmSettingsInfo.heightShadingMode = DTMSettingsInfo::E_HEIGHT_SHADING_MODE::CONTINOUS_INTERVALS;
-					dtmSettingsInfo.addHeightColorMapEntry(-10.0f, float32_t3(0.0f, 1.0f, 0.0f));
-					dtmSettingsInfo.addHeightColorMapEntry(30.0f, float32_t3(1.0f, 1.0f, 0.0f));
-					dtmSettingsInfo.addHeightColorMapEntry(90.0f, float32_t3(1.0f, 0.0f, 0.0f));
+					dtmSettingsInfo.addHeightColorMapEntry(-10.0f, float32_t4(0.0f, 1.0f, 0.0f, 1.0f));
+					dtmSettingsInfo.addHeightColorMapEntry(30.0f, float32_t4(1.0f, 1.0f, 0.0f, 1.0f));
+					dtmSettingsInfo.addHeightColorMapEntry(90.0f, float32_t4(1.0f, 0.0f, 0.0f, 1.0f));
 					break;
 				}
 			}
diff --git a/62_CAD/shaders/globals.hlsl b/62_CAD/shaders/globals.hlsl
index 7a05819ac..84f9416e3 100644
--- a/62_CAD/shaders/globals.hlsl
+++ b/62_CAD/shaders/globals.hlsl
@@ -343,7 +343,7 @@ struct DTMSettings
     float intervalWidth;
     uint32_t heightColorEntryCount;
     float heightColorMapHeights[HeightColorMapMaxEntries];
-    float32_t3 heightColorMapColors[HeightColorMapMaxEntries];
+    float32_t4 heightColorMapColors[HeightColorMapMaxEntries];
 
     enum E_HEIGHT_SHADING_MODE
     {
diff --git a/62_CAD/shaders/main_pipeline/fragment_shader.hlsl b/62_CAD/shaders/main_pipeline/fragment_shader.hlsl
index ad8a35c82..0d5ec486d 100644
--- a/62_CAD/shaders/main_pipeline/fragment_shader.hlsl
+++ b/62_CAD/shaders/main_pipeline/fragment_shader.hlsl
@@ -452,8 +452,6 @@ float4 fragMain(PSInput input) : SV_TARGET
         float minShadingHeight = dtmSettings.heightColorMapHeights[0];
         float maxShadingHeight = dtmSettings.heightColorMapHeights[heightMapSize - 1];
 
-        printf("min = %f, max = %f", minShadingHeight, maxShadingHeight);
-
         const bool isHeightBetweenMinAndMax = height >= minShadingHeight && height <= maxShadingHeight;
         const bool isHeightColorMapNotEmpty = heightMapSize > 0;
         if (isHeightColorMapNotEmpty && isHeightBetweenMinAndMax)
@@ -462,16 +460,24 @@ float4 fragMain(PSInput input) : SV_TARGET
 
             if(mode == DTMSettings::E_HEIGHT_SHADING_MODE::DISCRETE_VARIABLE_LENGTH_INTERVALS)
             {
+                uint32_t upperBoundHeightIndex = nbl::hlsl::numeric_limits<uint32_t>::max;
+                uint32_t lowerBoundHeightIndex;
+                // TODO: binary search
                 for (int i = 0; i < heightMapSize; ++i)
                 {
                     if (dtmSettings.heightColorMapHeights[i] > height)
                     {
-                        textureColor = dtmSettings.heightColorMapColors[i];
+                        upperBoundHeightIndex = i;
+                        lowerBoundHeightIndex = i;
+                        if (i != 0)
+                            --lowerBoundHeightIndex;
+
                         break;
                     }
                 }
 
-                localAlpha = 1.0f;
+                textureColor = dtmSettings.heightColorMapColors[upperBoundHeightIndex].rgb;
+                localAlpha = dtmSettings.heightColorMapColors[upperBoundHeightIndex].a;
             }
             else
             {
@@ -487,8 +493,6 @@ float4 fragMain(PSInput input) : SV_TARGET
                     heightTmp = height;
                 }
 
-
-                const uint32_t heightMapSize = dtmSettings.heightColorEntryCount;
                 uint32_t upperBoundHeightIndex = nbl::hlsl::numeric_limits<uint32_t>::max;
                 uint32_t lowerBoundHeightIndex;
                 // TODO: binary search
@@ -504,19 +508,12 @@ float4 fragMain(PSInput input) : SV_TARGET
                         break;
                     }
                 }
-                if (upperBoundHeightIndex == nbl::hlsl::numeric_limits<uint32_t>::max)
-                {
-                    upperBoundHeightIndex = heightMapSize - 1;
-                    lowerBoundHeightIndex = upperBoundHeightIndex;
-                    if (upperBoundHeightIndex != 0)
-                        --lowerBoundHeightIndex;
-                }
 
                 float upperBoundHeight = dtmSettings.heightColorMapHeights[upperBoundHeightIndex];
                 float lowerBoundHeight = dtmSettings.heightColorMapHeights[lowerBoundHeightIndex];
                 
-                float3 upperBoundColor = dtmSettings.heightColorMapColors[upperBoundHeightIndex];
-                float3 lowerBoundColor = dtmSettings.heightColorMapColors[lowerBoundHeightIndex];
+                float4 upperBoundColor = dtmSettings.heightColorMapColors[upperBoundHeightIndex];
+                float4 lowerBoundColor = dtmSettings.heightColorMapColors[lowerBoundHeightIndex];
                 
                 float interpolationVal;
                 if (upperBoundHeightIndex == 0)
@@ -524,8 +521,8 @@ float4 fragMain(PSInput input) : SV_TARGET
                 else
                     interpolationVal = (heightTmp - lowerBoundHeight) / (upperBoundHeight - lowerBoundHeight);
                 
-                textureColor = lerp(lowerBoundColor, upperBoundColor, interpolationVal);
-                localAlpha = 1.0f;
+                textureColor = lerp(lowerBoundColor.rgb, upperBoundColor.rgb, interpolationVal);
+                localAlpha = lerp(lowerBoundColor.a, upperBoundColor.a, interpolationVal);;
             }
         }
 
@@ -587,7 +584,7 @@ float4 fragMain(PSInput input) : SV_TARGET
                 distance = ClippedSignedDistance<nbl::hlsl::shapes::Line<float>, LineStyleClipper>::sdf(lineSegment, input.position.xy, contourThickness, contourStyle.isRoadStyleFlag, clipper);
             }
 
-            float contourLocalAlpha = smoothstep(+globals.antiAliasingFactor, -globals.antiAliasingFactor, distance);
+            float contourLocalAlpha = smoothstep(+globals.antiAliasingFactor, -globals.antiAliasingFactor, distance) * contourStyle.color.a;
             textureColor = lerp(textureColor, contourStyle.color.rgb, contourLocalAlpha);
             localAlpha = max(localAlpha, contourLocalAlpha);
         }
@@ -657,7 +654,7 @@ float4 fragMain(PSInput input) : SV_TARGET
 
         }
 
-        float outlineLocalAlpha = smoothstep(+globals.antiAliasingFactor, -globals.antiAliasingFactor, minDistance);
+        float outlineLocalAlpha = smoothstep(+globals.antiAliasingFactor, -globals.antiAliasingFactor, minDistance) * outlineStyle.color.a;
         textureColor = lerp(textureColor, outlineStyle.color.rgb, outlineLocalAlpha);
         localAlpha = max(localAlpha, outlineLocalAlpha);
     }

From 8090a2d5afc1b33eb6259ef5d20e0402fce682c5 Mon Sep 17 00:00:00 2001
From: keptsecret <sorchon@gmail.com>
Date: Thu, 27 Mar 2025 15:26:02 +0700
Subject: [PATCH 110/529] initial benchmark example copy

---
 71_ArithmeticBench/CMakeLists.txt             |  25 +
 71_ArithmeticBench/app_resources/common.hlsl  |  96 ++++
 .../app_resources/shaderCommon.hlsl           |  55 +++
 .../app_resources/testSubgroup.comp.hlsl      |  18 +
 .../app_resources/testWorkgroup.comp.hlsl     | 107 ++++
 71_ArithmeticBench/config.json.template       |  28 ++
 71_ArithmeticBench/main.cpp                   | 462 ++++++++++++++++++
 71_ArithmeticBench/pipeline.groovy            |  50 ++
 CMakeLists.txt                                |   4 +-
 9 files changed, 844 insertions(+), 1 deletion(-)
 create mode 100644 71_ArithmeticBench/CMakeLists.txt
 create mode 100644 71_ArithmeticBench/app_resources/common.hlsl
 create mode 100644 71_ArithmeticBench/app_resources/shaderCommon.hlsl
 create mode 100644 71_ArithmeticBench/app_resources/testSubgroup.comp.hlsl
 create mode 100644 71_ArithmeticBench/app_resources/testWorkgroup.comp.hlsl
 create mode 100644 71_ArithmeticBench/config.json.template
 create mode 100644 71_ArithmeticBench/main.cpp
 create mode 100644 71_ArithmeticBench/pipeline.groovy

diff --git a/71_ArithmeticBench/CMakeLists.txt b/71_ArithmeticBench/CMakeLists.txt
new file mode 100644
index 000000000..0724366c9
--- /dev/null
+++ b/71_ArithmeticBench/CMakeLists.txt
@@ -0,0 +1,25 @@
+
+include(common RESULT_VARIABLE RES)
+if(NOT RES)
+	message(FATAL_ERROR "common.cmake not found. Should be in {repo_root}/cmake directory")
+endif()
+
+nbl_create_executable_project("" "" "" "" "${NBL_EXECUTABLE_PROJECT_CREATION_PCH_TARGET}")
+
+if(NBL_EMBED_BUILTIN_RESOURCES)
+	set(_BR_TARGET_ ${EXECUTABLE_NAME}_builtinResourceData)
+	set(RESOURCE_DIR "app_resources")
+
+	get_filename_component(_SEARCH_DIRECTORIES_ "${CMAKE_CURRENT_SOURCE_DIR}" ABSOLUTE)
+	get_filename_component(_OUTPUT_DIRECTORY_SOURCE_ "${CMAKE_CURRENT_BINARY_DIR}/src" ABSOLUTE)
+	get_filename_component(_OUTPUT_DIRECTORY_HEADER_ "${CMAKE_CURRENT_BINARY_DIR}/include" ABSOLUTE)
+
+    file(GLOB_RECURSE BUILTIN_RESOURCE_FILES RELATIVE "${CMAKE_CURRENT_SOURCE_DIR}/${RESOURCE_DIR}" CONFIGURE_DEPENDS "${CMAKE_CURRENT_SOURCE_DIR}/${RESOURCE_DIR}/*")
+    foreach(RES_FILE ${BUILTIN_RESOURCE_FILES})
+      LIST_BUILTIN_RESOURCE(RESOURCES_TO_EMBED "${RES_FILE}")
+    endforeach()
+
+	ADD_CUSTOM_BUILTIN_RESOURCES(${_BR_TARGET_} RESOURCES_TO_EMBED "${_SEARCH_DIRECTORIES_}" "${RESOURCE_DIR}" "nbl::this_example::builtin" "${_OUTPUT_DIRECTORY_HEADER_}" "${_OUTPUT_DIRECTORY_SOURCE_}")
+
+	LINK_BUILTIN_RESOURCES_TO_TARGET(${EXECUTABLE_NAME} ${_BR_TARGET_})
+endif()
\ No newline at end of file
diff --git a/71_ArithmeticBench/app_resources/common.hlsl b/71_ArithmeticBench/app_resources/common.hlsl
new file mode 100644
index 000000000..10892a2b9
--- /dev/null
+++ b/71_ArithmeticBench/app_resources/common.hlsl
@@ -0,0 +1,96 @@
+#include "nbl/builtin/hlsl/cpp_compat.hlsl"
+#include "nbl/builtin/hlsl/functional.hlsl"
+
+template<uint32_t kScanElementCount=1024*1024>
+struct Output
+{
+	NBL_CONSTEXPR_STATIC_INLINE uint32_t ScanElementCount = kScanElementCount;
+
+	uint32_t subgroupSize;
+	uint32_t data[ScanElementCount];
+};
+
+// Thanks to our unified HLSL/C++ STD lib we're able to remove a whole load of code
+template<typename T>
+struct bit_and : nbl::hlsl::bit_and<T>
+{
+	using base_t = nbl::hlsl::bit_and<T>;
+
+	NBL_CONSTEXPR_STATIC_INLINE uint16_t BindingIndex = 0;
+#ifndef __HLSL_VERSION
+	static inline constexpr const char* name = "bit_and";
+#endif
+};
+template<typename T>
+struct bit_or : nbl::hlsl::bit_or<T>
+{
+	using base_t = nbl::hlsl::bit_or<T>;
+
+	NBL_CONSTEXPR_STATIC_INLINE uint16_t BindingIndex = 1;
+#ifndef __HLSL_VERSION
+	static inline constexpr const char* name = "bit_xor";
+#endif
+};
+template<typename T>
+struct bit_xor : nbl::hlsl::bit_xor<T>
+{
+	using base_t = nbl::hlsl::bit_xor<T>;
+
+	NBL_CONSTEXPR_STATIC_INLINE uint16_t BindingIndex = 2;
+#ifndef __HLSL_VERSION
+	static inline constexpr const char* name = "bit_or";
+#endif
+};
+template<typename T>
+struct plus : nbl::hlsl::plus<T>
+{
+	using base_t = nbl::hlsl::plus<T>;
+
+	NBL_CONSTEXPR_STATIC_INLINE uint16_t BindingIndex = 3;
+#ifndef __HLSL_VERSION
+	static inline constexpr const char* name = "plus";
+#endif
+};
+template<typename T>
+struct multiplies : nbl::hlsl::multiplies<T>
+{
+	using base_t = nbl::hlsl::multiplies<T>;
+
+	NBL_CONSTEXPR_STATIC_INLINE uint16_t BindingIndex = 4;
+#ifndef __HLSL_VERSION
+	static inline constexpr const char* name = "multiplies";
+#endif
+};
+template<typename T>
+struct minimum : nbl::hlsl::minimum<T>
+{
+	using base_t = nbl::hlsl::minimum<T>;
+
+	NBL_CONSTEXPR_STATIC_INLINE uint16_t BindingIndex = 5;
+#ifndef __HLSL_VERSION
+	static inline constexpr const char* name = "minimum";
+#endif
+};
+template<typename T>
+struct maximum : nbl::hlsl::maximum<T>
+{
+	using base_t = nbl::hlsl::maximum<T>;
+
+	NBL_CONSTEXPR_STATIC_INLINE uint16_t BindingIndex = 6;
+#ifndef __HLSL_VERSION
+	static inline constexpr const char* name = "maximum";
+#endif
+};
+
+template<typename T>
+struct ballot : nbl::hlsl::plus<T>
+{
+	using base_t = nbl::hlsl::plus<T>;
+
+	NBL_CONSTEXPR_STATIC_INLINE uint16_t BindingIndex = 7;
+#ifndef __HLSL_VERSION
+	static inline constexpr const char* name = "bitcount";
+#endif
+};
+
+#include "nbl/builtin/hlsl/subgroup/basic.hlsl"
\ No newline at end of file
diff --git a/71_ArithmeticBench/app_resources/shaderCommon.hlsl b/71_ArithmeticBench/app_resources/shaderCommon.hlsl
new file mode 100644
index 000000000..13ee8d21e
--- /dev/null
+++ b/71_ArithmeticBench/app_resources/shaderCommon.hlsl
@@ -0,0 +1,55 @@
+#include "common.hlsl"
+
+#include "nbl/builtin/hlsl/glsl_compat/core.hlsl"
+#include "nbl/builtin/hlsl/subgroup/basic.hlsl"
+#include "nbl/builtin/hlsl/subgroup/arithmetic_portability.hlsl"
+
+#include "nbl/builtin/hlsl/jit/device_capabilities.hlsl"
+
+// https://github.com/microsoft/DirectXShaderCompiler/issues/6144
+uint32_t3 nbl::hlsl::glsl::gl_WorkGroupSize() {return uint32_t3(WORKGROUP_SIZE,1,1);}
+
+// unfortunately DXC chokes on descriptors as static members
+// https://github.com/microsoft/DirectXShaderCompiler/issues/5940
+[[vk::binding(0, 0)]] StructuredBuffer<uint32_t> inputValue;
+[[vk::binding(1, 0)]] RWByteAddressBuffer output[8];
+
+// because subgroups don't match `gl_LocalInvocationIndex` snake curve addressing, we also can't load inputs that way
+uint32_t globalIndex();
+// since we test ITEMS_PER_WG<WorkgroupSize we need this so workgroups don't overwrite each other's outputs
+bool canStore();
+
+//typedef decltype(inputValue[0]) type_t;
+typedef uint32_t type_t;
+
+
+#ifndef OPERATION
+#error "Define OPERATION!"
+#endif
+template<template<class> class binop>
+static void subtest(NBL_CONST_REF_ARG(type_t) sourceVal)
+{
+	if (globalIndex()==0u)
+		output[binop<type_t>::BindingIndex].template Store<uint32_t>(0,nbl::hlsl::glsl::gl_SubgroupSize());
+		
+	operation_t<typename binop<type_t>::base_t,nbl::hlsl::jit::device_capabilities> func;
+	if (canStore())
+		output[binop<type_t>::BindingIndex].template Store<type_t>(sizeof(uint32_t)+sizeof(type_t)*globalIndex(),func(sourceVal));
+}
+
+
+type_t test()
+{
+	const type_t sourceVal = inputValue[globalIndex()];
+
+	subtest<bit_and>(sourceVal);
+	subtest<bit_xor>(sourceVal);
+	subtest<bit_or>(sourceVal);
+	subtest<plus>(sourceVal);
+	subtest<multiplies>(sourceVal);
+	subtest<minimum>(sourceVal);
+	subtest<maximum>(sourceVal);
+	return sourceVal;
+}
+
+#include "nbl/builtin/hlsl/workgroup/basic.hlsl"
\ No newline at end of file
diff --git a/71_ArithmeticBench/app_resources/testSubgroup.comp.hlsl b/71_ArithmeticBench/app_resources/testSubgroup.comp.hlsl
new file mode 100644
index 000000000..479265d73
--- /dev/null
+++ b/71_ArithmeticBench/app_resources/testSubgroup.comp.hlsl
@@ -0,0 +1,18 @@
+#pragma shader_stage(compute)
+
+#define operation_t nbl::hlsl::OPERATION
+
+#include "shaderCommon.hlsl"
+
+uint32_t globalIndex()
+{
+	return nbl::hlsl::glsl::gl_WorkGroupID().x*WORKGROUP_SIZE+nbl::hlsl::workgroup::SubgroupContiguousIndex();
+}
+
+bool canStore() {return true;}
+
+[numthreads(WORKGROUP_SIZE,1,1)]
+void main()
+{
+	test();
+}
\ No newline at end of file
diff --git a/71_ArithmeticBench/app_resources/testWorkgroup.comp.hlsl b/71_ArithmeticBench/app_resources/testWorkgroup.comp.hlsl
new file mode 100644
index 000000000..9bafae47f
--- /dev/null
+++ b/71_ArithmeticBench/app_resources/testWorkgroup.comp.hlsl
@@ -0,0 +1,107 @@
+#pragma shader_stage(compute)
+
+
+#include "nbl/builtin/hlsl/workgroup/scratch_size.hlsl"
+
+static const uint32_t ArithmeticSz = nbl::hlsl::workgroup::scratch_size_arithmetic<ITEMS_PER_WG>::value;
+static const uint32_t BallotSz = nbl::hlsl::workgroup::scratch_size_ballot<ITEMS_PER_WG>::value;
+static const uint32_t ScratchSz = ArithmeticSz+BallotSz;
+
+// TODO: Can we make it a static variable in the ScratchProxy struct?
+groupshared uint32_t scratch[ScratchSz];
+
+
+#include "nbl/builtin/hlsl/workgroup/arithmetic.hlsl"
+
+
+template<uint16_t offset>
+struct ScratchProxy
+{
+	void get(const uint32_t ix, NBL_REF_ARG(uint32_t) value)
+	{
+		value = scratch[ix+offset];
+	}
+	void set(const uint32_t ix, const uint32_t value)
+	{
+		scratch[ix+offset] = value;
+	}
+
+	uint32_t atomicOr(const uint32_t ix, const uint32_t value)
+	{
+		return nbl::hlsl::glsl::atomicOr(scratch[ix],value);
+	}
+
+	void workgroupExecutionAndMemoryBarrier()
+	{
+		nbl::hlsl::glsl::barrier();
+		//nbl::hlsl::glsl::memoryBarrierShared(); implied by the above
+	}
+};
+
+static ScratchProxy<0> arithmeticAccessor;
+
+
+#include "nbl/builtin/hlsl/workgroup/broadcast.hlsl"
+
+
+template<class Binop, class device_capabilities>
+struct operation_t
+{
+	using type_t = typename Binop::type_t;
+
+	type_t operator()(type_t value)
+	{
+		type_t retval = nbl::hlsl::OPERATION<Binop,ITEMS_PER_WG,device_capabilities>::template __call<ScratchProxy<0> >(value,arithmeticAccessor);
+		// we barrier before because we alias the accessors for Binop
+		arithmeticAccessor.workgroupExecutionAndMemoryBarrier();
+		return retval;
+	}
+};
+
+
+#include "shaderCommon.hlsl"
+
+static ScratchProxy<ArithmeticSz> ballotAccessor;
+
+
+uint32_t globalIndex()
+{
+	return nbl::hlsl::glsl::gl_WorkGroupID().x*ITEMS_PER_WG+nbl::hlsl::workgroup::SubgroupContiguousIndex();
+}
+
+bool canStore()
+{
+	return nbl::hlsl::workgroup::SubgroupContiguousIndex()<ITEMS_PER_WG;
+}
+
+[numthreads(WORKGROUP_SIZE,1,1)]
+void main()
+{
+	const type_t sourceVal = test();
+	if (globalIndex()==0u)
+		output[ballot<type_t>::BindingIndex].template Store<uint32_t>(0,nbl::hlsl::glsl::gl_SubgroupSize());
+
+	// we can only ballot booleans, so low bit
+	nbl::hlsl::workgroup::ballot<ScratchProxy<ArithmeticSz> >(bool(sourceVal & 0x1u), ballotAccessor);
+	// need to barrier between ballot and usages of a ballot by myself
+	ballotAccessor.workgroupExecutionAndMemoryBarrier();
+
+	uint32_t destVal = 0xdeadbeefu;
+#define CONSTEXPR_OP_TYPE_TEST(IS_OP) nbl::hlsl::is_same<nbl::hlsl::OPERATION<nbl::hlsl::bit_xor<float>,0x45>,nbl::hlsl::workgroup::IS_OP<nbl::hlsl::bit_xor<float>,0x45> >::value
+#define BALLOT_TEMPLATE_ARGS ITEMS_PER_WG,decltype(ballotAccessor),decltype(arithmeticAccessor),nbl::hlsl::jit::device_capabilities
+	if (CONSTEXPR_OP_TYPE_TEST(reduction))
+		destVal = nbl::hlsl::workgroup::ballotBitCount<BALLOT_TEMPLATE_ARGS>(ballotAccessor,arithmeticAccessor);
+	else if (CONSTEXPR_OP_TYPE_TEST(inclusive_scan))
+		destVal = nbl::hlsl::workgroup::ballotInclusiveBitCount<BALLOT_TEMPLATE_ARGS>(ballotAccessor,arithmeticAccessor);
+	else if (CONSTEXPR_OP_TYPE_TEST(exclusive_scan))
+		destVal = nbl::hlsl::workgroup::ballotExclusiveBitCount<BALLOT_TEMPLATE_ARGS>(ballotAccessor,arithmeticAccessor);
+	else
+	{
+		assert(false);
+	}
+#undef BALLOT_TEMPLATE_ARGS
+#undef CONSTEXPR_OP_TYPE_TEST
+
+	if (canStore())
+		output[ballot<type_t>::BindingIndex].template Store<type_t>(sizeof(uint32_t)+sizeof(type_t)*globalIndex(),destVal);
+}
\ No newline at end of file
diff --git a/71_ArithmeticBench/config.json.template b/71_ArithmeticBench/config.json.template
new file mode 100644
index 000000000..f961745c1
--- /dev/null
+++ b/71_ArithmeticBench/config.json.template
@@ -0,0 +1,28 @@
+{
+  "enableParallelBuild": true,
+  "threadsPerBuildProcess" : 2,
+  "isExecuted": false,
+  "scriptPath": "",
+  "cmake": {
+    "configurations": [ "Release", "Debug", "RelWithDebInfo" ],
+    "buildModes": [],
+    "requiredOptions": []
+  }, 
+  "profiles": [
+    {
+      "backend": "vulkan",
+      "platform": "windows",
+      "buildModes": [],
+      "runConfiguration": "Release",
+      "gpuArchitectures": []
+    }
+  ],
+  "dependencies": [],
+  "data": [
+    {
+      "dependencies": [],
+      "command": [""],
+      "outputs": []
+    }
+  ]
+}
\ No newline at end of file
diff --git a/71_ArithmeticBench/main.cpp b/71_ArithmeticBench/main.cpp
new file mode 100644
index 000000000..0952d2b57
--- /dev/null
+++ b/71_ArithmeticBench/main.cpp
@@ -0,0 +1,462 @@
+#include "nbl/application_templates/BasicMultiQueueApplication.hpp"
+#include "nbl/application_templates/MonoAssetManagerAndBuiltinResourceApplication.hpp"
+#include "app_resources/common.hlsl"
+
+using namespace nbl;
+using namespace core;
+using namespace asset;
+using namespace system;
+using namespace video;
+
+// method emulations on the CPU, to verify the results of the GPU methods
+template<class Binop>
+struct emulatedReduction
+{
+	using type_t = typename Binop::type_t;
+
+	static inline void impl(type_t* out, const type_t* in, const uint32_t itemCount)
+	{
+		const type_t red = std::reduce(in,in+itemCount,Binop::identity,Binop());
+		std::fill(out,out+itemCount,red);
+	}
+
+	static inline constexpr const char* name = "reduction";
+};
+template<class Binop>
+struct emulatedScanInclusive
+{
+	using type_t = typename Binop::type_t;
+
+	static inline void impl(type_t* out, const type_t* in, const uint32_t itemCount)
+	{
+		std::inclusive_scan(in,in+itemCount,out,Binop());
+	}
+	static inline constexpr const char* name = "inclusive_scan";
+};
+template<class Binop>
+struct emulatedScanExclusive
+{
+	using type_t = typename Binop::type_t;
+
+	static inline void impl(type_t* out, const type_t* in, const uint32_t itemCount)
+	{
+		std::exclusive_scan(in,in+itemCount,out,Binop::identity,Binop());
+	}
+	static inline constexpr const char* name = "exclusive_scan";
+};
+
+class ArithmeticBenchApp final : public application_templates::BasicMultiQueueApplication, public application_templates::MonoAssetManagerAndBuiltinResourceApplication
+{
+	using device_base_t = application_templates::BasicMultiQueueApplication;
+	using asset_base_t = application_templates::MonoAssetManagerAndBuiltinResourceApplication;
+
+public:
+	ArithmeticBenchApp(const path& _localInputCWD, const path& _localOutputCWD, const path& _sharedInputCWD, const path& _sharedOutputCWD) :
+		system::IApplicationFramework(_localInputCWD, _localOutputCWD, _sharedInputCWD, _sharedOutputCWD) {}
+
+	bool onAppInitialized(smart_refctd_ptr<ISystem>&& system) override
+	{
+		if (!device_base_t::onAppInitialized(std::move(system)))
+			return false;
+		if (!asset_base_t::onAppInitialized(std::move(system)))
+			return false;
+
+		transferDownQueue = getTransferDownQueue();
+		computeQueue = getComputeQueue();
+
+		// TODO: get the element count from argv
+		const uint32_t elementCount = Output<>::ScanElementCount;
+		// populate our random data buffer on the CPU and create a GPU copy
+		inputData = new uint32_t[elementCount];
+		smart_refctd_ptr<IGPUBuffer> gpuinputDataBuffer;
+		{
+			std::mt19937 randGenerator(0xdeadbeefu);
+			for (uint32_t i = 0u; i < elementCount; i++)
+				inputData[i] = randGenerator(); // TODO: change to using xoroshiro, then we can skip having the input buffer at all
+
+			IGPUBuffer::SCreationParams inputDataBufferCreationParams = {};
+			inputDataBufferCreationParams.size = sizeof(Output<>::data[0]) * elementCount;
+			inputDataBufferCreationParams.usage = IGPUBuffer::EUF_STORAGE_BUFFER_BIT | IGPUBuffer::EUF_TRANSFER_DST_BIT;
+			m_utils->createFilledDeviceLocalBufferOnDedMem(
+				SIntendedSubmitInfo{.queue=getTransferUpQueue()},
+				std::move(inputDataBufferCreationParams),
+				inputData
+			).move_into(gpuinputDataBuffer);
+		}
+
+		// create 8 buffers for 8 operations
+		for (auto i=0u; i<OutputBufferCount; i++)
+		{
+			IGPUBuffer::SCreationParams params = {};
+			params.size = sizeof(uint32_t) + gpuinputDataBuffer->getSize();
+			params.usage = bitflag(IGPUBuffer::EUF_STORAGE_BUFFER_BIT) | IGPUBuffer::EUF_TRANSFER_SRC_BIT;
+
+			outputBuffers[i] = m_device->createBuffer(std::move(params));
+			auto mreq = outputBuffers[i]->getMemoryReqs();
+			mreq.memoryTypeBits &= m_physicalDevice->getDeviceLocalMemoryTypeBits();
+			assert(mreq.memoryTypeBits);
+
+			auto bufferMem = m_device->allocate(mreq, outputBuffers[i].get());
+			assert(bufferMem.isValid());
+		}
+
+		// create Descriptor Set and Pipeline Layout
+		{
+			// create Descriptor Set Layout
+			smart_refctd_ptr<IGPUDescriptorSetLayout> dsLayout;
+			{
+				IGPUDescriptorSetLayout::SBinding binding[2];
+				for (uint32_t i = 0u; i < 2; i++)
+					binding[i] = {{},i,IDescriptor::E_TYPE::ET_STORAGE_BUFFER,IGPUDescriptorSetLayout::SBinding::E_CREATE_FLAGS::ECF_NONE,IShader::E_SHADER_STAGE::ESS_COMPUTE,1u,nullptr };
+				binding[1].count = OutputBufferCount;
+				dsLayout = m_device->createDescriptorSetLayout(binding);
+			}
+
+			// set and transient pool
+			auto descPool = m_device->createDescriptorPoolForDSLayouts(IDescriptorPool::ECF_NONE,{&dsLayout.get(),1});
+			descriptorSet = descPool->createDescriptorSet(smart_refctd_ptr(dsLayout));
+			{
+				IGPUDescriptorSet::SDescriptorInfo infos[1+OutputBufferCount];
+				infos[0].desc = gpuinputDataBuffer;
+				infos[0].info.buffer = { 0u,gpuinputDataBuffer->getSize() };
+				for (uint32_t i = 1u; i <= OutputBufferCount; i++)
+				{
+					auto buff = outputBuffers[i - 1];
+					infos[i].info.buffer = { 0u,buff->getSize() };
+					infos[i].desc = std::move(buff); // save an atomic in the refcount
+
+				}
+
+				IGPUDescriptorSet::SWriteDescriptorSet writes[2];
+				for (uint32_t i=0u; i<2; i++)
+					writes[i] = {descriptorSet.get(),i,0u,1u,infos+i};
+				writes[1].count = OutputBufferCount;
+
+				m_device->updateDescriptorSets(2, writes, 0u, nullptr);
+			}
+
+			pipelineLayout = m_device->createPipelineLayout({},std::move(dsLayout));
+		}
+
+		const auto spirv_isa_cache_path = localOutputCWD/"spirv_isa_cache.bin";
+		// enclose to make sure file goes out of scope and we can reopen it
+		{
+			smart_refctd_ptr<const IFile> spirv_isa_cache_input;
+			// try to load SPIR-V to ISA cache
+			{
+				ISystem::future_t<smart_refctd_ptr<IFile>> fileCreate;
+				m_system->createFile(fileCreate,spirv_isa_cache_path,IFile::ECF_READ|IFile::ECF_MAPPABLE|IFile::ECF_COHERENT);
+				if (auto lock=fileCreate.acquire())
+					spirv_isa_cache_input = *lock;
+			}
+			// create the cache
+			{
+				std::span<const uint8_t> spirv_isa_cache_data = {};
+				if (spirv_isa_cache_input)
+					spirv_isa_cache_data = {reinterpret_cast<const uint8_t*>(spirv_isa_cache_input->getMappedPointer()),spirv_isa_cache_input->getSize()};
+				else
+					m_logger->log("Failed to load SPIR-V 2 ISA cache!",ILogger::ELL_PERFORMANCE);
+				// Normally we'd deserialize a `ICPUPipelineCache` properly and pass that instead
+				m_spirv_isa_cache = m_device->createPipelineCache(spirv_isa_cache_data);
+			}
+		}
+		{
+			// TODO: rename `deleteDirectory` to just `delete`? and a `IFile::setSize()` ?
+			m_system->deleteDirectory(spirv_isa_cache_path);
+			ISystem::future_t<smart_refctd_ptr<IFile>> fileCreate;
+			m_system->createFile(fileCreate,spirv_isa_cache_path,IFile::ECF_WRITE);
+			// I can be relatively sure I'll succeed to acquire the future, the pointer to created file might be null though.
+			m_spirv_isa_cache_output=*fileCreate.acquire();
+			if (!m_spirv_isa_cache_output)
+				logFail("Failed to Create SPIR-V to ISA cache file.");
+		}
+
+		// load shader source from file
+		auto getShaderSource = [&](const char* filePath) -> auto
+		{
+			IAssetLoader::SAssetLoadParams lparams = {};
+			lparams.logger = m_logger.get();
+			lparams.workingDirectory = "";
+			auto bundle = m_assetMgr->getAsset(filePath, lparams);
+			if (bundle.getContents().empty() || bundle.getAssetType()!=IAsset::ET_SHADER)
+			{
+				m_logger->log("Shader %s not found!", ILogger::ELL_ERROR, filePath);
+				exit(-1);
+			}
+			auto firstAssetInBundle = bundle.getContents()[0];
+			return smart_refctd_ptr_static_cast<ICPUShader>(firstAssetInBundle);
+		};
+
+		auto subgroupTestSource = getShaderSource("app_resources/testSubgroup.comp.hlsl");
+		auto workgroupTestSource = getShaderSource("app_resources/testWorkgroup.comp.hlsl");
+		// now create or retrieve final resources to run our tests
+		sema = m_device->createSemaphore(timelineValue);
+		resultsBuffer = ICPUBuffer::create({ outputBuffers[0]->getSize() });
+		{
+			smart_refctd_ptr<nbl::video::IGPUCommandPool> cmdpool = m_device->createCommandPool(computeQueue->getFamilyIndex(),IGPUCommandPool::CREATE_FLAGS::RESET_COMMAND_BUFFER_BIT);
+			if (!cmdpool->createCommandBuffers(IGPUCommandPool::BUFFER_LEVEL::PRIMARY,{&cmdbuf,1}))
+			{
+				logFail("Failed to create Command Buffers!\n");
+				return false;
+			}
+		}
+
+		const auto MaxWorkgroupSize = m_physicalDevice->getLimits().maxComputeWorkGroupInvocations;
+		const auto MinSubgroupSize = m_physicalDevice->getLimits().minSubgroupSize;
+		const auto MaxSubgroupSize = m_physicalDevice->getLimits().maxSubgroupSize;
+		for (auto subgroupSize=MinSubgroupSize; subgroupSize <= MaxSubgroupSize; subgroupSize *= 2u)
+		{
+			const uint8_t subgroupSizeLog2 = hlsl::findMSB(subgroupSize);
+			for (uint32_t workgroupSize = subgroupSize; workgroupSize <= MaxWorkgroupSize; workgroupSize += subgroupSize)
+			{
+				// make sure renderdoc captures everything for debugging
+				m_api->startCapture();
+				m_logger->log("Testing Workgroup Size %u with Subgroup Size %u", ILogger::ELL_INFO, workgroupSize, subgroupSize);
+
+				bool passed = true;
+				// TODO async the testing
+				passed = runTest<emulatedReduction, false>(subgroupTestSource, elementCount, subgroupSizeLog2, workgroupSize) && passed;
+				logTestOutcome(passed, workgroupSize);
+				passed = runTest<emulatedScanInclusive, false>(subgroupTestSource, elementCount, subgroupSizeLog2, workgroupSize) && passed;
+				logTestOutcome(passed, workgroupSize);
+				passed = runTest<emulatedScanExclusive, false>(subgroupTestSource, elementCount, subgroupSizeLog2, workgroupSize) && passed;
+				logTestOutcome(passed, workgroupSize);
+				for (uint32_t itemsPerWG = workgroupSize; itemsPerWG > workgroupSize - subgroupSize; itemsPerWG--)
+				{
+					m_logger->log("Testing Item Count %u", ILogger::ELL_INFO, itemsPerWG);
+					passed = runTest<emulatedReduction, true>(workgroupTestSource, elementCount, subgroupSizeLog2, workgroupSize, itemsPerWG) && passed;
+					logTestOutcome(passed, itemsPerWG);
+					passed = runTest<emulatedScanInclusive, true>(workgroupTestSource, elementCount, subgroupSizeLog2, workgroupSize, itemsPerWG) && passed;
+					logTestOutcome(passed, itemsPerWG);
+					passed = runTest<emulatedScanExclusive, true>(workgroupTestSource, elementCount, subgroupSizeLog2, workgroupSize, itemsPerWG) && passed;
+					logTestOutcome(passed, itemsPerWG);
+				}
+				m_api->endCapture();
+
+				// save cache every now and then	
+				{
+					auto cpu = m_spirv_isa_cache->convertToCPUCache();
+					// Normally we'd beautifully JSON serialize the thing, allow multiple devices & drivers + metadata
+					auto bin = cpu->getEntries().begin()->second.bin;
+					IFile::success_t success;
+					m_spirv_isa_cache_output->write(success,bin->data(),0ull,bin->size());
+					if (!success)
+						logFail("Could not write Create SPIR-V to ISA cache to disk!");
+				}
+			}
+		}
+
+		return true;
+	}
+
+	virtual bool onAppTerminated() override
+	{
+		m_logger->log("==========Result==========", ILogger::ELL_INFO);
+		m_logger->log("Fail Count: %u", ILogger::ELL_INFO, totalFailCount);
+		delete[] inputData;
+		return true;
+	}
+
+	// the unit test is carried out on init
+	void workLoopBody() override {}
+
+	//
+	bool keepRunning() override { return false; }
+
+private:
+	void logTestOutcome(bool passed, uint32_t workgroupSize)
+	{
+		if (passed)
+			m_logger->log("Passed test #%u", ILogger::ELL_INFO, workgroupSize);
+		else
+		{
+			totalFailCount++;
+			m_logger->log("Failed test #%u", ILogger::ELL_ERROR, workgroupSize);
+		}
+	}
+
+	// create pipeline (specialized every test) [TODO: turn into a future/async]
+	smart_refctd_ptr<IGPUComputePipeline> createPipeline(const ICPUShader* overridenUnspecialized, const uint8_t subgroupSizeLog2)
+	{
+		auto shader = m_device->createShader(overridenUnspecialized);
+		IGPUComputePipeline::SCreationParams params = {};
+		params.layout = pipelineLayout.get();
+		params.shader = {
+			.entryPoint = "main",
+			.shader = shader.get(),
+			.entries = nullptr,
+			.requiredSubgroupSize = static_cast<IGPUShader::SSpecInfo::SUBGROUP_SIZE>(subgroupSizeLog2),
+			.requireFullSubgroups = true
+		};
+		core::smart_refctd_ptr<IGPUComputePipeline> pipeline;
+		if (!m_device->createComputePipelines(m_spirv_isa_cache.get(),{&params,1},&pipeline))
+			return nullptr;
+		return pipeline;
+	}
+
+	/*template<template<class> class Arithmetic, bool WorkgroupTest>
+	bool runTest(const smart_refctd_ptr<const ICPUShader>& source, const uint32_t elementCount, const uint32_t workgroupSize, uint32_t itemsPerWG = ~0u)
+	{
+		return true;
+	}*/
+
+	template<template<class> class Arithmetic, bool WorkgroupTest>
+	bool runTest(const smart_refctd_ptr<const ICPUShader>& source, const uint32_t elementCount, const uint8_t subgroupSizeLog2, const uint32_t workgroupSize, uint32_t itemsPerWG = ~0u)
+	{
+		std::string arith_name = Arithmetic<bit_xor<float>>::name;
+
+		smart_refctd_ptr<ICPUShader> overridenUnspecialized;
+		if constexpr (WorkgroupTest)
+		{
+			overridenUnspecialized = CHLSLCompiler::createOverridenCopy(
+				source.get(), "#define OPERATION %s\n#define WORKGROUP_SIZE %d\n#define ITEMS_PER_WG %d\n",
+				(("workgroup::") + arith_name).c_str(), workgroupSize, itemsPerWG
+			);
+		}
+		else
+		{
+			itemsPerWG = workgroupSize;
+			overridenUnspecialized = CHLSLCompiler::createOverridenCopy(
+				source.get(), "#define OPERATION %s\n#define WORKGROUP_SIZE %d\n",
+				(("subgroup::") + arith_name).c_str(), workgroupSize
+			);
+		}
+		auto pipeline = createPipeline(overridenUnspecialized.get(),subgroupSizeLog2);
+
+		// TODO: overlap dispatches with memory readbacks (requires multiple copies of `buffers`)
+		const uint32_t workgroupCount = elementCount / itemsPerWG;
+		cmdbuf->begin(IGPUCommandBuffer::USAGE::NONE);
+		cmdbuf->bindComputePipeline(pipeline.get());
+		cmdbuf->bindDescriptorSets(EPBP_COMPUTE, pipeline->getLayout(), 0u, 1u, &descriptorSet.get());
+		cmdbuf->dispatch(workgroupCount, 1, 1);
+		{
+			IGPUCommandBuffer::SPipelineBarrierDependencyInfo::buffer_barrier_t memoryBarrier[OutputBufferCount];
+			for (auto i=0u; i<OutputBufferCount; i++)
+			{
+				memoryBarrier[i] = {
+					.barrier = {
+						.dep = {
+							.srcStageMask = PIPELINE_STAGE_FLAGS::COMPUTE_SHADER_BIT,
+							.srcAccessMask = ACCESS_FLAGS::SHADER_WRITE_BITS,
+							// in theory we don't need the HOST BITS cause we block on a semaphore but might as well add them
+							.dstStageMask = PIPELINE_STAGE_FLAGS::COMPUTE_SHADER_BIT|PIPELINE_STAGE_FLAGS::HOST_BIT,
+							.dstAccessMask = ACCESS_FLAGS::SHADER_WRITE_BITS|ACCESS_FLAGS::HOST_READ_BIT
+						}
+					},
+					.range = {0ull,outputBuffers[i]->getSize(),outputBuffers[i]}
+				};
+			}
+			IGPUCommandBuffer::SPipelineBarrierDependencyInfo info = {.memBarriers={},.bufBarriers=memoryBarrier};
+			cmdbuf->pipelineBarrier(asset::E_DEPENDENCY_FLAGS::EDF_NONE,info);
+		}
+		cmdbuf->end();
+
+		const IQueue::SSubmitInfo::SSemaphoreInfo signal[1] = {{.semaphore=sema.get(),.value=++timelineValue}};
+		const IQueue::SSubmitInfo::SCommandBufferInfo cmdbufs[1] = {{.cmdbuf=cmdbuf.get()}};
+		const IQueue::SSubmitInfo submits[1] = {{.commandBuffers=cmdbufs,.signalSemaphores=signal}};
+		computeQueue->submit(submits);
+		const ISemaphore::SWaitInfo wait[1] = {{.semaphore=sema.get(),.value=timelineValue}};
+		m_device->blockForSemaphores(wait);
+
+		// check results
+		bool passed = validateResults<Arithmetic, bit_and<uint32_t>, WorkgroupTest>(itemsPerWG, workgroupCount);
+		passed = validateResults<Arithmetic, bit_xor<uint32_t>, WorkgroupTest>(itemsPerWG, workgroupCount) && passed;
+		passed = validateResults<Arithmetic, bit_or<uint32_t>, WorkgroupTest>(itemsPerWG, workgroupCount) && passed;
+		passed = validateResults<Arithmetic, plus<uint32_t>, WorkgroupTest>(itemsPerWG, workgroupCount) && passed;
+		passed = validateResults<Arithmetic, multiplies<uint32_t>, WorkgroupTest>(itemsPerWG, workgroupCount) && passed;
+		passed = validateResults<Arithmetic, minimum<uint32_t>, WorkgroupTest>(itemsPerWG, workgroupCount) && passed;
+		passed = validateResults<Arithmetic, maximum<uint32_t>, WorkgroupTest>(itemsPerWG, workgroupCount) && passed;
+		if constexpr (WorkgroupTest)
+			passed = validateResults<Arithmetic, ballot<uint32_t>, WorkgroupTest>(itemsPerWG, workgroupCount) && passed;
+
+		return passed;
+	}
+
+	//returns true if result matches
+	template<template<class> class Arithmetic, class Binop, bool WorkgroupTest>
+	bool validateResults(const uint32_t itemsPerWG, const uint32_t workgroupCount)
+	{
+		bool success = true;
+
+		// download data
+		const SBufferRange<IGPUBuffer> bufferRange = {0u, resultsBuffer->getSize(), outputBuffers[Binop::BindingIndex]};
+		m_utils->downloadBufferRangeViaStagingBufferAutoSubmit(SIntendedSubmitInfo{.queue=transferDownQueue},bufferRange,resultsBuffer->getPointer());
+
+		using type_t = typename Binop::type_t;
+		const auto dataFromBuffer = reinterpret_cast<const uint32_t*>(resultsBuffer->getPointer());
+		const auto subgroupSize = dataFromBuffer[0];
+		if (subgroupSize<nbl::hlsl::subgroup::MinSubgroupSize || subgroupSize>nbl::hlsl::subgroup::MaxSubgroupSize)
+		{
+			m_logger->log("Unexpected Subgroup Size %u", ILogger::ELL_ERROR, subgroupSize);
+			return false;
+		}
+
+		const auto testData = reinterpret_cast<const type_t*>(dataFromBuffer + 1);
+		// TODO: parallel for (the temporary values need to be threadlocal or what?)
+		// now check if the data obtained has valid values
+		type_t* tmp = new type_t[itemsPerWG];
+		type_t* ballotInput = new type_t[itemsPerWG];
+		for (uint32_t workgroupID = 0u; success && workgroupID < workgroupCount; workgroupID++)
+		{
+			const auto workgroupOffset = workgroupID * itemsPerWG;
+
+			if constexpr (WorkgroupTest)
+			{
+				if constexpr (std::is_same_v<ballot<type_t>, Binop>)
+				{
+					for (auto i = 0u; i < itemsPerWG; i++)
+						ballotInput[i] = inputData[i + workgroupOffset] & 0x1u;
+					Arithmetic<Binop>::impl(tmp, ballotInput, itemsPerWG);
+				}
+				else
+					Arithmetic<Binop>::impl(tmp, inputData + workgroupOffset, itemsPerWG);
+			}
+			else
+			{
+				for (uint32_t pseudoSubgroupID = 0u; pseudoSubgroupID < itemsPerWG; pseudoSubgroupID += subgroupSize)
+					Arithmetic<Binop>::impl(tmp + pseudoSubgroupID, inputData + workgroupOffset + pseudoSubgroupID, subgroupSize);
+			}
+
+			for (uint32_t localInvocationIndex = 0u; localInvocationIndex < itemsPerWG; localInvocationIndex++)
+			{
+				const auto globalInvocationIndex = workgroupOffset + localInvocationIndex;
+				const auto cpuVal = tmp[localInvocationIndex];
+				const auto gpuVal = testData[globalInvocationIndex];
+				if (cpuVal != gpuVal)
+				{
+					m_logger->log(
+						"Failed test #%d  (%s)  (%s) Expected %u got %u for workgroup %d and localinvoc %d",
+						ILogger::ELL_ERROR, itemsPerWG, WorkgroupTest ? "workgroup" : "subgroup", Binop::name,
+						cpuVal, gpuVal, workgroupID, localInvocationIndex
+					);
+					success = false;
+					break;
+				}
+			}
+		}
+		delete[] ballotInput;
+		delete[] tmp;
+
+		return success;
+	}
+
+	IQueue* transferDownQueue;
+	IQueue* computeQueue;
+	smart_refctd_ptr<IGPUPipelineCache> m_spirv_isa_cache;
+	smart_refctd_ptr<IFile> m_spirv_isa_cache_output;
+
+	uint32_t* inputData = nullptr;
+	constexpr static inline uint32_t OutputBufferCount = 8u;
+	smart_refctd_ptr<IGPUBuffer> outputBuffers[OutputBufferCount];
+	smart_refctd_ptr<IGPUDescriptorSet> descriptorSet;
+	smart_refctd_ptr<IGPUPipelineLayout> pipelineLayout;
+
+	smart_refctd_ptr<ISemaphore> sema;
+	uint64_t timelineValue = 0;
+	smart_refctd_ptr<IGPUCommandBuffer> cmdbuf;
+	smart_refctd_ptr<ICPUBuffer> resultsBuffer;
+
+	uint32_t totalFailCount = 0;
+};
+
+NBL_MAIN_FUNC(ArithmeticBenchApp)
\ No newline at end of file
diff --git a/71_ArithmeticBench/pipeline.groovy b/71_ArithmeticBench/pipeline.groovy
new file mode 100644
index 000000000..7ea9947e0
--- /dev/null
+++ b/71_ArithmeticBench/pipeline.groovy
@@ -0,0 +1,50 @@
+import org.DevshGraphicsProgramming.Agent
+import org.DevshGraphicsProgramming.BuilderInfo
+import org.DevshGraphicsProgramming.IBuilder
+
+class CArithemticUnitTestBuilder extends IBuilder
+{
+	public CArithemticUnitTestBuilder(Agent _agent, _info)
+	{
+		super(_agent, _info)
+	}
+	
+	@Override
+	public boolean prepare(Map axisMapping)
+	{
+		return true
+	}
+	
+	@Override
+  	public boolean build(Map axisMapping)
+	{
+		IBuilder.CONFIGURATION config = axisMapping.get("CONFIGURATION")
+		IBuilder.BUILD_TYPE buildType = axisMapping.get("BUILD_TYPE")
+		
+		def nameOfBuildDirectory = getNameOfBuildDirectory(buildType)
+		def nameOfConfig = getNameOfConfig(config)
+		
+		agent.execute("cmake --build ${info.rootProjectPath}/${nameOfBuildDirectory}/${info.targetProjectPathRelativeToRoot} --target ${info.targetBaseName} --config ${nameOfConfig} -j12 -v")
+		
+		return true
+	}
+	
+	@Override
+  	public boolean test(Map axisMapping)
+	{
+		return true
+	}
+	
+	@Override
+	public boolean install(Map axisMapping)
+	{
+		return true
+	}
+}
+
+def create(Agent _agent, _info)
+{
+	return new CArithemticUnitTestBuilder(_agent, _info)
+}
+
+return this
\ No newline at end of file
diff --git a/CMakeLists.txt b/CMakeLists.txt
index fb03f95a4..4434eacc1 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -95,7 +95,9 @@ if(NBL_BUILD_EXAMPLES)
 	add_subdirectory(67_RayQueryGeometry EXCLUDE_FROM_ALL)
 	add_subdirectory(68_JpegLoading EXCLUDE_FROM_ALL)
 
-  add_subdirectory(70_FLIPFluids EXCLUDE_FROM_ALL)
+  	add_subdirectory(70_FLIPFluids EXCLUDE_FROM_ALL)
+	
+	add_subdirectory(71_ArithmeticBench EXCLUDE_FROM_ALL)
 
 	NBL_HOOK_COMMON_API("${NBL_COMMON_API_TARGETS}")
 endif()

From 78c716c7b77099220955ef556f51dd4bca92912e Mon Sep 17 00:00:00 2001
From: Przemek <minikers21@gmail.com>
Date: Thu, 27 Mar 2025 13:40:54 +0100
Subject: [PATCH 111/529] Fixes

---
 62_CAD/CTriangleMesh.h                        |  6 +-
 62_CAD/DrawResourcesFiller.cpp                | 31 ++++---
 62_CAD/main.cpp                               | 25 ++---
 62_CAD/shaders/globals.hlsl                   |  6 +-
 62_CAD/shaders/main_pipeline/common.hlsl      |  6 +-
 .../main_pipeline/fragment_shader.hlsl        | 91 +++++++++----------
 .../shaders/main_pipeline/vertex_shader.hlsl  | 13 +--
 7 files changed, 88 insertions(+), 90 deletions(-)

diff --git a/62_CAD/CTriangleMesh.h b/62_CAD/CTriangleMesh.h
index 34fc243f7..374fae1b4 100644
--- a/62_CAD/CTriangleMesh.h
+++ b/62_CAD/CTriangleMesh.h
@@ -94,15 +94,15 @@ class CTriangleMesh final
 		return m_indices;
 	}
 
-	inline size_t getVtxBuffByteSize() const
+	inline size_t getVertexBuffByteSize() const
 	{
 		return sizeof(vertex_t) * m_vertices.size();
 	}
-	inline size_t getIdxBuffByteSize() const
+	inline size_t getIndexBuffByteSize() const
 	{
 		return sizeof(index_t) * m_indices.size();
 	}
-	inline size_t getIdxCnt() const
+	inline size_t getIndexCount() const
 	{
 		return m_indices.size();
 	}
diff --git a/62_CAD/DrawResourcesFiller.cpp b/62_CAD/DrawResourcesFiller.cpp
index 5e0c85260..58c4d0c72 100644
--- a/62_CAD/DrawResourcesFiller.cpp
+++ b/62_CAD/DrawResourcesFiller.cpp
@@ -84,7 +84,7 @@ void DrawResourcesFiller::allocateGeometryBuffer(ILogicalDevice* logicalDevice,
 
 	IGPUBuffer::SCreationParams geometryCreationParams = {};
 	geometryCreationParams.size = size;
-	geometryCreationParams.usage = bitflag(IGPUBuffer::EUF_STORAGE_BUFFER_BIT) | IGPUBuffer::EUF_SHADER_DEVICE_ADDRESS_BIT | IGPUBuffer::EUF_TRANSFER_DST_BIT | IGPUBuffer::EUF_INDEX_BUFFER_BIT;
+	geometryCreationParams.usage = bitflag(IGPUBuffer::EUF_STORAGE_BUFFER_BIT) | IGPUBuffer::EUF_SHADER_DEVICE_ADDRESS_BIT | IGPUBuffer::EUF_TRANSFER_DST_BIT | IGPUBuffer::EUF_INDEX_BUFFER_BIT; // INDEX_BUFFER USAGE for DTMs
 	gpuDrawBuffers.geometryBuffer = logicalDevice->createBuffer(std::move(geometryCreationParams));
 	gpuDrawBuffers.geometryBuffer->setObjectDebugName("geometryBuffer");
 
@@ -241,18 +241,18 @@ void DrawResourcesFiller::drawTriangleMesh(const CTriangleMesh& mesh, CTriangleM
 	ICPUBuffer::SCreationParams geometryBuffParams;
 	
 	// concatenate the index and vertex buffer into the geometry buffer
-	const size_t indexBuffByteSize = mesh.getIdxBuffByteSize();
-	const size_t vtxBuffByteSize = mesh.getVtxBuffByteSize();
+	const size_t indexBuffByteSize = mesh.getIndexBuffByteSize();
+	const size_t vtxBuffByteSize = mesh.getVertexBuffByteSize();
 	const size_t geometryBufferDataToAddByteSize = indexBuffByteSize + vtxBuffByteSize;
 
 	// copy into gemoetry cpu buffer insteaed
 
 	// TODO: rename, its not just points
-	const uint32_t maxGeometryBufferPoints = static_cast<uint32_t>(maxGeometryBufferSize - currentGeometryBufferSize);
+	const uint32_t remainingGeometryBufferSize = static_cast<uint32_t>(maxGeometryBufferSize - currentGeometryBufferSize);
 
-	// TODO: assert of geometry buffer size, do i need to check if size of objects to be added <= maxGeometryBufferPoints?
+	// TODO: assert of geometry buffer size, do i need to check if size of objects to be added <= remainingGeometryBufferSize?
 	// TODO: auto submit instead of assert
-	assert(geometryBufferDataToAddByteSize <= maxGeometryBufferPoints);
+	assert(geometryBufferDataToAddByteSize <= remainingGeometryBufferSize);
 
 	// TODO: vertices need to be aligned to 8?
 	uint64_t vtxBufferAddress;
@@ -270,7 +270,7 @@ void DrawResourcesFiller::drawTriangleMesh(const CTriangleMesh& mesh, CTriangleM
 		currentGeometryBufferSize += vtxBuffByteSize;
 	}
 
-	drawData.indexCount = mesh.getIdxCnt();
+	drawData.indexCount = mesh.getIndexCount();
 
 	// call addMainObject_SubmitIfNeeded, use its index in push constants
 
@@ -394,6 +394,7 @@ uint32_t DrawResourcesFiller::addLineStyle_SubmitIfNeeded(const LineStyleInfo& l
 		resetGeometryCounters();
 		resetMainObjectCounters();
 		resetLineStyleCounters();
+		resetDTMSettingsCounters();
 		outLineStyleIdx = addLineStyle_Internal(lineStyle);
 		assert(outLineStyleIdx != InvalidStyleIdx);
 	}
@@ -410,6 +411,7 @@ uint32_t DrawResourcesFiller::addDTMSettings_SubmitIfNeeded(const DTMSettingsInf
 		resetGeometryCounters();
 		resetMainObjectCounters();
 		resetLineStyleCounters();
+		resetDTMSettingsCounters();
 		outDTMSettingIdx = addDTMSettings_Internal(dtmSettings, intendedNextSubmit);
 		assert(outDTMSettingIdx != InvalidDTMSettingsIdx);
 	}
@@ -538,9 +540,9 @@ bool DrawResourcesFiller::finalizeLineStyleCopiesToGPU(SIntendedSubmitInfo& inte
 bool DrawResourcesFiller::finalizeDTMSettingsCopiesToGPU(SIntendedSubmitInfo& intendedNextSubmit)
 {
 	bool success = true;
-	// Copy LineStyles
-	uint32_t remainingLineStyles = currentDTMSettingsCount - inMemDTMSettingsCount;
-	SBufferRange<IGPUBuffer> dtmSettingsRange = { sizeof(DTMSettings) * inMemDTMSettingsCount, sizeof(DTMSettings) * remainingLineStyles, gpuDrawBuffers.dtmSettingsBuffer };
+	// Copy DTM settings
+	uint32_t remainingDTMSettings = currentDTMSettingsCount - inMemDTMSettingsCount;
+	SBufferRange<IGPUBuffer> dtmSettingsRange = { sizeof(DTMSettings) * inMemDTMSettingsCount, sizeof(DTMSettings) * remainingDTMSettings, gpuDrawBuffers.dtmSettingsBuffer };
 	if (dtmSettingsRange.size > 0u)
 	{
 		const DTMSettings* srcDTMSettingsData = reinterpret_cast<DTMSettings*>(cpuDrawBuffers.dtmSettingsBuffer->getPointer()) + inMemDTMSettingsCount;
@@ -794,10 +796,13 @@ uint32_t DrawResourcesFiller::addDTMSettings_Internal(const DTMSettingsInfo& dtm
 	dtmSettings.contourLinesEndHeight = dtmSettingsInfo.contourLinesEndHeight;
 	dtmSettings.contourLinesHeightInterval = dtmSettingsInfo.contourLinesHeightInterval;
 
-	// TODO: this needs to be redone.. what if submit happens after that line?
-	// we need to make sure somehow that function below will not submit, we need both outline and contour styles in GPU memory
+	if (currentLineStylesCount + 2 > maxLineStyles)
+		return InvalidDTMSettingsIdx;
+
+	assert(currentLineStylesCount + 2 <= maxLineStyles);
 	dtmSettings.outlineLineStyleIdx = addLineStyle_SubmitIfNeeded(dtmSettingsInfo.outlineLineStyleInfo, intendedNextSubmit);
 	dtmSettings.contourLineStyleIdx = addLineStyle_SubmitIfNeeded(dtmSettingsInfo.contourLineStyleInfo, intendedNextSubmit);
+
 	switch (dtmSettingsInfo.heightShadingMode)
 	{
 	case DTMSettingsInfo::E_HEIGHT_SHADING_MODE::DISCRETE_VARIABLE_LENGTH_INTERVALS:
@@ -864,7 +869,7 @@ uint64_t DrawResourcesFiller::addClipProjectionData_Internal(const ClipProjectio
 	if (maxGeometryBufferClipProjData <= 0)
 		return InvalidClipProjectionAddress;
 	
-	void* dst = reinterpret_cast<char*>(cpuDrawBuffers.geometryBuffer->getPointer()) + currentGeometryBufferSize;
+	uint8_t* dst = reinterpret_cast<uint8_t*>(cpuDrawBuffers.geometryBuffer->getPointer()) + currentGeometryBufferSize;
 	memcpy(dst, &clipProjectionData, sizeof(ClipProjectionData));
 
 	const uint64_t ret = currentGeometryBufferSize + geometryBufferAddress;
diff --git a/62_CAD/main.cpp b/62_CAD/main.cpp
index bfd346022..17afb122a 100644
--- a/62_CAD/main.cpp
+++ b/62_CAD/main.cpp
@@ -646,7 +646,6 @@ class ComputerAidedDesign final : public examples::SimpleWindowedApplication, pu
 	uint32_t m_hatchDebugStep = 0u;
 	DTMSettingsInfo::E_HEIGHT_SHADING_MODE m_shadingModeExample = DTMSettingsInfo::E_HEIGHT_SHADING_MODE::DISCRETE_VARIABLE_LENGTH_INTERVALS;
 
-
 	inline bool onAppInitialized(smart_refctd_ptr<ISystem>&& system) override
 	{
 		m_inputSystem = make_smart_refctd_ptr<InputSystem>(logger_opt_smart_ptr(smart_refctd_ptr(m_logger)));
@@ -3330,11 +3329,11 @@ class ComputerAidedDesign final : public examples::SimpleWindowedApplication, pu
 			};*/
 
 			core::vector<TriangleMeshVertex> vertices = {
-				{ float32_t2(0.0f, 0.0f), 100.0f },
-				{ float32_t2(-200.0f, -200.0f), 10.0f },
-				{ float32_t2(200.0f, -200.0f), 10.0f },
-				{ float32_t2(200.0f, 200.0f), -20.0f },
-				{ float32_t2(-200.0f, 200.0f), 10.0f },
+				{ float32_t2(0.0, 0.0), 100.0 },
+				{ float32_t2(-200.0, -200.0), 10.0 },
+				{ float32_t2(200.0, -200.0), 10.0 },
+				{ float32_t2(200.0, 200.0), -20.0 },
+				{ float32_t2(-200.0, 200.0), 10.0 },
 			};
 
 			core::vector<uint32_t> indices = {
@@ -3376,20 +3375,22 @@ class ComputerAidedDesign final : public examples::SimpleWindowedApplication, pu
 				case DTMSettingsInfo::E_HEIGHT_SHADING_MODE::DISCRETE_VARIABLE_LENGTH_INTERVALS:
 				{
 					dtmSettingsInfo.heightShadingMode = DTMSettingsInfo::E_HEIGHT_SHADING_MODE::DISCRETE_VARIABLE_LENGTH_INTERVALS;
+					
+					float animatedAlpha = (std::cos(m_timeElapsed * 0.0005) + 1.0) * 0.5;
+					dtmSettingsInfo.addHeightColorMapEntry(-10.0f, float32_t4(1.0f, 1.0f, 1.0f, 1.0f));
 					dtmSettingsInfo.addHeightColorMapEntry(20.0f, float32_t4(0.5f, 1.0f, 1.0f, 1.0f));
 					dtmSettingsInfo.addHeightColorMapEntry(25.0f, float32_t4(0.0f, 1.0f, 0.0f, 1.0f));
-					dtmSettingsInfo.addHeightColorMapEntry(70.0f, float32_t4(1.0f, 1.0f, 0.0f, 1.0f));
-					dtmSettingsInfo.addHeightColorMapEntry(80.0f, float32_t4(1.0f, 0.0f, 0.0f, 1.0f));
+					dtmSettingsInfo.addHeightColorMapEntry(70.0f, float32_t4(1.0f, 1.0f, 0.0f, animatedAlpha));
+					dtmSettingsInfo.addHeightColorMapEntry(100.0f, float32_t4(1.0f, 0.0f, 0.0f, 1.0f));
 					break;
 				}
 				case DTMSettingsInfo::E_HEIGHT_SHADING_MODE::DISCRETE_FIXED_LENGTH_INTERVALS:
 				{
 					dtmSettingsInfo.intervalWidth = 8.0f;
 					dtmSettingsInfo.heightShadingMode = DTMSettingsInfo::E_HEIGHT_SHADING_MODE::DISCRETE_FIXED_LENGTH_INTERVALS;
-					float animatedAlpha = (std::cos(m_timeElapsed * 0.0003) + 1.0) * 0.5;
-					dtmSettingsInfo.addHeightColorMapEntry(0.0f, float32_t4(0.0f, 1.0f, 0.0f, animatedAlpha));
-					dtmSettingsInfo.addHeightColorMapEntry(50.0f, float32_t4(1.0f, 1.0f, 0.0f, animatedAlpha));
-					dtmSettingsInfo.addHeightColorMapEntry(100.0f, float32_t4(1.0f, 0.0f, 0.0f, animatedAlpha));
+					dtmSettingsInfo.addHeightColorMapEntry(0.0f, float32_t4(0.0f, 1.0f, 0.0f, 1.0f));
+					dtmSettingsInfo.addHeightColorMapEntry(50.0f, float32_t4(1.0f, 1.0f, 0.0f, 1.0f));
+					dtmSettingsInfo.addHeightColorMapEntry(100.0f, float32_t4(1.0f, 0.0f, 0.0f, 1.0f));
 					break;
 				}
 				case DTMSettingsInfo::E_HEIGHT_SHADING_MODE::CONTINOUS_INTERVALS:
diff --git a/62_CAD/shaders/globals.hlsl b/62_CAD/shaders/globals.hlsl
index 84f9416e3..8412b29ad 100644
--- a/62_CAD/shaders/globals.hlsl
+++ b/62_CAD/shaders/globals.hlsl
@@ -126,7 +126,7 @@ enum class MajorAxis : uint32_t
 struct MainObject
 {
     uint32_t styleIdx;
-    uint32_t dtmSettingsIdx; // do I even need this on the gpu side? it's stored in structured buffer not bda
+    uint32_t dtmSettingsIdx;
     uint64_t clipProjectionAddress;
 };
 
@@ -273,8 +273,8 @@ NBL_CONSTEXPR float InvalidStyleStretchValue = nbl::hlsl::numeric_limits<float>:
 
 struct TriangleMeshVertex
 {
-    float32_t2 pos;
-    float32_t height;
+    pfloat64_t2 pos;
+    pfloat64_t height;
 };
 
 // The color parameter is also used for styling non-curve objects such as text glyphs and hatches with solid color
diff --git a/62_CAD/shaders/main_pipeline/common.hlsl b/62_CAD/shaders/main_pipeline/common.hlsl
index b2fcda9c2..261e336f3 100644
--- a/62_CAD/shaders/main_pipeline/common.hlsl
+++ b/62_CAD/shaders/main_pipeline/common.hlsl
@@ -226,9 +226,9 @@ struct PSInput
     float getHeight() { return interp_data5.x; }
 
 #ifndef FRAGMENT_SHADER_INPUT // vertex shader
-    void setScreenSpaceVertexPos(float3 pos) { vertexScreenSpacePos = pos; }
+    void setScreenSpaceVertexAttribs(float3 pos) { vertexScreenSpacePos = pos; }
 #else // fragment shader
-    float3 getScreenSpaceVertexPos(uint32_t vertexIndex) { return vertexScreenSpacePos[vertexIndex]; }
+    float3 getScreenSpaceVertexAttribs(uint32_t vertexIndex) { return vertexScreenSpacePos[vertexIndex]; }
 #endif 
 };
 
@@ -237,7 +237,7 @@ struct PSInput
 [[vk::binding(1, 0)]] StructuredBuffer<DrawObject> drawObjects : register(t0);
 [[vk::binding(2, 0)]] StructuredBuffer<MainObject> mainObjects : register(t1);
 [[vk::binding(3, 0)]] StructuredBuffer<LineStyle> lineStyles : register(t2);
-[[vk::binding(4, 0)]] StructuredBuffer<DTMSettings> dtmSettingsBuff : register(t3);
+[[vk::binding(4, 0)]] StructuredBuffer<DTMSettings> dtmSettings : register(t3);
 
 [[vk::combinedImageSampler]][[vk::binding(5, 0)]] Texture2DArray<float3> msdfTextures : register(t4);
 [[vk::combinedImageSampler]][[vk::binding(5, 0)]] SamplerState msdfSampler : register(s4);
diff --git a/62_CAD/shaders/main_pipeline/fragment_shader.hlsl b/62_CAD/shaders/main_pipeline/fragment_shader.hlsl
index 0d5ec486d..42a303fc2 100644
--- a/62_CAD/shaders/main_pipeline/fragment_shader.hlsl
+++ b/62_CAD/shaders/main_pipeline/fragment_shader.hlsl
@@ -334,6 +334,18 @@ float miterSDF(float2 p, float thickness, float2 a, float2 b, float ra, float rb
 typedef StyleClipper< nbl::hlsl::shapes::Quadratic<float> > BezierStyleClipper;
 typedef StyleClipper< nbl::hlsl::shapes::Line<float> > LineStyleClipper;
 
+// for usage in upper_bound function
+struct DTMSettingsHeightsAccessor
+{
+    DTMSettings dtmSettings;
+    using value_type = float;
+
+    float operator[](const uint32_t ix)
+    {
+        return dtmSettings.heightColorMapHeights[ix];
+    }
+};
+
 // We need to specialize color calculation based on FragmentShaderInterlock feature availability for our transparency algorithm
 // because there is no `if constexpr` in hlsl
 // @params
@@ -422,14 +434,14 @@ float4 fragMain(PSInput input) : SV_TARGET
         const float stretch = 1.0f; // TODO: figure out what is it for
         const float worldToScreenRatio = input.getCurrentWorldToScreenRatio();
 
-        DTMSettings dtmSettings = dtmSettingsBuff[mainObj.dtmSettingsIdx];
-        LineStyle outlineStyle = lineStyles[dtmSettings.outlineLineStyleIdx];
-        LineStyle contourStyle = lineStyles[dtmSettings.contourLineStyleIdx];
+        DTMSettings dtm = dtmSettings[mainObj.dtmSettingsIdx];
+        LineStyle outlineStyle = lineStyles[dtm.outlineLineStyleIdx];
+        LineStyle contourStyle = lineStyles[dtm.contourLineStyleIdx];
 
         float3 v[3];
-        v[0] = input.getScreenSpaceVertexPos(0);
-        v[1] = input.getScreenSpaceVertexPos(1);
-        v[2] = input.getScreenSpaceVertexPos(2);
+        v[0] = input.getScreenSpaceVertexAttribs(0);
+        v[1] = input.getScreenSpaceVertexAttribs(1);
+        v[2] = input.getScreenSpaceVertexAttribs(2);
 
         const float3 baryCoord = nbl::hlsl::spirv::BaryCoordKHR;
 
@@ -448,43 +460,31 @@ float4 fragMain(PSInput input) : SV_TARGET
         float height = input.getHeight();
 
         // HEIGHT SHADING
-        const uint32_t heightMapSize = dtmSettings.heightColorEntryCount;
-        float minShadingHeight = dtmSettings.heightColorMapHeights[0];
-        float maxShadingHeight = dtmSettings.heightColorMapHeights[heightMapSize - 1];
+        const uint32_t heightMapSize = dtm.heightColorEntryCount;
+        float minShadingHeight = dtm.heightColorMapHeights[0];
+        float maxShadingHeight = dtm.heightColorMapHeights[heightMapSize - 1];
 
         const bool isHeightBetweenMinAndMax = height >= minShadingHeight && height <= maxShadingHeight;
         const bool isHeightColorMapNotEmpty = heightMapSize > 0;
         if (isHeightColorMapNotEmpty && isHeightBetweenMinAndMax)
         {
-            DTMSettings::E_HEIGHT_SHADING_MODE mode = dtmSettings.determineHeightShadingMode();
+            DTMSettings::E_HEIGHT_SHADING_MODE mode = dtm.determineHeightShadingMode();
 
             if(mode == DTMSettings::E_HEIGHT_SHADING_MODE::DISCRETE_VARIABLE_LENGTH_INTERVALS)
             {
-                uint32_t upperBoundHeightIndex = nbl::hlsl::numeric_limits<uint32_t>::max;
-                uint32_t lowerBoundHeightIndex;
-                // TODO: binary search
-                for (int i = 0; i < heightMapSize; ++i)
-                {
-                    if (dtmSettings.heightColorMapHeights[i] > height)
-                    {
-                        upperBoundHeightIndex = i;
-                        lowerBoundHeightIndex = i;
-                        if (i != 0)
-                            --lowerBoundHeightIndex;
+                DTMSettingsHeightsAccessor dtmHeightsAccessor = { dtm };
+                uint32_t upperBoundHeightIndex = nbl::hlsl::upper_bound(dtmHeightsAccessor, 0, heightMapSize, height);
+                uint32_t lowerBoundHeightIndex = upperBoundHeightIndex == 0 ? upperBoundHeightIndex : upperBoundHeightIndex - 1;
 
-                        break;
-                    }
-                }
-
-                textureColor = dtmSettings.heightColorMapColors[upperBoundHeightIndex].rgb;
-                localAlpha = dtmSettings.heightColorMapColors[upperBoundHeightIndex].a;
+                textureColor = dtm.heightColorMapColors[upperBoundHeightIndex].rgb;
+                localAlpha = dtm.heightColorMapColors[upperBoundHeightIndex].a;
             }
             else
             {
                 float heightTmp;
                 if (mode == DTMSettings::E_HEIGHT_SHADING_MODE::DISCRETE_FIXED_LENGTH_INTERVALS)
                 {
-                    float interval = dtmSettings.intervalWidth;
+                    float interval = dtm.intervalWidth;
                     int sectionIndex = int((height - minShadingHeight) / interval);
                     heightTmp = minShadingHeight + float(sectionIndex) * interval;
                 }
@@ -493,27 +493,15 @@ float4 fragMain(PSInput input) : SV_TARGET
                     heightTmp = height;
                 }
 
-                uint32_t upperBoundHeightIndex = nbl::hlsl::numeric_limits<uint32_t>::max;
-                uint32_t lowerBoundHeightIndex;
-                // TODO: binary search
-                for (int i = 0; i < heightMapSize; ++i)
-                {
-                    if (dtmSettings.heightColorMapHeights[i] > heightTmp)
-                    {
-                        upperBoundHeightIndex = i;
-                        lowerBoundHeightIndex = i;
-                        if (i != 0)
-                            --lowerBoundHeightIndex;
-
-                        break;
-                    }
-                }
+                DTMSettingsHeightsAccessor dtmHeightsAccessor = { dtm };
+                uint32_t upperBoundHeightIndex = nbl::hlsl::upper_bound(dtmHeightsAccessor, 0, heightMapSize, height);
+                uint32_t lowerBoundHeightIndex = upperBoundHeightIndex == 0 ? upperBoundHeightIndex : upperBoundHeightIndex - 1;
 
-                float upperBoundHeight = dtmSettings.heightColorMapHeights[upperBoundHeightIndex];
-                float lowerBoundHeight = dtmSettings.heightColorMapHeights[lowerBoundHeightIndex];
+                float upperBoundHeight = dtm.heightColorMapHeights[upperBoundHeightIndex];
+                float lowerBoundHeight = dtm.heightColorMapHeights[lowerBoundHeightIndex];
                 
-                float4 upperBoundColor = dtmSettings.heightColorMapColors[upperBoundHeightIndex];
-                float4 lowerBoundColor = dtmSettings.heightColorMapColors[lowerBoundHeightIndex];
+                float4 upperBoundColor = dtm.heightColorMapColors[upperBoundHeightIndex];
+                float4 lowerBoundColor = dtm.heightColorMapColors[lowerBoundHeightIndex];
                 
                 float interpolationVal;
                 if (upperBoundHeightIndex == 0)
@@ -529,9 +517,9 @@ float4 fragMain(PSInput input) : SV_TARGET
         // CONTOUR
 
         // TODO: move to ubo or push constants
-        const float startHeight = dtmSettings.contourLinesStartHeight;
-        const float endHeight = dtmSettings.contourLinesEndHeight;
-        const float interval = dtmSettings.contourLinesHeightInterval;
+        const float startHeight = dtm.contourLinesStartHeight;
+        const float endHeight = dtm.contourLinesEndHeight;
+        const float interval = dtm.contourLinesHeightInterval;
 
         // TODO: can be precomputed
         const int maxContourLineIdx = (endHeight - startHeight + 1) / interval;
@@ -637,6 +625,9 @@ float4 fragMain(PSInput input) : SV_TARGET
                 float3 p0 = v[currentEdgePoints[0]];
                 float3 p1 = v[currentEdgePoints[1]];
 
+                // long story short, in order for stipple patterns to be consistent:
+                // - point with lesser x coord should be starting point
+                // - if x coord of both points are equal then point with lesser y value should be starting point
                 if (p1.x < p0.x)
                     nbl::hlsl::swap(p0, p1);
                 else if (p1.x == p0.x && p1.y < p0.y)
diff --git a/62_CAD/shaders/main_pipeline/vertex_shader.hlsl b/62_CAD/shaders/main_pipeline/vertex_shader.hlsl
index 2853d9a52..6011defce 100644
--- a/62_CAD/shaders/main_pipeline/vertex_shader.hlsl
+++ b/62_CAD/shaders/main_pipeline/vertex_shader.hlsl
@@ -108,7 +108,7 @@ PSInput main(uint vertexID : SV_VertexID)
     outV.setObjType(ObjectType::TRIANGLE_MESH);
     outV.setMainObjectIdx(pc.triangleMeshMainObjectIndex);
 
-    TriangleMeshVertex vtx = vk::RawBufferLoad<TriangleMeshVertex>(pc.triangleMeshVerticesBaseAddress + sizeof(TriangleMeshVertex) * vertexID, 4u);
+    TriangleMeshVertex vtx = vk::RawBufferLoad<TriangleMeshVertex>(pc.triangleMeshVerticesBaseAddress + sizeof(TriangleMeshVertex) * vertexID, 8u);
     pfloat64_t2 vtxPos;
     vtxPos.x = _static_cast<pfloat64_t>(vtx.pos.x);
     vtxPos.y = _static_cast<pfloat64_t>(vtx.pos.y);
@@ -120,17 +120,18 @@ PSInput main(uint vertexID : SV_VertexID)
 
     outV.position.xy = transformedPos;
     outV.position = transformFromSreenSpaceToNdc(outV.position.xy, globals.resolution);
-    outV.setHeight(vtx.height);
-    outV.setScreenSpaceVertexPos(float3(transformedPos, vtx.height));
+    const float heightAsFloat = nbl::hlsl::_static_cast<float>(vtx.height);
+    outV.setHeight(heightAsFloat);
+    outV.setScreenSpaceVertexAttribs(float3(transformedPos, heightAsFloat));
     outV.setCurrentWorldToScreenRatio(
         _static_cast<float>((_static_cast<pfloat64_t>(2.0f) /
             (clipProjectionData.projectionToNDC[0].x * _static_cast<pfloat64_t>(globals.resolution.x))))
     );
 
     // TODO: line style of contour line has to be set too!
-    DTMSettings dtmSettings = dtmSettingsBuff[mainObj.dtmSettingsIdx];
-    LineStyle outlineStyle = lineStyles[dtmSettings.outlineLineStyleIdx];
-    LineStyle contourStyle = lineStyles[dtmSettings.contourLineStyleIdx];
+    DTMSettings dtm = dtmSettings[mainObj.dtmSettingsIdx];
+    LineStyle outlineStyle = lineStyles[dtm.outlineLineStyleIdx];
+    LineStyle contourStyle = lineStyles[dtm.contourLineStyleIdx];
     const float screenSpaceOutlineWidth = outlineStyle.screenSpaceLineWidth + _static_cast<float>(_static_cast<pfloat64_t>(outlineStyle.worldSpaceLineWidth) * globals.screenToWorldRatio);
     const float sdfOutlineThickness = screenSpaceOutlineWidth * 0.5f;
     const float screenSpaceContourLineWidth = contourStyle.screenSpaceLineWidth + _static_cast<float>(_static_cast<pfloat64_t>(contourStyle.worldSpaceLineWidth) * globals.screenToWorldRatio);

From f8237715997821ec0bf5f7fa2fed92dbabe56e52 Mon Sep 17 00:00:00 2001
From: keptsecret <sorchon@gmail.com>
Date: Fri, 28 Mar 2025 10:46:10 +0700
Subject: [PATCH 112/529] use dropdown, more options

---
 31_HLSLPathTracer/main.cpp | 14 ++++++++++----
 1 file changed, 10 insertions(+), 4 deletions(-)

diff --git a/31_HLSLPathTracer/main.cpp b/31_HLSLPathTracer/main.cpp
index 8394889db..706a0f713 100644
--- a/31_HLSLPathTracer/main.cpp
+++ b/31_HLSLPathTracer/main.cpp
@@ -41,7 +41,7 @@ class HLSLComputePathtracer final : public examples::SimpleWindowedApplication,
 		{
 			ERM_GLSL,
 			ERM_HLSL,
-			ERM_CHECKERED,
+			// ERM_CHECKERED,
 			ERM_COUNT
 		};
 
@@ -68,6 +68,11 @@ class HLSLComputePathtracer final : public examples::SimpleWindowedApplication,
 			"ELG_RECTANGLE"
 		};
 
+		const char* shaderTypes[E_RENDER_MODE::ERM_COUNT] = {
+			"ERM_GLSL",
+			"ERM_HLSL"
+		};
+
 	public:
 		inline HLSLComputePathtracer(const path& _localInputCWD, const path& _localOutputCWD, const path& _sharedInputCWD, const path& _sharedOutputCWD)
 			: IApplicationFramework(_localInputCWD, _localOutputCWD, _sharedInputCWD, _sharedOutputCWD) {}
@@ -935,7 +940,8 @@ class HLSLComputePathtracer final : public examples::SimpleWindowedApplication,
 					ImGui::SliderFloat("Fov", &fov, 20.f, 150.f);
 					ImGui::SliderFloat("zNear", &zNear, 0.1f, 100.f);
 					ImGui::SliderFloat("zFar", &zFar, 110.f, 10000.f);
-					ImGui::ListBox("Shader", &PTPipline, shaderNames, E_LIGHT_GEOMETRY::ELG_COUNT);
+					ImGui::Combo("Shader", &PTPipeline, shaderNames, E_LIGHT_GEOMETRY::ELG_COUNT);
+					ImGui::Combo("Render Mode", &renderMode, shaderTypes, E_RENDER_MODE::ERM_COUNT);
 					ImGui::SliderInt("SPP", &spp, 1, MaxBufferSamples);
 					ImGui::SliderInt("Depth", &depth, 1, MaxBufferDimensions / 3);
 
@@ -1063,7 +1069,7 @@ class HLSLComputePathtracer final : public examples::SimpleWindowedApplication,
 
 				// cube envmap handle
 				{
-					auto pipeline = renderMode == E_RENDER_MODE::ERM_HLSL ? m_PTHLSLPipelines[PTPipline].get() : m_PTGLSLPipelines[PTPipline].get();
+					auto pipeline = renderMode == E_RENDER_MODE::ERM_HLSL ? m_PTHLSLPipelines[PTPipeline].get() : m_PTGLSLPipelines[PTPipeline].get();
 					cmdbuf->bindComputePipeline(pipeline);
 					cmdbuf->bindDescriptorSets(EPBP_COMPUTE, pipeline->getLayout(), 0u, 1u, &m_descriptorSet0.get());
 					cmdbuf->bindDescriptorSets(EPBP_COMPUTE, pipeline->getLayout(), 2u, 1u, &m_descriptorSet2.get());
@@ -1347,7 +1353,7 @@ class HLSLComputePathtracer final : public examples::SimpleWindowedApplication,
 		float viewWidth = 10.f;
 		float camYAngle = 165.f / 180.f * 3.14159f;
 		float camXAngle = 32.f / 180.f * 3.14159f;
-		int PTPipline = E_LIGHT_GEOMETRY::ELG_SPHERE;
+		int PTPipeline = E_LIGHT_GEOMETRY::ELG_SPHERE;
 		int renderMode = E_RENDER_MODE::ERM_HLSL;
 		int spp = 32;
 		int depth = 3;

From 1535561525c1df59d227969692ae7405b507962b Mon Sep 17 00:00:00 2001
From: keptsecret <sorchon@gmail.com>
Date: Fri, 28 Mar 2025 12:45:14 +0700
Subject: [PATCH 113/529] added persistent workgroup toggle

---
 .../app_resources/glsl/common.glsl            | 25 ++++++
 .../app_resources/hlsl/render.comp.hlsl       | 24 ++++++
 31_HLSLPathTracer/main.cpp                    | 82 +++++++++++++++++--
 3 files changed, 123 insertions(+), 8 deletions(-)

diff --git a/31_HLSLPathTracer/app_resources/glsl/common.glsl b/31_HLSLPathTracer/app_resources/glsl/common.glsl
index 6c2b5f42f..6b6e96710 100644
--- a/31_HLSLPathTracer/app_resources/glsl/common.glsl
+++ b/31_HLSLPathTracer/app_resources/glsl/common.glsl
@@ -35,6 +35,9 @@ vec2 getTexCoords() {
 #include <nbl/builtin/glsl/limits/numeric.glsl>
 #include <nbl/builtin/glsl/math/constants.glsl>
 #include <nbl/builtin/glsl/utils/common.glsl>
+#ifdef PERSISTENT_WORKGROUPS
+#include <nbl/builtin/glsl/utils/morton.glsl>
+#endif
 
 #include <nbl/builtin/glsl/sampling/box_muller_transform.glsl>
 
@@ -688,19 +691,37 @@ bool closestHitProgram(in uint depth, in uint _sample, inout Ray_t ray, inout nb
 void main()
 {
     const ivec2 imageExtents = imageSize(outImage);
+
+#ifdef PERSISTENT_WORKGROUPS
+    uint virtualThreadIndex;
+    for (uint virtualThreadBase = gl_WorkGroupID.x * _NBL_GLSL_WORKGROUP_SIZE_; virtualThreadBase < 1920*1080; virtualThreadBase += gl_NumWorkGroups.x * _NBL_GLSL_WORKGROUP_SIZE_) // not sure why 1280*720 doesn't cover draw surface
+    {
+        virtualThreadIndex = virtualThreadBase + gl_LocalInvocationIndex.x;
+        const ivec2 coords = ivec2(nbl_glsl_morton_decode2d32b(virtualThreadIndex));
+#else
     const ivec2 coords = getCoordinates();
+#endif
+
     vec2 texCoord = vec2(coords) / vec2(imageExtents);
     texCoord.y = 1.0 - texCoord.y;
 
     if (false == (all(lessThanEqual(ivec2(0),coords)) && all(greaterThan(imageExtents,coords)))) {
+#ifdef PERSISTENT_WORKGROUPS
+        continue;
+#else
         return;
+#endif
     }
 
     if (((PTPushConstant.depth-1)>>MAX_DEPTH_LOG2)>0 || ((PTPushConstant.sampleCount-1)>>MAX_SAMPLES_LOG2)>0)
     {
         vec4 pixelCol = vec4(1.0,0.0,0.0,1.0);
         imageStore(outImage, coords, pixelCol);
+#ifdef PERSISTENT_WORKGROUPS
+        continue;
+#else
         return;
+#endif
     }
 
     nbl_glsl_xoroshiro64star_state_t scramble_start_state = texelFetch(scramblebuf,coords,0).rg;
@@ -791,6 +812,10 @@ void main()
 
     vec4 pixelCol = vec4(color, 1.0);
     imageStore(outImage, coords, pixelCol);
+
+#ifdef PERSISTENT_WORKGROUPS
+    }
+#endif
 }
 /** TODO: Improving Rendering
 
diff --git a/31_HLSLPathTracer/app_resources/hlsl/render.comp.hlsl b/31_HLSLPathTracer/app_resources/hlsl/render.comp.hlsl
index b187a1b33..81736f508 100644
--- a/31_HLSLPathTracer/app_resources/hlsl/render.comp.hlsl
+++ b/31_HLSLPathTracer/app_resources/hlsl/render.comp.hlsl
@@ -2,6 +2,9 @@
 #include "nbl/builtin/hlsl/glsl_compat/core.hlsl"
 #include "nbl/builtin/hlsl/random/pcg.hlsl"
 #include "nbl/builtin/hlsl/random/xoroshiro.hlsl"
+#ifdef PERSISTENT_WORKGROUPS
+#include "nbl/builtin/hlsl/math/morton.hlsl"
+#endif
 
 #include "nbl/builtin/hlsl/bxdf/reflection.hlsl"
 #include "nbl/builtin/hlsl/bxdf/transmission.hlsl"
@@ -156,19 +159,36 @@ void main(uint32_t3 threadID : SV_DispatchThreadID)
 {
     uint32_t width, height;
     outImage.GetDimensions(width, height);
+#ifdef PERSISTENT_WORKGROUPS
+    uint32_t virtualThreadIndex;
+    [loop]
+    for (uint32_t virtualThreadBase = glsl::gl_WorkGroupID().x * WorkgroupSize; virtualThreadBase < 1920*1080; virtualThreadBase += glsl::gl_NumWorkGroups().x * WorkgroupSize) // not sure why 1280*720 doesn't cover draw surface
+    {
+        virtualThreadIndex = virtualThreadBase + glsl::gl_LocalInvocationIndex().x;
+        const int32_t2 coords = (int32_t2)math::Morton<uint32_t>::decode2d(virtualThreadIndex);
+#else
     const int32_t2 coords = getCoordinates();
+#endif
     float32_t2 texCoord = float32_t2(coords) / float32_t2(width, height);
     texCoord.y = 1.0 - texCoord.y;
 
     if (false == (all((int32_t2)0 < coords)) && all(int32_t2(width, height) < coords)) {
+#ifdef PERSISTENT_WORKGROUPS
+        continue;
+#else
         return;
+#endif
     }
 
     if (((pc.depth - 1) >> MAX_DEPTH_LOG2) > 0 || ((pc.sampleCount - 1) >> MAX_SAMPLES_LOG2) > 0)
     {
         float32_t4 pixelCol = float32_t4(1.0,0.0,0.0,1.0);
         outImage[coords] = pixelCol;
+#ifdef PERSISTENT_WORKGROUPS
+        continue;
+#else
         return;
+#endif
     }
 
     int flatIdx = glsl::gl_GlobalInvocationID().y * glsl::gl_NumWorkGroups().x * WorkgroupSize + glsl::gl_GlobalInvocationID().x;
@@ -200,4 +220,8 @@ void main(uint32_t3 threadID : SV_DispatchThreadID)
     float32_t3 color = pathtracer.getMeasure(pc.sampleCount, pc.depth, scene);
     float32_t4 pixCol = float32_t4(color, 1.0);
     outImage[coords] = pixCol;
+
+#ifdef PERSISTENT_WORKGROUPS
+    }
+#endif
 }
diff --git a/31_HLSLPathTracer/main.cpp b/31_HLSLPathTracer/main.cpp
index 706a0f713..0dc5fc053 100644
--- a/31_HLSLPathTracer/main.cpp
+++ b/31_HLSLPathTracer/main.cpp
@@ -323,7 +323,7 @@ class HLSLComputePathtracer final : public examples::SimpleWindowedApplication,
 				m_presentDescriptorSet = presentDSPool->createDescriptorSet(gpuPresentDescriptorSetLayout);
 
 				// Create Shaders
-				auto loadAndCompileGLSLShader = [&](const std::string& pathToShader) -> smart_refctd_ptr<IGPUShader>
+				auto loadAndCompileGLSLShader = [&](const std::string& pathToShader, bool persistentWorkGroups = false) -> smart_refctd_ptr<IGPUShader>
 				{
 					IAssetLoader::SAssetLoadParams lp = {};
 					lp.workingDirectory = localInputCWD;
@@ -339,6 +339,27 @@ class HLSLComputePathtracer final : public examples::SimpleWindowedApplication,
 					// The down-cast should not fail!
 					assert(source);
 
+					auto compiler = make_smart_refctd_ptr<asset::CGLSLCompiler>(smart_refctd_ptr(m_system));
+					CGLSLCompiler::SOptions options = {};
+					options.stage = IShader::E_SHADER_STAGE::ESS_COMPUTE;	// should be compute
+					options.targetSpirvVersion = m_device->getPhysicalDevice()->getLimits().spirvVersion;
+					options.spirvOptimizer = nullptr;
+#ifndef _NBL_DEBUG
+					ISPIRVOptimizer::E_OPTIMIZER_PASS optPasses = ISPIRVOptimizer::EOP_STRIP_DEBUG_INFO;
+					auto opt = make_smart_refctd_ptr<ISPIRVOptimizer>(std::span<ISPIRVOptimizer::E_OPTIMIZER_PASS>(&optPasses, 1));
+					options.spirvOptimizer = opt.get();
+#endif
+					options.debugInfoFlags |= IShaderCompiler::E_DEBUG_INFO_FLAGS::EDIF_LINE_BIT;
+					options.preprocessorOptions.sourceIdentifier = source->getFilepathHint();
+					options.preprocessorOptions.logger = m_logger.get();
+					options.preprocessorOptions.includeFinder = compiler->getDefaultIncludeFinder();
+
+					const IShaderCompiler::SMacroDefinition persistentDefine = { "PERSISTENT_WORKGROUPS", "1" };
+					if (persistentWorkGroups)
+						options.preprocessorOptions.extraDefines = { &persistentDefine, &persistentDefine + 1 };
+
+					source = compiler->compileToSPIRV((const char*)source->getContent()->getPointer(), options);
+
 					// this time we skip the use of the asset converter since the ICPUShader->IGPUShader path is quick and simple
 					auto shader = m_device->createShader(source.get());
 					if (!shader)
@@ -350,7 +371,7 @@ class HLSLComputePathtracer final : public examples::SimpleWindowedApplication,
 					return shader;
 				};
 
-				auto loadAndCompileHLSLShader = [&](const std::string& pathToShader, const std::string& defineMacro) -> smart_refctd_ptr<IGPUShader>
+				auto loadAndCompileHLSLShader = [&](const std::string& pathToShader, const std::string& defineMacro = "", bool persistentWorkGroups = false) -> smart_refctd_ptr<IGPUShader>
 				{
 					IAssetLoader::SAssetLoadParams lp = {};
 					lp.workingDirectory = localInputCWD;
@@ -368,7 +389,7 @@ class HLSLComputePathtracer final : public examples::SimpleWindowedApplication,
 
 					auto compiler = make_smart_refctd_ptr<asset::CHLSLCompiler>(smart_refctd_ptr(m_system));
 					CHLSLCompiler::SOptions options = {};
-					options.stage = IShader::E_SHADER_STAGE::ESS_COMPUTE;	// should be compute
+					options.stage = IShader::E_SHADER_STAGE::ESS_COMPUTE;
 					options.targetSpirvVersion = m_device->getPhysicalDevice()->getLimits().spirvVersion;
 					options.spirvOptimizer = nullptr;
 #ifndef _NBL_DEBUG
@@ -381,8 +402,11 @@ class HLSLComputePathtracer final : public examples::SimpleWindowedApplication,
 					options.preprocessorOptions.logger = m_logger.get();
 					options.preprocessorOptions.includeFinder = compiler->getDefaultIncludeFinder();
 					
-					const IShaderCompiler::SMacroDefinition variantDefine = { defineMacro, "" };
-					options.preprocessorOptions.extraDefines = { &variantDefine, &variantDefine + 1 };
+					const IShaderCompiler::SMacroDefinition defines[2] = { {defineMacro, ""}, { "PERSISTENT_WORKGROUPS", "1" } };
+					if (!defineMacro.empty() && persistentWorkGroups)
+						options.preprocessorOptions.extraDefines = { defines, defines + 2 };
+					else if (!defineMacro.empty() && !persistentWorkGroups)
+						options.preprocessorOptions.extraDefines = { defines, defines + 1 };
 
 					source = compiler->compileToSPIRV((const char*)source->getContent()->getPointer(), options);
 					
@@ -441,6 +465,34 @@ class HLSLComputePathtracer final : public examples::SimpleWindowedApplication,
 							if (!m_device->createComputePipelines(nullptr, { &params, 1 }, m_PTHLSLPipelines.data() + index))
 								return logFail("Failed to create HLSL compute pipeline!\n");
 						}
+
+						// persistent wg pipelines
+						{
+							auto ptShader = loadAndCompileGLSLShader(PTGLSLShaderPaths[index], true);
+
+							IGPUComputePipeline::SCreationParams params = {};
+							params.layout = ptPipelineLayout.get();
+							params.shader.shader = ptShader.get();
+							params.shader.entryPoint = "main";
+							params.shader.entries = nullptr;
+							params.shader.requireFullSubgroups = true;
+							params.shader.requiredSubgroupSize = static_cast<IGPUShader::SSpecInfo::SUBGROUP_SIZE>(5);
+							if (!m_device->createComputePipelines(nullptr, { &params, 1 }, m_PTGLSLPersistentWGPipelines.data() + index))
+								return logFail("Failed to create GLSL PersistentWG compute pipeline!\n");
+						}
+						{
+							auto ptShader = loadAndCompileHLSLShader(PTHLSLShaderPath, PTHLSLShaderVariants[index], true);
+
+							IGPUComputePipeline::SCreationParams params = {};
+							params.layout = ptPipelineLayout.get();
+							params.shader.shader = ptShader.get();
+							params.shader.entryPoint = "main";
+							params.shader.entries = nullptr;
+							params.shader.requireFullSubgroups = true;
+							params.shader.requiredSubgroupSize = static_cast<IGPUShader::SSpecInfo::SUBGROUP_SIZE>(5);
+							if (!m_device->createComputePipelines(nullptr, { &params, 1 }, m_PTHLSLPersistentWGPipelines.data() + index))
+								return logFail("Failed to create HLSL PersistentWG compute pipeline!\n");
+						}
 					}
 				}
 
@@ -452,7 +504,7 @@ class HLSLComputePathtracer final : public examples::SimpleWindowedApplication,
 						return logFail("Failed to create Full Screen Triangle protopipeline or load its vertex shader!");
 
 					// Load Fragment Shader
-					auto fragmentShader = loadAndCompileGLSLShader(PresentShaderPath);
+					auto fragmentShader = loadAndCompileHLSLShader(PresentShaderPath);
 					if (!fragmentShader)
 						return logFail("Failed to Load and Compile Fragment Shader: lumaMeterShader!");
 
@@ -944,6 +996,7 @@ class HLSLComputePathtracer final : public examples::SimpleWindowedApplication,
 					ImGui::Combo("Render Mode", &renderMode, shaderTypes, E_RENDER_MODE::ERM_COUNT);
 					ImGui::SliderInt("SPP", &spp, 1, MaxBufferSamples);
 					ImGui::SliderInt("Depth", &depth, 1, MaxBufferDimensions / 3);
+					ImGui::Checkbox("Persistent WorkGroups", &usePersistentWorkGroups);
 
 					ImGui::Text("X: %f Y: %f", io.MousePos.x, io.MousePos.y);
 
@@ -1069,12 +1122,22 @@ class HLSLComputePathtracer final : public examples::SimpleWindowedApplication,
 
 				// cube envmap handle
 				{
-					auto pipeline = renderMode == E_RENDER_MODE::ERM_HLSL ? m_PTHLSLPipelines[PTPipeline].get() : m_PTGLSLPipelines[PTPipeline].get();
+					IGPUComputePipeline* pipeline;
+					if (usePersistentWorkGroups)
+						pipeline = renderMode == E_RENDER_MODE::ERM_HLSL ? m_PTHLSLPersistentWGPipelines[PTPipeline].get() : m_PTGLSLPersistentWGPipelines[PTPipeline].get();
+					else
+						pipeline = renderMode == E_RENDER_MODE::ERM_HLSL ? m_PTHLSLPipelines[PTPipeline].get() : m_PTGLSLPipelines[PTPipeline].get();
 					cmdbuf->bindComputePipeline(pipeline);
 					cmdbuf->bindDescriptorSets(EPBP_COMPUTE, pipeline->getLayout(), 0u, 1u, &m_descriptorSet0.get());
 					cmdbuf->bindDescriptorSets(EPBP_COMPUTE, pipeline->getLayout(), 2u, 1u, &m_descriptorSet2.get());
 					cmdbuf->pushConstants(pipeline->getLayout(), IShader::E_SHADER_STAGE::ESS_COMPUTE, 0, sizeof(PTPushConstant), &pc);
-					cmdbuf->dispatch(1 + (WindowDimensions.x * WindowDimensions.y - 1) / DefaultWorkGroupSize, 1u, 1u);
+					if (usePersistentWorkGroups)
+					{
+						uint32_t dispatchSize = m_physicalDevice->getLimits().computeOptimalPersistentWorkgroupDispatchSize(WindowDimensions.x * WindowDimensions.y, DefaultWorkGroupSize);
+						cmdbuf->dispatch(dispatchSize, 1u, 1u);
+					}
+					else
+						cmdbuf->dispatch(1 + (WindowDimensions.x * WindowDimensions.y - 1) / DefaultWorkGroupSize, 1u, 1u);
 				}
 
 				// TRANSITION m_outImgView to READ (because of descriptorSets0 -> ComputeShader Writes into the image)
@@ -1306,6 +1369,8 @@ class HLSLComputePathtracer final : public examples::SimpleWindowedApplication,
 		smart_refctd_ptr<IGPUCommandPool> m_cmdPool;
 		std::array<smart_refctd_ptr<IGPUComputePipeline>, E_LIGHT_GEOMETRY::ELG_COUNT> m_PTGLSLPipelines;
 		std::array<smart_refctd_ptr<IGPUComputePipeline>, E_LIGHT_GEOMETRY::ELG_COUNT> m_PTHLSLPipelines;
+		std::array<smart_refctd_ptr<IGPUComputePipeline>, E_LIGHT_GEOMETRY::ELG_COUNT> m_PTGLSLPersistentWGPipelines;
+		std::array<smart_refctd_ptr<IGPUComputePipeline>, E_LIGHT_GEOMETRY::ELG_COUNT> m_PTHLSLPersistentWGPipelines;
 		smart_refctd_ptr<IGPUGraphicsPipeline> m_presentPipeline;
 		uint64_t m_realFrameIx = 0;
 		std::array<smart_refctd_ptr<IGPUCommandBuffer>, MaxFramesInFlight> m_cmdBufs;
@@ -1357,6 +1422,7 @@ class HLSLComputePathtracer final : public examples::SimpleWindowedApplication,
 		int renderMode = E_RENDER_MODE::ERM_HLSL;
 		int spp = 32;
 		int depth = 3;
+		bool usePersistentWorkGroups = false;
 
 		bool m_firstFrame = true;
 		IGPUCommandBuffer::SClearColorValue clearColor = { .float32 = {0.f,0.f,0.f,1.f} };

From 10791e9f48d5563b319b58a7ea47dbd19639abe2 Mon Sep 17 00:00:00 2001
From: Przemek <minikers21@gmail.com>
Date: Sat, 29 Mar 2025 15:28:51 +0100
Subject: [PATCH 114/529] Implemented anty aliasing between height shading
 sections

---
 62_CAD/main.cpp                               | 11 ++++---
 .../main_pipeline/fragment_shader.hlsl        | 31 ++++++++++++++++---
 2 files changed, 32 insertions(+), 10 deletions(-)

diff --git a/62_CAD/main.cpp b/62_CAD/main.cpp
index 17afb122a..9fdb4577a 100644
--- a/62_CAD/main.cpp
+++ b/62_CAD/main.cpp
@@ -3377,11 +3377,12 @@ class ComputerAidedDesign final : public examples::SimpleWindowedApplication, pu
 					dtmSettingsInfo.heightShadingMode = DTMSettingsInfo::E_HEIGHT_SHADING_MODE::DISCRETE_VARIABLE_LENGTH_INTERVALS;
 					
 					float animatedAlpha = (std::cos(m_timeElapsed * 0.0005) + 1.0) * 0.5;
-					dtmSettingsInfo.addHeightColorMapEntry(-10.0f, float32_t4(1.0f, 1.0f, 1.0f, 1.0f));
-					dtmSettingsInfo.addHeightColorMapEntry(20.0f, float32_t4(0.5f, 1.0f, 1.0f, 1.0f));
-					dtmSettingsInfo.addHeightColorMapEntry(25.0f, float32_t4(0.0f, 1.0f, 0.0f, 1.0f));
-					dtmSettingsInfo.addHeightColorMapEntry(70.0f, float32_t4(1.0f, 1.0f, 0.0f, animatedAlpha));
-					dtmSettingsInfo.addHeightColorMapEntry(100.0f, float32_t4(1.0f, 0.0f, 0.0f, 1.0f));
+					dtmSettingsInfo.addHeightColorMapEntry(-10.0f, float32_t4(0.5f, 1.0f, 1.0f, 1.0f));
+					dtmSettingsInfo.addHeightColorMapEntry(20.0f, float32_t4(0.0f, 1.0f, 0.0f, 1.0f));
+					//dtmSettingsInfo.addHeightColorMapEntry(25.0f, float32_t4(1.0f, 1.0f, 0.0f, animatedAlpha));
+					dtmSettingsInfo.addHeightColorMapEntry(25.0f, float32_t4(1.0f, 1.0f, 0.0f, 1.0f));
+					dtmSettingsInfo.addHeightColorMapEntry(70.0f, float32_t4(1.0f, 0.0f, 0.0f, 1.0f));
+					dtmSettingsInfo.addHeightColorMapEntry(90.0f, float32_t4(1.0f, 1.0f, 1.0f, 1.0f));
 					break;
 				}
 				case DTMSettingsInfo::E_HEIGHT_SHADING_MODE::DISCRETE_FIXED_LENGTH_INTERVALS:
diff --git a/62_CAD/shaders/main_pipeline/fragment_shader.hlsl b/62_CAD/shaders/main_pipeline/fragment_shader.hlsl
index 42a303fc2..225c0636e 100644
--- a/62_CAD/shaders/main_pipeline/fragment_shader.hlsl
+++ b/62_CAD/shaders/main_pipeline/fragment_shader.hlsl
@@ -473,11 +473,29 @@ float4 fragMain(PSInput input) : SV_TARGET
             if(mode == DTMSettings::E_HEIGHT_SHADING_MODE::DISCRETE_VARIABLE_LENGTH_INTERVALS)
             {
                 DTMSettingsHeightsAccessor dtmHeightsAccessor = { dtm };
-                uint32_t upperBoundHeightIndex = nbl::hlsl::upper_bound(dtmHeightsAccessor, 0, heightMapSize, height);
-                uint32_t lowerBoundHeightIndex = upperBoundHeightIndex == 0 ? upperBoundHeightIndex : upperBoundHeightIndex - 1;
+                uint32_t mapIndexPlus1 = nbl::hlsl::upper_bound(dtmHeightsAccessor, 0, heightMapSize, height);
+                uint32_t mapIndex = mapIndexPlus1 == 0 ? mapIndexPlus1 : mapIndexPlus1 - 1;
+
+                // logic explainer: if colorIdx is 0.0 then it means blend with next
+                // if color idx is >= length of the colours array then it means it's also > 0.0 and this blend with prev is true
+                // if color idx is > 0 and < len - 1, then it depends on the current pixel's height value and two closest height values
+                bool blendWithPrev = (mapIndex > 0) 
+                    && (mapIndex >= heightMapSize - 1 || (height * 2.0 < dtm.heightColorMapHeights[mapIndexPlus1] + dtm.heightColorMapHeights[mapIndex]));
+                float heightDeriv = fwidth(height);
+                if (blendWithPrev)
+                {
+                    float pxDistanceToPrevHeight = (height - dtm.heightColorMapHeights[mapIndex]) / heightDeriv;
+                    float prevColorCoverage = smoothstep(-globals.antiAliasingFactor, globals.antiAliasingFactor, pxDistanceToPrevHeight);
+                    textureColor = lerp(dtm.heightColorMapColors[mapIndex - 1].rgb, dtm.heightColorMapColors[mapIndex].rgb, prevColorCoverage);
+                }
+                else
+                {
+                    float pxDistanceToNextHeight = (height - dtm.heightColorMapHeights[mapIndexPlus1]) / heightDeriv;
+                    float nextColorCoverage = smoothstep(-globals.antiAliasingFactor, globals.antiAliasingFactor, pxDistanceToNextHeight);
+                    textureColor = lerp(dtm.heightColorMapColors[mapIndex].rgb, dtm.heightColorMapColors[mapIndexPlus1].rgb, nextColorCoverage);
+                }
 
-                textureColor = dtm.heightColorMapColors[upperBoundHeightIndex].rgb;
-                localAlpha = dtm.heightColorMapColors[upperBoundHeightIndex].a;
+                localAlpha = dtm.heightColorMapColors[mapIndex].a;
             }
             else
             {
@@ -567,6 +585,9 @@ float4 fragMain(PSInput input) : SV_TARGET
             }
             else
             {
+                // TODO:
+                // It might be beneficial to calculate distance between pixel and contour line to early out some pixels and save yourself from stipple sdf computations!
+                // where you only compute the complex sdf if abs((height - contourVal) / heightDeriv) <= aaFactor
                 nbl::hlsl::shapes::Line<float>::ArcLengthCalculator arcLenCalc = nbl::hlsl::shapes::Line<float>::ArcLengthCalculator::construct(lineSegment);
                 LineStyleClipper clipper = LineStyleClipper::construct(contourStyle, lineSegment, arcLenCalc, phaseShift, stretch, worldToScreenRatio);
                 distance = ClippedSignedDistance<nbl::hlsl::shapes::Line<float>, LineStyleClipper>::sdf(lineSegment, input.position.xy, contourThickness, contourStyle.isRoadStyleFlag, clipper);
@@ -634,7 +655,7 @@ float4 fragMain(PSInput input) : SV_TARGET
                     nbl::hlsl::swap(p0, p1);
 
                 nbl::hlsl::shapes::Line<float> lineSegment = nbl::hlsl::shapes::Line<float>::construct(float2(p0.x, p0.y), float2(p1.x, p1.y));
-
+                
                 float distance = nbl::hlsl::numeric_limits<float>::max;
                 nbl::hlsl::shapes::Line<float>::ArcLengthCalculator arcLenCalc = nbl::hlsl::shapes::Line<float>::ArcLengthCalculator::construct(lineSegment);
                 LineStyleClipper clipper = LineStyleClipper::construct(outlineStyle, lineSegment, arcLenCalc, phaseShift, stretch, worldToScreenRatio);

From 1d9e6d014e09d24ca4da7d83dbce8a206f9f084d Mon Sep 17 00:00:00 2001
From: Erfan Ahmadi <ahmadierfan99@gmail.com>
Date: Sun, 30 Mar 2025 08:27:56 +0330
Subject: [PATCH 115/529] switch from dtm to good'ol linework

---
 62_CAD/main.cpp                                   | 2 +-
 62_CAD/shaders/main_pipeline/fragment_shader.hlsl | 3 ++-
 62_CAD/shaders/main_pipeline/vertex_shader.hlsl   | 1 -
 3 files changed, 3 insertions(+), 3 deletions(-)

diff --git a/62_CAD/main.cpp b/62_CAD/main.cpp
index 9fdb4577a..eff8fd3e1 100644
--- a/62_CAD/main.cpp
+++ b/62_CAD/main.cpp
@@ -75,7 +75,7 @@ constexpr std::array<float, (uint32_t)ExampleMode::CASE_COUNT> cameraExtents =
 	600.0	// CASE_9
 };
 
-constexpr ExampleMode mode = ExampleMode::CASE_9;
+constexpr ExampleMode mode = ExampleMode::CASE_4;
 
 class Camera2D
 {
diff --git a/62_CAD/shaders/main_pipeline/fragment_shader.hlsl b/62_CAD/shaders/main_pipeline/fragment_shader.hlsl
index 225c0636e..cddac89ba 100644
--- a/62_CAD/shaders/main_pipeline/fragment_shader.hlsl
+++ b/62_CAD/shaders/main_pipeline/fragment_shader.hlsl
@@ -426,6 +426,7 @@ float4 fragMain(PSInput input) : SV_TARGET
     const uint32_t currentMainObjectIdx = input.getMainObjectIdx();
     const MainObject mainObj = mainObjects[currentMainObjectIdx];
 
+#ifdef DTM
     // TRIANGLE RENDERING
     {
         const float outlineThickness = input.getOutlineThickness();
@@ -672,7 +673,7 @@ float4 fragMain(PSInput input) : SV_TARGET
     }
 
     return calculateFinalColor<nbl::hlsl::jit::device_capabilities::fragmentShaderPixelInterlock>(uint2(input.position.xy), localAlpha, currentMainObjectIdx, textureColor, true);
-
+#endif
     // figure out local alpha with sdf
     if (objType == ObjectType::LINE || objType == ObjectType::QUAD_BEZIER || objType == ObjectType::POLYLINE_CONNECTOR)
     {
diff --git a/62_CAD/shaders/main_pipeline/vertex_shader.hlsl b/62_CAD/shaders/main_pipeline/vertex_shader.hlsl
index 6011defce..f7af0d8a6 100644
--- a/62_CAD/shaders/main_pipeline/vertex_shader.hlsl
+++ b/62_CAD/shaders/main_pipeline/vertex_shader.hlsl
@@ -94,7 +94,6 @@ PSInput main(uint vertexID : SV_VertexID)
     // ~~Later, most likely We will require pulling all 3 vertices of the triangle, that's where you need to know which triangle you're currently on, and instead of objectID = vertexID/4 which we currently do, you will do vertexID/3 and pull all 3 of it's vertices.~~
     // Ok, brainfart, a vertex can belong to multiple triangles, I was thinking of AA but triangles share vertices, nevermind my comment above.
 
-#define DTM
 #ifdef DTM
     PSInput outV;
 

From ad43e200d1f30f218d6b7a0e6fbf696311df1bbb Mon Sep 17 00:00:00 2001
From: Erfan Ahmadi <ahmadierfan99@gmail.com>
Date: Mon, 31 Mar 2025 10:36:31 +0330
Subject: [PATCH 116/529] [WIP] putting all data into a single buffer and
 addressing with BDA

---
 62_CAD/DrawResourcesFiller.cpp                | 329 +++++++-----------
 62_CAD/DrawResourcesFiller.h                  | 196 ++++-------
 62_CAD/main.cpp                               | 106 ++----
 62_CAD/shaders/globals.hlsl                   |  79 ++++-
 62_CAD/shaders/main_pipeline/common.hlsl      |  15 +-
 .../main_pipeline/fragment_shader.hlsl        |  28 +-
 .../shaders/main_pipeline/resolve_alphas.hlsl |   4 +-
 .../shaders/main_pipeline/vertex_shader.hlsl  |  27 +-
 8 files changed, 308 insertions(+), 476 deletions(-)

diff --git a/62_CAD/DrawResourcesFiller.cpp b/62_CAD/DrawResourcesFiller.cpp
index 58c4d0c72..8c1a42719 100644
--- a/62_CAD/DrawResourcesFiller.cpp
+++ b/62_CAD/DrawResourcesFiller.cpp
@@ -15,70 +15,35 @@ void DrawResourcesFiller::setSubmitDrawsFunction(const SubmitFunc& func)
 	submitDraws = func;
 }
 
-void DrawResourcesFiller::allocateIndexBuffer(ILogicalDevice* logicalDevice, uint32_t maxIndices)
-{
-	maxIndexCount = maxIndices;
-	const size_t indexBufferSize = maxIndices * sizeof(index_buffer_type);
-	auto indexBuffer = ICPUBuffer::create({ indexBufferSize });
-
-	index_buffer_type* indices = reinterpret_cast<index_buffer_type*>(indexBuffer->getPointer());
-	for (uint32_t i = 0u; i < maxIndices / 6u; ++i)
-	{
-		index_buffer_type objIndex = i;
-		indices[i * 6] = objIndex * 4u + 1u;
-		indices[i * 6 + 1u] = objIndex * 4u + 0u;
-		indices[i * 6 + 2u] = objIndex * 4u + 2u;
-
-		indices[i * 6 + 3u] = objIndex * 4u + 1u;
-		indices[i * 6 + 4u] = objIndex * 4u + 2u;
-		indices[i * 6 + 5u] = objIndex * 4u + 3u;
-	}
-
-	IGPUBuffer::SCreationParams indexBufferCreationParams = {};
-	indexBufferCreationParams.size = indexBufferSize;
-	indexBufferCreationParams.usage = IGPUBuffer::EUF_INDEX_BUFFER_BIT | IGPUBuffer::EUF_TRANSFER_DST_BIT;
-
-	m_utilities->createFilledDeviceLocalBufferOnDedMem(SIntendedSubmitInfo{.queue=m_copyQueue}, std::move(indexBufferCreationParams), indices).move_into(gpuDrawBuffers.indexBuffer);
-	gpuDrawBuffers.indexBuffer->setObjectDebugName("indexBuffer");
-}
-
-void DrawResourcesFiller::allocateMainObjectsBuffer(ILogicalDevice* logicalDevice, uint32_t mainObjects)
-{
-	maxMainObjects = mainObjects;
-	size_t mainObjectsBufferSize = maxMainObjects * sizeof(MainObject);
-
-	IGPUBuffer::SCreationParams mainObjectsCreationParams = {};
-	mainObjectsCreationParams.size = mainObjectsBufferSize;
-	mainObjectsCreationParams.usage = IGPUBuffer::EUF_STORAGE_BUFFER_BIT | IGPUBuffer::EUF_TRANSFER_DST_BIT;
-	gpuDrawBuffers.mainObjectsBuffer = logicalDevice->createBuffer(std::move(mainObjectsCreationParams));
-	gpuDrawBuffers.mainObjectsBuffer->setObjectDebugName("mainObjectsBuffer");
-
-	IDeviceMemoryBacked::SDeviceMemoryRequirements memReq = gpuDrawBuffers.mainObjectsBuffer->getMemoryReqs();
-	memReq.memoryTypeBits &= logicalDevice->getPhysicalDevice()->getDeviceLocalMemoryTypeBits();
-	auto mainObjectsBufferMem = logicalDevice->allocate(memReq, gpuDrawBuffers.mainObjectsBuffer.get());
-
-	cpuDrawBuffers.mainObjectsBuffer = ICPUBuffer::create({ mainObjectsBufferSize });
-}
-
-void DrawResourcesFiller::allocateDrawObjectsBuffer(ILogicalDevice* logicalDevice, uint32_t drawObjects)
-{
-	maxDrawObjects = drawObjects;
-	size_t drawObjectsBufferSize = maxDrawObjects * sizeof(DrawObject);
-
-	IGPUBuffer::SCreationParams drawObjectsCreationParams = {};
-	drawObjectsCreationParams.size = drawObjectsBufferSize;
-	drawObjectsCreationParams.usage = IGPUBuffer::EUF_STORAGE_BUFFER_BIT | IGPUBuffer::EUF_TRANSFER_DST_BIT;
-	gpuDrawBuffers.drawObjectsBuffer = logicalDevice->createBuffer(std::move(drawObjectsCreationParams));
-	gpuDrawBuffers.drawObjectsBuffer->setObjectDebugName("drawObjectsBuffer");
-
-	IDeviceMemoryBacked::SDeviceMemoryRequirements memReq = gpuDrawBuffers.drawObjectsBuffer->getMemoryReqs();
-	memReq.memoryTypeBits &= logicalDevice->getPhysicalDevice()->getDeviceLocalMemoryTypeBits();
-	auto drawObjectsBufferMem = logicalDevice->allocate(memReq, gpuDrawBuffers.drawObjectsBuffer.get());
-
-	cpuDrawBuffers.drawObjectsBuffer = ICPUBuffer::create({ drawObjectsBufferSize });
-}
-
-void DrawResourcesFiller::allocateGeometryBuffer(ILogicalDevice* logicalDevice, size_t size)
+//void DrawResourcesFiller::allocateIndexBuffer(ILogicalDevice* logicalDevice, uint32_t maxIndices)
+//{
+//	maxIndexCount = maxIndices;
+//	const size_t indexBufferSize = maxIndices * sizeof(index_buffer_type);
+//	auto indexBuffer = ICPUBuffer::create({ indexBufferSize });
+//
+//	index_buffer_type* indices = reinterpret_cast<index_buffer_type*>(indexBuffer->getPointer());
+//	for (uint32_t i = 0u; i < maxIndices / 6u; ++i)
+//	{
+//		index_buffer_type objIndex = i;
+//		indices[i * 6] = objIndex * 4u + 1u;
+//		indices[i * 6 + 1u] = objIndex * 4u + 0u;
+//		indices[i * 6 + 2u] = objIndex * 4u + 2u;
+//
+//		indices[i * 6 + 3u] = objIndex * 4u + 1u;
+//		indices[i * 6 + 4u] = objIndex * 4u + 2u;
+//		indices[i * 6 + 5u] = objIndex * 4u + 3u;
+//	}
+//
+//	IGPUBuffer::SCreationParams indexBufferCreationParams = {};
+//	indexBufferCreationParams.size = indexBufferSize;
+//	indexBufferCreationParams.usage = IGPUBuffer::EUF_INDEX_BUFFER_BIT | IGPUBuffer::EUF_TRANSFER_DST_BIT;
+//
+//	m_utilities->createFilledDeviceLocalBufferOnDedMem(SIntendedSubmitInfo{.queue=m_copyQueue}, std::move(indexBufferCreationParams), indices).move_into(gpuDrawBuffers.indexBuffer);
+//	gpuDrawBuffers.indexBuffer->setObjectDebugName("indexBuffer");
+//}
+
+
+void DrawResourcesFiller::allocateDrawResourcesBuffer(ILogicalDevice* logicalDevice, size_t size)
 {
 	maxGeometryBufferSize = size;
 
@@ -91,49 +56,11 @@ void DrawResourcesFiller::allocateGeometryBuffer(ILogicalDevice* logicalDevice,
 	IDeviceMemoryBacked::SDeviceMemoryRequirements memReq = gpuDrawBuffers.geometryBuffer->getMemoryReqs();
 	memReq.memoryTypeBits &= logicalDevice->getPhysicalDevice()->getDeviceLocalMemoryTypeBits();
 	auto geometryBufferMem = logicalDevice->allocate(memReq, gpuDrawBuffers.geometryBuffer.get(), IDeviceMemoryAllocation::EMAF_DEVICE_ADDRESS_BIT);
-	geometryBufferAddress = gpuDrawBuffers.geometryBuffer->getDeviceAddress();
+	drawResourcesBDA = gpuDrawBuffers.geometryBuffer->getDeviceAddress();
 
 	cpuDrawBuffers.geometryBuffer = ICPUBuffer::create({ size });
 }
 
-void DrawResourcesFiller::allocateStylesBuffer(ILogicalDevice* logicalDevice, uint32_t lineStylesCount)
-{
-	{
-		maxLineStyles = lineStylesCount;
-		size_t lineStylesBufferSize = lineStylesCount * sizeof(LineStyle);
-
-		IGPUBuffer::SCreationParams lineStylesCreationParams = {};
-		lineStylesCreationParams.size = lineStylesBufferSize;
-		lineStylesCreationParams.usage = IGPUBuffer::EUF_STORAGE_BUFFER_BIT | IGPUBuffer::EUF_TRANSFER_DST_BIT;
-		gpuDrawBuffers.lineStylesBuffer = logicalDevice->createBuffer(std::move(lineStylesCreationParams));
-		gpuDrawBuffers.lineStylesBuffer->setObjectDebugName("lineStylesBuffer");
-
-		IDeviceMemoryBacked::SDeviceMemoryRequirements memReq = gpuDrawBuffers.lineStylesBuffer->getMemoryReqs();
-		memReq.memoryTypeBits &= logicalDevice->getPhysicalDevice()->getDeviceLocalMemoryTypeBits();
-		auto stylesBufferMem = logicalDevice->allocate(memReq, gpuDrawBuffers.lineStylesBuffer.get());
-
-		cpuDrawBuffers.lineStylesBuffer = ICPUBuffer::create({ lineStylesBufferSize });
-	}
-}
-
-void DrawResourcesFiller::allocateDTMSettingsBuffer(ILogicalDevice* logicalDevice, uint32_t dtmSettingsCount)
-{
-	maxDtmSettings = dtmSettingsCount;
-	size_t dtmSettingsBufferSize = dtmSettingsCount * sizeof(DTMSettings);
-	
-	IGPUBuffer::SCreationParams dtmSettingsCreationParams = {};
-	dtmSettingsCreationParams.size = dtmSettingsBufferSize;
-	dtmSettingsCreationParams.usage = IGPUBuffer::EUF_STORAGE_BUFFER_BIT | IGPUBuffer::EUF_TRANSFER_DST_BIT;
-	gpuDrawBuffers.dtmSettingsBuffer = logicalDevice->createBuffer(std::move(dtmSettingsCreationParams));
-	gpuDrawBuffers.dtmSettingsBuffer->setObjectDebugName("dtmSettingsBuffer");
-	
-	IDeviceMemoryBacked::SDeviceMemoryRequirements memReq = gpuDrawBuffers.dtmSettingsBuffer->getMemoryReqs();
-	memReq.memoryTypeBits &= logicalDevice->getPhysicalDevice()->getDeviceLocalMemoryTypeBits();
-	auto stylesBufferMem = logicalDevice->allocate(memReq, gpuDrawBuffers.dtmSettingsBuffer.get());
-	
-	cpuDrawBuffers.dtmSettingsBuffer = ICPUBuffer::create({ dtmSettingsBufferSize });
-}
-
 void DrawResourcesFiller::allocateMSDFTextures(ILogicalDevice* logicalDevice, uint32_t maxMSDFs, uint32_t2 msdfsExtent)
 {
 	msdfLRUCache = std::unique_ptr<MSDFsLRUCache>(new MSDFsLRUCache(maxMSDFs));
@@ -265,7 +192,7 @@ void DrawResourcesFiller::drawTriangleMesh(const CTriangleMesh& mesh, CTriangleM
 		currentGeometryBufferSize += indexBuffByteSize;
 
 		dst = reinterpret_cast<char*>(cpuDrawBuffers.geometryBuffer->getPointer()) + currentGeometryBufferSize;
-		drawData.pushConstants.triangleMeshVerticesBaseAddress = geometryBufferAddress + currentGeometryBufferSize;
+		drawData.pushConstants.triangleMeshVerticesBaseAddress = drawResourcesBDA + currentGeometryBufferSize;
 		memcpy(dst, mesh.getVertices().data(), vtxBuffByteSize);
 		currentGeometryBufferSize += vtxBuffByteSize;
 	}
@@ -373,13 +300,56 @@ void DrawResourcesFiller::drawFontGlyph(
 	}
 }
 
+void DrawResourcesFiller::_test_addImageObject(float64_t2 topLeftPos, float32_t2 size, float32_t rotation, SIntendedSubmitInfo& intendedNextSubmit)
+{
+	auto addImageObject_Internal = [&](const ImageObjectInfo& imageObjectInfo, uint32_t mainObjIdx) -> bool
+		{
+			const uint32_t maxGeometryBufferImageObjects = static_cast<uint32_t>((maxGeometryBufferSize - currentGeometryBufferSize) / sizeof(ImageObjectInfo));
+			uint32_t uploadableObjects = (maxIndexCount / 6u) - currentDrawObjectCount;
+			uploadableObjects = core::min(uploadableObjects, maxDrawObjects - currentDrawObjectCount);
+			uploadableObjects = core::min(uploadableObjects, maxGeometryBufferImageObjects);
+
+			if (uploadableObjects >= 1u)
+			{
+				void* dstGeom = reinterpret_cast<char*>(cpuDrawBuffers.geometryBuffer->getPointer()) + currentGeometryBufferSize;
+				memcpy(dstGeom, &imageObjectInfo, sizeof(ImageObjectInfo));
+				uint64_t geomBufferAddr = drawResourcesBDA + currentGeometryBufferSize;
+				currentGeometryBufferSize += sizeof(ImageObjectInfo);
+
+				DrawObject drawObj = {};
+				drawObj.type_subsectionIdx = uint32_t(static_cast<uint16_t>(ObjectType::IMAGE) | (0 << 16)); // TODO: use custom pack/unpack function
+				drawObj.mainObjIndex = mainObjIdx;
+				drawObj.geometryAddress = geomBufferAddr;
+				void* dstDrawObj = reinterpret_cast<DrawObject*>(cpuDrawBuffers.drawObjectsBuffer->getPointer()) + currentDrawObjectCount;
+				memcpy(dstDrawObj, &drawObj, sizeof(DrawObject));
+				currentDrawObjectCount += 1u;
+
+				return true;
+			}
+			else
+				return false;
+		};
+
+	uint32_t mainObjIdx = addMainObject_SubmitIfNeeded(InvalidStyleIdx, InvalidDTMSettingsIdx, intendedNextSubmit);
+
+	ImageObjectInfo info = {};
+	info.topLeft = topLeftPos;
+	info.dirU = float32_t2(size.x * cos(rotation), size.x * sin(rotation)); // 
+	info.aspectRatio = size.y / size.x;
+	info.textureID = 0u;
+	if (!addImageObject_Internal(info, mainObjIdx))
+	{
+		// single image object couldn't fit into memory to push to gpu, so we submit rendering current objects and reset geometry buffer and draw objects
+		submitCurrentDrawObjectsAndReset(intendedNextSubmit, mainObjIdx);
+		bool success = addImageObject_Internal(info, mainObjIdx);
+		assert(success); // this should always be true, otherwise it's either bug in code or not enough memory allocated to hold a single image object 
+	}
+}
+
 bool DrawResourcesFiller::finalizeAllCopiesToGPU(SIntendedSubmitInfo& intendedNextSubmit)
 {
 	bool success = true;
-	success &= finalizeMainObjectCopiesToGPU(intendedNextSubmit);
-	success &= finalizeGeometryCopiesToGPU(intendedNextSubmit);
-	success &= finalizeLineStyleCopiesToGPU(intendedNextSubmit);
-	success &= finalizeDTMSettingsCopiesToGPU(intendedNextSubmit);
+	success &= finalizeBufferCopies(intendedNextSubmit);
 	success &= finalizeTextureCopies(intendedNextSubmit);
 	return success;
 }
@@ -461,100 +431,59 @@ void DrawResourcesFiller::popClipProjectionData()
 	clipProjectionAddresses.pop_back();
 }
 
-bool DrawResourcesFiller::finalizeMainObjectCopiesToGPU(SIntendedSubmitInfo& intendedNextSubmit)
+bool DrawResourcesFiller::finalizeBufferCopies(SIntendedSubmitInfo& intendedNextSubmit)
 {
-	bool success = true;
-	// Copy MainObjects
-	uint32_t remainingMainObjects = currentMainObjectCount - inMemMainObjectCount;
-	SBufferRange<IGPUBuffer> mainObjectsRange = { sizeof(MainObject) * inMemMainObjectCount, sizeof(MainObject) * remainingMainObjects, gpuDrawBuffers.mainObjectsBuffer };
-	if (mainObjectsRange.size > 0u)
-	{
-		const MainObject* srcMainObjData = reinterpret_cast<MainObject*>(cpuDrawBuffers.mainObjectsBuffer->getPointer()) + inMemMainObjectCount;
-		if (m_utilities->updateBufferRangeViaStagingBuffer(intendedNextSubmit, mainObjectsRange, srcMainObjData))
-			inMemMainObjectCount = currentMainObjectCount;
-		else
-		{
-			// TODO: Log
-			success = false;
-		}
-	}
-	return success;
-}
+	size_t offset = 0ull;
 
-bool DrawResourcesFiller::finalizeGeometryCopiesToGPU(SIntendedSubmitInfo& intendedNextSubmit)
-{
-	bool success = true;
-	// Copy DrawObjects
-	uint32_t remainingDrawObjects = currentDrawObjectCount - inMemDrawObjectCount;
-	SBufferRange<IGPUBuffer> drawObjectsRange = { sizeof(DrawObject) * inMemDrawObjectCount, sizeof(DrawObject) * remainingDrawObjects, gpuDrawBuffers.drawObjectsBuffer };
-	if (drawObjectsRange.size > 0u)
-	{
-		const DrawObject* srcDrawObjData = reinterpret_cast<DrawObject*>(cpuDrawBuffers.drawObjectsBuffer->getPointer()) + inMemDrawObjectCount;
-		if (m_utilities->updateBufferRangeViaStagingBuffer(intendedNextSubmit, drawObjectsRange, srcDrawObjData))
-			inMemDrawObjectCount = currentDrawObjectCount;
-		else
-		{
-			// TODO: Log
-			success = false;
-		}
-	}
+	assert(drawBuffers.calculateTotalConsumption() <= drawResourcesGPUBuffer->getSize());
 
-	// Copy GeometryBuffer
-	uint64_t remainingGeometrySize = currentGeometryBufferSize - inMemGeometryBufferSize;
-	SBufferRange<IGPUBuffer> geomRange = { inMemGeometryBufferSize, remainingGeometrySize, gpuDrawBuffers.geometryBuffer };
-	if (geomRange.size > 0u)
-	{
-		const uint8_t* srcGeomData = reinterpret_cast<uint8_t*>(cpuDrawBuffers.geometryBuffer->getPointer()) + inMemGeometryBufferSize;
-		if (m_utilities->updateBufferRangeViaStagingBuffer(intendedNextSubmit, geomRange, srcGeomData))
-			inMemGeometryBufferSize = currentGeometryBufferSize;
-		else
+	auto copyCPUFilledDrawBuffer = [&](auto& drawBuffer) -> bool
 		{
-			// TODO: Log
-			success = false;
-		}
-	}
-	return success;
-}
+			// drawBuffer must be of type CPUFilledDrawBuffer<T>
+			SBufferRange<IGPUBuffer> copyRange = { offset, drawBuffer.getStorageSize(), drawResourcesGPUBuffer};
 
-bool DrawResourcesFiller::finalizeLineStyleCopiesToGPU(SIntendedSubmitInfo& intendedNextSubmit)
-{
-	bool success = true;
-	// Copy LineStyles
-	uint32_t remainingLineStyles = currentLineStylesCount - inMemLineStylesCount;
-	SBufferRange<IGPUBuffer> stylesRange = { sizeof(LineStyle) * inMemLineStylesCount, sizeof(LineStyle) * remainingLineStyles, gpuDrawBuffers.lineStylesBuffer };
-	if (stylesRange.size > 0u)
-	{
-		LineStyle* srcLineStylesData = reinterpret_cast<LineStyle*>(cpuDrawBuffers.lineStylesBuffer->getPointer()) + inMemLineStylesCount;
+			if (copyRange.offset + copyRange.size > drawResourcesGPUBuffer->getSize())
+			{
+				// TODO: LOG ERROR, this shouldn't happen with correct auto-submission mechanism
+				assert(false);
+				return false;
+			}
 
-		if (m_utilities->updateBufferRangeViaStagingBuffer(intendedNextSubmit, stylesRange, srcLineStylesData))
-			inMemLineStylesCount = currentLineStylesCount;
-		else
+			if (copyRange.size > 0ull)
+			{
+				drawBuffer.bufferOffset = copyRange.offset;
+				if (!m_utilities->updateBufferRangeViaStagingBuffer(intendedNextSubmit, copyRange, drawBuffer.vector.data()))
+					return false;
+				offset += drawBuffer.getAlignedStorageSize();
+			}
+			return true;
+		};
+	
+	auto addComputeReservedFilledDrawBuffer = [&](auto& drawBuffer) -> bool
 		{
-			// TODO: Log
-			success = false;
-		}
-	}
-	return success;
-}
+			// drawBuffer must be of type ComputeReservedDrawBuffer<T>
+			SBufferRange<IGPUBuffer> copyRange = { offset, drawBuffer.getStorageSize(), drawResourcesGPUBuffer};
 
-bool DrawResourcesFiller::finalizeDTMSettingsCopiesToGPU(SIntendedSubmitInfo& intendedNextSubmit)
-{
-	bool success = true;
-	// Copy DTM settings
-	uint32_t remainingDTMSettings = currentDTMSettingsCount - inMemDTMSettingsCount;
-	SBufferRange<IGPUBuffer> dtmSettingsRange = { sizeof(DTMSettings) * inMemDTMSettingsCount, sizeof(DTMSettings) * remainingDTMSettings, gpuDrawBuffers.dtmSettingsBuffer };
-	if (dtmSettingsRange.size > 0u)
-	{
-		const DTMSettings* srcDTMSettingsData = reinterpret_cast<DTMSettings*>(cpuDrawBuffers.dtmSettingsBuffer->getPointer()) + inMemDTMSettingsCount;
-		if (m_utilities->updateBufferRangeViaStagingBuffer(intendedNextSubmit, dtmSettingsRange, srcDTMSettingsData))
-			inMemDTMSettingsCount = currentDTMSettingsCount;
-		else
-		{
-			// TODO: Log
-			success = false;
-		}
-	}
-	return success;
+			if (copyRange.offset + copyRange.size > drawResourcesGPUBuffer->getSize())
+			{
+				// TODO: LOG ERROR, this shouldn't happen with correct auto-submission mechanism
+				assert(false);
+				return false;
+			}
+
+			drawBuffer.bufferOffset = copyRange.offset;
+			offset += drawBuffer.getAlignedStorageSize();
+		};
+
+	copyCPUFilledDrawBuffer(drawBuffers.lineStyles);
+	copyCPUFilledDrawBuffer(drawBuffers.dtmSettings);
+	copyCPUFilledDrawBuffer(drawBuffers.clipProjections);
+	copyCPUFilledDrawBuffer(drawBuffers.mainObjects);
+	copyCPUFilledDrawBuffer(drawBuffers.drawObjects);
+	copyCPUFilledDrawBuffer(drawBuffers.indexBuffer);
+	copyCPUFilledDrawBuffer(drawBuffers.geometryInfo);
+
+	return true;
 }
 
 bool DrawResourcesFiller::finalizeTextureCopies(SIntendedSubmitInfo& intendedNextSubmit)
@@ -872,7 +801,7 @@ uint64_t DrawResourcesFiller::addClipProjectionData_Internal(const ClipProjectio
 	uint8_t* dst = reinterpret_cast<uint8_t*>(cpuDrawBuffers.geometryBuffer->getPointer()) + currentGeometryBufferSize;
 	memcpy(dst, &clipProjectionData, sizeof(ClipProjectionData));
 
-	const uint64_t ret = currentGeometryBufferSize + geometryBufferAddress;
+	const uint64_t ret = currentGeometryBufferSize + drawResourcesBDA;
 	currentGeometryBufferSize += sizeof(ClipProjectionData);
 	return ret;
 }
@@ -904,7 +833,7 @@ void DrawResourcesFiller::addPolylineConnectors_Internal(const CPolylineBase& po
 	DrawObject drawObj = {};
 	drawObj.mainObjIndex = mainObjIdx;
 	drawObj.type_subsectionIdx = uint32_t(static_cast<uint16_t>(ObjectType::POLYLINE_CONNECTOR) | 0 << 16);
-	drawObj.geometryAddress = geometryBufferAddress + currentGeometryBufferSize;
+	drawObj.geometryAddress = drawResourcesBDA + currentGeometryBufferSize;
 	for (uint32_t i = 0u; i < objectsToUpload; ++i)
 	{
 		void* dst = reinterpret_cast<DrawObject*>(cpuDrawBuffers.drawObjectsBuffer->getPointer()) + currentDrawObjectCount;
@@ -946,7 +875,7 @@ void DrawResourcesFiller::addLines_Internal(const CPolylineBase& polyline, const
 	DrawObject drawObj = {};
 	drawObj.mainObjIndex = mainObjIdx;
 	drawObj.type_subsectionIdx = uint32_t(static_cast<uint16_t>(ObjectType::LINE) | 0 << 16);
-	drawObj.geometryAddress = geometryBufferAddress + currentGeometryBufferSize;
+	drawObj.geometryAddress = drawResourcesBDA + currentGeometryBufferSize;
 	for (uint32_t i = 0u; i < objectsToUpload; ++i)
 	{
 		void* dst = reinterpret_cast<DrawObject*>(cpuDrawBuffers.drawObjectsBuffer->getPointer()) + currentDrawObjectCount;
@@ -987,7 +916,7 @@ void DrawResourcesFiller::addQuadBeziers_Internal(const CPolylineBase& polyline,
 	// Add DrawObjs
 	DrawObject drawObj = {};
 	drawObj.mainObjIndex = mainObjIdx;
-	drawObj.geometryAddress = geometryBufferAddress + currentGeometryBufferSize;
+	drawObj.geometryAddress = drawResourcesBDA + currentGeometryBufferSize;
 	for (uint32_t i = 0u; i < objectsToUpload; ++i)
 	{
 		for (uint16_t subObject = 0; subObject < CagesPerQuadBezier; subObject++)
@@ -1033,7 +962,7 @@ void DrawResourcesFiller::addHatch_Internal(const Hatch& hatch, uint32_t& curren
 			static_assert(sizeof(CurveBox) == sizeof(Hatch::CurveHatchBox));
 			void* dst = reinterpret_cast<char*>(cpuDrawBuffers.geometryBuffer->getPointer()) + currentGeometryBufferSize;
 			memcpy(dst, &hatchBox, sizeof(CurveBox));
-			hatchBoxAddress = geometryBufferAddress + currentGeometryBufferSize;
+			hatchBoxAddress = drawResourcesBDA + currentGeometryBufferSize;
 			currentGeometryBufferSize += sizeof(CurveBox);
 		}
 
@@ -1062,7 +991,7 @@ bool DrawResourcesFiller::addFontGlyph_Internal(const GlyphInfo& glyphInfo, uint
 	{
 		void* geomDst = reinterpret_cast<char*>(cpuDrawBuffers.geometryBuffer->getPointer()) + currentGeometryBufferSize;
 		memcpy(geomDst, &glyphInfo, sizeof(GlyphInfo));
-		uint64_t fontGlyphAddr = geometryBufferAddress + currentGeometryBufferSize;
+		uint64_t fontGlyphAddr = drawResourcesBDA + currentGeometryBufferSize;
 		currentGeometryBufferSize += sizeof(GlyphInfo);
 
 		DrawObject drawObj = {};
diff --git a/62_CAD/DrawResourcesFiller.h b/62_CAD/DrawResourcesFiller.h
index 98dffa90e..bc456f806 100644
--- a/62_CAD/DrawResourcesFiller.h
+++ b/62_CAD/DrawResourcesFiller.h
@@ -14,22 +14,10 @@ using namespace nbl::asset;
 using namespace nbl::ext::TextRendering;
 
 static_assert(sizeof(DrawObject) == 16u);
-static_assert(sizeof(MainObject) == 16u);
-static_assert(sizeof(Globals) == 128u);
+static_assert(sizeof(MainObject) == 12u);
 static_assert(sizeof(LineStyle) == 88u);
 static_assert(sizeof(ClipProjectionData) == 88u);
 
-template <typename BufferType>
-struct DrawBuffers
-{
-	smart_refctd_ptr<BufferType> indexBuffer; // only is valid for IGPUBuffer because it's filled at allocation time and never touched again
-	smart_refctd_ptr<BufferType> mainObjectsBuffer;
-	smart_refctd_ptr<BufferType> drawObjectsBuffer;
-	smart_refctd_ptr<BufferType> geometryBuffer;
-	smart_refctd_ptr<BufferType> lineStylesBuffer;
-	smart_refctd_ptr<BufferType> dtmSettingsBuffer;
-};
-
 // ! DrawResourcesFiller
 // ! This class provides important functionality to manage resources needed for a draw.
 // ! Drawing new objects (polylines, hatches, etc.) should go through this function.
@@ -39,9 +27,67 @@ struct DrawBuffers
 struct DrawResourcesFiller
 {
 public:
+	
+	/// @brief general parent struct for 1.ComputeReserved and 2.CPUFilled DrawBuffers
+	struct DrawBuffer
+	{
+		static constexpr size_t Alignment = 8u;
+		static constexpr size_t InvalidBufferOffset = ~0u;
+		size_t bufferOffset = InvalidBufferOffset; // set when copy to gpu buffer is issued
+		virtual size_t getCount() const = 0;
+		virtual size_t getStorageSize() const = 0;
+		virtual size_t getAlignedStorageSize() const { core::alignUp(getStorageSize(), Alignment); }
+	};
+
+	/// @brief DrawBuffer reserved for compute shader stages input/output
+	template <typename T>
+	struct ComputeReservedDrawBuffer : DrawBuffer
+	{
+		size_t count = 0ull;
+		size_t getCount() const override { return count; }
+		size_t getStorageSize() const override  { return count * sizeof(T); }
+	};
+
+	/// @brief DrawBuffer which is filled by CPU, packed and sent to GPU
+	template <typename T>
+	struct CPUFilledDrawBuffer : DrawBuffer
+	{
+		core::vector<T> vector;
+		size_t getCount() const { return vector.size(); }
+		size_t getStorageSize() const { return vector.size() * sizeof(T); }
+	};
+
+	/// @brief struct to hold all draw buffers
+	struct DrawBuffers
+	{
+		// auto-submission level 0 buffers (settings that mainObj references)
+		CPUFilledDrawBuffer<LineStyle> lineStyles;
+		CPUFilledDrawBuffer<DTMSettings> dtmSettings;
+		CPUFilledDrawBuffer<ClipProjectionData> clipProjections;
+	
+		// auto-submission level 1 buffers (mainObj that drawObjs references, if all drawObjs+idxBuffer+geometryInfo doesn't fit into mem this will be broken down into many)
+		CPUFilledDrawBuffer<MainObject> mainObjects;
 
-	typedef uint32_t index_buffer_type;
+		// auto-submission level 2 buffers
+		CPUFilledDrawBuffer<DrawObject> drawObjects;
+		CPUFilledDrawBuffer<uint32_t> indexBuffer;
+		CPUFilledDrawBuffer<uint8_t> geometryInfo; // general purpose byte buffer for custom geometries, etc
 
+		// Get Total memory consumption, If all DrawBuffers get packed together with DrawBuffer::Alignment
+		// Useful to know when to know when to overflow
+		size_t calculateTotalConsumption() const
+		{
+			return
+				lineStyles.getAlignedStorageSize() +
+				dtmSettings.getAlignedStorageSize() +
+				clipProjections.getAlignedStorageSize() +
+				mainObjects.getAlignedStorageSize() +
+				drawObjects.getAlignedStorageSize() +
+				indexBuffer.getAlignedStorageSize() +
+				geometryInfo.getAlignedStorageSize();
+		}
+	};
+	
 	DrawResourcesFiller();
 
 	DrawResourcesFiller(smart_refctd_ptr<IUtilities>&& utils, IQueue* copyQueue);
@@ -49,18 +95,8 @@ struct DrawResourcesFiller
 	typedef std::function<void(SIntendedSubmitInfo&)> SubmitFunc;
 	void setSubmitDrawsFunction(const SubmitFunc& func);
 
-	void allocateIndexBuffer(ILogicalDevice* logicalDevice, uint32_t indices);
-
-	void allocateMainObjectsBuffer(ILogicalDevice* logicalDevice, uint32_t mainObjects);
-
-	void allocateDrawObjectsBuffer(ILogicalDevice* logicalDevice, uint32_t drawObjects);
+	void allocateDrawResourcesBuffer(ILogicalDevice* logicalDevice, size_t size);
 
-	void allocateGeometryBuffer(ILogicalDevice* logicalDevice, size_t size);
-
-	void allocateStylesBuffer(ILogicalDevice* logicalDevice, uint32_t lineStylesCount);
-
-	void allocateDTMSettingsBuffer(ILogicalDevice* logicalDevice, uint32_t dtmSettingsCount);
-	
 	void allocateMSDFTextures(ILogicalDevice* logicalDevice, uint32_t maxMSDFs, uint32_t2 msdfsExtent);
 
 	// functions that user should set to get MSDF texture if it's not available in cache.
@@ -118,80 +154,10 @@ struct DrawResourcesFiller
 		float64_t2 topLeftPos,
 		float32_t2 size,
 		float32_t rotation,
-		SIntendedSubmitInfo& intendedNextSubmit)
-	{
-		auto addImageObject_Internal = [&](const ImageObjectInfo& imageObjectInfo, uint32_t mainObjIdx) -> bool
-			{
-				const uint32_t maxGeometryBufferImageObjects = static_cast<uint32_t>((maxGeometryBufferSize - currentGeometryBufferSize) / sizeof(ImageObjectInfo));
-				uint32_t uploadableObjects = (maxIndexCount / 6u) - currentDrawObjectCount;
-				uploadableObjects = core::min(uploadableObjects, maxDrawObjects - currentDrawObjectCount);
-				uploadableObjects = core::min(uploadableObjects, maxGeometryBufferImageObjects);
-
-				if (uploadableObjects >= 1u)
-				{
-					void* dstGeom = reinterpret_cast<char*>(cpuDrawBuffers.geometryBuffer->getPointer()) + currentGeometryBufferSize;
-					memcpy(dstGeom, &imageObjectInfo, sizeof(ImageObjectInfo));
-					uint64_t geomBufferAddr = geometryBufferAddress + currentGeometryBufferSize;
-					currentGeometryBufferSize += sizeof(ImageObjectInfo);
-
-					DrawObject drawObj = {};
-					drawObj.type_subsectionIdx = uint32_t(static_cast<uint16_t>(ObjectType::IMAGE) | (0 << 16)); // TODO: use custom pack/unpack function
-					drawObj.mainObjIndex = mainObjIdx;
-					drawObj.geometryAddress = geomBufferAddr;
-					void* dstDrawObj = reinterpret_cast<DrawObject*>(cpuDrawBuffers.drawObjectsBuffer->getPointer()) + currentDrawObjectCount;
-					memcpy(dstDrawObj, &drawObj, sizeof(DrawObject));
-					currentDrawObjectCount += 1u;
-
-					return true;
-				}
-				else
-					return false;
-			};
-		
-		uint32_t mainObjIdx = addMainObject_SubmitIfNeeded(InvalidStyleIdx, InvalidDTMSettingsIdx, intendedNextSubmit);
-
-		ImageObjectInfo info = {};
-		info.topLeft = topLeftPos;
-		info.dirU = float32_t2(size.x * cos(rotation), size.x * sin(rotation)); // 
-		info.aspectRatio = size.y / size.x;
-		info.textureID = 0u;
-		if (!addImageObject_Internal(info, mainObjIdx))
-		{
-			// single image object couldn't fit into memory to push to gpu, so we submit rendering current objects and reset geometry buffer and draw objects
-			submitCurrentDrawObjectsAndReset(intendedNextSubmit, mainObjIdx);
-			bool success = addImageObject_Internal(info, mainObjIdx);
-			assert(success); // this should always be true, otherwise it's either bug in code or not enough memory allocated to hold a single image object 
-		}
-	}
+		SIntendedSubmitInfo& intendedNextSubmit);
 
 	bool finalizeAllCopiesToGPU(SIntendedSubmitInfo& intendedNextSubmit);
 
-	inline uint32_t getLineStyleCount() const { return currentLineStylesCount; }
-
-	inline uint32_t getDrawObjectCount() const { return currentDrawObjectCount; } 
-
-	inline uint32_t getMainObjectCount() const { return currentMainObjectCount; }
-
-	inline size_t getCurrentMainObjectsBufferSize() const
-	{
-		return sizeof(MainObject) * currentMainObjectCount;
-	}
-
-	inline size_t getCurrentDrawObjectsBufferSize() const
-	{
-		return sizeof(DrawObject) * currentDrawObjectCount;
-	}
-
-	inline size_t getCurrentGeometryBufferSize() const
-	{
-		return currentGeometryBufferSize;
-	}
-
-	inline size_t getCurrentLineStylesBufferSize() const
-	{
-		return sizeof(LineStyle) * currentLineStylesCount;
-	}
-
 	void reset()
 	{
 		resetGeometryCounters();
@@ -200,8 +166,8 @@ struct DrawResourcesFiller
 		resetDTMSettingsCounters();
 	}
 
-	DrawBuffers<ICPUBuffer> cpuDrawBuffers;
-	DrawBuffers<IGPUBuffer> gpuDrawBuffers;
+	DrawBuffers drawBuffers; // will be compacted and copied into gpu draw resources
+	nbl::core::smart_refctd_ptr<IGPUBuffer> drawResourcesGPUBuffer;
 
 	uint32_t addLineStyle_SubmitIfNeeded(const LineStyleInfo& lineStyle, SIntendedSubmitInfo& intendedNextSubmit);
 
@@ -242,16 +208,8 @@ struct DrawResourcesFiller
 
 	SubmitFunc submitDraws;
 	
-	bool finalizeMainObjectCopiesToGPU(SIntendedSubmitInfo& intendedNextSubmit);
-
-	bool finalizeGeometryCopiesToGPU(SIntendedSubmitInfo& intendedNextSubmit);
+	bool finalizeBufferCopies(SIntendedSubmitInfo& intendedNextSubmit);
 
-	bool finalizeLineStyleCopiesToGPU(SIntendedSubmitInfo& intendedNextSubmit);
-
-	bool finalizeDTMSettingsCopiesToGPU(SIntendedSubmitInfo& intendedNextSubmit);
-	
-	bool finalizeCustomClipProjectionCopiesToGPU(SIntendedSubmitInfo& intendedNextSubmit);
-	
 	bool finalizeTextureCopies(SIntendedSubmitInfo& intendedNextSubmit);
 
 	// Internal Function to call whenever we overflow while filling our buffers with geometry (potential limiters: indexBuffer, drawObjectsBuffer or geometryBuffer)
@@ -430,29 +388,7 @@ struct DrawResourcesFiller
 	smart_refctd_ptr<IUtilities> m_utilities;
 	IQueue* m_copyQueue;
 
-	uint32_t maxIndexCount;
-
-	uint32_t inMemMainObjectCount = 0u;
-	uint32_t currentMainObjectCount = 0u;
-	uint32_t maxMainObjects = 0u;
-
-	uint32_t inMemDrawObjectCount = 0u;
-	uint32_t currentDrawObjectCount = 0u;
-	uint32_t maxDrawObjects = 0u;
-
-	uint64_t inMemGeometryBufferSize = 0u;
-	uint64_t currentGeometryBufferSize = 0u;
-	uint64_t maxGeometryBufferSize = 0u;
-
-	uint32_t inMemLineStylesCount = 0u;
-	uint32_t currentLineStylesCount = 0u;
-	uint32_t maxLineStyles = 0u;
-
-	uint32_t inMemDTMSettingsCount = 0u;
-	uint32_t currentDTMSettingsCount = 0u;
-	uint32_t maxDtmSettings = 0u;
-
-	uint64_t geometryBufferAddress = 0u; // Actual BDA offset 0 of the gpu buffer
+	uint64_t drawResourcesBDA = 0u; // Actual BDA offset 0 of the gpu buffer
 
 	std::deque<ClipProjectionData> clipProjections; // stack of clip projectios stored so we can resubmit them if geometry buffer got reset.
 	std::deque<uint64_t> clipProjectionAddresses; // stack of clip projection gpu addresses in geometry buffer. to keep track of them in push/pops
diff --git a/62_CAD/main.cpp b/62_CAD/main.cpp
index eff8fd3e1..7dd60ca47 100644
--- a/62_CAD/main.cpp
+++ b/62_CAD/main.cpp
@@ -288,19 +288,8 @@ class ComputerAidedDesign final : public examples::SimpleWindowedApplication, pu
 	{
 		drawResourcesFiller = DrawResourcesFiller(core::smart_refctd_ptr(m_utils), getGraphicsQueue());
 
-		// TODO: move individual allocations to DrawResourcesFiller::allocateResources(memory)
-		// Issue warning error, if we can't store our largest geomm struct + clip proj data inside geometry buffer along linestyle and mainObject 
-		uint32_t maxIndices = maxObjects * 6u * 2u;
-		drawResourcesFiller.allocateIndexBuffer(m_device.get(), maxIndices);
-		drawResourcesFiller.allocateMainObjectsBuffer(m_device.get(), maxObjects);
-		drawResourcesFiller.allocateDrawObjectsBuffer(m_device.get(), maxObjects * 5u);
-		drawResourcesFiller.allocateStylesBuffer(m_device.get(), 512u);
-		drawResourcesFiller.allocateDTMSettingsBuffer(m_device.get(), 512u);
-
-		// * 3 because I just assume there is on average 3x beziers per actual object (cause we approximate other curves/arcs with beziers now)
-		// + 128 ClipProjData
-		size_t geometryBufferSize = maxObjects * sizeof(QuadraticBezierInfo) * 3 + 128 * sizeof(ClipProjectionData);
-		drawResourcesFiller.allocateGeometryBuffer(m_device.get(), geometryBufferSize);
+		size_t bufferSize = 512u * 1024u * 1024u; // 512 MB
+		drawResourcesFiller.allocateDrawResourcesBuffer(m_device.get(), bufferSize);
 		drawResourcesFiller.allocateMSDFTextures(m_device.get(), 256u, uint32_t2(MSDFSize, MSDFSize));
 
 		{
@@ -314,14 +303,6 @@ class ComputerAidedDesign final : public examples::SimpleWindowedApplication, pu
 			auto globalsBufferMem = m_device->allocate(memReq, m_globalsBuffer.get());
 		}
 		
-		size_t sumBufferSizes =
-			drawResourcesFiller.gpuDrawBuffers.drawObjectsBuffer->getSize() +
-			drawResourcesFiller.gpuDrawBuffers.geometryBuffer->getSize() +
-			drawResourcesFiller.gpuDrawBuffers.indexBuffer->getSize() +
-			drawResourcesFiller.gpuDrawBuffers.lineStylesBuffer->getSize() +
-			drawResourcesFiller.gpuDrawBuffers.mainObjectsBuffer->getSize();
-		m_logger->log("Buffers Size = %.2fKB", ILogger::E_LOG_LEVEL::ELL_INFO, sumBufferSizes / 1024.0f);
-
 		// pseudoStencil
 		{
 			asset::E_FORMAT pseudoStencilFormat = asset::EF_R32_UINT;
@@ -778,7 +759,7 @@ class ComputerAidedDesign final : public examples::SimpleWindowedApplication, pu
 			{
 				descriptorSet0 = descriptorPool->createDescriptorSet(smart_refctd_ptr(descriptorSetLayout0));
 				descriptorSet1 = descriptorPool->createDescriptorSet(smart_refctd_ptr(descriptorSetLayout1));
-				constexpr uint32_t DescriptorCountSet0 = 7u;
+				constexpr uint32_t DescriptorCountSet0 = 3u;
 				video::IGPUDescriptorSet::SDescriptorInfo descriptorInfosSet0[DescriptorCountSet0] = {};
 
 				// Descriptors For Set 0:
@@ -786,31 +767,15 @@ class ComputerAidedDesign final : public examples::SimpleWindowedApplication, pu
 				descriptorInfosSet0[0u].info.buffer.size = m_globalsBuffer->getCreationParams().size;
 				descriptorInfosSet0[0u].desc = m_globalsBuffer;
 
-				descriptorInfosSet0[1u].info.buffer.offset = 0u;
-				descriptorInfosSet0[1u].info.buffer.size = drawResourcesFiller.gpuDrawBuffers.drawObjectsBuffer->getCreationParams().size;
-				descriptorInfosSet0[1u].desc = drawResourcesFiller.gpuDrawBuffers.drawObjectsBuffer;
-				
-				descriptorInfosSet0[2u].info.buffer.offset = 0u;
-				descriptorInfosSet0[2u].info.buffer.size = drawResourcesFiller.gpuDrawBuffers.mainObjectsBuffer->getCreationParams().size;
-				descriptorInfosSet0[2u].desc = drawResourcesFiller.gpuDrawBuffers.mainObjectsBuffer;
-
-				descriptorInfosSet0[3u].info.buffer.offset = 0u;
-				descriptorInfosSet0[3u].info.buffer.size = drawResourcesFiller.gpuDrawBuffers.lineStylesBuffer->getCreationParams().size;
-				descriptorInfosSet0[3u].desc = drawResourcesFiller.gpuDrawBuffers.lineStylesBuffer;
-				
-				descriptorInfosSet0[4u].info.buffer.offset = 0u;
-				descriptorInfosSet0[4u].info.buffer.size = drawResourcesFiller.gpuDrawBuffers.dtmSettingsBuffer->getCreationParams().size;
-				descriptorInfosSet0[4u].desc = drawResourcesFiller.gpuDrawBuffers.dtmSettingsBuffer;
-
-				descriptorInfosSet0[5u].info.combinedImageSampler.imageLayout = IImage::LAYOUT::READ_ONLY_OPTIMAL;
-				descriptorInfosSet0[5u].info.combinedImageSampler.sampler = msdfTextureSampler;
-				descriptorInfosSet0[5u].desc = drawResourcesFiller.getMSDFsTextureArray();
+				descriptorInfosSet0[1u].info.combinedImageSampler.imageLayout = IImage::LAYOUT::READ_ONLY_OPTIMAL;
+				descriptorInfosSet0[1u].info.combinedImageSampler.sampler = msdfTextureSampler;
+				descriptorInfosSet0[1u].desc = drawResourcesFiller.getMSDFsTextureArray();
 				
-				descriptorInfosSet0[6u].desc = msdfTextureSampler; // TODO[Erfan]: different sampler and make immutable?
+				descriptorInfosSet0[2u].desc = msdfTextureSampler; // TODO[Erfan]: different sampler and make immutable?
 				
 				// This is bindless to we write to it later.
-				// descriptorInfosSet0[6u].info.image.imageLayout = IImage::LAYOUT::READ_ONLY_OPTIMAL;
-				// descriptorInfosSet0[6u].desc = drawResourcesFiller.getMSDFsTextureArray();
+				// descriptorInfosSet0[3u].info.image.imageLayout = IImage::LAYOUT::READ_ONLY_OPTIMAL;
+				// descriptorInfosSet0[3u].desc = drawResourcesFiller.getMSDFsTextureArray();
 
 				// Descriptors For Set 1:
 				constexpr uint32_t DescriptorCountSet1 = 2u;
@@ -834,60 +799,32 @@ class ComputerAidedDesign final : public examples::SimpleWindowedApplication, pu
 				descriptorUpdates[0u].count = 1u;
 				descriptorUpdates[0u].info = &descriptorInfosSet0[0u];
 
-					// drawObjectsBuffer
+					// mdfs textures
 				descriptorUpdates[1u].dstSet = descriptorSet0.get();
 				descriptorUpdates[1u].binding = 1u;
 				descriptorUpdates[1u].arrayElement = 0u;
 				descriptorUpdates[1u].count = 1u;
 				descriptorUpdates[1u].info = &descriptorInfosSet0[1u];
-
-					// mainObjectsBuffer
+				
+					// general texture sampler	
 				descriptorUpdates[2u].dstSet = descriptorSet0.get();
 				descriptorUpdates[2u].binding = 2u;
 				descriptorUpdates[2u].arrayElement = 0u;
 				descriptorUpdates[2u].count = 1u;
 				descriptorUpdates[2u].info = &descriptorInfosSet0[2u];
 
-					// lineStylesBuffer
-				descriptorUpdates[3u].dstSet = descriptorSet0.get();
-				descriptorUpdates[3u].binding = 3u;
+				// Set 1 Updates:
+				descriptorUpdates[3u].dstSet = descriptorSet1.get();
+				descriptorUpdates[3u].binding = 0u;
 				descriptorUpdates[3u].arrayElement = 0u;
 				descriptorUpdates[3u].count = 1u;
-				descriptorUpdates[3u].info = &descriptorInfosSet0[3u];
-				
-					// dtmSettingsBuffer
-				descriptorUpdates[4u].dstSet = descriptorSet0.get();
-				descriptorUpdates[4u].binding = 4u;
+				descriptorUpdates[3u].info = &descriptorInfosSet1[0u];
+
+				descriptorUpdates[4u].dstSet = descriptorSet1.get();
+				descriptorUpdates[4u].binding = 1u;
 				descriptorUpdates[4u].arrayElement = 0u;
 				descriptorUpdates[4u].count = 1u;
-				descriptorUpdates[4u].info = &descriptorInfosSet0[4u];
-
-					// mdfs textures
-				descriptorUpdates[5u].dstSet = descriptorSet0.get();
-				descriptorUpdates[5u].binding = 5u;
-				descriptorUpdates[5u].arrayElement = 0u;
-				descriptorUpdates[5u].count = 1u;
-				descriptorUpdates[5u].info = &descriptorInfosSet0[5u];
-				
-					// mdfs samplers	
-				descriptorUpdates[6u].dstSet = descriptorSet0.get();
-				descriptorUpdates[6u].binding = 6u;
-				descriptorUpdates[6u].arrayElement = 0u;
-				descriptorUpdates[6u].count = 1u;
-				descriptorUpdates[6u].info = &descriptorInfosSet0[6u];
-
-				// Set 1 Updates:
-				descriptorUpdates[7u].dstSet = descriptorSet1.get();
-				descriptorUpdates[7u].binding = 0u;
-				descriptorUpdates[7u].arrayElement = 0u;
-				descriptorUpdates[7u].count = 1u;
-				descriptorUpdates[7u].info = &descriptorInfosSet1[0u];
-
-				descriptorUpdates[8u].dstSet = descriptorSet1.get();
-				descriptorUpdates[8u].binding = 1u;
-				descriptorUpdates[8u].arrayElement = 0u;
-				descriptorUpdates[8u].count = 1u;
-				descriptorUpdates[8u].info = &descriptorInfosSet1[1u];
+				descriptorUpdates[4u].info = &descriptorInfosSet1[1u];
 
 				m_device->updateDescriptorSets(DescriptorUpdatesCount, descriptorUpdates, 0u, nullptr);
 			}
@@ -2977,7 +2914,6 @@ class ComputerAidedDesign final : public examples::SimpleWindowedApplication, pu
 					default:
 						m_logger->log("Failed to load ICPUImage or ICPUImageView got some other Asset Type, skipping!",ILogger::ELL_ERROR);
 				}
-			
 
 				// create matching size gpu image
 				smart_refctd_ptr<IGPUImage> gpuImg;
@@ -3015,7 +2951,7 @@ class ComputerAidedDesign final : public examples::SimpleWindowedApplication, pu
 				{
 					{
 						.dstSet = descriptorSet0.get(),
-						.binding = 6u,
+						.binding = 3u,
 						.arrayElement = 0u,
 						.count = 1u,
 						.info = &dsInfo,
diff --git a/62_CAD/shaders/globals.hlsl b/62_CAD/shaders/globals.hlsl
index 8412b29ad..f9c89d45c 100644
--- a/62_CAD/shaders/globals.hlsl
+++ b/62_CAD/shaders/globals.hlsl
@@ -53,24 +53,32 @@ static_assert(offsetof(ClipProjectionData, minClipNDC) == 72u);
 static_assert(offsetof(ClipProjectionData, maxClipNDC) == 80u);
 #endif
 
-struct Globals
+struct Pointers
 {
-    ClipProjectionData defaultClipProjection; // 88
-    pfloat64_t screenToWorldRatio; // 96
-    pfloat64_t worldToScreenRatio; // 100
-    uint32_t2 resolution; // 108
-    float antiAliasingFactor; // 112
-    float miterLimit; // 116
-    float32_t2 _padding; // 128
+    uint64_t lineStyles;
+    uint64_t dtmSettings;
+    uint64_t customClipProjections;
+    uint64_t mainObjects;
+    uint64_t drawObjects;
+    uint64_t geometryBuffer;
 };
+#ifndef __HLSL_VERSION
+static_assert(sizeof(Pointers) == 48u);
+#endif
 
+struct Globals
+{
+    Pointers pointers;
+    ClipProjectionData defaultClipProjection;
+    pfloat64_t screenToWorldRatio;
+    pfloat64_t worldToScreenRatio;
+    uint32_t2 resolution;
+    float antiAliasingFactor;
+    float miterLimit;
+    float32_t2 _padding;
+};
 #ifndef __HLSL_VERSION
-static_assert(offsetof(Globals, defaultClipProjection) == 0u);
-static_assert(offsetof(Globals, screenToWorldRatio) == 88u);
-static_assert(offsetof(Globals, worldToScreenRatio) == 96u);
-static_assert(offsetof(Globals, resolution) == 104u);
-static_assert(offsetof(Globals, antiAliasingFactor) == 112u);
-static_assert(offsetof(Globals, miterLimit) == 116u);
+static_assert(sizeof(Globals) == 176u);
 #endif
 
 #ifdef __HLSL_VERSION
@@ -127,7 +135,7 @@ struct MainObject
 {
     uint32_t styleIdx;
     uint32_t dtmSettingsIdx;
-    uint64_t clipProjectionAddress;
+    uint32_t clipProjectionIndex;
 };
 
 struct DrawObject
@@ -137,6 +145,8 @@ struct DrawObject
     uint64_t geometryAddress;
 };
 
+
+// Goes into geometry buffer, needs to be aligned by 8
 struct LinePointInfo
 {
     pfloat64_t2 p;
@@ -144,6 +154,7 @@ struct LinePointInfo
     float32_t stretchValue;
 };
 
+// Goes into geometry buffer, needs to be aligned by 8
 struct QuadraticBezierInfo
 {
     nbl::hlsl::shapes::QuadraticBezier<pfloat64_t> shape; // 48bytes = 3 (control points) x 16 (float64_t2)
@@ -154,6 +165,7 @@ struct QuadraticBezierInfo
 static_assert(offsetof(QuadraticBezierInfo, phaseShift) == 48u);
 #endif
 
+// Goes into geometry buffer, needs to be aligned by 8
 struct GlyphInfo
 {
     pfloat64_t2 topLeft; // 2 * 8 = 16 bytes
@@ -198,6 +210,7 @@ struct GlyphInfo
     }
 };
 
+// Goes into geometry buffer, needs to be aligned by 8
 struct ImageObjectInfo
 {
     pfloat64_t2  topLeft; // 2 * 8 = 16 bytes (16)
@@ -247,6 +260,7 @@ struct PolylineConnector
 };
 
 // NOTE: Don't attempt to pack curveMin/Max to uints because of limited range of values, we need the logarithmic precision of floats (more precision near 0)
+// Goes into geometry buffer, needs to be aligned by 8
 struct CurveBox
 {
     // will get transformed in the vertex shader, and will be calculated on the cpu when generating these boxes
@@ -362,6 +376,7 @@ struct DTMSettings
         return DISCRETE_FIXED_LENGTH_INTERVALS;
     }
 };
+
 #ifndef __HLSL_VERSION
 inline bool operator==(const LineStyle& lhs, const LineStyle& rhs)
 {
@@ -390,7 +405,6 @@ inline bool operator==(const DTMSettings& lhs, const DTMSettings& rhs)
     return lhs.outlineLineStyleIdx == rhs.outlineLineStyleIdx &&
         lhs.contourLineStyleIdx == rhs.contourLineStyleIdx;
 }
-
 #endif
 
 NBL_CONSTEXPR uint32_t MainObjectIdxBits = 24u; // It will be packed next to alpha in a texture
@@ -399,15 +413,44 @@ NBL_CONSTEXPR uint32_t MaxIndexableMainObjects = (1u << MainObjectIdxBits) - 1u;
 NBL_CONSTEXPR uint32_t InvalidStyleIdx = nbl::hlsl::numeric_limits<uint32_t>::max;
 NBL_CONSTEXPR uint32_t InvalidDTMSettingsIdx = nbl::hlsl::numeric_limits<uint32_t>::max;
 NBL_CONSTEXPR uint32_t InvalidMainObjectIdx = MaxIndexableMainObjects;
-NBL_CONSTEXPR uint64_t InvalidClipProjectionAddress = nbl::hlsl::numeric_limits<uint64_t>::max;
+NBL_CONSTEXPR uint32_t InvalidClipProjectionIndex = nbl::hlsl::numeric_limits<uint32_t>::max;
 NBL_CONSTEXPR uint32_t InvalidTextureIdx = nbl::hlsl::numeric_limits<uint32_t>::max;
+
+// Hatches
 NBL_CONSTEXPR MajorAxis SelectedMajorAxis = MajorAxis::MAJOR_Y;
-// TODO: get automatic version working on HLSL
 NBL_CONSTEXPR MajorAxis SelectedMinorAxis = MajorAxis::MAJOR_X; //(MajorAxis) (1 - (uint32_t) SelectedMajorAxis);
+
+// Text or MSDF Hatches
 NBL_CONSTEXPR float MSDFPixelRange = 4.0f;
 NBL_CONSTEXPR float MSDFPixelRangeHalf = MSDFPixelRange / 2.0f;
 NBL_CONSTEXPR float MSDFSize = 32.0f; 
 NBL_CONSTEXPR uint32_t MSDFMips = 4; 
 NBL_CONSTEXPR float HatchFillMSDFSceenSpaceSize = 8.0; 
 
+#ifdef __HLSL_VERSION
+[[vk::binding(0, 0)]] ConstantBuffer<Globals> globals : register(b0);
+
+LineStyle loadLineStyle(const uint32_t index)
+{
+    return vk::RawBufferLoad<LineStyle>(globals.pointers.lineStyles + index * sizeof(LineStyle), 8u);
+}
+DTMSettings loadDTMSettings(const uint32_t index)
+{
+    return vk::RawBufferLoad<DTMSetting>(globals.pointers.dtmSettings + index * sizeof(DTMSetting), 8u);
+}
+ClipProjectionData loadCustomClipProjection(const uint32_t index)
+{
+    return vk::RawBufferLoad<ClipProjectionData>(globals.pointers.customClipProjections + index * sizeof(ClipProjectionData), 8u);
+}
+MainObject loadMainObject(const uint32_t index)
+{
+    return vk::RawBufferLoad<MainObject>(globals.pointers.mainObjs + index * sizeof(MainObject), 8u);
+}
+DrawObject loadDrawObject(const uint32_t index)
+{
+    return vk::RawBufferLoad<DrawObject>(globals.pointers.drawObjs + index * sizeof(DrawObject), 8u);
+}
+#endif
+
+
 #endif
diff --git a/62_CAD/shaders/main_pipeline/common.hlsl b/62_CAD/shaders/main_pipeline/common.hlsl
index 261e336f3..0cf4e3bce 100644
--- a/62_CAD/shaders/main_pipeline/common.hlsl
+++ b/62_CAD/shaders/main_pipeline/common.hlsl
@@ -233,17 +233,14 @@ struct PSInput
 };
 
 // Set 0 - Scene Data and Globals, buffer bindings don't change the buffers only get updated
-[[vk::binding(0, 0)]] ConstantBuffer<Globals> globals : register(b0);
-[[vk::binding(1, 0)]] StructuredBuffer<DrawObject> drawObjects : register(t0);
-[[vk::binding(2, 0)]] StructuredBuffer<MainObject> mainObjects : register(t1);
-[[vk::binding(3, 0)]] StructuredBuffer<LineStyle> lineStyles : register(t2);
-[[vk::binding(4, 0)]] StructuredBuffer<DTMSettings> dtmSettings : register(t3);
 
-[[vk::combinedImageSampler]][[vk::binding(5, 0)]] Texture2DArray<float3> msdfTextures : register(t4);
-[[vk::combinedImageSampler]][[vk::binding(5, 0)]] SamplerState msdfSampler : register(s4);
+// [[vk::binding(0, 0)]] ConstantBuffer<Globals> globals; ---> moved to globals.hlsl
 
-[[vk::binding(6, 0)]] SamplerState textureSampler : register(s5);
-[[vk::binding(7, 0)]] Texture2D textures[128] : register(t5);
+[[vk::combinedImageSampler]][[vk::binding(1, 0)]] Texture2DArray<float3> msdfTextures : register(t4);
+[[vk::combinedImageSampler]][[vk::binding(1, 0)]] SamplerState msdfSampler : register(s4);
+
+[[vk::binding(2, 0)]] SamplerState textureSampler : register(s5);
+[[vk::binding(3, 0)]] Texture2D textures[128] : register(t5);
 
 // Set 1 - Window dependant data which has higher update frequency due to multiple windows and resize need image recreation and descriptor writes
 [[vk::binding(0, 1)]] globallycoherent RWTexture2D<uint> pseudoStencil : register(u0);
diff --git a/62_CAD/shaders/main_pipeline/fragment_shader.hlsl b/62_CAD/shaders/main_pipeline/fragment_shader.hlsl
index cddac89ba..4852d0522 100644
--- a/62_CAD/shaders/main_pipeline/fragment_shader.hlsl
+++ b/62_CAD/shaders/main_pipeline/fragment_shader.hlsl
@@ -357,10 +357,10 @@ float32_t4 calculateFinalColor(const uint2 fragCoord, const float localAlpha, co
 template<>
 float32_t4 calculateFinalColor<false>(const uint2 fragCoord, const float localAlpha, const uint32_t currentMainObjectIdx, float3 localTextureColor, bool colorFromTexture)
 {
-    uint32_t styleIdx = mainObjects[currentMainObjectIdx].styleIdx;
+    uint32_t styleIdx = loadMainObject(currentMainObjectIdx).styleIdx;
     if (!colorFromTexture)
     {
-        float32_t4 col = lineStyles[styleIdx].color;
+        float32_t4 col = loadLineStyle(styleIdx).color;
         col.w *= localAlpha;
         return float4(col);
     }
@@ -387,7 +387,7 @@ float32_t4 calculateFinalColor<true>(const uint2 fragCoord, const float localAlp
     // sampling from colorStorage needs to happen in critical section because another fragment may also want to store into it at the same time + need to happen before store
     if (resolve)
     {
-        toResolveStyleIdx = mainObjects[storedMainObjectIdx].styleIdx;
+        toResolveStyleIdx = loadMainObject(storedMainObjectIdx).styleIdx;
         if (toResolveStyleIdx == InvalidStyleIdx) // if style idx to resolve is invalid, then it means we should resolve from color
             color = float32_t4(unpackR11G11B10_UNORM(colorStorage[fragCoord]), 1.0f);
     }
@@ -409,7 +409,7 @@ float32_t4 calculateFinalColor<true>(const uint2 fragCoord, const float localAlp
     // draw with previous geometry's style's color or stored in texture buffer :kek:
     // we don't need to load the style's color in critical section because we've already retrieved the style index from the stored main obj
     if (toResolveStyleIdx != InvalidStyleIdx) // if toResolveStyleIdx is valid then that means our resolved color should come from line style
-        color = lineStyles[toResolveStyleIdx].color;
+        color = loadLineStyle(toResolveStyleIdx).color;
     color.a *= float(storedQuantizedAlpha) / 255.f;
     
     return color;
@@ -424,7 +424,7 @@ float4 fragMain(PSInput input) : SV_TARGET
 
     ObjectType objType = input.getObjType();
     const uint32_t currentMainObjectIdx = input.getMainObjectIdx();
-    const MainObject mainObj = mainObjects[currentMainObjectIdx];
+    const MainObject mainObj = loadMainObject(currentMainObjectIdx);
 
 #ifdef DTM
     // TRIANGLE RENDERING
@@ -435,9 +435,9 @@ float4 fragMain(PSInput input) : SV_TARGET
         const float stretch = 1.0f; // TODO: figure out what is it for
         const float worldToScreenRatio = input.getCurrentWorldToScreenRatio();
 
-        DTMSettings dtm = dtmSettings[mainObj.dtmSettingsIdx];
-        LineStyle outlineStyle = lineStyles[dtm.outlineLineStyleIdx];
-        LineStyle contourStyle = lineStyles[dtm.contourLineStyleIdx];
+        DTMSettings dtm = loadDTMSettings(mainObj.dtmSettingsIdx);
+        LineStyle outlineStyle = loadLineStyle(dtm.outlineLineStyleIdx);
+        LineStyle contourStyle = loadLineStyle(dtm.contourLineStyleIdx);
 
         float3 v[3];
         v[0] = input.getScreenSpaceVertexAttribs(0);
@@ -690,7 +690,7 @@ float4 fragMain(PSInput input) : SV_TARGET
 
             nbl::hlsl::shapes::Line<float> lineSegment = nbl::hlsl::shapes::Line<float>::construct(start, end);
 
-            LineStyle style = lineStyles[styleIdx];
+            LineStyle style = loadLineStyle(styleIdx);
 
             if (!style.hasStipples() || stretch == InvalidStyleStretchValue)
             {
@@ -699,7 +699,7 @@ float4 fragMain(PSInput input) : SV_TARGET
             else
             {
                 nbl::hlsl::shapes::Line<float>::ArcLengthCalculator arcLenCalc = nbl::hlsl::shapes::Line<float>::ArcLengthCalculator::construct(lineSegment);
-                LineStyleClipper clipper = LineStyleClipper::construct(lineStyles[styleIdx], lineSegment, arcLenCalc, phaseShift, stretch, worldToScreenRatio);
+                LineStyleClipper clipper = LineStyleClipper::construct(loadLineStyle(styleIdx), lineSegment, arcLenCalc, phaseShift, stretch, worldToScreenRatio);
                 distance = ClippedSignedDistance<nbl::hlsl::shapes::Line<float>, LineStyleClipper>::sdf(lineSegment, input.position.xy, thickness, style.isRoadStyleFlag, clipper);
             }
         }
@@ -714,14 +714,14 @@ float4 fragMain(PSInput input) : SV_TARGET
             const float stretch = input.getPatternStretch();
             const float worldToScreenRatio = input.getCurrentWorldToScreenRatio();
 
-            LineStyle style = lineStyles[styleIdx];
+            LineStyle style = loadLineStyle(styleIdx);
             if (!style.hasStipples() || stretch == InvalidStyleStretchValue)
             {
                 distance = ClippedSignedDistance< nbl::hlsl::shapes::Quadratic<float> >::sdf(quadratic, input.position.xy, thickness, style.isRoadStyleFlag);
             }
             else
             {
-                BezierStyleClipper clipper = BezierStyleClipper::construct(lineStyles[styleIdx], quadratic, arcLenCalc, phaseShift, stretch, worldToScreenRatio);
+                BezierStyleClipper clipper = BezierStyleClipper::construct(loadLineStyle(styleIdx), quadratic, arcLenCalc, phaseShift, stretch, worldToScreenRatio);
                 distance = ClippedSignedDistance<nbl::hlsl::shapes::Quadratic<float>, BezierStyleClipper>::sdf(quadratic, input.position.xy, thickness, style.isRoadStyleFlag, clipper);
             }
         }
@@ -847,7 +847,7 @@ float4 fragMain(PSInput input) : SV_TARGET
             localAlpha = 1.0f - smoothstep(0.0, globals.antiAliasingFactor, dist);
         }
 
-        LineStyle style = lineStyles[mainObj.styleIdx];
+        LineStyle style = loadLineStyle(mainObj.styleIdx);
         uint32_t textureId = asuint(style.screenSpaceLineWidth);
         if (textureId != InvalidTextureIdx)
         {
@@ -883,7 +883,7 @@ float4 fragMain(PSInput input) : SV_TARGET
             */
             msdf *= exp2(max(mipLevel,0.0));
             
-            LineStyle style = lineStyles[mainObj.styleIdx];
+            LineStyle style = loadLineStyle(mainObj.styleIdx);
             const float screenPxRange = input.getFontGlyphPxRange() / MSDFPixelRangeHalf;
             const float bolden = style.worldSpaceLineWidth * screenPxRange; // worldSpaceLineWidth is actually boldenInPixels, aliased TextStyle with LineStyle
             localAlpha = smoothstep(+globals.antiAliasingFactor / 2.0f + bolden, -globals.antiAliasingFactor / 2.0f + bolden, msdf);
diff --git a/62_CAD/shaders/main_pipeline/resolve_alphas.hlsl b/62_CAD/shaders/main_pipeline/resolve_alphas.hlsl
index 46c5d28e0..c75c86825 100644
--- a/62_CAD/shaders/main_pipeline/resolve_alphas.hlsl
+++ b/62_CAD/shaders/main_pipeline/resolve_alphas.hlsl
@@ -32,7 +32,7 @@ float32_t4 calculateFinalColor<true>(const uint2 fragCoord)
     // sampling from colorStorage needs to happen in critical section because another fragment may also want to store into it at the same time + need to happen before store
     if (resolve)
     {
-        toResolveStyleIdx = mainObjects[storedMainObjectIdx].styleIdx;
+        toResolveStyleIdx = loadMainObject(storedMainObjectIdx).styleIdx;
         if (toResolveStyleIdx == InvalidStyleIdx) // if style idx to resolve is invalid, then it means we should resolve from color
             color = float32_t4(unpackR11G11B10_UNORM(colorStorage[fragCoord]), 1.0f);
     }
@@ -45,7 +45,7 @@ float32_t4 calculateFinalColor<true>(const uint2 fragCoord)
     // draw with previous geometry's style's color or stored in texture buffer :kek:
     // we don't need to load the style's color in critical section because we've already retrieved the style index from the stored main obj
     if (toResolveStyleIdx != InvalidStyleIdx) // if toResolveStyleIdx is valid then that means our resolved color should come from line style
-        color = lineStyles[toResolveStyleIdx].color;
+        color = loadLineStyle(toResolveStyleIdx).color;
     color.a *= float(storedQuantizedAlpha) / 255.f;
     
     return color;
diff --git a/62_CAD/shaders/main_pipeline/vertex_shader.hlsl b/62_CAD/shaders/main_pipeline/vertex_shader.hlsl
index f7af0d8a6..5abe693ec 100644
--- a/62_CAD/shaders/main_pipeline/vertex_shader.hlsl
+++ b/62_CAD/shaders/main_pipeline/vertex_shader.hlsl
@@ -27,19 +27,10 @@ float2 QuadraticBezier(float2 p0, float2 p1, float2 p2, float t)
 
 ClipProjectionData getClipProjectionData(in MainObject mainObj)
 {
-    if (mainObj.clipProjectionAddress != InvalidClipProjectionAddress)
-    {
-        ClipProjectionData ret;
-        ret.projectionToNDC = vk::RawBufferLoad<pfloat64_t3x3>(mainObj.clipProjectionAddress, 8u);
-        ret.minClipNDC      = vk::RawBufferLoad<float32_t2>(mainObj.clipProjectionAddress + sizeof(pfloat64_t3x3), 8u);
-        ret.maxClipNDC      = vk::RawBufferLoad<float32_t2>(mainObj.clipProjectionAddress + sizeof(pfloat64_t3x3) + sizeof(float32_t2), 8u);
-
-        return ret;
-    }
+    if (mainObj.clipProjectionIndex != InvalidClipProjectionIndex)
+        return loadCustomClipProjection(mainObj.clipProjectionIndex);
     else
-    {
         return globals.defaultClipProjection;
-    }
 }
 
 float2 transformPointScreenSpace(pfloat64_t3x3 transformation, uint32_t2 resolution, pfloat64_t2 point2d)
@@ -112,7 +103,7 @@ PSInput main(uint vertexID : SV_VertexID)
     vtxPos.x = _static_cast<pfloat64_t>(vtx.pos.x);
     vtxPos.y = _static_cast<pfloat64_t>(vtx.pos.y);
 
-    MainObject mainObj = mainObjects[pc.triangleMeshMainObjectIndex];
+    MainObject mainObj = loadMainObject(pc.triangleMeshMainObjectIndex);
     ClipProjectionData clipProjectionData = getClipProjectionData(mainObj);
 
     float2 transformedPos = transformPointScreenSpace(clipProjectionData.projectionToNDC, globals.resolution, vtxPos);
@@ -129,8 +120,8 @@ PSInput main(uint vertexID : SV_VertexID)
 
     // TODO: line style of contour line has to be set too!
     DTMSettings dtm = dtmSettings[mainObj.dtmSettingsIdx];
-    LineStyle outlineStyle = lineStyles[dtm.outlineLineStyleIdx];
-    LineStyle contourStyle = lineStyles[dtm.contourLineStyleIdx];
+    LineStyle outlineStyle = loadLineStyle(dtm.outlineLineStyleIdx);
+    LineStyle contourStyle = loadLineStyle(dtm.contourLineStyleIdx);
     const float screenSpaceOutlineWidth = outlineStyle.screenSpaceLineWidth + _static_cast<float>(_static_cast<pfloat64_t>(outlineStyle.worldSpaceLineWidth) * globals.screenToWorldRatio);
     const float sdfOutlineThickness = screenSpaceOutlineWidth * 0.5f;
     const float screenSpaceContourLineWidth = contourStyle.screenSpaceLineWidth + _static_cast<float>(_static_cast<pfloat64_t>(contourStyle.worldSpaceLineWidth) * globals.screenToWorldRatio);
@@ -145,7 +136,7 @@ PSInput main(uint vertexID : SV_VertexID)
     const uint vertexIdx = vertexID & 0x3u;
     const uint objectID = vertexID >> 2;
 
-    DrawObject drawObj = drawObjects[objectID];
+    DrawObject drawObj = loadDrawObject(objectID);
 
     ObjectType objType = (ObjectType)(drawObj.type_subsectionIdx & 0x0000FFFF);
     uint32_t subsectionIdx = drawObj.type_subsectionIdx >> 16;
@@ -161,13 +152,13 @@ PSInput main(uint vertexID : SV_VertexID)
     outV.setObjType(objType);
     outV.setMainObjectIdx(drawObj.mainObjIndex);
     
-    MainObject mainObj = mainObjects[drawObj.mainObjIndex];
+    MainObject mainObj = loadMainObject(drawObj.mainObjIndex);
     ClipProjectionData clipProjectionData = getClipProjectionData(mainObj);
     
     // We only need these for Outline type objects like lines and bezier curves
     if (objType == ObjectType::LINE || objType == ObjectType::QUAD_BEZIER || objType == ObjectType::POLYLINE_CONNECTOR)
     {
-        LineStyle lineStyle = lineStyles[mainObj.styleIdx];
+        LineStyle lineStyle = loadLineStyle(mainObj.styleIdx);
 
         // Width is on both sides, thickness is one one side of the curve (div by 2.0f)
         const float screenSpaceLineWidth = lineStyle.screenSpaceLineWidth + _static_cast<float>(_static_cast<pfloat64_t>(lineStyle.worldSpaceLineWidth) * globals.screenToWorldRatio);
@@ -545,7 +536,7 @@ PSInput main(uint vertexID : SV_VertexID)
     }
     else if (objType == ObjectType::FONT_GLYPH)
     {
-        LineStyle lineStyle = lineStyles[mainObj.styleIdx];
+        LineStyle lineStyle = loadLineStyle(mainObj.styleIdx);
         const float italicTiltSlope = lineStyle.screenSpaceLineWidth; // aliased text style member with line style
         
         GlyphInfo glyphInfo;

From 3a2ff1421f81089749694db3032ce52068731551 Mon Sep 17 00:00:00 2001
From: keptsecret <sorchon@gmail.com>
Date: Mon, 31 Mar 2025 14:55:00 +0700
Subject: [PATCH 117/529] test subgroup2 funcs correct

---
 71_ArithmeticBench/app_resources/common.hlsl  |  3 +-
 .../app_resources/shaderCommon.hlsl           | 62 ++++++++++++++-----
 .../app_resources/testSubgroup.comp.hlsl      |  2 +-
 71_ArithmeticBench/main.cpp                   | 57 +++++++++--------
 4 files changed, 78 insertions(+), 46 deletions(-)

diff --git a/71_ArithmeticBench/app_resources/common.hlsl b/71_ArithmeticBench/app_resources/common.hlsl
index 10892a2b9..8921659db 100644
--- a/71_ArithmeticBench/app_resources/common.hlsl
+++ b/71_ArithmeticBench/app_resources/common.hlsl
@@ -10,7 +10,6 @@ struct Output
 	uint32_t data[ScanElementCount];
 };
 
-// Thanks to our unified HLSL/C++ STD lib we're able to remove a whole load of code
 template<typename T>
 struct bit_and : nbl::hlsl::bit_and<T>
 {
@@ -93,4 +92,4 @@ struct ballot : nbl::hlsl::plus<T>
 #endif
 };
 
-#include "nbl/builtin/hlsl/subgroup/basic.hlsl"
\ No newline at end of file
+#include "nbl/builtin/hlsl/subgroup/basic.hlsl"
diff --git a/71_ArithmeticBench/app_resources/shaderCommon.hlsl b/71_ArithmeticBench/app_resources/shaderCommon.hlsl
index 13ee8d21e..e7105da62 100644
--- a/71_ArithmeticBench/app_resources/shaderCommon.hlsl
+++ b/71_ArithmeticBench/app_resources/shaderCommon.hlsl
@@ -2,7 +2,7 @@
 
 #include "nbl/builtin/hlsl/glsl_compat/core.hlsl"
 #include "nbl/builtin/hlsl/subgroup/basic.hlsl"
-#include "nbl/builtin/hlsl/subgroup/arithmetic_portability.hlsl"
+#include "nbl/builtin/hlsl/subgroup2/arithmetic_portability.hlsl"
 
 #include "nbl/builtin/hlsl/jit/device_capabilities.hlsl"
 
@@ -19,37 +19,67 @@ uint32_t globalIndex();
 // since we test ITEMS_PER_WG<WorkgroupSize we need this so workgroups don't overwrite each other's outputs
 bool canStore();
 
+#ifndef ITEMS_PER_INVOCATION
+#error "Define ITEMS_PER_INVOCATION!"
+#endif
 //typedef decltype(inputValue[0]) type_t;
-typedef uint32_t type_t;
+//typedef uint32_t type_t;
+//typedef uint32_t4 type_t;
+typedef vector<uint32_t, ITEMS_PER_INVOCATION> type_t;
 
 
 #ifndef OPERATION
 #error "Define OPERATION!"
 #endif
-template<template<class> class binop>
+// template<template<class> class binop>
+// static void subtest(NBL_CONST_REF_ARG(type_t) sourceVal)
+// {
+// 	if (globalIndex()==0u)
+// 		output[binop::BindingIndex].template Store<uint32_t>(0,nbl::hlsl::glsl::gl_SubgroupSize());
+		
+// 	operation_t<typename binop<type_t>::base_t,nbl::hlsl::jit::device_capabilities> func;
+// 	if (canStore())
+// 		output[binop::BindingIndex].template Store<type_t>(sizeof(uint32_t)+sizeof(type_t)*globalIndex(),func(sourceVal));
+// }
+
+#ifndef SUBGROUP_SIZE_LOG2
+#error "Define SUBGROUP_SIZE_LOG2!"
+#endif
+template<template<class> class binop, typename T, uint32_t N>
 static void subtest(NBL_CONST_REF_ARG(type_t) sourceVal)
 {
+	// TODO static assert vector<T, N> == type_t
+	//using type_t = vector<T, N>;
+	using config_t = nbl::hlsl::subgroup::Configuration<SUBGROUP_SIZE_LOG2>;
+	using params_t = nbl::hlsl::subgroup2::ArithmeticParams<config_t, typename binop<T>::base_t, N, nbl::hlsl::jit::device_capabilities>;
+
 	if (globalIndex()==0u)
-		output[binop<type_t>::BindingIndex].template Store<uint32_t>(0,nbl::hlsl::glsl::gl_SubgroupSize());
+		output[binop<T>::BindingIndex].template Store<uint32_t>(0,nbl::hlsl::glsl::gl_SubgroupSize());
 		
-	operation_t<typename binop<type_t>::base_t,nbl::hlsl::jit::device_capabilities> func;
+	operation_t<params_t> func;
 	if (canStore())
-		output[binop<type_t>::BindingIndex].template Store<type_t>(sizeof(uint32_t)+sizeof(type_t)*globalIndex(),func(sourceVal));
+		output[binop<T>::BindingIndex].template Store<type_t>(sizeof(uint32_t)+sizeof(type_t)*globalIndex(),func(sourceVal));
 }
 
 
 type_t test()
 {
-	const type_t sourceVal = inputValue[globalIndex()];
-
-	subtest<bit_and>(sourceVal);
-	subtest<bit_xor>(sourceVal);
-	subtest<bit_or>(sourceVal);
-	subtest<plus>(sourceVal);
-	subtest<multiplies>(sourceVal);
-	subtest<minimum>(sourceVal);
-	subtest<maximum>(sourceVal);
+	const uint32_t idx = globalIndex() * ITEMS_PER_INVOCATION;
+	type_t sourceVal;
+	[unroll]
+	for (uint32_t i = 0; i < ITEMS_PER_INVOCATION; i++)
+	{
+		sourceVal[i] = inputValue[idx + i];
+	}
+
+	subtest<bit_and, uint32_t, ITEMS_PER_INVOCATION>(sourceVal);
+	subtest<bit_xor, uint32_t, ITEMS_PER_INVOCATION>(sourceVal);
+	subtest<bit_or, uint32_t, ITEMS_PER_INVOCATION>(sourceVal);
+	subtest<plus, uint32_t, ITEMS_PER_INVOCATION>(sourceVal);
+	subtest<multiplies, uint32_t, ITEMS_PER_INVOCATION>(sourceVal);
+	subtest<minimum, uint32_t, ITEMS_PER_INVOCATION>(sourceVal);
+	subtest<maximum, uint32_t, ITEMS_PER_INVOCATION>(sourceVal);
 	return sourceVal;
 }
 
-#include "nbl/builtin/hlsl/workgroup/basic.hlsl"
\ No newline at end of file
+#include "nbl/builtin/hlsl/workgroup/basic.hlsl"
diff --git a/71_ArithmeticBench/app_resources/testSubgroup.comp.hlsl b/71_ArithmeticBench/app_resources/testSubgroup.comp.hlsl
index 479265d73..50173ce42 100644
--- a/71_ArithmeticBench/app_resources/testSubgroup.comp.hlsl
+++ b/71_ArithmeticBench/app_resources/testSubgroup.comp.hlsl
@@ -15,4 +15,4 @@ bool canStore() {return true;}
 void main()
 {
 	test();
-}
\ No newline at end of file
+}
diff --git a/71_ArithmeticBench/main.cpp b/71_ArithmeticBench/main.cpp
index 0952d2b57..00cfbcf35 100644
--- a/71_ArithmeticBench/main.cpp
+++ b/71_ArithmeticBench/main.cpp
@@ -200,14 +200,17 @@ class ArithmeticBenchApp final : public application_templates::BasicMultiQueueAp
 				return false;
 			}
 		}
-
-		const auto MaxWorkgroupSize = m_physicalDevice->getLimits().maxComputeWorkGroupInvocations;
+		
+		// TODO variable items per invocation?
+		const uint32_t ItemsPerInvocation = 4u;
+		const std::array<uint32_t, 5> workgroupSizes = { 64, 128, 256, 512, 1024 };
+		// const auto MaxWorkgroupSize = m_physicalDevice->getLimits().maxComputeWorkGroupInvocations;
 		const auto MinSubgroupSize = m_physicalDevice->getLimits().minSubgroupSize;
 		const auto MaxSubgroupSize = m_physicalDevice->getLimits().maxSubgroupSize;
 		for (auto subgroupSize=MinSubgroupSize; subgroupSize <= MaxSubgroupSize; subgroupSize *= 2u)
 		{
 			const uint8_t subgroupSizeLog2 = hlsl::findMSB(subgroupSize);
-			for (uint32_t workgroupSize = subgroupSize; workgroupSize <= MaxWorkgroupSize; workgroupSize += subgroupSize)
+			for (const auto& workgroupSize : workgroupSizes)
 			{
 				// make sure renderdoc captures everything for debugging
 				m_api->startCapture();
@@ -221,16 +224,16 @@ class ArithmeticBenchApp final : public application_templates::BasicMultiQueueAp
 				logTestOutcome(passed, workgroupSize);
 				passed = runTest<emulatedScanExclusive, false>(subgroupTestSource, elementCount, subgroupSizeLog2, workgroupSize) && passed;
 				logTestOutcome(passed, workgroupSize);
-				for (uint32_t itemsPerWG = workgroupSize; itemsPerWG > workgroupSize - subgroupSize; itemsPerWG--)
-				{
-					m_logger->log("Testing Item Count %u", ILogger::ELL_INFO, itemsPerWG);
-					passed = runTest<emulatedReduction, true>(workgroupTestSource, elementCount, subgroupSizeLog2, workgroupSize, itemsPerWG) && passed;
-					logTestOutcome(passed, itemsPerWG);
-					passed = runTest<emulatedScanInclusive, true>(workgroupTestSource, elementCount, subgroupSizeLog2, workgroupSize, itemsPerWG) && passed;
-					logTestOutcome(passed, itemsPerWG);
-					passed = runTest<emulatedScanExclusive, true>(workgroupTestSource, elementCount, subgroupSizeLog2, workgroupSize, itemsPerWG) && passed;
-					logTestOutcome(passed, itemsPerWG);
-				}
+				//for (uint32_t itemsPerWG = workgroupSize; itemsPerWG > workgroupSize - subgroupSize; itemsPerWG--)
+				//{
+				//	m_logger->log("Testing Item Count %u", ILogger::ELL_INFO, itemsPerWG);
+				//	passed = runTest<emulatedReduction, true>(workgroupTestSource, elementCount, subgroupSizeLog2, workgroupSize, itemsPerWG) && passed;
+				//	logTestOutcome(passed, itemsPerWG);
+				//	passed = runTest<emulatedScanInclusive, true>(workgroupTestSource, elementCount, subgroupSizeLog2, workgroupSize, itemsPerWG) && passed;
+				//	logTestOutcome(passed, itemsPerWG);
+				//	passed = runTest<emulatedScanExclusive, true>(workgroupTestSource, elementCount, subgroupSizeLog2, workgroupSize, itemsPerWG) && passed;
+				//	logTestOutcome(passed, itemsPerWG);
+				//}
 				m_api->endCapture();
 
 				// save cache every now and then	
@@ -301,30 +304,30 @@ class ArithmeticBenchApp final : public application_templates::BasicMultiQueueAp
 	}*/
 
 	template<template<class> class Arithmetic, bool WorkgroupTest>
-	bool runTest(const smart_refctd_ptr<const ICPUShader>& source, const uint32_t elementCount, const uint8_t subgroupSizeLog2, const uint32_t workgroupSize, uint32_t itemsPerWG = ~0u)
+	bool runTest(const smart_refctd_ptr<const ICPUShader>& source, const uint32_t elementCount, const uint8_t subgroupSizeLog2, const uint32_t workgroupSize, uint32_t itemsPerWG = ~0u, uint32_t itemsPerInvoc = 1u)
 	{
 		std::string arith_name = Arithmetic<bit_xor<float>>::name;
 
 		smart_refctd_ptr<ICPUShader> overridenUnspecialized;
-		if constexpr (WorkgroupTest)
-		{
-			overridenUnspecialized = CHLSLCompiler::createOverridenCopy(
-				source.get(), "#define OPERATION %s\n#define WORKGROUP_SIZE %d\n#define ITEMS_PER_WG %d\n",
-				(("workgroup::") + arith_name).c_str(), workgroupSize, itemsPerWG
-			);
-		}
-		else
-		{
+		//if constexpr (WorkgroupTest)
+		//{
+		//	overridenUnspecialized = CHLSLCompiler::createOverridenCopy(
+		//		source.get(), "#define OPERATION %s\n#define WORKGROUP_SIZE %d\n#define ITEMS_PER_WG %d\n",
+		//		(("workgroup::") + arith_name).c_str(), workgroupSize, itemsPerWG
+		//	);
+		//}
+		//else
+		//{
 			itemsPerWG = workgroupSize;
 			overridenUnspecialized = CHLSLCompiler::createOverridenCopy(
-				source.get(), "#define OPERATION %s\n#define WORKGROUP_SIZE %d\n",
-				(("subgroup::") + arith_name).c_str(), workgroupSize
+				source.get(), "#define OPERATION %s\n#define WORKGROUP_SIZE %d\n#define ITEMS_PER_INVOCATION %d\n#define SUBGROUP_SIZE_LOG2 %d\n",
+				(("subgroup2::") + arith_name).c_str(), workgroupSize, itemsPerInvoc, subgroupSizeLog2
 			);
-		}
+		//}
 		auto pipeline = createPipeline(overridenUnspecialized.get(),subgroupSizeLog2);
 
 		// TODO: overlap dispatches with memory readbacks (requires multiple copies of `buffers`)
-		const uint32_t workgroupCount = elementCount / itemsPerWG;
+		const uint32_t workgroupCount = elementCount / (itemsPerWG * itemsPerInvoc);
 		cmdbuf->begin(IGPUCommandBuffer::USAGE::NONE);
 		cmdbuf->bindComputePipeline(pipeline.get());
 		cmdbuf->bindDescriptorSets(EPBP_COMPUTE, pipeline->getLayout(), 0u, 1u, &descriptorSet.get());

From dd021a05605fa48ac5962db22d0a591c0ff7691d Mon Sep 17 00:00:00 2001
From: keptsecret <sorchon@gmail.com>
Date: Mon, 31 Mar 2025 16:39:41 +0700
Subject: [PATCH 118/529] fix test

---
 71_ArithmeticBench/main.cpp | 95 +++++++++++++++++++------------------
 1 file changed, 50 insertions(+), 45 deletions(-)

diff --git a/71_ArithmeticBench/main.cpp b/71_ArithmeticBench/main.cpp
index 00cfbcf35..c03700e2a 100644
--- a/71_ArithmeticBench/main.cpp
+++ b/71_ArithmeticBench/main.cpp
@@ -203,7 +203,7 @@ class ArithmeticBenchApp final : public application_templates::BasicMultiQueueAp
 		
 		// TODO variable items per invocation?
 		const uint32_t ItemsPerInvocation = 4u;
-		const std::array<uint32_t, 5> workgroupSizes = { 64, 128, 256, 512, 1024 };
+		const std::array<uint32_t, 3> workgroupSizes = { 256, 512, 1024 };
 		// const auto MaxWorkgroupSize = m_physicalDevice->getLimits().maxComputeWorkGroupInvocations;
 		const auto MinSubgroupSize = m_physicalDevice->getLimits().minSubgroupSize;
 		const auto MaxSubgroupSize = m_physicalDevice->getLimits().maxSubgroupSize;
@@ -218,11 +218,11 @@ class ArithmeticBenchApp final : public application_templates::BasicMultiQueueAp
 
 				bool passed = true;
 				// TODO async the testing
-				passed = runTest<emulatedReduction, false>(subgroupTestSource, elementCount, subgroupSizeLog2, workgroupSize) && passed;
+				passed = runTest<emulatedReduction, false>(subgroupTestSource, elementCount, subgroupSizeLog2, workgroupSize, ~0u, ItemsPerInvocation) && passed;
 				logTestOutcome(passed, workgroupSize);
-				passed = runTest<emulatedScanInclusive, false>(subgroupTestSource, elementCount, subgroupSizeLog2, workgroupSize) && passed;
+				passed = runTest<emulatedScanInclusive, false>(subgroupTestSource, elementCount, subgroupSizeLog2, workgroupSize, ~0u, ItemsPerInvocation) && passed;
 				logTestOutcome(passed, workgroupSize);
-				passed = runTest<emulatedScanExclusive, false>(subgroupTestSource, elementCount, subgroupSizeLog2, workgroupSize) && passed;
+				passed = runTest<emulatedScanExclusive, false>(subgroupTestSource, elementCount, subgroupSizeLog2, workgroupSize, ~0u, ItemsPerInvocation) && passed;
 				logTestOutcome(passed, workgroupSize);
 				//for (uint32_t itemsPerWG = workgroupSize; itemsPerWG > workgroupSize - subgroupSize; itemsPerWG--)
 				//{
@@ -362,22 +362,22 @@ class ArithmeticBenchApp final : public application_templates::BasicMultiQueueAp
 		m_device->blockForSemaphores(wait);
 
 		// check results
-		bool passed = validateResults<Arithmetic, bit_and<uint32_t>, WorkgroupTest>(itemsPerWG, workgroupCount);
-		passed = validateResults<Arithmetic, bit_xor<uint32_t>, WorkgroupTest>(itemsPerWG, workgroupCount) && passed;
-		passed = validateResults<Arithmetic, bit_or<uint32_t>, WorkgroupTest>(itemsPerWG, workgroupCount) && passed;
-		passed = validateResults<Arithmetic, plus<uint32_t>, WorkgroupTest>(itemsPerWG, workgroupCount) && passed;
-		passed = validateResults<Arithmetic, multiplies<uint32_t>, WorkgroupTest>(itemsPerWG, workgroupCount) && passed;
-		passed = validateResults<Arithmetic, minimum<uint32_t>, WorkgroupTest>(itemsPerWG, workgroupCount) && passed;
-		passed = validateResults<Arithmetic, maximum<uint32_t>, WorkgroupTest>(itemsPerWG, workgroupCount) && passed;
-		if constexpr (WorkgroupTest)
-			passed = validateResults<Arithmetic, ballot<uint32_t>, WorkgroupTest>(itemsPerWG, workgroupCount) && passed;
+		bool passed = validateResults<Arithmetic, bit_and<uint32_t>, WorkgroupTest>(itemsPerWG, workgroupCount, itemsPerInvoc);
+		passed = validateResults<Arithmetic, bit_xor<uint32_t>, WorkgroupTest>(itemsPerWG, workgroupCount, itemsPerInvoc) && passed;
+		passed = validateResults<Arithmetic, bit_or<uint32_t>, WorkgroupTest>(itemsPerWG, workgroupCount, itemsPerInvoc) && passed;
+		passed = validateResults<Arithmetic, plus<uint32_t>, WorkgroupTest>(itemsPerWG, workgroupCount, itemsPerInvoc) && passed;
+		passed = validateResults<Arithmetic, multiplies<uint32_t>, WorkgroupTest>(itemsPerWG, workgroupCount, itemsPerInvoc) && passed;
+		passed = validateResults<Arithmetic, minimum<uint32_t>, WorkgroupTest>(itemsPerWG, workgroupCount, itemsPerInvoc) && passed;
+		passed = validateResults<Arithmetic, maximum<uint32_t>, WorkgroupTest>(itemsPerWG, workgroupCount, itemsPerInvoc) && passed;
+		//if constexpr (WorkgroupTest)
+		//	passed = validateResults<Arithmetic, ballot<uint32_t>, WorkgroupTest>(itemsPerWG, workgroupCount) && passed;
 
 		return passed;
 	}
 
 	//returns true if result matches
 	template<template<class> class Arithmetic, class Binop, bool WorkgroupTest>
-	bool validateResults(const uint32_t itemsPerWG, const uint32_t workgroupCount)
+	bool validateResults(const uint32_t itemsPerWG, const uint32_t workgroupCount, uint32_t itemsPerInvoc = 1u)
 	{
 		bool success = true;
 
@@ -397,47 +397,52 @@ class ArithmeticBenchApp final : public application_templates::BasicMultiQueueAp
 		const auto testData = reinterpret_cast<const type_t*>(dataFromBuffer + 1);
 		// TODO: parallel for (the temporary values need to be threadlocal or what?)
 		// now check if the data obtained has valid values
-		type_t* tmp = new type_t[itemsPerWG];
-		type_t* ballotInput = new type_t[itemsPerWG];
+		type_t* tmp = new type_t[itemsPerWG * itemsPerInvoc];
+		//type_t* ballotInput = new type_t[itemsPerWG];
 		for (uint32_t workgroupID = 0u; success && workgroupID < workgroupCount; workgroupID++)
 		{
-			const auto workgroupOffset = workgroupID * itemsPerWG;
-
-			if constexpr (WorkgroupTest)
-			{
-				if constexpr (std::is_same_v<ballot<type_t>, Binop>)
-				{
-					for (auto i = 0u; i < itemsPerWG; i++)
-						ballotInput[i] = inputData[i + workgroupOffset] & 0x1u;
-					Arithmetic<Binop>::impl(tmp, ballotInput, itemsPerWG);
-				}
-				else
-					Arithmetic<Binop>::impl(tmp, inputData + workgroupOffset, itemsPerWG);
-			}
-			else
-			{
+			const auto workgroupOffset = workgroupID * itemsPerWG * itemsPerInvoc;
+
+			//if constexpr (WorkgroupTest)
+			//{
+			//	if constexpr (std::is_same_v<ballot<type_t>, Binop>)
+			//	{
+			//		for (auto i = 0u; i < itemsPerWG; i++)
+			//			ballotInput[i] = inputData[i + workgroupOffset] & 0x1u;
+			//		Arithmetic<Binop>::impl(tmp, ballotInput, itemsPerWG);
+			//	}
+			//	else
+			//		Arithmetic<Binop>::impl(tmp, inputData + workgroupOffset, itemsPerWG);
+			//}
+			//else
+			//{
 				for (uint32_t pseudoSubgroupID = 0u; pseudoSubgroupID < itemsPerWG; pseudoSubgroupID += subgroupSize)
-					Arithmetic<Binop>::impl(tmp + pseudoSubgroupID, inputData + workgroupOffset + pseudoSubgroupID, subgroupSize);
-			}
+					Arithmetic<Binop>::impl(tmp + pseudoSubgroupID * itemsPerInvoc, inputData + workgroupOffset + pseudoSubgroupID * itemsPerInvoc, subgroupSize * itemsPerInvoc);
+			//}
 
 			for (uint32_t localInvocationIndex = 0u; localInvocationIndex < itemsPerWG; localInvocationIndex++)
 			{
-				const auto globalInvocationIndex = workgroupOffset + localInvocationIndex;
-				const auto cpuVal = tmp[localInvocationIndex];
-				const auto gpuVal = testData[globalInvocationIndex];
-				if (cpuVal != gpuVal)
+				const auto localOffset = localInvocationIndex * itemsPerInvoc;
+				const auto globalInvocationIndex = workgroupOffset + localOffset;
+
+				for (uint32_t itemInvocationIndex = 0u; itemInvocationIndex < itemsPerInvoc; itemInvocationIndex++)
 				{
-					m_logger->log(
-						"Failed test #%d  (%s)  (%s) Expected %u got %u for workgroup %d and localinvoc %d",
-						ILogger::ELL_ERROR, itemsPerWG, WorkgroupTest ? "workgroup" : "subgroup", Binop::name,
-						cpuVal, gpuVal, workgroupID, localInvocationIndex
-					);
-					success = false;
-					break;
+					const auto cpuVal = tmp[localOffset + itemInvocationIndex];
+					const auto gpuVal = testData[globalInvocationIndex + itemInvocationIndex];
+					if (cpuVal != gpuVal)
+					{
+						m_logger->log(
+							"Failed test #%d  (%s)  (%s) Expected %u got %u for workgroup %d and localinvoc %d and iteminvoc %d",
+							ILogger::ELL_ERROR, itemsPerWG, WorkgroupTest ? "workgroup" : "subgroup", Binop::name,
+							cpuVal, gpuVal, workgroupID, localInvocationIndex, itemInvocationIndex
+						);
+						success = false;
+						break;
+					}
 				}
 			}
 		}
-		delete[] ballotInput;
+		//delete[] ballotInput;
 		delete[] tmp;
 
 		return success;

From 6766420f6cc2c09d6eafcb8d519235846cb66fe5 Mon Sep 17 00:00:00 2001
From: devsh <devsh@devsh.eu>
Date: Mon, 31 Mar 2025 12:25:20 +0200
Subject: [PATCH 119/529] make RT pipeline example work when
 `NBL_EMED_RESOURCES=OFF`

---
 71_RayTracingPipeline/main.cpp | 1 -
 1 file changed, 1 deletion(-)

diff --git a/71_RayTracingPipeline/main.cpp b/71_RayTracingPipeline/main.cpp
index 1e4619b46..e4d53008e 100644
--- a/71_RayTracingPipeline/main.cpp
+++ b/71_RayTracingPipeline/main.cpp
@@ -3,7 +3,6 @@
 // For conditions of distribution and use, see copyright notice in nabla.h
 
 #include "common.hpp"
-#include "nbl/builtin/builtinResources.h"
 #include "nbl/ext/FullScreenTriangle/FullScreenTriangle.h"
 #include "nbl/builtin/hlsl/indirect_commands.hlsl"
 

From cf63282ffadcfdf083e3813b86196e4abb85c4e8 Mon Sep 17 00:00:00 2001
From: Erfan Ahmadi <ahmadierfan99@gmail.com>
Date: Mon, 31 Mar 2025 14:39:55 +0330
Subject: [PATCH 120/529] [WIP] save work

---
 62_CAD/DrawResourcesFiller.cpp | 88 ++++++++++++++++----------------
 62_CAD/DrawResourcesFiller.h   | 56 ++++++++++++---------
 62_CAD/main.cpp                | 91 ++++++++++------------------------
 3 files changed, 104 insertions(+), 131 deletions(-)

diff --git a/62_CAD/DrawResourcesFiller.cpp b/62_CAD/DrawResourcesFiller.cpp
index 8c1a42719..710df3cb9 100644
--- a/62_CAD/DrawResourcesFiller.cpp
+++ b/62_CAD/DrawResourcesFiller.cpp
@@ -45,20 +45,16 @@ void DrawResourcesFiller::setSubmitDrawsFunction(const SubmitFunc& func)
 
 void DrawResourcesFiller::allocateDrawResourcesBuffer(ILogicalDevice* logicalDevice, size_t size)
 {
-	maxGeometryBufferSize = size;
-
+	size = core::alignUp(size, BDALoadAlignment);
 	IGPUBuffer::SCreationParams geometryCreationParams = {};
 	geometryCreationParams.size = size;
-	geometryCreationParams.usage = bitflag(IGPUBuffer::EUF_STORAGE_BUFFER_BIT) | IGPUBuffer::EUF_SHADER_DEVICE_ADDRESS_BIT | IGPUBuffer::EUF_TRANSFER_DST_BIT | IGPUBuffer::EUF_INDEX_BUFFER_BIT; // INDEX_BUFFER USAGE for DTMs
-	gpuDrawBuffers.geometryBuffer = logicalDevice->createBuffer(std::move(geometryCreationParams));
-	gpuDrawBuffers.geometryBuffer->setObjectDebugName("geometryBuffer");
+	geometryCreationParams.usage = bitflag(IGPUBuffer::EUF_SHADER_DEVICE_ADDRESS_BIT) | IGPUBuffer::EUF_TRANSFER_DST_BIT | IGPUBuffer::EUF_INDEX_BUFFER_BIT;
+	resourcesGPUBuffer = logicalDevice->createBuffer(std::move(geometryCreationParams));
+	resourcesGPUBuffer->setObjectDebugName("drawResourcesBuffer");
 
-	IDeviceMemoryBacked::SDeviceMemoryRequirements memReq = gpuDrawBuffers.geometryBuffer->getMemoryReqs();
+	IDeviceMemoryBacked::SDeviceMemoryRequirements memReq = resourcesGPUBuffer->getMemoryReqs();
 	memReq.memoryTypeBits &= logicalDevice->getPhysicalDevice()->getDeviceLocalMemoryTypeBits();
-	auto geometryBufferMem = logicalDevice->allocate(memReq, gpuDrawBuffers.geometryBuffer.get(), IDeviceMemoryAllocation::EMAF_DEVICE_ADDRESS_BIT);
-	drawResourcesBDA = gpuDrawBuffers.geometryBuffer->getDeviceAddress();
-
-	cpuDrawBuffers.geometryBuffer = ICPUBuffer::create({ size });
+	auto mem = logicalDevice->allocate(memReq, resourcesGPUBuffer.get(), IDeviceMemoryAllocation::EMAF_DEVICE_ADDRESS_BIT);
 }
 
 void DrawResourcesFiller::allocateMSDFTextures(ILogicalDevice* logicalDevice, uint32_t maxMSDFs, uint32_t2 msdfsExtent)
@@ -170,31 +166,37 @@ void DrawResourcesFiller::drawTriangleMesh(const CTriangleMesh& mesh, CTriangleM
 	// concatenate the index and vertex buffer into the geometry buffer
 	const size_t indexBuffByteSize = mesh.getIndexBuffByteSize();
 	const size_t vtxBuffByteSize = mesh.getVertexBuffByteSize();
-	const size_t geometryBufferDataToAddByteSize = indexBuffByteSize + vtxBuffByteSize;
+	const size_t dataToAddByteSize = vtxBuffByteSize + indexBuffByteSize;
 
 	// copy into gemoetry cpu buffer insteaed
 
+	const size_t totalResourcesConsumption = resourcesCollection.calculateTotalConsumption();
+
 	// TODO: rename, its not just points
-	const uint32_t remainingGeometryBufferSize = static_cast<uint32_t>(maxGeometryBufferSize - currentGeometryBufferSize);
+	const uint32_t remainingResourcesSize = static_cast<uint32_t>(resourcesGPUBuffer->getSize() - totalResourcesConsumption);
 
-	// TODO: assert of geometry buffer size, do i need to check if size of objects to be added <= remainingGeometryBufferSize?
+	// TODO: assert of geometry buffer size, do i need to check if size of objects to be added <= remainingResourcesSize?
 	// TODO: auto submit instead of assert
-	assert(geometryBufferDataToAddByteSize <= remainingGeometryBufferSize);
+	assert(dataToAddByteSize <= remainingResourcesSize);
 
-	// TODO: vertices need to be aligned to 8?
-	uint64_t vtxBufferAddress;
 	{
-		void* dst = reinterpret_cast<char*>(cpuDrawBuffers.geometryBuffer->getPointer()) + currentGeometryBufferSize;
-		void* dst1 = dst;
+		// NOTE[ERFAN]: these push contants will be removed, everything will be accessed by dtmSettings, including where the vertex buffer data resides
+		auto& geometryBytesVector = resourcesCollection.geometryInfo.vector;
+		size_t geometryBufferOffset = core::alignUp(geometryBytesVector.size(), BDALoadAlignment);
+		geometryBytesVector.resize(geometryBufferOffset + dataToAddByteSize);
+
+		// Copy VertexBuffer
+		void* dst = geometryBytesVector.data() + geometryBufferOffset;
+		// the actual bda address will be determined only after all copies are finalized, later we will do += `baseBDAAddress + geometryInfo.bufferOffset`
+		drawData.pushConstants.triangleMeshVerticesBaseAddress = geometryBufferOffset;
+		memcpy(dst, mesh.getVertices().data(), vtxBuffByteSize);
+		geometryBufferOffset += vtxBuffByteSize; 
 
-		drawData.indexBufferOffset = currentGeometryBufferSize;
+		// Copy IndexBuffer
+		dst = geometryBytesVector.data() + geometryBufferOffset;
+		drawData.indexBufferOffset = geometryBufferOffset;
 		memcpy(dst, mesh.getIndices().data(), indexBuffByteSize);
-		currentGeometryBufferSize += indexBuffByteSize;
-
-		dst = reinterpret_cast<char*>(cpuDrawBuffers.geometryBuffer->getPointer()) + currentGeometryBufferSize;
-		drawData.pushConstants.triangleMeshVerticesBaseAddress = drawResourcesBDA + currentGeometryBufferSize;
-		memcpy(dst, mesh.getVertices().data(), vtxBuffByteSize);
-		currentGeometryBufferSize += vtxBuffByteSize;
+		geometryBufferOffset += indexBuffByteSize;
 	}
 
 	drawData.indexCount = mesh.getIndexCount();
@@ -433,16 +435,16 @@ void DrawResourcesFiller::popClipProjectionData()
 
 bool DrawResourcesFiller::finalizeBufferCopies(SIntendedSubmitInfo& intendedNextSubmit)
 {
-	size_t offset = 0ull;
+	copiedResourcesSize = 0ull;
 
-	assert(drawBuffers.calculateTotalConsumption() <= drawResourcesGPUBuffer->getSize());
+	assert(resourcesCollection.calculateTotalConsumption() <= resourcesGPUBuffer->getSize());
 
 	auto copyCPUFilledDrawBuffer = [&](auto& drawBuffer) -> bool
 		{
-			// drawBuffer must be of type CPUFilledDrawBuffer<T>
-			SBufferRange<IGPUBuffer> copyRange = { offset, drawBuffer.getStorageSize(), drawResourcesGPUBuffer};
+			// drawBuffer must be of type CPUGeneratedResource<T>
+			SBufferRange<IGPUBuffer> copyRange = { copiedResourcesSize, drawBuffer.getStorageSize(), resourcesGPUBuffer};
 
-			if (copyRange.offset + copyRange.size > drawResourcesGPUBuffer->getSize())
+			if (copyRange.offset + copyRange.size > resourcesGPUBuffer->getSize())
 			{
 				// TODO: LOG ERROR, this shouldn't happen with correct auto-submission mechanism
 				assert(false);
@@ -454,17 +456,17 @@ bool DrawResourcesFiller::finalizeBufferCopies(SIntendedSubmitInfo& intendedNext
 				drawBuffer.bufferOffset = copyRange.offset;
 				if (!m_utilities->updateBufferRangeViaStagingBuffer(intendedNextSubmit, copyRange, drawBuffer.vector.data()))
 					return false;
-				offset += drawBuffer.getAlignedStorageSize();
+				copiedResourcesSize += drawBuffer.getAlignedStorageSize();
 			}
 			return true;
 		};
 	
 	auto addComputeReservedFilledDrawBuffer = [&](auto& drawBuffer) -> bool
 		{
-			// drawBuffer must be of type ComputeReservedDrawBuffer<T>
-			SBufferRange<IGPUBuffer> copyRange = { offset, drawBuffer.getStorageSize(), drawResourcesGPUBuffer};
+			// drawBuffer must be of type ReservedComputeResource<T>
+			SBufferRange<IGPUBuffer> copyRange = { copiedResourcesSize, drawBuffer.getStorageSize(), resourcesGPUBuffer};
 
-			if (copyRange.offset + copyRange.size > drawResourcesGPUBuffer->getSize())
+			if (copyRange.offset + copyRange.size > resourcesGPUBuffer->getSize())
 			{
 				// TODO: LOG ERROR, this shouldn't happen with correct auto-submission mechanism
 				assert(false);
@@ -472,17 +474,17 @@ bool DrawResourcesFiller::finalizeBufferCopies(SIntendedSubmitInfo& intendedNext
 			}
 
 			drawBuffer.bufferOffset = copyRange.offset;
-			offset += drawBuffer.getAlignedStorageSize();
+			copiedResourcesSize += drawBuffer.getAlignedStorageSize();
 		};
 
-	copyCPUFilledDrawBuffer(drawBuffers.lineStyles);
-	copyCPUFilledDrawBuffer(drawBuffers.dtmSettings);
-	copyCPUFilledDrawBuffer(drawBuffers.clipProjections);
-	copyCPUFilledDrawBuffer(drawBuffers.mainObjects);
-	copyCPUFilledDrawBuffer(drawBuffers.drawObjects);
-	copyCPUFilledDrawBuffer(drawBuffers.indexBuffer);
-	copyCPUFilledDrawBuffer(drawBuffers.geometryInfo);
-
+	copyCPUFilledDrawBuffer(resourcesCollection.lineStyles);
+	copyCPUFilledDrawBuffer(resourcesCollection.dtmSettings);
+	copyCPUFilledDrawBuffer(resourcesCollection.clipProjections);
+	copyCPUFilledDrawBuffer(resourcesCollection.mainObjects);
+	copyCPUFilledDrawBuffer(resourcesCollection.drawObjects);
+	copyCPUFilledDrawBuffer(resourcesCollection.indexBuffer);
+	copyCPUFilledDrawBuffer(resourcesCollection.geometryInfo);
+	
 	return true;
 }
 
diff --git a/62_CAD/DrawResourcesFiller.h b/62_CAD/DrawResourcesFiller.h
index bc456f806..47c4ba146 100644
--- a/62_CAD/DrawResourcesFiller.h
+++ b/62_CAD/DrawResourcesFiller.h
@@ -28,53 +28,54 @@ struct DrawResourcesFiller
 {
 public:
 	
-	/// @brief general parent struct for 1.ComputeReserved and 2.CPUFilled DrawBuffers
-	struct DrawBuffer
+	static constexpr size_t BDALoadAlignment = 8u;
+
+	/// @brief general parent struct for 1.ReservedCompute and 2.CPUGenerated Resources
+	struct ResourceBase
 	{
-		static constexpr size_t Alignment = 8u;
 		static constexpr size_t InvalidBufferOffset = ~0u;
 		size_t bufferOffset = InvalidBufferOffset; // set when copy to gpu buffer is issued
 		virtual size_t getCount() const = 0;
 		virtual size_t getStorageSize() const = 0;
-		virtual size_t getAlignedStorageSize() const { core::alignUp(getStorageSize(), Alignment); }
+		virtual size_t getAlignedStorageSize() const { core::alignUp(getStorageSize(), BDALoadAlignment); }
 	};
 
-	/// @brief DrawBuffer reserved for compute shader stages input/output
+	/// @brief ResourceBase reserved for compute shader stages input/output
 	template <typename T>
-	struct ComputeReservedDrawBuffer : DrawBuffer
+	struct ReservedComputeResource : ResourceBase
 	{
 		size_t count = 0ull;
 		size_t getCount() const override { return count; }
 		size_t getStorageSize() const override  { return count * sizeof(T); }
 	};
 
-	/// @brief DrawBuffer which is filled by CPU, packed and sent to GPU
+	/// @brief ResourceBase which is filled by CPU, packed and sent to GPU
 	template <typename T>
-	struct CPUFilledDrawBuffer : DrawBuffer
+	struct CPUGeneratedResource : ResourceBase
 	{
 		core::vector<T> vector;
 		size_t getCount() const { return vector.size(); }
 		size_t getStorageSize() const { return vector.size() * sizeof(T); }
 	};
 
-	/// @brief struct to hold all draw buffers
-	struct DrawBuffers
+	/// @brief struct to hold all resources
+	struct ResourcesCollection
 	{
 		// auto-submission level 0 buffers (settings that mainObj references)
-		CPUFilledDrawBuffer<LineStyle> lineStyles;
-		CPUFilledDrawBuffer<DTMSettings> dtmSettings;
-		CPUFilledDrawBuffer<ClipProjectionData> clipProjections;
+		CPUGeneratedResource<LineStyle> lineStyles;
+		CPUGeneratedResource<DTMSettings> dtmSettings;
+		CPUGeneratedResource<ClipProjectionData> clipProjections;
 	
 		// auto-submission level 1 buffers (mainObj that drawObjs references, if all drawObjs+idxBuffer+geometryInfo doesn't fit into mem this will be broken down into many)
-		CPUFilledDrawBuffer<MainObject> mainObjects;
+		CPUGeneratedResource<MainObject> mainObjects;
 
 		// auto-submission level 2 buffers
-		CPUFilledDrawBuffer<DrawObject> drawObjects;
-		CPUFilledDrawBuffer<uint32_t> indexBuffer;
-		CPUFilledDrawBuffer<uint8_t> geometryInfo; // general purpose byte buffer for custom geometries, etc
+		CPUGeneratedResource<DrawObject> drawObjects;
+		CPUGeneratedResource<uint32_t> indexBuffer;
+		CPUGeneratedResource<uint8_t> geometryInfo; // general purpose byte buffer for custom geometries, etc
 
-		// Get Total memory consumption, If all DrawBuffers get packed together with DrawBuffer::Alignment
-		// Useful to know when to know when to overflow
+		// Get Total memory consumption, If all ResourcesCollection get packed together with BDALoadAlignment
+		// used to decide when to overflow
 		size_t calculateTotalConsumption() const
 		{
 			return
@@ -166,8 +167,14 @@ struct DrawResourcesFiller
 		resetDTMSettingsCounters();
 	}
 
-	DrawBuffers drawBuffers; // will be compacted and copied into gpu draw resources
-	nbl::core::smart_refctd_ptr<IGPUBuffer> drawResourcesGPUBuffer;
+	/// @brief collection of all the resources that will eventually be reserved or copied to in the resourcesGPUBuffer, will be accessed via individual BDA pointers in shaders
+	const ResourcesCollection& getResourcesCollection() const { return &resourcesCollection; }
+
+	/// @brief buffer containing all non-texture type resources
+	nbl::core::smart_refctd_ptr<IGPUBuffer> getResourcesGPUBuffer() const { return resourcesGPUBuffer; }
+
+	/// @return how far resourcesGPUBuffer was copied to by `finalizeAllCopiesToGPU` in `resourcesCollection` 
+	const size_t getCopiedResourcesSize() { return copiedResourcesSize; }
 
 	uint32_t addLineStyle_SubmitIfNeeded(const LineStyleInfo& lineStyle, SIntendedSubmitInfo& intendedNextSubmit);
 
@@ -384,12 +391,15 @@ struct DrawResourcesFiller
 	// If you haven't created a mainObject yet, then pass InvalidMainObjectIdx
 	uint32_t addMSDFTexture(const MSDFInputInfo& msdfInput, core::smart_refctd_ptr<ICPUImage>&& cpuImage, uint32_t mainObjIdx, SIntendedSubmitInfo& intendedNextSubmit);
 	
+	// ResourcesCollection and packed into GPUBuffer
+	ResourcesCollection resourcesCollection;
+	nbl::core::smart_refctd_ptr<IGPUBuffer> resourcesGPUBuffer;
+	size_t copiedResourcesSize;
+
 	// Members
 	smart_refctd_ptr<IUtilities> m_utilities;
 	IQueue* m_copyQueue;
 
-	uint64_t drawResourcesBDA = 0u; // Actual BDA offset 0 of the gpu buffer
-
 	std::deque<ClipProjectionData> clipProjections; // stack of clip projectios stored so we can resubmit them if geometry buffer got reset.
 	std::deque<uint64_t> clipProjectionAddresses; // stack of clip projection gpu addresses in geometry buffer. to keep track of them in push/pops
 
diff --git a/62_CAD/main.cpp b/62_CAD/main.cpp
index 7dd60ca47..020b7cf6c 100644
--- a/62_CAD/main.cpp
+++ b/62_CAD/main.cpp
@@ -1188,6 +1188,16 @@ class ComputerAidedDesign final : public examples::SimpleWindowedApplication, pu
 		projectionToNDC = m_Camera.constructViewProjection();
 		
 		Globals globalData = {};
+		uint64_t baseAddress = drawResourcesFiller.getResourcesGPUBuffer()->getDeviceAddress();
+		const auto& resources = drawResourcesFiller.getResourcesCollection();
+		globalData.pointers = {
+			.lineStyles				= baseAddress + resources.lineStyles.bufferOffset,
+			.dtmSettings			= baseAddress + resources.dtmSettings.bufferOffset,
+			.customClipProjections	= baseAddress + resources.customClipProjections.bufferOffset,
+			.mainObjects			= baseAddress + resources.mainObjects.bufferOffset,
+			.drawObjects			= baseAddress + resources.drawObjects.bufferOffset,
+			.geometryBuffer			= baseAddress + resources.geometryBuffer.bufferOffset,
+		};
 		globalData.antiAliasingFactor = 1.0;// +abs(cos(m_timeElapsed * 0.0008)) * 20.0f;
 		globalData.resolution = uint32_t2{ m_window->getWidth(), m_window->getHeight() };
 		globalData.defaultClipProjection.projectionToNDC = projectionToNDC;
@@ -1254,25 +1264,12 @@ class ComputerAidedDesign final : public examples::SimpleWindowedApplication, pu
 
 		// pipelineBarriersBeforeDraw
 		{	
-			constexpr uint32_t MaxBufferBarriersCount = 6u;
+			constexpr uint32_t MaxBufferBarriersCount = 2u;
 			uint32_t bufferBarriersCount = 0u;
 			IGPUCommandBuffer::SPipelineBarrierDependencyInfo::buffer_barrier_t bufferBarriers[MaxBufferBarriersCount];
+			
+			const auto& resources = drawResourcesFiller.getResourcesCollection();
 
-			// Index Buffer Copy Barrier -> Only do once at the beginning of the frames
-			if (m_realFrameIx == 0u)
-			{
-				auto& bufferBarrier = bufferBarriers[bufferBarriersCount++];
-				bufferBarrier.barrier.dep.srcStageMask = PIPELINE_STAGE_FLAGS::COPY_BIT;
-				bufferBarrier.barrier.dep.srcAccessMask = ACCESS_FLAGS::TRANSFER_WRITE_BIT;
-				bufferBarrier.barrier.dep.dstStageMask = PIPELINE_STAGE_FLAGS::VERTEX_INPUT_BITS;
-				bufferBarrier.barrier.dep.dstAccessMask = ACCESS_FLAGS::INDEX_READ_BIT;
-				bufferBarrier.range =
-				{
-					.offset = 0u,
-					.size = drawResourcesFiller.gpuDrawBuffers.indexBuffer->getSize(),
-					.buffer = drawResourcesFiller.gpuDrawBuffers.indexBuffer,
-				};
-			}
 			if (m_globalsBuffer->getSize() > 0u)
 			{
 				auto& bufferBarrier = bufferBarriers[bufferBarriersCount++];
@@ -1287,35 +1284,7 @@ class ComputerAidedDesign final : public examples::SimpleWindowedApplication, pu
 					.buffer = m_globalsBuffer,
 				};
 			}
-			if (drawResourcesFiller.getCurrentDrawObjectsBufferSize() > 0u)
-			{
-				auto& bufferBarrier = bufferBarriers[bufferBarriersCount++];
-				bufferBarrier.barrier.dep.srcStageMask = PIPELINE_STAGE_FLAGS::COPY_BIT;
-				bufferBarrier.barrier.dep.srcAccessMask = ACCESS_FLAGS::TRANSFER_WRITE_BIT;
-				bufferBarrier.barrier.dep.dstStageMask = PIPELINE_STAGE_FLAGS::VERTEX_SHADER_BIT;
-				bufferBarrier.barrier.dep.dstAccessMask = ACCESS_FLAGS::SHADER_READ_BITS;
-				bufferBarrier.range =
-				{
-					.offset = 0u,
-					.size = drawResourcesFiller.getCurrentDrawObjectsBufferSize(),
-					.buffer = drawResourcesFiller.gpuDrawBuffers.drawObjectsBuffer,
-				};
-			}
-			if (drawResourcesFiller.getCurrentGeometryBufferSize() > 0u)
-			{
-				auto& bufferBarrier = bufferBarriers[bufferBarriersCount++];
-				bufferBarrier.barrier.dep.srcStageMask = PIPELINE_STAGE_FLAGS::COPY_BIT;
-				bufferBarrier.barrier.dep.srcAccessMask = ACCESS_FLAGS::TRANSFER_WRITE_BIT;
-				bufferBarrier.barrier.dep.dstStageMask = PIPELINE_STAGE_FLAGS::VERTEX_SHADER_BIT;
-				bufferBarrier.barrier.dep.dstAccessMask = ACCESS_FLAGS::SHADER_READ_BITS;
-				bufferBarrier.range =
-				{
-					.offset = 0u,
-					.size = drawResourcesFiller.getCurrentGeometryBufferSize(),
-					.buffer = drawResourcesFiller.gpuDrawBuffers.geometryBuffer,
-				};
-			}
-			if (drawResourcesFiller.getCurrentMainObjectsBufferSize() > 0u)
+			if (drawResourcesFiller.getCopiedResourcesSize() > 0u)
 			{
 				auto& bufferBarrier = bufferBarriers[bufferBarriersCount++];
 				bufferBarrier.barrier.dep.srcStageMask = PIPELINE_STAGE_FLAGS::COPY_BIT;
@@ -1325,22 +1294,8 @@ class ComputerAidedDesign final : public examples::SimpleWindowedApplication, pu
 				bufferBarrier.range =
 				{
 					.offset = 0u,
-					.size = drawResourcesFiller.getCurrentMainObjectsBufferSize(),
-					.buffer = drawResourcesFiller.gpuDrawBuffers.mainObjectsBuffer,
-				};
-			}
-			if (drawResourcesFiller.getCurrentLineStylesBufferSize() > 0u)
-			{
-				auto& bufferBarrier = bufferBarriers[bufferBarriersCount++];
-				bufferBarrier.barrier.dep.srcStageMask = PIPELINE_STAGE_FLAGS::COPY_BIT;
-				bufferBarrier.barrier.dep.srcAccessMask = ACCESS_FLAGS::TRANSFER_WRITE_BIT;
-				bufferBarrier.barrier.dep.dstStageMask = PIPELINE_STAGE_FLAGS::VERTEX_SHADER_BIT | PIPELINE_STAGE_FLAGS::FRAGMENT_SHADER_BIT;
-				bufferBarrier.barrier.dep.dstAccessMask = ACCESS_FLAGS::SHADER_READ_BITS;
-				bufferBarrier.range =
-				{
-					.offset = 0u,
-					.size = drawResourcesFiller.getCurrentLineStylesBufferSize(),
-					.buffer = drawResourcesFiller.gpuDrawBuffers.lineStylesBuffer,
+					.size = drawResourcesFiller.getCopiedResourcesSize(),
+					.buffer = drawResourcesFiller.getResourcesGPUBuffer(),
 				};
 			}
 			cb->pipelineBarrier(E_DEPENDENCY_FLAGS::EDF_NONE, { .bufBarriers = {bufferBarriers, bufferBarriersCount}, .imgBarriers = {} });
@@ -1365,22 +1320,27 @@ class ComputerAidedDesign final : public examples::SimpleWindowedApplication, pu
 			};
 		}
 		cb->beginRenderPass(beginInfo, IGPUCommandBuffer::SUBPASS_CONTENTS::INLINE);
+		
+		const auto& resources = drawResourcesFiller.getResourcesCollection();
+		const auto& resourcesGPUBuffer = drawResourcesFiller.getResourcesGPUBuffer();
+
+		const uint32_t currentIndexCount = resources.drawObjects.getCount() * 6u;
 
-		const uint32_t currentIndexCount = drawResourcesFiller.getDrawObjectCount() * 6u;
 		IGPUDescriptorSet* descriptorSets[] = { descriptorSet0.get(), descriptorSet1.get() };
 		cb->bindDescriptorSets(asset::EPBP_GRAPHICS, pipelineLayout.get(), 0u, 2u, descriptorSets);
+
 		if (mode == ExampleMode::CASE_9)
 		{
 
 			// TODO[Przemek]: based on our call bind index buffer you uploaded to part of the `drawResourcesFiller.gpuDrawBuffers.geometryBuffer`
 			// Vertices will be pulled based on baseBDAPointer of where you uploaded the vertex + the VertexID in the vertex shader.
-			cb->bindIndexBuffer({ .offset = m_triangleMeshDrawData.indexBufferOffset, .buffer = drawResourcesFiller.gpuDrawBuffers.geometryBuffer.get() }, asset::EIT_32BIT);
+			cb->bindIndexBuffer({ .offset = resources.geometryInfo.bufferOffset + m_triangleMeshDrawData.indexBufferOffset, .buffer = drawResourcesFiller.getResourcesGPUBuffer().get()}, asset::EIT_32BIT);
 
 			// TODO[Przemek]: binding the same pipelie, no need to change.
 			cb->bindGraphicsPipeline(graphicsPipeline.get());
 
 			// TODO[Przemek]: contour settings, height shading settings, base bda pointers will need to be pushed via pushConstants before the draw currently as it's the easiest thing to do.
-
+			m_triangleMeshDrawData.pushConstants.triangleMeshVerticesBaseAddress += resourcesGPUBuffer->getDeviceAddress() + resources.geometryInfo.bufferOffset;
 			cb->pushConstants(graphicsPipeline->getLayout(), IGPUShader::E_SHADER_STAGE::ESS_VERTEX, 0, sizeof(PushConstants), &m_triangleMeshDrawData.pushConstants);
 
 			// TODO[Przemek]: draw parameters needs to reflect the mesh involved
@@ -1388,7 +1348,8 @@ class ComputerAidedDesign final : public examples::SimpleWindowedApplication, pu
 		}
 		else
 		{
-			cb->bindIndexBuffer({ .offset = 0u, .buffer = drawResourcesFiller.gpuDrawBuffers.indexBuffer.get() }, asset::EIT_32BIT);
+			assert(currentIndexCount == resources.indexBuffer.getCount());
+			cb->bindIndexBuffer({ .offset = resources.indexBuffer.bufferOffset, .buffer = resourcesGPUBuffer.get() }, asset::EIT_32BIT);
 			cb->bindGraphicsPipeline(graphicsPipeline.get());
 			cb->drawIndexed(currentIndexCount, 1u, 0u, 0u, 0u);
 		}

From 09050536fc442a68b3da11d308432f2c4f2d375b Mon Sep 17 00:00:00 2001
From: devsh <devsh@devsh.eu>
Date: Mon, 31 Mar 2025 13:13:16 +0200
Subject: [PATCH 121/529] Correct Alpha handling in RT Pipeline example, also
 work around https://github.com/microsoft/DirectXShaderCompiler/issues/6464

---
 .../app_resources/common.hlsl                 | 19 +++++++++++++++----
 .../app_resources/raytrace.rahit.hlsl         | 10 +++++-----
 .../app_resources/raytrace.rgen.hlsl          |  7 +++----
 .../app_resources/raytrace_shadow.rahit.hlsl  |  5 ++++-
 71_RayTracingPipeline/main.cpp                |  2 +-
 5 files changed, 28 insertions(+), 15 deletions(-)

diff --git a/71_RayTracingPipeline/app_resources/common.hlsl b/71_RayTracingPipeline/app_resources/common.hlsl
index d64851b17..32e9de671 100644
--- a/71_RayTracingPipeline/app_resources/common.hlsl
+++ b/71_RayTracingPipeline/app_resources/common.hlsl
@@ -3,6 +3,7 @@
 
 #include "nbl/builtin/hlsl/cpp_compat.hlsl"
 #include "nbl/builtin/hlsl/cpp_compat/basic.h"
+#include "nbl/builtin/hlsl/random/pcg.hlsl"
 
 NBL_CONSTEXPR uint32_t WorkgroupSize = 16;
 NBL_CONSTEXPR uint32_t MAX_UNORM_10 = 1023;
@@ -196,10 +197,20 @@ struct MaterialId
 
 struct [raypayload] PrimaryPayload
 {
-    float32_t3 worldNormal : read(caller) : write(closesthit);
-    float32_t rayDistance : read(caller) : write(closesthit, miss);
-    float32_t alphaThreshold : read(closesthit, anyhit) : write(caller);
-    MaterialId materialId : read(caller) : write(closesthit);
+    using generator_t = nbl::hlsl::random::Pcg;
+/* bugged out by https://github.com/microsoft/DirectXShaderCompiler/issues/6464
+    bool nextDiscard(const float32_t alpha)
+    {
+        const uint32_t bitpattern = pcg();
+        const float32_t xi = (float32_t(bitpattern)+0.5f)/float32_t(0xFFFFFFFF);
+        return xi > alpha;
+    }
+*/
+
+    float32_t3  worldNormal : read(caller) : write(closesthit);
+    float32_t   rayDistance : read(caller) : write(closesthit, miss);
+    generator_t pcg         : read(anyhit) : write(caller,anyhit);
+    MaterialId  materialId  : read(caller) : write(closesthit);
 
 };
 
diff --git a/71_RayTracingPipeline/app_resources/raytrace.rahit.hlsl b/71_RayTracingPipeline/app_resources/raytrace.rahit.hlsl
index c499e0506..16f7551b1 100644
--- a/71_RayTracingPipeline/app_resources/raytrace.rahit.hlsl
+++ b/71_RayTracingPipeline/app_resources/raytrace.rahit.hlsl
@@ -7,10 +7,10 @@ void main(inout PrimaryPayload payload, in BuiltInTriangleIntersectionAttributes
 {
     const int instID = InstanceID();
     const STriangleGeomInfo geom = vk::RawBufferLoad < STriangleGeomInfo > (pc.triangleGeomInfoBuffer + instID * sizeof(STriangleGeomInfo));
-    const Material material = nbl::hlsl::_static_cast<Material>(geom.material);
-    
-    if (material.alpha > payload.alphaThreshold)
-    {
+
+    // Should have been a method of the payload but https://github.com/microsoft/DirectXShaderCompiler/issues/6464 stops it
+    // alpha is quantized to 10 bits
+    const uint32_t bitpattern = payload.pcg()>>22;
+    if (bitpattern > geom.material.alpha)
         IgnoreHit();
-    }
 }
diff --git a/71_RayTracingPipeline/app_resources/raytrace.rgen.hlsl b/71_RayTracingPipeline/app_resources/raytrace.rgen.hlsl
index ef84ced3e..c182d961e 100644
--- a/71_RayTracingPipeline/app_resources/raytrace.rgen.hlsl
+++ b/71_RayTracingPipeline/app_resources/raytrace.rgen.hlsl
@@ -2,7 +2,6 @@
 
 #include "nbl/builtin/hlsl/jit/device_capabilities.hlsl"
 #include "nbl/builtin/hlsl/random/xoroshiro.hlsl"
-#include "nbl/builtin/hlsl/random/pcg.hlsl"
 
 #include "nbl/builtin/hlsl/glsl_compat/core.hlsl"
 #include "nbl/builtin/hlsl/spirv_intrinsics/raytracing.hlsl"
@@ -28,8 +27,8 @@ void main()
     const uint32_t3 launchSize = DispatchRaysDimensions();
     const uint32_t2 coords = launchID.xy;
 
-    const uint32_t seed1 = nbl::hlsl::Pcg::construct(pc.frameCounter)();
-    const uint32_t seed2 = nbl::hlsl::Pcg::construct(launchID.y * launchSize.x + launchID.x)();
+    const uint32_t seed1 = nbl::hlsl::random::Pcg::create(pc.frameCounter)();
+    const uint32_t seed2 = nbl::hlsl::random::Pcg::create(launchID.y * launchSize.x + launchID.x)();
     nbl::hlsl::Xoroshiro64StarStar rnd = nbl::hlsl::Xoroshiro64StarStar::construct(uint32_t2(seed1, seed2));
 
     float32_t3 hitValues = float32_t3(0, 0, 0);
@@ -55,7 +54,7 @@ void main()
         rayDesc.TMax = 10000.0;
         
         PrimaryPayload payload;
-        payload.alphaThreshold = nextRandomUnorm(rnd);
+        payload.pcg = PrimaryPayload::generator_t::create(rnd());
         TraceRay(topLevelAS, RAY_FLAG_NONE, 0xff, ERT_PRIMARY, 0, EMT_PRIMARY, rayDesc, payload);
 
         const float32_t rayDistance = payload.rayDistance;
diff --git a/71_RayTracingPipeline/app_resources/raytrace_shadow.rahit.hlsl b/71_RayTracingPipeline/app_resources/raytrace_shadow.rahit.hlsl
index 88a9b79db..2357bb830 100644
--- a/71_RayTracingPipeline/app_resources/raytrace_shadow.rahit.hlsl
+++ b/71_RayTracingPipeline/app_resources/raytrace_shadow.rahit.hlsl
@@ -9,6 +9,9 @@ void main(inout OcclusionPayload payload, in BuiltInTriangleIntersectionAttribut
     const STriangleGeomInfo geom = vk::RawBufferLoad < STriangleGeomInfo > (pc.triangleGeomInfoBuffer + instID * sizeof(STriangleGeomInfo));
     const Material material = nbl::hlsl::_static_cast<Material>(geom.material);
     
-    payload.attenuation = material.alpha * payload.attenuation;
+    payload.attenuation = (1.f-material.alpha) * payload.attenuation;
+    // arbitrary constant
+//    if (payload.attenuation < 1.f/1024.f)
+//        TerminateRay();
     IgnoreHit();
 }
diff --git a/71_RayTracingPipeline/main.cpp b/71_RayTracingPipeline/main.cpp
index e4d53008e..73225d083 100644
--- a/71_RayTracingPipeline/main.cpp
+++ b/71_RayTracingPipeline/main.cpp
@@ -1130,7 +1130,7 @@ class RaytracingPipelineApp final : public examples::SimpleWindowedApplication,
           .diffuse = {0.2, 0.8, 0.2},
           .specular = {0.8, 0.8, 0.8},
           .shininess = 1.0f,
-          .alpha = 0.8,
+          .alpha = 0.2,
         },
         .transform = getTranslationMatrix(5.0f, 1.0f, 0),
       },

From d776d24fbdc7e497ee1329ab9d95598890cc1357 Mon Sep 17 00:00:00 2001
From: Erfan Ahmadi <ahmadierfan99@gmail.com>
Date: Mon, 31 Mar 2025 14:53:40 +0330
Subject: [PATCH 122/529] [WIP] add getMinimumRequiredResourcesBufferSize
 function, still not compiling

---
 62_CAD/DrawResourcesFiller.cpp | 11 +++++++----
 62_CAD/DrawResourcesFiller.h   | 10 +++++++++-
 2 files changed, 16 insertions(+), 5 deletions(-)

diff --git a/62_CAD/DrawResourcesFiller.cpp b/62_CAD/DrawResourcesFiller.cpp
index 710df3cb9..d9280cb76 100644
--- a/62_CAD/DrawResourcesFiller.cpp
+++ b/62_CAD/DrawResourcesFiller.cpp
@@ -42,10 +42,11 @@ void DrawResourcesFiller::setSubmitDrawsFunction(const SubmitFunc& func)
 //	gpuDrawBuffers.indexBuffer->setObjectDebugName("indexBuffer");
 //}
 
-
-void DrawResourcesFiller::allocateDrawResourcesBuffer(ILogicalDevice* logicalDevice, size_t size)
+void DrawResourcesFiller::allocateResourcesBuffer(ILogicalDevice* logicalDevice, size_t size)
 {
+
 	size = core::alignUp(size, BDALoadAlignment);
+	size = core::max(size, getMinimumRequiredResourcesBufferSize());
 	IGPUBuffer::SCreationParams geometryCreationParams = {};
 	geometryCreationParams.size = size;
 	geometryCreationParams.usage = bitflag(IGPUBuffer::EUF_SHADER_DEVICE_ADDRESS_BIT) | IGPUBuffer::EUF_TRANSFER_DST_BIT | IGPUBuffer::EUF_INDEX_BUFFER_BIT;
@@ -221,8 +222,8 @@ void DrawResourcesFiller::drawHatch(
 {
 	// TODO[Optimization Idea]: don't draw hatch twice if both colors are visible: instead do the msdf inside the alpha resolve by detecting mainObj being a hatch
 	// https://discord.com/channels/593902898015109131/856835291712716820/1228337893366300743
-	// TODO: Come back to this idea when doing color resolve for ecws (they don't have mainObj/style Index, instead they have uv into a texture
-	
+	// TODO: Come back to this idea when doing color resolve for ecws (they don't have mainObj/style Index, instead they have uv into a texture	
+
 	// if backgroundColor is visible
 	drawHatch(hatch, backgroundColor, intendedNextSubmit);
 	// if foregroundColor is visible
@@ -268,6 +269,7 @@ void DrawResourcesFiller::drawHatch(const Hatch& hatch, const float32_t4& color,
 	drawHatch(hatch, color, HatchFillPattern::SOLID_FILL, intendedNextSubmit);
 }
 
+// TODO: FIX
 void DrawResourcesFiller::drawFontGlyph(
 		nbl::ext::TextRendering::FontFace* fontFace,
 		uint32_t glyphIdx,
@@ -302,6 +304,7 @@ void DrawResourcesFiller::drawFontGlyph(
 	}
 }
 
+// TODO: FIX
 void DrawResourcesFiller::_test_addImageObject(float64_t2 topLeftPos, float32_t2 size, float32_t rotation, SIntendedSubmitInfo& intendedNextSubmit)
 {
 	auto addImageObject_Internal = [&](const ImageObjectInfo& imageObjectInfo, uint32_t mainObjIdx) -> bool
diff --git a/62_CAD/DrawResourcesFiller.h b/62_CAD/DrawResourcesFiller.h
index 47c4ba146..a8010a7ec 100644
--- a/62_CAD/DrawResourcesFiller.h
+++ b/62_CAD/DrawResourcesFiller.h
@@ -95,8 +95,16 @@ struct DrawResourcesFiller
 
 	typedef std::function<void(SIntendedSubmitInfo&)> SubmitFunc;
 	void setSubmitDrawsFunction(const SubmitFunc& func);
+	
+	/// @brief Get minimum required size for resources buffer (containing objects and geometry info and their settings)
+	consteval size_t getMinimumRequiredResourcesBufferSize() const
+	{
+		// for auto-submission to work correctly, memory needs to serve at least 2 linestyle, 1 dtm settings, 1 clip proj, 1 main obj, 1 draw obj and 512 bytes of additional mem for geometries and index buffer
+		// this is the ABSOLUTE MINIMUM (if this value is used rendering will probably be as slow as CPU drawing :D)
+		return core::alignUp(sizeof(LineStyle) * 2u + sizeof(DTMSettings) + sizeof(ClipProjectionData) + sizeof(MainObject) + sizeof(DrawObject) + 512ull, BDALoadAlignment);
+	}
 
-	void allocateDrawResourcesBuffer(ILogicalDevice* logicalDevice, size_t size);
+	void allocateResourcesBuffer(ILogicalDevice* logicalDevice, size_t size);
 
 	void allocateMSDFTextures(ILogicalDevice* logicalDevice, uint32_t maxMSDFs, uint32_t2 msdfsExtent);
 

From 699f2632e96ce772f40e6715c1216c7cde96026d Mon Sep 17 00:00:00 2001
From: devsh <devsh@devsh.eu>
Date: Mon, 31 Mar 2025 13:33:28 +0200
Subject: [PATCH 123/529] clean up the code a bit more, address comments in
 https://github.com/Devsh-Graphics-Programming/Nabla-Examples-and-Tests/pull/173

---
 .../app_resources/common.hlsl                 | 27 +++++++++----------
 .../app_resources/raytrace.rahit.hlsl         |  6 ++---
 .../app_resources/raytrace.rgen.hlsl          |  9 ++++---
 .../app_resources/raytrace.rint.hlsl          | 10 ++-----
 4 files changed, 23 insertions(+), 29 deletions(-)

diff --git a/71_RayTracingPipeline/app_resources/common.hlsl b/71_RayTracingPipeline/app_resources/common.hlsl
index 32e9de671..a5916812d 100644
--- a/71_RayTracingPipeline/app_resources/common.hlsl
+++ b/71_RayTracingPipeline/app_resources/common.hlsl
@@ -53,6 +53,11 @@ struct Material
     {
         return alpha < 1.0;
     }
+
+    bool alphaTest(const float32_t xi) NBL_CONST_MEMBER_FUNC
+    {
+        return xi > alpha;
+    }
 };
 
 struct MaterialPacked
@@ -67,6 +72,11 @@ struct MaterialPacked
     {
         return alpha != MAX_UNORM_10;
     }
+
+    bool alphaTest(const uint32_t xi) NBL_CONST_MEMBER_FUNC
+    {
+        return (xi>>22) > alpha;
+    }
 };
 
 struct SProceduralGeomInfo
@@ -198,14 +208,6 @@ struct MaterialId
 struct [raypayload] PrimaryPayload
 {
     using generator_t = nbl::hlsl::random::Pcg;
-/* bugged out by https://github.com/microsoft/DirectXShaderCompiler/issues/6464
-    bool nextDiscard(const float32_t alpha)
-    {
-        const uint32_t bitpattern = pcg();
-        const float32_t xi = (float32_t(bitpattern)+0.5f)/float32_t(0xFFFFFFFF);
-        return xi > alpha;
-    }
-*/
 
     float32_t3  worldNormal : read(caller) : write(closesthit);
     float32_t   rayDistance : read(caller) : write(closesthit, miss);
@@ -335,12 +337,9 @@ float32_t3 fetchVertexNormal(int instID, int primID, STriangleGeomInfo geom, flo
         case OT_ICOSPHERE:
         default:
         {
-                n0 = normalize(vk::RawBufferLoad <
-                float3 > (normalVertexBufferAddress + i0 * vertexStride));
-                n1 = normalize(vk::RawBufferLoad <
-                float3 > (normalVertexBufferAddress + i1 * vertexStride));
-                n2 = normalize(vk::RawBufferLoad <
-                float3 > (normalVertexBufferAddress + i2 * vertexStride));
+                n0 = vk::RawBufferLoad < float3 > (normalVertexBufferAddress + i0 * vertexStride);
+                n1 = vk::RawBufferLoad < float3 > (normalVertexBufferAddress + i1 * vertexStride);
+                n2 = vk::RawBufferLoad < float3 > (normalVertexBufferAddress + i2 * vertexStride);
             }
     }
 
diff --git a/71_RayTracingPipeline/app_resources/raytrace.rahit.hlsl b/71_RayTracingPipeline/app_resources/raytrace.rahit.hlsl
index 16f7551b1..97713b3ec 100644
--- a/71_RayTracingPipeline/app_resources/raytrace.rahit.hlsl
+++ b/71_RayTracingPipeline/app_resources/raytrace.rahit.hlsl
@@ -8,9 +8,7 @@ void main(inout PrimaryPayload payload, in BuiltInTriangleIntersectionAttributes
     const int instID = InstanceID();
     const STriangleGeomInfo geom = vk::RawBufferLoad < STriangleGeomInfo > (pc.triangleGeomInfoBuffer + instID * sizeof(STriangleGeomInfo));
 
-    // Should have been a method of the payload but https://github.com/microsoft/DirectXShaderCompiler/issues/6464 stops it
-    // alpha is quantized to 10 bits
-    const uint32_t bitpattern = payload.pcg()>>22;
-    if (bitpattern > geom.material.alpha)
+    const uint32_t bitpattern = payload.pcg();
+    if (geom.material.alphaTest(bitpattern))
         IgnoreHit();
 }
diff --git a/71_RayTracingPipeline/app_resources/raytrace.rgen.hlsl b/71_RayTracingPipeline/app_resources/raytrace.rgen.hlsl
index c182d961e..3e2c45bfe 100644
--- a/71_RayTracingPipeline/app_resources/raytrace.rgen.hlsl
+++ b/71_RayTracingPipeline/app_resources/raytrace.rgen.hlsl
@@ -65,6 +65,12 @@ void main()
         }
 
         const float32_t3 worldPosition = pc.camPos + (camDirection * rayDistance);
+
+        // make sure to call with least live state
+        RayLight cLight;
+        cLight.inHitPosition = worldPosition;
+        CallShader(pc.light.type, cLight);
+
         const float32_t3 worldNormal = payload.worldNormal;
 
         Material material;
@@ -80,9 +86,6 @@ void main()
             const MaterialPacked materialPacked = vk::RawBufferLoad<MaterialPacked>(pc.triangleGeomInfoBuffer + materialId.getMaterialIndex() * sizeof(STriangleGeomInfo));
             material = nbl::hlsl::_static_cast<Material>(materialPacked);
         }
-        RayLight cLight;
-        cLight.inHitPosition = worldPosition;
-        CallShader(pc.light.type, cLight);
 
         float32_t attenuation = 1;
 
diff --git a/71_RayTracingPipeline/app_resources/raytrace.rint.hlsl b/71_RayTracingPipeline/app_resources/raytrace.rint.hlsl
index ab623382d..d081c9248 100644
--- a/71_RayTracingPipeline/app_resources/raytrace.rint.hlsl
+++ b/71_RayTracingPipeline/app_resources/raytrace.rint.hlsl
@@ -18,14 +18,8 @@ float32_t hitSphere(SProceduralGeomInfo s, Ray r)
     float32_t c = dot(oc, oc) - s.radius * s.radius;
     float32_t discriminant = b * b - 4 * a * c;
 
-    if (discriminant < 0)
-    {
-        return -1.0;
-    }
-    else
-    {
-        return (-b - sqrt(discriminant)) / (2.0 * a);
-    }
+    // return whatever, if the discriminant is negative, it will produce a NaN, and NaN will compare false
+    return (-b - sqrt(discriminant)) / (2.0 * a);
 }
 
 [shader("intersection")]

From bb2fd06626342be02dad8e043a4ea395fef7049d Mon Sep 17 00:00:00 2001
From: devsh <devsh@devsh.eu>
Date: Mon, 31 Mar 2025 14:28:34 +0200
Subject: [PATCH 124/529] do shadows without any closest hit shaders, one miss
 shader instead

---
 .../app_resources/common.hlsl                  |  7 +++++--
 .../app_resources/raytrace.rgen.hlsl           |  8 +++++---
 .../app_resources/raytrace_shadow.rahit.hlsl   | 11 +++++++----
 .../app_resources/raytrace_shadow.rmiss.hlsl   |  8 ++++++++
 .../raytrace_shadow_triangle.rchit.hlsl        |  7 -------
 71_RayTracingPipeline/main.cpp                 | 18 +++++++++---------
 6 files changed, 34 insertions(+), 25 deletions(-)
 create mode 100644 71_RayTracingPipeline/app_resources/raytrace_shadow.rmiss.hlsl
 delete mode 100644 71_RayTracingPipeline/app_resources/raytrace_shadow_triangle.rchit.hlsl

diff --git a/71_RayTracingPipeline/app_resources/common.hlsl b/71_RayTracingPipeline/app_resources/common.hlsl
index a5916812d..18b67085a 100644
--- a/71_RayTracingPipeline/app_resources/common.hlsl
+++ b/71_RayTracingPipeline/app_resources/common.hlsl
@@ -170,7 +170,10 @@ struct RayLight
 
 struct [raypayload] OcclusionPayload
 {
-    float32_t attenuation : read(caller) : write(caller, anyhit);
+    // TODO: will this break DXC? Tbh should come from push constant or some autoexposure feedback
+    // NBL_CONSTEXPR_STATIC_INLINE float32_t MinAttenuation = 1.f/1024.f;
+
+    float32_t attenuation : read(caller,anyhit,miss) : write(caller,anyhit,miss);
 };
 
 struct MaterialId
@@ -210,7 +213,7 @@ struct [raypayload] PrimaryPayload
     using generator_t = nbl::hlsl::random::Pcg;
 
     float32_t3  worldNormal : read(caller) : write(closesthit);
-    float32_t   rayDistance : read(caller) : write(closesthit, miss);
+    float32_t   rayDistance : read(caller) : write(closesthit,miss);
     generator_t pcg         : read(anyhit) : write(caller,anyhit);
     MaterialId  materialId  : read(caller) : write(closesthit);
 
diff --git a/71_RayTracingPipeline/app_resources/raytrace.rgen.hlsl b/71_RayTracingPipeline/app_resources/raytrace.rgen.hlsl
index 3e2c45bfe..55b014d07 100644
--- a/71_RayTracingPipeline/app_resources/raytrace.rgen.hlsl
+++ b/71_RayTracingPipeline/app_resources/raytrace.rgen.hlsl
@@ -97,13 +97,15 @@ void main()
             rayDesc.TMin = 0.01;
             rayDesc.TMax = cLight.outLightDistance;
 
-            uint32_t shadowRayFlags = RAY_FLAG_ACCEPT_FIRST_HIT_AND_END_SEARCH;
             OcclusionPayload occlusionPayload;
-            occlusionPayload.attenuation = 1;
+            // negative means its a hit, the miss shader will flip it back around to positive
+            occlusionPayload.attenuation = -1.f;
+            // abuse of miss shader to mean "not hit shader" solves us having to call closest hit shaders
+            uint32_t shadowRayFlags = RAY_FLAG_ACCEPT_FIRST_HIT_AND_END_SEARCH | RAY_FLAG_SKIP_CLOSEST_HIT_SHADER;
             TraceRay(topLevelAS, shadowRayFlags, 0xFF, ERT_OCCLUSION, 0, EMT_OCCLUSION, rayDesc, occlusionPayload);
 
             attenuation = occlusionPayload.attenuation;
-            if (occlusionPayload.attenuation > 0.0001)
+            if (occlusionPayload.attenuation > 1.f/1024.f)
             {
                 const float32_t3 diffuse = computeDiffuse(material, cLight.outLightDir, worldNormal);
                 const float32_t3 specular = computeSpecular(material, camDirection, cLight.outLightDir, worldNormal);
diff --git a/71_RayTracingPipeline/app_resources/raytrace_shadow.rahit.hlsl b/71_RayTracingPipeline/app_resources/raytrace_shadow.rahit.hlsl
index 2357bb830..a3432b812 100644
--- a/71_RayTracingPipeline/app_resources/raytrace_shadow.rahit.hlsl
+++ b/71_RayTracingPipeline/app_resources/raytrace_shadow.rahit.hlsl
@@ -1,4 +1,5 @@
 #include "common.hlsl"
+#include "nbl/builtin/hlsl/spirv_intrinsics/raytracing.hlsl"
 
 [[vk::push_constant]] SPushConstants pc;
 
@@ -9,9 +10,11 @@ void main(inout OcclusionPayload payload, in BuiltInTriangleIntersectionAttribut
     const STriangleGeomInfo geom = vk::RawBufferLoad < STriangleGeomInfo > (pc.triangleGeomInfoBuffer + instID * sizeof(STriangleGeomInfo));
     const Material material = nbl::hlsl::_static_cast<Material>(geom.material);
     
-    payload.attenuation = (1.f-material.alpha) * payload.attenuation;
-    // arbitrary constant
-//    if (payload.attenuation < 1.f/1024.f)
-//        TerminateRay();
+    const float attenuation = (1.f-material.alpha) * payload.attenuation;
+    // DXC cogegens weird things in the presence of termination instructions
+    payload.attenuation = attenuation;
+    // arbitrary constant, whatever you want the smallest attenuation to be. Remember until miss, the attenuatio is negative
+    if (attenuation > -1.f/1024.f)
+        AcceptHitAndEndSearch();
     IgnoreHit();
 }
diff --git a/71_RayTracingPipeline/app_resources/raytrace_shadow.rmiss.hlsl b/71_RayTracingPipeline/app_resources/raytrace_shadow.rmiss.hlsl
new file mode 100644
index 000000000..441a1b42a
--- /dev/null
+++ b/71_RayTracingPipeline/app_resources/raytrace_shadow.rmiss.hlsl
@@ -0,0 +1,8 @@
+#include "common.hlsl"
+
+[shader("miss")]
+void main(inout OcclusionPayload payload)
+{
+    // make positive
+    payload.attenuation = -payload.attenuation;
+}
diff --git a/71_RayTracingPipeline/app_resources/raytrace_shadow_triangle.rchit.hlsl b/71_RayTracingPipeline/app_resources/raytrace_shadow_triangle.rchit.hlsl
deleted file mode 100644
index c85c7c32d..000000000
--- a/71_RayTracingPipeline/app_resources/raytrace_shadow_triangle.rchit.hlsl
+++ /dev/null
@@ -1,7 +0,0 @@
-#include "common.hlsl"
-
-[shader("closesthit")]
-void main(inout OcclusionPayload payload, in BuiltInTriangleIntersectionAttributes attribs)
-{
-    payload.attenuation = 0;
-}
diff --git a/71_RayTracingPipeline/main.cpp b/71_RayTracingPipeline/main.cpp
index 73225d083..35c750373 100644
--- a/71_RayTracingPipeline/main.cpp
+++ b/71_RayTracingPipeline/main.cpp
@@ -163,7 +163,7 @@ class RaytracingPipelineApp final : public examples::SimpleWindowedApplication,
     const auto anyHitShaderColorPayload = loadCompileAndCreateShader("app_resources/raytrace.rahit.hlsl");
     const auto anyHitShaderShadowPayload = loadCompileAndCreateShader("app_resources/raytrace_shadow.rahit.hlsl");
     const auto missShader = loadCompileAndCreateShader("app_resources/raytrace.rmiss.hlsl");
-    const auto shadowClosestHitShader = loadCompileAndCreateShader("app_resources/raytrace_shadow_triangle.rchit.hlsl");
+    const auto missShadowShader = loadCompileAndCreateShader("app_resources/raytrace_shadow.rmiss.hlsl");
     const auto directionalLightCallShader = loadCompileAndCreateShader("app_resources/light_directional.rcall.hlsl");
     const auto pointLightCallShader = loadCompileAndCreateShader("app_resources/light_point.rcall.hlsl");
     const auto spotLightCallShader = loadCompileAndCreateShader("app_resources/light_spot.rcall.hlsl");
@@ -323,7 +323,7 @@ class RaytracingPipelineApp final : public examples::SimpleWindowedApplication,
       {
         RTDS_RAYGEN,
         RTDS_MISS,
-        RTDS_CLOSEST_HIT_SHADOW,
+        RTDS_MISS_SHADOW,
         RTDS_CLOSEST_HIT,
         RTDS_SPHERE_CLOSEST_HIT,
         RTDS_ANYHIT_PRIMARY,
@@ -338,7 +338,7 @@ class RaytracingPipelineApp final : public examples::SimpleWindowedApplication,
       IGPUShader::SSpecInfo shaders[RTDS_COUNT];
       shaders[RTDS_RAYGEN] = {.shader = raygenShader.get()};
       shaders[RTDS_MISS] = {.shader = missShader.get()};
-      shaders[RTDS_CLOSEST_HIT_SHADOW] = { .shader = shadowClosestHitShader.get() };
+      shaders[RTDS_MISS_SHADOW] = { .shader = missShadowShader.get() };
       shaders[RTDS_CLOSEST_HIT] = {.shader = closestHitShader.get()};
       shaders[RTDS_SPHERE_CLOSEST_HIT] = {.shader = proceduralClosestHitShader.get()};
       shaders[RTDS_ANYHIT_PRIMARY] = {.shader = anyHitShaderColorPayload.get()};
@@ -351,9 +351,9 @@ class RaytracingPipelineApp final : public examples::SimpleWindowedApplication,
       params.layout = pipelineLayout.get();
       params.shaders = std::span(shaders);
       using RayTracingFlags = IGPURayTracingPipeline::SCreationParams::FLAGS;
-      params.flags = core::bitflag(RayTracingFlags::NO_NULL_INTERSECTION_SHADERS) | 
-        RayTracingFlags::NO_NULL_ANY_HIT_SHADERS |
-        RayTracingFlags::NO_NULL_CLOSEST_HIT_SHADERS;
+      params.flags = core::bitflag(RayTracingFlags::NO_NULL_MISS_SHADERS) |
+        RayTracingFlags::NO_NULL_INTERSECTION_SHADERS | 
+        RayTracingFlags::NO_NULL_ANY_HIT_SHADERS;
 
       auto& shaderGroups = params.shaderGroups;
 
@@ -361,7 +361,7 @@ class RaytracingPipelineApp final : public examples::SimpleWindowedApplication,
 
       IRayTracingPipelineBase::SGeneralShaderGroup missGroups[EMT_COUNT];
       missGroups[EMT_PRIMARY] = { .index = RTDS_MISS };
-      missGroups[EMT_OCCLUSION] = { .index = IGPURayTracingPipeline::SGeneralShaderGroup::Unused };
+      missGroups[EMT_OCCLUSION] = { .index = RTDS_MISS_SHADOW };
       shaderGroups.misses = missGroups;
 
       auto getHitGroupIndex = [](E_GEOM_TYPE geomType, E_RAY_TYPE rayType)
@@ -374,7 +374,7 @@ class RaytracingPipelineApp final : public examples::SimpleWindowedApplication,
         .anyHit = RTDS_ANYHIT_PRIMARY,
       };
       hitGroups[getHitGroupIndex(EGT_TRIANGLES, ERT_OCCLUSION)] = {
-        .closestHit = RTDS_CLOSEST_HIT_SHADOW,
+        .closestHit = IGPURayTracingPipeline::SGeneralShaderGroup::Unused,
         .anyHit = RTDS_ANYHIT_SHADOW,
       };
       hitGroups[getHitGroupIndex(EGT_PROCEDURAL, ERT_PRIMARY)] = {
@@ -383,7 +383,7 @@ class RaytracingPipelineApp final : public examples::SimpleWindowedApplication,
         .intersection = RTDS_INTERSECTION,
       };
       hitGroups[getHitGroupIndex(EGT_PROCEDURAL, ERT_OCCLUSION)] = {
-        .closestHit = RTDS_CLOSEST_HIT_SHADOW,
+        .closestHit = IGPURayTracingPipeline::SGeneralShaderGroup::Unused,
         .anyHit = RTDS_ANYHIT_SHADOW,
         .intersection = RTDS_INTERSECTION,
       };

From ca219416680386ae5cd8de42470960a7a7899c50 Mon Sep 17 00:00:00 2001
From: keptsecret <sorchon@gmail.com>
Date: Tue, 1 Apr 2025 16:24:35 +0700
Subject: [PATCH 125/529] benchmarking shader + pipeline working

---
 .../app_resources/benchmarkSubgroup.comp.hlsl |  73 ++++++++
 71_ArithmeticBench/app_resources/common.hlsl  |  54 +++---
 .../app_resources/shaderCommon.hlsl           |  60 +++----
 .../app_resources/testSubgroup.comp.hlsl      |   4 +-
 71_ArithmeticBench/main.cpp                   | 161 ++++++++++++++----
 5 files changed, 251 insertions(+), 101 deletions(-)
 create mode 100644 71_ArithmeticBench/app_resources/benchmarkSubgroup.comp.hlsl

diff --git a/71_ArithmeticBench/app_resources/benchmarkSubgroup.comp.hlsl b/71_ArithmeticBench/app_resources/benchmarkSubgroup.comp.hlsl
new file mode 100644
index 000000000..f3cc679ef
--- /dev/null
+++ b/71_ArithmeticBench/app_resources/benchmarkSubgroup.comp.hlsl
@@ -0,0 +1,73 @@
+#pragma shader_stage(compute)
+
+#define operation_t nbl::hlsl::OPERATION
+
+#include "shaderCommon.hlsl"
+
+uint32_t globalIndex()
+{
+    return nbl::hlsl::glsl::gl_WorkGroupID().x*WORKGROUP_SIZE+nbl::hlsl::workgroup::SubgroupContiguousIndex();
+}
+
+bool canStore() {return true;}
+
+#ifndef NUM_LOOPS
+#error "Define NUM_LOOPS!"
+#endif
+
+// template<template<class> class binop, typename T, uint32_t N>
+// static void subbench(NBL_CONST_REF_ARG(type_t) sourceVal)
+// {
+//     using config_t = nbl::hlsl::subgroup::Configuration<SUBGROUP_SIZE_LOG2>;
+//     using params_t = nbl::hlsl::subgroup2::ArithmeticParams<config_t, typename binop<T>::base_t, N, nbl::hlsl::jit::device_capabilities>;
+
+//     const uint32_t storeAddr = sizeof(uint32_t) + sizeof(type_t) * globalIndex();
+
+//     operation_t<params_t> func;
+//     [unroll]
+//     for (uint32_t i = 0; i < NUM_LOOPS; i++)
+//     {
+//         const uint32_t arrIndex = i & 7u;   // i % 8
+//         output[arrIndex].template Store<type_t>(storeAddr, func(sourceVal));
+//     }
+// }
+
+template<template<class> class binop, typename T, uint32_t N>
+static void subbench(NBL_CONST_REF_ARG(type_t) sourceVal)
+{
+    using config_t = nbl::hlsl::subgroup::Configuration<SUBGROUP_SIZE_LOG2>;
+    using params_t = nbl::hlsl::subgroup2::ArithmeticParams<config_t, typename binop<T>::base_t, N, nbl::hlsl::jit::device_capabilities>;
+    type_t value = sourceVal;
+
+    operation_t<params_t> func;
+    [unroll]
+    for (uint32_t i = 0; i < NUM_LOOPS; i++)
+        value = func(value);
+
+    output[binop<T>::BindingIndex].template Store<type_t>(sizeof(uint32_t) + sizeof(type_t) * globalIndex(), value);
+}
+
+void benchmark()
+{
+    const uint32_t idx = globalIndex() * ITEMS_PER_INVOCATION;
+    type_t sourceVal;
+    [unroll]
+    for (uint32_t i = 0; i < ITEMS_PER_INVOCATION; i++)
+    {
+        sourceVal[i] = inputValue[idx + i];
+    }
+
+    subbench<bit_and, uint32_t, ITEMS_PER_INVOCATION>(sourceVal);
+    subbench<bit_xor, uint32_t, ITEMS_PER_INVOCATION>(sourceVal);
+    subbench<bit_or, uint32_t, ITEMS_PER_INVOCATION>(sourceVal);
+    subbench<plus, uint32_t, ITEMS_PER_INVOCATION>(sourceVal);
+    subbench<multiplies, uint32_t, ITEMS_PER_INVOCATION>(sourceVal);
+    subbench<minimum, uint32_t, ITEMS_PER_INVOCATION>(sourceVal);
+    subbench<maximum, uint32_t, ITEMS_PER_INVOCATION>(sourceVal);
+}
+
+[numthreads(WORKGROUP_SIZE,1,1)]
+void main()
+{
+    benchmark();
+}
diff --git a/71_ArithmeticBench/app_resources/common.hlsl b/71_ArithmeticBench/app_resources/common.hlsl
index 8921659db..67d3f16ca 100644
--- a/71_ArithmeticBench/app_resources/common.hlsl
+++ b/71_ArithmeticBench/app_resources/common.hlsl
@@ -4,91 +4,91 @@
 template<uint32_t kScanElementCount=1024*1024>
 struct Output
 {
-	NBL_CONSTEXPR_STATIC_INLINE uint32_t ScanElementCount = kScanElementCount;
+    NBL_CONSTEXPR_STATIC_INLINE uint32_t ScanElementCount = kScanElementCount;
 
-	uint32_t subgroupSize;
-	uint32_t data[ScanElementCount];
+    uint32_t subgroupSize;
+    uint32_t data[ScanElementCount];
 };
 
 template<typename T>
 struct bit_and : nbl::hlsl::bit_and<T>
 {
-	using base_t = nbl::hlsl::bit_and<T>;
+    using base_t = nbl::hlsl::bit_and<T>;
 
-	NBL_CONSTEXPR_STATIC_INLINE uint16_t BindingIndex = 0;
+    NBL_CONSTEXPR_STATIC_INLINE uint16_t BindingIndex = 0;
 #ifndef __HLSL_VERSION
-	static inline constexpr const char* name = "bit_and";
+    static inline constexpr const char* name = "bit_and";
 #endif
 };
 template<typename T>
 struct bit_or : nbl::hlsl::bit_or<T>
 {
-	using base_t = nbl::hlsl::bit_or<T>;
+    using base_t = nbl::hlsl::bit_or<T>;
 
-	NBL_CONSTEXPR_STATIC_INLINE uint16_t BindingIndex = 1;
+    NBL_CONSTEXPR_STATIC_INLINE uint16_t BindingIndex = 1;
 #ifndef __HLSL_VERSION
-	static inline constexpr const char* name = "bit_xor";
+    static inline constexpr const char* name = "bit_xor";
 #endif
 };
 template<typename T>
 struct bit_xor : nbl::hlsl::bit_xor<T>
 {
-	using base_t = nbl::hlsl::bit_xor<T>;
+    using base_t = nbl::hlsl::bit_xor<T>;
 
-	NBL_CONSTEXPR_STATIC_INLINE uint16_t BindingIndex = 2;
+    NBL_CONSTEXPR_STATIC_INLINE uint16_t BindingIndex = 2;
 #ifndef __HLSL_VERSION
-	static inline constexpr const char* name = "bit_or";
+    static inline constexpr const char* name = "bit_or";
 #endif
 };
 template<typename T>
 struct plus : nbl::hlsl::plus<T>
 {
-	using base_t = nbl::hlsl::plus<T>;
+    using base_t = nbl::hlsl::plus<T>;
 
-	NBL_CONSTEXPR_STATIC_INLINE uint16_t BindingIndex = 3;
+    NBL_CONSTEXPR_STATIC_INLINE uint16_t BindingIndex = 3;
 #ifndef __HLSL_VERSION
-	static inline constexpr const char* name = "plus";
+    static inline constexpr const char* name = "plus";
 #endif
 };
 template<typename T>
 struct multiplies : nbl::hlsl::multiplies<T>
 {
-	using base_t = nbl::hlsl::multiplies<T>;
+    using base_t = nbl::hlsl::multiplies<T>;
 
-	NBL_CONSTEXPR_STATIC_INLINE uint16_t BindingIndex = 4;
+    NBL_CONSTEXPR_STATIC_INLINE uint16_t BindingIndex = 4;
 #ifndef __HLSL_VERSION
-	static inline constexpr const char* name = "multiplies";
+    static inline constexpr const char* name = "multiplies";
 #endif
 };
 template<typename T>
 struct minimum : nbl::hlsl::minimum<T>
 {
-	using base_t = nbl::hlsl::minimum<T>;
+    using base_t = nbl::hlsl::minimum<T>;
 
-	NBL_CONSTEXPR_STATIC_INLINE uint16_t BindingIndex = 5;
+    NBL_CONSTEXPR_STATIC_INLINE uint16_t BindingIndex = 5;
 #ifndef __HLSL_VERSION
-	static inline constexpr const char* name = "minimum";
+    static inline constexpr const char* name = "minimum";
 #endif
 };
 template<typename T>
 struct maximum : nbl::hlsl::maximum<T>
 {
-	using base_t = nbl::hlsl::maximum<T>;
+    using base_t = nbl::hlsl::maximum<T>;
 
-	NBL_CONSTEXPR_STATIC_INLINE uint16_t BindingIndex = 6;
+    NBL_CONSTEXPR_STATIC_INLINE uint16_t BindingIndex = 6;
 #ifndef __HLSL_VERSION
-	static inline constexpr const char* name = "maximum";
+    static inline constexpr const char* name = "maximum";
 #endif
 };
 
 template<typename T>
 struct ballot : nbl::hlsl::plus<T>
 {
-	using base_t = nbl::hlsl::plus<T>;
+    using base_t = nbl::hlsl::plus<T>;
 
-	NBL_CONSTEXPR_STATIC_INLINE uint16_t BindingIndex = 7;
+    NBL_CONSTEXPR_STATIC_INLINE uint16_t BindingIndex = 7;
 #ifndef __HLSL_VERSION
-	static inline constexpr const char* name = "bitcount";
+    static inline constexpr const char* name = "bitcount";
 #endif
 };
 
diff --git a/71_ArithmeticBench/app_resources/shaderCommon.hlsl b/71_ArithmeticBench/app_resources/shaderCommon.hlsl
index e7105da62..fa3713c44 100644
--- a/71_ArithmeticBench/app_resources/shaderCommon.hlsl
+++ b/71_ArithmeticBench/app_resources/shaderCommon.hlsl
@@ -31,16 +31,6 @@ typedef vector<uint32_t, ITEMS_PER_INVOCATION> type_t;
 #ifndef OPERATION
 #error "Define OPERATION!"
 #endif
-// template<template<class> class binop>
-// static void subtest(NBL_CONST_REF_ARG(type_t) sourceVal)
-// {
-// 	if (globalIndex()==0u)
-// 		output[binop::BindingIndex].template Store<uint32_t>(0,nbl::hlsl::glsl::gl_SubgroupSize());
-		
-// 	operation_t<typename binop<type_t>::base_t,nbl::hlsl::jit::device_capabilities> func;
-// 	if (canStore())
-// 		output[binop::BindingIndex].template Store<type_t>(sizeof(uint32_t)+sizeof(type_t)*globalIndex(),func(sourceVal));
-// }
 
 #ifndef SUBGROUP_SIZE_LOG2
 #error "Define SUBGROUP_SIZE_LOG2!"
@@ -48,38 +38,38 @@ typedef vector<uint32_t, ITEMS_PER_INVOCATION> type_t;
 template<template<class> class binop, typename T, uint32_t N>
 static void subtest(NBL_CONST_REF_ARG(type_t) sourceVal)
 {
-	// TODO static assert vector<T, N> == type_t
-	//using type_t = vector<T, N>;
-	using config_t = nbl::hlsl::subgroup::Configuration<SUBGROUP_SIZE_LOG2>;
-	using params_t = nbl::hlsl::subgroup2::ArithmeticParams<config_t, typename binop<T>::base_t, N, nbl::hlsl::jit::device_capabilities>;
+    // TODO static assert vector<T, N> == type_t
+    //using type_t = vector<T, N>;
+    using config_t = nbl::hlsl::subgroup::Configuration<SUBGROUP_SIZE_LOG2>;
+    using params_t = nbl::hlsl::subgroup2::ArithmeticParams<config_t, typename binop<T>::base_t, N, nbl::hlsl::jit::device_capabilities>;
 
-	if (globalIndex()==0u)
-		output[binop<T>::BindingIndex].template Store<uint32_t>(0,nbl::hlsl::glsl::gl_SubgroupSize());
-		
-	operation_t<params_t> func;
-	if (canStore())
-		output[binop<T>::BindingIndex].template Store<type_t>(sizeof(uint32_t)+sizeof(type_t)*globalIndex(),func(sourceVal));
+    if (globalIndex()==0u)
+        output[binop<T>::BindingIndex].template Store<uint32_t>(0,nbl::hlsl::glsl::gl_SubgroupSize());
+        
+    operation_t<params_t> func;
+    if (canStore())
+        output[binop<T>::BindingIndex].template Store<type_t>(sizeof(uint32_t)+sizeof(type_t)*globalIndex(),func(sourceVal));
 }
 
 
 type_t test()
 {
-	const uint32_t idx = globalIndex() * ITEMS_PER_INVOCATION;
-	type_t sourceVal;
-	[unroll]
-	for (uint32_t i = 0; i < ITEMS_PER_INVOCATION; i++)
-	{
-		sourceVal[i] = inputValue[idx + i];
-	}
+    const uint32_t idx = globalIndex() * ITEMS_PER_INVOCATION;
+    type_t sourceVal;
+    [unroll]
+    for (uint32_t i = 0; i < ITEMS_PER_INVOCATION; i++)
+    {
+        sourceVal[i] = inputValue[idx + i];
+    }
 
-	subtest<bit_and, uint32_t, ITEMS_PER_INVOCATION>(sourceVal);
-	subtest<bit_xor, uint32_t, ITEMS_PER_INVOCATION>(sourceVal);
-	subtest<bit_or, uint32_t, ITEMS_PER_INVOCATION>(sourceVal);
-	subtest<plus, uint32_t, ITEMS_PER_INVOCATION>(sourceVal);
-	subtest<multiplies, uint32_t, ITEMS_PER_INVOCATION>(sourceVal);
-	subtest<minimum, uint32_t, ITEMS_PER_INVOCATION>(sourceVal);
-	subtest<maximum, uint32_t, ITEMS_PER_INVOCATION>(sourceVal);
-	return sourceVal;
+    subtest<bit_and, uint32_t, ITEMS_PER_INVOCATION>(sourceVal);
+    subtest<bit_xor, uint32_t, ITEMS_PER_INVOCATION>(sourceVal);
+    subtest<bit_or, uint32_t, ITEMS_PER_INVOCATION>(sourceVal);
+    subtest<plus, uint32_t, ITEMS_PER_INVOCATION>(sourceVal);
+    subtest<multiplies, uint32_t, ITEMS_PER_INVOCATION>(sourceVal);
+    subtest<minimum, uint32_t, ITEMS_PER_INVOCATION>(sourceVal);
+    subtest<maximum, uint32_t, ITEMS_PER_INVOCATION>(sourceVal);
+    return sourceVal;
 }
 
 #include "nbl/builtin/hlsl/workgroup/basic.hlsl"
diff --git a/71_ArithmeticBench/app_resources/testSubgroup.comp.hlsl b/71_ArithmeticBench/app_resources/testSubgroup.comp.hlsl
index 50173ce42..2cc1ccb60 100644
--- a/71_ArithmeticBench/app_resources/testSubgroup.comp.hlsl
+++ b/71_ArithmeticBench/app_resources/testSubgroup.comp.hlsl
@@ -6,7 +6,7 @@
 
 uint32_t globalIndex()
 {
-	return nbl::hlsl::glsl::gl_WorkGroupID().x*WORKGROUP_SIZE+nbl::hlsl::workgroup::SubgroupContiguousIndex();
+    return nbl::hlsl::glsl::gl_WorkGroupID().x*WORKGROUP_SIZE+nbl::hlsl::workgroup::SubgroupContiguousIndex();
 }
 
 bool canStore() {return true;}
@@ -14,5 +14,5 @@ bool canStore() {return true;}
 [numthreads(WORKGROUP_SIZE,1,1)]
 void main()
 {
-	test();
+    test();
 }
diff --git a/71_ArithmeticBench/main.cpp b/71_ArithmeticBench/main.cpp
index c03700e2a..29f9ede8a 100644
--- a/71_ArithmeticBench/main.cpp
+++ b/71_ArithmeticBench/main.cpp
@@ -2,6 +2,8 @@
 #include "nbl/application_templates/MonoAssetManagerAndBuiltinResourceApplication.hpp"
 #include "app_resources/common.hlsl"
 
+#include <chrono>
+
 using namespace nbl;
 using namespace core;
 using namespace asset;
@@ -188,7 +190,7 @@ class ArithmeticBenchApp final : public application_templates::BasicMultiQueueAp
 		};
 
 		auto subgroupTestSource = getShaderSource("app_resources/testSubgroup.comp.hlsl");
-		auto workgroupTestSource = getShaderSource("app_resources/testWorkgroup.comp.hlsl");
+		//auto workgroupTestSource = getShaderSource("app_resources/testWorkgroup.comp.hlsl");
 		// now create or retrieve final resources to run our tests
 		sema = m_device->createSemaphore(timelineValue);
 		resultsBuffer = ICPUBuffer::create({ outputBuffers[0]->getSize() });
@@ -203,11 +205,75 @@ class ArithmeticBenchApp final : public application_templates::BasicMultiQueueAp
 		
 		// TODO variable items per invocation?
 		const uint32_t ItemsPerInvocation = 4u;
+		const uint32_t NumLoops = 100000u;
 		const std::array<uint32_t, 3> workgroupSizes = { 256, 512, 1024 };
 		// const auto MaxWorkgroupSize = m_physicalDevice->getLimits().maxComputeWorkGroupInvocations;
 		const auto MinSubgroupSize = m_physicalDevice->getLimits().minSubgroupSize;
 		const auto MaxSubgroupSize = m_physicalDevice->getLimits().maxSubgroupSize;
-		for (auto subgroupSize=MinSubgroupSize; subgroupSize <= MaxSubgroupSize; subgroupSize *= 2u)
+		
+		if (b_runTests)
+			runTests(subgroupTestSource, elementCount, ItemsPerInvocation, MinSubgroupSize, MaxSubgroupSize, workgroupSizes);
+
+		double time = runBenchmark<emulatedReduction>(subgroupTestSource, elementCount, 5, 256, ItemsPerInvocation, NumLoops);
+		m_logger->log("Ran for %.3fms (disregard these numbers, profile in Nsight)", ILogger::ELL_INFO, time * 1000.0);
+
+		//for (auto subgroupSize = MinSubgroupSize; subgroupSize <= MaxSubgroupSize; subgroupSize *= 2u)
+		//{
+		//	const uint8_t subgroupSizeLog2 = hlsl::findMSB(subgroupSize);
+		//	for (const auto& workgroupSize : workgroupSizes)
+		//	{
+		//		passed = runBenchmark<emulatedReduction>(subgroupTestSource, queryPool, elementCount, subgroupSizeLog2, workgroupSize, ItemsPerInvocation, NumLoops) && passed;
+		//		logTestOutcome(passed, workgroupSize);
+		//		passed = runBenchmark<emulatedScanInclusive>(subgroupTestSource, elementCount, subgroupSizeLog2, workgroupSize, ItemsPerInvocation, NumLoops) && passed;
+		//		logTestOutcome(passed, workgroupSize);
+		//		passed = runBenchmark<emulatedScanExclusive>(subgroupTestSource, elementCount, subgroupSizeLog2, workgroupSize, ItemsPerInvocation, NumLoops) && passed;
+		//		logTestOutcome(passed, workgroupSize);
+
+		//		// save cache every now and then	
+		//		{
+		//			auto cpu = m_spirv_isa_cache->convertToCPUCache();
+		//			// Normally we'd beautifully JSON serialize the thing, allow multiple devices & drivers + metadata
+		//			auto bin = cpu->getEntries().begin()->second.bin;
+		//			IFile::success_t success;
+		//			m_spirv_isa_cache_output->write(success, bin->data(), 0ull, bin->size());
+		//			if (!success)
+		//				logFail("Could not write Create SPIR-V to ISA cache to disk!");
+		//		}
+		//	}
+		//}
+
+		return true;
+	}
+
+	virtual bool onAppTerminated() override
+	{
+		m_logger->log("==========Result==========", ILogger::ELL_INFO);
+		m_logger->log("Fail Count: %u", ILogger::ELL_INFO, totalFailCount);
+		delete[] inputData;
+		return true;
+	}
+
+	// the unit test is carried out on init
+	void workLoopBody() override {}
+
+	//
+	bool keepRunning() override { return true; }
+
+private:
+	void logTestOutcome(bool passed, uint32_t workgroupSize)
+	{
+		if (passed)
+			m_logger->log("Passed test #%u", ILogger::ELL_INFO, workgroupSize);
+		else
+		{
+			totalFailCount++;
+			m_logger->log("Failed test #%u", ILogger::ELL_ERROR, workgroupSize);
+		}
+	}
+
+	void runTests(smart_refctd_ptr<ICPUShader> subgroupTestSource, uint32_t elementCount, uint32_t ItemsPerInvocation, uint32_t MinSubgroupSize, uint32_t MaxSubgroupSize, const std::array<uint32_t, 3>& workgroupSizes)
+	{
+		for (auto subgroupSize = MinSubgroupSize; subgroupSize <= MaxSubgroupSize; subgroupSize *= 2u)
 		{
 			const uint8_t subgroupSizeLog2 = hlsl::findMSB(subgroupSize);
 			for (const auto& workgroupSize : workgroupSizes)
@@ -242,40 +308,12 @@ class ArithmeticBenchApp final : public application_templates::BasicMultiQueueAp
 					// Normally we'd beautifully JSON serialize the thing, allow multiple devices & drivers + metadata
 					auto bin = cpu->getEntries().begin()->second.bin;
 					IFile::success_t success;
-					m_spirv_isa_cache_output->write(success,bin->data(),0ull,bin->size());
+					m_spirv_isa_cache_output->write(success, bin->data(), 0ull, bin->size());
 					if (!success)
 						logFail("Could not write Create SPIR-V to ISA cache to disk!");
 				}
 			}
 		}
-
-		return true;
-	}
-
-	virtual bool onAppTerminated() override
-	{
-		m_logger->log("==========Result==========", ILogger::ELL_INFO);
-		m_logger->log("Fail Count: %u", ILogger::ELL_INFO, totalFailCount);
-		delete[] inputData;
-		return true;
-	}
-
-	// the unit test is carried out on init
-	void workLoopBody() override {}
-
-	//
-	bool keepRunning() override { return false; }
-
-private:
-	void logTestOutcome(bool passed, uint32_t workgroupSize)
-	{
-		if (passed)
-			m_logger->log("Passed test #%u", ILogger::ELL_INFO, workgroupSize);
-		else
-		{
-			totalFailCount++;
-			m_logger->log("Failed test #%u", ILogger::ELL_ERROR, workgroupSize);
-		}
 	}
 
 	// create pipeline (specialized every test) [TODO: turn into a future/async]
@@ -297,12 +335,6 @@ class ArithmeticBenchApp final : public application_templates::BasicMultiQueueAp
 		return pipeline;
 	}
 
-	/*template<template<class> class Arithmetic, bool WorkgroupTest>
-	bool runTest(const smart_refctd_ptr<const ICPUShader>& source, const uint32_t elementCount, const uint32_t workgroupSize, uint32_t itemsPerWG = ~0u)
-	{
-		return true;
-	}*/
-
 	template<template<class> class Arithmetic, bool WorkgroupTest>
 	bool runTest(const smart_refctd_ptr<const ICPUShader>& source, const uint32_t elementCount, const uint8_t subgroupSizeLog2, const uint32_t workgroupSize, uint32_t itemsPerWG = ~0u, uint32_t itemsPerInvoc = 1u)
 	{
@@ -448,11 +480,66 @@ class ArithmeticBenchApp final : public application_templates::BasicMultiQueueAp
 		return success;
 	}
 
+
+	template<template<class> class Arithmetic>
+	double runBenchmark(const smart_refctd_ptr<const ICPUShader>& source, const uint32_t elementCount, const uint8_t subgroupSizeLog2, const uint32_t workgroupSize, uint32_t itemsPerInvoc = 1u, uint32_t numLoops = 8u)
+	{
+		std::string arith_name = Arithmetic<bit_xor<float>>::name;
+
+		smart_refctd_ptr<ICPUShader> overridenUnspecialized = CHLSLCompiler::createOverridenCopy(
+			source.get(), "#define OPERATION %s\n#define WORKGROUP_SIZE %d\n#define ITEMS_PER_INVOCATION %d\n#define SUBGROUP_SIZE_LOG2 %d\n",
+			(("subgroup2::") + arith_name).c_str(), workgroupSize, itemsPerInvoc, subgroupSizeLog2
+		);
+		auto pipeline = createPipeline(overridenUnspecialized.get(), subgroupSizeLog2);
+
+		const uint32_t workgroupCount = elementCount / (workgroupSize * itemsPerInvoc);
+		cmdbuf->begin(IGPUCommandBuffer::USAGE::NONE);
+
+		cmdbuf->bindComputePipeline(pipeline.get());
+		cmdbuf->bindDescriptorSets(EPBP_COMPUTE, pipeline->getLayout(), 0u, 1u, &descriptorSet.get());
+		cmdbuf->dispatch(workgroupCount, 1, 1);
+		{
+			IGPUCommandBuffer::SPipelineBarrierDependencyInfo::buffer_barrier_t memoryBarrier[OutputBufferCount];
+			for (auto i = 0u; i < OutputBufferCount; i++)
+			{
+				memoryBarrier[i] = {
+					.barrier = {
+						.dep = {
+							.srcStageMask = PIPELINE_STAGE_FLAGS::COMPUTE_SHADER_BIT,
+							.srcAccessMask = ACCESS_FLAGS::SHADER_WRITE_BITS,
+							// in theory we don't need the HOST BITS cause we block on a semaphore but might as well add them
+							.dstStageMask = PIPELINE_STAGE_FLAGS::COMPUTE_SHADER_BIT | PIPELINE_STAGE_FLAGS::HOST_BIT,
+							.dstAccessMask = ACCESS_FLAGS::SHADER_WRITE_BITS | ACCESS_FLAGS::HOST_READ_BIT
+						}
+					},
+					.range = {0ull,outputBuffers[i]->getSize(),outputBuffers[i]}
+				};
+			}
+			IGPUCommandBuffer::SPipelineBarrierDependencyInfo info = { .memBarriers = {},.bufBarriers = memoryBarrier };
+			cmdbuf->pipelineBarrier(asset::E_DEPENDENCY_FLAGS::EDF_NONE, info);
+		}
+		cmdbuf->end();
+
+		auto startTime = std::chrono::high_resolution_clock::now();
+
+		const IQueue::SSubmitInfo::SSemaphoreInfo signal[1] = { {.semaphore = sema.get(),.value = ++timelineValue} };
+		const IQueue::SSubmitInfo::SCommandBufferInfo cmdbufs[1] = { {.cmdbuf = cmdbuf.get()} };
+		const IQueue::SSubmitInfo submits[1] = { {.commandBuffers = cmdbufs,.signalSemaphores = signal} };
+		computeQueue->submit(submits);
+		const ISemaphore::SWaitInfo wait[1] = { {.semaphore = sema.get(),.value = timelineValue} };
+		m_device->blockForSemaphores(wait);
+
+		auto endTime = std::chrono::high_resolution_clock::now();
+
+		return std::chrono::duration<double>(endTime - startTime).count();
+	}
+
 	IQueue* transferDownQueue;
 	IQueue* computeQueue;
 	smart_refctd_ptr<IGPUPipelineCache> m_spirv_isa_cache;
 	smart_refctd_ptr<IFile> m_spirv_isa_cache_output;
 
+	bool b_runTests = false;
 	uint32_t* inputData = nullptr;
 	constexpr static inline uint32_t OutputBufferCount = 8u;
 	smart_refctd_ptr<IGPUBuffer> outputBuffers[OutputBufferCount];

From 4dc28d57be6d72cbe779f36bf12599769a7c7d7e Mon Sep 17 00:00:00 2001
From: Erfan Ahmadi <ahmadierfan99@gmail.com>
Date: Wed, 2 Apr 2025 01:21:07 +0330
Subject: [PATCH 126/529] [WIP] more drawResource auto-submission logic fixes

---
 62_CAD/DrawResourcesFiller.cpp | 229 ++++++++++++++-------------------
 62_CAD/DrawResourcesFiller.h   |  76 +++++------
 62_CAD/main.cpp                |   2 +-
 3 files changed, 129 insertions(+), 178 deletions(-)

diff --git a/62_CAD/DrawResourcesFiller.cpp b/62_CAD/DrawResourcesFiller.cpp
index d9280cb76..1eff63552 100644
--- a/62_CAD/DrawResourcesFiller.cpp
+++ b/62_CAD/DrawResourcesFiller.cpp
@@ -44,8 +44,7 @@ void DrawResourcesFiller::setSubmitDrawsFunction(const SubmitFunc& func)
 
 void DrawResourcesFiller::allocateResourcesBuffer(ILogicalDevice* logicalDevice, size_t size)
 {
-
-	size = core::alignUp(size, BDALoadAlignment);
+	size = core::alignUp(size, ResourcesMaxNaturalAlignment);
 	size = core::max(size, getMinimumRequiredResourcesBufferSize());
 	IGPUBuffer::SCreationParams geometryCreationParams = {};
 	geometryCreationParams.size = size;
@@ -169,12 +168,7 @@ void DrawResourcesFiller::drawTriangleMesh(const CTriangleMesh& mesh, CTriangleM
 	const size_t vtxBuffByteSize = mesh.getVertexBuffByteSize();
 	const size_t dataToAddByteSize = vtxBuffByteSize + indexBuffByteSize;
 
-	// copy into gemoetry cpu buffer insteaed
-
-	const size_t totalResourcesConsumption = resourcesCollection.calculateTotalConsumption();
-
-	// TODO: rename, its not just points
-	const uint32_t remainingResourcesSize = static_cast<uint32_t>(resourcesGPUBuffer->getSize() - totalResourcesConsumption);
+	const size_t remainingResourcesSize = calculateRemainingResourcesSize();
 
 	// TODO: assert of geometry buffer size, do i need to check if size of objects to be added <= remainingResourcesSize?
 	// TODO: auto submit instead of assert
@@ -183,8 +177,8 @@ void DrawResourcesFiller::drawTriangleMesh(const CTriangleMesh& mesh, CTriangleM
 	{
 		// NOTE[ERFAN]: these push contants will be removed, everything will be accessed by dtmSettings, including where the vertex buffer data resides
 		auto& geometryBytesVector = resourcesCollection.geometryInfo.vector;
-		size_t geometryBufferOffset = core::alignUp(geometryBytesVector.size(), BDALoadAlignment);
-		geometryBytesVector.resize(geometryBufferOffset + dataToAddByteSize);
+		size_t geometryBufferOffset = core::alignUp(geometryBytesVector.size(), ResourcesMaxNaturalAlignment);
+		geometryBytesVector.resize(geometryBufferOffset + dataToAddByteSize); // this will increase total resources consumption and reduce remainingResourcesSize --> no need to update any size trackers
 
 		// Copy VertexBuffer
 		void* dst = geometryBytesVector.data() + geometryBufferOffset;
@@ -207,9 +201,6 @@ void DrawResourcesFiller::drawTriangleMesh(const CTriangleMesh& mesh, CTriangleM
 	uint32_t dtmSettingsIndex = addDTMSettings_SubmitIfNeeded(dtmSettingsInfo, intendedNextSubmit);
 
 	drawData.pushConstants.triangleMeshMainObjectIndex = addMainObject_SubmitIfNeeded(InvalidStyleIdx, dtmSettingsIndex, intendedNextSubmit);
-
-	// TODO: use this function later for auto submit
-	//submitCurrentDrawObjectsAndReset(intendedNextSubmit, 0);
 }
 
 // TODO[Erfan]: Makes more sense if parameters are: solidColor + fillPattern + patternColor
@@ -361,32 +352,46 @@ bool DrawResourcesFiller::finalizeAllCopiesToGPU(SIntendedSubmitInfo& intendedNe
 
 uint32_t DrawResourcesFiller::addLineStyle_SubmitIfNeeded(const LineStyleInfo& lineStyle, SIntendedSubmitInfo& intendedNextSubmit)
 {
+	const size_t remainingResourcesSize = calculateRemainingResourcesSize();
+	const bool enoughMem = remainingResourcesSize >= sizeof(LineStyle); // enough remaining memory for 1 more linestyle?
+	
 	uint32_t outLineStyleIdx = addLineStyle_Internal(lineStyle);
 	if (outLineStyleIdx == InvalidStyleIdx)
 	{
+		// There wasn't enough resource memory remaining to fit a single LineStyle
 		finalizeAllCopiesToGPU(intendedNextSubmit);
 		submitDraws(intendedNextSubmit);
-		resetGeometryCounters();
-		resetMainObjectCounters();
-		resetLineStyleCounters();
-		resetDTMSettingsCounters();
+		
+		// resets itself
+		resetLineStyles();
+		// resets higher level resources
+		resetMainObjects();
+		resetDrawObjects();
+
 		outLineStyleIdx = addLineStyle_Internal(lineStyle);
 		assert(outLineStyleIdx != InvalidStyleIdx);
 	}
+
 	return outLineStyleIdx;
 }
 
 uint32_t DrawResourcesFiller::addDTMSettings_SubmitIfNeeded(const DTMSettingsInfo& dtmSettings, SIntendedSubmitInfo& intendedNextSubmit)
 {
+	// before calling `addDTMSettings_Internal` we have made sute we have enough mem for 
 	uint32_t outDTMSettingIdx = addDTMSettings_Internal(dtmSettings, intendedNextSubmit);
-	if (outDTMSettingIdx == InvalidStyleIdx)
+	if (outDTMSettingIdx == InvalidDTMSettingsIdx)
 	{
+		// There wasn't enough resource memory remaining to fit dtmsettings struct + 2 linestyles structs.
 		finalizeAllCopiesToGPU(intendedNextSubmit);
 		submitDraws(intendedNextSubmit);
-		resetGeometryCounters();
-		resetMainObjectCounters();
-		resetLineStyleCounters();
-		resetDTMSettingsCounters();
+		
+		// resets itself
+		resetDTMSettings();
+		resetLineStyles(); // additionally resets linestyles as well, just to be safe
+		// resets higher level resources
+		resetMainObjects();
+		resetDrawObjects();
+
 		outDTMSettingIdx = addDTMSettings_Internal(dtmSettings, intendedNextSubmit);
 		assert(outDTMSettingIdx != InvalidDTMSettingsIdx);
 	}
@@ -402,18 +407,17 @@ uint32_t DrawResourcesFiller::addMainObject_SubmitIfNeeded(uint32_t styleIdx, ui
 	uint32_t outMainObjectIdx = addMainObject_Internal(mainObject);
 	if (outMainObjectIdx == InvalidMainObjectIdx)
 	{
+		// failed to fit into remaining resources mem or exceeded max indexable mainobj
 		finalizeAllCopiesToGPU(intendedNextSubmit);
 		submitDraws(intendedNextSubmit);
-
-		// geometries needs to be reset because they reference draw objects and draw objects reference main objects that are now unavailable and reset
-		resetGeometryCounters();
-		// mainObjects needs to be reset because we submitted every previous main object
-		resetMainObjectCounters();
-		// we shouldn't reset linestyles and clip projections here because it was possibly requested to push to mem before addMainObjects
-		// but clip projections are reset due to geometry/bda buffer being reset so we need to push again
 		
-		// acquireCurrentClipProjectionAddress again here because clip projection should exist in the geometry buffer, and reseting geometry counters will invalidate the current clip proj and requires repush
-		mainObject.clipProjectionAddress = acquireCurrentClipProjectionAddress(intendedNextSubmit);
+		// resets itself
+		resetMainObjects();
+		// resets higher level resources
+		resetDrawObjects();
+		// we shouldn't reset lower level resources like linestyles and clip projections here because it was possibly requested to push to mem before addMainObjects
+
+		// try to add again
 		outMainObjectIdx = addMainObject_Internal(mainObject);
 		assert(outMainObjectIdx != InvalidMainObjectIdx);
 	}
@@ -638,6 +642,12 @@ bool DrawResourcesFiller::finalizeTextureCopies(SIntendedSubmitInfo& intendedNex
 	}
 }
 
+const size_t DrawResourcesFiller::calculateRemainingResourcesSize() const
+{
+	assert(resourcesGPUBuffer->getSize() >= resourcesCollection.calculateTotalConsumption());
+	return resourcesGPUBuffer->getSize() - resourcesCollection.calculateTotalConsumption();
+}
+
 void DrawResourcesFiller::submitCurrentDrawObjectsAndReset(SIntendedSubmitInfo& intendedNextSubmit, uint32_t mainObjectIndex)
 {
 	finalizeAllCopiesToGPU(intendedNextSubmit);
@@ -645,97 +655,61 @@ void DrawResourcesFiller::submitCurrentDrawObjectsAndReset(SIntendedSubmitInfo&
 
 	// We reset Geometry Counters (drawObj+geometryInfos) because we're done rendering previous geometry
 	// We don't reset counters for styles because we will be reusing them
-	resetGeometryCounters();
-	
-#if 1
-	if (mainObjectIndex < maxMainObjects)
-	{
-		// Check if user is following proper usage, mainObjectIndex should be the last mainObj added before an autosubmit, because this is the only mainObj we want to maintain.
-		// See comments on`addMainObject_SubmitIfNeeded` function
-		// TODO: consider forcing this by not expose mainObjectIndex to user and keep track of a "currentMainObj" (?)
-		_NBL_DEBUG_BREAK_IF(mainObjectIndex != (currentMainObjectCount - 1u)); 
-
-		// If the clip projection stack is non-empty, then it means we need to re-push the clipProjectionData (because it existed in geometry data and it was erased)
-		uint64_t newClipProjectionAddress = acquireCurrentClipProjectionAddress(intendedNextSubmit);
-		// only re-upload mainObjData if it's clipProjectionAddress was changed
-		if (newClipProjectionAddress != getMainObject(mainObjectIndex)->clipProjectionAddress)
-		{
-			// then modify the mainObject data
-			getMainObject(mainObjectIndex)->clipProjectionAddress = newClipProjectionAddress;
-			// we need to rewind back inMemMainObjectCount to this mainObjIndex so it re-uploads the current mainObject (because we modified it)
-			inMemMainObjectCount = core::min(inMemMainObjectCount, mainObjectIndex);
-		}
-	}
-
-	// TODO: Consider resetting MainObjects here as well and addMainObject for the new data again, but account for the fact that mainObjectIndex now changed (either change through uint32_t& or keeping track of "currentMainObj" in drawResourcesFiller
-#else
-	resetMainObjectCounters();
-
-	// If there is a mainObject data we need to maintain and keep it's clipProjectionAddr valid
-	if (mainObjectIndex < maxMainObjects)
-	{
-		MainObject mainObjToMaintain = *getMainObject(mainObjectIndex);
-
-		// If the clip projection stack is non-empty, then it means we need to re-push the clipProjectionData (because it exists in geometry data and it was reset)
-		// `acquireCurrentClipProjectionAddress` shouldn't/won't trigger auto-submit because geometry buffer counters were reset and our geometry buffer is supposed to be larger than a single clipProjectionData
-		mainObjToMaintain->clipProjectionAddress = acquireCurrentClipProjectionAddress(intendedNextSubmit);
-		
-		// We're calling `addMainObject_Internal` instead of safer `addMainObject_SubmitIfNeeded` because we've reset our mainObject and we're sure this won't need an autoSubmit.
-		addMainObject_Internal(mainObjToMaintain);
-	}
-#endif
+	resetDrawObjects();
 }
 
 uint32_t DrawResourcesFiller::addMainObject_Internal(const MainObject& mainObject)
 {
-	MainObject* mainObjsArray = reinterpret_cast<MainObject*>(cpuDrawBuffers.mainObjectsBuffer->getPointer());
-	
-	if (currentMainObjectCount >= MaxIndexableMainObjects)
+	const size_t remainingResourcesSize = calculateRemainingResourcesSize();
+	const size_t memRequired = sizeof(MainObject);
+	const bool enoughMem = remainingResourcesSize >= memRequired; // enough remaining memory for 1 more dtm settings with 2 referenced line styles?
+	if (!enoughMem)
 		return InvalidMainObjectIdx;
-	if (currentMainObjectCount >= maxMainObjects)
+	if (resourcesCollection.mainObjects.vector.size() >= MaxIndexableMainObjects)
 		return InvalidMainObjectIdx;
-
-	void* dst = mainObjsArray + currentMainObjectCount;
-	memcpy(dst, &mainObject, sizeof(MainObject));
-	uint32_t ret = currentMainObjectCount;
-	currentMainObjectCount++;
-	return ret;
+	resourcesCollection.mainObjects.vector.push_back(mainObject); // this will implicitly increase total resource consumption and reduce remaining size --> no need for mem size trackers
+	return resourcesCollection.mainObjects.vector.size() - 1u;
 }
 
 uint32_t DrawResourcesFiller::addLineStyle_Internal(const LineStyleInfo& lineStyleInfo)
 {
+	const size_t remainingResourcesSize = calculateRemainingResourcesSize();
+	const bool enoughMem = remainingResourcesSize >= sizeof(LineStyle); // enough remaining memory for 1 more linestyle?
+	if (!enoughMem)
+		return InvalidStyleIdx;
+	// TODO: Additionally constraint by a max size? and return InvalidIdx if it would exceed
+
+
 	LineStyle gpuLineStyle = lineStyleInfo.getAsGPUData();
 	_NBL_DEBUG_BREAK_IF(gpuLineStyle.stipplePatternSize > LineStyle::StipplePatternMaxSize); // Oops, even after style normalization the style is too long to be in gpu mem :(
-	LineStyle* stylesArray = reinterpret_cast<LineStyle*>(cpuDrawBuffers.lineStylesBuffer->getPointer());
-	for (uint32_t i = 0u; i < currentLineStylesCount; ++i)
+	for (uint32_t i = 0u; i < resourcesCollection.lineStyles.vector.size(); ++i)
 	{
-		const LineStyle& itr = stylesArray[i];
-
+		const LineStyle& itr = resourcesCollection.lineStyles.vector[i];
 		if (itr == gpuLineStyle)
 			return i;
 	}
 
-	if (currentLineStylesCount >= maxLineStyles)
-		return InvalidStyleIdx;
-
-	void* dst = stylesArray + currentLineStylesCount;
-	memcpy(dst, &gpuLineStyle, sizeof(LineStyle));
-	return currentLineStylesCount++;
+	resourcesCollection.lineStyles.vector.push_back(gpuLineStyle); // this will implicitly increase total resource consumption and reduce remaining size --> no need for mem size trackers
+	return resourcesCollection.lineStyles.vector.size() - 1u;
 }
 
 uint32_t DrawResourcesFiller::addDTMSettings_Internal(const DTMSettingsInfo& dtmSettingsInfo, SIntendedSubmitInfo& intendedNextSubmit)
 {
+	const size_t remainingResourcesSize = calculateRemainingResourcesSize();
+	const size_t maxMemRequired = sizeof(DTMSettings) + 2 * sizeof(LineStyle);
+	const bool enoughMem = remainingResourcesSize >= maxMemRequired; // enough remaining memory for 1 more dtm settings with 2 referenced line styles?
+
+	if (!enoughMem)
+		return InvalidDTMSettingsIdx;
+	// TODO: Additionally constraint by a max size? and return InvalidIdx if it would exceed
+
 	DTMSettings dtmSettings;
 	dtmSettings.contourLinesStartHeight = dtmSettingsInfo.contourLinesStartHeight;
 	dtmSettings.contourLinesEndHeight = dtmSettingsInfo.contourLinesEndHeight;
 	dtmSettings.contourLinesHeightInterval = dtmSettingsInfo.contourLinesHeightInterval;
 
-	if (currentLineStylesCount + 2 > maxLineStyles)
-		return InvalidDTMSettingsIdx;
-
-	assert(currentLineStylesCount + 2 <= maxLineStyles);
-	dtmSettings.outlineLineStyleIdx = addLineStyle_SubmitIfNeeded(dtmSettingsInfo.outlineLineStyleInfo, intendedNextSubmit);
-	dtmSettings.contourLineStyleIdx = addLineStyle_SubmitIfNeeded(dtmSettingsInfo.contourLineStyleInfo, intendedNextSubmit);
+	dtmSettings.outlineLineStyleIdx = addLineStyle_Internal(dtmSettingsInfo.outlineLineStyleInfo);
+	dtmSettings.contourLineStyleIdx = addLineStyle_Internal(dtmSettingsInfo.contourLineStyleInfo);
 
 	switch (dtmSettingsInfo.heightShadingMode)
 	{
@@ -751,25 +725,18 @@ uint32_t DrawResourcesFiller::addDTMSettings_Internal(const DTMSettingsInfo& dtm
 	}
 	_NBL_DEBUG_BREAK_IF(!dtmSettingsInfo.fillShaderDTMSettingsHeightColorMap(dtmSettings));
 
-	if (currentDTMSettingsCount >= maxDtmSettings)
-		return InvalidDTMSettingsIdx;
-
-	DTMSettings* settingsArray = reinterpret_cast<DTMSettings*>(cpuDrawBuffers.dtmSettingsBuffer->getPointer());
-	for (uint32_t i = 0u; i < currentDTMSettingsCount; ++i)
+	for (uint32_t i = 0u; i < resourcesCollection.dtmSettings.vector.size(); ++i)
 	{
-		const DTMSettings& itr = settingsArray[i];
+		const DTMSettings& itr = resourcesCollection.dtmSettings.vector[i];
 		if (itr == dtmSettings)
 			return i;
 	}
-
-	void* dst = settingsArray + currentDTMSettingsCount;
-	memcpy(dst, &dtmSettings, sizeof(DTMSettings));
-	return currentDTMSettingsCount++;
-
-	return InvalidDTMSettingsIdx;
+	
+	resourcesCollection.dtmSettings.vector.push_back(dtmSettings); // this will implicitly increase total resource consumption and reduce remaining size --> no need for mem size trackers
+	return resourcesCollection.dtmSettings.vector.size() - 1u;
 }
 
-uint64_t DrawResourcesFiller::acquireCurrentClipProjectionAddress(SIntendedSubmitInfo& intendedNextSubmit)
+uint32_t DrawResourcesFiller::acquireCurrentClipProjectionAddress(SIntendedSubmitInfo& intendedNextSubmit)
 {
 	if (clipProjectionAddresses.empty())
 		return InvalidClipProjectionAddress;
@@ -780,35 +747,26 @@ uint64_t DrawResourcesFiller::acquireCurrentClipProjectionAddress(SIntendedSubmi
 	return clipProjectionAddresses.back();
 }
 
-uint64_t DrawResourcesFiller::addClipProjectionData_SubmitIfNeeded(const ClipProjectionData& clipProjectionData, SIntendedSubmitInfo& intendedNextSubmit)
+uint32_t DrawResourcesFiller::addClipProjectionData_SubmitIfNeeded(const ClipProjectionData& clipProjectionData, SIntendedSubmitInfo& intendedNextSubmit)
 {
-	uint64_t outClipProjectionAddress = addClipProjectionData_Internal(clipProjectionData);
-	if (outClipProjectionAddress == InvalidClipProjectionAddress)
+	const size_t remainingResourcesSize = calculateRemainingResourcesSize();
+	const size_t memRequired = sizeof(ClipProjectionData);
+	const bool enoughMem = remainingResourcesSize >= memRequired; // enough remaining memory for 1 more dtm settings with 2 referenced line styles?
+
+	if (!enoughMem)
 	{
 		finalizeAllCopiesToGPU(intendedNextSubmit);
 		submitDraws(intendedNextSubmit);
-
-		resetGeometryCounters();
-		resetMainObjectCounters();
-
-		outClipProjectionAddress = addClipProjectionData_Internal(clipProjectionData);
-		assert(outClipProjectionAddress != InvalidClipProjectionAddress);
+		
+		// resets itself
+		resetCustomClipProjections();
+		// resets higher level resources
+		resetMainObjects();
+		resetDrawObjects();
 	}
-	return outClipProjectionAddress;
-}
-
-uint64_t DrawResourcesFiller::addClipProjectionData_Internal(const ClipProjectionData& clipProjectionData)
-{
-	const uint64_t maxGeometryBufferClipProjData = (maxGeometryBufferSize - currentGeometryBufferSize) / sizeof(ClipProjectionData);
-	if (maxGeometryBufferClipProjData <= 0)
-		return InvalidClipProjectionAddress;
 	
-	uint8_t* dst = reinterpret_cast<uint8_t*>(cpuDrawBuffers.geometryBuffer->getPointer()) + currentGeometryBufferSize;
-	memcpy(dst, &clipProjectionData, sizeof(ClipProjectionData));
-
-	const uint64_t ret = currentGeometryBufferSize + drawResourcesBDA;
-	currentGeometryBufferSize += sizeof(ClipProjectionData);
-	return ret;
+	resourcesCollection.clipProjections.vector.push_back(clipProjectionData); // this will implicitly increase total resource consumption and reduce remaining size --> no need for mem size trackers
+	return resourcesCollection.clipProjections.vector.size() - 1u;
 }
 
 void DrawResourcesFiller::addPolylineObjects_Internal(const CPolylineBase& polyline, const CPolylineBase::SectionInfo& section, uint32_t& currentObjectInSection, uint32_t mainObjIdx)
@@ -865,6 +823,10 @@ void DrawResourcesFiller::addLines_Internal(const CPolylineBase& polyline, const
 	assert(section.count >= 1u);
 	assert(section.type == ObjectType::LINE);
 
+
+	const size_t remainingResourcesSize = calculateRemainingResourcesSize();
+	// how many lines? --> memRequired = sizeof(LinePointInfo) + sizeof(LinePointInfo)*lineCount + sizeof(DrawObject)*lineCount + sizeof(uint32_t) * 6u * lineCount
+
 	const uint32_t maxGeometryBufferPoints = static_cast<uint32_t>((maxGeometryBufferSize - currentGeometryBufferSize) / sizeof(LinePointInfo));
 	const uint32_t maxGeometryBufferLines = (maxGeometryBufferPoints <= 1u) ? 0u : maxGeometryBufferPoints - 1u;
 
@@ -889,6 +851,8 @@ void DrawResourcesFiller::addLines_Internal(const CPolylineBase& polyline, const
 		drawObj.geometryAddress += sizeof(LinePointInfo);
 	}
 
+	// TODO: Add index buffer, 
+
 	// Add Geometry
 	if (objectsToUpload > 0u)
 	{
@@ -1049,7 +1013,6 @@ uint32_t DrawResourcesFiller::addMSDFTexture(const MSDFInputInfo& msdfInput, cor
 
 			// If we reset main objects will cause an auto submission bug, where adding an msdf texture while constructing glyphs will have wrong main object references (See how SingleLineTexts add Glyphs with a single mainObject)
 			// for the same reason we don't reset line styles
-			// `submitCurrentObjectsAndReset` function handles the above + updating clipProjectionData and making sure the mainObjectIdx references to the correct clipProj data after reseting geometry buffer
 			submitCurrentDrawObjectsAndReset(intendedNextSubmit, mainObjIdx);
 		} 
 		else
diff --git a/62_CAD/DrawResourcesFiller.h b/62_CAD/DrawResourcesFiller.h
index a8010a7ec..19579449a 100644
--- a/62_CAD/DrawResourcesFiller.h
+++ b/62_CAD/DrawResourcesFiller.h
@@ -28,7 +28,8 @@ struct DrawResourcesFiller
 {
 public:
 	
-	static constexpr size_t BDALoadAlignment = 8u;
+	// We pack multiple data types in a single buffer, we need to makes sure each offset starts aligned to avoid mis-aligned accesses
+	static constexpr size_t ResourcesMaxNaturalAlignment = 8u;
 
 	/// @brief general parent struct for 1.ReservedCompute and 2.CPUGenerated Resources
 	struct ResourceBase
@@ -37,7 +38,7 @@ struct DrawResourcesFiller
 		size_t bufferOffset = InvalidBufferOffset; // set when copy to gpu buffer is issued
 		virtual size_t getCount() const = 0;
 		virtual size_t getStorageSize() const = 0;
-		virtual size_t getAlignedStorageSize() const { core::alignUp(getStorageSize(), BDALoadAlignment); }
+		virtual size_t getAlignedStorageSize() const { core::alignUp(getStorageSize(), ResourcesMaxNaturalAlignment); }
 	};
 
 	/// @brief ResourceBase reserved for compute shader stages input/output
@@ -61,7 +62,8 @@ struct DrawResourcesFiller
 	/// @brief struct to hold all resources
 	struct ResourcesCollection
 	{
-		// auto-submission level 0 buffers (settings that mainObj references)
+		// auto-submission level 0 resources (settings that mainObj references)
+		// Not enough VRAM available to serve adding one of the level 0 resources: they clear themselves and everything from higher levels after doing submission
 		CPUGeneratedResource<LineStyle> lineStyles;
 		CPUGeneratedResource<DTMSettings> dtmSettings;
 		CPUGeneratedResource<ClipProjectionData> clipProjections;
@@ -71,10 +73,10 @@ struct DrawResourcesFiller
 
 		// auto-submission level 2 buffers
 		CPUGeneratedResource<DrawObject> drawObjects;
-		CPUGeneratedResource<uint32_t> indexBuffer;
+		CPUGeneratedResource<uint32_t> indexBuffer; // this is going to change to ReservedComputeResource where index buffer gets filled by compute shaders
 		CPUGeneratedResource<uint8_t> geometryInfo; // general purpose byte buffer for custom geometries, etc
 
-		// Get Total memory consumption, If all ResourcesCollection get packed together with BDALoadAlignment
+		// Get Total memory consumption, If all ResourcesCollection get packed together with ResourcesMaxNaturalAlignment
 		// used to decide when to overflow
 		size_t calculateTotalConsumption() const
 		{
@@ -101,7 +103,7 @@ struct DrawResourcesFiller
 	{
 		// for auto-submission to work correctly, memory needs to serve at least 2 linestyle, 1 dtm settings, 1 clip proj, 1 main obj, 1 draw obj and 512 bytes of additional mem for geometries and index buffer
 		// this is the ABSOLUTE MINIMUM (if this value is used rendering will probably be as slow as CPU drawing :D)
-		return core::alignUp(sizeof(LineStyle) * 2u + sizeof(DTMSettings) + sizeof(ClipProjectionData) + sizeof(MainObject) + sizeof(DrawObject) + 512ull, BDALoadAlignment);
+		return core::alignUp(sizeof(LineStyle) * 2u + sizeof(DTMSettings) + sizeof(ClipProjectionData) + sizeof(MainObject) + sizeof(DrawObject) + 512ull, ResourcesMaxNaturalAlignment);
 	}
 
 	void allocateResourcesBuffer(ILogicalDevice* logicalDevice, size_t size);
@@ -169,10 +171,10 @@ struct DrawResourcesFiller
 
 	void reset()
 	{
-		resetGeometryCounters();
-		resetMainObjectCounters();
-		resetLineStyleCounters();
-		resetDTMSettingsCounters();
+		resetDrawObjects();
+		resetMainObjects();
+		resetLineStyles();
+		resetDTMSettings();
 	}
 
 	/// @brief collection of all the resources that will eventually be reserved or copied to in the resourcesGPUBuffer, will be accessed via individual BDA pointers in shaders
@@ -227,13 +229,13 @@ struct DrawResourcesFiller
 
 	bool finalizeTextureCopies(SIntendedSubmitInfo& intendedNextSubmit);
 
-	// Internal Function to call whenever we overflow while filling our buffers with geometry (potential limiters: indexBuffer, drawObjectsBuffer or geometryBuffer)
-	// ! mainObjIdx: is the mainObject the "overflowed" drawObjects belong to.
-	//		mainObjIdx is required to ensure that valid data, especially the `clipProjectionData`, remains linked to the main object.
-	//		This is important because, while other data may change during overflow handling, the main object must persist to maintain consistency throughout rendering all parts of it. (for example all lines and beziers of a single polyline)
-	//		[ADVANCED] If you have not created your mainObject yet, pass `InvalidMainObjectIdx` (See drawHatch)
+	const size_t calculateRemainingResourcesSize() const;
+
+	// Internal Function to call whenever we overflow when we can't fill all of mainObject's drawObjects
 	void submitCurrentDrawObjectsAndReset(SIntendedSubmitInfo& intendedNextSubmit, uint32_t mainObjectIndex);
 
+	/// @return index to added main object.
+	///		It will return `InvalidMainObjectIndex` if it there isn't enough remaining resources memory OR the index would exceed MaxIndexableMainObjects
 	uint32_t addMainObject_Internal(const MainObject& mainObject);
 
 	uint32_t addLineStyle_Internal(const LineStyleInfo& lineStyleInfo);
@@ -242,11 +244,9 @@ struct DrawResourcesFiller
 
 	// Gets the current clip projection data (the top of stack) gpu addreess inside the geometryBuffer
 	// If it's been invalidated then it will request to upload again with a possible auto-submit on low geometry buffer memory.
-	uint64_t acquireCurrentClipProjectionAddress(SIntendedSubmitInfo& intendedNextSubmit);
+	uint32_t acquireCurrentClipProjectionAddress(SIntendedSubmitInfo& intendedNextSubmit);
 	
-	uint64_t addClipProjectionData_SubmitIfNeeded(const ClipProjectionData& clipProjectionData, SIntendedSubmitInfo& intendedNextSubmit);
-
-	uint64_t addClipProjectionData_Internal(const ClipProjectionData& clipProjectionData);
+	uint32_t addClipProjectionData_SubmitIfNeeded(const ClipProjectionData& clipProjectionData, SIntendedSubmitInfo& intendedNextSubmit);
 
 	static constexpr uint32_t getCageCountPerPolylineObject(ObjectType type)
 	{
@@ -269,44 +269,32 @@ struct DrawResourcesFiller
 	
 	bool addFontGlyph_Internal(const GlyphInfo& glyphInfo, uint32_t mainObjIdx);
 	
-	void resetMainObjectCounters()
+	void resetMainObjects()
 	{
-		inMemMainObjectCount = 0u;
-		currentMainObjectCount = 0u;
+		resourcesCollection.mainObjects.vector.clear();
 	}
 
-	// WARN: If you plan to use this, make sure you either reset the mainObjectCounters as well
-	//			Or if you want to keep your  mainObject around, make sure you're using the `submitCurrentObjectsAndReset` function instead of calling this directly
-	//			So that it makes your mainObject point to the correct clipProjectionData (which exists in the geometry buffer)
-	void resetGeometryCounters()
+	// these resources are data related to chunks of a whole mainObject
+	void resetDrawObjects()
 	{
-		inMemDrawObjectCount = 0u;
-		currentDrawObjectCount = 0u;
-
-		inMemGeometryBufferSize = 0u;
-		currentGeometryBufferSize = 0u;
-
-		// Invalidate all the clip projection addresses because geometry buffer got reset
-		for (auto& clipProjAddr : clipProjectionAddresses)
-			clipProjAddr = InvalidClipProjectionAddress;
+		resourcesCollection.drawObjects.vector.clear();
+		resourcesCollection.indexBuffer.vector.clear();
+		resourcesCollection.geometryInfo.vector.clear();
 	}
 
-	void resetLineStyleCounters()
+	void resetCustomClipProjections()
 	{
-		currentLineStylesCount = 0u;
-		inMemLineStylesCount = 0u;
+		resourcesCollection.clipProjections.vector.clear();
 	}
 
-	void resetDTMSettingsCounters()
+	void resetLineStyles()
 	{
-		currentDTMSettingsCount = 0u;
-		inMemDTMSettingsCount = 0u;
+		resourcesCollection.lineStyles.vector.clear();
 	}
 
-	MainObject* getMainObject(uint32_t idx)
+	void resetDTMSettings()
 	{
-		MainObject* mainObjsArray = reinterpret_cast<MainObject*>(cpuDrawBuffers.mainObjectsBuffer->getPointer());
-		return &mainObjsArray[idx];
+		resourcesCollection.dtmSettings.vector.clear();
 	}
 
 	// MSDF Hashing and Caching Internal Functions 
diff --git a/62_CAD/main.cpp b/62_CAD/main.cpp
index 020b7cf6c..6784e6b70 100644
--- a/62_CAD/main.cpp
+++ b/62_CAD/main.cpp
@@ -289,7 +289,7 @@ class ComputerAidedDesign final : public examples::SimpleWindowedApplication, pu
 		drawResourcesFiller = DrawResourcesFiller(core::smart_refctd_ptr(m_utils), getGraphicsQueue());
 
 		size_t bufferSize = 512u * 1024u * 1024u; // 512 MB
-		drawResourcesFiller.allocateDrawResourcesBuffer(m_device.get(), bufferSize);
+		drawResourcesFiller.allocateResourcesBuffer(m_device.get(), bufferSize);
 		drawResourcesFiller.allocateMSDFTextures(m_device.get(), 256u, uint32_t2(MSDFSize, MSDFSize));
 
 		{

From 2abd6b970c2394a324f747ab6bc432ba5c9fa7a6 Mon Sep 17 00:00:00 2001
From: Erfan Ahmadi <ahmadierfan99@gmail.com>
Date: Wed, 2 Apr 2025 02:57:47 +0330
Subject: [PATCH 127/529] compiles but probably has runtime errors

---
 62_CAD/DrawResourcesFiller.cpp                | 208 ++++++++++--------
 62_CAD/DrawResourcesFiller.h                  |  40 ++--
 62_CAD/main.cpp                               |   4 +-
 .../shaders/main_pipeline/vertex_shader.hlsl  |  48 ++--
 4 files changed, 168 insertions(+), 132 deletions(-)

diff --git a/62_CAD/DrawResourcesFiller.cpp b/62_CAD/DrawResourcesFiller.cpp
index 1eff63552..80ddc0d57 100644
--- a/62_CAD/DrawResourcesFiller.cpp
+++ b/62_CAD/DrawResourcesFiller.cpp
@@ -15,33 +15,6 @@ void DrawResourcesFiller::setSubmitDrawsFunction(const SubmitFunc& func)
 	submitDraws = func;
 }
 
-//void DrawResourcesFiller::allocateIndexBuffer(ILogicalDevice* logicalDevice, uint32_t maxIndices)
-//{
-//	maxIndexCount = maxIndices;
-//	const size_t indexBufferSize = maxIndices * sizeof(index_buffer_type);
-//	auto indexBuffer = ICPUBuffer::create({ indexBufferSize });
-//
-//	index_buffer_type* indices = reinterpret_cast<index_buffer_type*>(indexBuffer->getPointer());
-//	for (uint32_t i = 0u; i < maxIndices / 6u; ++i)
-//	{
-//		index_buffer_type objIndex = i;
-//		indices[i * 6] = objIndex * 4u + 1u;
-//		indices[i * 6 + 1u] = objIndex * 4u + 0u;
-//		indices[i * 6 + 2u] = objIndex * 4u + 2u;
-//
-//		indices[i * 6 + 3u] = objIndex * 4u + 1u;
-//		indices[i * 6 + 4u] = objIndex * 4u + 2u;
-//		indices[i * 6 + 5u] = objIndex * 4u + 3u;
-//	}
-//
-//	IGPUBuffer::SCreationParams indexBufferCreationParams = {};
-//	indexBufferCreationParams.size = indexBufferSize;
-//	indexBufferCreationParams.usage = IGPUBuffer::EUF_INDEX_BUFFER_BIT | IGPUBuffer::EUF_TRANSFER_DST_BIT;
-//
-//	m_utilities->createFilledDeviceLocalBufferOnDedMem(SIntendedSubmitInfo{.queue=m_copyQueue}, std::move(indexBufferCreationParams), indices).move_into(gpuDrawBuffers.indexBuffer);
-//	gpuDrawBuffers.indexBuffer->setObjectDebugName("indexBuffer");
-//}
-
 void DrawResourcesFiller::allocateResourcesBuffer(ILogicalDevice* logicalDevice, size_t size)
 {
 	size = core::alignUp(size, ResourcesMaxNaturalAlignment);
@@ -146,6 +119,8 @@ void DrawResourcesFiller::drawPolyline(const CPolylineBase& polyline, uint32_t p
 			submitCurrentDrawObjectsAndReset(intendedNextSubmit, polylineMainObjIdx);
 	}
 
+	return; // TODO: Remove
+
 	if (!polyline.getConnectors().empty())
 	{
 		uint32_t currentConnectorPolylineObject = 0u;
@@ -176,19 +151,17 @@ void DrawResourcesFiller::drawTriangleMesh(const CTriangleMesh& mesh, CTriangleM
 
 	{
 		// NOTE[ERFAN]: these push contants will be removed, everything will be accessed by dtmSettings, including where the vertex buffer data resides
-		auto& geometryBytesVector = resourcesCollection.geometryInfo.vector;
-		size_t geometryBufferOffset = core::alignUp(geometryBytesVector.size(), ResourcesMaxNaturalAlignment);
-		geometryBytesVector.resize(geometryBufferOffset + dataToAddByteSize); // this will increase total resources consumption and reduce remainingResourcesSize --> no need to update any size trackers
 
 		// Copy VertexBuffer
-		void* dst = geometryBytesVector.data() + geometryBufferOffset;
+		size_t geometryBufferOffset = resourcesCollection.geometryInfo.increaseSizeAndGetOffset(dataToAddByteSize, alignof(CTriangleMesh::vertex_t));
+		void* dst = resourcesCollection.geometryInfo.data() + geometryBufferOffset;
 		// the actual bda address will be determined only after all copies are finalized, later we will do += `baseBDAAddress + geometryInfo.bufferOffset`
 		drawData.pushConstants.triangleMeshVerticesBaseAddress = geometryBufferOffset;
 		memcpy(dst, mesh.getVertices().data(), vtxBuffByteSize);
 		geometryBufferOffset += vtxBuffByteSize; 
 
 		// Copy IndexBuffer
-		dst = geometryBytesVector.data() + geometryBufferOffset;
+		dst = resourcesCollection.geometryInfo.data() + geometryBufferOffset;
 		drawData.indexBufferOffset = geometryBufferOffset;
 		memcpy(dst, mesh.getIndices().data(), indexBuffByteSize);
 		geometryBufferOffset += indexBuffByteSize;
@@ -227,6 +200,7 @@ void DrawResourcesFiller::drawHatch(
 		const HatchFillPattern fillPattern,
 		SIntendedSubmitInfo& intendedNextSubmit)
 {
+	return; // TODO: Remove
 	if (color.a == 0.0f) // not visible
 		return;
 
@@ -271,6 +245,7 @@ void DrawResourcesFiller::drawFontGlyph(
 		uint32_t mainObjIdx,
 		SIntendedSubmitInfo& intendedNextSubmit)
 {
+#if 0
 	uint32_t textureIdx = InvalidTextureIdx;
 	const MSDFInputInfo msdfInput = MSDFInputInfo(fontFace->getHash(), glyphIdx);
 	textureIdx = getMSDFIndexFromInputInfo(msdfInput, intendedNextSubmit);
@@ -293,11 +268,13 @@ void DrawResourcesFiller::drawFontGlyph(
 		// TODO: Log, probably getGlyphMSDF(face,glyphIdx) returned nullptr ICPUImage ptr
 		_NBL_DEBUG_BREAK_IF(true);
 	}
+#endif
 }
 
 // TODO: FIX
 void DrawResourcesFiller::_test_addImageObject(float64_t2 topLeftPos, float32_t2 size, float32_t rotation, SIntendedSubmitInfo& intendedNextSubmit)
 {
+#if 0
 	auto addImageObject_Internal = [&](const ImageObjectInfo& imageObjectInfo, uint32_t mainObjIdx) -> bool
 		{
 			const uint32_t maxGeometryBufferImageObjects = static_cast<uint32_t>((maxGeometryBufferSize - currentGeometryBufferSize) / sizeof(ImageObjectInfo));
@@ -340,6 +317,7 @@ void DrawResourcesFiller::_test_addImageObject(float64_t2 topLeftPos, float32_t2
 		bool success = addImageObject_Internal(info, mainObjIdx);
 		assert(success); // this should always be true, otherwise it's either bug in code or not enough memory allocated to hold a single image object 
 	}
+#endif
 }
 
 bool DrawResourcesFiller::finalizeAllCopiesToGPU(SIntendedSubmitInfo& intendedNextSubmit)
@@ -403,7 +381,7 @@ uint32_t DrawResourcesFiller::addMainObject_SubmitIfNeeded(uint32_t styleIdx, ui
 	MainObject mainObject = {};
 	mainObject.styleIdx = styleIdx;
 	mainObject.dtmSettingsIdx = dtmSettingsIdx;
-	mainObject.clipProjectionAddress = acquireCurrentClipProjectionAddress(intendedNextSubmit);
+	mainObject.clipProjectionIndex = acquireCurrentClipProjectionIndex(intendedNextSubmit);
 	uint32_t outMainObjectIdx = addMainObject_Internal(mainObject);
 	if (outMainObjectIdx == InvalidMainObjectIdx)
 	{
@@ -428,7 +406,7 @@ uint32_t DrawResourcesFiller::addMainObject_SubmitIfNeeded(uint32_t styleIdx, ui
 void DrawResourcesFiller::pushClipProjectionData(const ClipProjectionData& clipProjectionData)
 {
 	clipProjections.push_back(clipProjectionData);
-	clipProjectionAddresses.push_back(InvalidClipProjectionAddress);
+	clipProjectionIndices.push_back(InvalidClipProjectionIndex);
 }
 
 void DrawResourcesFiller::popClipProjectionData()
@@ -437,7 +415,7 @@ void DrawResourcesFiller::popClipProjectionData()
 		return;
 
 	clipProjections.pop_back();
-	clipProjectionAddresses.pop_back();
+	clipProjectionIndices.pop_back();
 }
 
 bool DrawResourcesFiller::finalizeBufferCopies(SIntendedSubmitInfo& intendedNextSubmit)
@@ -736,15 +714,15 @@ uint32_t DrawResourcesFiller::addDTMSettings_Internal(const DTMSettingsInfo& dtm
 	return resourcesCollection.dtmSettings.vector.size() - 1u;
 }
 
-uint32_t DrawResourcesFiller::acquireCurrentClipProjectionAddress(SIntendedSubmitInfo& intendedNextSubmit)
+uint32_t DrawResourcesFiller::acquireCurrentClipProjectionIndex(SIntendedSubmitInfo& intendedNextSubmit)
 {
-	if (clipProjectionAddresses.empty())
-		return InvalidClipProjectionAddress;
+	if (clipProjectionIndices.empty())
+		return InvalidClipProjectionIndex;
 
-	if (clipProjectionAddresses.back() == InvalidClipProjectionAddress)
-		clipProjectionAddresses.back() = addClipProjectionData_SubmitIfNeeded(clipProjections.back(), intendedNextSubmit);
+	if (clipProjectionIndices.back() == InvalidClipProjectionIndex)
+		clipProjectionIndices.back() = addClipProjectionData_SubmitIfNeeded(clipProjections.back(), intendedNextSubmit);
 	
-	return clipProjectionAddresses.back();
+	return clipProjectionIndices.back();
 }
 
 uint32_t DrawResourcesFiller::addClipProjectionData_SubmitIfNeeded(const ClipProjectionData& clipProjectionData, SIntendedSubmitInfo& intendedNextSubmit)
@@ -779,19 +757,32 @@ void DrawResourcesFiller::addPolylineObjects_Internal(const CPolylineBase& polyl
 		assert(false); // we don't handle other object types
 }
 
+// TODO: FIX
 void DrawResourcesFiller::addPolylineConnectors_Internal(const CPolylineBase& polyline, uint32_t& currentPolylineConnectorObj, uint32_t mainObjIdx)
 {
-	const uint32_t maxGeometryBufferConnectors = static_cast<uint32_t>((maxGeometryBufferSize - currentGeometryBufferSize) / sizeof(PolylineConnector));
-
-	uint32_t uploadableObjects = (maxIndexCount / 6u) - currentDrawObjectCount;
-	uploadableObjects = core::min(uploadableObjects, maxGeometryBufferConnectors);
-	uploadableObjects = core::min(uploadableObjects, maxDrawObjects - currentDrawObjectCount);
+#if 0
+	const size_t remainingResourcesSize = calculateRemainingResourcesSize();
 
+	const uint32_t uploadableObjects = (remainingResourcesSize) / (sizeof(PolylineConnector) + sizeof(DrawObject) + sizeof(uint32_t) * 6u);
+	// TODO[ERFAN]: later take into account, our limit of max index buffer and vettex buffer size or constrainst other than mem
+	
 	const uint32_t connectorCount = static_cast<uint32_t>(polyline.getConnectors().size());
 	const uint32_t remainingObjects = connectorCount - currentPolylineConnectorObj;
-
 	const uint32_t objectsToUpload = core::min(uploadableObjects, remainingObjects);
 
+	if (objectsToUpload <= 0u)
+		return;
+
+
+
+
+
+	// TODO: 
+
+
+
+
+
 	// Add DrawObjs
 	DrawObject drawObj = {};
 	drawObj.mainObjIndex = mainObjIdx;
@@ -816,6 +807,7 @@ void DrawResourcesFiller::addPolylineConnectors_Internal(const CPolylineBase& po
 	}
 
 	currentPolylineConnectorObj += objectsToUpload;
+#endif
 }
 
 void DrawResourcesFiller::addLines_Internal(const CPolylineBase& polyline, const CPolylineBase::SectionInfo& section, uint32_t& currentObjectInSection, uint32_t mainObjIdx)
@@ -825,94 +817,120 @@ void DrawResourcesFiller::addLines_Internal(const CPolylineBase& polyline, const
 
 
 	const size_t remainingResourcesSize = calculateRemainingResourcesSize();
-	// how many lines? --> memRequired = sizeof(LinePointInfo) + sizeof(LinePointInfo)*lineCount + sizeof(DrawObject)*lineCount + sizeof(uint32_t) * 6u * lineCount
+	if (remainingResourcesSize < sizeof(LinePointInfo))
+		return;
 
-	const uint32_t maxGeometryBufferPoints = static_cast<uint32_t>((maxGeometryBufferSize - currentGeometryBufferSize) / sizeof(LinePointInfo));
-	const uint32_t maxGeometryBufferLines = (maxGeometryBufferPoints <= 1u) ? 0u : maxGeometryBufferPoints - 1u;
-
-	uint32_t uploadableObjects = (maxIndexCount / 6u) - currentDrawObjectCount;
-	uploadableObjects = core::min(uploadableObjects, maxGeometryBufferLines);
-	uploadableObjects = core::min(uploadableObjects, maxDrawObjects - currentDrawObjectCount);
+	// how many lines fit into mem? --> memConsumption = sizeof(LinePointInfo) + sizeof(LinePointInfo)*lineCount + sizeof(DrawObject)*lineCount + sizeof(uint32_t) * 6u * lineCount
+	const uint32_t uploadableObjects = (remainingResourcesSize - sizeof(LinePointInfo)) / (sizeof(LinePointInfo) + sizeof(DrawObject) + sizeof(uint32_t) * 6u);
+	// TODO[ERFAN]: later take into account, our limit of max index buffer and vettex buffer size or constrainst other than mem
 
 	const uint32_t lineCount = section.count;
 	const uint32_t remainingObjects = lineCount - currentObjectInSection;
-	uint32_t objectsToUpload = core::min(uploadableObjects, remainingObjects);
+	const uint32_t objectsToUpload = core::min(uploadableObjects, remainingObjects);
+
+	if (objectsToUpload <= 0u)
+		return;
+
+	// Add Geometry
+	const auto pointsByteSize = sizeof(LinePointInfo) * (objectsToUpload + 1u);
+
+	
+	size_t geometryBufferOffset = resourcesCollection.geometryInfo.increaseSizeAndGetOffset(pointsByteSize, alignof(LinePointInfo));
+	void* dst = resourcesCollection.geometryInfo.data() + geometryBufferOffset;
+	const LinePointInfo& linePoint = polyline.getLinePointAt(section.index + currentObjectInSection);
+	memcpy(dst, &linePoint, pointsByteSize);
+
+	// Push Indices, remove later when compute fills this
+	uint32_t* indexBufferToBeFilled = resourcesCollection.indexBuffer.increaseCountAndGetPtr(6u * objectsToUpload);
+	for (uint32_t i = 0u; i < objectsToUpload; ++i)
+	{
+		indexBufferToBeFilled[i*6]		= i*4u + 1u;
+		indexBufferToBeFilled[i*6 + 1u]	= i*4u + 0u;
+		indexBufferToBeFilled[i*6 + 2u]	= i*4u + 2u;
+		indexBufferToBeFilled[i*6 + 3u]	= i*4u + 1u;
+		indexBufferToBeFilled[i*6 + 4u]	= i*4u + 2u;
+		indexBufferToBeFilled[i*6 + 5u]	= i*4u + 3u;
+	}
 
 	// Add DrawObjs
+	DrawObject* drawObjectsToBeFilled = resourcesCollection.drawObjects.increaseCountAndGetPtr(objectsToUpload);
 	DrawObject drawObj = {};
 	drawObj.mainObjIndex = mainObjIdx;
 	drawObj.type_subsectionIdx = uint32_t(static_cast<uint16_t>(ObjectType::LINE) | 0 << 16);
-	drawObj.geometryAddress = drawResourcesBDA + currentGeometryBufferSize;
+	drawObj.geometryAddress = geometryBufferOffset;
 	for (uint32_t i = 0u; i < objectsToUpload; ++i)
 	{
-		void* dst = reinterpret_cast<DrawObject*>(cpuDrawBuffers.drawObjectsBuffer->getPointer()) + currentDrawObjectCount;
-		memcpy(dst, &drawObj, sizeof(DrawObject));
-		currentDrawObjectCount += 1u;
+		drawObjectsToBeFilled[i] = drawObj;
 		drawObj.geometryAddress += sizeof(LinePointInfo);
-	}
-
-	// TODO: Add index buffer, 
-
-	// Add Geometry
-	if (objectsToUpload > 0u)
-	{
-		const auto pointsByteSize = sizeof(LinePointInfo) * (objectsToUpload + 1u);
-		void* dst = reinterpret_cast<char*>(cpuDrawBuffers.geometryBuffer->getPointer()) + currentGeometryBufferSize;
-		auto& linePoint = polyline.getLinePointAt(section.index + currentObjectInSection);
-		memcpy(dst, &linePoint, pointsByteSize);
-		currentGeometryBufferSize += pointsByteSize;
-	}
+	} 
 
 	currentObjectInSection += objectsToUpload;
 }
 
 void DrawResourcesFiller::addQuadBeziers_Internal(const CPolylineBase& polyline, const CPolylineBase::SectionInfo& section, uint32_t& currentObjectInSection, uint32_t mainObjIdx)
 {
-	constexpr uint32_t CagesPerQuadBezier = getCageCountPerPolylineObject(ObjectType::QUAD_BEZIER);
+	constexpr uint32_t CagesPerQuadBezier = 3u; // TODO: Break into 3 beziers in compute shader.
+
 	assert(section.type == ObjectType::QUAD_BEZIER);
 
-	const uint32_t maxGeometryBufferBeziers = static_cast<uint32_t>((maxGeometryBufferSize - currentGeometryBufferSize) / sizeof(QuadraticBezierInfo));
+	const size_t remainingResourcesSize = calculateRemainingResourcesSize();
+	// how many quad bezier objects fit into mem?
+	// memConsumption = quadBezCount * (sizeof(QuadraticBezierInfo) + 3*(sizeof(DrawObject)+6u*sizeof(uint32_t))
+	const uint32_t uploadableObjects = (remainingResourcesSize) / (sizeof(QuadraticBezierInfo) + (sizeof(DrawObject) + 6u * sizeof(uint32_t)) * CagesPerQuadBezier);
+	// TODO[ERFAN]: later take into account, our limit of max index buffer and vettex buffer size or constrainst other than mem
 	
-	uint32_t uploadableObjects = (maxIndexCount / 6u) - currentDrawObjectCount;
-	uploadableObjects = core::min(uploadableObjects, maxGeometryBufferBeziers);
-	uploadableObjects = core::min(uploadableObjects, maxDrawObjects - currentDrawObjectCount);
-	uploadableObjects /= CagesPerQuadBezier;
-
 	const uint32_t beziersCount = section.count;
 	const uint32_t remainingObjects = beziersCount - currentObjectInSection;
-	uint32_t objectsToUpload = core::min(uploadableObjects, remainingObjects);
+	const uint32_t objectsToUpload = core::min(uploadableObjects, remainingObjects);
+	const uint32_t cagesCount = objectsToUpload * CagesPerQuadBezier;
 
+	if (objectsToUpload <= 0u)
+		return;
+	
+	// Add Geometry
+	const auto beziersByteSize = sizeof(QuadraticBezierInfo) * (objectsToUpload);
+	size_t geometryBufferOffset = resourcesCollection.geometryInfo.increaseSizeAndGetOffset(beziersByteSize, alignof(QuadraticBezierInfo));
+	void* dst = resourcesCollection.geometryInfo.data() + geometryBufferOffset;
+	const QuadraticBezierInfo& quadBezier = polyline.getQuadBezierInfoAt(section.index + currentObjectInSection);
+	memcpy(dst, &quadBezier, beziersByteSize);
+
+
+
+	// Push Indices, remove later when compute fills this
+	uint32_t* indexBufferToBeFilled = resourcesCollection.indexBuffer.increaseCountAndGetPtr(6u*cagesCount);
+	for (uint32_t i = 0u; i < cagesCount; ++i)
+	{
+		indexBufferToBeFilled[i*6]		= i*4u + 1u;
+		indexBufferToBeFilled[i*6 + 1u]	= i*4u + 0u;
+		indexBufferToBeFilled[i*6 + 2u]	= i*4u + 2u;
+		indexBufferToBeFilled[i*6 + 3u]	= i*4u + 1u;
+		indexBufferToBeFilled[i*6 + 4u]	= i*4u + 2u;
+		indexBufferToBeFilled[i*6 + 5u]	= i*4u + 3u;
+	}
+	
 	// Add DrawObjs
+	DrawObject* drawObjectsToBeFilled = resourcesCollection.drawObjects.increaseCountAndGetPtr(cagesCount);
 	DrawObject drawObj = {};
 	drawObj.mainObjIndex = mainObjIdx;
-	drawObj.geometryAddress = drawResourcesBDA + currentGeometryBufferSize;
+	drawObj.geometryAddress = geometryBufferOffset;
 	for (uint32_t i = 0u; i < objectsToUpload; ++i)
 	{
 		for (uint16_t subObject = 0; subObject < CagesPerQuadBezier; subObject++)
 		{
 			drawObj.type_subsectionIdx = uint32_t(static_cast<uint16_t>(ObjectType::QUAD_BEZIER) | (subObject << 16));
-			void* dst = reinterpret_cast<DrawObject*>(cpuDrawBuffers.drawObjectsBuffer->getPointer()) + currentDrawObjectCount;
-			memcpy(dst, &drawObj, sizeof(DrawObject));
-			currentDrawObjectCount += 1u;
+			drawObjectsToBeFilled[i * CagesPerQuadBezier + subObject] = drawObj;
 		}
 		drawObj.geometryAddress += sizeof(QuadraticBezierInfo);
 	}
 
-	// Add Geometry
-	if (objectsToUpload > 0u)
-	{
-		const auto beziersByteSize = sizeof(QuadraticBezierInfo) * (objectsToUpload);
-		void* dst = reinterpret_cast<char*>(cpuDrawBuffers.geometryBuffer->getPointer()) + currentGeometryBufferSize;
-		auto& quadBezier = polyline.getQuadBezierInfoAt(section.index + currentObjectInSection);
-		memcpy(dst, &quadBezier, beziersByteSize);
-		currentGeometryBufferSize += beziersByteSize;
-	}
 
 	currentObjectInSection += objectsToUpload;
 }
 
+// TODO: FIX
 void DrawResourcesFiller::addHatch_Internal(const Hatch& hatch, uint32_t& currentObjectInSection, uint32_t mainObjIndex)
 {
+#if 0
 	const uint32_t maxGeometryBufferHatchBoxes = static_cast<uint32_t>((maxGeometryBufferSize - currentGeometryBufferSize) / sizeof(Hatch::CurveHatchBox));
 	
 	uint32_t uploadableObjects = (maxIndexCount / 6u) - currentDrawObjectCount;
@@ -946,10 +964,13 @@ void DrawResourcesFiller::addHatch_Internal(const Hatch& hatch, uint32_t& curren
 	// Add Indices
 	currentDrawObjectCount += uploadableObjects;
 	currentObjectInSection += uploadableObjects;
+#endif
 }
 
+// TODO: FIX
 bool DrawResourcesFiller::addFontGlyph_Internal(const GlyphInfo& glyphInfo, uint32_t mainObjIdx)
 {
+#if 0
 	const uint32_t maxGeometryBufferFontGlyphs = static_cast<uint32_t>((maxGeometryBufferSize - currentGeometryBufferSize) / sizeof(GlyphInfo));
 	
 	uint32_t uploadableObjects = (maxIndexCount / 6u) - currentDrawObjectCount;
@@ -977,6 +998,7 @@ bool DrawResourcesFiller::addFontGlyph_Internal(const GlyphInfo& glyphInfo, uint
 	{
 		return false;
 	}
+#endif
 }
 
 void DrawResourcesFiller::setGlyphMSDFTextureFunction(const GetGlyphMSDFTextureFunc& func)
diff --git a/62_CAD/DrawResourcesFiller.h b/62_CAD/DrawResourcesFiller.h
index 19579449a..439c7cfaf 100644
--- a/62_CAD/DrawResourcesFiller.h
+++ b/62_CAD/DrawResourcesFiller.h
@@ -57,6 +57,29 @@ struct DrawResourcesFiller
 		core::vector<T> vector;
 		size_t getCount() const { return vector.size(); }
 		size_t getStorageSize() const { return vector.size() * sizeof(T); }
+		
+		/// @return pointer to start of the data to be filled, up to additionalCount
+		T* increaseCountAndGetPtr(size_t additionalCount) 
+		{
+			size_t offset = vector.size();
+			vector.resize(offset + additionalCount);
+			return &vector[offset];
+		}
+
+		/// @brief increases size of general-purpose resources that hold bytes
+		/// @param additionalSize
+		/// @param alignment: Alignment of the pointer returned to be filled, should be PoT and <= ResourcesMaxNaturalAlignment, only use this if storing raw bytes in vector
+		/// @return pointer to start of the data to be filled, up to additional size
+		// TODO: make sure t is 1 byte with templates.
+		size_t increaseSizeAndGetOffset(size_t additionalSize, size_t alignment) 
+		{
+			assert(core::isPoT(alignment) && alignment <= ResourcesMaxNaturalAlignment);
+			size_t offset = core::alignUp(vector.size(), alignment);
+			vector.resize(offset + additionalSize);
+			return offset;
+		}
+
+		T* data() { return vector.data(); }
 	};
 
 	/// @brief struct to hold all resources
@@ -99,7 +122,7 @@ struct DrawResourcesFiller
 	void setSubmitDrawsFunction(const SubmitFunc& func);
 	
 	/// @brief Get minimum required size for resources buffer (containing objects and geometry info and their settings)
-	consteval size_t getMinimumRequiredResourcesBufferSize() const
+	static constexpr size_t getMinimumRequiredResourcesBufferSize()
 	{
 		// for auto-submission to work correctly, memory needs to serve at least 2 linestyle, 1 dtm settings, 1 clip proj, 1 main obj, 1 draw obj and 512 bytes of additional mem for geometries and index buffer
 		// this is the ABSOLUTE MINIMUM (if this value is used rendering will probably be as slow as CPU drawing :D)
@@ -178,7 +201,7 @@ struct DrawResourcesFiller
 	}
 
 	/// @brief collection of all the resources that will eventually be reserved or copied to in the resourcesGPUBuffer, will be accessed via individual BDA pointers in shaders
-	const ResourcesCollection& getResourcesCollection() const { return &resourcesCollection; }
+	const ResourcesCollection& getResourcesCollection() const { return resourcesCollection; }
 
 	/// @brief buffer containing all non-texture type resources
 	nbl::core::smart_refctd_ptr<IGPUBuffer> getResourcesGPUBuffer() const { return resourcesGPUBuffer; }
@@ -244,19 +267,10 @@ struct DrawResourcesFiller
 
 	// Gets the current clip projection data (the top of stack) gpu addreess inside the geometryBuffer
 	// If it's been invalidated then it will request to upload again with a possible auto-submit on low geometry buffer memory.
-	uint32_t acquireCurrentClipProjectionAddress(SIntendedSubmitInfo& intendedNextSubmit);
+	uint32_t acquireCurrentClipProjectionIndex(SIntendedSubmitInfo& intendedNextSubmit);
 	
 	uint32_t addClipProjectionData_SubmitIfNeeded(const ClipProjectionData& clipProjectionData, SIntendedSubmitInfo& intendedNextSubmit);
 
-	static constexpr uint32_t getCageCountPerPolylineObject(ObjectType type)
-	{
-		if (type == ObjectType::LINE)
-			return 1u;
-		else if (type == ObjectType::QUAD_BEZIER)
-			return 3u;
-		return 0u;
-	};
-
 	void addPolylineObjects_Internal(const CPolylineBase& polyline, const CPolylineBase::SectionInfo& section, uint32_t& currentObjectInSection, uint32_t mainObjIdx);
 
 	void addPolylineConnectors_Internal(const CPolylineBase& polyline, uint32_t& currentPolylineConnectorObj, uint32_t mainObjIdx);
@@ -397,7 +411,7 @@ struct DrawResourcesFiller
 	IQueue* m_copyQueue;
 
 	std::deque<ClipProjectionData> clipProjections; // stack of clip projectios stored so we can resubmit them if geometry buffer got reset.
-	std::deque<uint64_t> clipProjectionAddresses; // stack of clip projection gpu addresses in geometry buffer. to keep track of them in push/pops
+	std::deque<uint32_t> clipProjectionIndices; // stack of clip projection gpu addresses in geometry buffer. to keep track of them in push/pops
 
 	// MSDF
 	GetGlyphMSDFTextureFunc getGlyphMSDF;
diff --git a/62_CAD/main.cpp b/62_CAD/main.cpp
index 6784e6b70..1f83244d6 100644
--- a/62_CAD/main.cpp
+++ b/62_CAD/main.cpp
@@ -1193,10 +1193,10 @@ class ComputerAidedDesign final : public examples::SimpleWindowedApplication, pu
 		globalData.pointers = {
 			.lineStyles				= baseAddress + resources.lineStyles.bufferOffset,
 			.dtmSettings			= baseAddress + resources.dtmSettings.bufferOffset,
-			.customClipProjections	= baseAddress + resources.customClipProjections.bufferOffset,
+			.customClipProjections	= baseAddress + resources.clipProjections.bufferOffset,
 			.mainObjects			= baseAddress + resources.mainObjects.bufferOffset,
 			.drawObjects			= baseAddress + resources.drawObjects.bufferOffset,
-			.geometryBuffer			= baseAddress + resources.geometryBuffer.bufferOffset,
+			.geometryBuffer			= baseAddress + resources.geometryInfo.bufferOffset,
 		};
 		globalData.antiAliasingFactor = 1.0;// +abs(cos(m_timeElapsed * 0.0008)) * 20.0f;
 		globalData.resolution = uint32_t2{ m_window->getWidth(), m_window->getHeight() };
diff --git a/62_CAD/shaders/main_pipeline/vertex_shader.hlsl b/62_CAD/shaders/main_pipeline/vertex_shader.hlsl
index 5abe693ec..5dbe386fd 100644
--- a/62_CAD/shaders/main_pipeline/vertex_shader.hlsl
+++ b/62_CAD/shaders/main_pipeline/vertex_shader.hlsl
@@ -173,11 +173,11 @@ PSInput main(uint vertexID : SV_VertexID)
         if (objType == ObjectType::LINE)
         {
             pfloat64_t2 points[2u];
-            points[0u] = vk::RawBufferLoad<pfloat64_t2>(drawObj.geometryAddress, 8u);
-            points[1u] = vk::RawBufferLoad<pfloat64_t2>(drawObj.geometryAddress + sizeof(LinePointInfo), 8u);
+            points[0u] = vk::RawBufferLoad<pfloat64_t2>(globals.pointers.geometryBuffer + drawObj.geometryAddress, 8u);
+            points[1u] = vk::RawBufferLoad<pfloat64_t2>(globals.pointers.geometryBuffer + drawObj.geometryAddress + sizeof(LinePointInfo), 8u);
 
-            const float phaseShift = vk::RawBufferLoad<float>(drawObj.geometryAddress + sizeof(pfloat64_t2), 8u);
-            const float patternStretch = vk::RawBufferLoad<float>(drawObj.geometryAddress + sizeof(pfloat64_t2) + sizeof(float), 8u);
+            const float phaseShift = vk::RawBufferLoad<float>(globals.pointers.geometryBuffer + drawObj.geometryAddress + sizeof(pfloat64_t2), 8u);
+            const float patternStretch = vk::RawBufferLoad<float>(globals.pointers.geometryBuffer + drawObj.geometryAddress + sizeof(pfloat64_t2) + sizeof(float), 8u);
             outV.setCurrentPhaseShift(phaseShift);
             outV.setPatternStretch(patternStretch);
 
@@ -213,12 +213,12 @@ PSInput main(uint vertexID : SV_VertexID)
         else if (objType == ObjectType::QUAD_BEZIER)
         {
             pfloat64_t2 points[3u];
-            points[0u] = vk::RawBufferLoad<pfloat64_t2>(drawObj.geometryAddress, 8u);
-            points[1u] = vk::RawBufferLoad<pfloat64_t2>(drawObj.geometryAddress + sizeof(pfloat64_t2), 8u);
-            points[2u] = vk::RawBufferLoad<pfloat64_t2>(drawObj.geometryAddress + sizeof(pfloat64_t2) * 2u, 8u);
+            points[0u] = vk::RawBufferLoad<pfloat64_t2>(globals.pointers.geometryBuffer + drawObj.geometryAddress, 8u);
+            points[1u] = vk::RawBufferLoad<pfloat64_t2>(globals.pointers.geometryBuffer + drawObj.geometryAddress + sizeof(pfloat64_t2), 8u);
+            points[2u] = vk::RawBufferLoad<pfloat64_t2>(globals.pointers.geometryBuffer + drawObj.geometryAddress + sizeof(pfloat64_t2) * 2u, 8u);
 
-            const float phaseShift = vk::RawBufferLoad<float>(drawObj.geometryAddress + sizeof(pfloat64_t2) * 3u, 8u);
-            const float patternStretch = vk::RawBufferLoad<float>(drawObj.geometryAddress + sizeof(pfloat64_t2) * 3u + sizeof(float), 8u);
+            const float phaseShift = vk::RawBufferLoad<float>(globals.pointers.geometryBuffer + drawObj.geometryAddress + sizeof(pfloat64_t2) * 3u, 8u);
+            const float patternStretch = vk::RawBufferLoad<float>(globals.pointers.geometryBuffer + drawObj.geometryAddress + sizeof(pfloat64_t2) * 3u + sizeof(float), 8u);
             outV.setCurrentPhaseShift(phaseShift);
             outV.setPatternStretch(patternStretch);
 
@@ -387,9 +387,9 @@ PSInput main(uint vertexID : SV_VertexID)
 
             if (lineStyle.isRoadStyleFlag)
             {
-                const pfloat64_t2 circleCenter = vk::RawBufferLoad<pfloat64_t2>(drawObj.geometryAddress, 8u);
-                const float2 v = vk::RawBufferLoad<float2>(drawObj.geometryAddress + sizeof(pfloat64_t2), 8u);
-                const float cosHalfAngleBetweenNormals = vk::RawBufferLoad<float>(drawObj.geometryAddress + sizeof(pfloat64_t2) + sizeof(float2), 8u);
+                const pfloat64_t2 circleCenter = vk::RawBufferLoad<pfloat64_t2>(globals.pointers.geometryBuffer + drawObj.geometryAddress, 8u);
+                const float2 v = vk::RawBufferLoad<float2>(globals.pointers.geometryBuffer + drawObj.geometryAddress + sizeof(pfloat64_t2), 8u);
+                const float cosHalfAngleBetweenNormals = vk::RawBufferLoad<float>(globals.pointers.geometryBuffer + drawObj.geometryAddress + sizeof(pfloat64_t2) + sizeof(float2), 8u);
 
                 const float2 circleCenterScreenSpace = transformPointScreenSpace(clipProjectionData.projectionToNDC, globals.resolution, circleCenter);
                 outV.setPolylineConnectorCircleCenter(circleCenterScreenSpace);
@@ -449,13 +449,13 @@ PSInput main(uint vertexID : SV_VertexID)
     else if (objType == ObjectType::CURVE_BOX)
     {
         CurveBox curveBox;
-        curveBox.aabbMin = vk::RawBufferLoad<pfloat64_t2>(drawObj.geometryAddress, 8u);
-        curveBox.aabbMax = vk::RawBufferLoad<pfloat64_t2>(drawObj.geometryAddress + sizeof(pfloat64_t2), 8u);
+        curveBox.aabbMin = vk::RawBufferLoad<pfloat64_t2>(globals.pointers.geometryBuffer + drawObj.geometryAddress, 8u);
+        curveBox.aabbMax = vk::RawBufferLoad<pfloat64_t2>(globals.pointers.geometryBuffer + drawObj.geometryAddress + sizeof(pfloat64_t2), 8u);
 
         for (uint32_t i = 0; i < 3; i ++)
         {
-            curveBox.curveMin[i] = vk::RawBufferLoad<float32_t2>(drawObj.geometryAddress + sizeof(pfloat64_t2) * 2 + sizeof(float32_t2) * i, 4u);
-            curveBox.curveMax[i] = vk::RawBufferLoad<float32_t2>(drawObj.geometryAddress + sizeof(pfloat64_t2) * 2 + sizeof(float32_t2) * (3 + i), 4u);
+            curveBox.curveMin[i] = vk::RawBufferLoad<float32_t2>(globals.pointers.geometryBuffer + drawObj.geometryAddress + sizeof(pfloat64_t2) * 2 + sizeof(float32_t2) * i, 4u);
+            curveBox.curveMax[i] = vk::RawBufferLoad<float32_t2>(globals.pointers.geometryBuffer + drawObj.geometryAddress + sizeof(pfloat64_t2) * 2 + sizeof(float32_t2) * (3 + i), 4u);
         }
 
         pfloat64_t2 aabbMaxXMinY;
@@ -540,10 +540,10 @@ PSInput main(uint vertexID : SV_VertexID)
         const float italicTiltSlope = lineStyle.screenSpaceLineWidth; // aliased text style member with line style
         
         GlyphInfo glyphInfo;
-        glyphInfo.topLeft = vk::RawBufferLoad<pfloat64_t2>(drawObj.geometryAddress, 8u);
-        glyphInfo.dirU = vk::RawBufferLoad<float32_t2>(drawObj.geometryAddress + sizeof(pfloat64_t2), 4u);
-        glyphInfo.aspectRatio = vk::RawBufferLoad<float32_t>(drawObj.geometryAddress + sizeof(pfloat64_t2) + sizeof(float2), 4u);
-        glyphInfo.minUV_textureID_packed = vk::RawBufferLoad<uint32_t>(drawObj.geometryAddress + sizeof(pfloat64_t2) + sizeof(float2) + sizeof(float), 4u);
+        glyphInfo.topLeft = vk::RawBufferLoad<pfloat64_t2>(globals.pointers.geometryBuffer + drawObj.geometryAddress, 8u);
+        glyphInfo.dirU = vk::RawBufferLoad<float32_t2>(globals.pointers.geometryBuffer + drawObj.geometryAddress + sizeof(pfloat64_t2), 4u);
+        glyphInfo.aspectRatio = vk::RawBufferLoad<float32_t>(globals.pointers.geometryBuffer + drawObj.geometryAddress + sizeof(pfloat64_t2) + sizeof(float2), 4u);
+        glyphInfo.minUV_textureID_packed = vk::RawBufferLoad<uint32_t>(globals.pointers.geometryBuffer + drawObj.geometryAddress + sizeof(pfloat64_t2) + sizeof(float2) + sizeof(float), 4u);
 
         float32_t2 minUV = glyphInfo.getMinUV();
         uint16_t textureID = glyphInfo.getTextureID();
@@ -591,10 +591,10 @@ PSInput main(uint vertexID : SV_VertexID)
     }
     else if (objType == ObjectType::IMAGE)
     {
-        pfloat64_t2 topLeft = vk::RawBufferLoad<pfloat64_t2>(drawObj.geometryAddress, 8u);
-        float32_t2 dirU = vk::RawBufferLoad<float32_t2>(drawObj.geometryAddress + sizeof(pfloat64_t2), 4u);
-        float32_t aspectRatio = vk::RawBufferLoad<float32_t>(drawObj.geometryAddress + sizeof(pfloat64_t2) + sizeof(float2), 4u);
-        uint32_t textureID = vk::RawBufferLoad<uint32_t>(drawObj.geometryAddress + sizeof(pfloat64_t2) + sizeof(float2) + sizeof(float), 4u);
+        pfloat64_t2 topLeft = vk::RawBufferLoad<pfloat64_t2>(globals.pointers.geometryBuffer + drawObj.geometryAddress, 8u);
+        float32_t2 dirU = vk::RawBufferLoad<float32_t2>(globals.pointers.geometryBuffer + drawObj.geometryAddress + sizeof(pfloat64_t2), 4u);
+        float32_t aspectRatio = vk::RawBufferLoad<float32_t>(globals.pointers.geometryBuffer + drawObj.geometryAddress + sizeof(pfloat64_t2) + sizeof(float2), 4u);
+        uint32_t textureID = vk::RawBufferLoad<uint32_t>(globals.pointers.geometryBuffer + drawObj.geometryAddress + sizeof(pfloat64_t2) + sizeof(float2) + sizeof(float), 4u);
 
         const float32_t2 dirV = float32_t2(dirU.y, -dirU.x) * aspectRatio;
         const float2 ndcTopLeft = _static_cast<float2>(transformPointNdc(clipProjectionData.projectionToNDC, topLeft));

From ab8f303216552dec39ac1e01aa3199043f3a82d7 Mon Sep 17 00:00:00 2001
From: Erfan Ahmadi <ahmadierfan99@gmail.com>
Date: Wed, 2 Apr 2025 02:59:42 +0330
Subject: [PATCH 128/529] [62.CAD] compile error fix

---
 62_CAD/DrawResourcesFiller.h | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/62_CAD/DrawResourcesFiller.h b/62_CAD/DrawResourcesFiller.h
index 439c7cfaf..85d88f2eb 100644
--- a/62_CAD/DrawResourcesFiller.h
+++ b/62_CAD/DrawResourcesFiller.h
@@ -38,7 +38,7 @@ struct DrawResourcesFiller
 		size_t bufferOffset = InvalidBufferOffset; // set when copy to gpu buffer is issued
 		virtual size_t getCount() const = 0;
 		virtual size_t getStorageSize() const = 0;
-		virtual size_t getAlignedStorageSize() const { core::alignUp(getStorageSize(), ResourcesMaxNaturalAlignment); }
+		virtual size_t getAlignedStorageSize() const { return core::alignUp(getStorageSize(), ResourcesMaxNaturalAlignment); }
 	};
 
 	/// @brief ResourceBase reserved for compute shader stages input/output

From 8f2ae9c51c2e47dc152d4081ea0a1ec495940fd7 Mon Sep 17 00:00:00 2001
From: Erfan Ahmadi <ahmadierfan99@gmail.com>
Date: Wed, 2 Apr 2025 03:01:58 +0330
Subject: [PATCH 129/529] small shader fixes

---
 62_CAD/shaders/globals.hlsl | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/62_CAD/shaders/globals.hlsl b/62_CAD/shaders/globals.hlsl
index f9c89d45c..93578b7b6 100644
--- a/62_CAD/shaders/globals.hlsl
+++ b/62_CAD/shaders/globals.hlsl
@@ -436,7 +436,7 @@ LineStyle loadLineStyle(const uint32_t index)
 }
 DTMSettings loadDTMSettings(const uint32_t index)
 {
-    return vk::RawBufferLoad<DTMSetting>(globals.pointers.dtmSettings + index * sizeof(DTMSetting), 8u);
+    return vk::RawBufferLoad<DTMSettings>(globals.pointers.dtmSettings + index * sizeof(DTMSettings), 8u);
 }
 ClipProjectionData loadCustomClipProjection(const uint32_t index)
 {
@@ -444,11 +444,11 @@ ClipProjectionData loadCustomClipProjection(const uint32_t index)
 }
 MainObject loadMainObject(const uint32_t index)
 {
-    return vk::RawBufferLoad<MainObject>(globals.pointers.mainObjs + index * sizeof(MainObject), 8u);
+    return vk::RawBufferLoad<MainObject>(globals.pointers.mainObjects + index * sizeof(MainObject), 8u);
 }
 DrawObject loadDrawObject(const uint32_t index)
 {
-    return vk::RawBufferLoad<DrawObject>(globals.pointers.drawObjs + index * sizeof(DrawObject), 8u);
+    return vk::RawBufferLoad<DrawObject>(globals.pointers.drawObjects + index * sizeof(DrawObject), 8u);
 }
 #endif
 

From 97b1693c23a656efa853f0bc055125cc41a913dd Mon Sep 17 00:00:00 2001
From: Erfan Ahmadi <ahmadierfan99@gmail.com>
Date: Wed, 2 Apr 2025 03:06:16 +0330
Subject: [PATCH 130/529] more fixes

---
 62_CAD/main.cpp                       | 32 ++-------------------------
 62_CAD/shaders/geotexture/common.hlsl |  2 +-
 2 files changed, 3 insertions(+), 31 deletions(-)

diff --git a/62_CAD/main.cpp b/62_CAD/main.cpp
index 1f83244d6..ed3bdf88c 100644
--- a/62_CAD/main.cpp
+++ b/62_CAD/main.cpp
@@ -674,48 +674,20 @@ class ComputerAidedDesign final : public examples::SimpleWindowedApplication, pu
 				},
 				{
 					.binding = 1u,
-					.type = asset::IDescriptor::E_TYPE::ET_STORAGE_BUFFER,
-					.createFlags = IGPUDescriptorSetLayout::SBinding::E_CREATE_FLAGS::ECF_NONE,
-					.stageFlags = asset::IShader::E_SHADER_STAGE::ESS_VERTEX | asset::IShader::E_SHADER_STAGE::ESS_FRAGMENT,
-					.count = 1u,
-				},
-				{
-					.binding = 2u,
-					.type = asset::IDescriptor::E_TYPE::ET_STORAGE_BUFFER,
-					.createFlags = IGPUDescriptorSetLayout::SBinding::E_CREATE_FLAGS::ECF_NONE,
-					.stageFlags = asset::IShader::E_SHADER_STAGE::ESS_VERTEX | asset::IShader::E_SHADER_STAGE::ESS_FRAGMENT,
-					.count = 1u,
-				},
-				{
-					.binding = 3u,
-					.type = asset::IDescriptor::E_TYPE::ET_STORAGE_BUFFER,
-					.createFlags = IGPUDescriptorSetLayout::SBinding::E_CREATE_FLAGS::ECF_NONE,
-					.stageFlags = asset::IShader::E_SHADER_STAGE::ESS_VERTEX | asset::IShader::E_SHADER_STAGE::ESS_FRAGMENT,
-					.count = 1u,
-				},
-				{
-					.binding = 4u,
-					.type = asset::IDescriptor::E_TYPE::ET_STORAGE_BUFFER,
-					.createFlags = IGPUDescriptorSetLayout::SBinding::E_CREATE_FLAGS::ECF_NONE,
-					.stageFlags = asset::IShader::E_SHADER_STAGE::ESS_VERTEX | asset::IShader::E_SHADER_STAGE::ESS_FRAGMENT,
-					.count = 1u,
-				},
-				{
-					.binding = 5u,
 					.type = asset::IDescriptor::E_TYPE::ET_COMBINED_IMAGE_SAMPLER,
 					.createFlags = IGPUDescriptorSetLayout::SBinding::E_CREATE_FLAGS::ECF_NONE,
 					.stageFlags = asset::IShader::E_SHADER_STAGE::ESS_FRAGMENT,
 					.count = 1u,
 				},
 				{
-					.binding = 6u,
+					.binding = 2u,
 					.type = asset::IDescriptor::E_TYPE::ET_SAMPLER,
 					.createFlags = IGPUDescriptorSetLayout::SBinding::E_CREATE_FLAGS::ECF_NONE,
 					.stageFlags = asset::IShader::E_SHADER_STAGE::ESS_FRAGMENT,
 					.count = 1u,
 				},
 				{
-					.binding = 7u,
+					.binding = 3u,
 					.type = asset::IDescriptor::E_TYPE::ET_SAMPLED_IMAGE,
 					.createFlags = bindlessTextureFlags,
 					.stageFlags = asset::IShader::E_SHADER_STAGE::ESS_FRAGMENT,
diff --git a/62_CAD/shaders/geotexture/common.hlsl b/62_CAD/shaders/geotexture/common.hlsl
index 82a646319..691cd3d3b 100644
--- a/62_CAD/shaders/geotexture/common.hlsl
+++ b/62_CAD/shaders/geotexture/common.hlsl
@@ -25,7 +25,7 @@ struct PSInput
 [[vk::push_constant]] GeoTextureOBB geoTextureOBB;
 
 // Set 0 - Scene Data and Globals, buffer bindings don't change the buffers only get updated
-[[vk::binding(0, 0)]] ConstantBuffer<Globals> globals : register(b0);
+// [[vk::binding(0, 0)]] ConstantBuffer<Globals> globals; ---> moved to globals.hlsl
 
 // Set 1 - Window dependant data which has higher update frequency due to multiple windows and resize need image recreation and descriptor writes
 [[vk::binding(0, 1)]] Texture2D<float4> geoTexture : register(t0);

From 76360625346f5eb4c0232ff9e93eed078f37fa63 Mon Sep 17 00:00:00 2001
From: Erfan Ahmadi <ahmadierfan99@gmail.com>
Date: Wed, 2 Apr 2025 03:55:34 +0330
Subject: [PATCH 131/529] fixes to cad example

---
 62_CAD/Polyline.h                             |  4 +-
 62_CAD/main.cpp                               | 76 +++++++++----------
 62_CAD/shaders/globals.hlsl                   |  2 +-
 .../shaders/main_pipeline/vertex_shader.hlsl  |  1 +
 4 files changed, 41 insertions(+), 42 deletions(-)

diff --git a/62_CAD/Polyline.h b/62_CAD/Polyline.h
index 03b2f2c30..bee5650c7 100644
--- a/62_CAD/Polyline.h
+++ b/62_CAD/Polyline.h
@@ -66,8 +66,6 @@ struct LineStyleInfo
 		rigidSegmentIdx = InvalidRigidSegmentIndex;
 		phaseShift = 0.0f;
 
-		assert(stipplePatternUnnormalizedRepresentation.size() <= StipplePatternMaxSize);
-
 		if (stipplePatternUnnormalizedRepresentation.size() == 0)
 		{
 			stipplePatternSize = 0;
@@ -110,6 +108,8 @@ struct LineStyleInfo
 				stipplePatternTransformed[0] += stipplePatternTransformed[stipplePatternTransformed.size() - 1];
 				stipplePatternTransformed.pop_back();
 			}
+			
+			assert(stipplePatternTransformed.size() <= StipplePatternMaxSize);
 
 			if (stipplePatternTransformed.size() != 1)
 			{
diff --git a/62_CAD/main.cpp b/62_CAD/main.cpp
index ed3bdf88c..5f9e88694 100644
--- a/62_CAD/main.cpp
+++ b/62_CAD/main.cpp
@@ -867,14 +867,14 @@ class ComputerAidedDesign final : public examples::SimpleWindowedApplication, pu
 
 			auto mainPipelineFragmentCpuShader = loadCompileShader("../shaders/main_pipeline/fragment.hlsl", IShader::E_SHADER_STAGE::ESS_ALL_OR_LIBRARY);
 			auto mainPipelineVertexCpuShader = loadCompileShader("../shaders/main_pipeline/vertex_shader.hlsl", IShader::E_SHADER_STAGE::ESS_VERTEX);
-			auto geoTexturePipelineVertCpuShader = loadCompileShader(GeoTextureRenderer::VertexShaderRelativePath, IShader::E_SHADER_STAGE::ESS_VERTEX);
-			auto geoTexturePipelineFragCpuShader = loadCompileShader(GeoTextureRenderer::FragmentShaderRelativePath, IShader::E_SHADER_STAGE::ESS_FRAGMENT);
+			// auto geoTexturePipelineVertCpuShader = loadCompileShader(GeoTextureRenderer::VertexShaderRelativePath, IShader::E_SHADER_STAGE::ESS_VERTEX);
+			// auto geoTexturePipelineFragCpuShader = loadCompileShader(GeoTextureRenderer::FragmentShaderRelativePath, IShader::E_SHADER_STAGE::ESS_FRAGMENT);
 			mainPipelineFragmentCpuShader->setShaderStage(IShader::E_SHADER_STAGE::ESS_FRAGMENT);
 
 			mainPipelineFragmentShaders = m_device->createShader({ mainPipelineFragmentCpuShader.get(), nullptr, shaderReadCache.get(), shaderWriteCache.get() });
 			mainPipelineVertexShader = m_device->createShader({ mainPipelineVertexCpuShader.get(), nullptr, shaderReadCache.get(), shaderWriteCache.get() });
-			geoTexturePipelineShaders[0] = m_device->createShader({ geoTexturePipelineVertCpuShader.get(), nullptr, shaderReadCache.get(), shaderWriteCache.get() });
-			geoTexturePipelineShaders[1] = m_device->createShader({ geoTexturePipelineFragCpuShader.get(), nullptr, shaderReadCache.get(), shaderWriteCache.get() });
+			// geoTexturePipelineShaders[0] = m_device->createShader({ geoTexturePipelineVertCpuShader.get(), nullptr, shaderReadCache.get(), shaderWriteCache.get() });
+			// geoTexturePipelineShaders[1] = m_device->createShader({ geoTexturePipelineFragCpuShader.get(), nullptr, shaderReadCache.get(), shaderWriteCache.get() });
 			
 			core::smart_refctd_ptr<system::IFile> shaderWriteCacheFile;
 			{
@@ -1011,7 +1011,7 @@ class ComputerAidedDesign final : public examples::SimpleWindowedApplication, pu
 		);
 		
 		m_geoTextureRenderer = std::unique_ptr<GeoTextureRenderer>(new GeoTextureRenderer(smart_refctd_ptr(m_device), smart_refctd_ptr(m_logger)));
-		m_geoTextureRenderer->initialize(geoTexturePipelineShaders[0].get(), geoTexturePipelineShaders[1].get(), compatibleRenderPass.get(), m_globalsBuffer);
+		// m_geoTextureRenderer->initialize(geoTexturePipelineShaders[0].get(), geoTexturePipelineShaders[1].get(), compatibleRenderPass.get(), m_globalsBuffer);
 		
 		// Create the Semaphores
 		m_renderSemaphore = m_device->createSemaphore(0ull);
@@ -1155,33 +1155,6 @@ class ComputerAidedDesign final : public examples::SimpleWindowedApplication, pu
 		// cb->reset(video::IGPUCommandBuffer::RESET_FLAGS::RELEASE_RESOURCES_BIT);
 		// cb->begin(video::IGPUCommandBuffer::USAGE::ONE_TIME_SUBMIT_BIT);
 		cb->beginDebugMarker("Frame");
-
-		float64_t3x3 projectionToNDC;
-		projectionToNDC = m_Camera.constructViewProjection();
-		
-		Globals globalData = {};
-		uint64_t baseAddress = drawResourcesFiller.getResourcesGPUBuffer()->getDeviceAddress();
-		const auto& resources = drawResourcesFiller.getResourcesCollection();
-		globalData.pointers = {
-			.lineStyles				= baseAddress + resources.lineStyles.bufferOffset,
-			.dtmSettings			= baseAddress + resources.dtmSettings.bufferOffset,
-			.customClipProjections	= baseAddress + resources.clipProjections.bufferOffset,
-			.mainObjects			= baseAddress + resources.mainObjects.bufferOffset,
-			.drawObjects			= baseAddress + resources.drawObjects.bufferOffset,
-			.geometryBuffer			= baseAddress + resources.geometryInfo.bufferOffset,
-		};
-		globalData.antiAliasingFactor = 1.0;// +abs(cos(m_timeElapsed * 0.0008)) * 20.0f;
-		globalData.resolution = uint32_t2{ m_window->getWidth(), m_window->getHeight() };
-		globalData.defaultClipProjection.projectionToNDC = projectionToNDC;
-		globalData.defaultClipProjection.minClipNDC = float32_t2(-1.0, -1.0);
-		globalData.defaultClipProjection.maxClipNDC = float32_t2(+1.0, +1.0);
-		auto screenToWorld = getScreenToWorldRatio(globalData.defaultClipProjection.projectionToNDC, globalData.resolution);
-		globalData.screenToWorldRatio = screenToWorld;
-		globalData.worldToScreenRatio = (1.0/screenToWorld);
-		globalData.miterLimit = 10.0f;
-		SBufferRange<IGPUBuffer> globalBufferUpdateRange = { .offset = 0ull, .size = sizeof(Globals), .buffer = m_globalsBuffer.get() };
-		bool updateSuccess = cb->updateBuffer(globalBufferUpdateRange, &globalData);
-		assert(updateSuccess);
 		
 		nbl::video::IGPUCommandBuffer::SRenderpassBeginInfo beginInfo;
 		auto scRes = static_cast<CSwapchainResources*>(m_surface->getSwapchainResources());
@@ -1214,8 +1187,36 @@ class ComputerAidedDesign final : public examples::SimpleWindowedApplication, pu
 	{
 		// Use the current recording command buffer of the intendedSubmitInfos scratchCommandBuffers, it should be in recording state
 		auto* cb = m_currentRecordingCommandBufferInfo->cmdbuf;
-		auto&r = drawResourcesFiller;
 		
+		const auto& resources = drawResourcesFiller.getResourcesCollection();
+		const auto& resourcesGPUBuffer = drawResourcesFiller.getResourcesGPUBuffer();
+
+		float64_t3x3 projectionToNDC;
+		projectionToNDC = m_Camera.constructViewProjection();
+		
+		Globals globalData = {};
+		uint64_t baseAddress = resourcesGPUBuffer->getDeviceAddress();
+		globalData.pointers = {
+			.lineStyles				= baseAddress + resources.lineStyles.bufferOffset,
+			.dtmSettings			= baseAddress + resources.dtmSettings.bufferOffset,
+			.customClipProjections	= baseAddress + resources.clipProjections.bufferOffset,
+			.mainObjects			= baseAddress + resources.mainObjects.bufferOffset,
+			.drawObjects			= baseAddress + resources.drawObjects.bufferOffset,
+			.geometryBuffer			= baseAddress + resources.geometryInfo.bufferOffset,
+		};
+		globalData.antiAliasingFactor = 1.0;// +abs(cos(m_timeElapsed * 0.0008)) * 20.0f;
+		globalData.resolution = uint32_t2{ m_window->getWidth(), m_window->getHeight() };
+		globalData.defaultClipProjection.projectionToNDC = projectionToNDC;
+		globalData.defaultClipProjection.minClipNDC = float32_t2(-1.0, -1.0);
+		globalData.defaultClipProjection.maxClipNDC = float32_t2(+1.0, +1.0);
+		auto screenToWorld = getScreenToWorldRatio(globalData.defaultClipProjection.projectionToNDC, globalData.resolution);
+		globalData.screenToWorldRatio = screenToWorld;
+		globalData.worldToScreenRatio = (1.0/screenToWorld);
+		globalData.miterLimit = 10.0f;
+		SBufferRange<IGPUBuffer> globalBufferUpdateRange = { .offset = 0ull, .size = sizeof(Globals), .buffer = m_globalsBuffer.get() };
+		bool updateSuccess = cb->updateBuffer(globalBufferUpdateRange, &globalData);
+		assert(updateSuccess);
+
 		asset::SViewport vp =
 		{
 			.x = 0u,
@@ -1261,8 +1262,8 @@ class ComputerAidedDesign final : public examples::SimpleWindowedApplication, pu
 				auto& bufferBarrier = bufferBarriers[bufferBarriersCount++];
 				bufferBarrier.barrier.dep.srcStageMask = PIPELINE_STAGE_FLAGS::COPY_BIT;
 				bufferBarrier.barrier.dep.srcAccessMask = ACCESS_FLAGS::TRANSFER_WRITE_BIT;
-				bufferBarrier.barrier.dep.dstStageMask = PIPELINE_STAGE_FLAGS::VERTEX_SHADER_BIT | PIPELINE_STAGE_FLAGS::FRAGMENT_SHADER_BIT;
-				bufferBarrier.barrier.dep.dstAccessMask = ACCESS_FLAGS::SHADER_READ_BITS;
+				bufferBarrier.barrier.dep.dstStageMask = PIPELINE_STAGE_FLAGS::VERTEX_INPUT_BITS | PIPELINE_STAGE_FLAGS::VERTEX_SHADER_BIT | PIPELINE_STAGE_FLAGS::FRAGMENT_SHADER_BIT;
+				bufferBarrier.barrier.dep.dstAccessMask = ACCESS_FLAGS::MEMORY_READ_BITS | ACCESS_FLAGS::MEMORY_WRITE_BITS;
 				bufferBarrier.range =
 				{
 					.offset = 0u,
@@ -1293,9 +1294,6 @@ class ComputerAidedDesign final : public examples::SimpleWindowedApplication, pu
 		}
 		cb->beginRenderPass(beginInfo, IGPUCommandBuffer::SUBPASS_CONTENTS::INLINE);
 		
-		const auto& resources = drawResourcesFiller.getResourcesCollection();
-		const auto& resourcesGPUBuffer = drawResourcesFiller.getResourcesGPUBuffer();
-
 		const uint32_t currentIndexCount = resources.drawObjects.getCount() * 6u;
 
 		IGPUDescriptorSet* descriptorSets[] = { descriptorSet0.get(), descriptorSet1.get() };
@@ -1424,7 +1422,7 @@ class ComputerAidedDesign final : public examples::SimpleWindowedApplication, pu
 		// We only support one swapchain mode, surface, the other one is Display which we have not implemented yet.
 		retval.swapchainMode = video::E_SWAPCHAIN_MODE::ESM_SURFACE;
 		retval.validations = true;
-		retval.synchronizationValidation = true;
+		retval.synchronizationValidation = false;
 		return retval;
 	}
 protected:
diff --git a/62_CAD/shaders/globals.hlsl b/62_CAD/shaders/globals.hlsl
index 93578b7b6..562f523e6 100644
--- a/62_CAD/shaders/globals.hlsl
+++ b/62_CAD/shaders/globals.hlsl
@@ -444,7 +444,7 @@ ClipProjectionData loadCustomClipProjection(const uint32_t index)
 }
 MainObject loadMainObject(const uint32_t index)
 {
-    return vk::RawBufferLoad<MainObject>(globals.pointers.mainObjects + index * sizeof(MainObject), 8u);
+    return vk::RawBufferLoad<MainObject>(globals.pointers.mainObjects + index * sizeof(MainObject), 4u);
 }
 DrawObject loadDrawObject(const uint32_t index)
 {
diff --git a/62_CAD/shaders/main_pipeline/vertex_shader.hlsl b/62_CAD/shaders/main_pipeline/vertex_shader.hlsl
index 5dbe386fd..b62cbe543 100644
--- a/62_CAD/shaders/main_pipeline/vertex_shader.hlsl
+++ b/62_CAD/shaders/main_pipeline/vertex_shader.hlsl
@@ -152,6 +152,7 @@ PSInput main(uint vertexID : SV_VertexID)
     outV.setObjType(objType);
     outV.setMainObjectIdx(drawObj.mainObjIndex);
     
+
     MainObject mainObj = loadMainObject(drawObj.mainObjIndex);
     ClipProjectionData clipProjectionData = getClipProjectionData(mainObj);
     

From eec41cba4bb8bab52bf2f2f3a88d2784efc71109 Mon Sep 17 00:00:00 2001
From: Erfan Ahmadi <ahmadierfan99@gmail.com>
Date: Wed, 2 Apr 2025 04:41:11 +0330
Subject: [PATCH 132/529] fixed hatches and polyline connector's auto
 submission logic

---
 62_CAD/DrawResourcesFiller.cpp | 153 +++++++++++++++++----------------
 62_CAD/DrawResourcesFiller.h   |   1 +
 2 files changed, 78 insertions(+), 76 deletions(-)

diff --git a/62_CAD/DrawResourcesFiller.cpp b/62_CAD/DrawResourcesFiller.cpp
index 80ddc0d57..45abd8555 100644
--- a/62_CAD/DrawResourcesFiller.cpp
+++ b/62_CAD/DrawResourcesFiller.cpp
@@ -436,9 +436,9 @@ bool DrawResourcesFiller::finalizeBufferCopies(SIntendedSubmitInfo& intendedNext
 				return false;
 			}
 
+			drawBuffer.bufferOffset = copyRange.offset;
 			if (copyRange.size > 0ull)
 			{
-				drawBuffer.bufferOffset = copyRange.offset;
 				if (!m_utilities->updateBufferRangeViaStagingBuffer(intendedNextSubmit, copyRange, drawBuffer.vector.data()))
 					return false;
 				copiedResourcesSize += drawBuffer.getAlignedStorageSize();
@@ -757,10 +757,8 @@ void DrawResourcesFiller::addPolylineObjects_Internal(const CPolylineBase& polyl
 		assert(false); // we don't handle other object types
 }
 
-// TODO: FIX
 void DrawResourcesFiller::addPolylineConnectors_Internal(const CPolylineBase& polyline, uint32_t& currentPolylineConnectorObj, uint32_t mainObjIdx)
 {
-#if 0
 	const size_t remainingResourcesSize = calculateRemainingResourcesSize();
 
 	const uint32_t uploadableObjects = (remainingResourcesSize) / (sizeof(PolylineConnector) + sizeof(DrawObject) + sizeof(uint32_t) * 6u);
@@ -773,41 +771,39 @@ void DrawResourcesFiller::addPolylineConnectors_Internal(const CPolylineBase& po
 	if (objectsToUpload <= 0u)
 		return;
 
+	// Add Geometry
+	const auto connectorsByteSize = sizeof(PolylineConnector) * objectsToUpload;
+	size_t geometryBufferOffset = resourcesCollection.geometryInfo.increaseSizeAndGetOffset(connectorsByteSize, alignof(PolylineConnector));
+	void* dst = resourcesCollection.geometryInfo.data() + geometryBufferOffset;
+	const PolylineConnector& connector = polyline.getConnectors()[currentPolylineConnectorObj];
+	memcpy(dst, &connector, connectorsByteSize);
 
-
-
-
-	// TODO: 
-
-
-
-
+	// Push Indices, remove later when compute fills this
+	uint32_t* indexBufferToBeFilled = resourcesCollection.indexBuffer.increaseCountAndGetPtr(6u * objectsToUpload);
+	const uint32_t startObj = resourcesCollection.drawObjects.getCount();
+	for (uint32_t i = 0u; i < objectsToUpload; ++i)
+	{
+		indexBufferToBeFilled[i*6]		= (startObj+i)*4u + 1u;
+		indexBufferToBeFilled[i*6 + 1u]	= (startObj+i)*4u + 0u;
+		indexBufferToBeFilled[i*6 + 2u]	= (startObj+i)*4u + 2u;
+		indexBufferToBeFilled[i*6 + 3u]	= (startObj+i)*4u + 1u;
+		indexBufferToBeFilled[i*6 + 4u]	= (startObj+i)*4u + 2u;
+		indexBufferToBeFilled[i*6 + 5u]	= (startObj+i)*4u + 3u;
+	}
 
 	// Add DrawObjs
+	DrawObject* drawObjectsToBeFilled = resourcesCollection.drawObjects.increaseCountAndGetPtr(objectsToUpload);
 	DrawObject drawObj = {};
 	drawObj.mainObjIndex = mainObjIdx;
 	drawObj.type_subsectionIdx = uint32_t(static_cast<uint16_t>(ObjectType::POLYLINE_CONNECTOR) | 0 << 16);
-	drawObj.geometryAddress = drawResourcesBDA + currentGeometryBufferSize;
+	drawObj.geometryAddress = geometryBufferOffset;
 	for (uint32_t i = 0u; i < objectsToUpload; ++i)
 	{
-		void* dst = reinterpret_cast<DrawObject*>(cpuDrawBuffers.drawObjectsBuffer->getPointer()) + currentDrawObjectCount;
-		memcpy(dst, &drawObj, sizeof(DrawObject));
-		currentDrawObjectCount += 1u;
+		drawObjectsToBeFilled[i] = drawObj;
 		drawObj.geometryAddress += sizeof(PolylineConnector);
-	}
-
-	// Add Geometry
-	if (objectsToUpload > 0u)
-	{
-		const auto connectorsByteSize = sizeof(PolylineConnector) * objectsToUpload;
-		void* dst = reinterpret_cast<char*>(cpuDrawBuffers.geometryBuffer->getPointer()) + currentGeometryBufferSize;
-		auto& connector = polyline.getConnectors()[currentPolylineConnectorObj];
-		memcpy(dst, &connector, connectorsByteSize);
-		currentGeometryBufferSize += connectorsByteSize;
-	}
+	} 
 
 	currentPolylineConnectorObj += objectsToUpload;
-#endif
 }
 
 void DrawResourcesFiller::addLines_Internal(const CPolylineBase& polyline, const CPolylineBase::SectionInfo& section, uint32_t& currentObjectInSection, uint32_t mainObjIdx)
@@ -833,8 +829,6 @@ void DrawResourcesFiller::addLines_Internal(const CPolylineBase& polyline, const
 
 	// Add Geometry
 	const auto pointsByteSize = sizeof(LinePointInfo) * (objectsToUpload + 1u);
-
-	
 	size_t geometryBufferOffset = resourcesCollection.geometryInfo.increaseSizeAndGetOffset(pointsByteSize, alignof(LinePointInfo));
 	void* dst = resourcesCollection.geometryInfo.data() + geometryBufferOffset;
 	const LinePointInfo& linePoint = polyline.getLinePointAt(section.index + currentObjectInSection);
@@ -842,14 +836,15 @@ void DrawResourcesFiller::addLines_Internal(const CPolylineBase& polyline, const
 
 	// Push Indices, remove later when compute fills this
 	uint32_t* indexBufferToBeFilled = resourcesCollection.indexBuffer.increaseCountAndGetPtr(6u * objectsToUpload);
+	const uint32_t startObj = resourcesCollection.drawObjects.getCount();
 	for (uint32_t i = 0u; i < objectsToUpload; ++i)
 	{
-		indexBufferToBeFilled[i*6]		= i*4u + 1u;
-		indexBufferToBeFilled[i*6 + 1u]	= i*4u + 0u;
-		indexBufferToBeFilled[i*6 + 2u]	= i*4u + 2u;
-		indexBufferToBeFilled[i*6 + 3u]	= i*4u + 1u;
-		indexBufferToBeFilled[i*6 + 4u]	= i*4u + 2u;
-		indexBufferToBeFilled[i*6 + 5u]	= i*4u + 3u;
+		indexBufferToBeFilled[i*6]		= (startObj+i)*4u + 1u;
+		indexBufferToBeFilled[i*6 + 1u]	= (startObj+i)*4u + 0u;
+		indexBufferToBeFilled[i*6 + 2u]	= (startObj+i)*4u + 2u;
+		indexBufferToBeFilled[i*6 + 3u]	= (startObj+i)*4u + 1u;
+		indexBufferToBeFilled[i*6 + 4u]	= (startObj+i)*4u + 2u;
+		indexBufferToBeFilled[i*6 + 5u]	= (startObj+i)*4u + 3u;
 	}
 
 	// Add DrawObjs
@@ -898,14 +893,15 @@ void DrawResourcesFiller::addQuadBeziers_Internal(const CPolylineBase& polyline,
 
 	// Push Indices, remove later when compute fills this
 	uint32_t* indexBufferToBeFilled = resourcesCollection.indexBuffer.increaseCountAndGetPtr(6u*cagesCount);
+	const uint32_t startObj = resourcesCollection.drawObjects.getCount();
 	for (uint32_t i = 0u; i < cagesCount; ++i)
 	{
-		indexBufferToBeFilled[i*6]		= i*4u + 1u;
-		indexBufferToBeFilled[i*6 + 1u]	= i*4u + 0u;
-		indexBufferToBeFilled[i*6 + 2u]	= i*4u + 2u;
-		indexBufferToBeFilled[i*6 + 3u]	= i*4u + 1u;
-		indexBufferToBeFilled[i*6 + 4u]	= i*4u + 2u;
-		indexBufferToBeFilled[i*6 + 5u]	= i*4u + 3u;
+		indexBufferToBeFilled[i*6]		= (startObj+i)*4u + 1u;
+		indexBufferToBeFilled[i*6 + 1u]	= (startObj+i)*4u + 0u;
+		indexBufferToBeFilled[i*6 + 2u]	= (startObj+i)*4u + 2u;
+		indexBufferToBeFilled[i*6 + 3u]	= (startObj+i)*4u + 1u;
+		indexBufferToBeFilled[i*6 + 4u]	= (startObj+i)*4u + 2u;
+		indexBufferToBeFilled[i*6 + 5u]	= (startObj+i)*4u + 3u;
 	}
 	
 	// Add DrawObjs
@@ -927,50 +923,58 @@ void DrawResourcesFiller::addQuadBeziers_Internal(const CPolylineBase& polyline,
 	currentObjectInSection += objectsToUpload;
 }
 
-// TODO: FIX
 void DrawResourcesFiller::addHatch_Internal(const Hatch& hatch, uint32_t& currentObjectInSection, uint32_t mainObjIndex)
 {
-#if 0
-	const uint32_t maxGeometryBufferHatchBoxes = static_cast<uint32_t>((maxGeometryBufferSize - currentGeometryBufferSize) / sizeof(Hatch::CurveHatchBox));
-	
-	uint32_t uploadableObjects = (maxIndexCount / 6u) - currentDrawObjectCount;
-	uploadableObjects = core::min(uploadableObjects, maxDrawObjects - currentDrawObjectCount);
-	uploadableObjects = core::min(uploadableObjects, maxGeometryBufferHatchBoxes);
+	const size_t remainingResourcesSize = calculateRemainingResourcesSize();
 
+	const uint32_t uploadableObjects = (remainingResourcesSize) / (sizeof(Hatch::CurveHatchBox) + sizeof(DrawObject) + sizeof(uint32_t) * 6u);
+	// TODO[ERFAN]: later take into account, our limit of max index buffer and vettex buffer size or constrainst other than mem
+	
 	uint32_t remainingObjects = hatch.getHatchBoxCount() - currentObjectInSection;
-	uploadableObjects = core::min(uploadableObjects, remainingObjects);
+	const uint32_t objectsToUpload = core::min(uploadableObjects, remainingObjects);
 
-	for (uint32_t i = 0; i < uploadableObjects; i++)
-	{
-		const Hatch::CurveHatchBox& hatchBox = hatch.getHatchBox(i + currentObjectInSection);
-
-		uint64_t hatchBoxAddress;
-		{			
-			static_assert(sizeof(CurveBox) == sizeof(Hatch::CurveHatchBox));
-			void* dst = reinterpret_cast<char*>(cpuDrawBuffers.geometryBuffer->getPointer()) + currentGeometryBufferSize;
-			memcpy(dst, &hatchBox, sizeof(CurveBox));
-			hatchBoxAddress = drawResourcesBDA + currentGeometryBufferSize;
-			currentGeometryBufferSize += sizeof(CurveBox);
-		}
+	if (objectsToUpload <= 0u)
+		return;
 
-		DrawObject drawObj = {};
-		drawObj.type_subsectionIdx = uint32_t(static_cast<uint16_t>(ObjectType::CURVE_BOX) | (0 << 16));
-		drawObj.mainObjIndex = mainObjIndex;
-		drawObj.geometryAddress = hatchBoxAddress;
-		void* dst = reinterpret_cast<DrawObject*>(cpuDrawBuffers.drawObjectsBuffer->getPointer()) + currentDrawObjectCount + i;
-		memcpy(dst, &drawObj, sizeof(DrawObject));
+	// Add Geometry
+	static_assert(sizeof(CurveBox) == sizeof(Hatch::CurveHatchBox));
+	const auto curveBoxesByteSize = sizeof(Hatch::CurveHatchBox) * objectsToUpload;
+	size_t geometryBufferOffset = resourcesCollection.geometryInfo.increaseSizeAndGetOffset(curveBoxesByteSize, alignof(Hatch::CurveHatchBox));
+	void* dst = resourcesCollection.geometryInfo.data() + geometryBufferOffset;
+	const Hatch::CurveHatchBox& hatchBox = hatch.getHatchBox(currentObjectInSection); // WARNING: This is assuming hatch boxes are contigous in memory, TODO: maybe make that more obvious through Hatch interface
+	memcpy(dst, &hatchBox, curveBoxesByteSize);
+	
+	// Push Indices, remove later when compute fills this
+	uint32_t* indexBufferToBeFilled = resourcesCollection.indexBuffer.increaseCountAndGetPtr(6u * objectsToUpload);
+	const uint32_t startObj = resourcesCollection.drawObjects.getCount();
+	for (uint32_t i = 0u; i < objectsToUpload; ++i)
+	{
+		indexBufferToBeFilled[i*6]		= (startObj+i)*4u + 1u;
+		indexBufferToBeFilled[i*6 + 1u]	= (startObj+i)*4u + 0u;
+		indexBufferToBeFilled[i*6 + 2u]	= (startObj+i)*4u + 2u;
+		indexBufferToBeFilled[i*6 + 3u]	= (startObj+i)*4u + 1u;
+		indexBufferToBeFilled[i*6 + 4u]	= (startObj+i)*4u + 2u;
+		indexBufferToBeFilled[i*6 + 5u]	= (startObj+i)*4u + 3u;
+	}
+	
+	// Add DrawObjs
+	DrawObject* drawObjectsToBeFilled = resourcesCollection.drawObjects.increaseCountAndGetPtr(objectsToUpload);
+	DrawObject drawObj = {};
+	drawObj.mainObjIndex = mainObjIndex;
+	drawObj.type_subsectionIdx = uint32_t(static_cast<uint16_t>(ObjectType::CURVE_BOX) | (0 << 16));
+	drawObj.geometryAddress = geometryBufferOffset;
+	for (uint32_t i = 0u; i < objectsToUpload; ++i)
+	{
+		drawObjectsToBeFilled[i] = drawObj;
+		drawObj.geometryAddress += sizeof(Hatch::CurveHatchBox);
 	}
 
 	// Add Indices
-	currentDrawObjectCount += uploadableObjects;
 	currentObjectInSection += uploadableObjects;
-#endif
 }
 
-// TODO: FIX
 bool DrawResourcesFiller::addFontGlyph_Internal(const GlyphInfo& glyphInfo, uint32_t mainObjIdx)
 {
-#if 0
 	const uint32_t maxGeometryBufferFontGlyphs = static_cast<uint32_t>((maxGeometryBufferSize - currentGeometryBufferSize) / sizeof(GlyphInfo));
 	
 	uint32_t uploadableObjects = (maxIndexCount / 6u) - currentDrawObjectCount;
@@ -994,11 +998,8 @@ bool DrawResourcesFiller::addFontGlyph_Internal(const GlyphInfo& glyphInfo, uint
 
 		return true;
 	}
-	else
-	{
-		return false;
-	}
-#endif
+
+	return false;
 }
 
 void DrawResourcesFiller::setGlyphMSDFTextureFunction(const GetGlyphMSDFTextureFunc& func)
diff --git a/62_CAD/DrawResourcesFiller.h b/62_CAD/DrawResourcesFiller.h
index 85d88f2eb..b329f4d01 100644
--- a/62_CAD/DrawResourcesFiller.h
+++ b/62_CAD/DrawResourcesFiller.h
@@ -196,6 +196,7 @@ struct DrawResourcesFiller
 	{
 		resetDrawObjects();
 		resetMainObjects();
+		resetCustomClipProjections();
 		resetLineStyles();
 		resetDTMSettings();
 	}

From 692df5ff80f72d8d99bef5d5efa3e4813a8233f3 Mon Sep 17 00:00:00 2001
From: Erfan Ahmadi <ahmadierfan99@gmail.com>
Date: Wed, 2 Apr 2025 04:54:10 +0330
Subject: [PATCH 133/529] fix glyph and image auto-submission logic

---
 62_CAD/DrawResourcesFiller.cpp | 114 ++++++++++++++++++---------------
 1 file changed, 61 insertions(+), 53 deletions(-)

diff --git a/62_CAD/DrawResourcesFiller.cpp b/62_CAD/DrawResourcesFiller.cpp
index 45abd8555..8892f93c8 100644
--- a/62_CAD/DrawResourcesFiller.cpp
+++ b/62_CAD/DrawResourcesFiller.cpp
@@ -119,8 +119,6 @@ void DrawResourcesFiller::drawPolyline(const CPolylineBase& polyline, uint32_t p
 			submitCurrentDrawObjectsAndReset(intendedNextSubmit, polylineMainObjIdx);
 	}
 
-	return; // TODO: Remove
-
 	if (!polyline.getConnectors().empty())
 	{
 		uint32_t currentConnectorPolylineObject = 0u;
@@ -200,7 +198,6 @@ void DrawResourcesFiller::drawHatch(
 		const HatchFillPattern fillPattern,
 		SIntendedSubmitInfo& intendedNextSubmit)
 {
-	return; // TODO: Remove
 	if (color.a == 0.0f) // not visible
 		return;
 
@@ -234,7 +231,6 @@ void DrawResourcesFiller::drawHatch(const Hatch& hatch, const float32_t4& color,
 	drawHatch(hatch, color, HatchFillPattern::SOLID_FILL, intendedNextSubmit);
 }
 
-// TODO: FIX
 void DrawResourcesFiller::drawFontGlyph(
 		nbl::ext::TextRendering::FontFace* fontFace,
 		uint32_t glyphIdx,
@@ -245,7 +241,6 @@ void DrawResourcesFiller::drawFontGlyph(
 		uint32_t mainObjIdx,
 		SIntendedSubmitInfo& intendedNextSubmit)
 {
-#if 0
 	uint32_t textureIdx = InvalidTextureIdx;
 	const MSDFInputInfo msdfInput = MSDFInputInfo(fontFace->getHash(), glyphIdx);
 	textureIdx = getMSDFIndexFromInputInfo(msdfInput, intendedNextSubmit);
@@ -268,39 +263,45 @@ void DrawResourcesFiller::drawFontGlyph(
 		// TODO: Log, probably getGlyphMSDF(face,glyphIdx) returned nullptr ICPUImage ptr
 		_NBL_DEBUG_BREAK_IF(true);
 	}
-#endif
 }
 
-// TODO: FIX
 void DrawResourcesFiller::_test_addImageObject(float64_t2 topLeftPos, float32_t2 size, float32_t rotation, SIntendedSubmitInfo& intendedNextSubmit)
 {
-#if 0
 	auto addImageObject_Internal = [&](const ImageObjectInfo& imageObjectInfo, uint32_t mainObjIdx) -> bool
 		{
-			const uint32_t maxGeometryBufferImageObjects = static_cast<uint32_t>((maxGeometryBufferSize - currentGeometryBufferSize) / sizeof(ImageObjectInfo));
-			uint32_t uploadableObjects = (maxIndexCount / 6u) - currentDrawObjectCount;
-			uploadableObjects = core::min(uploadableObjects, maxDrawObjects - currentDrawObjectCount);
-			uploadableObjects = core::min(uploadableObjects, maxGeometryBufferImageObjects);
-
-			if (uploadableObjects >= 1u)
-			{
-				void* dstGeom = reinterpret_cast<char*>(cpuDrawBuffers.geometryBuffer->getPointer()) + currentGeometryBufferSize;
-				memcpy(dstGeom, &imageObjectInfo, sizeof(ImageObjectInfo));
-				uint64_t geomBufferAddr = drawResourcesBDA + currentGeometryBufferSize;
-				currentGeometryBufferSize += sizeof(ImageObjectInfo);
-
-				DrawObject drawObj = {};
-				drawObj.type_subsectionIdx = uint32_t(static_cast<uint16_t>(ObjectType::IMAGE) | (0 << 16)); // TODO: use custom pack/unpack function
-				drawObj.mainObjIndex = mainObjIdx;
-				drawObj.geometryAddress = geomBufferAddr;
-				void* dstDrawObj = reinterpret_cast<DrawObject*>(cpuDrawBuffers.drawObjectsBuffer->getPointer()) + currentDrawObjectCount;
-				memcpy(dstDrawObj, &drawObj, sizeof(DrawObject));
-				currentDrawObjectCount += 1u;
-
-				return true;
-			}
-			else
+			const size_t remainingResourcesSize = calculateRemainingResourcesSize();
+			
+			const uint32_t uploadableObjects = (remainingResourcesSize) / (sizeof(ImageObjectInfo) + sizeof(DrawObject) + sizeof(uint32_t) * 6u);
+			// TODO[ERFAN]: later take into account, our limit of max index buffer and vettex buffer size or constrainst other than mem
+	
+			if (uploadableObjects <= 0u)
 				return false;
+
+			// Add Geometry
+			size_t geometryBufferOffset = resourcesCollection.geometryInfo.increaseSizeAndGetOffset(sizeof(ImageObjectInfo), alignof(ImageObjectInfo));
+			void* dst = resourcesCollection.geometryInfo.data() + geometryBufferOffset;
+			memcpy(dst, &imageObjectInfo, sizeof(ImageObjectInfo));
+
+			// Push Indices, remove later when compute fills this
+			uint32_t* indexBufferToBeFilled = resourcesCollection.indexBuffer.increaseCountAndGetPtr(6u * 1u);
+			const uint32_t startObj = resourcesCollection.drawObjects.getCount();
+			uint32_t i = 0u;
+			indexBufferToBeFilled[i*6]		= (startObj+i)*4u + 1u;
+			indexBufferToBeFilled[i*6 + 1u]	= (startObj+i)*4u + 0u;
+			indexBufferToBeFilled[i*6 + 2u]	= (startObj+i)*4u + 2u;
+			indexBufferToBeFilled[i*6 + 3u]	= (startObj+i)*4u + 1u;
+			indexBufferToBeFilled[i*6 + 4u]	= (startObj+i)*4u + 2u;
+			indexBufferToBeFilled[i*6 + 5u]	= (startObj+i)*4u + 3u;
+
+			// Add DrawObjs
+			DrawObject* drawObjectsToBeFilled = resourcesCollection.drawObjects.increaseCountAndGetPtr(1u);
+			DrawObject drawObj = {};
+			drawObj.mainObjIndex = mainObjIdx;
+			drawObj.type_subsectionIdx = uint32_t(static_cast<uint16_t>(ObjectType::IMAGE) | (0 << 16)); // TODO: use custom pack/unpack function
+			drawObj.geometryAddress = geometryBufferOffset;
+			drawObjectsToBeFilled[0u] = drawObj;
+
+			return true;
 		};
 
 	uint32_t mainObjIdx = addMainObject_SubmitIfNeeded(InvalidStyleIdx, InvalidDTMSettingsIdx, intendedNextSubmit);
@@ -317,7 +318,6 @@ void DrawResourcesFiller::_test_addImageObject(float64_t2 topLeftPos, float32_t2
 		bool success = addImageObject_Internal(info, mainObjIdx);
 		assert(success); // this should always be true, otherwise it's either bug in code or not enough memory allocated to hold a single image object 
 	}
-#endif
 }
 
 bool DrawResourcesFiller::finalizeAllCopiesToGPU(SIntendedSubmitInfo& intendedNextSubmit)
@@ -975,31 +975,39 @@ void DrawResourcesFiller::addHatch_Internal(const Hatch& hatch, uint32_t& curren
 
 bool DrawResourcesFiller::addFontGlyph_Internal(const GlyphInfo& glyphInfo, uint32_t mainObjIdx)
 {
-	const uint32_t maxGeometryBufferFontGlyphs = static_cast<uint32_t>((maxGeometryBufferSize - currentGeometryBufferSize) / sizeof(GlyphInfo));
+	const size_t remainingResourcesSize = calculateRemainingResourcesSize();
+
+	const uint32_t uploadableObjects = (remainingResourcesSize) / (sizeof(GlyphInfo) + sizeof(DrawObject) + sizeof(uint32_t) * 6u);
+	// TODO[ERFAN]: later take into account, our limit of max index buffer and vettex buffer size or constrainst other than mem
 	
-	uint32_t uploadableObjects = (maxIndexCount / 6u) - currentDrawObjectCount;
-	uploadableObjects = core::min(uploadableObjects, maxDrawObjects - currentDrawObjectCount);
-	uploadableObjects = core::min(uploadableObjects, maxGeometryBufferFontGlyphs);
+	if (uploadableObjects <= 0u)
+		return false;
 
-	if (uploadableObjects >= 1u)
-	{
-		void* geomDst = reinterpret_cast<char*>(cpuDrawBuffers.geometryBuffer->getPointer()) + currentGeometryBufferSize;
-		memcpy(geomDst, &glyphInfo, sizeof(GlyphInfo));
-		uint64_t fontGlyphAddr = drawResourcesBDA + currentGeometryBufferSize;
-		currentGeometryBufferSize += sizeof(GlyphInfo);
-
-		DrawObject drawObj = {};
-		drawObj.type_subsectionIdx = uint32_t(static_cast<uint16_t>(ObjectType::FONT_GLYPH) | (0 << 16));
-		drawObj.mainObjIndex = mainObjIdx;
-		drawObj.geometryAddress = fontGlyphAddr;
-		void* drawObjDst = reinterpret_cast<DrawObject*>(cpuDrawBuffers.drawObjectsBuffer->getPointer()) + currentDrawObjectCount;
-		memcpy(drawObjDst, &drawObj, sizeof(DrawObject));
-		currentDrawObjectCount += 1u;
+	// Add Geometry
+	size_t geometryBufferOffset = resourcesCollection.geometryInfo.increaseSizeAndGetOffset(sizeof(GlyphInfo), alignof(GlyphInfo));
+	void* dst = resourcesCollection.geometryInfo.data() + geometryBufferOffset;
+	memcpy(dst, &glyphInfo, sizeof(GlyphInfo));
 
-		return true;
-	}
+	// Push Indices, remove later when compute fills this
+	uint32_t* indexBufferToBeFilled = resourcesCollection.indexBuffer.increaseCountAndGetPtr(6u * 1u);
+	const uint32_t startObj = resourcesCollection.drawObjects.getCount();
+	uint32_t i = 0u;
+	indexBufferToBeFilled[i*6]		= (startObj+i)*4u + 1u;
+	indexBufferToBeFilled[i*6 + 1u]	= (startObj+i)*4u + 0u;
+	indexBufferToBeFilled[i*6 + 2u]	= (startObj+i)*4u + 2u;
+	indexBufferToBeFilled[i*6 + 3u]	= (startObj+i)*4u + 1u;
+	indexBufferToBeFilled[i*6 + 4u]	= (startObj+i)*4u + 2u;
+	indexBufferToBeFilled[i*6 + 5u]	= (startObj+i)*4u + 3u;
 
-	return false;
+	// Add DrawObjs
+	DrawObject* drawObjectsToBeFilled = resourcesCollection.drawObjects.increaseCountAndGetPtr(1u);
+	DrawObject drawObj = {};
+	drawObj.mainObjIndex = mainObjIdx;
+	drawObj.type_subsectionIdx = uint32_t(static_cast<uint16_t>(ObjectType::FONT_GLYPH) | (0 << 16));
+	drawObj.geometryAddress = geometryBufferOffset;
+	drawObjectsToBeFilled[0u] = drawObj;
+
+	return true;
 }
 
 void DrawResourcesFiller::setGlyphMSDFTextureFunction(const GetGlyphMSDFTextureFunc& func)

From cf05dcad580343633ca1fb72b0232753ad3a299f Mon Sep 17 00:00:00 2001
From: Erfan Ahmadi <ahmadierfan99@gmail.com>
Date: Wed, 2 Apr 2025 04:59:01 +0330
Subject: [PATCH 134/529] Fix ClipProjectionIndices reset

---
 62_CAD/DrawResourcesFiller.h | 4 ++++
 1 file changed, 4 insertions(+)

diff --git a/62_CAD/DrawResourcesFiller.h b/62_CAD/DrawResourcesFiller.h
index b329f4d01..7ef3e2020 100644
--- a/62_CAD/DrawResourcesFiller.h
+++ b/62_CAD/DrawResourcesFiller.h
@@ -300,6 +300,10 @@ struct DrawResourcesFiller
 	void resetCustomClipProjections()
 	{
 		resourcesCollection.clipProjections.vector.clear();
+		
+		// Invalidate all the clip projection addresses because clipProjections buffer got reset
+		for (auto& clipProjAddr : clipProjectionIndices)
+			clipProjAddr = InvalidClipProjectionIndex;
 	}
 
 	void resetLineStyles()

From 0bb41db1de63dcaa5e0c0efc95d3ac37a4210d6b Mon Sep 17 00:00:00 2001
From: keptsecret <sorchon@gmail.com>
Date: Wed, 2 Apr 2025 10:47:36 +0700
Subject: [PATCH 135/529] begin adding fake frames for nsight profiler

---
 71_ArithmeticBench/main.cpp | 48 ++++++++++++++++++++++++++++++++-----
 1 file changed, 42 insertions(+), 6 deletions(-)

diff --git a/71_ArithmeticBench/main.cpp b/71_ArithmeticBench/main.cpp
index 29f9ede8a..beb243b97 100644
--- a/71_ArithmeticBench/main.cpp
+++ b/71_ArithmeticBench/main.cpp
@@ -1,13 +1,14 @@
-#include "nbl/application_templates/BasicMultiQueueApplication.hpp"
+#include "SimpleWindowedApplication.hpp"
+#include "CEventCallback.hpp"
 #include "nbl/application_templates/MonoAssetManagerAndBuiltinResourceApplication.hpp"
 #include "app_resources/common.hlsl"
 
-#include <chrono>
-
 using namespace nbl;
 using namespace core;
-using namespace asset;
+using namespace hlsl;
 using namespace system;
+using namespace asset;
+using namespace ui;
 using namespace video;
 
 // method emulations on the CPU, to verify the results of the GPU methods
@@ -47,15 +48,46 @@ struct emulatedScanExclusive
 	static inline constexpr const char* name = "exclusive_scan";
 };
 
-class ArithmeticBenchApp final : public application_templates::BasicMultiQueueApplication, public application_templates::MonoAssetManagerAndBuiltinResourceApplication
+class ArithmeticBenchApp final : public examples::SimpleWindowedApplication, public application_templates::MonoAssetManagerAndBuiltinResourceApplication
 {
-	using device_base_t = application_templates::BasicMultiQueueApplication;
+	using device_base_t = examples::SimpleWindowedApplication;
 	using asset_base_t = application_templates::MonoAssetManagerAndBuiltinResourceApplication;
 
+	constexpr static inline uint32_t2 WindowDimensions = { 1280, 720 };
+	constexpr static inline uint32_t MaxFramesInFlight = 5;
+
 public:
 	ArithmeticBenchApp(const path& _localInputCWD, const path& _localOutputCWD, const path& _sharedInputCWD, const path& _sharedOutputCWD) :
 		system::IApplicationFramework(_localInputCWD, _localOutputCWD, _sharedInputCWD, _sharedOutputCWD) {}
 
+	inline core::vector<video::SPhysicalDeviceFilter::SurfaceCompatibility> getSurfaces() const override
+	{
+		if (!m_surface)
+		{
+			{
+				auto windowCallback = core::make_smart_refctd_ptr<CEventCallback>(smart_refctd_ptr(m_inputSystem), smart_refctd_ptr(m_logger));
+				IWindow::SCreationParams params = {};
+				params.callback = core::make_smart_refctd_ptr<nbl::video::ISimpleManagedSurface::ICallback>();
+				params.width = WindowDimensions.x;
+				params.height = WindowDimensions.y;
+				params.x = 32;
+				params.y = 32;
+				params.flags = ui::IWindow::ECF_HIDDEN | IWindow::ECF_BORDERLESS | IWindow::ECF_RESIZABLE;
+				params.windowCaption = "ComputeShaderPathtracer";
+				params.callback = windowCallback;
+				const_cast<std::remove_const_t<decltype(m_window)>&>(m_window) = m_winMgr->createWindow(std::move(params));
+			}
+
+			auto surface = CSurfaceVulkanWin32::create(smart_refctd_ptr(m_api), smart_refctd_ptr_static_cast<IWindowWin32>(m_window));
+			const_cast<std::remove_const_t<decltype(m_surface)>&>(m_surface) = nbl::video::CSimpleResizeSurface<nbl::video::CDefaultSwapchainFramebuffers>::create(std::move(surface));
+		}
+
+		if (m_surface)
+			return { {m_surface->getSurface()/*,EQF_NONE*/} };
+
+		return {};
+	}
+
 	bool onAppInitialized(smart_refctd_ptr<ISystem>&& system) override
 	{
 		if (!device_base_t::onAppInitialized(std::move(system)))
@@ -539,6 +571,10 @@ class ArithmeticBenchApp final : public application_templates::BasicMultiQueueAp
 	smart_refctd_ptr<IGPUPipelineCache> m_spirv_isa_cache;
 	smart_refctd_ptr<IFile> m_spirv_isa_cache_output;
 
+	smart_refctd_ptr<IWindow> m_window;
+	smart_refctd_ptr<CSimpleResizeSurface<CDefaultSwapchainFramebuffers>> m_surface;
+	smart_refctd_ptr<InputSystem> m_inputSystem;
+
 	bool b_runTests = false;
 	uint32_t* inputData = nullptr;
 	constexpr static inline uint32_t OutputBufferCount = 8u;

From 3304cde640bbe189e9e579ef1af33602b19277b5 Mon Sep 17 00:00:00 2001
From: Erfan Ahmadi <ahmadierfan99@gmail.com>
Date: Wed, 2 Apr 2025 14:18:02 +0330
Subject: [PATCH 136/529] auto-submission more mature, corrects resource
 references on the go by not storing any index but address to active +
 functions to begin/endMainObject and setActiveLineStyle

---
 62_CAD/DrawResourcesFiller.cpp | 294 ++++++++++++++++++---------------
 62_CAD/DrawResourcesFiller.h   | 116 ++++++++-----
 62_CAD/SingleLineText.cpp      |   7 +-
 62_CAD/main.cpp                |  11 +-
 62_CAD/shaders/globals.hlsl    |  12 +-
 5 files changed, 254 insertions(+), 186 deletions(-)

diff --git a/62_CAD/DrawResourcesFiller.cpp b/62_CAD/DrawResourcesFiller.cpp
index 8892f93c8..8ad13cb97 100644
--- a/62_CAD/DrawResourcesFiller.cpp
+++ b/62_CAD/DrawResourcesFiller.cpp
@@ -19,6 +19,7 @@ void DrawResourcesFiller::allocateResourcesBuffer(ILogicalDevice* logicalDevice,
 {
 	size = core::alignUp(size, ResourcesMaxNaturalAlignment);
 	size = core::max(size, getMinimumRequiredResourcesBufferSize());
+	size = 512u;
 	IGPUBuffer::SCreationParams geometryCreationParams = {};
 	geometryCreationParams.size = size;
 	geometryCreationParams.usage = bitflag(IGPUBuffer::EUF_SHADER_DEVICE_ADDRESS_BIT) | IGPUBuffer::EUF_TRANSFER_DST_BIT | IGPUBuffer::EUF_INDEX_BUFFER_BIT;
@@ -84,16 +85,17 @@ void DrawResourcesFiller::drawPolyline(const CPolylineBase& polyline, const Line
 	if (!lineStyleInfo.isVisible())
 		return;
 
-	uint32_t styleIdx = addLineStyle_SubmitIfNeeded(lineStyleInfo, intendedNextSubmit);
-
-	uint32_t mainObjIdx = addMainObject_SubmitIfNeeded(styleIdx, InvalidDTMSettingsIdx, intendedNextSubmit);
-
-	drawPolyline(polyline, mainObjIdx, intendedNextSubmit);
+	setActiveLineStyle(lineStyleInfo);
+	
+	beginMainObject(MainObjectType::POLYLINE);
+	drawPolyline(polyline, intendedNextSubmit);
+	endMainObject();
 }
 
-void DrawResourcesFiller::drawPolyline(const CPolylineBase& polyline, uint32_t polylineMainObjIdx, SIntendedSubmitInfo& intendedNextSubmit)
+void DrawResourcesFiller::drawPolyline(const CPolylineBase& polyline, SIntendedSubmitInfo& intendedNextSubmit)
 {
-	if (polylineMainObjIdx == InvalidMainObjectIdx)
+	uint32_t mainObjectIdx = acquireActiveMainObjectIndex_SubmitIfNeeded(intendedNextSubmit);
+	if (mainObjectIdx == InvalidMainObjectIdx)
 	{
 		// TODO: assert or log error here
 		assert(false);
@@ -108,7 +110,7 @@ void DrawResourcesFiller::drawPolyline(const CPolylineBase& polyline, uint32_t p
 	while (currentSectionIdx < sectionsCount)
 	{
 		const auto& currentSection = polyline.getSectionInfoAt(currentSectionIdx);
-		addPolylineObjects_Internal(polyline, currentSection, currentObjectInSection, polylineMainObjIdx);
+		addPolylineObjects_Internal(polyline, currentSection, currentObjectInSection, mainObjectIdx);
 
 		if (currentObjectInSection >= currentSection.count)
 		{
@@ -116,7 +118,7 @@ void DrawResourcesFiller::drawPolyline(const CPolylineBase& polyline, uint32_t p
 			currentObjectInSection = 0u;
 		}
 		else
-			submitCurrentDrawObjectsAndReset(intendedNextSubmit, polylineMainObjIdx);
+			submitCurrentDrawObjectsAndReset(intendedNextSubmit, mainObjectIdx);
 	}
 
 	if (!polyline.getConnectors().empty())
@@ -124,16 +126,20 @@ void DrawResourcesFiller::drawPolyline(const CPolylineBase& polyline, uint32_t p
 		uint32_t currentConnectorPolylineObject = 0u;
 		while (currentConnectorPolylineObject < polyline.getConnectors().size())
 		{
-			addPolylineConnectors_Internal(polyline, currentConnectorPolylineObject, polylineMainObjIdx);
+			addPolylineConnectors_Internal(polyline, currentConnectorPolylineObject, mainObjectIdx);
 
 			if (currentConnectorPolylineObject < polyline.getConnectors().size())
-				submitCurrentDrawObjectsAndReset(intendedNextSubmit, polylineMainObjIdx);
+				submitCurrentDrawObjectsAndReset(intendedNextSubmit, mainObjectIdx);
 		}
 	}
 }
 
 void DrawResourcesFiller::drawTriangleMesh(const CTriangleMesh& mesh, CTriangleMesh::DrawData& drawData, const DTMSettingsInfo& dtmSettingsInfo, SIntendedSubmitInfo& intendedNextSubmit)
 {
+	setActiveDTMSettings(dtmSettingsInfo);
+	uint32_t mainObjectIdx = acquireActiveMainObjectIndex_SubmitIfNeeded(intendedNextSubmit);
+	drawData.pushConstants.triangleMeshMainObjectIndex = mainObjectIdx;
+
 	ICPUBuffer::SCreationParams geometryBuffParams;
 	
 	// concatenate the index and vertex buffer into the geometry buffer
@@ -166,12 +172,6 @@ void DrawResourcesFiller::drawTriangleMesh(const CTriangleMesh& mesh, CTriangleM
 	}
 
 	drawData.indexCount = mesh.getIndexCount();
-
-	// call addMainObject_SubmitIfNeeded, use its index in push constants
-
-	uint32_t dtmSettingsIndex = addDTMSettings_SubmitIfNeeded(dtmSettingsInfo, intendedNextSubmit);
-
-	drawData.pushConstants.triangleMeshMainObjectIndex = addMainObject_SubmitIfNeeded(InvalidStyleIdx, dtmSettingsIndex, intendedNextSubmit);
 }
 
 // TODO[Erfan]: Makes more sense if parameters are: solidColor + fillPattern + patternColor
@@ -207,23 +207,27 @@ void DrawResourcesFiller::drawHatch(
 		MSDFInputInfo msdfInfo = MSDFInputInfo(fillPattern);
 		textureIdx = getMSDFIndexFromInputInfo(msdfInfo, intendedNextSubmit);
 		if (textureIdx == InvalidTextureIdx)
-			textureIdx = addMSDFTexture(msdfInfo, getHatchFillPatternMSDF(fillPattern), InvalidMainObjectIdx, intendedNextSubmit);
+			textureIdx = addMSDFTexture(msdfInfo, getHatchFillPatternMSDF(fillPattern), intendedNextSubmit);
 		_NBL_DEBUG_BREAK_IF(textureIdx == InvalidTextureIdx); // probably getHatchFillPatternMSDF returned nullptr
 	}
 
 	LineStyleInfo lineStyle = {};
 	lineStyle.color = color;
 	lineStyle.screenSpaceLineWidth = nbl::hlsl::bit_cast<float, uint32_t>(textureIdx);
-	const uint32_t styleIdx = addLineStyle_SubmitIfNeeded(lineStyle, intendedNextSubmit);
-
-	uint32_t mainObjIdx = addMainObject_SubmitIfNeeded(styleIdx, InvalidDTMSettingsIdx, intendedNextSubmit);
-	uint32_t currentObjectInSection = 0u; // Object here refers to DrawObject used in vertex shader. You can think of it as a Cage.
+	
+	setActiveLineStyle(lineStyle);
+	beginMainObject(MainObjectType::HATCH);
+	
+	uint32_t mainObjectIdx = acquireActiveMainObjectIndex_SubmitIfNeeded(intendedNextSubmit);
+	uint32_t currentObjectInSection = 0u; // Object here refers to DrawObject. You can think of it as a Cage.
 	while (currentObjectInSection < hatch.getHatchBoxCount())
 	{
-		addHatch_Internal(hatch, currentObjectInSection, mainObjIdx);
+		addHatch_Internal(hatch, currentObjectInSection, mainObjectIdx);
 		if (currentObjectInSection < hatch.getHatchBoxCount())
-			submitCurrentDrawObjectsAndReset(intendedNextSubmit, mainObjIdx);
+			submitCurrentDrawObjectsAndReset(intendedNextSubmit, mainObjectIdx);
 	}
+
+	endMainObject();
 }
 
 void DrawResourcesFiller::drawHatch(const Hatch& hatch, const float32_t4& color, SIntendedSubmitInfo& intendedNextSubmit)
@@ -238,14 +242,16 @@ void DrawResourcesFiller::drawFontGlyph(
 		float32_t2 dirU,
 		float32_t  aspectRatio,
 		float32_t2 minUV,
-		uint32_t mainObjIdx,
 		SIntendedSubmitInfo& intendedNextSubmit)
 {
 	uint32_t textureIdx = InvalidTextureIdx;
 	const MSDFInputInfo msdfInput = MSDFInputInfo(fontFace->getHash(), glyphIdx);
 	textureIdx = getMSDFIndexFromInputInfo(msdfInput, intendedNextSubmit);
 	if (textureIdx == InvalidTextureIdx)
-		textureIdx = addMSDFTexture(msdfInput, getGlyphMSDF(fontFace, glyphIdx), mainObjIdx, intendedNextSubmit);
+		textureIdx = addMSDFTexture(msdfInput, getGlyphMSDF(fontFace, glyphIdx), intendedNextSubmit);
+
+	uint32_t mainObjIdx = acquireActiveMainObjectIndex_SubmitIfNeeded(intendedNextSubmit);
+	assert(mainObjIdx != InvalidMainObjectIdx);
 
 	if (textureIdx != InvalidTextureIdx)
 	{
@@ -304,7 +310,9 @@ void DrawResourcesFiller::_test_addImageObject(float64_t2 topLeftPos, float32_t2
 			return true;
 		};
 
-	uint32_t mainObjIdx = addMainObject_SubmitIfNeeded(InvalidStyleIdx, InvalidDTMSettingsIdx, intendedNextSubmit);
+	beginMainObject(MainObjectType::IMAGE);
+
+	uint32_t mainObjIdx = acquireActiveMainObjectIndex_SubmitIfNeeded(intendedNextSubmit);
 
 	ImageObjectInfo info = {};
 	info.topLeft = topLeftPos;
@@ -318,6 +326,8 @@ void DrawResourcesFiller::_test_addImageObject(float64_t2 topLeftPos, float32_t2
 		bool success = addImageObject_Internal(info, mainObjIdx);
 		assert(success); // this should always be true, otherwise it's either bug in code or not enough memory allocated to hold a single image object 
 	}
+
+	endMainObject();
 }
 
 bool DrawResourcesFiller::finalizeAllCopiesToGPU(SIntendedSubmitInfo& intendedNextSubmit)
@@ -328,94 +338,41 @@ bool DrawResourcesFiller::finalizeAllCopiesToGPU(SIntendedSubmitInfo& intendedNe
 	return success;
 }
 
-uint32_t DrawResourcesFiller::addLineStyle_SubmitIfNeeded(const LineStyleInfo& lineStyle, SIntendedSubmitInfo& intendedNextSubmit)
+void DrawResourcesFiller::setActiveLineStyle(const LineStyleInfo& lineStyle)
 {
-	const size_t remainingResourcesSize = calculateRemainingResourcesSize();
-	const bool enoughMem = remainingResourcesSize >= sizeof(LineStyle); // enough remaining memory for 1 more linestyle?
-	
-	uint32_t outLineStyleIdx = addLineStyle_Internal(lineStyle);
-	if (outLineStyleIdx == InvalidStyleIdx)
-	{
-		// There wasn't enough resource memory remaining to fit a single LineStyle
-		finalizeAllCopiesToGPU(intendedNextSubmit);
-		submitDraws(intendedNextSubmit);
-		
-		// resets itself
-		resetLineStyles();
-		// resets higher level resources
-		resetMainObjects();
-		resetDrawObjects();
-
-		outLineStyleIdx = addLineStyle_Internal(lineStyle);
-		assert(outLineStyleIdx != InvalidStyleIdx);
-	}
-
-	return outLineStyleIdx;
+	activeLineStyle = lineStyle;
 }
 
-uint32_t DrawResourcesFiller::addDTMSettings_SubmitIfNeeded(const DTMSettingsInfo& dtmSettings, SIntendedSubmitInfo& intendedNextSubmit)
+void DrawResourcesFiller::setActiveDTMSettings(const DTMSettingsInfo& dtmSettings)
 {
-	// before calling `addDTMSettings_Internal` we have made sute we have enough mem for 
-	uint32_t outDTMSettingIdx = addDTMSettings_Internal(dtmSettings, intendedNextSubmit);
-	if (outDTMSettingIdx == InvalidDTMSettingsIdx)
-	{
-		// There wasn't enough resource memory remaining to fit dtmsettings struct + 2 linestyles structs.
-		finalizeAllCopiesToGPU(intendedNextSubmit);
-		submitDraws(intendedNextSubmit);
-		
-		// resets itself
-		resetDTMSettings();
-		resetLineStyles(); // additionally resets linestyles as well, just to be safe
-		// resets higher level resources
-		resetMainObjects();
-		resetDrawObjects();
-
-		outDTMSettingIdx = addDTMSettings_Internal(dtmSettings, intendedNextSubmit);
-		assert(outDTMSettingIdx != InvalidDTMSettingsIdx);
-	}
-	return outDTMSettingIdx;
+	activeDTMSettings = dtmSettings;
 }
 
-uint32_t DrawResourcesFiller::addMainObject_SubmitIfNeeded(uint32_t styleIdx, uint32_t dtmSettingsIdx, SIntendedSubmitInfo& intendedNextSubmit)
+void DrawResourcesFiller::beginMainObject(MainObjectType type)
 {
-	MainObject mainObject = {};
-	mainObject.styleIdx = styleIdx;
-	mainObject.dtmSettingsIdx = dtmSettingsIdx;
-	mainObject.clipProjectionIndex = acquireCurrentClipProjectionIndex(intendedNextSubmit);
-	uint32_t outMainObjectIdx = addMainObject_Internal(mainObject);
-	if (outMainObjectIdx == InvalidMainObjectIdx)
-	{
-		// failed to fit into remaining resources mem or exceeded max indexable mainobj
-		finalizeAllCopiesToGPU(intendedNextSubmit);
-		submitDraws(intendedNextSubmit);
-		
-		// resets itself
-		resetMainObjects();
-		// resets higher level resources
-		resetDrawObjects();
-		// we shouldn't reset lower level resources like linestyles and clip projections here because it was possibly requested to push to mem before addMainObjects
+	activeMainObjectType = type;
+	activeMainObjectIndex = InvalidMainObjectIdx;
+}
 
-		// try to add again
-		outMainObjectIdx = addMainObject_Internal(mainObject);
-		assert(outMainObjectIdx != InvalidMainObjectIdx);
-	}
-	
-	return outMainObjectIdx;
+void DrawResourcesFiller::endMainObject()
+{
+	activeMainObjectType = MainObjectType::NONE;
+	activeMainObjectIndex = InvalidMainObjectIdx;
 }
 
 void DrawResourcesFiller::pushClipProjectionData(const ClipProjectionData& clipProjectionData)
 {
-	clipProjections.push_back(clipProjectionData);
-	clipProjectionIndices.push_back(InvalidClipProjectionIndex);
+	activeClipProjections.push_back(clipProjectionData);
+	activeClipProjectionIndices.push_back(InvalidClipProjectionIndex);
 }
 
 void DrawResourcesFiller::popClipProjectionData()
 {
-	if (clipProjections.empty())
+	if (activeClipProjections.empty())
 		return;
 
-	clipProjections.pop_back();
-	clipProjectionIndices.pop_back();
+	activeClipProjections.pop_back();
+	activeClipProjectionIndices.pop_back();
 }
 
 bool DrawResourcesFiller::finalizeBufferCopies(SIntendedSubmitInfo& intendedNextSubmit)
@@ -626,27 +583,12 @@ const size_t DrawResourcesFiller::calculateRemainingResourcesSize() const
 	return resourcesGPUBuffer->getSize() - resourcesCollection.calculateTotalConsumption();
 }
 
-void DrawResourcesFiller::submitCurrentDrawObjectsAndReset(SIntendedSubmitInfo& intendedNextSubmit, uint32_t mainObjectIndex)
+void DrawResourcesFiller::submitCurrentDrawObjectsAndReset(SIntendedSubmitInfo& intendedNextSubmit, uint32_t& mainObjectIndex)
 {
 	finalizeAllCopiesToGPU(intendedNextSubmit);
 	submitDraws(intendedNextSubmit);
-
-	// We reset Geometry Counters (drawObj+geometryInfos) because we're done rendering previous geometry
-	// We don't reset counters for styles because we will be reusing them
-	resetDrawObjects();
-}
-
-uint32_t DrawResourcesFiller::addMainObject_Internal(const MainObject& mainObject)
-{
-	const size_t remainingResourcesSize = calculateRemainingResourcesSize();
-	const size_t memRequired = sizeof(MainObject);
-	const bool enoughMem = remainingResourcesSize >= memRequired; // enough remaining memory for 1 more dtm settings with 2 referenced line styles?
-	if (!enoughMem)
-		return InvalidMainObjectIdx;
-	if (resourcesCollection.mainObjects.vector.size() >= MaxIndexableMainObjects)
-		return InvalidMainObjectIdx;
-	resourcesCollection.mainObjects.vector.push_back(mainObject); // this will implicitly increase total resource consumption and reduce remaining size --> no need for mem size trackers
-	return resourcesCollection.mainObjects.vector.size() - 1u;
+	reset(); // resets everything, things referenced through mainObj and other shit will be pushed again through acquireXXX_SubmitIfNeeded
+	mainObjectIndex = acquireActiveMainObjectIndex_SubmitIfNeeded(intendedNextSubmit);
 }
 
 uint32_t DrawResourcesFiller::addLineStyle_Internal(const LineStyleInfo& lineStyleInfo)
@@ -667,8 +609,7 @@ uint32_t DrawResourcesFiller::addLineStyle_Internal(const LineStyleInfo& lineSty
 			return i;
 	}
 
-	resourcesCollection.lineStyles.vector.push_back(gpuLineStyle); // this will implicitly increase total resource consumption and reduce remaining size --> no need for mem size trackers
-	return resourcesCollection.lineStyles.vector.size() - 1u;
+	return resourcesCollection.lineStyles.addAndGetOffset(gpuLineStyle); // this will implicitly increase total resource consumption and reduce remaining size --> no need for mem size trackers
 }
 
 uint32_t DrawResourcesFiller::addDTMSettings_Internal(const DTMSettingsInfo& dtmSettingsInfo, SIntendedSubmitInfo& intendedNextSubmit)
@@ -710,37 +651,125 @@ uint32_t DrawResourcesFiller::addDTMSettings_Internal(const DTMSettingsInfo& dtm
 			return i;
 	}
 	
-	resourcesCollection.dtmSettings.vector.push_back(dtmSettings); // this will implicitly increase total resource consumption and reduce remaining size --> no need for mem size trackers
-	return resourcesCollection.dtmSettings.vector.size() - 1u;
+	resourcesCollection.dtmSettings.addAndGetOffset(dtmSettings); // this will implicitly increase total resource consumption and reduce remaining size --> no need for mem size trackers
+}
+
+uint32_t DrawResourcesFiller::acquireActiveLineStyleIndex_SubmitIfNeeded(SIntendedSubmitInfo& intendedNextSubmit)
+{
+	if (activeLineStyleIndex == InvalidStyleIdx)
+		activeLineStyleIndex = addLineStyle_SubmitIfNeeded(activeLineStyle, intendedNextSubmit);
+	
+	return activeLineStyleIndex;
 }
 
-uint32_t DrawResourcesFiller::acquireCurrentClipProjectionIndex(SIntendedSubmitInfo& intendedNextSubmit)
+uint32_t DrawResourcesFiller::acquireActiveDTMSettingsIndex_SubmitIfNeeded(SIntendedSubmitInfo& intendedNextSubmit)
 {
-	if (clipProjectionIndices.empty())
+	if (activeDTMSettingsIndex == InvalidDTMSettingsIdx)
+		activeDTMSettingsIndex = addDTMSettings_SubmitIfNeeded(activeDTMSettings, intendedNextSubmit);
+	
+	return activeDTMSettingsIndex;
+}
+
+uint32_t DrawResourcesFiller::acquireActiveClipProjectionIndex_SubmitIfNeeded(SIntendedSubmitInfo& intendedNextSubmit)
+{
+	if (activeClipProjectionIndices.empty())
 		return InvalidClipProjectionIndex;
 
-	if (clipProjectionIndices.back() == InvalidClipProjectionIndex)
-		clipProjectionIndices.back() = addClipProjectionData_SubmitIfNeeded(clipProjections.back(), intendedNextSubmit);
+	if (activeClipProjectionIndices.back() == InvalidClipProjectionIndex)
+		activeClipProjectionIndices.back() = addClipProjectionData_SubmitIfNeeded(activeClipProjections.back(), intendedNextSubmit);
 	
-	return clipProjectionIndices.back();
+	return activeClipProjectionIndices.back();
 }
 
-uint32_t DrawResourcesFiller::addClipProjectionData_SubmitIfNeeded(const ClipProjectionData& clipProjectionData, SIntendedSubmitInfo& intendedNextSubmit)
+uint32_t DrawResourcesFiller::acquireActiveMainObjectIndex_SubmitIfNeeded(SIntendedSubmitInfo& intendedNextSubmit)
 {
+	if (activeMainObjectIndex != InvalidMainObjectIdx)
+		return activeMainObjectIndex;
+	if (activeMainObjectType == MainObjectType::NONE)
+	{
+		assert(false); // You're probably trying to acquire mainObjectIndex outside of startMainObject, endMainObject scope
+		return InvalidMainObjectIdx;
+	}
+
 	const size_t remainingResourcesSize = calculateRemainingResourcesSize();
-	const size_t memRequired = sizeof(ClipProjectionData);
+	// making sure MainObject and everything it references fits into remaining resources mem
+	size_t memRequired = sizeof(MainObject);
+	memRequired += ((activeMainObjectType == MainObjectType::DTM) ? sizeof(DTMSettings) : sizeof(LineStyle)); // needing LineStyle or DTMSettings depends on mainObject type
+	memRequired += (activeClipProjectionIndices.empty()) ? 0u : sizeof(ClipProjectionData); // if there is custom clip projections, account for it
+
 	const bool enoughMem = remainingResourcesSize >= memRequired; // enough remaining memory for 1 more dtm settings with 2 referenced line styles?
+	const bool needToOverflowSubmit = (!enoughMem) || (resourcesCollection.mainObjects.vector.size() >= MaxIndexableMainObjects);
+	
+	if (needToOverflowSubmit)
+	{
+		// failed to fit into remaining resources mem or exceeded max indexable mainobj
+		finalizeAllCopiesToGPU(intendedNextSubmit);
+		submitDraws(intendedNextSubmit);
+		reset(); // resets everything! be careful!
+	}
+	
+	MainObject mainObject = {};
+	// These 3 calls below shouldn't need to Submit because we made sure there is enough memory for all of them.
+	// if something here triggers a auto-submit it's a possible bug, TODO: assert that somehow?
+	mainObject.styleIdx = (activeMainObjectType == MainObjectType::DTM) ? InvalidStyleIdx : acquireActiveDTMSettingsIndex_SubmitIfNeeded(intendedNextSubmit); // only call if it requirees dtm
+	mainObject.dtmSettingsIdx = (activeMainObjectType == MainObjectType::DTM) ? acquireActiveDTMSettingsIndex_SubmitIfNeeded(intendedNextSubmit) : InvalidDTMSettingsIdx; // only call if it requirees dtm
+	mainObject.clipProjectionIndex = acquireActiveClipProjectionIndex_SubmitIfNeeded(intendedNextSubmit);
+	activeMainObjectIndex = resourcesCollection.mainObjects.addAndGetOffset(mainObject);
+	return activeMainObjectIndex;
+}
 
-	if (!enoughMem)
+uint32_t DrawResourcesFiller::addLineStyle_SubmitIfNeeded(const LineStyleInfo& lineStyle, SIntendedSubmitInfo& intendedNextSubmit)
+{
+	uint32_t outLineStyleIdx = addLineStyle_Internal(lineStyle);
+	if (outLineStyleIdx == InvalidStyleIdx)
 	{
+		// There wasn't enough resource memory remaining to fit a single LineStyle
 		finalizeAllCopiesToGPU(intendedNextSubmit);
 		submitDraws(intendedNextSubmit);
 		
 		// resets itself
-		resetCustomClipProjections();
+		resetLineStyles();
 		// resets higher level resources
 		resetMainObjects();
 		resetDrawObjects();
+
+		outLineStyleIdx = addLineStyle_Internal(lineStyle);
+		assert(outLineStyleIdx != InvalidStyleIdx);
+	}
+
+	return outLineStyleIdx;
+}
+
+uint32_t DrawResourcesFiller::addDTMSettings_SubmitIfNeeded(const DTMSettingsInfo& dtmSettings, SIntendedSubmitInfo& intendedNextSubmit)
+{
+	// before calling `addDTMSettings_Internal` we have made sute we have enough mem for 
+	uint32_t outDTMSettingIdx = addDTMSettings_Internal(dtmSettings, intendedNextSubmit);
+	if (outDTMSettingIdx == InvalidDTMSettingsIdx)
+	{
+		// There wasn't enough resource memory remaining to fit dtmsettings struct + 2 linestyles structs.
+		finalizeAllCopiesToGPU(intendedNextSubmit);
+		submitDraws(intendedNextSubmit);
+		// resets everything! be careful!
+		reset();
+
+		outDTMSettingIdx = addDTMSettings_Internal(dtmSettings, intendedNextSubmit);
+		assert(outDTMSettingIdx != InvalidDTMSettingsIdx);
+	}
+	return outDTMSettingIdx;
+}
+
+uint32_t DrawResourcesFiller::addClipProjectionData_SubmitIfNeeded(const ClipProjectionData& clipProjectionData, SIntendedSubmitInfo& intendedNextSubmit)
+{
+	const size_t remainingResourcesSize = calculateRemainingResourcesSize();
+	const size_t memRequired = sizeof(ClipProjectionData);
+	const bool enoughMem = remainingResourcesSize >= memRequired; // enough remaining memory for 1 more dtm settings with 2 referenced line styles?
+
+	if (!enoughMem)
+	{
+		finalizeAllCopiesToGPU(intendedNextSubmit);
+		submitDraws(intendedNextSubmit);
+		// resets everything! be careful!
+		reset();
 	}
 	
 	resourcesCollection.clipProjections.vector.push_back(clipProjectionData); // this will implicitly increase total resource consumption and reduce remaining size --> no need for mem size trackers
@@ -1020,7 +1049,7 @@ void DrawResourcesFiller::setHatchFillMSDFTextureFunction(const GetHatchFillPatt
 	getHatchFillPatternMSDF = func;
 }
 
-uint32_t DrawResourcesFiller::addMSDFTexture(const MSDFInputInfo& msdfInput, core::smart_refctd_ptr<ICPUImage>&& cpuImage, uint32_t mainObjIdx, SIntendedSubmitInfo& intendedNextSubmit)
+uint32_t DrawResourcesFiller::addMSDFTexture(const MSDFInputInfo& msdfInput, core::smart_refctd_ptr<ICPUImage>&& cpuImage, SIntendedSubmitInfo& intendedNextSubmit)
 {
 	if (!cpuImage)
 		return InvalidTextureIdx; // TODO: Log
@@ -1041,10 +1070,9 @@ uint32_t DrawResourcesFiller::addMSDFTexture(const MSDFInputInfo& msdfInput, cor
 		{
 			// Dealloc once submission is finished
 			msdfTextureArrayIndexAllocator->multi_deallocate(1u, &evicted.alloc_idx, nextSemaSignal);
-
-			// If we reset main objects will cause an auto submission bug, where adding an msdf texture while constructing glyphs will have wrong main object references (See how SingleLineTexts add Glyphs with a single mainObject)
-			// for the same reason we don't reset line styles
-			submitCurrentDrawObjectsAndReset(intendedNextSubmit, mainObjIdx);
+			finalizeAllCopiesToGPU(intendedNextSubmit);
+			submitDraws(intendedNextSubmit);
+			reset(); // resets everything, things referenced through mainObj and other shit will be pushed again through acquireXXX_SubmitIfNeeded
 		} 
 		else
 		{
diff --git a/62_CAD/DrawResourcesFiller.h b/62_CAD/DrawResourcesFiller.h
index 7ef3e2020..60e7c923c 100644
--- a/62_CAD/DrawResourcesFiller.h
+++ b/62_CAD/DrawResourcesFiller.h
@@ -78,6 +78,12 @@ struct DrawResourcesFiller
 			vector.resize(offset + additionalSize);
 			return offset;
 		}
+		
+		uint32_t addAndGetOffset(const T& val)
+		{
+			vector.push_back(val);
+			return vector.size() - 1u;
+		}
 
 		T* data() { return vector.data(); }
 	};
@@ -148,7 +154,8 @@ struct DrawResourcesFiller
 	//! this function fills buffers required for drawing a polyline and submits a draw through provided callback when there is not enough memory.
 	void drawPolyline(const CPolylineBase& polyline, const LineStyleInfo& lineStyleInfo, SIntendedSubmitInfo& intendedNextSubmit);
 
-	void drawPolyline(const CPolylineBase& polyline, uint32_t polylineMainObjIdx, SIntendedSubmitInfo& intendedNextSubmit);
+	/// WARNING: make sure this function  is called within begin/endMainObject scope
+	void drawPolyline(const CPolylineBase& polyline, SIntendedSubmitInfo& intendedNextSubmit);
 	
 	void drawTriangleMesh(const CTriangleMesh& mesh, CTriangleMesh::DrawData& drawData, const DTMSettingsInfo& dtmSettings, SIntendedSubmitInfo& intendedNextSubmit);
 
@@ -172,8 +179,8 @@ struct DrawResourcesFiller
 		const Hatch& hatch,
 		const float32_t4& color,
 		SIntendedSubmitInfo& intendedNextSubmit);
-
-	// ! Draw Font Glyph, will auto submit if there is no space
+	
+	/// WARNING: make sure this function  is called within begin/endMainObject scope
 	void drawFontGlyph(
 		nbl::ext::TextRendering::FontFace* fontFace,
 		uint32_t glyphIdx,
@@ -181,7 +188,6 @@ struct DrawResourcesFiller
 		float32_t2 dirU,
 		float32_t  aspectRatio,
 		float32_t2 minUV,
-		uint32_t mainObjIdx,
 		SIntendedSubmitInfo& intendedNextSubmit);
 	
 	void _test_addImageObject(
@@ -210,24 +216,17 @@ struct DrawResourcesFiller
 	/// @return how far resourcesGPUBuffer was copied to by `finalizeAllCopiesToGPU` in `resourcesCollection` 
 	const size_t getCopiedResourcesSize() { return copiedResourcesSize; }
 
-	uint32_t addLineStyle_SubmitIfNeeded(const LineStyleInfo& lineStyle, SIntendedSubmitInfo& intendedNextSubmit);
-
-	uint32_t addDTMSettings_SubmitIfNeeded(const DTMSettingsInfo& dtmSettings, SIntendedSubmitInfo& intendedNextSubmit);
-	
-	// TODO[Przemek]: Read after reading the fragment shader comments and having a basic understanding of the relationship between "mainObject" and our programmable blending resolve:
-	// Use `addMainObject_SubmitIfNeeded` to push your single mainObject you'll be using for the enitre triangle mesh (this will ensure overlaps between triangles of the same mesh is resolved correctly)
-	// Delete comment when you understand this
+	// Setting Active Resources:
+	void setActiveLineStyle(const LineStyleInfo& lineStyle);
+	void setActiveDTMSettings(const DTMSettingsInfo& dtmSettings);
 
-	// [ADVANCED] Do not use this function unless you know what you're doing (It may cause auto submit)
-	// Never call this function multiple times in a row before indexing it in a drawable, because future auto-submits may invalidate mainObjects, so do them one by one, for example:
-	// Valid: addMainObject1 --> addXXX(mainObj1) ---> addMainObject2 ---> addXXX(mainObj2) ....
-	// Invalid: addMainObject1 ---> addMainObject2 ---> addXXX(mainObj1) ---> addXXX(mainObj2) ....
-	uint32_t addMainObject_SubmitIfNeeded(uint32_t styleIdx, uint32_t dtmSettingsIdx, SIntendedSubmitInfo& intendedNextSubmit);
+	void beginMainObject(MainObjectType type);
+	void endMainObject();
 
-	// we need to store the clip projection stack to make sure the front is always available in memory
 	void pushClipProjectionData(const ClipProjectionData& clipProjectionData);
 	void popClipProjectionData();
-	const std::deque<ClipProjectionData>& getClipProjectionStack() const { return clipProjections; }
+
+	const std::deque<ClipProjectionData>& getClipProjectionStack() const { return activeClipProjections; }
 
 	smart_refctd_ptr<IGPUImageView> getMSDFsTextureArray() { return msdfTextureArray; }
 
@@ -255,33 +254,59 @@ struct DrawResourcesFiller
 
 	const size_t calculateRemainingResourcesSize() const;
 
-	// Internal Function to call whenever we overflow when we can't fill all of mainObject's drawObjects
-	void submitCurrentDrawObjectsAndReset(SIntendedSubmitInfo& intendedNextSubmit, uint32_t mainObjectIndex);
-
-	/// @return index to added main object.
-	///		It will return `InvalidMainObjectIndex` if it there isn't enough remaining resources memory OR the index would exceed MaxIndexableMainObjects
-	uint32_t addMainObject_Internal(const MainObject& mainObject);
+	// TODO: Find better name for function
+	/// @brief Internal Function to call whenever we overflow when we can't fill all of mainObject's drawObjects
+	/// @param intendedNextSubmit 
+	/// @param mainObjectIndex: function updates mainObjectIndex after submitting, clearing everything and acquiring  mainObjectIndex again.
+	void submitCurrentDrawObjectsAndReset(SIntendedSubmitInfo& intendedNextSubmit, uint32_t& mainObjectIndex);
 
-	uint32_t addLineStyle_Internal(const LineStyleInfo& lineStyleInfo);
+	// Gets resource index to the active linestyle data from the top of stack 
+	// If it's been invalidated then it will request to add to resources again ( auto-submission happens If there is not enough memory to add again)
+	uint32_t acquireActiveLineStyleIndex_SubmitIfNeeded(SIntendedSubmitInfo& intendedNextSubmit);
+	
+	// Gets resource index to the active linestyle data from the top of stack 
+	// If it's been invalidated then it will request to add to resources again ( auto-submission happens If there is not enough memory to add again)
+	uint32_t acquireActiveDTMSettingsIndex_SubmitIfNeeded(SIntendedSubmitInfo& intendedNextSubmit);
 
-	uint32_t addDTMSettings_Internal(const DTMSettingsInfo& dtmSettings, SIntendedSubmitInfo& intendedNextSubmit);
+	// Gets resource index to the active clip projection data from the top of stack 
+	// If it's been invalidated then it will request to add to resources again ( auto-submission happens If there is not enough memory to add again)
+	uint32_t acquireActiveClipProjectionIndex_SubmitIfNeeded(SIntendedSubmitInfo& intendedNextSubmit);
+	
+	// Gets resource index to the active main object data
+	// If it's been invalidated then it will request to add to resources again ( auto-submission happens If there is not enough memory to add again)
+	uint32_t acquireActiveMainObjectIndex_SubmitIfNeeded(SIntendedSubmitInfo& intendedNextSubmit);
 
-	// Gets the current clip projection data (the top of stack) gpu addreess inside the geometryBuffer
-	// If it's been invalidated then it will request to upload again with a possible auto-submit on low geometry buffer memory.
-	uint32_t acquireCurrentClipProjectionIndex(SIntendedSubmitInfo& intendedNextSubmit);
+	/// Attempts to add lineStyle to resources. If it fails to do, due to resource limitations, auto-submits and tries again. 
+	uint32_t addLineStyle_SubmitIfNeeded(const LineStyleInfo& lineStyle, SIntendedSubmitInfo& intendedNextSubmit);
+	
+	/// Attempts to add dtmSettings to resources. If it fails to do, due to resource limitations, auto-submits and tries again. 
+	uint32_t addDTMSettings_SubmitIfNeeded(const DTMSettingsInfo& dtmSettings, SIntendedSubmitInfo& intendedNextSubmit);
 	
+	/// Attempts to add clipProjection to resources. If it fails to do, due to resource limitations, auto-submits and tries again. 
 	uint32_t addClipProjectionData_SubmitIfNeeded(const ClipProjectionData& clipProjectionData, SIntendedSubmitInfo& intendedNextSubmit);
-
+	
+	/// returns index to added LineStyleInfo, returns Invalid index if it exceeds resource limitations
+	uint32_t addLineStyle_Internal(const LineStyleInfo& lineStyleInfo);
+	
+	/// returns index to added DTMSettingsInfo, returns Invalid index if it exceeds resource limitations
+	uint32_t addDTMSettings_Internal(const DTMSettingsInfo& dtmSettings, SIntendedSubmitInfo& intendedNextSubmit);
+	
+	/// Attempts to upload as many draw objects as possible within the given polyline section considering resource limitations
 	void addPolylineObjects_Internal(const CPolylineBase& polyline, const CPolylineBase::SectionInfo& section, uint32_t& currentObjectInSection, uint32_t mainObjIdx);
-
+	
+	/// Attempts to upload as many draw objects as possible within the given polyline connectors considering resource limitations
 	void addPolylineConnectors_Internal(const CPolylineBase& polyline, uint32_t& currentPolylineConnectorObj, uint32_t mainObjIdx);
-
+	
+	/// Attempts to upload as many draw objects as possible within the given polyline section considering resource limitations
 	void addLines_Internal(const CPolylineBase& polyline, const CPolylineBase::SectionInfo& section, uint32_t& currentObjectInSection, uint32_t mainObjIdx);
-
+	
+	/// Attempts to upload as many draw objects as possible within the given polyline section considering resource limitations
 	void addQuadBeziers_Internal(const CPolylineBase& polyline, const CPolylineBase::SectionInfo& section, uint32_t& currentObjectInSection, uint32_t mainObjIdx);
-
+	
+	/// Attempts to upload as many draw objects as possible within the given hatch considering resource limitations
 	void addHatch_Internal(const Hatch& hatch, uint32_t& currentObjectInSection, uint32_t mainObjIndex);
 	
+	/// Attempts to upload a single GlyphInfo considering resource limitations
 	bool addFontGlyph_Internal(const GlyphInfo& glyphInfo, uint32_t mainObjIdx);
 	
 	void resetMainObjects()
@@ -301,19 +326,21 @@ struct DrawResourcesFiller
 	{
 		resourcesCollection.clipProjections.vector.clear();
 		
-		// Invalidate all the clip projection addresses because clipProjections buffer got reset
-		for (auto& clipProjAddr : clipProjectionIndices)
+		// Invalidate all the clip projection addresses because activeClipProjections buffer got reset
+		for (auto& clipProjAddr : activeClipProjectionIndices)
 			clipProjAddr = InvalidClipProjectionIndex;
 	}
 
 	void resetLineStyles()
 	{
 		resourcesCollection.lineStyles.vector.clear();
+		activeLineStyleIndex = InvalidStyleIdx;
 	}
 
 	void resetDTMSettings()
 	{
 		resourcesCollection.dtmSettings.vector.clear();
+		activeDTMSettingsIndex = InvalidDTMSettingsIdx;
 	}
 
 	// MSDF Hashing and Caching Internal Functions 
@@ -404,7 +431,7 @@ struct DrawResourcesFiller
 	
 	// ! mainObjIdx: make sure to pass your mainObjIdx to it if you want it to stay synced/updated if some overflow submit occured which would potentially erase what your mainObject points at.
 	// If you haven't created a mainObject yet, then pass InvalidMainObjectIdx
-	uint32_t addMSDFTexture(const MSDFInputInfo& msdfInput, core::smart_refctd_ptr<ICPUImage>&& cpuImage, uint32_t mainObjIdx, SIntendedSubmitInfo& intendedNextSubmit);
+	uint32_t addMSDFTexture(const MSDFInputInfo& msdfInput, core::smart_refctd_ptr<ICPUImage>&& cpuImage, SIntendedSubmitInfo& intendedNextSubmit);
 	
 	// ResourcesCollection and packed into GPUBuffer
 	ResourcesCollection resourcesCollection;
@@ -415,8 +442,19 @@ struct DrawResourcesFiller
 	smart_refctd_ptr<IUtilities> m_utilities;
 	IQueue* m_copyQueue;
 
-	std::deque<ClipProjectionData> clipProjections; // stack of clip projectios stored so we can resubmit them if geometry buffer got reset.
-	std::deque<uint32_t> clipProjectionIndices; // stack of clip projection gpu addresses in geometry buffer. to keep track of them in push/pops
+	// Active Resources we need to keep track of and push to resources buffer if needed.
+	LineStyleInfo activeLineStyle;
+	uint32_t activeLineStyleIndex = InvalidStyleIdx;
+
+	DTMSettingsInfo activeDTMSettings;
+	uint32_t activeDTMSettingsIndex = InvalidDTMSettingsIdx;
+
+	MainObjectType activeMainObjectType;
+	uint32_t activeMainObjectIndex = InvalidMainObjectIdx;
+
+	// The ClipProjections are stack, because user can push/pop ClipProjections in any order
+	std::deque<ClipProjectionData> activeClipProjections; // stack of clip projections stored so we can resubmit them if geometry buffer got reset.
+	std::deque<uint32_t> activeClipProjectionIndices; // stack of clip projection gpu addresses in geometry buffer. to keep track of them in push/pops
 
 	// MSDF
 	GetGlyphMSDFTextureFunc getGlyphMSDF;
diff --git a/62_CAD/SingleLineText.cpp b/62_CAD/SingleLineText.cpp
index f68f78db3..ea755a2df 100644
--- a/62_CAD/SingleLineText.cpp
+++ b/62_CAD/SingleLineText.cpp
@@ -63,8 +63,8 @@ void SingleLineText::Draw(
 	lineStyle.color = color;
 	lineStyle.screenSpaceLineWidth = tan(tiltTiltAngle);
 	lineStyle.worldSpaceLineWidth = boldInPixels;
-	const uint32_t styleIdx = drawResourcesFiller.addLineStyle_SubmitIfNeeded(lineStyle, intendedNextSubmit);
-	auto glyphObjectIdx = drawResourcesFiller.addMainObject_SubmitIfNeeded(styleIdx, InvalidDTMSettingsIdx, intendedNextSubmit);
+	drawResourcesFiller.setActiveLineStyle(lineStyle);
+	drawResourcesFiller.beginMainObject(MainObjectType::TEXT);
 
 	for (const auto& glyphBox : m_glyphBoxes)
 	{
@@ -75,7 +75,8 @@ void SingleLineText::Draw(
 		// float32_t3 xx = float64_t3(0.0, -glyphBox.size.y, 0.0);
 		const float32_t aspectRatio = static_cast<float32_t>(glm::length(dirV) / glm::length(dirU)); // check if you can just do: (glyphBox.size.y * scale.y) / glyphBox.size.x * scale.x)
 		const float32_t2 minUV = face->getUV(float32_t2(0.0f,0.0f), glyphBox.size, drawResourcesFiller.getMSDFResolution(), MSDFPixelRange);
-		drawResourcesFiller.drawFontGlyph(face, glyphBox.glyphIdx, topLeft, dirU, aspectRatio, minUV, glyphObjectIdx, intendedNextSubmit);
+		drawResourcesFiller.drawFontGlyph(face, glyphBox.glyphIdx, topLeft, dirU, aspectRatio, minUV, intendedNextSubmit);
 	}
 
+	drawResourcesFiller.endMainObject();
 }
\ No newline at end of file
diff --git a/62_CAD/main.cpp b/62_CAD/main.cpp
index 5f9e88694..c11108599 100644
--- a/62_CAD/main.cpp
+++ b/62_CAD/main.cpp
@@ -75,7 +75,7 @@ constexpr std::array<float, (uint32_t)ExampleMode::CASE_COUNT> cameraExtents =
 	600.0	// CASE_9
 };
 
-constexpr ExampleMode mode = ExampleMode::CASE_4;
+constexpr ExampleMode mode = ExampleMode::CASE_6;
 
 class Camera2D
 {
@@ -3022,15 +3022,6 @@ class ComputerAidedDesign final : public examples::SimpleWindowedApplication, pu
 				auto penY = -500.0;
 				auto previous = 0;
 
-				uint32_t glyphObjectIdx;
-				{
-					LineStyleInfo lineStyle = {};
-					lineStyle.color = float32_t4(1.0, 1.0, 1.0, 1.0);
-					const uint32_t styleIdx = drawResourcesFiller.addLineStyle_SubmitIfNeeded(lineStyle, intendedNextSubmit);
-
-					glyphObjectIdx = drawResourcesFiller.addMainObject_SubmitIfNeeded(styleIdx, InvalidDTMSettingsIdx, intendedNextSubmit);
-				}
-
 				float64_t2 currentBaselineStart = float64_t2(0.0, 0.0);
 				float64_t scale = 1.0 / 64.0;
 
diff --git a/62_CAD/shaders/globals.hlsl b/62_CAD/shaders/globals.hlsl
index 562f523e6..b7686684d 100644
--- a/62_CAD/shaders/globals.hlsl
+++ b/62_CAD/shaders/globals.hlsl
@@ -113,6 +113,16 @@ pfloat64_t2 transformVectorNdc(NBL_CONST_REF_ARG(pfloat64_t3x3) transformation,
 }
 #endif
 
+enum class MainObjectType : uint32_t
+{
+    NONE = 0u,
+    POLYLINE,
+    HATCH,
+    TEXT,
+    IMAGE,
+    DTM,
+};
+
 enum class ObjectType : uint32_t
 {
     LINE = 0u,
@@ -282,7 +292,7 @@ NBL_CONSTEXPR uint32_t InvalidRigidSegmentIndex = 0xffffffff;
 NBL_CONSTEXPR float InvalidStyleStretchValue = nbl::hlsl::numeric_limits<float>::infinity;
 
 
-// TODO[Przemek]: we will need something similar to LineStyles but related to heigh shading settings which is user customizable (like LineStyle stipple patterns) and requires upper_bound to figure out the color based on height value.
+// TODO[Przemek]: we will need something similar to LineStyles but related to heigh shading settings which is user customizable (like  stipple patterns) and requires upper_bound to figure out the color based on height value.
 // We'll discuss that later or what it will be looking like and how it's gonna get passed to our shaders.
 
 struct TriangleMeshVertex

From e7b63ee06e9f002e55eb4654156263b013f429c1 Mon Sep 17 00:00:00 2001
From: Erfan Ahmadi <ahmadierfan99@gmail.com>
Date: Wed, 2 Apr 2025 14:20:03 +0330
Subject: [PATCH 137/529] small fix

---
 62_CAD/DrawResourcesFiller.cpp | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/62_CAD/DrawResourcesFiller.cpp b/62_CAD/DrawResourcesFiller.cpp
index 8ad13cb97..f2d850cd4 100644
--- a/62_CAD/DrawResourcesFiller.cpp
+++ b/62_CAD/DrawResourcesFiller.cpp
@@ -711,7 +711,7 @@ uint32_t DrawResourcesFiller::acquireActiveMainObjectIndex_SubmitIfNeeded(SInten
 	MainObject mainObject = {};
 	// These 3 calls below shouldn't need to Submit because we made sure there is enough memory for all of them.
 	// if something here triggers a auto-submit it's a possible bug, TODO: assert that somehow?
-	mainObject.styleIdx = (activeMainObjectType == MainObjectType::DTM) ? InvalidStyleIdx : acquireActiveDTMSettingsIndex_SubmitIfNeeded(intendedNextSubmit); // only call if it requirees dtm
+	mainObject.styleIdx = (activeMainObjectType == MainObjectType::DTM) ? InvalidStyleIdx : acquireActiveLineStyleIndex_SubmitIfNeeded(intendedNextSubmit); // only call if it requirees dtm
 	mainObject.dtmSettingsIdx = (activeMainObjectType == MainObjectType::DTM) ? acquireActiveDTMSettingsIndex_SubmitIfNeeded(intendedNextSubmit) : InvalidDTMSettingsIdx; // only call if it requirees dtm
 	mainObject.clipProjectionIndex = acquireActiveClipProjectionIndex_SubmitIfNeeded(intendedNextSubmit);
 	activeMainObjectIndex = resourcesCollection.mainObjects.addAndGetOffset(mainObject);

From 468dab1e9d25d186be08a79af333ff286da6257c Mon Sep 17 00:00:00 2001
From: Erfan Ahmadi <ahmadierfan99@gmail.com>
Date: Wed, 2 Apr 2025 14:21:49 +0330
Subject: [PATCH 138/529] another small fix

---
 62_CAD/DrawResourcesFiller.h | 1 +
 1 file changed, 1 insertion(+)

diff --git a/62_CAD/DrawResourcesFiller.h b/62_CAD/DrawResourcesFiller.h
index 60e7c923c..e88b032cd 100644
--- a/62_CAD/DrawResourcesFiller.h
+++ b/62_CAD/DrawResourcesFiller.h
@@ -312,6 +312,7 @@ struct DrawResourcesFiller
 	void resetMainObjects()
 	{
 		resourcesCollection.mainObjects.vector.clear();
+		activeMainObjectIndex = InvalidMainObjectIdx;
 	}
 
 	// these resources are data related to chunks of a whole mainObject

From bb793be88e46143ce29065e720e37f99306b4795 Mon Sep 17 00:00:00 2001
From: Erfan Ahmadi <ahmadierfan99@gmail.com>
Date: Wed, 2 Apr 2025 14:29:33 +0330
Subject: [PATCH 139/529] [temp] lower mem to test auto-submit

---
 62_CAD/DrawResourcesFiller.cpp | 2 +-
 62_CAD/main.cpp                | 2 +-
 2 files changed, 2 insertions(+), 2 deletions(-)

diff --git a/62_CAD/DrawResourcesFiller.cpp b/62_CAD/DrawResourcesFiller.cpp
index f2d850cd4..23b03f97f 100644
--- a/62_CAD/DrawResourcesFiller.cpp
+++ b/62_CAD/DrawResourcesFiller.cpp
@@ -19,7 +19,7 @@ void DrawResourcesFiller::allocateResourcesBuffer(ILogicalDevice* logicalDevice,
 {
 	size = core::alignUp(size, ResourcesMaxNaturalAlignment);
 	size = core::max(size, getMinimumRequiredResourcesBufferSize());
-	size = 512u;
+	size = 368u;
 	IGPUBuffer::SCreationParams geometryCreationParams = {};
 	geometryCreationParams.size = size;
 	geometryCreationParams.usage = bitflag(IGPUBuffer::EUF_SHADER_DEVICE_ADDRESS_BIT) | IGPUBuffer::EUF_TRANSFER_DST_BIT | IGPUBuffer::EUF_INDEX_BUFFER_BIT;
diff --git a/62_CAD/main.cpp b/62_CAD/main.cpp
index c11108599..cd673278c 100644
--- a/62_CAD/main.cpp
+++ b/62_CAD/main.cpp
@@ -75,7 +75,7 @@ constexpr std::array<float, (uint32_t)ExampleMode::CASE_COUNT> cameraExtents =
 	600.0	// CASE_9
 };
 
-constexpr ExampleMode mode = ExampleMode::CASE_6;
+constexpr ExampleMode mode = ExampleMode::CASE_8;
 
 class Camera2D
 {

From bcb5fa3e0dc9d05fc73efdd6dede89069b4bb3e9 Mon Sep 17 00:00:00 2001
From: Erfan Ahmadi <ahmadierfan99@gmail.com>
Date: Wed, 2 Apr 2025 15:39:18 +0330
Subject: [PATCH 140/529] fixed linestyle index fetching

---
 62_CAD/DrawResourcesFiller.cpp | 2 ++
 62_CAD/DrawResourcesFiller.h   | 1 +
 2 files changed, 3 insertions(+)

diff --git a/62_CAD/DrawResourcesFiller.cpp b/62_CAD/DrawResourcesFiller.cpp
index 23b03f97f..ce267fb92 100644
--- a/62_CAD/DrawResourcesFiller.cpp
+++ b/62_CAD/DrawResourcesFiller.cpp
@@ -341,11 +341,13 @@ bool DrawResourcesFiller::finalizeAllCopiesToGPU(SIntendedSubmitInfo& intendedNe
 void DrawResourcesFiller::setActiveLineStyle(const LineStyleInfo& lineStyle)
 {
 	activeLineStyle = lineStyle;
+	activeLineStyleIndex = InvalidStyleIdx;
 }
 
 void DrawResourcesFiller::setActiveDTMSettings(const DTMSettingsInfo& dtmSettings)
 {
 	activeDTMSettings = dtmSettings;
+	activeDTMSettingsIndex = InvalidDTMSettingsIdx;
 }
 
 void DrawResourcesFiller::beginMainObject(MainObjectType type)
diff --git a/62_CAD/DrawResourcesFiller.h b/62_CAD/DrawResourcesFiller.h
index e88b032cd..d6e3c7968 100644
--- a/62_CAD/DrawResourcesFiller.h
+++ b/62_CAD/DrawResourcesFiller.h
@@ -198,6 +198,7 @@ struct DrawResourcesFiller
 
 	bool finalizeAllCopiesToGPU(SIntendedSubmitInfo& intendedNextSubmit);
 
+	/// @brief  resets resources buffers
 	void reset()
 	{
 		resetDrawObjects();

From da76699d9356589008fa5190d3feeffd1d2c0a68 Mon Sep 17 00:00:00 2001
From: Erfan Ahmadi <ahmadierfan99@gmail.com>
Date: Wed, 2 Apr 2025 15:57:15 +0330
Subject: [PATCH 141/529] fixed acquireMainObjectIndex

---
 62_CAD/DrawResourcesFiller.cpp | 20 ++++++++++++++------
 1 file changed, 14 insertions(+), 6 deletions(-)

diff --git a/62_CAD/DrawResourcesFiller.cpp b/62_CAD/DrawResourcesFiller.cpp
index ce267fb92..3c808478d 100644
--- a/62_CAD/DrawResourcesFiller.cpp
+++ b/62_CAD/DrawResourcesFiller.cpp
@@ -19,7 +19,7 @@ void DrawResourcesFiller::allocateResourcesBuffer(ILogicalDevice* logicalDevice,
 {
 	size = core::alignUp(size, ResourcesMaxNaturalAlignment);
 	size = core::max(size, getMinimumRequiredResourcesBufferSize());
-	size = 368u;
+	// size = 368u;
 	IGPUBuffer::SCreationParams geometryCreationParams = {};
 	geometryCreationParams.size = size;
 	geometryCreationParams.usage = bitflag(IGPUBuffer::EUF_SHADER_DEVICE_ADDRESS_BIT) | IGPUBuffer::EUF_TRANSFER_DST_BIT | IGPUBuffer::EUF_INDEX_BUFFER_BIT;
@@ -693,11 +693,19 @@ uint32_t DrawResourcesFiller::acquireActiveMainObjectIndex_SubmitIfNeeded(SInten
 		return InvalidMainObjectIdx;
 	}
 
+	const bool needsLineStyle =
+		(activeMainObjectType == MainObjectType::POLYLINE) ||
+		(activeMainObjectType == MainObjectType::HATCH) ||
+		(activeMainObjectType == MainObjectType::TEXT);
+	const bool needsDTMSettings = (activeMainObjectType == MainObjectType::DTM);
+	const bool needsCustomClipProjection = (!activeClipProjectionIndices.empty());
+
 	const size_t remainingResourcesSize = calculateRemainingResourcesSize();
 	// making sure MainObject and everything it references fits into remaining resources mem
 	size_t memRequired = sizeof(MainObject);
-	memRequired += ((activeMainObjectType == MainObjectType::DTM) ? sizeof(DTMSettings) : sizeof(LineStyle)); // needing LineStyle or DTMSettings depends on mainObject type
-	memRequired += (activeClipProjectionIndices.empty()) ? 0u : sizeof(ClipProjectionData); // if there is custom clip projections, account for it
+	if (needsLineStyle) memRequired += sizeof(LineStyle);
+	if (needsDTMSettings) memRequired += sizeof(DTMSettings);
+	if (needsCustomClipProjection) memRequired += sizeof(ClipProjectionData);
 
 	const bool enoughMem = remainingResourcesSize >= memRequired; // enough remaining memory for 1 more dtm settings with 2 referenced line styles?
 	const bool needToOverflowSubmit = (!enoughMem) || (resourcesCollection.mainObjects.vector.size() >= MaxIndexableMainObjects);
@@ -713,9 +721,9 @@ uint32_t DrawResourcesFiller::acquireActiveMainObjectIndex_SubmitIfNeeded(SInten
 	MainObject mainObject = {};
 	// These 3 calls below shouldn't need to Submit because we made sure there is enough memory for all of them.
 	// if something here triggers a auto-submit it's a possible bug, TODO: assert that somehow?
-	mainObject.styleIdx = (activeMainObjectType == MainObjectType::DTM) ? InvalidStyleIdx : acquireActiveLineStyleIndex_SubmitIfNeeded(intendedNextSubmit); // only call if it requirees dtm
-	mainObject.dtmSettingsIdx = (activeMainObjectType == MainObjectType::DTM) ? acquireActiveDTMSettingsIndex_SubmitIfNeeded(intendedNextSubmit) : InvalidDTMSettingsIdx; // only call if it requirees dtm
-	mainObject.clipProjectionIndex = acquireActiveClipProjectionIndex_SubmitIfNeeded(intendedNextSubmit);
+	mainObject.styleIdx = (needsLineStyle) ? acquireActiveLineStyleIndex_SubmitIfNeeded(intendedNextSubmit) : InvalidStyleIdx;
+	mainObject.dtmSettingsIdx = (needsDTMSettings) ? acquireActiveDTMSettingsIndex_SubmitIfNeeded(intendedNextSubmit) : InvalidDTMSettingsIdx;
+	mainObject.clipProjectionIndex = (needsCustomClipProjection) ? acquireActiveClipProjectionIndex_SubmitIfNeeded(intendedNextSubmit) : InvalidClipProjectionIndex;
 	activeMainObjectIndex = resourcesCollection.mainObjects.addAndGetOffset(mainObject);
 	return activeMainObjectIndex;
 }

From ed920b0dfa561146782c6434ebacc9afa2644843 Mon Sep 17 00:00:00 2001
From: Erfan Ahmadi <ahmadierfan99@gmail.com>
Date: Wed, 2 Apr 2025 16:15:50 +0330
Subject: [PATCH 142/529] Fix DTM Rendering

---
 62_CAD/DrawResourcesFiller.cpp                    | 5 ++++-
 62_CAD/main.cpp                                   | 3 +--
 62_CAD/shaders/main_pipeline/fragment_shader.hlsl | 5 +++--
 62_CAD/shaders/main_pipeline/vertex_shader.hlsl   | 5 +++--
 4 files changed, 11 insertions(+), 7 deletions(-)

diff --git a/62_CAD/DrawResourcesFiller.cpp b/62_CAD/DrawResourcesFiller.cpp
index 3c808478d..09651eef2 100644
--- a/62_CAD/DrawResourcesFiller.cpp
+++ b/62_CAD/DrawResourcesFiller.cpp
@@ -137,6 +137,8 @@ void DrawResourcesFiller::drawPolyline(const CPolylineBase& polyline, SIntendedS
 void DrawResourcesFiller::drawTriangleMesh(const CTriangleMesh& mesh, CTriangleMesh::DrawData& drawData, const DTMSettingsInfo& dtmSettingsInfo, SIntendedSubmitInfo& intendedNextSubmit)
 {
 	setActiveDTMSettings(dtmSettingsInfo);
+	beginMainObject(MainObjectType::DTM);
+
 	uint32_t mainObjectIdx = acquireActiveMainObjectIndex_SubmitIfNeeded(intendedNextSubmit);
 	drawData.pushConstants.triangleMeshMainObjectIndex = mainObjectIdx;
 
@@ -172,6 +174,7 @@ void DrawResourcesFiller::drawTriangleMesh(const CTriangleMesh& mesh, CTriangleM
 	}
 
 	drawData.indexCount = mesh.getIndexCount();
+	endMainObject();
 }
 
 // TODO[Erfan]: Makes more sense if parameters are: solidColor + fillPattern + patternColor
@@ -653,7 +656,7 @@ uint32_t DrawResourcesFiller::addDTMSettings_Internal(const DTMSettingsInfo& dtm
 			return i;
 	}
 	
-	resourcesCollection.dtmSettings.addAndGetOffset(dtmSettings); // this will implicitly increase total resource consumption and reduce remaining size --> no need for mem size trackers
+	return resourcesCollection.dtmSettings.addAndGetOffset(dtmSettings); // this will implicitly increase total resource consumption and reduce remaining size --> no need for mem size trackers
 }
 
 uint32_t DrawResourcesFiller::acquireActiveLineStyleIndex_SubmitIfNeeded(SIntendedSubmitInfo& intendedNextSubmit)
diff --git a/62_CAD/main.cpp b/62_CAD/main.cpp
index cd673278c..4797aa281 100644
--- a/62_CAD/main.cpp
+++ b/62_CAD/main.cpp
@@ -75,7 +75,7 @@ constexpr std::array<float, (uint32_t)ExampleMode::CASE_COUNT> cameraExtents =
 	600.0	// CASE_9
 };
 
-constexpr ExampleMode mode = ExampleMode::CASE_8;
+constexpr ExampleMode mode = ExampleMode::CASE_9;
 
 class Camera2D
 {
@@ -1429,7 +1429,6 @@ class ComputerAidedDesign final : public examples::SimpleWindowedApplication, pu
 	
 	void addObjects(SIntendedSubmitInfo& intendedNextSubmit)
 	{
-		
 		// TODO[Przemek]: add your own case, you won't call any other drawResourcesFiller function, only drawMesh with your custom made Mesh (for start it can be a single triangle)
 
 		// we record upload of our objects and if we failed to allocate we submit everything
diff --git a/62_CAD/shaders/main_pipeline/fragment_shader.hlsl b/62_CAD/shaders/main_pipeline/fragment_shader.hlsl
index 4852d0522..3e583600f 100644
--- a/62_CAD/shaders/main_pipeline/fragment_shader.hlsl
+++ b/62_CAD/shaders/main_pipeline/fragment_shader.hlsl
@@ -425,14 +425,15 @@ float4 fragMain(PSInput input) : SV_TARGET
     ObjectType objType = input.getObjType();
     const uint32_t currentMainObjectIdx = input.getMainObjectIdx();
     const MainObject mainObj = loadMainObject(currentMainObjectIdx);
-
+    
+#define DTM
 #ifdef DTM
     // TRIANGLE RENDERING
     {
         const float outlineThickness = input.getOutlineThickness();
         const float contourThickness = input.getContourLineThickness();
         const float phaseShift = 0.0f; // input.getCurrentPhaseShift();
-        const float stretch = 1.0f; // TODO: figure out what is it for
+        const float stretch = 1.0f; // TODO: figure out what is it for ---> [ERFAN's REPLY: no need to give shit about this in dtms, it's for special shape styles] 
         const float worldToScreenRatio = input.getCurrentWorldToScreenRatio();
 
         DTMSettings dtm = loadDTMSettings(mainObj.dtmSettingsIdx);
diff --git a/62_CAD/shaders/main_pipeline/vertex_shader.hlsl b/62_CAD/shaders/main_pipeline/vertex_shader.hlsl
index b62cbe543..a808a459d 100644
--- a/62_CAD/shaders/main_pipeline/vertex_shader.hlsl
+++ b/62_CAD/shaders/main_pipeline/vertex_shader.hlsl
@@ -84,7 +84,8 @@ PSInput main(uint vertexID : SV_VertexID)
     // your programmable pulling will use the baseVertexBufferAddress BDA address and `vertexID` to RawBufferLoad it's vertex. 
     // ~~Later, most likely We will require pulling all 3 vertices of the triangle, that's where you need to know which triangle you're currently on, and instead of objectID = vertexID/4 which we currently do, you will do vertexID/3 and pull all 3 of it's vertices.~~
     // Ok, brainfart, a vertex can belong to multiple triangles, I was thinking of AA but triangles share vertices, nevermind my comment above.
-
+    
+#define DTM
 #ifdef DTM
     PSInput outV;
 
@@ -119,7 +120,7 @@ PSInput main(uint vertexID : SV_VertexID)
     );
 
     // TODO: line style of contour line has to be set too!
-    DTMSettings dtm = dtmSettings[mainObj.dtmSettingsIdx];
+    DTMSettings dtm = loadDTMSettings(mainObj.dtmSettingsIdx);
     LineStyle outlineStyle = loadLineStyle(dtm.outlineLineStyleIdx);
     LineStyle contourStyle = loadLineStyle(dtm.contourLineStyleIdx);
     const float screenSpaceOutlineWidth = outlineStyle.screenSpaceLineWidth + _static_cast<float>(_static_cast<pfloat64_t>(outlineStyle.worldSpaceLineWidth) * globals.screenToWorldRatio);

From 9d23afd6ee1388459ec5793d6ddbb790b63ee6de Mon Sep 17 00:00:00 2001
From: Erfan Ahmadi <ahmadierfan99@gmail.com>
Date: Thu, 3 Apr 2025 09:53:08 +0330
Subject: [PATCH 143/529] Fixed auto-submission bug with self-blending in a
 beautiful and simple way :)

---
 62_CAD/DrawResourcesFiller.cpp                |  4 +-
 62_CAD/DrawResourcesFiller.h                  |  4 +-
 62_CAD/main.cpp                               | 17 ++++----
 62_CAD/shaders/globals.hlsl                   |  5 ++-
 .../main_pipeline/fragment_shader.hlsl        |  2 +-
 .../shaders/main_pipeline/resolve_alphas.hlsl | 42 +++++++++++++------
 .../shaders/main_pipeline/vertex_shader.hlsl  |  2 +-
 7 files changed, 49 insertions(+), 27 deletions(-)

diff --git a/62_CAD/DrawResourcesFiller.cpp b/62_CAD/DrawResourcesFiller.cpp
index 09651eef2..ee9aca985 100644
--- a/62_CAD/DrawResourcesFiller.cpp
+++ b/62_CAD/DrawResourcesFiller.cpp
@@ -19,7 +19,7 @@ void DrawResourcesFiller::allocateResourcesBuffer(ILogicalDevice* logicalDevice,
 {
 	size = core::alignUp(size, ResourcesMaxNaturalAlignment);
 	size = core::max(size, getMinimumRequiredResourcesBufferSize());
-	// size = 368u;
+	size = 368u;
 	IGPUBuffer::SCreationParams geometryCreationParams = {};
 	geometryCreationParams.size = size;
 	geometryCreationParams.usage = bitflag(IGPUBuffer::EUF_SHADER_DEVICE_ADDRESS_BIT) | IGPUBuffer::EUF_TRANSFER_DST_BIT | IGPUBuffer::EUF_INDEX_BUFFER_BIT;
@@ -593,7 +593,7 @@ void DrawResourcesFiller::submitCurrentDrawObjectsAndReset(SIntendedSubmitInfo&
 	finalizeAllCopiesToGPU(intendedNextSubmit);
 	submitDraws(intendedNextSubmit);
 	reset(); // resets everything, things referenced through mainObj and other shit will be pushed again through acquireXXX_SubmitIfNeeded
-	mainObjectIndex = acquireActiveMainObjectIndex_SubmitIfNeeded(intendedNextSubmit);
+	mainObjectIndex = acquireActiveMainObjectIndex_SubmitIfNeeded(intendedNextSubmit); // it will be 0 because it's first mainObjectIndex after reset and invalidation
 }
 
 uint32_t DrawResourcesFiller::addLineStyle_Internal(const LineStyleInfo& lineStyleInfo)
diff --git a/62_CAD/DrawResourcesFiller.h b/62_CAD/DrawResourcesFiller.h
index d6e3c7968..8ee4ff1b5 100644
--- a/62_CAD/DrawResourcesFiller.h
+++ b/62_CAD/DrawResourcesFiller.h
@@ -239,6 +239,9 @@ struct DrawResourcesFiller
 		return msdfTextureArray->getCreationParameters().image->getCreationParameters().mipLevels;
 	}
 
+	/// For advanced use only, (passed to shaders for them to know if we overflow-submitted in the middle if a main obj
+	uint32_t getActiveMainObjectIndex() const { return activeMainObjectIndex; }
+
 protected:
 	
 	struct MSDFTextureCopy
@@ -255,7 +258,6 @@ struct DrawResourcesFiller
 
 	const size_t calculateRemainingResourcesSize() const;
 
-	// TODO: Find better name for function
 	/// @brief Internal Function to call whenever we overflow when we can't fill all of mainObject's drawObjects
 	/// @param intendedNextSubmit 
 	/// @param mainObjectIndex: function updates mainObjectIndex after submitting, clearing everything and acquiring  mainObjectIndex again.
diff --git a/62_CAD/main.cpp b/62_CAD/main.cpp
index 4797aa281..de639f8ba 100644
--- a/62_CAD/main.cpp
+++ b/62_CAD/main.cpp
@@ -75,7 +75,7 @@ constexpr std::array<float, (uint32_t)ExampleMode::CASE_COUNT> cameraExtents =
 	600.0	// CASE_9
 };
 
-constexpr ExampleMode mode = ExampleMode::CASE_9;
+constexpr ExampleMode mode = ExampleMode::CASE_4;
 
 class Camera2D
 {
@@ -1213,6 +1213,7 @@ class ComputerAidedDesign final : public examples::SimpleWindowedApplication, pu
 		globalData.screenToWorldRatio = screenToWorld;
 		globalData.worldToScreenRatio = (1.0/screenToWorld);
 		globalData.miterLimit = 10.0f;
+		globalData.currentlyActiveMainObjectIndex = drawResourcesFiller.getActiveMainObjectIndex();
 		SBufferRange<IGPUBuffer> globalBufferUpdateRange = { .offset = 0ull, .size = sizeof(Globals), .buffer = m_globalsBuffer.get() };
 		bool updateSuccess = cb->updateBuffer(globalBufferUpdateRange, &globalData);
 		assert(updateSuccess);
@@ -1883,8 +1884,8 @@ class ComputerAidedDesign final : public examples::SimpleWindowedApplication, pu
 
 			LineStyleInfo style = {};
 			style.screenSpaceLineWidth = 4.0f;
-			style.worldSpaceLineWidth = 0.0f;
-			style.color = float32_t4(0.7f, 0.3f, 0.1f, 0.5f);
+			style.worldSpaceLineWidth = 2.0f;
+			style.color = float32_t4(0.7f, 0.3f, 0.1f, 0.1f);
 
 			LineStyleInfo style2 = {};
 			style2.screenSpaceLineWidth = 2.0f;
@@ -1957,7 +1958,7 @@ class ComputerAidedDesign final : public examples::SimpleWindowedApplication, pu
 						myCurve.majorAxis = { -10.0, 5.0 };
 						myCurve.center = { 0, -5.0 };
 						myCurve.angleBounds = {
-							nbl::core::PI<double>() * 2.0,
+							nbl::core::PI<double>() * 1.0,
 							nbl::core::PI<double>() * 0.0
 							};
 						myCurve.eccentricity = 1.0;
@@ -1985,10 +1986,10 @@ class ComputerAidedDesign final : public examples::SimpleWindowedApplication, pu
 			}
 
 			drawResourcesFiller.drawPolyline(originalPolyline, style, intendedNextSubmit);
-			//CPolyline offsettedPolyline = originalPolyline.generateParallelPolyline(+0.0 - 3.0 * abs(cos(m_timeElapsed * 0.0009)));
-			//CPolyline offsettedPolyline2 = originalPolyline.generateParallelPolyline(+0.0 + 3.0 * abs(cos(m_timeElapsed * 0.0009)));
-			//drawResourcesFiller.drawPolyline(offsettedPolyline, style2, intendedNextSubmit);
-			//drawResourcesFiller.drawPolyline(offsettedPolyline2, style2, intendedNextSubmit);
+			CPolyline offsettedPolyline = originalPolyline.generateParallelPolyline(+0.0 - 3.0 * abs(cos(10.0 * 0.0009)));
+			CPolyline offsettedPolyline2 = originalPolyline.generateParallelPolyline(+0.0 + 3.0 * abs(cos(10.0 * 0.0009)));
+			drawResourcesFiller.drawPolyline(offsettedPolyline, style2, intendedNextSubmit);
+			drawResourcesFiller.drawPolyline(offsettedPolyline2, style2, intendedNextSubmit);
 		}
 		else if (mode == ExampleMode::CASE_4)
 		{
diff --git a/62_CAD/shaders/globals.hlsl b/62_CAD/shaders/globals.hlsl
index b7686684d..319c30b3d 100644
--- a/62_CAD/shaders/globals.hlsl
+++ b/62_CAD/shaders/globals.hlsl
@@ -74,8 +74,9 @@ struct Globals
     pfloat64_t worldToScreenRatio;
     uint32_t2 resolution;
     float antiAliasingFactor;
-    float miterLimit;
-    float32_t2 _padding;
+    uint32_t miterLimit;
+    uint32_t currentlyActiveMainObjectIndex; // for alpha resolve to skip resolving activeMainObjectIdx and prep it for next submit
+    float32_t _padding;
 };
 #ifndef __HLSL_VERSION
 static_assert(sizeof(Globals) == 176u);
diff --git a/62_CAD/shaders/main_pipeline/fragment_shader.hlsl b/62_CAD/shaders/main_pipeline/fragment_shader.hlsl
index 3e583600f..4716f0a66 100644
--- a/62_CAD/shaders/main_pipeline/fragment_shader.hlsl
+++ b/62_CAD/shaders/main_pipeline/fragment_shader.hlsl
@@ -426,7 +426,7 @@ float4 fragMain(PSInput input) : SV_TARGET
     const uint32_t currentMainObjectIdx = input.getMainObjectIdx();
     const MainObject mainObj = loadMainObject(currentMainObjectIdx);
     
-#define DTM
+//#define DTM
 #ifdef DTM
     // TRIANGLE RENDERING
     {
diff --git a/62_CAD/shaders/main_pipeline/resolve_alphas.hlsl b/62_CAD/shaders/main_pipeline/resolve_alphas.hlsl
index c75c86825..987dd7c29 100644
--- a/62_CAD/shaders/main_pipeline/resolve_alphas.hlsl
+++ b/62_CAD/shaders/main_pipeline/resolve_alphas.hlsl
@@ -16,26 +16,44 @@ template<>
 float32_t4 calculateFinalColor<true>(const uint2 fragCoord)
 {
     float32_t4 color;
-    
-    nbl::hlsl::spirv::beginInvocationInterlockEXT();
 
+    nbl::hlsl::spirv::beginInvocationInterlockEXT();
+    
+    bool resolve = false;
+    uint32_t toResolveStyleIdx = InvalidStyleIdx;
     const uint32_t packedData = pseudoStencil[fragCoord];
     const uint32_t storedQuantizedAlpha = nbl::hlsl::glsl::bitfieldExtract<uint32_t>(packedData,0,AlphaBits);
     const uint32_t storedMainObjectIdx = nbl::hlsl::glsl::bitfieldExtract<uint32_t>(packedData,AlphaBits,MainObjectIdxBits);
-    pseudoStencil[fragCoord] = nbl::hlsl::glsl::bitfieldInsert<uint32_t>(0, InvalidMainObjectIdx, AlphaBits, MainObjectIdxBits);
 
-    // if geomID has changed, we resolve the SDF alpha (draw using blend), else accumulate
-    const bool resolve = storedMainObjectIdx != InvalidMainObjectIdx;
-    uint32_t toResolveStyleIdx = InvalidStyleIdx;
+    const bool currentlyActiveMainObj = (storedMainObjectIdx == globals.currentlyActiveMainObjectIndex);
+    if (!currentlyActiveMainObj)
+    {
+        // Normal Scenario, this branch will always be taken if there is no overflow submit in the middle of an active mainObject
+        //we do the final resolve of the pixel and invalidate the pseudo-stencil
+        pseudoStencil[fragCoord] = nbl::hlsl::glsl::bitfieldInsert<uint32_t>(0, InvalidMainObjectIdx, AlphaBits, MainObjectIdxBits);
+        
+        // if geomID has changed, we resolve the SDF alpha (draw using blend), else accumulate
+        resolve = storedMainObjectIdx != InvalidMainObjectIdx;
     
-    // load from colorStorage only if we want to resolve color from texture instead of style
-    // sampling from colorStorage needs to happen in critical section because another fragment may also want to store into it at the same time + need to happen before store
-    if (resolve)
+        // load from colorStorage only if we want to resolve color from texture instead of style
+        // sampling from colorStorage needs to happen in critical section because another fragment may also want to store into it at the same time + need to happen before store
+        if (resolve)
+        {
+            toResolveStyleIdx = loadMainObject(storedMainObjectIdx).styleIdx;
+            if (toResolveStyleIdx == InvalidStyleIdx) // if style idx to resolve is invalid, then it means we should resolve from color
+                color = float32_t4(unpackR11G11B10_UNORM(colorStorage[fragCoord]), 1.0f);
+        }
+    }
+    else if (globals.currentlyActiveMainObjectIndex != InvalidMainObjectIdx)
     {
-        toResolveStyleIdx = loadMainObject(storedMainObjectIdx).styleIdx;
-        if (toResolveStyleIdx == InvalidStyleIdx) // if style idx to resolve is invalid, then it means we should resolve from color
-            color = float32_t4(unpackR11G11B10_UNORM(colorStorage[fragCoord]), 1.0f);
+        // Being here means there was an overflow submit in the middle of an active main objejct
+        // We don't want to resolve the active mainObj, because it needs to fully resolved later when the mainObject  actually finishes.
+        // We change the active main object index in our pseudo-stencil to 0u, because that will be it's new index in the next submit.
+        uint32_t newMainObjectIdx = 0u;
+        pseudoStencil[fragCoord] = nbl::hlsl::glsl::bitfieldInsert<uint32_t>(storedQuantizedAlpha, newMainObjectIdx, AlphaBits, MainObjectIdxBits);
+        resolve = false; // just to re-iterate that we don't want to resolve this.
     }
+    
 
     nbl::hlsl::spirv::endInvocationInterlockEXT();
 
diff --git a/62_CAD/shaders/main_pipeline/vertex_shader.hlsl b/62_CAD/shaders/main_pipeline/vertex_shader.hlsl
index a808a459d..d45eac46f 100644
--- a/62_CAD/shaders/main_pipeline/vertex_shader.hlsl
+++ b/62_CAD/shaders/main_pipeline/vertex_shader.hlsl
@@ -85,7 +85,7 @@ PSInput main(uint vertexID : SV_VertexID)
     // ~~Later, most likely We will require pulling all 3 vertices of the triangle, that's where you need to know which triangle you're currently on, and instead of objectID = vertexID/4 which we currently do, you will do vertexID/3 and pull all 3 of it's vertices.~~
     // Ok, brainfart, a vertex can belong to multiple triangles, I was thinking of AA but triangles share vertices, nevermind my comment above.
     
-#define DTM
+//#define DTM
 #ifdef DTM
     PSInput outV;
 

From 126afa88f659e40f721a8ffe265703e0c5ea06f7 Mon Sep 17 00:00:00 2001
From: Erfan Ahmadi <ahmadierfan99@gmail.com>
Date: Thu, 3 Apr 2025 10:14:49 +0330
Subject: [PATCH 144/529] updates to comments regarding auto-submit

---
 62_CAD/DrawResourcesFiller.cpp | 26 ++++++++------------------
 62_CAD/DrawResourcesFiller.h   | 13 +++++++------
 62_CAD/main.cpp                |  2 +-
 3 files changed, 16 insertions(+), 25 deletions(-)

diff --git a/62_CAD/DrawResourcesFiller.cpp b/62_CAD/DrawResourcesFiller.cpp
index ee9aca985..a6d975f5c 100644
--- a/62_CAD/DrawResourcesFiller.cpp
+++ b/62_CAD/DrawResourcesFiller.cpp
@@ -19,7 +19,7 @@ void DrawResourcesFiller::allocateResourcesBuffer(ILogicalDevice* logicalDevice,
 {
 	size = core::alignUp(size, ResourcesMaxNaturalAlignment);
 	size = core::max(size, getMinimumRequiredResourcesBufferSize());
-	size = 368u;
+	// size = 368u; STRESS TEST
 	IGPUBuffer::SCreationParams geometryCreationParams = {};
 	geometryCreationParams.size = size;
 	geometryCreationParams.usage = bitflag(IGPUBuffer::EUF_SHADER_DEVICE_ADDRESS_BIT) | IGPUBuffer::EUF_TRANSFER_DST_BIT | IGPUBuffer::EUF_INDEX_BUFFER_BIT;
@@ -185,9 +185,7 @@ void DrawResourcesFiller::drawHatch(
 		const HatchFillPattern fillPattern,
 		SIntendedSubmitInfo& intendedNextSubmit)
 {
-	// TODO[Optimization Idea]: don't draw hatch twice if both colors are visible: instead do the msdf inside the alpha resolve by detecting mainObj being a hatch
-	// https://discord.com/channels/593902898015109131/856835291712716820/1228337893366300743
-	// TODO: Come back to this idea when doing color resolve for ecws (they don't have mainObj/style Index, instead they have uv into a texture	
+	// TODO[Optimization Idea]: don't draw hatch twice, we now have color storage buffer and we can treat rendering hatches like a procedural texture (requires 2 colors so no more abusing of linestyle for hatches)
 
 	// if backgroundColor is visible
 	drawHatch(hatch, backgroundColor, intendedNextSubmit);
@@ -602,8 +600,7 @@ uint32_t DrawResourcesFiller::addLineStyle_Internal(const LineStyleInfo& lineSty
 	const bool enoughMem = remainingResourcesSize >= sizeof(LineStyle); // enough remaining memory for 1 more linestyle?
 	if (!enoughMem)
 		return InvalidStyleIdx;
-	// TODO: Additionally constraint by a max size? and return InvalidIdx if it would exceed
-
+	// TODO: Maybe constraint by a max size? and return InvalidIdx if it would exceed
 
 	LineStyle gpuLineStyle = lineStyleInfo.getAsGPUData();
 	_NBL_DEBUG_BREAK_IF(gpuLineStyle.stipplePatternSize > LineStyle::StipplePatternMaxSize); // Oops, even after style normalization the style is too long to be in gpu mem :(
@@ -625,7 +622,7 @@ uint32_t DrawResourcesFiller::addDTMSettings_Internal(const DTMSettingsInfo& dtm
 
 	if (!enoughMem)
 		return InvalidDTMSettingsIdx;
-	// TODO: Additionally constraint by a max size? and return InvalidIdx if it would exceed
+	// TODO: Maybe constraint by a max size? and return InvalidIdx if it would exceed
 
 	DTMSettings dtmSettings;
 	dtmSettings.contourLinesStartHeight = dtmSettingsInfo.contourLinesStartHeight;
@@ -723,7 +720,7 @@ uint32_t DrawResourcesFiller::acquireActiveMainObjectIndex_SubmitIfNeeded(SInten
 	
 	MainObject mainObject = {};
 	// These 3 calls below shouldn't need to Submit because we made sure there is enough memory for all of them.
-	// if something here triggers a auto-submit it's a possible bug, TODO: assert that somehow?
+	// if something here triggers a auto-submit it's a possible bug with calculating `memRequired` above, TODO: assert that somehow?
 	mainObject.styleIdx = (needsLineStyle) ? acquireActiveLineStyleIndex_SubmitIfNeeded(intendedNextSubmit) : InvalidStyleIdx;
 	mainObject.dtmSettingsIdx = (needsDTMSettings) ? acquireActiveDTMSettingsIndex_SubmitIfNeeded(intendedNextSubmit) : InvalidDTMSettingsIdx;
 	mainObject.clipProjectionIndex = (needsCustomClipProjection) ? acquireActiveClipProjectionIndex_SubmitIfNeeded(intendedNextSubmit) : InvalidClipProjectionIndex;
@@ -739,12 +736,7 @@ uint32_t DrawResourcesFiller::addLineStyle_SubmitIfNeeded(const LineStyleInfo& l
 		// There wasn't enough resource memory remaining to fit a single LineStyle
 		finalizeAllCopiesToGPU(intendedNextSubmit);
 		submitDraws(intendedNextSubmit);
-		
-		// resets itself
-		resetLineStyles();
-		// resets higher level resources
-		resetMainObjects();
-		resetDrawObjects();
+		reset(); // resets everything! be careful!
 
 		outLineStyleIdx = addLineStyle_Internal(lineStyle);
 		assert(outLineStyleIdx != InvalidStyleIdx);
@@ -762,8 +754,7 @@ uint32_t DrawResourcesFiller::addDTMSettings_SubmitIfNeeded(const DTMSettingsInf
 		// There wasn't enough resource memory remaining to fit dtmsettings struct + 2 linestyles structs.
 		finalizeAllCopiesToGPU(intendedNextSubmit);
 		submitDraws(intendedNextSubmit);
-		// resets everything! be careful!
-		reset();
+		reset(); // resets everything! be careful!
 
 		outDTMSettingIdx = addDTMSettings_Internal(dtmSettings, intendedNextSubmit);
 		assert(outDTMSettingIdx != InvalidDTMSettingsIdx);
@@ -781,8 +772,7 @@ uint32_t DrawResourcesFiller::addClipProjectionData_SubmitIfNeeded(const ClipPro
 	{
 		finalizeAllCopiesToGPU(intendedNextSubmit);
 		submitDraws(intendedNextSubmit);
-		// resets everything! be careful!
-		reset();
+		reset(); // resets everything! be careful!
 	}
 	
 	resourcesCollection.clipProjections.vector.push_back(clipProjectionData); // this will implicitly increase total resource consumption and reduce remaining size --> no need for mem size trackers
diff --git a/62_CAD/DrawResourcesFiller.h b/62_CAD/DrawResourcesFiller.h
index 8ee4ff1b5..03482320e 100644
--- a/62_CAD/DrawResourcesFiller.h
+++ b/62_CAD/DrawResourcesFiller.h
@@ -67,10 +67,8 @@ struct DrawResourcesFiller
 		}
 
 		/// @brief increases size of general-purpose resources that hold bytes
-		/// @param additionalSize
 		/// @param alignment: Alignment of the pointer returned to be filled, should be PoT and <= ResourcesMaxNaturalAlignment, only use this if storing raw bytes in vector
 		/// @return pointer to start of the data to be filled, up to additional size
-		// TODO: make sure t is 1 byte with templates.
 		size_t increaseSizeAndGetOffset(size_t additionalSize, size_t alignment) 
 		{
 			assert(core::isPoT(alignment) && alignment <= ResourcesMaxNaturalAlignment);
@@ -92,7 +90,6 @@ struct DrawResourcesFiller
 	struct ResourcesCollection
 	{
 		// auto-submission level 0 resources (settings that mainObj references)
-		// Not enough VRAM available to serve adding one of the level 0 resources: they clear themselves and everything from higher levels after doing submission
 		CPUGeneratedResource<LineStyle> lineStyles;
 		CPUGeneratedResource<DTMSettings> dtmSettings;
 		CPUGeneratedResource<ClipProjectionData> clipProjections;
@@ -102,11 +99,11 @@ struct DrawResourcesFiller
 
 		// auto-submission level 2 buffers
 		CPUGeneratedResource<DrawObject> drawObjects;
-		CPUGeneratedResource<uint32_t> indexBuffer; // this is going to change to ReservedComputeResource where index buffer gets filled by compute shaders
-		CPUGeneratedResource<uint8_t> geometryInfo; // general purpose byte buffer for custom geometries, etc
+		CPUGeneratedResource<uint32_t> indexBuffer; // TODO: this is going to change to ReservedComputeResource where index buffer gets filled by compute shaders
+		CPUGeneratedResource<uint8_t> geometryInfo; // general purpose byte buffer for custom data for geometries (eg. line points, bezier definitions, aabbs)
 
 		// Get Total memory consumption, If all ResourcesCollection get packed together with ResourcesMaxNaturalAlignment
-		// used to decide when to overflow
+		// used to decide the remaining memory and when to overflow
 		size_t calculateTotalConsumption() const
 		{
 			return
@@ -154,6 +151,7 @@ struct DrawResourcesFiller
 	//! this function fills buffers required for drawing a polyline and submits a draw through provided callback when there is not enough memory.
 	void drawPolyline(const CPolylineBase& polyline, const LineStyleInfo& lineStyleInfo, SIntendedSubmitInfo& intendedNextSubmit);
 
+	/// Use this in a begin/endMainObject scope when you want to draw different polylines that should essentially be a single main object (no self-blending between components of a single main object)
 	/// WARNING: make sure this function  is called within begin/endMainObject scope
 	void drawPolyline(const CPolylineBase& polyline, SIntendedSubmitInfo& intendedNextSubmit);
 	
@@ -180,6 +178,7 @@ struct DrawResourcesFiller
 		const float32_t4& color,
 		SIntendedSubmitInfo& intendedNextSubmit);
 	
+	/// Used by SingleLineText, Issue drawing a font glyph
 	/// WARNING: make sure this function  is called within begin/endMainObject scope
 	void drawFontGlyph(
 		nbl::ext::TextRendering::FontFace* fontFace,
@@ -196,6 +195,8 @@ struct DrawResourcesFiller
 		float32_t rotation,
 		SIntendedSubmitInfo& intendedNextSubmit);
 
+	/// @brief call this function before submitting to ensure all resources are copied
+	/// records copy command into intendedNextSubmit's active command buffer and might possibly submits if fails allocation on staging upload memory.
 	bool finalizeAllCopiesToGPU(SIntendedSubmitInfo& intendedNextSubmit);
 
 	/// @brief  resets resources buffers
diff --git a/62_CAD/main.cpp b/62_CAD/main.cpp
index de639f8ba..791c8fc04 100644
--- a/62_CAD/main.cpp
+++ b/62_CAD/main.cpp
@@ -75,7 +75,7 @@ constexpr std::array<float, (uint32_t)ExampleMode::CASE_COUNT> cameraExtents =
 	600.0	// CASE_9
 };
 
-constexpr ExampleMode mode = ExampleMode::CASE_4;
+constexpr ExampleMode mode = ExampleMode::CASE_6;
 
 class Camera2D
 {

From 6830c089c9e6ecd7d84e5526a0112f750660b554 Mon Sep 17 00:00:00 2001
From: Erfan Ahmadi <ahmadierfan99@gmail.com>
Date: Thu, 3 Apr 2025 10:19:52 +0330
Subject: [PATCH 145/529] update TODO, need to handle it, after figuring out
 compute stages and what vertex buffers and index buffers look like

---
 62_CAD/DrawResourcesFiller.cpp | 12 ++++++------
 1 file changed, 6 insertions(+), 6 deletions(-)

diff --git a/62_CAD/DrawResourcesFiller.cpp b/62_CAD/DrawResourcesFiller.cpp
index a6d975f5c..c566de456 100644
--- a/62_CAD/DrawResourcesFiller.cpp
+++ b/62_CAD/DrawResourcesFiller.cpp
@@ -279,7 +279,7 @@ void DrawResourcesFiller::_test_addImageObject(float64_t2 topLeftPos, float32_t2
 			const size_t remainingResourcesSize = calculateRemainingResourcesSize();
 			
 			const uint32_t uploadableObjects = (remainingResourcesSize) / (sizeof(ImageObjectInfo) + sizeof(DrawObject) + sizeof(uint32_t) * 6u);
-			// TODO[ERFAN]: later take into account, our limit of max index buffer and vettex buffer size or constrainst other than mem
+			// TODO[ERFAN]: later take into account: our maximum indexable vertex 
 	
 			if (uploadableObjects <= 0u)
 				return false;
@@ -794,7 +794,7 @@ void DrawResourcesFiller::addPolylineConnectors_Internal(const CPolylineBase& po
 	const size_t remainingResourcesSize = calculateRemainingResourcesSize();
 
 	const uint32_t uploadableObjects = (remainingResourcesSize) / (sizeof(PolylineConnector) + sizeof(DrawObject) + sizeof(uint32_t) * 6u);
-	// TODO[ERFAN]: later take into account, our limit of max index buffer and vettex buffer size or constrainst other than mem
+	// TODO[ERFAN]: later take into account: our maximum indexable vertex 
 	
 	const uint32_t connectorCount = static_cast<uint32_t>(polyline.getConnectors().size());
 	const uint32_t remainingObjects = connectorCount - currentPolylineConnectorObj;
@@ -850,7 +850,7 @@ void DrawResourcesFiller::addLines_Internal(const CPolylineBase& polyline, const
 
 	// how many lines fit into mem? --> memConsumption = sizeof(LinePointInfo) + sizeof(LinePointInfo)*lineCount + sizeof(DrawObject)*lineCount + sizeof(uint32_t) * 6u * lineCount
 	const uint32_t uploadableObjects = (remainingResourcesSize - sizeof(LinePointInfo)) / (sizeof(LinePointInfo) + sizeof(DrawObject) + sizeof(uint32_t) * 6u);
-	// TODO[ERFAN]: later take into account, our limit of max index buffer and vettex buffer size or constrainst other than mem
+	// TODO[ERFAN]: later take into account: our maximum indexable vertex 
 
 	const uint32_t lineCount = section.count;
 	const uint32_t remainingObjects = lineCount - currentObjectInSection;
@@ -904,7 +904,7 @@ void DrawResourcesFiller::addQuadBeziers_Internal(const CPolylineBase& polyline,
 	// how many quad bezier objects fit into mem?
 	// memConsumption = quadBezCount * (sizeof(QuadraticBezierInfo) + 3*(sizeof(DrawObject)+6u*sizeof(uint32_t))
 	const uint32_t uploadableObjects = (remainingResourcesSize) / (sizeof(QuadraticBezierInfo) + (sizeof(DrawObject) + 6u * sizeof(uint32_t)) * CagesPerQuadBezier);
-	// TODO[ERFAN]: later take into account, our limit of max index buffer and vettex buffer size or constrainst other than mem
+	// TODO[ERFAN]: later take into account: our maximum indexable vertex 
 	
 	const uint32_t beziersCount = section.count;
 	const uint32_t remainingObjects = beziersCount - currentObjectInSection;
@@ -960,7 +960,7 @@ void DrawResourcesFiller::addHatch_Internal(const Hatch& hatch, uint32_t& curren
 	const size_t remainingResourcesSize = calculateRemainingResourcesSize();
 
 	const uint32_t uploadableObjects = (remainingResourcesSize) / (sizeof(Hatch::CurveHatchBox) + sizeof(DrawObject) + sizeof(uint32_t) * 6u);
-	// TODO[ERFAN]: later take into account, our limit of max index buffer and vettex buffer size or constrainst other than mem
+	// TODO[ERFAN]: later take into account: our maximum indexable vertex 
 	
 	uint32_t remainingObjects = hatch.getHatchBoxCount() - currentObjectInSection;
 	const uint32_t objectsToUpload = core::min(uploadableObjects, remainingObjects);
@@ -1010,7 +1010,7 @@ bool DrawResourcesFiller::addFontGlyph_Internal(const GlyphInfo& glyphInfo, uint
 	const size_t remainingResourcesSize = calculateRemainingResourcesSize();
 
 	const uint32_t uploadableObjects = (remainingResourcesSize) / (sizeof(GlyphInfo) + sizeof(DrawObject) + sizeof(uint32_t) * 6u);
-	// TODO[ERFAN]: later take into account, our limit of max index buffer and vettex buffer size or constrainst other than mem
+	// TODO[ERFAN]: later take into account: our maximum indexable vertex 
 	
 	if (uploadableObjects <= 0u)
 		return false;

From 391c3aca99f39c41f5b63db43559b3be6482a727 Mon Sep 17 00:00:00 2001
From: kevyuu <kevin.kayu@gmail.com>
Date: Thu, 3 Apr 2025 17:23:37 +0700
Subject: [PATCH 146/529] fix ray query geometry to use IShader

---
 67_RayQueryGeometry/app_resources/render.comp.hlsl | 1 +
 67_RayQueryGeometry/main.cpp                       | 7 ++++---
 2 files changed, 5 insertions(+), 3 deletions(-)

diff --git a/67_RayQueryGeometry/app_resources/render.comp.hlsl b/67_RayQueryGeometry/app_resources/render.comp.hlsl
index e3d78f385..b9323ac74 100644
--- a/67_RayQueryGeometry/app_resources/render.comp.hlsl
+++ b/67_RayQueryGeometry/app_resources/render.comp.hlsl
@@ -95,6 +95,7 @@ float3 calculateSmoothNormals(int instID, int primID, SGeomInfo geom, float2 bar
 }
 
 [numthreads(WorkgroupSize, WorkgroupSize, 1)]
+[shader("compute")]
 void main(uint32_t3 threadID : SV_DispatchThreadID)
 {
     uint2 coords = threadID.xy;
diff --git a/67_RayQueryGeometry/main.cpp b/67_RayQueryGeometry/main.cpp
index dab137cbd..c4c483263 100644
--- a/67_RayQueryGeometry/main.cpp
+++ b/67_RayQueryGeometry/main.cpp
@@ -164,9 +164,8 @@ class RayQueryGeometryApp final : public examples::SimpleWindowedApplication, pu
 
 				const auto assets = bundle.getContents();
 				assert(assets.size() == 1);
-				smart_refctd_ptr<ICPUShader> shaderSrc = IAsset::castDown<ICPUShader>(assets[0]);
-				shaderSrc->setShaderStage(IShader::E_SHADER_STAGE::ESS_COMPUTE);
-				auto shader = m_device->createShader(shaderSrc.get());
+				const auto sourceRaw = smart_refctd_ptr_static_cast<IShader>(assets[0]);
+				smart_refctd_ptr<IShader> shader = m_device->compileShader({sourceRaw.get(), nullptr, nullptr, nullptr});
 				if (!shader)
 					return logFail("Failed to create shader!");
 
@@ -203,6 +202,8 @@ class RayQueryGeometryApp final : public examples::SimpleWindowedApplication, pu
 				IGPUComputePipeline::SCreationParams params = {};
 				params.layout = pipelineLayout.get();
 				params.shader.shader = shader.get();
+				params.shader.entryPoint = "main";
+				params.shader.stage = ESS_COMPUTE;
 				if (!m_device->createComputePipelines(nullptr, { &params, 1 }, &renderPipeline))
 					return logFail("Failed to create compute pipeline");
 			}

From 416d7b3735c8ca313928e70a71fdf606551d1850 Mon Sep 17 00:00:00 2001
From: Przemek <minikers21@gmail.com>
Date: Fri, 4 Apr 2025 12:30:44 +0200
Subject: [PATCH 147/529] Saving work

---
 .../main_pipeline/fragment_shader.hlsl        | 37 +++++++++++++------
 1 file changed, 25 insertions(+), 12 deletions(-)

diff --git a/62_CAD/shaders/main_pipeline/fragment_shader.hlsl b/62_CAD/shaders/main_pipeline/fragment_shader.hlsl
index 225c0636e..39a9601c1 100644
--- a/62_CAD/shaders/main_pipeline/fragment_shader.hlsl
+++ b/62_CAD/shaders/main_pipeline/fragment_shader.hlsl
@@ -464,9 +464,7 @@ float4 fragMain(PSInput input) : SV_TARGET
         float minShadingHeight = dtm.heightColorMapHeights[0];
         float maxShadingHeight = dtm.heightColorMapHeights[heightMapSize - 1];
 
-        const bool isHeightBetweenMinAndMax = height >= minShadingHeight && height <= maxShadingHeight;
-        const bool isHeightColorMapNotEmpty = heightMapSize > 0;
-        if (isHeightColorMapNotEmpty && isHeightBetweenMinAndMax)
+        if (heightMapSize > 0)
         {
             DTMSettings::E_HEIGHT_SHADING_MODE mode = dtm.determineHeightShadingMode();
 
@@ -476,23 +474,38 @@ float4 fragMain(PSInput input) : SV_TARGET
                 uint32_t mapIndexPlus1 = nbl::hlsl::upper_bound(dtmHeightsAccessor, 0, heightMapSize, height);
                 uint32_t mapIndex = mapIndexPlus1 == 0 ? mapIndexPlus1 : mapIndexPlus1 - 1;
 
+                float heightDeriv = fwidth(height);
+                bool blendWithPrev = true
+                    && (mapIndex >= heightMapSize - 1 || (height * 2.0 < dtm.heightColorMapHeights[mapIndexPlus1] + dtm.heightColorMapHeights[mapIndex]));
+                
                 // logic explainer: if colorIdx is 0.0 then it means blend with next
                 // if color idx is >= length of the colours array then it means it's also > 0.0 and this blend with prev is true
                 // if color idx is > 0 and < len - 1, then it depends on the current pixel's height value and two closest height values
-                bool blendWithPrev = (mapIndex > 0) 
-                    && (mapIndex >= heightMapSize - 1 || (height * 2.0 < dtm.heightColorMapHeights[mapIndexPlus1] + dtm.heightColorMapHeights[mapIndex]));
-                float heightDeriv = fwidth(height);
                 if (blendWithPrev)
                 {
-                    float pxDistanceToPrevHeight = (height - dtm.heightColorMapHeights[mapIndex]) / heightDeriv;
-                    float prevColorCoverage = smoothstep(-globals.antiAliasingFactor, globals.antiAliasingFactor, pxDistanceToPrevHeight);
-                    textureColor = lerp(dtm.heightColorMapColors[mapIndex - 1].rgb, dtm.heightColorMapColors[mapIndex].rgb, prevColorCoverage);
+                    if (mapIndex > 0)
+                    {
+                        float pxDistanceToPrevHeight = (height - dtm.heightColorMapHeights[mapIndex]) / heightDeriv;
+                        float prevColorCoverage = smoothstep(-globals.antiAliasingFactor, globals.antiAliasingFactor, pxDistanceToPrevHeight);
+                        textureColor = lerp(dtm.heightColorMapColors[mapIndex - 1].rgb, dtm.heightColorMapColors[mapIndex].rgb, prevColorCoverage);
+                    }
+                    else
+                    {
+                        textureColor = dtm.heightColorMapColors[mapIndex].rgb;
+                    }
                 }
                 else
                 {
-                    float pxDistanceToNextHeight = (height - dtm.heightColorMapHeights[mapIndexPlus1]) / heightDeriv;
-                    float nextColorCoverage = smoothstep(-globals.antiAliasingFactor, globals.antiAliasingFactor, pxDistanceToNextHeight);
-                    textureColor = lerp(dtm.heightColorMapColors[mapIndex].rgb, dtm.heightColorMapColors[mapIndexPlus1].rgb, nextColorCoverage);
+                    if (mapIndex < heightMapSize - 1)
+                    {
+                        float pxDistanceToNextHeight = (height - dtm.heightColorMapHeights[mapIndexPlus1]) / heightDeriv;
+                        float nextColorCoverage = smoothstep(-globals.antiAliasingFactor, globals.antiAliasingFactor, pxDistanceToNextHeight);
+                        textureColor = lerp(dtm.heightColorMapColors[mapIndex].rgb, dtm.heightColorMapColors[mapIndexPlus1].rgb, nextColorCoverage);
+                    }
+                    else
+                    {
+                        textureColor = dtm.heightColorMapColors[mapIndex].rgb;
+                    }
                 }
 
                 localAlpha = dtm.heightColorMapColors[mapIndex].a;

From 17dda8e2b8d5d3c2d3a7a853a3662b1c695bb145 Mon Sep 17 00:00:00 2001
From: keptsecret <sorchon@gmail.com>
Date: Mon, 7 Apr 2025 09:48:05 +0700
Subject: [PATCH 148/529] re-numbered example to avoid duplicate

---
 {71_ArithmeticBench => 73_ArithmeticBench}/CMakeLists.txt       | 0
 .../app_resources/benchmarkSubgroup.comp.hlsl                   | 0
 .../app_resources/common.hlsl                                   | 0
 .../app_resources/shaderCommon.hlsl                             | 0
 .../app_resources/testSubgroup.comp.hlsl                        | 0
 .../app_resources/testWorkgroup.comp.hlsl                       | 0
 {71_ArithmeticBench => 73_ArithmeticBench}/config.json.template | 0
 {71_ArithmeticBench => 73_ArithmeticBench}/main.cpp             | 0
 {71_ArithmeticBench => 73_ArithmeticBench}/pipeline.groovy      | 0
 CMakeLists.txt                                                  | 2 +-
 10 files changed, 1 insertion(+), 1 deletion(-)
 rename {71_ArithmeticBench => 73_ArithmeticBench}/CMakeLists.txt (100%)
 rename {71_ArithmeticBench => 73_ArithmeticBench}/app_resources/benchmarkSubgroup.comp.hlsl (100%)
 rename {71_ArithmeticBench => 73_ArithmeticBench}/app_resources/common.hlsl (100%)
 rename {71_ArithmeticBench => 73_ArithmeticBench}/app_resources/shaderCommon.hlsl (100%)
 rename {71_ArithmeticBench => 73_ArithmeticBench}/app_resources/testSubgroup.comp.hlsl (100%)
 rename {71_ArithmeticBench => 73_ArithmeticBench}/app_resources/testWorkgroup.comp.hlsl (100%)
 rename {71_ArithmeticBench => 73_ArithmeticBench}/config.json.template (100%)
 rename {71_ArithmeticBench => 73_ArithmeticBench}/main.cpp (100%)
 rename {71_ArithmeticBench => 73_ArithmeticBench}/pipeline.groovy (100%)

diff --git a/71_ArithmeticBench/CMakeLists.txt b/73_ArithmeticBench/CMakeLists.txt
similarity index 100%
rename from 71_ArithmeticBench/CMakeLists.txt
rename to 73_ArithmeticBench/CMakeLists.txt
diff --git a/71_ArithmeticBench/app_resources/benchmarkSubgroup.comp.hlsl b/73_ArithmeticBench/app_resources/benchmarkSubgroup.comp.hlsl
similarity index 100%
rename from 71_ArithmeticBench/app_resources/benchmarkSubgroup.comp.hlsl
rename to 73_ArithmeticBench/app_resources/benchmarkSubgroup.comp.hlsl
diff --git a/71_ArithmeticBench/app_resources/common.hlsl b/73_ArithmeticBench/app_resources/common.hlsl
similarity index 100%
rename from 71_ArithmeticBench/app_resources/common.hlsl
rename to 73_ArithmeticBench/app_resources/common.hlsl
diff --git a/71_ArithmeticBench/app_resources/shaderCommon.hlsl b/73_ArithmeticBench/app_resources/shaderCommon.hlsl
similarity index 100%
rename from 71_ArithmeticBench/app_resources/shaderCommon.hlsl
rename to 73_ArithmeticBench/app_resources/shaderCommon.hlsl
diff --git a/71_ArithmeticBench/app_resources/testSubgroup.comp.hlsl b/73_ArithmeticBench/app_resources/testSubgroup.comp.hlsl
similarity index 100%
rename from 71_ArithmeticBench/app_resources/testSubgroup.comp.hlsl
rename to 73_ArithmeticBench/app_resources/testSubgroup.comp.hlsl
diff --git a/71_ArithmeticBench/app_resources/testWorkgroup.comp.hlsl b/73_ArithmeticBench/app_resources/testWorkgroup.comp.hlsl
similarity index 100%
rename from 71_ArithmeticBench/app_resources/testWorkgroup.comp.hlsl
rename to 73_ArithmeticBench/app_resources/testWorkgroup.comp.hlsl
diff --git a/71_ArithmeticBench/config.json.template b/73_ArithmeticBench/config.json.template
similarity index 100%
rename from 71_ArithmeticBench/config.json.template
rename to 73_ArithmeticBench/config.json.template
diff --git a/71_ArithmeticBench/main.cpp b/73_ArithmeticBench/main.cpp
similarity index 100%
rename from 71_ArithmeticBench/main.cpp
rename to 73_ArithmeticBench/main.cpp
diff --git a/71_ArithmeticBench/pipeline.groovy b/73_ArithmeticBench/pipeline.groovy
similarity index 100%
rename from 71_ArithmeticBench/pipeline.groovy
rename to 73_ArithmeticBench/pipeline.groovy
diff --git a/CMakeLists.txt b/CMakeLists.txt
index e073141c5..22033c682 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -98,7 +98,7 @@ if(NBL_BUILD_EXAMPLES)
   	add_subdirectory(70_FLIPFluids EXCLUDE_FROM_ALL)
 	add_subdirectory(71_RayTracingPipeline EXCLUDE_FROM_ALL)
 
-	add_subdirectory(71_ArithmeticBench EXCLUDE_FROM_ALL)
+	add_subdirectory(73_ArithmeticBench EXCLUDE_FROM_ALL)
 
 	NBL_HOOK_COMMON_API("${NBL_COMMON_API_TARGETS}")
 endif()

From 3d4e0f2372a799045422a6e71ef7f1bceeed0adc Mon Sep 17 00:00:00 2001
From: keptsecret <sorchon@gmail.com>
Date: Tue, 8 Apr 2025 14:04:57 +0700
Subject: [PATCH 149/529] fake frames for nsight

---
 .../app_resources/benchmarkSubgroup.comp.hlsl |   2 +
 73_ArithmeticBench/main.cpp                   | 452 +++++++++++++++---
 2 files changed, 398 insertions(+), 56 deletions(-)

diff --git a/73_ArithmeticBench/app_resources/benchmarkSubgroup.comp.hlsl b/73_ArithmeticBench/app_resources/benchmarkSubgroup.comp.hlsl
index f3cc679ef..2815d1e38 100644
--- a/73_ArithmeticBench/app_resources/benchmarkSubgroup.comp.hlsl
+++ b/73_ArithmeticBench/app_resources/benchmarkSubgroup.comp.hlsl
@@ -4,6 +4,8 @@
 
 #include "shaderCommon.hlsl"
 
+[[vk::binding(2, 0)]] RWTexture2D<float32_t4> outImage; // dummy
+
 uint32_t globalIndex()
 {
     return nbl::hlsl::glsl::gl_WorkGroupID().x*WORKGROUP_SIZE+nbl::hlsl::workgroup::SubgroupContiguousIndex();
diff --git a/73_ArithmeticBench/main.cpp b/73_ArithmeticBench/main.cpp
index beb243b97..8e067e6cc 100644
--- a/73_ArithmeticBench/main.cpp
+++ b/73_ArithmeticBench/main.cpp
@@ -5,7 +5,6 @@
 
 using namespace nbl;
 using namespace core;
-using namespace hlsl;
 using namespace system;
 using namespace asset;
 using namespace ui;
@@ -53,7 +52,8 @@ class ArithmeticBenchApp final : public examples::SimpleWindowedApplication, pub
 	using device_base_t = examples::SimpleWindowedApplication;
 	using asset_base_t = application_templates::MonoAssetManagerAndBuiltinResourceApplication;
 
-	constexpr static inline uint32_t2 WindowDimensions = { 1280, 720 };
+	constexpr static inline uint32_t WIN_W = 1280;
+	constexpr static inline uint32_t WIN_H = 720;
 	constexpr static inline uint32_t MaxFramesInFlight = 5;
 
 public:
@@ -67,19 +67,19 @@ class ArithmeticBenchApp final : public examples::SimpleWindowedApplication, pub
 			{
 				auto windowCallback = core::make_smart_refctd_ptr<CEventCallback>(smart_refctd_ptr(m_inputSystem), smart_refctd_ptr(m_logger));
 				IWindow::SCreationParams params = {};
-				params.callback = core::make_smart_refctd_ptr<nbl::video::ISimpleManagedSurface::ICallback>();
-				params.width = WindowDimensions.x;
-				params.height = WindowDimensions.y;
+				params.callback = core::make_smart_refctd_ptr<ISimpleManagedSurface::ICallback>();
+				params.width = WIN_W;
+				params.height = WIN_H;
 				params.x = 32;
 				params.y = 32;
 				params.flags = ui::IWindow::ECF_HIDDEN | IWindow::ECF_BORDERLESS | IWindow::ECF_RESIZABLE;
-				params.windowCaption = "ComputeShaderPathtracer";
+				params.windowCaption = "ArithmeticBenchApp";
 				params.callback = windowCallback;
 				const_cast<std::remove_const_t<decltype(m_window)>&>(m_window) = m_winMgr->createWindow(std::move(params));
 			}
 
 			auto surface = CSurfaceVulkanWin32::create(smart_refctd_ptr(m_api), smart_refctd_ptr_static_cast<IWindowWin32>(m_window));
-			const_cast<std::remove_const_t<decltype(m_surface)>&>(m_surface) = nbl::video::CSimpleResizeSurface<nbl::video::CDefaultSwapchainFramebuffers>::create(std::move(surface));
+			const_cast<std::remove_const_t<decltype(m_surface)>&>(m_surface) = CSimpleResizeSurface<ISimpleManagedSurface::ISwapchainResources>::create(std::move(surface));
 		}
 
 		if (m_surface)
@@ -90,11 +90,38 @@ class ArithmeticBenchApp final : public examples::SimpleWindowedApplication, pub
 
 	bool onAppInitialized(smart_refctd_ptr<ISystem>&& system) override
 	{
+		m_inputSystem = make_smart_refctd_ptr<InputSystem>(logger_opt_smart_ptr(smart_refctd_ptr(m_logger)));
+
 		if (!device_base_t::onAppInitialized(std::move(system)))
 			return false;
 		if (!asset_base_t::onAppInitialized(std::move(system)))
 			return false;
 
+		m_semaphore = m_device->createSemaphore(m_realFrameIx);
+		if (!m_semaphore)
+			return logFail("Failed to Create a Semaphore!");
+
+		ISwapchain::SCreationParams swapchainParams = { .surface = m_surface->getSurface() };
+		if (!swapchainParams.deduceFormat(m_physicalDevice))
+			return logFail("Could not choose a Surface Format for the Swapchain!");
+
+		auto graphicsQueue = getGraphicsQueue();
+		if (!m_surface || !m_surface->init(graphicsQueue, std::make_unique<ISimpleManagedSurface::ISwapchainResources>(), swapchainParams.sharedParams))
+			return logFail("Could not create Window & Surface or initialize the Surface!");
+
+		auto pool = m_device->createCommandPool(graphicsQueue->getFamilyIndex(), IGPUCommandPool::CREATE_FLAGS::RESET_COMMAND_BUFFER_BIT);
+
+		for (auto i = 0u; i < MaxFramesInFlight; i++)
+		{
+			if (!pool)
+				return logFail("Couldn't create Command Pool!");
+			if (!pool->createCommandBuffers(IGPUCommandPool::BUFFER_LEVEL::PRIMARY, { m_cmdBufs.data() + i, 1 }))
+				return logFail("Couldn't create Command Buffer!");
+		}
+
+		m_winMgr->setWindowSize(m_window.get(), WIN_W, WIN_H);
+		m_surface->recreateSwapchain();
+
 		transferDownQueue = getTransferDownQueue();
 		computeQueue = getComputeQueue();
 
@@ -134,7 +161,24 @@ class ArithmeticBenchApp final : public examples::SimpleWindowedApplication, pub
 			assert(bufferMem.isValid());
 		}
 
-		// create Descriptor Set and Pipeline Layout
+		// create dummy image
+		dummyImg = m_device->createImage({
+				{
+					.type = IGPUImage::ET_2D,
+					.samples = asset::ICPUImage::ESCF_1_BIT,
+					.format = asset::EF_R16G16B16A16_SFLOAT,
+					.extent = {WIN_W, WIN_H, 1},
+					.mipLevels = 1,
+					.arrayLayers = 1,
+					.flags = IImage::ECF_NONE,
+					.usage = core::bitflag(asset::IImage::EUF_STORAGE_BIT) | asset::IImage::EUF_TRANSFER_SRC_BIT
+				}
+			});
+		if (!dummyImg || !m_device->allocate(dummyImg->getMemoryReqs(), dummyImg.get()).isValid())
+			return logFail("Could not create HDR Image");
+
+		// create Descriptor Sets and Pipeline Layouts
+		smart_refctd_ptr<IGPUPipelineLayout> benchPplnLayout;
 		{
 			// create Descriptor Set Layout
 			smart_refctd_ptr<IGPUDescriptorSetLayout> dsLayout;
@@ -148,7 +192,7 @@ class ArithmeticBenchApp final : public examples::SimpleWindowedApplication, pub
 
 			// set and transient pool
 			auto descPool = m_device->createDescriptorPoolForDSLayouts(IDescriptorPool::ECF_NONE,{&dsLayout.get(),1});
-			descriptorSet = descPool->createDescriptorSet(smart_refctd_ptr(dsLayout));
+			testDs = descPool->createDescriptorSet(smart_refctd_ptr(dsLayout));
 			{
 				IGPUDescriptorSet::SDescriptorInfo infos[1+OutputBufferCount];
 				infos[0].desc = gpuinputDataBuffer;
@@ -158,18 +202,49 @@ class ArithmeticBenchApp final : public examples::SimpleWindowedApplication, pub
 					auto buff = outputBuffers[i - 1];
 					infos[i].info.buffer = { 0u,buff->getSize() };
 					infos[i].desc = std::move(buff); // save an atomic in the refcount
-
 				}
 
 				IGPUDescriptorSet::SWriteDescriptorSet writes[2];
 				for (uint32_t i=0u; i<2; i++)
-					writes[i] = {descriptorSet.get(),i,0u,1u,infos+i};
+					writes[i] = {testDs.get(),i,0u,1u,infos+i};
 				writes[1].count = OutputBufferCount;
 
 				m_device->updateDescriptorSets(2, writes, 0u, nullptr);
 			}
+			testPplnLayout = m_device->createPipelineLayout({}, std::move(dsLayout));
 
-			pipelineLayout = m_device->createPipelineLayout({},std::move(dsLayout));
+
+			{
+				IGPUDescriptorSetLayout::SBinding binding[3];
+				for (uint32_t i = 0u; i < 2; i++)
+					binding[i] = { {},i,IDescriptor::E_TYPE::ET_STORAGE_BUFFER,IGPUDescriptorSetLayout::SBinding::E_CREATE_FLAGS::ECF_NONE,IShader::E_SHADER_STAGE::ESS_COMPUTE,1u,nullptr };
+				binding[1].count = OutputBufferCount;
+				binding[2] = { {},2,IDescriptor::E_TYPE::ET_STORAGE_IMAGE,IGPUDescriptorSetLayout::SBinding::E_CREATE_FLAGS::ECF_UPDATE_AFTER_BIND_BIT,IShader::E_SHADER_STAGE::ESS_COMPUTE,1u,nullptr };
+				dsLayout = m_device->createDescriptorSetLayout(binding);
+			}
+
+			benchPool = m_device->createDescriptorPoolForDSLayouts(IDescriptorPool::ECF_UPDATE_AFTER_BIND_BIT, { &dsLayout.get(),1 });
+			benchDs = benchPool->createDescriptorSet(smart_refctd_ptr(dsLayout));
+			{
+				IGPUDescriptorSet::SDescriptorInfo infos[1 + OutputBufferCount];
+				infos[0].desc = gpuinputDataBuffer;
+				infos[0].info.buffer = { 0u,gpuinputDataBuffer->getSize() };
+				for (uint32_t i = 1u; i <= OutputBufferCount; i++)
+				{
+					auto buff = outputBuffers[i - 1];
+					infos[i].info.buffer = { 0u,buff->getSize() };
+					infos[i].desc = std::move(buff); // save an atomic in the refcount
+				}
+				// write swapchain image descriptor in loop
+
+				IGPUDescriptorSet::SWriteDescriptorSet writes[2];
+				for (uint32_t i = 0u; i < 2; i++)
+					writes[i] = { testDs.get(),i,0u,1u,infos + i };
+				writes[1].count = OutputBufferCount;
+
+				m_device->updateDescriptorSets(2, writes, 0u, nullptr);
+			}
+			benchPplnLayout = m_device->createPipelineLayout({}, std::move(dsLayout));
 		}
 
 		const auto spirv_isa_cache_path = localOutputCWD/"spirv_isa_cache.bin";
@@ -226,6 +301,7 @@ class ArithmeticBenchApp final : public examples::SimpleWindowedApplication, pub
 		// now create or retrieve final resources to run our tests
 		sema = m_device->createSemaphore(timelineValue);
 		resultsBuffer = ICPUBuffer::create({ outputBuffers[0]->getSize() });
+		smart_refctd_ptr<IGPUCommandBuffer> cmdbuf;
 		{
 			smart_refctd_ptr<nbl::video::IGPUCommandPool> cmdpool = m_device->createCommandPool(computeQueue->getFamilyIndex(),IGPUCommandPool::CREATE_FLAGS::RESET_COMMAND_BUFFER_BIT);
 			if (!cmdpool->createCommandBuffers(IGPUCommandPool::BUFFER_LEVEL::PRIMARY,{&cmdbuf,1}))
@@ -244,10 +320,15 @@ class ArithmeticBenchApp final : public examples::SimpleWindowedApplication, pub
 		const auto MaxSubgroupSize = m_physicalDevice->getLimits().maxSubgroupSize;
 		
 		if (b_runTests)
-			runTests(subgroupTestSource, elementCount, ItemsPerInvocation, MinSubgroupSize, MaxSubgroupSize, workgroupSizes);
+		{
+			runTests(cmdbuf.get(), subgroupTestSource, elementCount, ItemsPerInvocation, MinSubgroupSize, MaxSubgroupSize, workgroupSizes);
 
-		double time = runBenchmark<emulatedReduction>(subgroupTestSource, elementCount, 5, 256, ItemsPerInvocation, NumLoops);
-		m_logger->log("Ran for %.3fms (disregard these numbers, profile in Nsight)", ILogger::ELL_INFO, time * 1000.0);
+			m_logger->log("==========Result==========", ILogger::ELL_INFO);
+			m_logger->log("Fail Count: %u", ILogger::ELL_INFO, totalFailCount);
+		}
+
+		// for each variant, workgroup size etc.
+		benchPipeline = createBenchmarkPipelines<emulatedReduction>(subgroupTestSource, elementCount, hlsl::findMSB(MinSubgroupSize), workgroupSizes[0], ItemsPerInvocation);
 
 		//for (auto subgroupSize = MinSubgroupSize; subgroupSize <= MaxSubgroupSize; subgroupSize *= 2u)
 		//{
@@ -274,22 +355,276 @@ class ArithmeticBenchApp final : public examples::SimpleWindowedApplication, pub
 		//	}
 		//}
 
+		m_winMgr->show(m_window.get());
+
 		return true;
 	}
 
 	virtual bool onAppTerminated() override
 	{
-		m_logger->log("==========Result==========", ILogger::ELL_INFO);
-		m_logger->log("Fail Count: %u", ILogger::ELL_INFO, totalFailCount);
 		delete[] inputData;
 		return true;
 	}
 
 	// the unit test is carried out on init
-	void workLoopBody() override {}
+	void workLoopBody() override
+	{
+		const auto resourceIx = m_realFrameIx % MaxFramesInFlight;
+
+		const uint32_t framesInFlight = core::min(MaxFramesInFlight, m_surface->getMaxAcquiresInFlight());
+
+		if (m_realFrameIx >= framesInFlight)
+		{
+			const ISemaphore::SWaitInfo cbDonePending[] =
+			{
+				{
+					.semaphore = m_semaphore.get(),
+					.value = m_realFrameIx + 1 - framesInFlight
+				}
+			};
+			if (m_device->blockForSemaphores(cbDonePending) != ISemaphore::WAIT_RESULT::SUCCESS)
+				return;
+		}
+
+		m_currentImageAcquire = m_surface->acquireNextImage();
+		if (!m_currentImageAcquire)
+			return;
+
+		auto* const cmdbuf = m_cmdBufs.data()[resourceIx].get();
+		cmdbuf->reset(IGPUCommandBuffer::RESET_FLAGS::RELEASE_RESOURCES_BIT);
+		cmdbuf->begin(IGPUCommandBuffer::USAGE::ONE_TIME_SUBMIT_BIT);
+
+		// barrier transition to GENERAL
+		{
+			IGPUCommandBuffer::SPipelineBarrierDependencyInfo::image_barrier_t imageBarriers[1];
+			imageBarriers[0].barrier = {
+				   .dep = {
+					   .srcStageMask = PIPELINE_STAGE_FLAGS::NONE,
+					   .srcAccessMask = ACCESS_FLAGS::NONE,
+					   .dstStageMask = PIPELINE_STAGE_FLAGS::COMPUTE_SHADER_BIT,
+					   .dstAccessMask = ACCESS_FLAGS::SHADER_WRITE_BITS
+					}
+			};
+			imageBarriers[0].image = dummyImg.get();
+			imageBarriers[0].subresourceRange = {
+				.aspectMask = IImage::EAF_COLOR_BIT,
+				.baseMipLevel = 0u,
+				.levelCount = 1u,
+				.baseArrayLayer = 0u,
+				.layerCount = 1u
+			};
+			imageBarriers[0].oldLayout = IImage::LAYOUT::UNDEFINED;
+			imageBarriers[0].newLayout = IImage::LAYOUT::GENERAL;
+
+			cmdbuf->pipelineBarrier(E_DEPENDENCY_FLAGS::EDF_NONE, { .imgBarriers = imageBarriers });
+		}
+
+		// bind dummy image
+		IGPUImageView::SCreationParams viewParams = {
+			.flags = IGPUImageView::ECF_NONE,
+			.subUsages = IGPUImage::E_USAGE_FLAGS::EUF_STORAGE_BIT,
+			.image = dummyImg,
+			.viewType = IGPUImageView::ET_2D,
+			.format = dummyImg->getCreationParameters().format
+		};
+		auto dummyImgView = m_device->createImageView(std::move(viewParams));
+
+		video::IGPUDescriptorSet::SDescriptorInfo dsInfo;
+		dsInfo.info.image.imageLayout = IImage::LAYOUT::GENERAL;
+		dsInfo.desc = dummyImgView;
+
+		IGPUDescriptorSet::SWriteDescriptorSet dsWrites[1u] =
+		{
+			{
+				.dstSet = benchDs.get(),
+				.binding = 2u,
+				.arrayElement = 0u,
+				.count = 1u,
+				.info = &dsInfo,
+			}
+		};
+		m_device->updateDescriptorSets(1u, dsWrites, 0u, nullptr);
+
+		const uint32_t elementCount = Output<>::ScanElementCount;
+		const uint32_t ItemsPerInvocation = 4u;
+		const uint32_t NumLoops = 100000u;
+		const std::array<uint32_t, 3> workgroupSizes = { 256, 512, 1024 };
+		// const auto MaxWorkgroupSize = m_physicalDevice->getLimits().maxComputeWorkGroupInvocations;
+		const auto MinSubgroupSize = m_physicalDevice->getLimits().minSubgroupSize;
+		const auto MaxSubgroupSize = m_physicalDevice->getLimits().maxSubgroupSize;
+
+		//{
+		//	auto startTime = std::chrono::high_resolution_clock::now();
+
+		//	const IQueue::SSubmitInfo::SSemaphoreInfo signal[1] = { {.semaphore = sema.get(),.value = ++timelineValue} };
+		//	const IQueue::SSubmitInfo::SCommandBufferInfo cmdbufs[1] = { {.cmdbuf = cmdbuf.get()} };
+		//	const IQueue::SSubmitInfo submits[1] = { {.commandBuffers = cmdbufs,.signalSemaphores = signal} };
+		//	computeQueue->submit(submits);
+		//	const ISemaphore::SWaitInfo wait[1] = { {.semaphore = sema.get(),.value = timelineValue} };
+		//	m_device->blockForSemaphores(wait);
+
+		//	auto endTime = std::chrono::high_resolution_clock::now();
+		//}
+
+		double time = runBenchmark<emulatedReduction>(cmdbuf, benchPipeline, elementCount, 5, 256, ItemsPerInvocation, NumLoops);
+		m_logger->log("Ran for %.3fms (disregard these numbers, profile in Nsight)", ILogger::ELL_INFO, time * 1000.0);
+
+
+		// blit
+		{
+			IGPUCommandBuffer::SPipelineBarrierDependencyInfo::image_barrier_t imageBarriers[2];
+			imageBarriers[0].barrier = {
+			   .dep = {
+				   .srcStageMask = PIPELINE_STAGE_FLAGS::COMPUTE_SHADER_BIT,
+				   .srcAccessMask = ACCESS_FLAGS::SHADER_WRITE_BITS,
+				   .dstStageMask = PIPELINE_STAGE_FLAGS::BLIT_BIT,
+				   .dstAccessMask = ACCESS_FLAGS::TRANSFER_WRITE_BIT
+				}
+			};
+			imageBarriers[0].image = dummyImg.get();
+			imageBarriers[0].subresourceRange = {
+				.aspectMask = IImage::EAF_COLOR_BIT,
+				.baseMipLevel = 0u,
+				.levelCount = 1u,
+				.baseArrayLayer = 0u,
+				.layerCount = 1u
+			};
+			imageBarriers[0].oldLayout = IImage::LAYOUT::UNDEFINED;
+			imageBarriers[0].newLayout = IImage::LAYOUT::TRANSFER_SRC_OPTIMAL;
+
+			imageBarriers[1].barrier = {
+			   .dep = {
+				   .srcStageMask = PIPELINE_STAGE_FLAGS::NONE,
+				   .srcAccessMask = ACCESS_FLAGS::NONE,
+				   .dstStageMask = PIPELINE_STAGE_FLAGS::BLIT_BIT,
+				   .dstAccessMask = ACCESS_FLAGS::TRANSFER_WRITE_BIT
+				}
+			};
+			imageBarriers[1].image = m_surface->getSwapchainResources()->getImage(m_currentImageAcquire.imageIndex);
+			imageBarriers[1].subresourceRange = {
+				.aspectMask = IImage::EAF_COLOR_BIT,
+				.baseMipLevel = 0u,
+				.levelCount = 1u,
+				.baseArrayLayer = 0u,
+				.layerCount = 1u
+			};
+			imageBarriers[1].oldLayout = IImage::LAYOUT::UNDEFINED;
+			imageBarriers[1].newLayout = IImage::LAYOUT::TRANSFER_DST_OPTIMAL;
+
+			cmdbuf->pipelineBarrier(E_DEPENDENCY_FLAGS::EDF_NONE, { .imgBarriers = imageBarriers });
+		}
+
+		{
+			IGPUCommandBuffer::SImageBlit regions[] = { {
+				.srcMinCoord = {0,0,0},
+				.srcMaxCoord = {WIN_W,WIN_H,1},
+				.dstMinCoord = {0,0,0},
+				.dstMaxCoord = {WIN_W,WIN_H,1},
+				.layerCount = 1,
+				.srcBaseLayer = 0,
+				.dstBaseLayer = 0,
+				.srcMipLevel = 0,
+				.dstMipLevel = 0,
+				.aspectMask = IGPUImage::E_ASPECT_FLAGS::EAF_COLOR_BIT
+			} };
+
+			auto srcImg = dummyImg.get();
+			auto scRes = static_cast<CDefaultSwapchainFramebuffers*>(m_surface->getSwapchainResources());
+			auto dstImg = scRes->getImage(m_currentImageAcquire.imageIndex);
+
+			cmdbuf->blitImage(srcImg, IImage::LAYOUT::TRANSFER_SRC_OPTIMAL, dstImg, IImage::LAYOUT::TRANSFER_DST_OPTIMAL, regions, ISampler::ETF_NEAREST);
+		}
+
+		// barrier transition to PRESENT
+		{
+			IGPUCommandBuffer::SPipelineBarrierDependencyInfo::image_barrier_t imageBarriers[1];
+			imageBarriers[0].barrier = {
+				   .dep = {
+					   .srcStageMask = PIPELINE_STAGE_FLAGS::COMPUTE_SHADER_BIT,
+					   .srcAccessMask = ACCESS_FLAGS::SHADER_WRITE_BITS,
+					   .dstStageMask = PIPELINE_STAGE_FLAGS::NONE,
+					   .dstAccessMask = ACCESS_FLAGS::NONE
+					}
+			};
+			imageBarriers[0].image = m_surface->getSwapchainResources()->getImage(m_currentImageAcquire.imageIndex);
+			imageBarriers[0].subresourceRange = {
+				.aspectMask = IImage::EAF_COLOR_BIT,
+				.baseMipLevel = 0u,
+				.levelCount = 1u,
+				.baseArrayLayer = 0u,
+				.layerCount = 1u
+			};
+			imageBarriers[0].oldLayout = IImage::LAYOUT::TRANSFER_DST_OPTIMAL;
+			imageBarriers[0].newLayout = IImage::LAYOUT::PRESENT_SRC;
+
+			cmdbuf->pipelineBarrier(E_DEPENDENCY_FLAGS::EDF_NONE, { .imgBarriers = imageBarriers });
+		}
+
+		cmdbuf->end();
+
+		// submit
+		{
+			auto* queue = getGraphicsQueue();
+			const IQueue::SSubmitInfo::SSemaphoreInfo rendered[] =
+			{
+				{
+					.semaphore = m_semaphore.get(),
+					.value = ++m_realFrameIx,
+					.stageMask = PIPELINE_STAGE_FLAGS::ALL_TRANSFER_BITS
+				}
+			};
+			{
+				{
+					const IQueue::SSubmitInfo::SCommandBufferInfo commandBuffers[] =
+					{
+						{.cmdbuf = cmdbuf }
+					};
+
+					const IQueue::SSubmitInfo::SSemaphoreInfo acquired[] =
+					{
+						{
+							.semaphore = m_currentImageAcquire.semaphore,
+							.value = m_currentImageAcquire.acquireCount,
+							.stageMask = PIPELINE_STAGE_FLAGS::NONE
+						}
+					};
+					const IQueue::SSubmitInfo infos[] =
+					{
+						{
+							.waitSemaphores = acquired,
+							.commandBuffers = commandBuffers,
+							.signalSemaphores = rendered
+						}
+					};
+
+					if (queue->submit(infos) == IQueue::RESULT::SUCCESS)
+					{
+						const nbl::video::ISemaphore::SWaitInfo waitInfos[] =
+						{ {
+							.semaphore = m_semaphore.get(),
+							.value = m_realFrameIx
+						} };
+
+						m_device->blockForSemaphores(waitInfos); // this is not solution, quick wa to not throw validation errors
+					}
+					else
+						--m_realFrameIx;
+				}
+			}
+
+			std::string caption = "[Nabla Engine] Geometry Creator";
+			{
+				caption += ", displaying [all objects]";
+				m_window->setCaption(caption);
+			}
+			m_surface->present(m_currentImageAcquire.imageIndex, rendered);
+		}
+
+		numSubmits++;
+	}
 
 	//
-	bool keepRunning() override { return true; }
+	bool keepRunning() override { return numSubmits < MaxNumSubmits; }
 
 private:
 	void logTestOutcome(bool passed, uint32_t workgroupSize)
@@ -303,7 +638,7 @@ class ArithmeticBenchApp final : public examples::SimpleWindowedApplication, pub
 		}
 	}
 
-	void runTests(smart_refctd_ptr<ICPUShader> subgroupTestSource, uint32_t elementCount, uint32_t ItemsPerInvocation, uint32_t MinSubgroupSize, uint32_t MaxSubgroupSize, const std::array<uint32_t, 3>& workgroupSizes)
+	void runTests(IGPUCommandBuffer* cmdbuf, smart_refctd_ptr<ICPUShader> subgroupTestSource, uint32_t elementCount, uint32_t ItemsPerInvocation, uint32_t MinSubgroupSize, uint32_t MaxSubgroupSize, const std::array<uint32_t, 3>& workgroupSizes)
 	{
 		for (auto subgroupSize = MinSubgroupSize; subgroupSize <= MaxSubgroupSize; subgroupSize *= 2u)
 		{
@@ -316,11 +651,11 @@ class ArithmeticBenchApp final : public examples::SimpleWindowedApplication, pub
 
 				bool passed = true;
 				// TODO async the testing
-				passed = runTest<emulatedReduction, false>(subgroupTestSource, elementCount, subgroupSizeLog2, workgroupSize, ~0u, ItemsPerInvocation) && passed;
+				passed = runTest<emulatedReduction, false>(cmdbuf, subgroupTestSource, elementCount, subgroupSizeLog2, workgroupSize, ~0u, ItemsPerInvocation) && passed;
 				logTestOutcome(passed, workgroupSize);
-				passed = runTest<emulatedScanInclusive, false>(subgroupTestSource, elementCount, subgroupSizeLog2, workgroupSize, ~0u, ItemsPerInvocation) && passed;
+				passed = runTest<emulatedScanInclusive, false>(cmdbuf, subgroupTestSource, elementCount, subgroupSizeLog2, workgroupSize, ~0u, ItemsPerInvocation) && passed;
 				logTestOutcome(passed, workgroupSize);
-				passed = runTest<emulatedScanExclusive, false>(subgroupTestSource, elementCount, subgroupSizeLog2, workgroupSize, ~0u, ItemsPerInvocation) && passed;
+				passed = runTest<emulatedScanExclusive, false>(cmdbuf, subgroupTestSource, elementCount, subgroupSizeLog2, workgroupSize, ~0u, ItemsPerInvocation) && passed;
 				logTestOutcome(passed, workgroupSize);
 				//for (uint32_t itemsPerWG = workgroupSize; itemsPerWG > workgroupSize - subgroupSize; itemsPerWG--)
 				//{
@@ -353,7 +688,7 @@ class ArithmeticBenchApp final : public examples::SimpleWindowedApplication, pub
 	{
 		auto shader = m_device->createShader(overridenUnspecialized);
 		IGPUComputePipeline::SCreationParams params = {};
-		params.layout = pipelineLayout.get();
+		params.layout = testPplnLayout.get();
 		params.shader = {
 			.entryPoint = "main",
 			.shader = shader.get(),
@@ -367,10 +702,22 @@ class ArithmeticBenchApp final : public examples::SimpleWindowedApplication, pub
 		return pipeline;
 	}
 
+	template<template<class> class Arithmetic>
+	smart_refctd_ptr<IGPUComputePipeline> createBenchmarkPipelines(const smart_refctd_ptr<const ICPUShader>&source, const uint32_t elementCount, const uint8_t subgroupSizeLog2, const uint32_t workgroupSize, uint32_t itemsPerInvoc = 1u)
+	{
+		std::string arith_name = Arithmetic<bit_xor<uint32_t>>::name;	// TODO all operations
+
+		smart_refctd_ptr<ICPUShader> overridenUnspecialized = CHLSLCompiler::createOverridenCopy(
+			source.get(), "#define OPERATION %s\n#define WORKGROUP_SIZE %d\n#define ITEMS_PER_INVOCATION %d\n#define SUBGROUP_SIZE_LOG2 %d\n",
+			(("subgroup2::") + arith_name).c_str(), workgroupSize, itemsPerInvoc, subgroupSizeLog2
+		);
+		return createPipeline(overridenUnspecialized.get(), subgroupSizeLog2);
+	};
+
 	template<template<class> class Arithmetic, bool WorkgroupTest>
-	bool runTest(const smart_refctd_ptr<const ICPUShader>& source, const uint32_t elementCount, const uint8_t subgroupSizeLog2, const uint32_t workgroupSize, uint32_t itemsPerWG = ~0u, uint32_t itemsPerInvoc = 1u)
+	bool runTest(IGPUCommandBuffer* cmdbuf, const smart_refctd_ptr<const ICPUShader>& source, const uint32_t elementCount, const uint8_t subgroupSizeLog2, const uint32_t workgroupSize, uint32_t itemsPerWG = ~0u, uint32_t itemsPerInvoc = 1u)
 	{
-		std::string arith_name = Arithmetic<bit_xor<float>>::name;
+		std::string arith_name = Arithmetic<bit_xor<uint32_t>>::name;
 
 		smart_refctd_ptr<ICPUShader> overridenUnspecialized;
 		//if constexpr (WorkgroupTest)
@@ -394,7 +741,7 @@ class ArithmeticBenchApp final : public examples::SimpleWindowedApplication, pub
 		const uint32_t workgroupCount = elementCount / (itemsPerWG * itemsPerInvoc);
 		cmdbuf->begin(IGPUCommandBuffer::USAGE::NONE);
 		cmdbuf->bindComputePipeline(pipeline.get());
-		cmdbuf->bindDescriptorSets(EPBP_COMPUTE, pipeline->getLayout(), 0u, 1u, &descriptorSet.get());
+		cmdbuf->bindDescriptorSets(EPBP_COMPUTE, pipeline->getLayout(), 0u, 1u, &testDs.get());
 		cmdbuf->dispatch(workgroupCount, 1, 1);
 		{
 			IGPUCommandBuffer::SPipelineBarrierDependencyInfo::buffer_barrier_t memoryBarrier[OutputBufferCount];
@@ -419,7 +766,7 @@ class ArithmeticBenchApp final : public examples::SimpleWindowedApplication, pub
 		cmdbuf->end();
 
 		const IQueue::SSubmitInfo::SSemaphoreInfo signal[1] = {{.semaphore=sema.get(),.value=++timelineValue}};
-		const IQueue::SSubmitInfo::SCommandBufferInfo cmdbufs[1] = {{.cmdbuf=cmdbuf.get()}};
+		const IQueue::SSubmitInfo::SCommandBufferInfo cmdbufs[1] = {{.cmdbuf=cmdbuf}};
 		const IQueue::SSubmitInfo submits[1] = {{.commandBuffers=cmdbufs,.signalSemaphores=signal}};
 		computeQueue->submit(submits);
 		const ISemaphore::SWaitInfo wait[1] = {{.semaphore=sema.get(),.value=timelineValue}};
@@ -514,21 +861,12 @@ class ArithmeticBenchApp final : public examples::SimpleWindowedApplication, pub
 
 
 	template<template<class> class Arithmetic>
-	double runBenchmark(const smart_refctd_ptr<const ICPUShader>& source, const uint32_t elementCount, const uint8_t subgroupSizeLog2, const uint32_t workgroupSize, uint32_t itemsPerInvoc = 1u, uint32_t numLoops = 8u)
+	bool runBenchmark(IGPUCommandBuffer* cmdbuf, const smart_refctd_ptr<IGPUComputePipeline>& pipeline, const uint32_t elementCount, const uint8_t subgroupSizeLog2, const uint32_t workgroupSize, uint32_t itemsPerInvoc = 1u, uint32_t numLoops = 8u)
 	{
-		std::string arith_name = Arithmetic<bit_xor<float>>::name;
-
-		smart_refctd_ptr<ICPUShader> overridenUnspecialized = CHLSLCompiler::createOverridenCopy(
-			source.get(), "#define OPERATION %s\n#define WORKGROUP_SIZE %d\n#define ITEMS_PER_INVOCATION %d\n#define SUBGROUP_SIZE_LOG2 %d\n",
-			(("subgroup2::") + arith_name).c_str(), workgroupSize, itemsPerInvoc, subgroupSizeLog2
-		);
-		auto pipeline = createPipeline(overridenUnspecialized.get(), subgroupSizeLog2);
-
 		const uint32_t workgroupCount = elementCount / (workgroupSize * itemsPerInvoc);
-		cmdbuf->begin(IGPUCommandBuffer::USAGE::NONE);
 
 		cmdbuf->bindComputePipeline(pipeline.get());
-		cmdbuf->bindDescriptorSets(EPBP_COMPUTE, pipeline->getLayout(), 0u, 1u, &descriptorSet.get());
+		cmdbuf->bindDescriptorSets(EPBP_COMPUTE, pipeline->getLayout(), 0u, 1u, &testDs.get());
 		cmdbuf->dispatch(workgroupCount, 1, 1);
 		{
 			IGPUCommandBuffer::SPipelineBarrierDependencyInfo::buffer_barrier_t memoryBarrier[OutputBufferCount];
@@ -550,20 +888,8 @@ class ArithmeticBenchApp final : public examples::SimpleWindowedApplication, pub
 			IGPUCommandBuffer::SPipelineBarrierDependencyInfo info = { .memBarriers = {},.bufBarriers = memoryBarrier };
 			cmdbuf->pipelineBarrier(asset::E_DEPENDENCY_FLAGS::EDF_NONE, info);
 		}
-		cmdbuf->end();
-
-		auto startTime = std::chrono::high_resolution_clock::now();
-
-		const IQueue::SSubmitInfo::SSemaphoreInfo signal[1] = { {.semaphore = sema.get(),.value = ++timelineValue} };
-		const IQueue::SSubmitInfo::SCommandBufferInfo cmdbufs[1] = { {.cmdbuf = cmdbuf.get()} };
-		const IQueue::SSubmitInfo submits[1] = { {.commandBuffers = cmdbufs,.signalSemaphores = signal} };
-		computeQueue->submit(submits);
-		const ISemaphore::SWaitInfo wait[1] = { {.semaphore = sema.get(),.value = timelineValue} };
-		m_device->blockForSemaphores(wait);
-
-		auto endTime = std::chrono::high_resolution_clock::now();
 
-		return std::chrono::duration<double>(endTime - startTime).count();
+		return true;
 	}
 
 	IQueue* transferDownQueue;
@@ -572,19 +898,33 @@ class ArithmeticBenchApp final : public examples::SimpleWindowedApplication, pub
 	smart_refctd_ptr<IFile> m_spirv_isa_cache_output;
 
 	smart_refctd_ptr<IWindow> m_window;
-	smart_refctd_ptr<CSimpleResizeSurface<CDefaultSwapchainFramebuffers>> m_surface;
+	smart_refctd_ptr<CSimpleResizeSurface<ISimpleManagedSurface::ISwapchainResources>> m_surface;
+	smart_refctd_ptr<ISemaphore> m_semaphore;
+	uint64_t m_realFrameIx = 0;
+	std::array<smart_refctd_ptr<IGPUCommandBuffer>, MaxFramesInFlight> m_cmdBufs;
+	ISimpleManagedSurface::SAcquireResult m_currentImageAcquire = {};
+
 	smart_refctd_ptr<InputSystem> m_inputSystem;
 
+	smart_refctd_ptr<IGPUImage> dummyImg;
+
+	smart_refctd_ptr<IGPUComputePipeline> benchPipeline;	// TODO array
+	smart_refctd_ptr<IDescriptorPool> benchPool;
+	smart_refctd_ptr<IGPUDescriptorSet> benchDs;
+
+	smart_refctd_ptr<IGPUDescriptorSet> testDs;
+	smart_refctd_ptr<IGPUPipelineLayout> testPplnLayout;
+
+	constexpr static inline uint32_t MaxNumSubmits = 30;
+	uint32_t numSubmits = 0;
+
 	bool b_runTests = false;
 	uint32_t* inputData = nullptr;
 	constexpr static inline uint32_t OutputBufferCount = 8u;
 	smart_refctd_ptr<IGPUBuffer> outputBuffers[OutputBufferCount];
-	smart_refctd_ptr<IGPUDescriptorSet> descriptorSet;
-	smart_refctd_ptr<IGPUPipelineLayout> pipelineLayout;
 
 	smart_refctd_ptr<ISemaphore> sema;
 	uint64_t timelineValue = 0;
-	smart_refctd_ptr<IGPUCommandBuffer> cmdbuf;
 	smart_refctd_ptr<ICPUBuffer> resultsBuffer;
 
 	uint32_t totalFailCount = 0;

From 019299994a9969f4d542a9769aa23b2bd5076318 Mon Sep 17 00:00:00 2001
From: keptsecret <sorchon@gmail.com>
Date: Tue, 8 Apr 2025 15:14:31 +0700
Subject: [PATCH 150/529] use correct shader, spirv line dbinfo for nsight

---
 73_ArithmeticBench/main.cpp | 53 ++++++++++++++++++++++++++++++++-----
 1 file changed, 46 insertions(+), 7 deletions(-)

diff --git a/73_ArithmeticBench/main.cpp b/73_ArithmeticBench/main.cpp
index 8e067e6cc..29a38d2eb 100644
--- a/73_ArithmeticBench/main.cpp
+++ b/73_ArithmeticBench/main.cpp
@@ -297,6 +297,7 @@ class ArithmeticBenchApp final : public examples::SimpleWindowedApplication, pub
 		};
 
 		auto subgroupTestSource = getShaderSource("app_resources/testSubgroup.comp.hlsl");
+		auto subgroupBenchSource = getShaderSource("app_resources/benchmarkSubgroup.comp.hlsl");
 		//auto workgroupTestSource = getShaderSource("app_resources/testWorkgroup.comp.hlsl");
 		// now create or retrieve final resources to run our tests
 		sema = m_device->createSemaphore(timelineValue);
@@ -313,7 +314,7 @@ class ArithmeticBenchApp final : public examples::SimpleWindowedApplication, pub
 		
 		// TODO variable items per invocation?
 		const uint32_t ItemsPerInvocation = 4u;
-		const uint32_t NumLoops = 100000u;
+		const uint32_t NumLoops = 1000u;
 		const std::array<uint32_t, 3> workgroupSizes = { 256, 512, 1024 };
 		// const auto MaxWorkgroupSize = m_physicalDevice->getLimits().maxComputeWorkGroupInvocations;
 		const auto MinSubgroupSize = m_physicalDevice->getLimits().minSubgroupSize;
@@ -328,7 +329,7 @@ class ArithmeticBenchApp final : public examples::SimpleWindowedApplication, pub
 		}
 
 		// for each variant, workgroup size etc.
-		benchPipeline = createBenchmarkPipelines<emulatedReduction>(subgroupTestSource, elementCount, hlsl::findMSB(MinSubgroupSize), workgroupSizes[0], ItemsPerInvocation);
+		benchPipeline = createBenchmarkPipelines<emulatedReduction>(subgroupBenchSource, elementCount, hlsl::findMSB(MinSubgroupSize), workgroupSizes[0], ItemsPerInvocation, NumLoops);
 
 		//for (auto subgroupSize = MinSubgroupSize; subgroupSize <= MaxSubgroupSize; subgroupSize *= 2u)
 		//{
@@ -703,14 +704,52 @@ class ArithmeticBenchApp final : public examples::SimpleWindowedApplication, pub
 	}
 
 	template<template<class> class Arithmetic>
-	smart_refctd_ptr<IGPUComputePipeline> createBenchmarkPipelines(const smart_refctd_ptr<const ICPUShader>&source, const uint32_t elementCount, const uint8_t subgroupSizeLog2, const uint32_t workgroupSize, uint32_t itemsPerInvoc = 1u)
+	smart_refctd_ptr<IGPUComputePipeline> createBenchmarkPipelines(const smart_refctd_ptr<const ICPUShader>&source, const uint32_t elementCount, const uint8_t subgroupSizeLog2, const uint32_t workgroupSize, uint32_t itemsPerInvoc = 1u, uint32_t numLoops = 8u)
 	{
 		std::string arith_name = Arithmetic<bit_xor<uint32_t>>::name;	// TODO all operations
 
-		smart_refctd_ptr<ICPUShader> overridenUnspecialized = CHLSLCompiler::createOverridenCopy(
-			source.get(), "#define OPERATION %s\n#define WORKGROUP_SIZE %d\n#define ITEMS_PER_INVOCATION %d\n#define SUBGROUP_SIZE_LOG2 %d\n",
-			(("subgroup2::") + arith_name).c_str(), workgroupSize, itemsPerInvoc, subgroupSizeLog2
-		);
+		//smart_refctd_ptr<ICPUShader> overridenUnspecialized = CHLSLCompiler::createOverridenCopy(
+		//	source.get(), "#define OPERATION %s\n#define WORKGROUP_SIZE %d\n#define ITEMS_PER_INVOCATION %d\n#define SUBGROUP_SIZE_LOG2 %d\n",
+		//	(("subgroup2::") + arith_name).c_str(), workgroupSize, itemsPerInvoc, subgroupSizeLog2
+		//);
+
+		auto compiler = make_smart_refctd_ptr<asset::CHLSLCompiler>(smart_refctd_ptr(m_system));
+		CHLSLCompiler::SOptions options = {};
+		options.stage = IShader::E_SHADER_STAGE::ESS_COMPUTE;
+		options.targetSpirvVersion = m_device->getPhysicalDevice()->getLimits().spirvVersion;
+		options.spirvOptimizer = nullptr;
+//#ifndef _NBL_DEBUG
+//		ISPIRVOptimizer::E_OPTIMIZER_PASS optPasses = ISPIRVOptimizer::EOP_STRIP_DEBUG_INFO;
+//		auto opt = make_smart_refctd_ptr<ISPIRVOptimizer>(std::span<ISPIRVOptimizer::E_OPTIMIZER_PASS>(&optPasses, 1));
+//		options.spirvOptimizer = opt.get();
+//#endif
+		options.debugInfoFlags |= IShaderCompiler::E_DEBUG_INFO_FLAGS::EDIF_LINE_BIT;
+		options.preprocessorOptions.sourceIdentifier = source->getFilepathHint();
+		options.preprocessorOptions.logger = m_logger.get();
+
+		auto* includeFinder = compiler->getDefaultIncludeFinder();
+		includeFinder->addSearchPath("nbl/builtin/hlsl/jit", core::make_smart_refctd_ptr<CJITIncludeLoader>(m_physicalDevice->getLimits(), m_device->getEnabledFeatures()));
+		options.preprocessorOptions.includeFinder = includeFinder;
+
+		const std::string definitions[5] = { 
+			"subgroup2::" + arith_name,
+			std::to_string(workgroupSize),
+			std::to_string(itemsPerInvoc),
+			std::to_string(subgroupSizeLog2),
+			std::to_string(numLoops)
+		};
+
+		const IShaderCompiler::SMacroDefinition defines[5] = {
+			{ "OPERATION", definitions[0] },
+			{ "WORKGROUP_SIZE", definitions[1] },
+			{ "ITEMS_PER_INVOCATION", definitions[2] },
+			{ "SUBGROUP_SIZE_LOG2", definitions[3] },
+			{ "NUM_LOOPS", definitions[4] },
+		};
+		options.preprocessorOptions.extraDefines = { defines, defines + 5 };
+
+		smart_refctd_ptr<ICPUShader> overridenUnspecialized = compiler->compileToSPIRV((const char*)source->getContent()->getPointer(), options);
+
 		return createPipeline(overridenUnspecialized.get(), subgroupSizeLog2);
 	};
 

From 8c9d55e6233d0f50f99403835e100b8aba799bca Mon Sep 17 00:00:00 2001
From: keptsecret <sorchon@gmail.com>
Date: Tue, 8 Apr 2025 17:02:09 +0700
Subject: [PATCH 151/529] support for 1 item per invoc

---
 .../app_resources/shaderCommon.hlsl           |   6 +
 73_ArithmeticBench/main.cpp                   | 111 ++++++++----------
 2 files changed, 58 insertions(+), 59 deletions(-)

diff --git a/73_ArithmeticBench/app_resources/shaderCommon.hlsl b/73_ArithmeticBench/app_resources/shaderCommon.hlsl
index fa3713c44..7d25b98ee 100644
--- a/73_ArithmeticBench/app_resources/shaderCommon.hlsl
+++ b/73_ArithmeticBench/app_resources/shaderCommon.hlsl
@@ -2,6 +2,7 @@
 
 #include "nbl/builtin/hlsl/glsl_compat/core.hlsl"
 #include "nbl/builtin/hlsl/subgroup/basic.hlsl"
+#include "nbl/builtin/hlsl/subgroup/arithmetic_portability.hlsl"
 #include "nbl/builtin/hlsl/subgroup2/arithmetic_portability.hlsl"
 
 #include "nbl/builtin/hlsl/jit/device_capabilities.hlsl"
@@ -25,7 +26,12 @@ bool canStore();
 //typedef decltype(inputValue[0]) type_t;
 //typedef uint32_t type_t;
 //typedef uint32_t4 type_t;
+
+#if ITEMS_PER_INVOCATION > 1
 typedef vector<uint32_t, ITEMS_PER_INVOCATION> type_t;
+#else
+typedef uint32_t type_t;
+#endif
 
 
 #ifndef OPERATION
diff --git a/73_ArithmeticBench/main.cpp b/73_ArithmeticBench/main.cpp
index 29a38d2eb..276efbd18 100644
--- a/73_ArithmeticBench/main.cpp
+++ b/73_ArithmeticBench/main.cpp
@@ -214,17 +214,18 @@ class ArithmeticBenchApp final : public examples::SimpleWindowedApplication, pub
 			testPplnLayout = m_device->createPipelineLayout({}, std::move(dsLayout));
 
 
+			smart_refctd_ptr<IGPUDescriptorSetLayout> benchLayout;
 			{
 				IGPUDescriptorSetLayout::SBinding binding[3];
 				for (uint32_t i = 0u; i < 2; i++)
 					binding[i] = { {},i,IDescriptor::E_TYPE::ET_STORAGE_BUFFER,IGPUDescriptorSetLayout::SBinding::E_CREATE_FLAGS::ECF_NONE,IShader::E_SHADER_STAGE::ESS_COMPUTE,1u,nullptr };
 				binding[1].count = OutputBufferCount;
 				binding[2] = { {},2,IDescriptor::E_TYPE::ET_STORAGE_IMAGE,IGPUDescriptorSetLayout::SBinding::E_CREATE_FLAGS::ECF_UPDATE_AFTER_BIND_BIT,IShader::E_SHADER_STAGE::ESS_COMPUTE,1u,nullptr };
-				dsLayout = m_device->createDescriptorSetLayout(binding);
+				benchLayout = m_device->createDescriptorSetLayout(binding);
 			}
 
-			benchPool = m_device->createDescriptorPoolForDSLayouts(IDescriptorPool::ECF_UPDATE_AFTER_BIND_BIT, { &dsLayout.get(),1 });
-			benchDs = benchPool->createDescriptorSet(smart_refctd_ptr(dsLayout));
+			benchPool = m_device->createDescriptorPoolForDSLayouts(IDescriptorPool::ECF_UPDATE_AFTER_BIND_BIT, { &benchLayout.get(),1 });
+			benchDs = benchPool->createDescriptorSet(smart_refctd_ptr(benchLayout));
 			{
 				IGPUDescriptorSet::SDescriptorInfo infos[1 + OutputBufferCount];
 				infos[0].desc = gpuinputDataBuffer;
@@ -239,12 +240,12 @@ class ArithmeticBenchApp final : public examples::SimpleWindowedApplication, pub
 
 				IGPUDescriptorSet::SWriteDescriptorSet writes[2];
 				for (uint32_t i = 0u; i < 2; i++)
-					writes[i] = { testDs.get(),i,0u,1u,infos + i };
+					writes[i] = { benchDs.get(),i,0u,1u,infos + i };
 				writes[1].count = OutputBufferCount;
 
 				m_device->updateDescriptorSets(2, writes, 0u, nullptr);
 			}
-			benchPplnLayout = m_device->createPipelineLayout({}, std::move(dsLayout));
+			benchPplnLayout = m_device->createPipelineLayout({}, std::move(benchLayout));
 		}
 
 		const auto spirv_isa_cache_path = localOutputCWD/"spirv_isa_cache.bin";
@@ -313,7 +314,6 @@ class ArithmeticBenchApp final : public examples::SimpleWindowedApplication, pub
 		}
 		
 		// TODO variable items per invocation?
-		const uint32_t ItemsPerInvocation = 4u;
 		const uint32_t NumLoops = 1000u;
 		const std::array<uint32_t, 3> workgroupSizes = { 256, 512, 1024 };
 		// const auto MaxWorkgroupSize = m_physicalDevice->getLimits().maxComputeWorkGroupInvocations;
@@ -328,33 +328,9 @@ class ArithmeticBenchApp final : public examples::SimpleWindowedApplication, pub
 			m_logger->log("Fail Count: %u", ILogger::ELL_INFO, totalFailCount);
 		}
 
-		// for each variant, workgroup size etc.
-		benchPipeline = createBenchmarkPipelines<emulatedReduction>(subgroupBenchSource, elementCount, hlsl::findMSB(MinSubgroupSize), workgroupSizes[0], ItemsPerInvocation, NumLoops);
-
-		//for (auto subgroupSize = MinSubgroupSize; subgroupSize <= MaxSubgroupSize; subgroupSize *= 2u)
-		//{
-		//	const uint8_t subgroupSizeLog2 = hlsl::findMSB(subgroupSize);
-		//	for (const auto& workgroupSize : workgroupSizes)
-		//	{
-		//		passed = runBenchmark<emulatedReduction>(subgroupTestSource, queryPool, elementCount, subgroupSizeLog2, workgroupSize, ItemsPerInvocation, NumLoops) && passed;
-		//		logTestOutcome(passed, workgroupSize);
-		//		passed = runBenchmark<emulatedScanInclusive>(subgroupTestSource, elementCount, subgroupSizeLog2, workgroupSize, ItemsPerInvocation, NumLoops) && passed;
-		//		logTestOutcome(passed, workgroupSize);
-		//		passed = runBenchmark<emulatedScanExclusive>(subgroupTestSource, elementCount, subgroupSizeLog2, workgroupSize, ItemsPerInvocation, NumLoops) && passed;
-		//		logTestOutcome(passed, workgroupSize);
-
-		//		// save cache every now and then	
-		//		{
-		//			auto cpu = m_spirv_isa_cache->convertToCPUCache();
-		//			// Normally we'd beautifully JSON serialize the thing, allow multiple devices & drivers + metadata
-		//			auto bin = cpu->getEntries().begin()->second.bin;
-		//			IFile::success_t success;
-		//			m_spirv_isa_cache_output->write(success, bin->data(), 0ull, bin->size());
-		//			if (!success)
-		//				logFail("Could not write Create SPIR-V to ISA cache to disk!");
-		//		}
-		//	}
-		//}
+		// for each workgroup size (manually adjust items per invoc, operation else uses up a lot of ram)
+		for (uint32_t i = 0; i < workgroupSizes.size(); i++)
+			benchSets[i] = createBenchmarkPipelines<emulatedReduction>(subgroupBenchSource, benchPplnLayout.get(), elementCount, hlsl::findMSB(MinSubgroupSize), workgroupSizes[i], ItemsPerInvocation, NumLoops);
 
 		m_winMgr->show(m_window.get());
 
@@ -447,9 +423,6 @@ class ArithmeticBenchApp final : public examples::SimpleWindowedApplication, pub
 		m_device->updateDescriptorSets(1u, dsWrites, 0u, nullptr);
 
 		const uint32_t elementCount = Output<>::ScanElementCount;
-		const uint32_t ItemsPerInvocation = 4u;
-		const uint32_t NumLoops = 100000u;
-		const std::array<uint32_t, 3> workgroupSizes = { 256, 512, 1024 };
 		// const auto MaxWorkgroupSize = m_physicalDevice->getLimits().maxComputeWorkGroupInvocations;
 		const auto MinSubgroupSize = m_physicalDevice->getLimits().minSubgroupSize;
 		const auto MaxSubgroupSize = m_physicalDevice->getLimits().maxSubgroupSize;
@@ -467,8 +440,10 @@ class ArithmeticBenchApp final : public examples::SimpleWindowedApplication, pub
 		//	auto endTime = std::chrono::high_resolution_clock::now();
 		//}
 
-		double time = runBenchmark<emulatedReduction>(cmdbuf, benchPipeline, elementCount, 5, 256, ItemsPerInvocation, NumLoops);
-		m_logger->log("Ran for %.3fms (disregard these numbers, profile in Nsight)", ILogger::ELL_INFO, time * 1000.0);
+		double t0 = runBenchmark<emulatedReduction>(cmdbuf, benchSets[0], elementCount, 5);
+		double t1 = runBenchmark<emulatedReduction>(cmdbuf, benchSets[1], elementCount, 5);
+		double t2 = runBenchmark<emulatedReduction>(cmdbuf, benchSets[2], elementCount, 5);
+		m_logger->log("Ran for %.3fms (disregard these numbers, profile in Nsight)", ILogger::ELL_INFO, t0 * 1000.0);
 
 
 		// blit
@@ -639,7 +614,7 @@ class ArithmeticBenchApp final : public examples::SimpleWindowedApplication, pub
 		}
 	}
 
-	void runTests(IGPUCommandBuffer* cmdbuf, smart_refctd_ptr<ICPUShader> subgroupTestSource, uint32_t elementCount, uint32_t ItemsPerInvocation, uint32_t MinSubgroupSize, uint32_t MaxSubgroupSize, const std::array<uint32_t, 3>& workgroupSizes)
+	void runTests(IGPUCommandBuffer* cmdbuf, smart_refctd_ptr<ICPUShader> subgroupTestSource, uint32_t elementCount, uint32_t itemsPerInvocation, uint32_t MinSubgroupSize, uint32_t MaxSubgroupSize, const std::array<uint32_t, 3>& workgroupSizes)
 	{
 		for (auto subgroupSize = MinSubgroupSize; subgroupSize <= MaxSubgroupSize; subgroupSize *= 2u)
 		{
@@ -652,11 +627,11 @@ class ArithmeticBenchApp final : public examples::SimpleWindowedApplication, pub
 
 				bool passed = true;
 				// TODO async the testing
-				passed = runTest<emulatedReduction, false>(cmdbuf, subgroupTestSource, elementCount, subgroupSizeLog2, workgroupSize, ~0u, ItemsPerInvocation) && passed;
+				passed = runTest<emulatedReduction, false>(cmdbuf, subgroupTestSource, elementCount, subgroupSizeLog2, workgroupSize, ~0u, itemsPerInvocation) && passed;
 				logTestOutcome(passed, workgroupSize);
-				passed = runTest<emulatedScanInclusive, false>(cmdbuf, subgroupTestSource, elementCount, subgroupSizeLog2, workgroupSize, ~0u, ItemsPerInvocation) && passed;
+				passed = runTest<emulatedScanInclusive, false>(cmdbuf, subgroupTestSource, elementCount, subgroupSizeLog2, workgroupSize, ~0u, itemsPerInvocation) && passed;
 				logTestOutcome(passed, workgroupSize);
-				passed = runTest<emulatedScanExclusive, false>(cmdbuf, subgroupTestSource, elementCount, subgroupSizeLog2, workgroupSize, ~0u, ItemsPerInvocation) && passed;
+				passed = runTest<emulatedScanExclusive, false>(cmdbuf, subgroupTestSource, elementCount, subgroupSizeLog2, workgroupSize, ~0u, itemsPerInvocation) && passed;
 				logTestOutcome(passed, workgroupSize);
 				//for (uint32_t itemsPerWG = workgroupSize; itemsPerWG > workgroupSize - subgroupSize; itemsPerWG--)
 				//{
@@ -685,11 +660,11 @@ class ArithmeticBenchApp final : public examples::SimpleWindowedApplication, pub
 	}
 
 	// create pipeline (specialized every test) [TODO: turn into a future/async]
-	smart_refctd_ptr<IGPUComputePipeline> createPipeline(const ICPUShader* overridenUnspecialized, const uint8_t subgroupSizeLog2)
+	smart_refctd_ptr<IGPUComputePipeline> createPipeline(const ICPUShader* overridenUnspecialized, const IGPUPipelineLayout* layout, const uint8_t subgroupSizeLog2)
 	{
 		auto shader = m_device->createShader(overridenUnspecialized);
 		IGPUComputePipeline::SCreationParams params = {};
-		params.layout = testPplnLayout.get();
+		params.layout = layout;
 		params.shader = {
 			.entryPoint = "main",
 			.shader = shader.get(),
@@ -703,8 +678,15 @@ class ArithmeticBenchApp final : public examples::SimpleWindowedApplication, pub
 		return pipeline;
 	}
 
+	struct BenchmarkSet
+	{
+		smart_refctd_ptr<IGPUComputePipeline> pipeline;
+		uint32_t workgroupSize;
+		uint32_t itemsPerInvocation;
+	};
+
 	template<template<class> class Arithmetic>
-	smart_refctd_ptr<IGPUComputePipeline> createBenchmarkPipelines(const smart_refctd_ptr<const ICPUShader>&source, const uint32_t elementCount, const uint8_t subgroupSizeLog2, const uint32_t workgroupSize, uint32_t itemsPerInvoc = 1u, uint32_t numLoops = 8u)
+	BenchmarkSet createBenchmarkPipelines(const smart_refctd_ptr<const ICPUShader>&source, const IGPUPipelineLayout* layout, const uint32_t elementCount, const uint8_t subgroupSizeLog2, const uint32_t workgroupSize, uint32_t itemsPerInvoc = 1u, uint32_t numLoops = 8u)
 	{
 		std::string arith_name = Arithmetic<bit_xor<uint32_t>>::name;	// TODO all operations
 
@@ -731,26 +713,35 @@ class ArithmeticBenchApp final : public examples::SimpleWindowedApplication, pub
 		includeFinder->addSearchPath("nbl/builtin/hlsl/jit", core::make_smart_refctd_ptr<CJITIncludeLoader>(m_physicalDevice->getLimits(), m_device->getEnabledFeatures()));
 		options.preprocessorOptions.includeFinder = includeFinder;
 
-		const std::string definitions[5] = { 
+		const std::string definitions[6] = { 
 			"subgroup2::" + arith_name,
+			"subgroup::" + arith_name,
 			std::to_string(workgroupSize),
 			std::to_string(itemsPerInvoc),
 			std::to_string(subgroupSizeLog2),
 			std::to_string(numLoops)
 		};
 
-		const IShaderCompiler::SMacroDefinition defines[5] = {
-			{ "OPERATION", definitions[0] },
-			{ "WORKGROUP_SIZE", definitions[1] },
-			{ "ITEMS_PER_INVOCATION", definitions[2] },
-			{ "SUBGROUP_SIZE_LOG2", definitions[3] },
-			{ "NUM_LOOPS", definitions[4] },
+		const IShaderCompiler::SMacroDefinition defines[6] = {
+			{ "OPERATION", ItemsPerInvocation > 1 ? definitions[0] : definitions[1] },
+			{ "WORKGROUP_SIZE", definitions[2] },
+			{ "ITEMS_PER_INVOCATION", definitions[3] },
+			{ "SUBGROUP_SIZE_LOG2", definitions[4] },
+			{ "NUM_LOOPS", definitions[5] },
 		};
-		options.preprocessorOptions.extraDefines = { defines, defines + 5 };
+		//if (b_useOldSubgroups)
+		//	options.preprocessorOptions.extraDefines = { defines, defines + 6 };
+		//else
+			options.preprocessorOptions.extraDefines = { defines, defines + 5 };
 
 		smart_refctd_ptr<ICPUShader> overridenUnspecialized = compiler->compileToSPIRV((const char*)source->getContent()->getPointer(), options);
 
-		return createPipeline(overridenUnspecialized.get(), subgroupSizeLog2);
+		BenchmarkSet set;
+		set.pipeline = createPipeline(overridenUnspecialized.get(), layout, subgroupSizeLog2);
+		set.workgroupSize = workgroupSize;
+		set.itemsPerInvocation = itemsPerInvoc;
+
+		return set;
 	};
 
 	template<template<class> class Arithmetic, bool WorkgroupTest>
@@ -774,7 +765,7 @@ class ArithmeticBenchApp final : public examples::SimpleWindowedApplication, pub
 				(("subgroup2::") + arith_name).c_str(), workgroupSize, itemsPerInvoc, subgroupSizeLog2
 			);
 		//}
-		auto pipeline = createPipeline(overridenUnspecialized.get(),subgroupSizeLog2);
+		auto pipeline = createPipeline(overridenUnspecialized.get(),testPplnLayout.get(), subgroupSizeLog2);
 
 		// TODO: overlap dispatches with memory readbacks (requires multiple copies of `buffers`)
 		const uint32_t workgroupCount = elementCount / (itemsPerWG * itemsPerInvoc);
@@ -900,12 +891,12 @@ class ArithmeticBenchApp final : public examples::SimpleWindowedApplication, pub
 
 
 	template<template<class> class Arithmetic>
-	bool runBenchmark(IGPUCommandBuffer* cmdbuf, const smart_refctd_ptr<IGPUComputePipeline>& pipeline, const uint32_t elementCount, const uint8_t subgroupSizeLog2, const uint32_t workgroupSize, uint32_t itemsPerInvoc = 1u, uint32_t numLoops = 8u)
+	bool runBenchmark(IGPUCommandBuffer* cmdbuf, const BenchmarkSet& set, const uint32_t elementCount, const uint8_t subgroupSizeLog2)
 	{
-		const uint32_t workgroupCount = elementCount / (workgroupSize * itemsPerInvoc);
+		const uint32_t workgroupCount = elementCount / (set.workgroupSize * set.itemsPerInvocation);
 
-		cmdbuf->bindComputePipeline(pipeline.get());
-		cmdbuf->bindDescriptorSets(EPBP_COMPUTE, pipeline->getLayout(), 0u, 1u, &testDs.get());
+		cmdbuf->bindComputePipeline(set.pipeline.get());
+		cmdbuf->bindDescriptorSets(EPBP_COMPUTE, set.pipeline->getLayout(), 0u, 1u, &benchDs.get());
 		cmdbuf->dispatch(workgroupCount, 1, 1);
 		{
 			IGPUCommandBuffer::SPipelineBarrierDependencyInfo::buffer_barrier_t memoryBarrier[OutputBufferCount];
@@ -947,6 +938,7 @@ class ArithmeticBenchApp final : public examples::SimpleWindowedApplication, pub
 
 	smart_refctd_ptr<IGPUImage> dummyImg;
 
+	std::array<BenchmarkSet, 3> benchSets;
 	smart_refctd_ptr<IGPUComputePipeline> benchPipeline;	// TODO array
 	smart_refctd_ptr<IDescriptorPool> benchPool;
 	smart_refctd_ptr<IGPUDescriptorSet> benchDs;
@@ -959,6 +951,7 @@ class ArithmeticBenchApp final : public examples::SimpleWindowedApplication, pub
 
 	bool b_runTests = false;
 	uint32_t* inputData = nullptr;
+	uint32_t ItemsPerInvocation = 4u;
 	constexpr static inline uint32_t OutputBufferCount = 8u;
 	smart_refctd_ptr<IGPUBuffer> outputBuffers[OutputBufferCount];
 

From 06f72c50cd2b575741faa61ad9624f688817f41c Mon Sep 17 00:00:00 2001
From: Przemek <minikers21@gmail.com>
Date: Tue, 8 Apr 2025 15:50:35 +0200
Subject: [PATCH 152/529] Fixed anti aliasing

---
 62_CAD/main.cpp                               | 19 +++++++-
 .../main_pipeline/fragment_shader.hlsl        | 43 ++++++++++++++++++-
 .../shaders/main_pipeline/vertex_shader.hlsl  | 13 +++++-
 3 files changed, 70 insertions(+), 5 deletions(-)

diff --git a/62_CAD/main.cpp b/62_CAD/main.cpp
index 791c8fc04..b49dc56d2 100644
--- a/62_CAD/main.cpp
+++ b/62_CAD/main.cpp
@@ -75,7 +75,7 @@ constexpr std::array<float, (uint32_t)ExampleMode::CASE_COUNT> cameraExtents =
 	600.0	// CASE_9
 };
 
-constexpr ExampleMode mode = ExampleMode::CASE_6;
+constexpr ExampleMode mode = ExampleMode::CASE_9;
 
 class Camera2D
 {
@@ -3156,6 +3156,8 @@ class ComputerAidedDesign final : public examples::SimpleWindowedApplication, pu
 		}
 		else if (mode == ExampleMode::CASE_9)
 		{
+			// GRID
+
 			/*core::vector<TriangleMeshVertex> vertices = {
 				{ float32_t2(-200.0f, -200.0f), 10.0f },
 				{ float32_t2(-50.0f, -200.0f), 50.0f },
@@ -3186,6 +3188,8 @@ class ComputerAidedDesign final : public examples::SimpleWindowedApplication, pu
 				8, 10, 11
 			};*/
 
+			// PYRAMID
+
 			core::vector<TriangleMeshVertex> vertices = {
 				{ float32_t2(0.0, 0.0), 100.0 },
 				{ float32_t2(-200.0, -200.0), 10.0 },
@@ -3201,6 +3205,17 @@ class ComputerAidedDesign final : public examples::SimpleWindowedApplication, pu
 				0, 4, 1
 			};
 
+			// SINGLE TRIANGLE
+			/*core::vector<TriangleMeshVertex> vertices = {
+				{ float32_t2(0.0, 0.0), -20.0 },
+				{ float32_t2(200.0, 200.0), 100.0 },
+				{ float32_t2(200.0, -200.0), 80.0 }
+			};
+
+			core::vector<uint32_t> indices = {
+				0, 1, 2
+			};*/
+
 			CTriangleMesh mesh;
 			mesh.setVertices(std::move(vertices));
 			mesh.setIndices(std::move(indices));
@@ -3240,7 +3255,7 @@ class ComputerAidedDesign final : public examples::SimpleWindowedApplication, pu
 					//dtmSettingsInfo.addHeightColorMapEntry(25.0f, float32_t4(1.0f, 1.0f, 0.0f, animatedAlpha));
 					dtmSettingsInfo.addHeightColorMapEntry(25.0f, float32_t4(1.0f, 1.0f, 0.0f, 1.0f));
 					dtmSettingsInfo.addHeightColorMapEntry(70.0f, float32_t4(1.0f, 0.0f, 0.0f, 1.0f));
-					dtmSettingsInfo.addHeightColorMapEntry(90.0f, float32_t4(1.0f, 1.0f, 1.0f, 1.0f));
+					dtmSettingsInfo.addHeightColorMapEntry(90.0f, float32_t4(1.0f, 0.0f, 0.0f, 1.0f));
 					break;
 				}
 				case DTMSettingsInfo::E_HEIGHT_SHADING_MODE::DISCRETE_FIXED_LENGTH_INTERVALS:
diff --git a/62_CAD/shaders/main_pipeline/fragment_shader.hlsl b/62_CAD/shaders/main_pipeline/fragment_shader.hlsl
index 293943e64..5d5d464cc 100644
--- a/62_CAD/shaders/main_pipeline/fragment_shader.hlsl
+++ b/62_CAD/shaders/main_pipeline/fragment_shader.hlsl
@@ -415,6 +415,11 @@ float32_t4 calculateFinalColor<true>(const uint2 fragCoord, const float localAlp
     return color;
 }
 
+float dot2(in float2 vec)
+{
+    return dot(vec, vec);
+}
+
 [[vk::spvexecutionmode(spv::ExecutionModePixelInterlockOrderedEXT)]]
 [shader("pixel")]
 float4 fragMain(PSInput input) : SV_TARGET
@@ -426,7 +431,7 @@ float4 fragMain(PSInput input) : SV_TARGET
     const uint32_t currentMainObjectIdx = input.getMainObjectIdx();
     const MainObject mainObj = loadMainObject(currentMainObjectIdx);
     
-//#define DTM
+#define DTM
 #ifdef DTM
     // TRIANGLE RENDERING
     {
@@ -468,6 +473,35 @@ float4 fragMain(PSInput input) : SV_TARGET
 
         if (heightMapSize > 0)
         {
+            // partially based on https://www.shadertoy.com/view/XsXSz4 by Inigo Quilez
+            float2 e0 = v[1] - v[0];
+            float2 e1 = v[2] - v[1];
+            float2 e2 = v[0] - v[2];
+            
+            float triangleAreaSign = -sign(e0.x * e2.y - e0.y * e2.x);
+            float2 v0 = input.position.xy - v[0];
+            float2 v1 = input.position.xy - v[1];
+            float2 v2 = input.position.xy - v[2];
+
+            float distanceToLine0 = sqrt(dot2(v0 - e0 * dot(v0, e0) / dot(e0, e0)));
+            float distanceToLine1 = sqrt(dot2(v1 - e1 * dot(v1, e1) / dot(e1, e1)));
+            float distanceToLine2 = sqrt(dot2(v2 - e2 * dot(v2, e2) / dot(e2, e2)));
+
+            float line0Sdf = distanceToLine0 * triangleAreaSign * (v0.x * e0.y - v0.y * e0.x);
+            float line1Sdf = distanceToLine1 * triangleAreaSign * (v1.x * e1.y - v1.y * e1.x);
+            float line2Sdf = distanceToLine2 * triangleAreaSign * (v2.x * e2.y - v2.y * e2.x);
+            float heightDeriv = fwidth(height);
+            float line3Sdf = (minShadingHeight - height) / heightDeriv;
+            float line4Sdf = (height - maxShadingHeight) / heightDeriv;
+
+            float convexPolygonSdf = max(line0Sdf, line1Sdf);
+            convexPolygonSdf = max(convexPolygonSdf, line2Sdf);
+            convexPolygonSdf = max(convexPolygonSdf, line3Sdf);
+            convexPolygonSdf = max(convexPolygonSdf, line4Sdf);
+
+            localAlpha = 1.0f - smoothstep(0.0f, globals.antiAliasingFactor * 2.0f, convexPolygonSdf);
+
+            // calculate height color
             DTMSettings::E_HEIGHT_SHADING_MODE mode = dtm.determineHeightShadingMode();
 
             if(mode == DTMSettings::E_HEIGHT_SHADING_MODE::DISCRETE_VARIABLE_LENGTH_INTERVALS)
@@ -510,7 +544,7 @@ float4 fragMain(PSInput input) : SV_TARGET
                     }
                 }
 
-                localAlpha = dtm.heightColorMapColors[mapIndex].a;
+                //localAlpha = dtm.heightColorMapColors[mapIndex].a;
             }
             else
             {
@@ -546,6 +580,11 @@ float4 fragMain(PSInput input) : SV_TARGET
                 localAlpha = lerp(lowerBoundColor.a, upperBoundColor.a, interpolationVal);;
             }
         }
+        //else // TODO: remove!!
+        //{
+        //    printf("WTF");
+        //    return float4(0.0f, 0.0f, 0.0f, 1.0f);
+        //}
 
         // CONTOUR
 
diff --git a/62_CAD/shaders/main_pipeline/vertex_shader.hlsl b/62_CAD/shaders/main_pipeline/vertex_shader.hlsl
index d45eac46f..4a955d92d 100644
--- a/62_CAD/shaders/main_pipeline/vertex_shader.hlsl
+++ b/62_CAD/shaders/main_pipeline/vertex_shader.hlsl
@@ -85,7 +85,7 @@ PSInput main(uint vertexID : SV_VertexID)
     // ~~Later, most likely We will require pulling all 3 vertices of the triangle, that's where you need to know which triangle you're currently on, and instead of objectID = vertexID/4 which we currently do, you will do vertexID/3 and pull all 3 of it's vertices.~~
     // Ok, brainfart, a vertex can belong to multiple triangles, I was thinking of AA but triangles share vertices, nevermind my comment above.
     
-//#define DTM
+#define DTM
 #ifdef DTM
     PSInput outV;
 
@@ -130,6 +130,17 @@ PSInput main(uint vertexID : SV_VertexID)
     outV.setOutlineThickness(sdfOutlineThickness);
     outV.setContourLineThickness(sdfContourLineThickness);
 
+    // full screen triangle (this will destroy outline, contour line and height drawing)
+#if 0
+    const uint vertexIdx = vertexID % 3;
+    if(vertexIdx == 0)
+        outV.position.xy = float2(-1.0f, -1.0f);
+    else if (vertexIdx == 1)
+        outV.position.xy = float2(-1.0f, 3.0f);
+    else if (vertexIdx == 2)
+        outV.position.xy = float2(3.0f, -1.0f);
+#endif
+    
     return outV;
 
 #else

From 07d6980cacbc8e646de2e622405a365dc47dd961 Mon Sep 17 00:00:00 2001
From: keptsecret <sorchon@gmail.com>
Date: Wed, 9 Apr 2025 14:48:31 +0700
Subject: [PATCH 153/529] handle when items per invoc =1

---
 .../app_resources/benchmarkSubgroup.comp.hlsl |  6 ++-
 .../app_resources/shaderCommon.hlsl           |  4 ++
 73_ArithmeticBench/main.cpp                   | 44 ++++++-------------
 3 files changed, 23 insertions(+), 31 deletions(-)

diff --git a/73_ArithmeticBench/app_resources/benchmarkSubgroup.comp.hlsl b/73_ArithmeticBench/app_resources/benchmarkSubgroup.comp.hlsl
index 2815d1e38..fb9f5e8c7 100644
--- a/73_ArithmeticBench/app_resources/benchmarkSubgroup.comp.hlsl
+++ b/73_ArithmeticBench/app_resources/benchmarkSubgroup.comp.hlsl
@@ -42,7 +42,7 @@ static void subbench(NBL_CONST_REF_ARG(type_t) sourceVal)
     type_t value = sourceVal;
 
     operation_t<params_t> func;
-    [unroll]
+    // [unroll]
     for (uint32_t i = 0; i < NUM_LOOPS; i++)
         value = func(value);
 
@@ -53,11 +53,15 @@ void benchmark()
 {
     const uint32_t idx = globalIndex() * ITEMS_PER_INVOCATION;
     type_t sourceVal;
+#if ITEMS_PER_INVOCATION > 1
     [unroll]
     for (uint32_t i = 0; i < ITEMS_PER_INVOCATION; i++)
     {
         sourceVal[i] = inputValue[idx + i];
     }
+#else
+    sourceVal = inputValue[idx];
+#endif
 
     subbench<bit_and, uint32_t, ITEMS_PER_INVOCATION>(sourceVal);
     subbench<bit_xor, uint32_t, ITEMS_PER_INVOCATION>(sourceVal);
diff --git a/73_ArithmeticBench/app_resources/shaderCommon.hlsl b/73_ArithmeticBench/app_resources/shaderCommon.hlsl
index 7d25b98ee..5cb1f3cf1 100644
--- a/73_ArithmeticBench/app_resources/shaderCommon.hlsl
+++ b/73_ArithmeticBench/app_resources/shaderCommon.hlsl
@@ -62,11 +62,15 @@ type_t test()
 {
     const uint32_t idx = globalIndex() * ITEMS_PER_INVOCATION;
     type_t sourceVal;
+#if ITEMS_PER_INVOCATION > 1
     [unroll]
     for (uint32_t i = 0; i < ITEMS_PER_INVOCATION; i++)
     {
         sourceVal[i] = inputValue[idx + i];
     }
+#else
+    sourceVal = inputValue[idx];
+#endif
 
     subtest<bit_and, uint32_t, ITEMS_PER_INVOCATION>(sourceVal);
     subtest<bit_xor, uint32_t, ITEMS_PER_INVOCATION>(sourceVal);
diff --git a/73_ArithmeticBench/main.cpp b/73_ArithmeticBench/main.cpp
index 276efbd18..5ddd0cf6b 100644
--- a/73_ArithmeticBench/main.cpp
+++ b/73_ArithmeticBench/main.cpp
@@ -330,7 +330,7 @@ class ArithmeticBenchApp final : public examples::SimpleWindowedApplication, pub
 
 		// for each workgroup size (manually adjust items per invoc, operation else uses up a lot of ram)
 		for (uint32_t i = 0; i < workgroupSizes.size(); i++)
-			benchSets[i] = createBenchmarkPipelines<emulatedReduction>(subgroupBenchSource, benchPplnLayout.get(), elementCount, hlsl::findMSB(MinSubgroupSize), workgroupSizes[i], ItemsPerInvocation, NumLoops);
+			benchSets[i] = createBenchmarkPipelines<emulatedScanInclusive>(subgroupBenchSource, benchPplnLayout.get(), elementCount, hlsl::findMSB(MinSubgroupSize), workgroupSizes[i], ItemsPerInvocation, NumLoops);
 
 		m_winMgr->show(m_window.get());
 
@@ -423,27 +423,15 @@ class ArithmeticBenchApp final : public examples::SimpleWindowedApplication, pub
 		m_device->updateDescriptorSets(1u, dsWrites, 0u, nullptr);
 
 		const uint32_t elementCount = Output<>::ScanElementCount;
-		// const auto MaxWorkgroupSize = m_physicalDevice->getLimits().maxComputeWorkGroupInvocations;
 		const auto MinSubgroupSize = m_physicalDevice->getLimits().minSubgroupSize;
 		const auto MaxSubgroupSize = m_physicalDevice->getLimits().maxSubgroupSize;
 
-		//{
-		//	auto startTime = std::chrono::high_resolution_clock::now();
-
-		//	const IQueue::SSubmitInfo::SSemaphoreInfo signal[1] = { {.semaphore = sema.get(),.value = ++timelineValue} };
-		//	const IQueue::SSubmitInfo::SCommandBufferInfo cmdbufs[1] = { {.cmdbuf = cmdbuf.get()} };
-		//	const IQueue::SSubmitInfo submits[1] = { {.commandBuffers = cmdbufs,.signalSemaphores = signal} };
-		//	computeQueue->submit(submits);
-		//	const ISemaphore::SWaitInfo wait[1] = { {.semaphore = sema.get(),.value = timelineValue} };
-		//	m_device->blockForSemaphores(wait);
+		const auto SubgroupSizeLog2 = hlsl::findMSB(MinSubgroupSize);
 
-		//	auto endTime = std::chrono::high_resolution_clock::now();
-		//}
-
-		double t0 = runBenchmark<emulatedReduction>(cmdbuf, benchSets[0], elementCount, 5);
-		double t1 = runBenchmark<emulatedReduction>(cmdbuf, benchSets[1], elementCount, 5);
-		double t2 = runBenchmark<emulatedReduction>(cmdbuf, benchSets[2], elementCount, 5);
-		m_logger->log("Ran for %.3fms (disregard these numbers, profile in Nsight)", ILogger::ELL_INFO, t0 * 1000.0);
+		bool passed = true;
+		passed = runBenchmark<emulatedScanInclusive>(cmdbuf, benchSets[0], elementCount, SubgroupSizeLog2);
+		passed = runBenchmark<emulatedScanInclusive>(cmdbuf, benchSets[1], elementCount, SubgroupSizeLog2);
+		passed = runBenchmark<emulatedScanInclusive>(cmdbuf, benchSets[2], elementCount, SubgroupSizeLog2);
 
 
 		// blit
@@ -713,26 +701,22 @@ class ArithmeticBenchApp final : public examples::SimpleWindowedApplication, pub
 		includeFinder->addSearchPath("nbl/builtin/hlsl/jit", core::make_smart_refctd_ptr<CJITIncludeLoader>(m_physicalDevice->getLimits(), m_device->getEnabledFeatures()));
 		options.preprocessorOptions.includeFinder = includeFinder;
 
-		const std::string definitions[6] = { 
+		const std::string definitions[5] = { 
 			"subgroup2::" + arith_name,
-			"subgroup::" + arith_name,
 			std::to_string(workgroupSize),
 			std::to_string(itemsPerInvoc),
 			std::to_string(subgroupSizeLog2),
 			std::to_string(numLoops)
 		};
 
-		const IShaderCompiler::SMacroDefinition defines[6] = {
-			{ "OPERATION", ItemsPerInvocation > 1 ? definitions[0] : definitions[1] },
-			{ "WORKGROUP_SIZE", definitions[2] },
-			{ "ITEMS_PER_INVOCATION", definitions[3] },
-			{ "SUBGROUP_SIZE_LOG2", definitions[4] },
-			{ "NUM_LOOPS", definitions[5] },
+		const IShaderCompiler::SMacroDefinition defines[5] = {
+			{ "OPERATION", definitions[0] },
+			{ "WORKGROUP_SIZE", definitions[1] },
+			{ "ITEMS_PER_INVOCATION", definitions[2] },
+			{ "SUBGROUP_SIZE_LOG2", definitions[3] },
+			{ "NUM_LOOPS", definitions[4] },
 		};
-		//if (b_useOldSubgroups)
-		//	options.preprocessorOptions.extraDefines = { defines, defines + 6 };
-		//else
-			options.preprocessorOptions.extraDefines = { defines, defines + 5 };
+		options.preprocessorOptions.extraDefines = { defines, defines + 5 };
 
 		smart_refctd_ptr<ICPUShader> overridenUnspecialized = compiler->compileToSPIRV((const char*)source->getContent()->getPointer(), options);
 

From be756d56f66a94e608f43f3ad98c43a6d8557f43 Mon Sep 17 00:00:00 2001
From: keptsecret <sorchon@gmail.com>
Date: Thu, 10 Apr 2025 11:49:19 +0700
Subject: [PATCH 154/529] minor fixes

---
 .../app_resources/benchmarkSubgroup.comp.hlsl   |  1 +
 73_ArithmeticBench/main.cpp                     | 17 ++++++++++-------
 2 files changed, 11 insertions(+), 7 deletions(-)

diff --git a/73_ArithmeticBench/app_resources/benchmarkSubgroup.comp.hlsl b/73_ArithmeticBench/app_resources/benchmarkSubgroup.comp.hlsl
index fb9f5e8c7..0b6a7e3c4 100644
--- a/73_ArithmeticBench/app_resources/benchmarkSubgroup.comp.hlsl
+++ b/73_ArithmeticBench/app_resources/benchmarkSubgroup.comp.hlsl
@@ -4,6 +4,7 @@
 
 #include "shaderCommon.hlsl"
 
+// NOTE added dummy output image to be able to profile with Nsight, which still doesn't support profiling headless compute shaders
 [[vk::binding(2, 0)]] RWTexture2D<float32_t4> outImage; // dummy
 
 uint32_t globalIndex()
diff --git a/73_ArithmeticBench/main.cpp b/73_ArithmeticBench/main.cpp
index 5ddd0cf6b..94983c03c 100644
--- a/73_ArithmeticBench/main.cpp
+++ b/73_ArithmeticBench/main.cpp
@@ -47,6 +47,7 @@ struct emulatedScanExclusive
 	static inline constexpr const char* name = "exclusive_scan";
 };
 
+// NOTE added swapchain + drawing frames to be able to profile with Nsight, which still doesn't support profiling headless compute shaders
 class ArithmeticBenchApp final : public examples::SimpleWindowedApplication, public application_templates::MonoAssetManagerAndBuiltinResourceApplication
 {
 	using device_base_t = examples::SimpleWindowedApplication;
@@ -330,7 +331,7 @@ class ArithmeticBenchApp final : public examples::SimpleWindowedApplication, pub
 
 		// for each workgroup size (manually adjust items per invoc, operation else uses up a lot of ram)
 		for (uint32_t i = 0; i < workgroupSizes.size(); i++)
-			benchSets[i] = createBenchmarkPipelines<emulatedScanInclusive>(subgroupBenchSource, benchPplnLayout.get(), elementCount, hlsl::findMSB(MinSubgroupSize), workgroupSizes[i], ItemsPerInvocation, NumLoops);
+			benchSets[i] = createBenchmarkPipelines<ArithmeticOp>(subgroupBenchSource, benchPplnLayout.get(), elementCount, hlsl::findMSB(MinSubgroupSize), workgroupSizes[i], ItemsPerInvocation, NumLoops);
 
 		m_winMgr->show(m_window.get());
 
@@ -429,9 +430,9 @@ class ArithmeticBenchApp final : public examples::SimpleWindowedApplication, pub
 		const auto SubgroupSizeLog2 = hlsl::findMSB(MinSubgroupSize);
 
 		bool passed = true;
-		passed = runBenchmark<emulatedScanInclusive>(cmdbuf, benchSets[0], elementCount, SubgroupSizeLog2);
-		passed = runBenchmark<emulatedScanInclusive>(cmdbuf, benchSets[1], elementCount, SubgroupSizeLog2);
-		passed = runBenchmark<emulatedScanInclusive>(cmdbuf, benchSets[2], elementCount, SubgroupSizeLog2);
+		passed = runBenchmark(cmdbuf, benchSets[0], elementCount, SubgroupSizeLog2);
+		passed = runBenchmark(cmdbuf, benchSets[1], elementCount, SubgroupSizeLog2);
+		passed = runBenchmark(cmdbuf, benchSets[2], elementCount, SubgroupSizeLog2);
 
 
 		// blit
@@ -676,7 +677,7 @@ class ArithmeticBenchApp final : public examples::SimpleWindowedApplication, pub
 	template<template<class> class Arithmetic>
 	BenchmarkSet createBenchmarkPipelines(const smart_refctd_ptr<const ICPUShader>&source, const IGPUPipelineLayout* layout, const uint32_t elementCount, const uint8_t subgroupSizeLog2, const uint32_t workgroupSize, uint32_t itemsPerInvoc = 1u, uint32_t numLoops = 8u)
 	{
-		std::string arith_name = Arithmetic<bit_xor<uint32_t>>::name;	// TODO all operations
+		std::string arith_name = Arithmetic<plus<uint32_t>>::name;	// TODO all operations
 
 		//smart_refctd_ptr<ICPUShader> overridenUnspecialized = CHLSLCompiler::createOverridenCopy(
 		//	source.get(), "#define OPERATION %s\n#define WORKGROUP_SIZE %d\n#define ITEMS_PER_INVOCATION %d\n#define SUBGROUP_SIZE_LOG2 %d\n",
@@ -874,7 +875,6 @@ class ArithmeticBenchApp final : public examples::SimpleWindowedApplication, pub
 	}
 
 
-	template<template<class> class Arithmetic>
 	bool runBenchmark(IGPUCommandBuffer* cmdbuf, const BenchmarkSet& set, const uint32_t elementCount, const uint8_t subgroupSizeLog2)
 	{
 		const uint32_t workgroupCount = elementCount / (set.workgroupSize * set.itemsPerInvocation);
@@ -933,9 +933,12 @@ class ArithmeticBenchApp final : public examples::SimpleWindowedApplication, pub
 	constexpr static inline uint32_t MaxNumSubmits = 30;
 	uint32_t numSubmits = 0;
 
+	template<class BinOp>
+	using ArithmeticOp = emulatedReduction<BinOp>;	// change this to test other arithmetic ops
+
 	bool b_runTests = false;
 	uint32_t* inputData = nullptr;
-	uint32_t ItemsPerInvocation = 4u;
+	uint32_t ItemsPerInvocation = 1u;
 	constexpr static inline uint32_t OutputBufferCount = 8u;
 	smart_refctd_ptr<IGPUBuffer> outputBuffers[OutputBufferCount];
 

From e4e7f1ec8476ff7d8c3f9bf71002c40c9585b81d Mon Sep 17 00:00:00 2001
From: Erfan Ahmadi <ahmadierfan99@gmail.com>
Date: Thu, 10 Apr 2025 08:44:20 +0330
Subject: [PATCH 155/529] multiple draw calls to allow dtms and linework
 simultaneously.

---
 62_CAD/CTriangleMesh.h                        |    9 -
 62_CAD/DrawResourcesFiller.cpp                |   17 +-
 62_CAD/DrawResourcesFiller.h                  |   44 +-
 62_CAD/main.cpp                               |   86 +-
 62_CAD/shaders/globals.hlsl                   |    1 +
 62_CAD/shaders/main_pipeline/common.hlsl      |    2 +
 .../main_pipeline/fragment_shader.hlsl        | 1941 +++++++++--------
 .../shaders/main_pipeline/vertex_shader.hlsl  |  951 ++++----
 8 files changed, 1554 insertions(+), 1497 deletions(-)

diff --git a/62_CAD/CTriangleMesh.h b/62_CAD/CTriangleMesh.h
index 374fae1b4..6c68cec27 100644
--- a/62_CAD/CTriangleMesh.h
+++ b/62_CAD/CTriangleMesh.h
@@ -69,13 +69,6 @@ class CTriangleMesh final
 	using index_t = uint32_t;
 	using vertex_t = TriangleMeshVertex;
 
-	struct DrawData
-	{
-		PushConstants pushConstants;
-		uint64_t indexBufferOffset;
-		uint64_t indexCount;
-	};
-
 	inline void setVertices(core::vector<vertex_t>&& vertices)
 	{
 		m_vertices = std::move(vertices);
@@ -107,8 +100,6 @@ class CTriangleMesh final
 		return m_indices.size();
 	}
 
-
-private:
 	core::vector<vertex_t> m_vertices;
 	core::vector<index_t> m_indices;
 };
\ No newline at end of file
diff --git a/62_CAD/DrawResourcesFiller.cpp b/62_CAD/DrawResourcesFiller.cpp
index c566de456..d28843a31 100644
--- a/62_CAD/DrawResourcesFiller.cpp
+++ b/62_CAD/DrawResourcesFiller.cpp
@@ -134,13 +134,18 @@ void DrawResourcesFiller::drawPolyline(const CPolylineBase& polyline, SIntendedS
 	}
 }
 
-void DrawResourcesFiller::drawTriangleMesh(const CTriangleMesh& mesh, CTriangleMesh::DrawData& drawData, const DTMSettingsInfo& dtmSettingsInfo, SIntendedSubmitInfo& intendedNextSubmit)
+void DrawResourcesFiller::drawTriangleMesh(const CTriangleMesh& mesh, const DTMSettingsInfo& dtmSettingsInfo, SIntendedSubmitInfo& intendedNextSubmit)
 {
+	flushDrawObjects(); // flushes draw call construction of any possible draw objects before dtm, because currently we're sepaerating dtm draw calls from drawObj draw calls
+
 	setActiveDTMSettings(dtmSettingsInfo);
 	beginMainObject(MainObjectType::DTM);
 
+	DrawCallData drawCallData = {}; 
+	drawCallData.isDTMRendering = true;
+
 	uint32_t mainObjectIdx = acquireActiveMainObjectIndex_SubmitIfNeeded(intendedNextSubmit);
-	drawData.pushConstants.triangleMeshMainObjectIndex = mainObjectIdx;
+	drawCallData.dtm.triangleMeshMainObjectIndex = mainObjectIdx;
 
 	ICPUBuffer::SCreationParams geometryBuffParams;
 	
@@ -162,18 +167,19 @@ void DrawResourcesFiller::drawTriangleMesh(const CTriangleMesh& mesh, CTriangleM
 		size_t geometryBufferOffset = resourcesCollection.geometryInfo.increaseSizeAndGetOffset(dataToAddByteSize, alignof(CTriangleMesh::vertex_t));
 		void* dst = resourcesCollection.geometryInfo.data() + geometryBufferOffset;
 		// the actual bda address will be determined only after all copies are finalized, later we will do += `baseBDAAddress + geometryInfo.bufferOffset`
-		drawData.pushConstants.triangleMeshVerticesBaseAddress = geometryBufferOffset;
+		drawCallData.dtm.triangleMeshVerticesBaseAddress = geometryBufferOffset;
 		memcpy(dst, mesh.getVertices().data(), vtxBuffByteSize);
 		geometryBufferOffset += vtxBuffByteSize; 
 
 		// Copy IndexBuffer
 		dst = resourcesCollection.geometryInfo.data() + geometryBufferOffset;
-		drawData.indexBufferOffset = geometryBufferOffset;
+		drawCallData.dtm.indexBufferOffset = geometryBufferOffset;
 		memcpy(dst, mesh.getIndices().data(), indexBuffByteSize);
 		geometryBufferOffset += indexBuffByteSize;
 	}
 
-	drawData.indexCount = mesh.getIndexCount();
+	drawCallData.dtm.indexCount = mesh.getIndexCount();
+	drawCalls.push_back(drawCallData);
 	endMainObject();
 }
 
@@ -334,6 +340,7 @@ void DrawResourcesFiller::_test_addImageObject(float64_t2 topLeftPos, float32_t2
 bool DrawResourcesFiller::finalizeAllCopiesToGPU(SIntendedSubmitInfo& intendedNextSubmit)
 {
 	bool success = true;
+	flushDrawObjects();
 	success &= finalizeBufferCopies(intendedNextSubmit);
 	success &= finalizeTextureCopies(intendedNextSubmit);
 	return success;
diff --git a/62_CAD/DrawResourcesFiller.h b/62_CAD/DrawResourcesFiller.h
index 03482320e..846046a43 100644
--- a/62_CAD/DrawResourcesFiller.h
+++ b/62_CAD/DrawResourcesFiller.h
@@ -155,7 +155,7 @@ struct DrawResourcesFiller
 	/// WARNING: make sure this function  is called within begin/endMainObject scope
 	void drawPolyline(const CPolylineBase& polyline, SIntendedSubmitInfo& intendedNextSubmit);
 	
-	void drawTriangleMesh(const CTriangleMesh& mesh, CTriangleMesh::DrawData& drawData, const DTMSettingsInfo& dtmSettings, SIntendedSubmitInfo& intendedNextSubmit);
+	void drawTriangleMesh(const CTriangleMesh& mesh, const DTMSettingsInfo& dtmSettings, SIntendedSubmitInfo& intendedNextSubmit);
 
 	// ! Convinience function for Hatch with MSDF Pattern and a solid background
 	void drawHatch(
@@ -207,6 +207,9 @@ struct DrawResourcesFiller
 		resetCustomClipProjections();
 		resetLineStyles();
 		resetDTMSettings();
+
+		drawObjectsFlushedToDrawCalls = 0ull;
+		drawCalls.clear();
 	}
 
 	/// @brief collection of all the resources that will eventually be reserved or copied to in the resourcesGPUBuffer, will be accessed via individual BDA pointers in shaders
@@ -243,6 +246,45 @@ struct DrawResourcesFiller
 	/// For advanced use only, (passed to shaders for them to know if we overflow-submitted in the middle if a main obj
 	uint32_t getActiveMainObjectIndex() const { return activeMainObjectIndex; }
 
+	// TODO: Remove these later, these are for multiple draw calls instead of a single one.
+	struct DrawCallData
+	{
+		union
+		{
+			struct Dtm
+			{
+				uint64_t indexBufferOffset;
+				uint64_t indexCount;
+				uint64_t triangleMeshVerticesBaseAddress;
+				uint32_t triangleMeshMainObjectIndex;
+			} dtm;
+			struct DrawObj
+			{
+				uint64_t drawObjectStart = 0ull;
+				uint64_t drawObjectCount = 0ull;
+			} drawObj;
+		};
+		bool isDTMRendering;
+	};
+
+	uint64_t drawObjectsFlushedToDrawCalls = 0ull;
+
+	void flushDrawObjects()
+	{
+		if (resourcesCollection.drawObjects.getCount() > drawObjectsFlushedToDrawCalls)
+		{
+			DrawCallData drawCall = {};
+			drawCall.isDTMRendering = false;
+			drawCall.drawObj.drawObjectStart = drawObjectsFlushedToDrawCalls;
+			drawCall.drawObj.drawObjectCount = resourcesCollection.drawObjects.getCount() - drawObjectsFlushedToDrawCalls;
+			drawCalls.push_back(drawCall);
+			drawObjectsFlushedToDrawCalls = resourcesCollection.drawObjects.getCount();
+		}
+	}
+
+	std::vector<DrawCallData> drawCalls; // either dtms or objects
+
+
 protected:
 	
 	struct MSDFTextureCopy
diff --git a/62_CAD/main.cpp b/62_CAD/main.cpp
index b49dc56d2..e425dce54 100644
--- a/62_CAD/main.cpp
+++ b/62_CAD/main.cpp
@@ -802,7 +802,7 @@ class ComputerAidedDesign final : public examples::SimpleWindowedApplication, pu
 			}
 
 			const asset::SPushConstantRange range = {
-						.stageFlags = IShader::E_SHADER_STAGE::ESS_VERTEX,
+						.stageFlags = IShader::E_SHADER_STAGE::ESS_VERTEX | IShader::E_SHADER_STAGE::ESS_FRAGMENT,
 						.offset = 0,
 						.size = sizeof(PushConstants)
 			};
@@ -1185,6 +1185,13 @@ class ComputerAidedDesign final : public examples::SimpleWindowedApplication, pu
 	
 	void submitDraws(SIntendedSubmitInfo& intendedSubmitInfo, bool inBetweenSubmit)
 	{
+		// TODO: Remove this check later
+		if (inBetweenSubmit)
+		{
+			m_logger->log("Temporarily Disabled. Auto-Submission shouldn't happen (for Demo)", ILogger::ELL_ERROR);
+			assert(!inBetweenSubmit);
+		}
+
 		// Use the current recording command buffer of the intendedSubmitInfos scratchCommandBuffers, it should be in recording state
 		auto* cb = m_currentRecordingCommandBufferInfo->cmdbuf;
 		
@@ -1295,36 +1302,42 @@ class ComputerAidedDesign final : public examples::SimpleWindowedApplication, pu
 		}
 		cb->beginRenderPass(beginInfo, IGPUCommandBuffer::SUBPASS_CONTENTS::INLINE);
 		
-		const uint32_t currentIndexCount = resources.drawObjects.getCount() * 6u;
-
 		IGPUDescriptorSet* descriptorSets[] = { descriptorSet0.get(), descriptorSet1.get() };
 		cb->bindDescriptorSets(asset::EPBP_GRAPHICS, pipelineLayout.get(), 0u, 2u, descriptorSets);
+		
+		cb->bindGraphicsPipeline(graphicsPipeline.get());
 
-		if (mode == ExampleMode::CASE_9)
+		for (auto& drawCall : drawResourcesFiller.drawCalls)
 		{
+			if (drawCall.isDTMRendering)
+			{
+				cb->bindIndexBuffer({ .offset = resources.geometryInfo.bufferOffset + drawCall.dtm.indexBufferOffset, .buffer = drawResourcesFiller.getResourcesGPUBuffer().get()}, asset::EIT_32BIT);
 
-			// TODO[Przemek]: based on our call bind index buffer you uploaded to part of the `drawResourcesFiller.gpuDrawBuffers.geometryBuffer`
-			// Vertices will be pulled based on baseBDAPointer of where you uploaded the vertex + the VertexID in the vertex shader.
-			cb->bindIndexBuffer({ .offset = resources.geometryInfo.bufferOffset + m_triangleMeshDrawData.indexBufferOffset, .buffer = drawResourcesFiller.getResourcesGPUBuffer().get()}, asset::EIT_32BIT);
+				PushConstants pc = {
+					.triangleMeshVerticesBaseAddress = drawCall.dtm.triangleMeshVerticesBaseAddress + resourcesGPUBuffer->getDeviceAddress() + resources.geometryInfo.bufferOffset,
+					.triangleMeshMainObjectIndex = drawCall.dtm.triangleMeshMainObjectIndex,
+					.isDTMRendering = true
+				};
+				cb->pushConstants(graphicsPipeline->getLayout(), IGPUShader::E_SHADER_STAGE::ESS_VERTEX | IShader::E_SHADER_STAGE::ESS_FRAGMENT, 0, sizeof(PushConstants), &pc);
 
-			// TODO[Przemek]: binding the same pipelie, no need to change.
-			cb->bindGraphicsPipeline(graphicsPipeline.get());
+				cb->drawIndexed(drawCall.dtm.indexCount, 1u, 0u, 0u, 0u);
+			}
+			else
+			{
+				PushConstants pc = {
+					.isDTMRendering = false
+				};
+				cb->pushConstants(graphicsPipeline->getLayout(), IGPUShader::E_SHADER_STAGE::ESS_VERTEX | IShader::E_SHADER_STAGE::ESS_FRAGMENT, 0, sizeof(PushConstants), &pc);
 
-			// TODO[Przemek]: contour settings, height shading settings, base bda pointers will need to be pushed via pushConstants before the draw currently as it's the easiest thing to do.
-			m_triangleMeshDrawData.pushConstants.triangleMeshVerticesBaseAddress += resourcesGPUBuffer->getDeviceAddress() + resources.geometryInfo.bufferOffset;
-			cb->pushConstants(graphicsPipeline->getLayout(), IGPUShader::E_SHADER_STAGE::ESS_VERTEX, 0, sizeof(PushConstants), &m_triangleMeshDrawData.pushConstants);
+				const uint64_t indexOffset = drawCall.drawObj.drawObjectStart * 6u;
+				const uint64_t indexCount = drawCall.drawObj.drawObjectCount * 6u;
 
-			// TODO[Przemek]: draw parameters needs to reflect the mesh involved
-			cb->drawIndexed(m_triangleMeshDrawData.indexCount, 1u, 0u, 0u, 0u);
-		}
-		else
-		{
-			assert(currentIndexCount == resources.indexBuffer.getCount());
-			cb->bindIndexBuffer({ .offset = resources.indexBuffer.bufferOffset, .buffer = resourcesGPUBuffer.get() }, asset::EIT_32BIT);
-			cb->bindGraphicsPipeline(graphicsPipeline.get());
-			cb->drawIndexed(currentIndexCount, 1u, 0u, 0u, 0u);
+				// assert(currentIndexCount == resources.indexBuffer.getCount());
+				cb->bindIndexBuffer({ .offset = resources.indexBuffer.bufferOffset + indexOffset * sizeof(uint32_t), .buffer = resourcesGPUBuffer.get()}, asset::EIT_32BIT);
+				cb->drawIndexed(indexCount, 1u, 0u, 0u, 0u);
+			}
 		}
-		
+
 		if (fragmentShaderInterlockEnabled)
 		{
 			cb->bindGraphicsPipeline(resolveAlphaGraphicsPipeline.get());
@@ -1333,10 +1346,11 @@ class ComputerAidedDesign final : public examples::SimpleWindowedApplication, pu
 
 		if constexpr (DebugModeWireframe)
 		{
+			const uint32_t indexCount = resources.drawObjects.getCount() * 6u;
 			cb->bindGraphicsPipeline(debugGraphicsPipeline.get());
-			cb->drawIndexed(currentIndexCount, 1u, 0u, 0u, 0u);
+			cb->drawIndexed(indexCount, 1u, 0u, 0u, 0u);
 		}
-		
+
 		cb->endRenderPass();
 
 		if (!inBetweenSubmit)
@@ -3191,11 +3205,11 @@ class ComputerAidedDesign final : public examples::SimpleWindowedApplication, pu
 			// PYRAMID
 
 			core::vector<TriangleMeshVertex> vertices = {
-				{ float32_t2(0.0, 0.0), 100.0 },
-				{ float32_t2(-200.0, -200.0), 10.0 },
-				{ float32_t2(200.0, -200.0), 10.0 },
-				{ float32_t2(200.0, 200.0), -20.0 },
-				{ float32_t2(-200.0, 200.0), 10.0 },
+				{ float64_t2(0.0, 0.0), 100.0 },
+				{ float64_t2(-200.0, -200.0), 10.0 },
+				{ float64_t2(200.0, -200.0), 10.0 },
+				{ float64_t2(200.0, 200.0), -20.0 },
+				{ float64_t2(-200.0, 200.0), 10.0 },
 			};
 
 			core::vector<uint32_t> indices = {
@@ -3277,7 +3291,17 @@ class ComputerAidedDesign final : public examples::SimpleWindowedApplication, pu
 				}
 			}
 
-			drawResourcesFiller.drawTriangleMesh(mesh, m_triangleMeshDrawData, dtmSettingsInfo, intendedNextSubmit);
+			drawResourcesFiller.drawTriangleMesh(mesh, dtmSettingsInfo, intendedNextSubmit);
+
+			dtmSettingsInfo.contourLineStyleInfo.color = float32_t4(1.0f, 0.39f, 0.0f, 1.0f);
+			dtmSettingsInfo.outlineLineStyleInfo.color = float32_t4(0.0f, 0.39f, 1.0f, 1.0f);
+			for (auto& v : mesh.m_vertices)
+			{
+				v.pos += float64_t2(400.0, 200.0);
+				v.height -= 10.0;
+			}
+
+			drawResourcesFiller.drawTriangleMesh(mesh, dtmSettingsInfo, intendedNextSubmit);
 		}
 
 		drawResourcesFiller.finalizeAllCopiesToGPU(intendedNextSubmit);
@@ -3360,8 +3384,6 @@ class ComputerAidedDesign final : public examples::SimpleWindowedApplication, pu
 	#endif
 	
 	std::unique_ptr<GeoTextureRenderer> m_geoTextureRenderer;
-
-	CTriangleMesh::DrawData m_triangleMeshDrawData;
 };
 
 NBL_MAIN_FUNC(ComputerAidedDesign)
diff --git a/62_CAD/shaders/globals.hlsl b/62_CAD/shaders/globals.hlsl
index 319c30b3d..24a833334 100644
--- a/62_CAD/shaders/globals.hlsl
+++ b/62_CAD/shaders/globals.hlsl
@@ -35,6 +35,7 @@ struct PushConstants
 {
     uint64_t triangleMeshVerticesBaseAddress;
     uint32_t triangleMeshMainObjectIndex;
+    uint32_t isDTMRendering;
 };
 
 // TODO: Compute this in a compute shader from the world counterparts
diff --git a/62_CAD/shaders/main_pipeline/common.hlsl b/62_CAD/shaders/main_pipeline/common.hlsl
index 0cf4e3bce..4327cf7fe 100644
--- a/62_CAD/shaders/main_pipeline/common.hlsl
+++ b/62_CAD/shaders/main_pipeline/common.hlsl
@@ -236,6 +236,8 @@ struct PSInput
 
 // [[vk::binding(0, 0)]] ConstantBuffer<Globals> globals; ---> moved to globals.hlsl
 
+[[vk::push_constant]] PushConstants pc;
+
 [[vk::combinedImageSampler]][[vk::binding(1, 0)]] Texture2DArray<float3> msdfTextures : register(t4);
 [[vk::combinedImageSampler]][[vk::binding(1, 0)]] SamplerState msdfSampler : register(s4);
 
diff --git a/62_CAD/shaders/main_pipeline/fragment_shader.hlsl b/62_CAD/shaders/main_pipeline/fragment_shader.hlsl
index 5d5d464cc..ab6388bc8 100644
--- a/62_CAD/shaders/main_pipeline/fragment_shader.hlsl
+++ b/62_CAD/shaders/main_pipeline/fragment_shader.hlsl
@@ -1,969 +1,972 @@
-#define FRAGMENT_SHADER_INPUT
-#include "common.hlsl"
-#include <nbl/builtin/hlsl/shapes/beziers.hlsl>
-#include <nbl/builtin/hlsl/shapes/line.hlsl>
-#include <nbl/builtin/hlsl/algorithm.hlsl>
-#include <nbl/builtin/hlsl/math/equations/quadratic.hlsl>
-#include <nbl/builtin/hlsl/math/geometry.hlsl>
-#include <nbl/builtin/hlsl/spirv_intrinsics/fragment_shader_pixel_interlock.hlsl>
-#include <nbl/builtin/hlsl/jit/device_capabilities.hlsl>
-#include <nbl/builtin/hlsl/text_rendering/msdf.hlsl>
-#include <nbl/builtin/hlsl/spirv_intrinsics/fragment_shader_barycentric.hlsl>
-
-template<typename float_t>
-struct DefaultClipper
-{
-    using float_t2 = vector<float_t, 2>;
-    NBL_CONSTEXPR_STATIC_INLINE float_t AccuracyThresholdT = 0.0;
-
-    static DefaultClipper construct()
-    {
-        DefaultClipper ret;
-        return ret;
-    }
-
-    inline float_t2 operator()(const float_t t)
-    {
-        const float_t ret = clamp(t, 0.0, 1.0);
-        return float_t2(ret, ret);
-    }
-};
-
-// for usage in upper_bound function
-struct StyleAccessor
-{
-    LineStyle style;
-    using value_type = float;
-
-    float operator[](const uint32_t ix)
-    {
-        return style.getStippleValue(ix);
-    }
-};
-
-template<typename CurveType>
-struct StyleClipper
-{
-    using float_t = typename CurveType::scalar_t;
-    using float_t2 = typename CurveType::float_t2;
-    using float_t3 = typename CurveType::float_t3;
-    NBL_CONSTEXPR_STATIC_INLINE float_t AccuracyThresholdT = 0.000001;
-
-    static StyleClipper<CurveType> construct(
-        LineStyle style,
-        CurveType curve,
-        typename CurveType::ArcLengthCalculator arcLenCalc,
-        float phaseShift,
-        float stretch,
-        float worldToScreenRatio)
-    {
-        StyleClipper<CurveType> ret = { style, curve, arcLenCalc, phaseShift, stretch, worldToScreenRatio, 0.0f, 0.0f, 0.0f, 0.0f };
-
-        // values for non-uniform stretching with a rigid segment
-        if (style.rigidSegmentIdx != InvalidRigidSegmentIndex && stretch != 1.0f)
-        {
-            // rigidSegment info in old non stretched pattern
-            ret.rigidSegmentStart = (style.rigidSegmentIdx >= 1u) ? style.getStippleValue(style.rigidSegmentIdx - 1u) : 0.0f;
-            ret.rigidSegmentEnd = (style.rigidSegmentIdx < style.stipplePatternSize) ? style.getStippleValue(style.rigidSegmentIdx) : 1.0f;
-            ret.rigidSegmentLen = ret.rigidSegmentEnd - ret.rigidSegmentStart;
-            // stretch value for non rigid segments
-            ret.nonRigidSegmentStretchValue = (stretch - ret.rigidSegmentLen) / (1.0f - ret.rigidSegmentLen);
-            // rigidSegment info to new stretched pattern
-            ret.rigidSegmentStart *= ret.nonRigidSegmentStretchValue / stretch; // get the new normalized rigid segment start
-            ret.rigidSegmentLen /= stretch; // get the new rigid segment normalized len
-            ret.rigidSegmentEnd = ret.rigidSegmentStart + ret.rigidSegmentLen; // get the new normalized rigid segment end 
-        }
-        else
-        {
-            ret.nonRigidSegmentStretchValue = stretch;
-        }
-        
-        return ret;
-    }
-
-    // For non-uniform stretching with a rigid segment (the one segement that shouldn't stretch) the whole pattern changes
-    // instead of transforming each of the style.stipplePattern values (max 14 of them), we transform the normalized place in pattern
-    float getRealNormalizedPlaceInPattern(float normalizedPlaceInPattern)
-    {
-        if (style.rigidSegmentIdx != InvalidRigidSegmentIndex && stretch != 1.0f)
-        {
-            float ret = min(normalizedPlaceInPattern, rigidSegmentStart) / nonRigidSegmentStretchValue; // unstretch parts before rigid segment
-            ret += max(normalizedPlaceInPattern - rigidSegmentEnd, 0.0f) / nonRigidSegmentStretchValue; // unstretch parts after rigid segment
-            ret += max(min(rigidSegmentLen, normalizedPlaceInPattern - rigidSegmentStart), 0.0f); // unstretch parts inside rigid segment
-            ret *= stretch;
-            return ret;
-        }
-        else
-        {
-            return normalizedPlaceInPattern;
-        }
-    }
-
-    float_t2 operator()(float_t t)
-    {
-        // basicaly 0.0 and 1.0 but with a guardband to discard outside the range
-        const float_t minT = 0.0 - 1.0;
-        const float_t maxT = 1.0 + 1.0;
-
-        StyleAccessor styleAccessor = { style };
-        const float_t reciprocalStretchedStipplePatternLen = style.reciprocalStipplePatternLen / stretch;
-        const float_t patternLenInScreenSpace = 1.0 / (worldToScreenRatio * style.reciprocalStipplePatternLen);
-
-        const float_t arcLen = arcLenCalc.calcArcLen(t);
-        const float_t worldSpaceArcLen = arcLen * float_t(worldToScreenRatio);
-        float_t normalizedPlaceInPattern = frac(worldSpaceArcLen * reciprocalStretchedStipplePatternLen + phaseShift);
-        normalizedPlaceInPattern = getRealNormalizedPlaceInPattern(normalizedPlaceInPattern);
-        uint32_t patternIdx = nbl::hlsl::upper_bound(styleAccessor, 0, style.stipplePatternSize, normalizedPlaceInPattern);
-
-        const float_t InvalidT = nbl::hlsl::numeric_limits<float32_t>::infinity; 
-        float_t2 ret = float_t2(InvalidT, InvalidT);
-
-        // odd patternIdx means a "no draw section" and current candidate should split into two nearest draw sections
-        const bool notInDrawSection = patternIdx & 0x1;
-        
-        // TODO[Erfan]: Disable this piece of code after clipping, and comment the reason, that the bezier start and end at 0.0 and 1.0 should be in drawable sections
-        float_t minDrawT = 0.0;
-        float_t maxDrawT = 1.0;
-        {
-            float_t normalizedPlaceInPatternBegin = frac(phaseShift);
-            normalizedPlaceInPatternBegin = getRealNormalizedPlaceInPattern(normalizedPlaceInPatternBegin);
-            uint32_t patternIdxBegin = nbl::hlsl::upper_bound(styleAccessor, 0, style.stipplePatternSize, normalizedPlaceInPatternBegin);
-            const bool BeginInNonDrawSection = patternIdxBegin & 0x1;
-
-            if (BeginInNonDrawSection)
-            {
-                float_t diffToRightDrawableSection = (patternIdxBegin == style.stipplePatternSize) ? 1.0 : styleAccessor[patternIdxBegin];
-                diffToRightDrawableSection -= normalizedPlaceInPatternBegin;
-                float_t scrSpcOffsetToArcLen1 = diffToRightDrawableSection * patternLenInScreenSpace * ((patternIdxBegin != style.rigidSegmentIdx) ? nonRigidSegmentStretchValue : 1.0);
-                const float_t arcLenForT1 = 0.0 + scrSpcOffsetToArcLen1;
-                minDrawT = arcLenCalc.calcArcLenInverse(curve, minT, maxT, arcLenForT1, AccuracyThresholdT, 0.0);
-            }
-            
-            // Completely in non-draw section -> clip away:
-            if (minDrawT >= 1.0)
-                return ret;
-
-            const float_t arcLenEnd = arcLenCalc.calcArcLen(1.0);
-            const float_t worldSpaceArcLenEnd = arcLenEnd * float_t(worldToScreenRatio);
-            float_t normalizedPlaceInPatternEnd = frac(worldSpaceArcLenEnd * reciprocalStretchedStipplePatternLen + phaseShift);
-            normalizedPlaceInPatternEnd = getRealNormalizedPlaceInPattern(normalizedPlaceInPatternEnd);
-            uint32_t patternIdxEnd = nbl::hlsl::upper_bound(styleAccessor, 0, style.stipplePatternSize, normalizedPlaceInPatternEnd);
-            const bool EndInNonDrawSection = patternIdxEnd & 0x1;
-
-            if (EndInNonDrawSection)
-            {
-                float_t diffToLeftDrawableSection = (patternIdxEnd == 0) ? 0.0 : styleAccessor[patternIdxEnd - 1];
-                diffToLeftDrawableSection -= normalizedPlaceInPatternEnd;
-                float_t scrSpcOffsetToArcLen0 = diffToLeftDrawableSection * patternLenInScreenSpace * ((patternIdxEnd != style.rigidSegmentIdx) ? nonRigidSegmentStretchValue : 1.0);
-                const float_t arcLenForT0 = arcLenEnd + scrSpcOffsetToArcLen0;
-                maxDrawT = arcLenCalc.calcArcLenInverse(curve, minT, maxT, arcLenForT0, AccuracyThresholdT, 1.0);
-            }
-        }
-
-        if (notInDrawSection)
-        {
-            float toScreenSpaceLen = patternLenInScreenSpace * ((patternIdx != style.rigidSegmentIdx) ? nonRigidSegmentStretchValue : 1.0);
-
-            float_t diffToLeftDrawableSection = (patternIdx == 0) ? 0.0 : styleAccessor[patternIdx - 1];
-            diffToLeftDrawableSection -= normalizedPlaceInPattern;
-            float_t scrSpcOffsetToArcLen0 = diffToLeftDrawableSection * toScreenSpaceLen;
-            const float_t arcLenForT0 = arcLen + scrSpcOffsetToArcLen0;
-            float_t t0 = arcLenCalc.calcArcLenInverse(curve, minT, maxT, arcLenForT0, AccuracyThresholdT, t);
-            t0 = clamp(t0, minDrawT, maxDrawT);
-
-            float_t diffToRightDrawableSection = (patternIdx == style.stipplePatternSize) ? 1.0 : styleAccessor[patternIdx];
-            diffToRightDrawableSection -= normalizedPlaceInPattern;
-            float_t scrSpcOffsetToArcLen1 = diffToRightDrawableSection * toScreenSpaceLen;
-            const float_t arcLenForT1 = arcLen + scrSpcOffsetToArcLen1;
-            float_t t1 = arcLenCalc.calcArcLenInverse(curve, minT, maxT, arcLenForT1, AccuracyThresholdT, t);
-            t1 = clamp(t1, minDrawT, maxDrawT);
-
-            ret = float_t2(t0, t1);
-        }
-        else
-        {
-            t = clamp(t, minDrawT, maxDrawT);
-            ret = float_t2(t, t);
-        }
-
-        return ret;
-    }
-
-    LineStyle style;
-    CurveType curve;
-    typename CurveType::ArcLengthCalculator arcLenCalc;
-    float phaseShift;
-    float stretch;
-    float worldToScreenRatio;
-    // precomp value for non uniform stretching
-    float rigidSegmentStart;
-    float rigidSegmentEnd;
-    float rigidSegmentLen;
-    float nonRigidSegmentStretchValue;
-};
-
-template<typename CurveType, typename Clipper = DefaultClipper<typename CurveType::scalar_t> >
-struct ClippedSignedDistance
-{
-    using float_t = typename CurveType::scalar_t;
-    using float_t2 = typename CurveType::float_t2;
-    using float_t3 = typename CurveType::float_t3;
-
-    const static float_t sdf(CurveType curve, float_t2 pos, float_t thickness, bool isRoadStyle, Clipper clipper = DefaultClipper<typename CurveType::scalar_t>::construct())
-    {
-        typename CurveType::Candidates candidates = curve.getClosestCandidates(pos);
-
-        const float_t InvalidT = nbl::hlsl::numeric_limits<float32_t>::max;
-        // TODO: Fix and test, we're not working with squared distance anymore
-        const float_t MAX_DISTANCE_SQUARED = (thickness + 1.0f) * (thickness + 1.0f); // TODO: ' + 1' is too much?
-
-        bool clipped = false;
-        float_t closestDistanceSquared = MAX_DISTANCE_SQUARED;
-        float_t closestT = InvalidT;
-        [[unroll(CurveType::MaxCandidates)]]
-        for (uint32_t i = 0; i < CurveType::MaxCandidates; i++)
-        {
-            const float_t candidateDistanceSquared = length(curve.evaluate(candidates[i]) - pos);
-            if (candidateDistanceSquared < closestDistanceSquared)
-            {
-                float_t2 snappedTs = clipper(candidates[i]);
-
-                if (snappedTs[0] == InvalidT)
-                {
-                    continue;
-                }
-
-                if (snappedTs[0] != candidates[i])
-                {
-                    // left snapped or clamped
-                    const float_t leftSnappedCandidateDistanceSquared = length(curve.evaluate(snappedTs[0]) - pos);
-                    if (leftSnappedCandidateDistanceSquared < closestDistanceSquared)
-                    {
-                        clipped = true;
-                        closestT = snappedTs[0];
-                        closestDistanceSquared = leftSnappedCandidateDistanceSquared;
-                    }
-
-                    if (snappedTs[0] != snappedTs[1])
-                    {
-                        // right snapped or clamped
-                        const float_t rightSnappedCandidateDistanceSquared = length(curve.evaluate(snappedTs[1]) - pos);
-                        if (rightSnappedCandidateDistanceSquared < closestDistanceSquared)
-                        {
-                            clipped = true;
-                            closestT = snappedTs[1];
-                            closestDistanceSquared = rightSnappedCandidateDistanceSquared;
-                        }
-                    }
-                }
-                else
-                {
-                    // no snapping
-                    if (candidateDistanceSquared < closestDistanceSquared)
-                    {
-                        clipped = false;
-                        closestT = candidates[i];
-                        closestDistanceSquared = candidateDistanceSquared;
-                    }
-                }
-            }
-        }
-
-
-        float_t roundedDistance = closestDistanceSquared - thickness;
-        if(!isRoadStyle)
-        {
-            return roundedDistance;
-        }
-        else
-        {
-            const float_t aaWidth = globals.antiAliasingFactor;
-            float_t rectCappedDistance = roundedDistance;
-
-            if (clipped)
-            {
-                float_t2 q = mul(curve.getLocalCoordinateSpace(closestT), pos - curve.evaluate(closestT));
-                rectCappedDistance = capSquare(q, thickness, aaWidth);
-            }
-
-            return rectCappedDistance;
-        }
-    }
-
-    static float capSquare(float_t2 q, float_t th, float_t aaWidth)
-    {
-        float_t2 d = abs(q) - float_t2(aaWidth, th);
-        return length(max(d, 0.0)) + min(max(d.x, d.y), 0.0);
-    }
-};
-
-// sdf of Isosceles Trapezoid y-aligned by https://iquilezles.org/articles/distfunctions2d/
-float sdTrapezoid(float2 p, float r1, float r2, float he)
-{
-    float2 k1 = float2(r2, he);
-    float2 k2 = float2(r2 - r1, 2.0 * he);
-
-    p.x = abs(p.x);
-    float2 ca = float2(max(0.0, p.x - ((p.y < 0.0) ? r1 : r2)), abs(p.y) - he);
-    float2 cb = p - k1 + k2 * clamp(dot(k1 - p, k2) / dot(k2,k2), 0.0, 1.0);
-
-    float s = (cb.x < 0.0 && ca.y < 0.0) ? -1.0 : 1.0;
-
-    return s * sqrt(min(dot(ca,ca), dot(cb,cb)));
-}
-
-// line segment sdf which returns the distance vector specialized for usage in hatch box line boundaries
-float2 sdLineDstVec(float2 P, float2 A, float2 B)
-{
-    const float2 PA = P - A;
-    const float2 BA = B - A;
-    float h = clamp(dot(PA, BA) / dot(BA, BA), 0.0, 1.0);
-    return PA - BA * h;
-}
-
-float miterSDF(float2 p, float thickness, float2 a, float2 b, float ra, float rb)
-{
-    float h = length(b - a) / 2.0;
-    float2 d = normalize(b - a);
-    float2x2 rot = float2x2(d.y, -d.x, d.x, d.y);
-    p = mul(rot, p);
-    p.y -= h - thickness;
-    return sdTrapezoid(p, ra, rb, h);
-}
-
-typedef StyleClipper< nbl::hlsl::shapes::Quadratic<float> > BezierStyleClipper;
-typedef StyleClipper< nbl::hlsl::shapes::Line<float> > LineStyleClipper;
-
-// for usage in upper_bound function
-struct DTMSettingsHeightsAccessor
-{
-    DTMSettings dtmSettings;
-    using value_type = float;
-
-    float operator[](const uint32_t ix)
-    {
-        return dtmSettings.heightColorMapHeights[ix];
-    }
-};
-
-// We need to specialize color calculation based on FragmentShaderInterlock feature availability for our transparency algorithm
-// because there is no `if constexpr` in hlsl
-// @params
-// textureColor: color sampled from a texture
-// useStyleColor: instead of writing and reading from colorStorage, use main object Idx to find the style color for the object.
-template<bool FragmentShaderPixelInterlock>
-float32_t4 calculateFinalColor(const uint2 fragCoord, const float localAlpha, const uint32_t currentMainObjectIdx, float3 textureColor, bool colorFromTexture);
-
-template<>
-float32_t4 calculateFinalColor<false>(const uint2 fragCoord, const float localAlpha, const uint32_t currentMainObjectIdx, float3 localTextureColor, bool colorFromTexture)
-{
-    uint32_t styleIdx = loadMainObject(currentMainObjectIdx).styleIdx;
-    if (!colorFromTexture)
-    {
-        float32_t4 col = loadLineStyle(styleIdx).color;
-        col.w *= localAlpha;
-        return float4(col);
-    }
-    else
-        return float4(localTextureColor, localAlpha);
-}
-template<>
-float32_t4 calculateFinalColor<true>(const uint2 fragCoord, const float localAlpha, const uint32_t currentMainObjectIdx, float3 localTextureColor, bool colorFromTexture)
-{
-    float32_t4 color;
-    nbl::hlsl::spirv::beginInvocationInterlockEXT();
-
-    const uint32_t packedData = pseudoStencil[fragCoord];
-
-    const uint32_t localQuantizedAlpha = (uint32_t)(localAlpha * 255.f);
-    const uint32_t storedQuantizedAlpha = nbl::hlsl::glsl::bitfieldExtract<uint32_t>(packedData,0,AlphaBits);
-    const uint32_t storedMainObjectIdx = nbl::hlsl::glsl::bitfieldExtract<uint32_t>(packedData,AlphaBits,MainObjectIdxBits);
-    // if geomID has changed, we resolve the SDF alpha (draw using blend), else accumulate
-    const bool differentMainObject = currentMainObjectIdx != storedMainObjectIdx; // meaning current pixel's main object is different than what is already stored
-    const bool resolve = differentMainObject && storedMainObjectIdx != InvalidMainObjectIdx;
-    uint32_t toResolveStyleIdx = InvalidStyleIdx;
-    
-    // load from colorStorage only if we want to resolve color from texture instead of style
-    // sampling from colorStorage needs to happen in critical section because another fragment may also want to store into it at the same time + need to happen before store
-    if (resolve)
-    {
-        toResolveStyleIdx = loadMainObject(storedMainObjectIdx).styleIdx;
-        if (toResolveStyleIdx == InvalidStyleIdx) // if style idx to resolve is invalid, then it means we should resolve from color
-            color = float32_t4(unpackR11G11B10_UNORM(colorStorage[fragCoord]), 1.0f);
-    }
-    
-    // If current localAlpha is higher than what is already stored in pseudoStencil we will update the value in pseudoStencil or the color in colorStorage, this is equivalent to programmable blending MAX operation.
-    // OR If previous pixel has a different ID than current's  (i.e. previous either empty/invalid or a differnet mainObject), we should update our alpha and color storages.
-    if (differentMainObject || localQuantizedAlpha > storedQuantizedAlpha)
-    {
-        pseudoStencil[fragCoord] = nbl::hlsl::glsl::bitfieldInsert<uint32_t>(localQuantizedAlpha,currentMainObjectIdx,AlphaBits,MainObjectIdxBits);
-        if (colorFromTexture) // writing color from texture
-            colorStorage[fragCoord] = packR11G11B10_UNORM(localTextureColor);
-    }
-    
-    nbl::hlsl::spirv::endInvocationInterlockEXT();
-
-    if (!resolve)
-        discard;
-
-    // draw with previous geometry's style's color or stored in texture buffer :kek:
-    // we don't need to load the style's color in critical section because we've already retrieved the style index from the stored main obj
-    if (toResolveStyleIdx != InvalidStyleIdx) // if toResolveStyleIdx is valid then that means our resolved color should come from line style
-        color = loadLineStyle(toResolveStyleIdx).color;
-    color.a *= float(storedQuantizedAlpha) / 255.f;
-    
-    return color;
-}
-
-float dot2(in float2 vec)
-{
-    return dot(vec, vec);
-}
-
-[[vk::spvexecutionmode(spv::ExecutionModePixelInterlockOrderedEXT)]]
-[shader("pixel")]
-float4 fragMain(PSInput input) : SV_TARGET
-{
-    float localAlpha = 0.0f;
-    float3 textureColor = float3(0, 0, 0); // color sampled from a texture
-
-    ObjectType objType = input.getObjType();
-    const uint32_t currentMainObjectIdx = input.getMainObjectIdx();
-    const MainObject mainObj = loadMainObject(currentMainObjectIdx);
-    
-#define DTM
-#ifdef DTM
-    // TRIANGLE RENDERING
-    {
-        const float outlineThickness = input.getOutlineThickness();
-        const float contourThickness = input.getContourLineThickness();
-        const float phaseShift = 0.0f; // input.getCurrentPhaseShift();
-        const float stretch = 1.0f; // TODO: figure out what is it for ---> [ERFAN's REPLY: no need to give shit about this in dtms, it's for special shape styles] 
-        const float worldToScreenRatio = input.getCurrentWorldToScreenRatio();
-
-        DTMSettings dtm = loadDTMSettings(mainObj.dtmSettingsIdx);
-        LineStyle outlineStyle = loadLineStyle(dtm.outlineLineStyleIdx);
-        LineStyle contourStyle = loadLineStyle(dtm.contourLineStyleIdx);
-
-        float3 v[3];
-        v[0] = input.getScreenSpaceVertexAttribs(0);
-        v[1] = input.getScreenSpaceVertexAttribs(1);
-        v[2] = input.getScreenSpaceVertexAttribs(2);
-
-        const float3 baryCoord = nbl::hlsl::spirv::BaryCoordKHR;
-
-        // indices of points constructing every edge
-        uint2 edgePoints[3];
-        edgePoints[0] = uint2(0, 1);
-        edgePoints[1] = uint2(1, 2);
-        edgePoints[2] = uint2(2, 0);
-
-        // index of vertex opposing an edge, needed for calculation of triangle heights
-        uint opposingVertexIdx[3];
-        opposingVertexIdx[0] = 2;
-        opposingVertexIdx[1] = 0;
-        opposingVertexIdx[2] = 1;
-        
-        float height = input.getHeight();
-
-        // HEIGHT SHADING
-        const uint32_t heightMapSize = dtm.heightColorEntryCount;
-        float minShadingHeight = dtm.heightColorMapHeights[0];
-        float maxShadingHeight = dtm.heightColorMapHeights[heightMapSize - 1];
-
-        if (heightMapSize > 0)
-        {
-            // partially based on https://www.shadertoy.com/view/XsXSz4 by Inigo Quilez
-            float2 e0 = v[1] - v[0];
-            float2 e1 = v[2] - v[1];
-            float2 e2 = v[0] - v[2];
-            
-            float triangleAreaSign = -sign(e0.x * e2.y - e0.y * e2.x);
-            float2 v0 = input.position.xy - v[0];
-            float2 v1 = input.position.xy - v[1];
-            float2 v2 = input.position.xy - v[2];
-
-            float distanceToLine0 = sqrt(dot2(v0 - e0 * dot(v0, e0) / dot(e0, e0)));
-            float distanceToLine1 = sqrt(dot2(v1 - e1 * dot(v1, e1) / dot(e1, e1)));
-            float distanceToLine2 = sqrt(dot2(v2 - e2 * dot(v2, e2) / dot(e2, e2)));
-
-            float line0Sdf = distanceToLine0 * triangleAreaSign * (v0.x * e0.y - v0.y * e0.x);
-            float line1Sdf = distanceToLine1 * triangleAreaSign * (v1.x * e1.y - v1.y * e1.x);
-            float line2Sdf = distanceToLine2 * triangleAreaSign * (v2.x * e2.y - v2.y * e2.x);
-            float heightDeriv = fwidth(height);
-            float line3Sdf = (minShadingHeight - height) / heightDeriv;
-            float line4Sdf = (height - maxShadingHeight) / heightDeriv;
-
-            float convexPolygonSdf = max(line0Sdf, line1Sdf);
-            convexPolygonSdf = max(convexPolygonSdf, line2Sdf);
-            convexPolygonSdf = max(convexPolygonSdf, line3Sdf);
-            convexPolygonSdf = max(convexPolygonSdf, line4Sdf);
-
-            localAlpha = 1.0f - smoothstep(0.0f, globals.antiAliasingFactor * 2.0f, convexPolygonSdf);
-
-            // calculate height color
-            DTMSettings::E_HEIGHT_SHADING_MODE mode = dtm.determineHeightShadingMode();
-
-            if(mode == DTMSettings::E_HEIGHT_SHADING_MODE::DISCRETE_VARIABLE_LENGTH_INTERVALS)
-            {
-                DTMSettingsHeightsAccessor dtmHeightsAccessor = { dtm };
-                uint32_t mapIndexPlus1 = nbl::hlsl::upper_bound(dtmHeightsAccessor, 0, heightMapSize, height);
-                uint32_t mapIndex = mapIndexPlus1 == 0 ? mapIndexPlus1 : mapIndexPlus1 - 1;
-
-                float heightDeriv = fwidth(height);
-                bool blendWithPrev = true
-                    && (mapIndex >= heightMapSize - 1 || (height * 2.0 < dtm.heightColorMapHeights[mapIndexPlus1] + dtm.heightColorMapHeights[mapIndex]));
-                
-                // logic explainer: if colorIdx is 0.0 then it means blend with next
-                // if color idx is >= length of the colours array then it means it's also > 0.0 and this blend with prev is true
-                // if color idx is > 0 and < len - 1, then it depends on the current pixel's height value and two closest height values
-                if (blendWithPrev)
-                {
-                    if (mapIndex > 0)
-                    {
-                        float pxDistanceToPrevHeight = (height - dtm.heightColorMapHeights[mapIndex]) / heightDeriv;
-                        float prevColorCoverage = smoothstep(-globals.antiAliasingFactor, globals.antiAliasingFactor, pxDistanceToPrevHeight);
-                        textureColor = lerp(dtm.heightColorMapColors[mapIndex - 1].rgb, dtm.heightColorMapColors[mapIndex].rgb, prevColorCoverage);
-                    }
-                    else
-                    {
-                        textureColor = dtm.heightColorMapColors[mapIndex].rgb;
-                    }
-                }
-                else
-                {
-                    if (mapIndex < heightMapSize - 1)
-                    {
-                        float pxDistanceToNextHeight = (height - dtm.heightColorMapHeights[mapIndexPlus1]) / heightDeriv;
-                        float nextColorCoverage = smoothstep(-globals.antiAliasingFactor, globals.antiAliasingFactor, pxDistanceToNextHeight);
-                        textureColor = lerp(dtm.heightColorMapColors[mapIndex].rgb, dtm.heightColorMapColors[mapIndexPlus1].rgb, nextColorCoverage);
-                    }
-                    else
-                    {
-                        textureColor = dtm.heightColorMapColors[mapIndex].rgb;
-                    }
-                }
-
-                //localAlpha = dtm.heightColorMapColors[mapIndex].a;
-            }
-            else
-            {
-                float heightTmp;
-                if (mode == DTMSettings::E_HEIGHT_SHADING_MODE::DISCRETE_FIXED_LENGTH_INTERVALS)
-                {
-                    float interval = dtm.intervalWidth;
-                    int sectionIndex = int((height - minShadingHeight) / interval);
-                    heightTmp = minShadingHeight + float(sectionIndex) * interval;
-                }
-                else if (mode == DTMSettings::E_HEIGHT_SHADING_MODE::CONTINOUS_INTERVALS)
-                {
-                    heightTmp = height;
-                }
-
-                DTMSettingsHeightsAccessor dtmHeightsAccessor = { dtm };
-                uint32_t upperBoundHeightIndex = nbl::hlsl::upper_bound(dtmHeightsAccessor, 0, heightMapSize, height);
-                uint32_t lowerBoundHeightIndex = upperBoundHeightIndex == 0 ? upperBoundHeightIndex : upperBoundHeightIndex - 1;
-
-                float upperBoundHeight = dtm.heightColorMapHeights[upperBoundHeightIndex];
-                float lowerBoundHeight = dtm.heightColorMapHeights[lowerBoundHeightIndex];
-                
-                float4 upperBoundColor = dtm.heightColorMapColors[upperBoundHeightIndex];
-                float4 lowerBoundColor = dtm.heightColorMapColors[lowerBoundHeightIndex];
-                
-                float interpolationVal;
-                if (upperBoundHeightIndex == 0)
-                    interpolationVal = 1.0f;
-                else
-                    interpolationVal = (heightTmp - lowerBoundHeight) / (upperBoundHeight - lowerBoundHeight);
-                
-                textureColor = lerp(lowerBoundColor.rgb, upperBoundColor.rgb, interpolationVal);
-                localAlpha = lerp(lowerBoundColor.a, upperBoundColor.a, interpolationVal);;
-            }
-        }
-        //else // TODO: remove!!
-        //{
-        //    printf("WTF");
-        //    return float4(0.0f, 0.0f, 0.0f, 1.0f);
-        //}
-
-        // CONTOUR
-
-        // TODO: move to ubo or push constants
-        const float startHeight = dtm.contourLinesStartHeight;
-        const float endHeight = dtm.contourLinesEndHeight;
-        const float interval = dtm.contourLinesHeightInterval;
-
-        // TODO: can be precomputed
-        const int maxContourLineIdx = (endHeight - startHeight + 1) / interval;
-
-        // TODO: it actually can output a negative number, fix
-        int contourLineIdx = nbl::hlsl::_static_cast<int>((height - startHeight + (interval * 0.5f)) / interval);
-        contourLineIdx = clamp(contourLineIdx, 0, maxContourLineIdx);
-        float contourLineHeight = startHeight + interval * contourLineIdx;
-
-        int contourLinePointsIdx = 0;
-        float2 contourLinePoints[2];
-        // TODO: case where heights we are looking for are on all three vertices
-        for (int i = 0; i < 3; ++i)
-        {
-            if (contourLinePointsIdx == 3)
-                break;
-
-            const uint2 currentEdgePoints = edgePoints[i];
-            float3 p0 = v[currentEdgePoints[0]];
-            float3 p1 = v[currentEdgePoints[1]];
-
-            if (p1.z < p0.z)
-                nbl::hlsl::swap(p0, p1);
-
-            float minHeight = p0.z;
-            float maxHeight = p1.z;
-
-            if (height >= minHeight && height <= maxHeight)
-            {
-                float2 edge = float2(p1.x, p1.y) - float2(p0.x, p0.y);
-                float scale = (contourLineHeight - minHeight) / (maxHeight - minHeight);
-
-                contourLinePoints[contourLinePointsIdx] = scale * edge + float2(p0.x, p0.y);
-                ++contourLinePointsIdx;
-            }
-        }
-
-        {
-            nbl::hlsl::shapes::Line<float> lineSegment = nbl::hlsl::shapes::Line<float>::construct(contourLinePoints[0], contourLinePoints[1]);
-
-            float distance = nbl::hlsl::numeric_limits<float>::max;
-            if (!contourStyle.hasStipples() || stretch == InvalidStyleStretchValue)
-            {
-                distance = ClippedSignedDistance< nbl::hlsl::shapes::Line<float> >::sdf(lineSegment, input.position.xy, contourThickness, contourStyle.isRoadStyleFlag);
-            }
-            else
-            {
-                // TODO:
-                // It might be beneficial to calculate distance between pixel and contour line to early out some pixels and save yourself from stipple sdf computations!
-                // where you only compute the complex sdf if abs((height - contourVal) / heightDeriv) <= aaFactor
-                nbl::hlsl::shapes::Line<float>::ArcLengthCalculator arcLenCalc = nbl::hlsl::shapes::Line<float>::ArcLengthCalculator::construct(lineSegment);
-                LineStyleClipper clipper = LineStyleClipper::construct(contourStyle, lineSegment, arcLenCalc, phaseShift, stretch, worldToScreenRatio);
-                distance = ClippedSignedDistance<nbl::hlsl::shapes::Line<float>, LineStyleClipper>::sdf(lineSegment, input.position.xy, contourThickness, contourStyle.isRoadStyleFlag, clipper);
-            }
-
-            float contourLocalAlpha = smoothstep(+globals.antiAliasingFactor, -globals.antiAliasingFactor, distance) * contourStyle.color.a;
-            textureColor = lerp(textureColor, contourStyle.color.rgb, contourLocalAlpha);
-            localAlpha = max(localAlpha, contourLocalAlpha);
-        }
-
-        
-
-        // OUTLINE
-
-        // find sdf of every edge
-        float triangleAreaTimesTwo;
-        {
-            float3 AB = v[0] - v[1];
-            float3 AC = v[0] - v[2];
-            AB.z = 0.0f;
-            AC.z = 0.0f;
-
-            // TODO: figure out if there is a faster solution
-            triangleAreaTimesTwo = length(cross(AB, AC));
-        }
-
-        // calculate sdf of every edge as it wasn't stippled
-        float distances[3];
-        for (int i = 0; i < 3; ++i)
-        {
-            const uint2 currentEdgePoints = edgePoints[i];
-            float3 A = v[currentEdgePoints[0]];
-            float3 B = v[currentEdgePoints[1]];
-            float3 AB = B - A;
-            float ABLen = length(AB);
-
-            distances[i] = (triangleAreaTimesTwo / ABLen) * baryCoord[opposingVertexIdx[i]];
-        }
-
-        float minDistance = nbl::hlsl::numeric_limits<float>::max;
-        if (!outlineStyle.hasStipples() || stretch == InvalidStyleStretchValue)
-        {
-            for (uint i = 0; i < 3; ++i)
-                distances[i] -= outlineThickness;
-
-            minDistance = min(distances[0], min(distances[1], distances[2]));
-        }
-        else
-        {
-            for (int i = 0; i < 3; ++i)
-            {
-                if (distances[i] > outlineThickness)
-                    continue;
-
-                const uint2 currentEdgePoints = edgePoints[i];
-                float3 p0 = v[currentEdgePoints[0]];
-                float3 p1 = v[currentEdgePoints[1]];
-
-                // long story short, in order for stipple patterns to be consistent:
-                // - point with lesser x coord should be starting point
-                // - if x coord of both points are equal then point with lesser y value should be starting point
-                if (p1.x < p0.x)
-                    nbl::hlsl::swap(p0, p1);
-                else if (p1.x == p0.x && p1.y < p0.y)
-                    nbl::hlsl::swap(p0, p1);
-
-                nbl::hlsl::shapes::Line<float> lineSegment = nbl::hlsl::shapes::Line<float>::construct(float2(p0.x, p0.y), float2(p1.x, p1.y));
-                
-                float distance = nbl::hlsl::numeric_limits<float>::max;
-                nbl::hlsl::shapes::Line<float>::ArcLengthCalculator arcLenCalc = nbl::hlsl::shapes::Line<float>::ArcLengthCalculator::construct(lineSegment);
-                LineStyleClipper clipper = LineStyleClipper::construct(outlineStyle, lineSegment, arcLenCalc, phaseShift, stretch, worldToScreenRatio);
-                distance = ClippedSignedDistance<nbl::hlsl::shapes::Line<float>, LineStyleClipper>::sdf(lineSegment, input.position.xy, outlineThickness, outlineStyle.isRoadStyleFlag, clipper);
-
-                minDistance = min(minDistance, distance);
-            }
-
-        }
-
-        float outlineLocalAlpha = smoothstep(+globals.antiAliasingFactor, -globals.antiAliasingFactor, minDistance) * outlineStyle.color.a;
-        textureColor = lerp(textureColor, outlineStyle.color.rgb, outlineLocalAlpha);
-        localAlpha = max(localAlpha, outlineLocalAlpha);
-    }
-
-    return calculateFinalColor<nbl::hlsl::jit::device_capabilities::fragmentShaderPixelInterlock>(uint2(input.position.xy), localAlpha, currentMainObjectIdx, textureColor, true);
-#endif
-    // figure out local alpha with sdf
-    if (objType == ObjectType::LINE || objType == ObjectType::QUAD_BEZIER || objType == ObjectType::POLYLINE_CONNECTOR)
-    {
-        float distance = nbl::hlsl::numeric_limits<float>::max;
-        if (objType == ObjectType::LINE)
-        {
-            const float2 start = input.getLineStart();
-            const float2 end = input.getLineEnd();
-            const uint32_t styleIdx = mainObj.styleIdx;
-            const float thickness = input.getLineThickness();
-            const float phaseShift = input.getCurrentPhaseShift();
-            const float stretch = input.getPatternStretch();
-            const float worldToScreenRatio = input.getCurrentWorldToScreenRatio();
-
-            nbl::hlsl::shapes::Line<float> lineSegment = nbl::hlsl::shapes::Line<float>::construct(start, end);
-
-            LineStyle style = loadLineStyle(styleIdx);
-
-            if (!style.hasStipples() || stretch == InvalidStyleStretchValue)
-            {
-                distance = ClippedSignedDistance< nbl::hlsl::shapes::Line<float> >::sdf(lineSegment, input.position.xy, thickness, style.isRoadStyleFlag);
-            }
-            else
-            {
-                nbl::hlsl::shapes::Line<float>::ArcLengthCalculator arcLenCalc = nbl::hlsl::shapes::Line<float>::ArcLengthCalculator::construct(lineSegment);
-                LineStyleClipper clipper = LineStyleClipper::construct(loadLineStyle(styleIdx), lineSegment, arcLenCalc, phaseShift, stretch, worldToScreenRatio);
-                distance = ClippedSignedDistance<nbl::hlsl::shapes::Line<float>, LineStyleClipper>::sdf(lineSegment, input.position.xy, thickness, style.isRoadStyleFlag, clipper);
-            }
-        }
-        else if (objType == ObjectType::QUAD_BEZIER)
-        {
-            nbl::hlsl::shapes::Quadratic<float> quadratic = input.getQuadratic();
-            nbl::hlsl::shapes::Quadratic<float>::ArcLengthCalculator arcLenCalc = input.getQuadraticArcLengthCalculator();
-
-            const uint32_t styleIdx = mainObj.styleIdx;
-            const float thickness = input.getLineThickness();
-            const float phaseShift = input.getCurrentPhaseShift();
-            const float stretch = input.getPatternStretch();
-            const float worldToScreenRatio = input.getCurrentWorldToScreenRatio();
-
-            LineStyle style = loadLineStyle(styleIdx);
-            if (!style.hasStipples() || stretch == InvalidStyleStretchValue)
-            {
-                distance = ClippedSignedDistance< nbl::hlsl::shapes::Quadratic<float> >::sdf(quadratic, input.position.xy, thickness, style.isRoadStyleFlag);
-            }
-            else
-            {
-                BezierStyleClipper clipper = BezierStyleClipper::construct(loadLineStyle(styleIdx), quadratic, arcLenCalc, phaseShift, stretch, worldToScreenRatio);
-                distance = ClippedSignedDistance<nbl::hlsl::shapes::Quadratic<float>, BezierStyleClipper>::sdf(quadratic, input.position.xy, thickness, style.isRoadStyleFlag, clipper);
-            }
-        }
-        else if (objType == ObjectType::POLYLINE_CONNECTOR)
-        {
-            const float2 P = input.position.xy - input.getPolylineConnectorCircleCenter();
-            distance = miterSDF(
-                P,
-                input.getLineThickness(),
-                input.getPolylineConnectorTrapezoidStart(),
-                input.getPolylineConnectorTrapezoidEnd(),
-                input.getPolylineConnectorTrapezoidLongBase(),
-                input.getPolylineConnectorTrapezoidShortBase());
-
-        }
-        localAlpha = smoothstep(+globals.antiAliasingFactor, -globals.antiAliasingFactor, distance);
-    }
-    else if (objType == ObjectType::CURVE_BOX) 
-    {
-        const float minorBBoxUV = input.getMinorBBoxUV();
-        const float majorBBoxUV = input.getMajorBBoxUV();
-
-        nbl::hlsl::math::equations::Quadratic<float> curveMinMinor = input.getCurveMinMinor();
-        nbl::hlsl::math::equations::Quadratic<float> curveMinMajor = input.getCurveMinMajor();
-        nbl::hlsl::math::equations::Quadratic<float> curveMaxMinor = input.getCurveMaxMinor();
-        nbl::hlsl::math::equations::Quadratic<float> curveMaxMajor = input.getCurveMaxMajor();
-
-        //  TODO(Optimization): Can we ignore this majorBBoxUV clamp and rely on the t clamp that happens next? then we can pass `PrecomputedRootFinder`s instead of computing the values per pixel.
-        nbl::hlsl::math::equations::Quadratic<float> minCurveEquation = nbl::hlsl::math::equations::Quadratic<float>::construct(curveMinMajor.a, curveMinMajor.b, curveMinMajor.c - clamp(majorBBoxUV, 0.0, 1.0));
-        nbl::hlsl::math::equations::Quadratic<float> maxCurveEquation = nbl::hlsl::math::equations::Quadratic<float>::construct(curveMaxMajor.a, curveMaxMajor.b, curveMaxMajor.c - clamp(majorBBoxUV, 0.0, 1.0));
-
-        const float minT = clamp(PrecomputedRootFinder<float>::construct(minCurveEquation).computeRoots(), 0.0, 1.0);
-        const float minEv = curveMinMinor.evaluate(minT);
-
-        const float maxT = clamp(PrecomputedRootFinder<float>::construct(maxCurveEquation).computeRoots(), 0.0, 1.0);
-        const float maxEv = curveMaxMinor.evaluate(maxT);
-
-        const bool insideMajor = majorBBoxUV >= 0.0 && majorBBoxUV <= 1.0;
-        const bool insideMinor = minorBBoxUV >= minEv && minorBBoxUV <= maxEv;
-
-        if (insideMinor && insideMajor)
-        {
-            localAlpha = 1.0;
-        }
-        else
-        {
-            // Find the true SDF of a hatch box boundary which is bounded by two curves, It requires knowing the distance from the current UV to the closest point on bounding curves and the limiting lines (in major direction)
-            // We also keep track of distance vector (minor, major) to convert to screenspace distance for anti-aliasing with screenspace aaFactor
-            const float InvalidT = nbl::hlsl::numeric_limits<float32_t>::max;
-            const float MAX_DISTANCE_SQUARED = nbl::hlsl::numeric_limits<float32_t>::max;
-
-            const float2 boxScreenSpaceSize = input.getCurveBoxScreenSpaceSize();
-
-
-            float closestDistanceSquared = MAX_DISTANCE_SQUARED;
-            const float2 pos = float2(minorBBoxUV, majorBBoxUV) * boxScreenSpaceSize;
-
-            if (minorBBoxUV < minEv)
-            {
-                // DO SDF of Min Curve
-                nbl::hlsl::shapes::Quadratic<float> minCurve = nbl::hlsl::shapes::Quadratic<float>::construct(
-                    float2(curveMinMinor.a, curveMinMajor.a) * boxScreenSpaceSize,
-                    float2(curveMinMinor.b, curveMinMajor.b) * boxScreenSpaceSize,
-                    float2(curveMinMinor.c, curveMinMajor.c) * boxScreenSpaceSize);
-
-                nbl::hlsl::shapes::Quadratic<float>::Candidates candidates = minCurve.getClosestCandidates(pos);
-                [[unroll(nbl::hlsl::shapes::Quadratic<float>::MaxCandidates)]]
-                for (uint32_t i = 0; i < nbl::hlsl::shapes::Quadratic<float>::MaxCandidates; i++)
-                {
-                    candidates[i] = clamp(candidates[i], 0.0, 1.0);
-                    const float2 distVector = minCurve.evaluate(candidates[i]) - pos;
-                    const float candidateDistanceSquared = dot(distVector, distVector);
-                    if (candidateDistanceSquared < closestDistanceSquared)
-                        closestDistanceSquared = candidateDistanceSquared;
-                }
-            }
-            else if (minorBBoxUV > maxEv)
-            {
-                // Do SDF of Max Curve
-                nbl::hlsl::shapes::Quadratic<float> maxCurve = nbl::hlsl::shapes::Quadratic<float>::construct(
-                    float2(curveMaxMinor.a, curveMaxMajor.a) * boxScreenSpaceSize,
-                    float2(curveMaxMinor.b, curveMaxMajor.b) * boxScreenSpaceSize,
-                    float2(curveMaxMinor.c, curveMaxMajor.c) * boxScreenSpaceSize);
-                nbl::hlsl::shapes::Quadratic<float>::Candidates candidates = maxCurve.getClosestCandidates(pos);
-                [[unroll(nbl::hlsl::shapes::Quadratic<float>::MaxCandidates)]]
-                for (uint32_t i = 0; i < nbl::hlsl::shapes::Quadratic<float>::MaxCandidates; i++)
-                {
-                    candidates[i] = clamp(candidates[i], 0.0, 1.0);
-                    const float2 distVector = maxCurve.evaluate(candidates[i]) - pos;
-                    const float candidateDistanceSquared = dot(distVector, distVector);
-                    if (candidateDistanceSquared < closestDistanceSquared)
-                        closestDistanceSquared = candidateDistanceSquared;
-                }
-            }
-
-            if (!insideMajor)
-            {
-                const bool minLessThanMax = minEv < maxEv;
-                float2 majorDistVector = float2(MAX_DISTANCE_SQUARED, MAX_DISTANCE_SQUARED);
-                if (majorBBoxUV > 1.0)
-                {
-                    const float2 minCurveEnd = float2(minEv, 1.0) * boxScreenSpaceSize;
-                    if (minLessThanMax)
-                        majorDistVector = sdLineDstVec(pos, minCurveEnd, float2(maxEv, 1.0) * boxScreenSpaceSize);
-                    else
-                        majorDistVector = pos - minCurveEnd;
-                }
-                else
-                {
-                    const float2 minCurveStart = float2(minEv, 0.0) * boxScreenSpaceSize;
-                    if (minLessThanMax)
-                        majorDistVector = sdLineDstVec(pos, minCurveStart, float2(maxEv, 0.0) * boxScreenSpaceSize);
-                    else
-                        majorDistVector = pos - minCurveStart;
-                }
-
-                const float majorDistSq = dot(majorDistVector, majorDistVector);
-                if (majorDistSq < closestDistanceSquared)
-                    closestDistanceSquared = majorDistSq;
-            }
-
-            const float dist = sqrt(closestDistanceSquared);
-            localAlpha = 1.0f - smoothstep(0.0, globals.antiAliasingFactor, dist);
-        }
-
-        LineStyle style = loadLineStyle(mainObj.styleIdx);
-        uint32_t textureId = asuint(style.screenSpaceLineWidth);
-        if (textureId != InvalidTextureIdx)
-        {
-            // For Hatch fiils we sample the first mip as we don't fill the others, because they are constant in screenspace and render as expected
-            // If later on we decided that we can have different sizes here, we should do computations similar to FONT_GLYPH
-            float3 msdfSample = msdfTextures.SampleLevel(msdfSampler, float3(frac(input.position.xy / HatchFillMSDFSceenSpaceSize), float(textureId)), 0.0).xyz;
-            float msdf = nbl::hlsl::text::msdfDistance(msdfSample, MSDFPixelRange * HatchFillMSDFSceenSpaceSize / MSDFSize);
-            localAlpha *= smoothstep(+globals.antiAliasingFactor / 2.0, -globals.antiAliasingFactor / 2.0f, msdf);
-        }
-    }
-    else if (objType == ObjectType::FONT_GLYPH) 
-    {
-        const float2 uv = input.getFontGlyphUV();
-        const uint32_t textureId = input.getFontGlyphTextureId();
-
-        if (textureId != InvalidTextureIdx)
-        {
-            float mipLevel = msdfTextures.CalculateLevelOfDetail(msdfSampler, uv);
-            float3 msdfSample = msdfTextures.SampleLevel(msdfSampler, float3(uv, float(textureId)), mipLevel);
-            float msdf = nbl::hlsl::text::msdfDistance(msdfSample, input.getFontGlyphPxRange());
-            /*
-                explaining "*= exp2(max(mipLevel,0.0))"
-                Each mip level has constant MSDFPixelRange
-                Which essentially makes the msdfSamples here (Harware Sampled) have different scales per mip
-                As we go up 1 mip level, the msdf distance should be multiplied by 2.0
-                While this makes total sense for NEAREST mip sampling when mipLevel is an integer and only one mip is being sampled.
-                It's a bit complex when it comes to trilinear filtering (LINEAR mip sampling), but it works in practice!
-                
-                Alternatively you can think of it as doing this instead:
-                localAlpha = smoothstep(+globals.antiAliasingFactor / exp2(max(mipLevel,0.0)), 0.0, msdf);
-                Which is reducing the aa feathering as we go up the mip levels. 
-                to avoid aa feathering of the MAX_MSDF_DISTANCE_VALUE to be less than aa factor and eventually color it and cause greyed out area around the main glyph
-            */
-            msdf *= exp2(max(mipLevel,0.0));
-            
-            LineStyle style = loadLineStyle(mainObj.styleIdx);
-            const float screenPxRange = input.getFontGlyphPxRange() / MSDFPixelRangeHalf;
-            const float bolden = style.worldSpaceLineWidth * screenPxRange; // worldSpaceLineWidth is actually boldenInPixels, aliased TextStyle with LineStyle
-            localAlpha = smoothstep(+globals.antiAliasingFactor / 2.0f + bolden, -globals.antiAliasingFactor / 2.0f + bolden, msdf);
-        }
-    }
-    else if (objType == ObjectType::IMAGE) 
-    {
-        const float2 uv = input.getImageUV();
-        const uint32_t textureId = input.getImageTextureId();
-
-        if (textureId != InvalidTextureIdx)
-        {
-            float4 colorSample = textures[NonUniformResourceIndex(textureId)].Sample(textureSampler, float2(uv.x, uv.y));
-            textureColor = colorSample.rgb;
-            localAlpha = colorSample.a;
-        }
-    }
-
-    uint2 fragCoord = uint2(input.position.xy);
-    
-    if (localAlpha <= 0)
-        discard;
-    
-    const bool colorFromTexture = objType == ObjectType::IMAGE;
-    
-    // TODO[Przemek]: But make sure you're still calling this, correctly calculating alpha and texture color.
-    // you can add 1 main object and push via DrawResourcesFiller like we already do for other objects (this go in the mainObjects StorageBuffer) and then set the currentMainObjectIdx to 0 here
-    // having 1 main object temporarily means that all triangle meshes will be treated as a unified object in blending operations. 
-    return calculateFinalColor<nbl::hlsl::jit::device_capabilities::fragmentShaderPixelInterlock>(fragCoord, localAlpha, currentMainObjectIdx, textureColor, colorFromTexture);
-}
+#define FRAGMENT_SHADER_INPUT
+#include "common.hlsl"
+#include <nbl/builtin/hlsl/shapes/beziers.hlsl>
+#include <nbl/builtin/hlsl/shapes/line.hlsl>
+#include <nbl/builtin/hlsl/algorithm.hlsl>
+#include <nbl/builtin/hlsl/math/equations/quadratic.hlsl>
+#include <nbl/builtin/hlsl/math/geometry.hlsl>
+#include <nbl/builtin/hlsl/spirv_intrinsics/fragment_shader_pixel_interlock.hlsl>
+#include <nbl/builtin/hlsl/jit/device_capabilities.hlsl>
+#include <nbl/builtin/hlsl/text_rendering/msdf.hlsl>
+#include <nbl/builtin/hlsl/spirv_intrinsics/fragment_shader_barycentric.hlsl>
+
+template<typename float_t>
+struct DefaultClipper
+{
+    using float_t2 = vector<float_t, 2>;
+    NBL_CONSTEXPR_STATIC_INLINE float_t AccuracyThresholdT = 0.0;
+
+    static DefaultClipper construct()
+    {
+        DefaultClipper ret;
+        return ret;
+    }
+
+    inline float_t2 operator()(const float_t t)
+    {
+        const float_t ret = clamp(t, 0.0, 1.0);
+        return float_t2(ret, ret);
+    }
+};
+
+// for usage in upper_bound function
+struct StyleAccessor
+{
+    LineStyle style;
+    using value_type = float;
+
+    float operator[](const uint32_t ix)
+    {
+        return style.getStippleValue(ix);
+    }
+};
+
+template<typename CurveType>
+struct StyleClipper
+{
+    using float_t = typename CurveType::scalar_t;
+    using float_t2 = typename CurveType::float_t2;
+    using float_t3 = typename CurveType::float_t3;
+    NBL_CONSTEXPR_STATIC_INLINE float_t AccuracyThresholdT = 0.000001;
+
+    static StyleClipper<CurveType> construct(
+        LineStyle style,
+        CurveType curve,
+        typename CurveType::ArcLengthCalculator arcLenCalc,
+        float phaseShift,
+        float stretch,
+        float worldToScreenRatio)
+    {
+        StyleClipper<CurveType> ret = { style, curve, arcLenCalc, phaseShift, stretch, worldToScreenRatio, 0.0f, 0.0f, 0.0f, 0.0f };
+
+        // values for non-uniform stretching with a rigid segment
+        if (style.rigidSegmentIdx != InvalidRigidSegmentIndex && stretch != 1.0f)
+        {
+            // rigidSegment info in old non stretched pattern
+            ret.rigidSegmentStart = (style.rigidSegmentIdx >= 1u) ? style.getStippleValue(style.rigidSegmentIdx - 1u) : 0.0f;
+            ret.rigidSegmentEnd = (style.rigidSegmentIdx < style.stipplePatternSize) ? style.getStippleValue(style.rigidSegmentIdx) : 1.0f;
+            ret.rigidSegmentLen = ret.rigidSegmentEnd - ret.rigidSegmentStart;
+            // stretch value for non rigid segments
+            ret.nonRigidSegmentStretchValue = (stretch - ret.rigidSegmentLen) / (1.0f - ret.rigidSegmentLen);
+            // rigidSegment info to new stretched pattern
+            ret.rigidSegmentStart *= ret.nonRigidSegmentStretchValue / stretch; // get the new normalized rigid segment start
+            ret.rigidSegmentLen /= stretch; // get the new rigid segment normalized len
+            ret.rigidSegmentEnd = ret.rigidSegmentStart + ret.rigidSegmentLen; // get the new normalized rigid segment end 
+        }
+        else
+        {
+            ret.nonRigidSegmentStretchValue = stretch;
+        }
+        
+        return ret;
+    }
+
+    // For non-uniform stretching with a rigid segment (the one segement that shouldn't stretch) the whole pattern changes
+    // instead of transforming each of the style.stipplePattern values (max 14 of them), we transform the normalized place in pattern
+    float getRealNormalizedPlaceInPattern(float normalizedPlaceInPattern)
+    {
+        if (style.rigidSegmentIdx != InvalidRigidSegmentIndex && stretch != 1.0f)
+        {
+            float ret = min(normalizedPlaceInPattern, rigidSegmentStart) / nonRigidSegmentStretchValue; // unstretch parts before rigid segment
+            ret += max(normalizedPlaceInPattern - rigidSegmentEnd, 0.0f) / nonRigidSegmentStretchValue; // unstretch parts after rigid segment
+            ret += max(min(rigidSegmentLen, normalizedPlaceInPattern - rigidSegmentStart), 0.0f); // unstretch parts inside rigid segment
+            ret *= stretch;
+            return ret;
+        }
+        else
+        {
+            return normalizedPlaceInPattern;
+        }
+    }
+
+    float_t2 operator()(float_t t)
+    {
+        // basicaly 0.0 and 1.0 but with a guardband to discard outside the range
+        const float_t minT = 0.0 - 1.0;
+        const float_t maxT = 1.0 + 1.0;
+
+        StyleAccessor styleAccessor = { style };
+        const float_t reciprocalStretchedStipplePatternLen = style.reciprocalStipplePatternLen / stretch;
+        const float_t patternLenInScreenSpace = 1.0 / (worldToScreenRatio * style.reciprocalStipplePatternLen);
+
+        const float_t arcLen = arcLenCalc.calcArcLen(t);
+        const float_t worldSpaceArcLen = arcLen * float_t(worldToScreenRatio);
+        float_t normalizedPlaceInPattern = frac(worldSpaceArcLen * reciprocalStretchedStipplePatternLen + phaseShift);
+        normalizedPlaceInPattern = getRealNormalizedPlaceInPattern(normalizedPlaceInPattern);
+        uint32_t patternIdx = nbl::hlsl::upper_bound(styleAccessor, 0, style.stipplePatternSize, normalizedPlaceInPattern);
+
+        const float_t InvalidT = nbl::hlsl::numeric_limits<float32_t>::infinity; 
+        float_t2 ret = float_t2(InvalidT, InvalidT);
+
+        // odd patternIdx means a "no draw section" and current candidate should split into two nearest draw sections
+        const bool notInDrawSection = patternIdx & 0x1;
+        
+        // TODO[Erfan]: Disable this piece of code after clipping, and comment the reason, that the bezier start and end at 0.0 and 1.0 should be in drawable sections
+        float_t minDrawT = 0.0;
+        float_t maxDrawT = 1.0;
+        {
+            float_t normalizedPlaceInPatternBegin = frac(phaseShift);
+            normalizedPlaceInPatternBegin = getRealNormalizedPlaceInPattern(normalizedPlaceInPatternBegin);
+            uint32_t patternIdxBegin = nbl::hlsl::upper_bound(styleAccessor, 0, style.stipplePatternSize, normalizedPlaceInPatternBegin);
+            const bool BeginInNonDrawSection = patternIdxBegin & 0x1;
+
+            if (BeginInNonDrawSection)
+            {
+                float_t diffToRightDrawableSection = (patternIdxBegin == style.stipplePatternSize) ? 1.0 : styleAccessor[patternIdxBegin];
+                diffToRightDrawableSection -= normalizedPlaceInPatternBegin;
+                float_t scrSpcOffsetToArcLen1 = diffToRightDrawableSection * patternLenInScreenSpace * ((patternIdxBegin != style.rigidSegmentIdx) ? nonRigidSegmentStretchValue : 1.0);
+                const float_t arcLenForT1 = 0.0 + scrSpcOffsetToArcLen1;
+                minDrawT = arcLenCalc.calcArcLenInverse(curve, minT, maxT, arcLenForT1, AccuracyThresholdT, 0.0);
+            }
+            
+            // Completely in non-draw section -> clip away:
+            if (minDrawT >= 1.0)
+                return ret;
+
+            const float_t arcLenEnd = arcLenCalc.calcArcLen(1.0);
+            const float_t worldSpaceArcLenEnd = arcLenEnd * float_t(worldToScreenRatio);
+            float_t normalizedPlaceInPatternEnd = frac(worldSpaceArcLenEnd * reciprocalStretchedStipplePatternLen + phaseShift);
+            normalizedPlaceInPatternEnd = getRealNormalizedPlaceInPattern(normalizedPlaceInPatternEnd);
+            uint32_t patternIdxEnd = nbl::hlsl::upper_bound(styleAccessor, 0, style.stipplePatternSize, normalizedPlaceInPatternEnd);
+            const bool EndInNonDrawSection = patternIdxEnd & 0x1;
+
+            if (EndInNonDrawSection)
+            {
+                float_t diffToLeftDrawableSection = (patternIdxEnd == 0) ? 0.0 : styleAccessor[patternIdxEnd - 1];
+                diffToLeftDrawableSection -= normalizedPlaceInPatternEnd;
+                float_t scrSpcOffsetToArcLen0 = diffToLeftDrawableSection * patternLenInScreenSpace * ((patternIdxEnd != style.rigidSegmentIdx) ? nonRigidSegmentStretchValue : 1.0);
+                const float_t arcLenForT0 = arcLenEnd + scrSpcOffsetToArcLen0;
+                maxDrawT = arcLenCalc.calcArcLenInverse(curve, minT, maxT, arcLenForT0, AccuracyThresholdT, 1.0);
+            }
+        }
+
+        if (notInDrawSection)
+        {
+            float toScreenSpaceLen = patternLenInScreenSpace * ((patternIdx != style.rigidSegmentIdx) ? nonRigidSegmentStretchValue : 1.0);
+
+            float_t diffToLeftDrawableSection = (patternIdx == 0) ? 0.0 : styleAccessor[patternIdx - 1];
+            diffToLeftDrawableSection -= normalizedPlaceInPattern;
+            float_t scrSpcOffsetToArcLen0 = diffToLeftDrawableSection * toScreenSpaceLen;
+            const float_t arcLenForT0 = arcLen + scrSpcOffsetToArcLen0;
+            float_t t0 = arcLenCalc.calcArcLenInverse(curve, minT, maxT, arcLenForT0, AccuracyThresholdT, t);
+            t0 = clamp(t0, minDrawT, maxDrawT);
+
+            float_t diffToRightDrawableSection = (patternIdx == style.stipplePatternSize) ? 1.0 : styleAccessor[patternIdx];
+            diffToRightDrawableSection -= normalizedPlaceInPattern;
+            float_t scrSpcOffsetToArcLen1 = diffToRightDrawableSection * toScreenSpaceLen;
+            const float_t arcLenForT1 = arcLen + scrSpcOffsetToArcLen1;
+            float_t t1 = arcLenCalc.calcArcLenInverse(curve, minT, maxT, arcLenForT1, AccuracyThresholdT, t);
+            t1 = clamp(t1, minDrawT, maxDrawT);
+
+            ret = float_t2(t0, t1);
+        }
+        else
+        {
+            t = clamp(t, minDrawT, maxDrawT);
+            ret = float_t2(t, t);
+        }
+
+        return ret;
+    }
+
+    LineStyle style;
+    CurveType curve;
+    typename CurveType::ArcLengthCalculator arcLenCalc;
+    float phaseShift;
+    float stretch;
+    float worldToScreenRatio;
+    // precomp value for non uniform stretching
+    float rigidSegmentStart;
+    float rigidSegmentEnd;
+    float rigidSegmentLen;
+    float nonRigidSegmentStretchValue;
+};
+
+template<typename CurveType, typename Clipper = DefaultClipper<typename CurveType::scalar_t> >
+struct ClippedSignedDistance
+{
+    using float_t = typename CurveType::scalar_t;
+    using float_t2 = typename CurveType::float_t2;
+    using float_t3 = typename CurveType::float_t3;
+
+    const static float_t sdf(CurveType curve, float_t2 pos, float_t thickness, bool isRoadStyle, Clipper clipper = DefaultClipper<typename CurveType::scalar_t>::construct())
+    {
+        typename CurveType::Candidates candidates = curve.getClosestCandidates(pos);
+
+        const float_t InvalidT = nbl::hlsl::numeric_limits<float32_t>::max;
+        // TODO: Fix and test, we're not working with squared distance anymore
+        const float_t MAX_DISTANCE_SQUARED = (thickness + 1.0f) * (thickness + 1.0f); // TODO: ' + 1' is too much?
+
+        bool clipped = false;
+        float_t closestDistanceSquared = MAX_DISTANCE_SQUARED;
+        float_t closestT = InvalidT;
+        [[unroll(CurveType::MaxCandidates)]]
+        for (uint32_t i = 0; i < CurveType::MaxCandidates; i++)
+        {
+            const float_t candidateDistanceSquared = length(curve.evaluate(candidates[i]) - pos);
+            if (candidateDistanceSquared < closestDistanceSquared)
+            {
+                float_t2 snappedTs = clipper(candidates[i]);
+
+                if (snappedTs[0] == InvalidT)
+                {
+                    continue;
+                }
+
+                if (snappedTs[0] != candidates[i])
+                {
+                    // left snapped or clamped
+                    const float_t leftSnappedCandidateDistanceSquared = length(curve.evaluate(snappedTs[0]) - pos);
+                    if (leftSnappedCandidateDistanceSquared < closestDistanceSquared)
+                    {
+                        clipped = true;
+                        closestT = snappedTs[0];
+                        closestDistanceSquared = leftSnappedCandidateDistanceSquared;
+                    }
+
+                    if (snappedTs[0] != snappedTs[1])
+                    {
+                        // right snapped or clamped
+                        const float_t rightSnappedCandidateDistanceSquared = length(curve.evaluate(snappedTs[1]) - pos);
+                        if (rightSnappedCandidateDistanceSquared < closestDistanceSquared)
+                        {
+                            clipped = true;
+                            closestT = snappedTs[1];
+                            closestDistanceSquared = rightSnappedCandidateDistanceSquared;
+                        }
+                    }
+                }
+                else
+                {
+                    // no snapping
+                    if (candidateDistanceSquared < closestDistanceSquared)
+                    {
+                        clipped = false;
+                        closestT = candidates[i];
+                        closestDistanceSquared = candidateDistanceSquared;
+                    }
+                }
+            }
+        }
+
+
+        float_t roundedDistance = closestDistanceSquared - thickness;
+        if(!isRoadStyle)
+        {
+            return roundedDistance;
+        }
+        else
+        {
+            const float_t aaWidth = globals.antiAliasingFactor;
+            float_t rectCappedDistance = roundedDistance;
+
+            if (clipped)
+            {
+                float_t2 q = mul(curve.getLocalCoordinateSpace(closestT), pos - curve.evaluate(closestT));
+                rectCappedDistance = capSquare(q, thickness, aaWidth);
+            }
+
+            return rectCappedDistance;
+        }
+    }
+
+    static float capSquare(float_t2 q, float_t th, float_t aaWidth)
+    {
+        float_t2 d = abs(q) - float_t2(aaWidth, th);
+        return length(max(d, 0.0)) + min(max(d.x, d.y), 0.0);
+    }
+};
+
+// sdf of Isosceles Trapezoid y-aligned by https://iquilezles.org/articles/distfunctions2d/
+float sdTrapezoid(float2 p, float r1, float r2, float he)
+{
+    float2 k1 = float2(r2, he);
+    float2 k2 = float2(r2 - r1, 2.0 * he);
+
+    p.x = abs(p.x);
+    float2 ca = float2(max(0.0, p.x - ((p.y < 0.0) ? r1 : r2)), abs(p.y) - he);
+    float2 cb = p - k1 + k2 * clamp(dot(k1 - p, k2) / dot(k2,k2), 0.0, 1.0);
+
+    float s = (cb.x < 0.0 && ca.y < 0.0) ? -1.0 : 1.0;
+
+    return s * sqrt(min(dot(ca,ca), dot(cb,cb)));
+}
+
+// line segment sdf which returns the distance vector specialized for usage in hatch box line boundaries
+float2 sdLineDstVec(float2 P, float2 A, float2 B)
+{
+    const float2 PA = P - A;
+    const float2 BA = B - A;
+    float h = clamp(dot(PA, BA) / dot(BA, BA), 0.0, 1.0);
+    return PA - BA * h;
+}
+
+float miterSDF(float2 p, float thickness, float2 a, float2 b, float ra, float rb)
+{
+    float h = length(b - a) / 2.0;
+    float2 d = normalize(b - a);
+    float2x2 rot = float2x2(d.y, -d.x, d.x, d.y);
+    p = mul(rot, p);
+    p.y -= h - thickness;
+    return sdTrapezoid(p, ra, rb, h);
+}
+
+typedef StyleClipper< nbl::hlsl::shapes::Quadratic<float> > BezierStyleClipper;
+typedef StyleClipper< nbl::hlsl::shapes::Line<float> > LineStyleClipper;
+
+// for usage in upper_bound function
+struct DTMSettingsHeightsAccessor
+{
+    DTMSettings dtmSettings;
+    using value_type = float;
+
+    float operator[](const uint32_t ix)
+    {
+        return dtmSettings.heightColorMapHeights[ix];
+    }
+};
+
+// We need to specialize color calculation based on FragmentShaderInterlock feature availability for our transparency algorithm
+// because there is no `if constexpr` in hlsl
+// @params
+// textureColor: color sampled from a texture
+// useStyleColor: instead of writing and reading from colorStorage, use main object Idx to find the style color for the object.
+template<bool FragmentShaderPixelInterlock>
+float32_t4 calculateFinalColor(const uint2 fragCoord, const float localAlpha, const uint32_t currentMainObjectIdx, float3 textureColor, bool colorFromTexture);
+
+template<>
+float32_t4 calculateFinalColor<false>(const uint2 fragCoord, const float localAlpha, const uint32_t currentMainObjectIdx, float3 localTextureColor, bool colorFromTexture)
+{
+    uint32_t styleIdx = loadMainObject(currentMainObjectIdx).styleIdx;
+    if (!colorFromTexture)
+    {
+        float32_t4 col = loadLineStyle(styleIdx).color;
+        col.w *= localAlpha;
+        return float4(col);
+    }
+    else
+        return float4(localTextureColor, localAlpha);
+}
+template<>
+float32_t4 calculateFinalColor<true>(const uint2 fragCoord, const float localAlpha, const uint32_t currentMainObjectIdx, float3 localTextureColor, bool colorFromTexture)
+{
+    float32_t4 color;
+    nbl::hlsl::spirv::beginInvocationInterlockEXT();
+
+    const uint32_t packedData = pseudoStencil[fragCoord];
+
+    const uint32_t localQuantizedAlpha = (uint32_t)(localAlpha * 255.f);
+    const uint32_t storedQuantizedAlpha = nbl::hlsl::glsl::bitfieldExtract<uint32_t>(packedData,0,AlphaBits);
+    const uint32_t storedMainObjectIdx = nbl::hlsl::glsl::bitfieldExtract<uint32_t>(packedData,AlphaBits,MainObjectIdxBits);
+    // if geomID has changed, we resolve the SDF alpha (draw using blend), else accumulate
+    const bool differentMainObject = currentMainObjectIdx != storedMainObjectIdx; // meaning current pixel's main object is different than what is already stored
+    const bool resolve = differentMainObject && storedMainObjectIdx != InvalidMainObjectIdx;
+    uint32_t toResolveStyleIdx = InvalidStyleIdx;
+    
+    // load from colorStorage only if we want to resolve color from texture instead of style
+    // sampling from colorStorage needs to happen in critical section because another fragment may also want to store into it at the same time + need to happen before store
+    if (resolve)
+    {
+        toResolveStyleIdx = loadMainObject(storedMainObjectIdx).styleIdx;
+        if (toResolveStyleIdx == InvalidStyleIdx) // if style idx to resolve is invalid, then it means we should resolve from color
+            color = float32_t4(unpackR11G11B10_UNORM(colorStorage[fragCoord]), 1.0f);
+    }
+    
+    // If current localAlpha is higher than what is already stored in pseudoStencil we will update the value in pseudoStencil or the color in colorStorage, this is equivalent to programmable blending MAX operation.
+    // OR If previous pixel has a different ID than current's  (i.e. previous either empty/invalid or a differnet mainObject), we should update our alpha and color storages.
+    if (differentMainObject || localQuantizedAlpha > storedQuantizedAlpha)
+    {
+        pseudoStencil[fragCoord] = nbl::hlsl::glsl::bitfieldInsert<uint32_t>(localQuantizedAlpha,currentMainObjectIdx,AlphaBits,MainObjectIdxBits);
+        if (colorFromTexture) // writing color from texture
+            colorStorage[fragCoord] = packR11G11B10_UNORM(localTextureColor);
+    }
+    
+    nbl::hlsl::spirv::endInvocationInterlockEXT();
+
+    if (!resolve)
+        discard;
+
+    // draw with previous geometry's style's color or stored in texture buffer :kek:
+    // we don't need to load the style's color in critical section because we've already retrieved the style index from the stored main obj
+    if (toResolveStyleIdx != InvalidStyleIdx) // if toResolveStyleIdx is valid then that means our resolved color should come from line style
+        color = loadLineStyle(toResolveStyleIdx).color;
+    color.a *= float(storedQuantizedAlpha) / 255.f;
+    
+    return color;
+}
+
+float dot2(in float2 vec)
+{
+    return dot(vec, vec);
+}
+
+[[vk::spvexecutionmode(spv::ExecutionModePixelInterlockOrderedEXT)]]
+[shader("pixel")]
+float4 fragMain(PSInput input) : SV_TARGET
+{
+    float localAlpha = 0.0f;
+    float3 textureColor = float3(0, 0, 0); // color sampled from a texture
+
+    ObjectType objType = input.getObjType();
+    const uint32_t currentMainObjectIdx = input.getMainObjectIdx();
+    const MainObject mainObj = loadMainObject(currentMainObjectIdx);
+    
+    if (pc.isDTMRendering)
+    {   
+        // TRIANGLE RENDERING
+        {
+            const float outlineThickness = input.getOutlineThickness();
+            const float contourThickness = input.getContourLineThickness();
+            const float phaseShift = 0.0f; // input.getCurrentPhaseShift();
+            const float stretch = 1.0f; // TODO: figure out what is it for ---> [ERFAN's REPLY: no need to give shit about this in dtms, it's for special shape styles] 
+            const float worldToScreenRatio = input.getCurrentWorldToScreenRatio();
+
+            DTMSettings dtm = loadDTMSettings(mainObj.dtmSettingsIdx);
+            LineStyle outlineStyle = loadLineStyle(dtm.outlineLineStyleIdx);
+            LineStyle contourStyle = loadLineStyle(dtm.contourLineStyleIdx);
+
+            float3 v[3];
+            v[0] = input.getScreenSpaceVertexAttribs(0);
+            v[1] = input.getScreenSpaceVertexAttribs(1);
+            v[2] = input.getScreenSpaceVertexAttribs(2);
+
+            const float3 baryCoord = nbl::hlsl::spirv::BaryCoordKHR;
+
+            // indices of points constructing every edge
+            uint2 edgePoints[3];
+            edgePoints[0] = uint2(0, 1);
+            edgePoints[1] = uint2(1, 2);
+            edgePoints[2] = uint2(2, 0);
+
+            // index of vertex opposing an edge, needed for calculation of triangle heights
+            uint opposingVertexIdx[3];
+            opposingVertexIdx[0] = 2;
+            opposingVertexIdx[1] = 0;
+            opposingVertexIdx[2] = 1;
+        
+            float height = input.getHeight();
+
+            // HEIGHT SHADING
+            const uint32_t heightMapSize = dtm.heightColorEntryCount;
+            float minShadingHeight = dtm.heightColorMapHeights[0];
+            float maxShadingHeight = dtm.heightColorMapHeights[heightMapSize - 1];
+
+            if (heightMapSize > 0)
+            {
+                // partially based on https://www.shadertoy.com/view/XsXSz4 by Inigo Quilez
+                float2 e0 = v[1] - v[0];
+                float2 e1 = v[2] - v[1];
+                float2 e2 = v[0] - v[2];
+            
+                float triangleAreaSign = -sign(e0.x * e2.y - e0.y * e2.x);
+                float2 v0 = input.position.xy - v[0];
+                float2 v1 = input.position.xy - v[1];
+                float2 v2 = input.position.xy - v[2];
+
+                float distanceToLine0 = sqrt(dot2(v0 - e0 * dot(v0, e0) / dot(e0, e0)));
+                float distanceToLine1 = sqrt(dot2(v1 - e1 * dot(v1, e1) / dot(e1, e1)));
+                float distanceToLine2 = sqrt(dot2(v2 - e2 * dot(v2, e2) / dot(e2, e2)));
+
+                float line0Sdf = distanceToLine0 * triangleAreaSign * (v0.x * e0.y - v0.y * e0.x);
+                float line1Sdf = distanceToLine1 * triangleAreaSign * (v1.x * e1.y - v1.y * e1.x);
+                float line2Sdf = distanceToLine2 * triangleAreaSign * (v2.x * e2.y - v2.y * e2.x);
+                float heightDeriv = fwidth(height);
+                float line3Sdf = (minShadingHeight - height) / heightDeriv;
+                float line4Sdf = (height - maxShadingHeight) / heightDeriv;
+
+                float convexPolygonSdf = max(line0Sdf, line1Sdf);
+                convexPolygonSdf = max(convexPolygonSdf, line2Sdf);
+                convexPolygonSdf = max(convexPolygonSdf, line3Sdf);
+                convexPolygonSdf = max(convexPolygonSdf, line4Sdf);
+
+                localAlpha = 1.0f - smoothstep(0.0f, globals.antiAliasingFactor * 2.0f, convexPolygonSdf);
+
+                // calculate height color
+                DTMSettings::E_HEIGHT_SHADING_MODE mode = dtm.determineHeightShadingMode();
+
+                if(mode == DTMSettings::E_HEIGHT_SHADING_MODE::DISCRETE_VARIABLE_LENGTH_INTERVALS)
+                {
+                    DTMSettingsHeightsAccessor dtmHeightsAccessor = { dtm };
+                    uint32_t mapIndexPlus1 = nbl::hlsl::upper_bound(dtmHeightsAccessor, 0, heightMapSize, height);
+                    uint32_t mapIndex = mapIndexPlus1 == 0 ? mapIndexPlus1 : mapIndexPlus1 - 1;
+
+                    float heightDeriv = fwidth(height);
+                    bool blendWithPrev = true
+                        && (mapIndex >= heightMapSize - 1 || (height * 2.0 < dtm.heightColorMapHeights[mapIndexPlus1] + dtm.heightColorMapHeights[mapIndex]));
+                
+                    // logic explainer: if colorIdx is 0.0 then it means blend with next
+                    // if color idx is >= length of the colours array then it means it's also > 0.0 and this blend with prev is true
+                    // if color idx is > 0 and < len - 1, then it depends on the current pixel's height value and two closest height values
+                    if (blendWithPrev)
+                    {
+                        if (mapIndex > 0)
+                        {
+                            float pxDistanceToPrevHeight = (height - dtm.heightColorMapHeights[mapIndex]) / heightDeriv;
+                            float prevColorCoverage = smoothstep(-globals.antiAliasingFactor, globals.antiAliasingFactor, pxDistanceToPrevHeight);
+                            textureColor = lerp(dtm.heightColorMapColors[mapIndex - 1].rgb, dtm.heightColorMapColors[mapIndex].rgb, prevColorCoverage);
+                        }
+                        else
+                        {
+                            textureColor = dtm.heightColorMapColors[mapIndex].rgb;
+                        }
+                    }
+                    else
+                    {
+                        if (mapIndex < heightMapSize - 1)
+                        {
+                            float pxDistanceToNextHeight = (height - dtm.heightColorMapHeights[mapIndexPlus1]) / heightDeriv;
+                            float nextColorCoverage = smoothstep(-globals.antiAliasingFactor, globals.antiAliasingFactor, pxDistanceToNextHeight);
+                            textureColor = lerp(dtm.heightColorMapColors[mapIndex].rgb, dtm.heightColorMapColors[mapIndexPlus1].rgb, nextColorCoverage);
+                        }
+                        else
+                        {
+                            textureColor = dtm.heightColorMapColors[mapIndex].rgb;
+                        }
+                    }
+
+                    //localAlpha = dtm.heightColorMapColors[mapIndex].a;
+                }
+                else
+                {
+                    float heightTmp;
+                    if (mode == DTMSettings::E_HEIGHT_SHADING_MODE::DISCRETE_FIXED_LENGTH_INTERVALS)
+                    {
+                        float interval = dtm.intervalWidth;
+                        int sectionIndex = int((height - minShadingHeight) / interval);
+                        heightTmp = minShadingHeight + float(sectionIndex) * interval;
+                    }
+                    else if (mode == DTMSettings::E_HEIGHT_SHADING_MODE::CONTINOUS_INTERVALS)
+                    {
+                        heightTmp = height;
+                    }
+
+                    DTMSettingsHeightsAccessor dtmHeightsAccessor = { dtm };
+                    uint32_t upperBoundHeightIndex = nbl::hlsl::upper_bound(dtmHeightsAccessor, 0, heightMapSize, height);
+                    uint32_t lowerBoundHeightIndex = upperBoundHeightIndex == 0 ? upperBoundHeightIndex : upperBoundHeightIndex - 1;
+
+                    float upperBoundHeight = dtm.heightColorMapHeights[upperBoundHeightIndex];
+                    float lowerBoundHeight = dtm.heightColorMapHeights[lowerBoundHeightIndex];
+                
+                    float4 upperBoundColor = dtm.heightColorMapColors[upperBoundHeightIndex];
+                    float4 lowerBoundColor = dtm.heightColorMapColors[lowerBoundHeightIndex];
+                
+                    float interpolationVal;
+                    if (upperBoundHeightIndex == 0)
+                        interpolationVal = 1.0f;
+                    else
+                        interpolationVal = (heightTmp - lowerBoundHeight) / (upperBoundHeight - lowerBoundHeight);
+                
+                    textureColor = lerp(lowerBoundColor.rgb, upperBoundColor.rgb, interpolationVal);
+                    localAlpha = lerp(lowerBoundColor.a, upperBoundColor.a, interpolationVal);;
+                }
+            }
+            //else // TODO: remove!!
+            //{
+            //    printf("WTF");
+            //    return float4(0.0f, 0.0f, 0.0f, 1.0f);
+            //}
+
+            // CONTOUR
+
+            // TODO: move to ubo or push constants
+            const float startHeight = dtm.contourLinesStartHeight;
+            const float endHeight = dtm.contourLinesEndHeight;
+            const float interval = dtm.contourLinesHeightInterval;
+
+            // TODO: can be precomputed
+            const int maxContourLineIdx = (endHeight - startHeight + 1) / interval;
+
+            // TODO: it actually can output a negative number, fix
+            int contourLineIdx = nbl::hlsl::_static_cast<int>((height - startHeight + (interval * 0.5f)) / interval);
+            contourLineIdx = clamp(contourLineIdx, 0, maxContourLineIdx);
+            float contourLineHeight = startHeight + interval * contourLineIdx;
+
+            int contourLinePointsIdx = 0;
+            float2 contourLinePoints[2];
+            // TODO: case where heights we are looking for are on all three vertices
+            for (int i = 0; i < 3; ++i)
+            {
+                if (contourLinePointsIdx == 3)
+                    break;
+
+                const uint2 currentEdgePoints = edgePoints[i];
+                float3 p0 = v[currentEdgePoints[0]];
+                float3 p1 = v[currentEdgePoints[1]];
+
+                if (p1.z < p0.z)
+                    nbl::hlsl::swap(p0, p1);
+
+                float minHeight = p0.z;
+                float maxHeight = p1.z;
+
+                if (height >= minHeight && height <= maxHeight)
+                {
+                    float2 edge = float2(p1.x, p1.y) - float2(p0.x, p0.y);
+                    float scale = (contourLineHeight - minHeight) / (maxHeight - minHeight);
+
+                    contourLinePoints[contourLinePointsIdx] = scale * edge + float2(p0.x, p0.y);
+                    ++contourLinePointsIdx;
+                }
+            }
+
+            {
+                nbl::hlsl::shapes::Line<float> lineSegment = nbl::hlsl::shapes::Line<float>::construct(contourLinePoints[0], contourLinePoints[1]);
+
+                float distance = nbl::hlsl::numeric_limits<float>::max;
+                if (!contourStyle.hasStipples() || stretch == InvalidStyleStretchValue)
+                {
+                    distance = ClippedSignedDistance< nbl::hlsl::shapes::Line<float> >::sdf(lineSegment, input.position.xy, contourThickness, contourStyle.isRoadStyleFlag);
+                }
+                else
+                {
+                    // TODO:
+                    // It might be beneficial to calculate distance between pixel and contour line to early out some pixels and save yourself from stipple sdf computations!
+                    // where you only compute the complex sdf if abs((height - contourVal) / heightDeriv) <= aaFactor
+                    nbl::hlsl::shapes::Line<float>::ArcLengthCalculator arcLenCalc = nbl::hlsl::shapes::Line<float>::ArcLengthCalculator::construct(lineSegment);
+                    LineStyleClipper clipper = LineStyleClipper::construct(contourStyle, lineSegment, arcLenCalc, phaseShift, stretch, worldToScreenRatio);
+                    distance = ClippedSignedDistance<nbl::hlsl::shapes::Line<float>, LineStyleClipper>::sdf(lineSegment, input.position.xy, contourThickness, contourStyle.isRoadStyleFlag, clipper);
+                }
+
+                float contourLocalAlpha = smoothstep(+globals.antiAliasingFactor, -globals.antiAliasingFactor, distance) * contourStyle.color.a;
+                textureColor = lerp(textureColor, contourStyle.color.rgb, contourLocalAlpha);
+                localAlpha = max(localAlpha, contourLocalAlpha);
+            }
+
+        
+
+            // OUTLINE
+
+            // find sdf of every edge
+            float triangleAreaTimesTwo;
+            {
+                float3 AB = v[0] - v[1];
+                float3 AC = v[0] - v[2];
+                AB.z = 0.0f;
+                AC.z = 0.0f;
+
+                // TODO: figure out if there is a faster solution
+                triangleAreaTimesTwo = length(cross(AB, AC));
+            }
+
+            // calculate sdf of every edge as it wasn't stippled
+            float distances[3];
+            for (int i = 0; i < 3; ++i)
+            {
+                const uint2 currentEdgePoints = edgePoints[i];
+                float3 A = v[currentEdgePoints[0]];
+                float3 B = v[currentEdgePoints[1]];
+                float3 AB = B - A;
+                float ABLen = length(AB);
+
+                distances[i] = (triangleAreaTimesTwo / ABLen) * baryCoord[opposingVertexIdx[i]];
+            }
+
+            float minDistance = nbl::hlsl::numeric_limits<float>::max;
+            if (!outlineStyle.hasStipples() || stretch == InvalidStyleStretchValue)
+            {
+                for (uint i = 0; i < 3; ++i)
+                    distances[i] -= outlineThickness;
+
+                minDistance = min(distances[0], min(distances[1], distances[2]));
+            }
+            else
+            {
+                for (int i = 0; i < 3; ++i)
+                {
+                    if (distances[i] > outlineThickness)
+                        continue;
+
+                    const uint2 currentEdgePoints = edgePoints[i];
+                    float3 p0 = v[currentEdgePoints[0]];
+                    float3 p1 = v[currentEdgePoints[1]];
+
+                    // long story short, in order for stipple patterns to be consistent:
+                    // - point with lesser x coord should be starting point
+                    // - if x coord of both points are equal then point with lesser y value should be starting point
+                    if (p1.x < p0.x)
+                        nbl::hlsl::swap(p0, p1);
+                    else if (p1.x == p0.x && p1.y < p0.y)
+                        nbl::hlsl::swap(p0, p1);
+
+                    nbl::hlsl::shapes::Line<float> lineSegment = nbl::hlsl::shapes::Line<float>::construct(float2(p0.x, p0.y), float2(p1.x, p1.y));
+                
+                    float distance = nbl::hlsl::numeric_limits<float>::max;
+                    nbl::hlsl::shapes::Line<float>::ArcLengthCalculator arcLenCalc = nbl::hlsl::shapes::Line<float>::ArcLengthCalculator::construct(lineSegment);
+                    LineStyleClipper clipper = LineStyleClipper::construct(outlineStyle, lineSegment, arcLenCalc, phaseShift, stretch, worldToScreenRatio);
+                    distance = ClippedSignedDistance<nbl::hlsl::shapes::Line<float>, LineStyleClipper>::sdf(lineSegment, input.position.xy, outlineThickness, outlineStyle.isRoadStyleFlag, clipper);
+
+                    minDistance = min(minDistance, distance);
+                }
+
+            }
+
+            float outlineLocalAlpha = smoothstep(+globals.antiAliasingFactor, -globals.antiAliasingFactor, minDistance) * outlineStyle.color.a;
+            textureColor = lerp(textureColor, outlineStyle.color.rgb, outlineLocalAlpha);
+            localAlpha = max(localAlpha, outlineLocalAlpha);
+        }
+
+        return calculateFinalColor<nbl::hlsl::jit::device_capabilities::fragmentShaderPixelInterlock>(uint2(input.position.xy), localAlpha, currentMainObjectIdx, textureColor, true);
+    }
+    else
+    {
+        // figure out local alpha with sdf
+        if (objType == ObjectType::LINE || objType == ObjectType::QUAD_BEZIER || objType == ObjectType::POLYLINE_CONNECTOR)
+    {
+        float distance = nbl::hlsl::numeric_limits<float>::max;
+        if (objType == ObjectType::LINE)
+        {
+            const float2 start = input.getLineStart();
+            const float2 end = input.getLineEnd();
+            const uint32_t styleIdx = mainObj.styleIdx;
+            const float thickness = input.getLineThickness();
+            const float phaseShift = input.getCurrentPhaseShift();
+            const float stretch = input.getPatternStretch();
+            const float worldToScreenRatio = input.getCurrentWorldToScreenRatio();
+
+            nbl::hlsl::shapes::Line<float> lineSegment = nbl::hlsl::shapes::Line<float>::construct(start, end);
+
+            LineStyle style = loadLineStyle(styleIdx);
+
+            if (!style.hasStipples() || stretch == InvalidStyleStretchValue)
+            {
+                distance = ClippedSignedDistance< nbl::hlsl::shapes::Line<float> >::sdf(lineSegment, input.position.xy, thickness, style.isRoadStyleFlag);
+            }
+            else
+            {
+                nbl::hlsl::shapes::Line<float>::ArcLengthCalculator arcLenCalc = nbl::hlsl::shapes::Line<float>::ArcLengthCalculator::construct(lineSegment);
+                LineStyleClipper clipper = LineStyleClipper::construct(loadLineStyle(styleIdx), lineSegment, arcLenCalc, phaseShift, stretch, worldToScreenRatio);
+                distance = ClippedSignedDistance<nbl::hlsl::shapes::Line<float>, LineStyleClipper>::sdf(lineSegment, input.position.xy, thickness, style.isRoadStyleFlag, clipper);
+            }
+        }
+        else if (objType == ObjectType::QUAD_BEZIER)
+        {
+            nbl::hlsl::shapes::Quadratic<float> quadratic = input.getQuadratic();
+            nbl::hlsl::shapes::Quadratic<float>::ArcLengthCalculator arcLenCalc = input.getQuadraticArcLengthCalculator();
+
+            const uint32_t styleIdx = mainObj.styleIdx;
+            const float thickness = input.getLineThickness();
+            const float phaseShift = input.getCurrentPhaseShift();
+            const float stretch = input.getPatternStretch();
+            const float worldToScreenRatio = input.getCurrentWorldToScreenRatio();
+
+            LineStyle style = loadLineStyle(styleIdx);
+            if (!style.hasStipples() || stretch == InvalidStyleStretchValue)
+            {
+                distance = ClippedSignedDistance< nbl::hlsl::shapes::Quadratic<float> >::sdf(quadratic, input.position.xy, thickness, style.isRoadStyleFlag);
+            }
+            else
+            {
+                BezierStyleClipper clipper = BezierStyleClipper::construct(loadLineStyle(styleIdx), quadratic, arcLenCalc, phaseShift, stretch, worldToScreenRatio);
+                distance = ClippedSignedDistance<nbl::hlsl::shapes::Quadratic<float>, BezierStyleClipper>::sdf(quadratic, input.position.xy, thickness, style.isRoadStyleFlag, clipper);
+            }
+        }
+        else if (objType == ObjectType::POLYLINE_CONNECTOR)
+        {
+            const float2 P = input.position.xy - input.getPolylineConnectorCircleCenter();
+            distance = miterSDF(
+                P,
+                input.getLineThickness(),
+                input.getPolylineConnectorTrapezoidStart(),
+                input.getPolylineConnectorTrapezoidEnd(),
+                input.getPolylineConnectorTrapezoidLongBase(),
+                input.getPolylineConnectorTrapezoidShortBase());
+
+        }
+        localAlpha = smoothstep(+globals.antiAliasingFactor, -globals.antiAliasingFactor, distance);
+    }
+        else if (objType == ObjectType::CURVE_BOX) 
+    {
+        const float minorBBoxUV = input.getMinorBBoxUV();
+        const float majorBBoxUV = input.getMajorBBoxUV();
+
+        nbl::hlsl::math::equations::Quadratic<float> curveMinMinor = input.getCurveMinMinor();
+        nbl::hlsl::math::equations::Quadratic<float> curveMinMajor = input.getCurveMinMajor();
+        nbl::hlsl::math::equations::Quadratic<float> curveMaxMinor = input.getCurveMaxMinor();
+        nbl::hlsl::math::equations::Quadratic<float> curveMaxMajor = input.getCurveMaxMajor();
+
+        //  TODO(Optimization): Can we ignore this majorBBoxUV clamp and rely on the t clamp that happens next? then we can pass `PrecomputedRootFinder`s instead of computing the values per pixel.
+        nbl::hlsl::math::equations::Quadratic<float> minCurveEquation = nbl::hlsl::math::equations::Quadratic<float>::construct(curveMinMajor.a, curveMinMajor.b, curveMinMajor.c - clamp(majorBBoxUV, 0.0, 1.0));
+        nbl::hlsl::math::equations::Quadratic<float> maxCurveEquation = nbl::hlsl::math::equations::Quadratic<float>::construct(curveMaxMajor.a, curveMaxMajor.b, curveMaxMajor.c - clamp(majorBBoxUV, 0.0, 1.0));
+
+        const float minT = clamp(PrecomputedRootFinder<float>::construct(minCurveEquation).computeRoots(), 0.0, 1.0);
+        const float minEv = curveMinMinor.evaluate(minT);
+
+        const float maxT = clamp(PrecomputedRootFinder<float>::construct(maxCurveEquation).computeRoots(), 0.0, 1.0);
+        const float maxEv = curveMaxMinor.evaluate(maxT);
+
+        const bool insideMajor = majorBBoxUV >= 0.0 && majorBBoxUV <= 1.0;
+        const bool insideMinor = minorBBoxUV >= minEv && minorBBoxUV <= maxEv;
+
+        if (insideMinor && insideMajor)
+        {
+            localAlpha = 1.0;
+        }
+        else
+        {
+            // Find the true SDF of a hatch box boundary which is bounded by two curves, It requires knowing the distance from the current UV to the closest point on bounding curves and the limiting lines (in major direction)
+            // We also keep track of distance vector (minor, major) to convert to screenspace distance for anti-aliasing with screenspace aaFactor
+            const float InvalidT = nbl::hlsl::numeric_limits<float32_t>::max;
+            const float MAX_DISTANCE_SQUARED = nbl::hlsl::numeric_limits<float32_t>::max;
+
+            const float2 boxScreenSpaceSize = input.getCurveBoxScreenSpaceSize();
+
+
+            float closestDistanceSquared = MAX_DISTANCE_SQUARED;
+            const float2 pos = float2(minorBBoxUV, majorBBoxUV) * boxScreenSpaceSize;
+
+            if (minorBBoxUV < minEv)
+            {
+                // DO SDF of Min Curve
+                nbl::hlsl::shapes::Quadratic<float> minCurve = nbl::hlsl::shapes::Quadratic<float>::construct(
+                    float2(curveMinMinor.a, curveMinMajor.a) * boxScreenSpaceSize,
+                    float2(curveMinMinor.b, curveMinMajor.b) * boxScreenSpaceSize,
+                    float2(curveMinMinor.c, curveMinMajor.c) * boxScreenSpaceSize);
+
+                nbl::hlsl::shapes::Quadratic<float>::Candidates candidates = minCurve.getClosestCandidates(pos);
+                [[unroll(nbl::hlsl::shapes::Quadratic<float>::MaxCandidates)]]
+                for (uint32_t i = 0; i < nbl::hlsl::shapes::Quadratic<float>::MaxCandidates; i++)
+                {
+                    candidates[i] = clamp(candidates[i], 0.0, 1.0);
+                    const float2 distVector = minCurve.evaluate(candidates[i]) - pos;
+                    const float candidateDistanceSquared = dot(distVector, distVector);
+                    if (candidateDistanceSquared < closestDistanceSquared)
+                        closestDistanceSquared = candidateDistanceSquared;
+                }
+            }
+            else if (minorBBoxUV > maxEv)
+            {
+                // Do SDF of Max Curve
+                nbl::hlsl::shapes::Quadratic<float> maxCurve = nbl::hlsl::shapes::Quadratic<float>::construct(
+                    float2(curveMaxMinor.a, curveMaxMajor.a) * boxScreenSpaceSize,
+                    float2(curveMaxMinor.b, curveMaxMajor.b) * boxScreenSpaceSize,
+                    float2(curveMaxMinor.c, curveMaxMajor.c) * boxScreenSpaceSize);
+                nbl::hlsl::shapes::Quadratic<float>::Candidates candidates = maxCurve.getClosestCandidates(pos);
+                [[unroll(nbl::hlsl::shapes::Quadratic<float>::MaxCandidates)]]
+                for (uint32_t i = 0; i < nbl::hlsl::shapes::Quadratic<float>::MaxCandidates; i++)
+                {
+                    candidates[i] = clamp(candidates[i], 0.0, 1.0);
+                    const float2 distVector = maxCurve.evaluate(candidates[i]) - pos;
+                    const float candidateDistanceSquared = dot(distVector, distVector);
+                    if (candidateDistanceSquared < closestDistanceSquared)
+                        closestDistanceSquared = candidateDistanceSquared;
+                }
+            }
+
+            if (!insideMajor)
+            {
+                const bool minLessThanMax = minEv < maxEv;
+                float2 majorDistVector = float2(MAX_DISTANCE_SQUARED, MAX_DISTANCE_SQUARED);
+                if (majorBBoxUV > 1.0)
+                {
+                    const float2 minCurveEnd = float2(minEv, 1.0) * boxScreenSpaceSize;
+                    if (minLessThanMax)
+                        majorDistVector = sdLineDstVec(pos, minCurveEnd, float2(maxEv, 1.0) * boxScreenSpaceSize);
+                    else
+                        majorDistVector = pos - minCurveEnd;
+                }
+                else
+                {
+                    const float2 minCurveStart = float2(minEv, 0.0) * boxScreenSpaceSize;
+                    if (minLessThanMax)
+                        majorDistVector = sdLineDstVec(pos, minCurveStart, float2(maxEv, 0.0) * boxScreenSpaceSize);
+                    else
+                        majorDistVector = pos - minCurveStart;
+                }
+
+                const float majorDistSq = dot(majorDistVector, majorDistVector);
+                if (majorDistSq < closestDistanceSquared)
+                    closestDistanceSquared = majorDistSq;
+            }
+
+            const float dist = sqrt(closestDistanceSquared);
+            localAlpha = 1.0f - smoothstep(0.0, globals.antiAliasingFactor, dist);
+        }
+
+        LineStyle style = loadLineStyle(mainObj.styleIdx);
+        uint32_t textureId = asuint(style.screenSpaceLineWidth);
+        if (textureId != InvalidTextureIdx)
+        {
+            // For Hatch fiils we sample the first mip as we don't fill the others, because they are constant in screenspace and render as expected
+            // If later on we decided that we can have different sizes here, we should do computations similar to FONT_GLYPH
+            float3 msdfSample = msdfTextures.SampleLevel(msdfSampler, float3(frac(input.position.xy / HatchFillMSDFSceenSpaceSize), float(textureId)), 0.0).xyz;
+            float msdf = nbl::hlsl::text::msdfDistance(msdfSample, MSDFPixelRange * HatchFillMSDFSceenSpaceSize / MSDFSize);
+            localAlpha *= smoothstep(+globals.antiAliasingFactor / 2.0, -globals.antiAliasingFactor / 2.0f, msdf);
+        }
+    }
+        else if (objType == ObjectType::FONT_GLYPH) 
+    {
+        const float2 uv = input.getFontGlyphUV();
+        const uint32_t textureId = input.getFontGlyphTextureId();
+
+        if (textureId != InvalidTextureIdx)
+        {
+            float mipLevel = msdfTextures.CalculateLevelOfDetail(msdfSampler, uv);
+            float3 msdfSample = msdfTextures.SampleLevel(msdfSampler, float3(uv, float(textureId)), mipLevel);
+            float msdf = nbl::hlsl::text::msdfDistance(msdfSample, input.getFontGlyphPxRange());
+            /*
+                explaining "*= exp2(max(mipLevel,0.0))"
+                Each mip level has constant MSDFPixelRange
+                Which essentially makes the msdfSamples here (Harware Sampled) have different scales per mip
+                As we go up 1 mip level, the msdf distance should be multiplied by 2.0
+                While this makes total sense for NEAREST mip sampling when mipLevel is an integer and only one mip is being sampled.
+                It's a bit complex when it comes to trilinear filtering (LINEAR mip sampling), but it works in practice!
+                
+                Alternatively you can think of it as doing this instead:
+                localAlpha = smoothstep(+globals.antiAliasingFactor / exp2(max(mipLevel,0.0)), 0.0, msdf);
+                Which is reducing the aa feathering as we go up the mip levels. 
+                to avoid aa feathering of the MAX_MSDF_DISTANCE_VALUE to be less than aa factor and eventually color it and cause greyed out area around the main glyph
+            */
+            msdf *= exp2(max(mipLevel,0.0));
+            
+            LineStyle style = loadLineStyle(mainObj.styleIdx);
+            const float screenPxRange = input.getFontGlyphPxRange() / MSDFPixelRangeHalf;
+            const float bolden = style.worldSpaceLineWidth * screenPxRange; // worldSpaceLineWidth is actually boldenInPixels, aliased TextStyle with LineStyle
+            localAlpha = smoothstep(+globals.antiAliasingFactor / 2.0f + bolden, -globals.antiAliasingFactor / 2.0f + bolden, msdf);
+        }
+    }
+        else if (objType == ObjectType::IMAGE) 
+    {
+        const float2 uv = input.getImageUV();
+        const uint32_t textureId = input.getImageTextureId();
+
+        if (textureId != InvalidTextureIdx)
+        {
+            float4 colorSample = textures[NonUniformResourceIndex(textureId)].Sample(textureSampler, float2(uv.x, uv.y));
+            textureColor = colorSample.rgb;
+            localAlpha = colorSample.a;
+        }
+    }
+
+        uint2 fragCoord = uint2(input.position.xy);
+        
+        if (localAlpha <= 0)
+            discard;
+        
+        const bool colorFromTexture = objType == ObjectType::IMAGE;
+        
+        // TODO[Przemek]: But make sure you're still calling this, correctly calculating alpha and texture color.
+        // you can add 1 main object and push via DrawResourcesFiller like we already do for other objects (this go in the mainObjects StorageBuffer) and then set the currentMainObjectIdx to 0 here
+        // having 1 main object temporarily means that all triangle meshes will be treated as a unified object in blending operations. 
+        return calculateFinalColor<nbl::hlsl::jit::device_capabilities::fragmentShaderPixelInterlock>(fragCoord, localAlpha, currentMainObjectIdx, textureColor, colorFromTexture);
+    }
+}
diff --git a/62_CAD/shaders/main_pipeline/vertex_shader.hlsl b/62_CAD/shaders/main_pipeline/vertex_shader.hlsl
index 4a955d92d..b300a6958 100644
--- a/62_CAD/shaders/main_pipeline/vertex_shader.hlsl
+++ b/62_CAD/shaders/main_pipeline/vertex_shader.hlsl
@@ -7,8 +7,6 @@
 #include <nbl/builtin/hlsl/algorithm.hlsl>
 #include <nbl/builtin/hlsl/jit/device_capabilities.hlsl>
 
-[[vk::push_constant]] PushConstants pc;
-
 // TODO[Lucas]: Move these functions to builtin hlsl functions (Even the shadertoy obb and aabb ones)
 float cross2D(float2 a, float2 b)
 {
@@ -85,73 +83,9 @@ PSInput main(uint vertexID : SV_VertexID)
     // ~~Later, most likely We will require pulling all 3 vertices of the triangle, that's where you need to know which triangle you're currently on, and instead of objectID = vertexID/4 which we currently do, you will do vertexID/3 and pull all 3 of it's vertices.~~
     // Ok, brainfart, a vertex can belong to multiple triangles, I was thinking of AA but triangles share vertices, nevermind my comment above.
     
-#define DTM
-#ifdef DTM
-    PSInput outV;
 
-    // Default Initialize PS Input
-    outV.position.zw = float2(0.0, 1.0);
-    outV.data1 = uint4(0, 0, 0, 0);
-    outV.data2 = float4(0, 0, 0, 0);
-    outV.data3 = float4(0, 0, 0, 0);
-    outV.data4 = float4(0, 0, 0, 0);
-    outV.interp_data5 = float2(0, 0);
-    outV.setObjType(ObjectType::TRIANGLE_MESH);
-    outV.setMainObjectIdx(pc.triangleMeshMainObjectIndex);
-
-    TriangleMeshVertex vtx = vk::RawBufferLoad<TriangleMeshVertex>(pc.triangleMeshVerticesBaseAddress + sizeof(TriangleMeshVertex) * vertexID, 8u);
-    pfloat64_t2 vtxPos;
-    vtxPos.x = _static_cast<pfloat64_t>(vtx.pos.x);
-    vtxPos.y = _static_cast<pfloat64_t>(vtx.pos.y);
-
-    MainObject mainObj = loadMainObject(pc.triangleMeshMainObjectIndex);
-    ClipProjectionData clipProjectionData = getClipProjectionData(mainObj);
-
-    float2 transformedPos = transformPointScreenSpace(clipProjectionData.projectionToNDC, globals.resolution, vtxPos);
-
-    outV.position.xy = transformedPos;
-    outV.position = transformFromSreenSpaceToNdc(outV.position.xy, globals.resolution);
-    const float heightAsFloat = nbl::hlsl::_static_cast<float>(vtx.height);
-    outV.setHeight(heightAsFloat);
-    outV.setScreenSpaceVertexAttribs(float3(transformedPos, heightAsFloat));
-    outV.setCurrentWorldToScreenRatio(
-        _static_cast<float>((_static_cast<pfloat64_t>(2.0f) /
-            (clipProjectionData.projectionToNDC[0].x * _static_cast<pfloat64_t>(globals.resolution.x))))
-    );
-
-    // TODO: line style of contour line has to be set too!
-    DTMSettings dtm = loadDTMSettings(mainObj.dtmSettingsIdx);
-    LineStyle outlineStyle = loadLineStyle(dtm.outlineLineStyleIdx);
-    LineStyle contourStyle = loadLineStyle(dtm.contourLineStyleIdx);
-    const float screenSpaceOutlineWidth = outlineStyle.screenSpaceLineWidth + _static_cast<float>(_static_cast<pfloat64_t>(outlineStyle.worldSpaceLineWidth) * globals.screenToWorldRatio);
-    const float sdfOutlineThickness = screenSpaceOutlineWidth * 0.5f;
-    const float screenSpaceContourLineWidth = contourStyle.screenSpaceLineWidth + _static_cast<float>(_static_cast<pfloat64_t>(contourStyle.worldSpaceLineWidth) * globals.screenToWorldRatio);
-    const float sdfContourLineThickness = screenSpaceContourLineWidth * 0.5f;
-    outV.setOutlineThickness(sdfOutlineThickness);
-    outV.setContourLineThickness(sdfContourLineThickness);
-
-    // full screen triangle (this will destroy outline, contour line and height drawing)
-#if 0
-    const uint vertexIdx = vertexID % 3;
-    if(vertexIdx == 0)
-        outV.position.xy = float2(-1.0f, -1.0f);
-    else if (vertexIdx == 1)
-        outV.position.xy = float2(-1.0f, 3.0f);
-    else if (vertexIdx == 2)
-        outV.position.xy = float2(3.0f, -1.0f);
-#endif
+    ClipProjectionData clipProjectionData;
     
-    return outV;
-
-#else
-
-    const uint vertexIdx = vertexID & 0x3u;
-    const uint objectID = vertexID >> 2;
-
-    DrawObject drawObj = loadDrawObject(objectID);
-
-    ObjectType objType = (ObjectType)(drawObj.type_subsectionIdx & 0x0000FFFF);
-    uint32_t subsectionIdx = drawObj.type_subsectionIdx >> 16;
     PSInput outV;
 
     // Default Initialize PS Input
@@ -161,487 +95,542 @@ PSInput main(uint vertexID : SV_VertexID)
     outV.data3 = float4(0, 0, 0, 0);
     outV.data4 = float4(0, 0, 0, 0);
     outV.interp_data5 = float2(0, 0);
-    outV.setObjType(objType);
-    outV.setMainObjectIdx(drawObj.mainObjIndex);
     
-
-    MainObject mainObj = loadMainObject(drawObj.mainObjIndex);
-    ClipProjectionData clipProjectionData = getClipProjectionData(mainObj);
-    
-    // We only need these for Outline type objects like lines and bezier curves
-    if (objType == ObjectType::LINE || objType == ObjectType::QUAD_BEZIER || objType == ObjectType::POLYLINE_CONNECTOR)
+    if (pc.isDTMRendering)
     {
-        LineStyle lineStyle = loadLineStyle(mainObj.styleIdx);
+        outV.setObjType(ObjectType::TRIANGLE_MESH);
+        outV.setMainObjectIdx(pc.triangleMeshMainObjectIndex);
+    
+        TriangleMeshVertex vtx = vk::RawBufferLoad<TriangleMeshVertex>(pc.triangleMeshVerticesBaseAddress + sizeof(TriangleMeshVertex) * vertexID, 8u);
+        pfloat64_t2 vtxPos;
+        vtxPos.x = _static_cast<pfloat64_t>(vtx.pos.x);
+        vtxPos.y = _static_cast<pfloat64_t>(vtx.pos.y);
+
+        MainObject mainObj = loadMainObject(pc.triangleMeshMainObjectIndex);
+        clipProjectionData = getClipProjectionData(mainObj);
 
-        // Width is on both sides, thickness is one one side of the curve (div by 2.0f)
-        const float screenSpaceLineWidth = lineStyle.screenSpaceLineWidth + _static_cast<float>(_static_cast<pfloat64_t>(lineStyle.worldSpaceLineWidth) * globals.screenToWorldRatio);
-        const float antiAliasedLineThickness = screenSpaceLineWidth * 0.5f + globals.antiAliasingFactor;
-        const float sdfLineThickness = screenSpaceLineWidth / 2.0f;
-        outV.setLineThickness(sdfLineThickness);
+        float2 transformedPos = transformPointScreenSpace(clipProjectionData.projectionToNDC, globals.resolution, vtxPos);
+
+        outV.position.xy = transformedPos;
+        outV.position = transformFromSreenSpaceToNdc(outV.position.xy, globals.resolution);
+        const float heightAsFloat = nbl::hlsl::_static_cast<float>(vtx.height);
+        outV.setHeight(heightAsFloat);
+        outV.setScreenSpaceVertexAttribs(float3(transformedPos, heightAsFloat));
         outV.setCurrentWorldToScreenRatio(
             _static_cast<float>((_static_cast<pfloat64_t>(2.0f) /
-            (clipProjectionData.projectionToNDC[0].x * _static_cast<pfloat64_t>(globals.resolution.x))))
+                (clipProjectionData.projectionToNDC[0].x * _static_cast<pfloat64_t>(globals.resolution.x))))
         );
 
-        if (objType == ObjectType::LINE)
-        {
-            pfloat64_t2 points[2u];
-            points[0u] = vk::RawBufferLoad<pfloat64_t2>(globals.pointers.geometryBuffer + drawObj.geometryAddress, 8u);
-            points[1u] = vk::RawBufferLoad<pfloat64_t2>(globals.pointers.geometryBuffer + drawObj.geometryAddress + sizeof(LinePointInfo), 8u);
-
-            const float phaseShift = vk::RawBufferLoad<float>(globals.pointers.geometryBuffer + drawObj.geometryAddress + sizeof(pfloat64_t2), 8u);
-            const float patternStretch = vk::RawBufferLoad<float>(globals.pointers.geometryBuffer + drawObj.geometryAddress + sizeof(pfloat64_t2) + sizeof(float), 8u);
-            outV.setCurrentPhaseShift(phaseShift);
-            outV.setPatternStretch(patternStretch);
-
-            float2 transformedPoints[2u];
-            for (uint i = 0u; i < 2u; ++i)
-            {
-                transformedPoints[i] = transformPointScreenSpace(clipProjectionData.projectionToNDC, globals.resolution, points[i]);
-            }
+        // TODO: line style of contour line has to be set too!
+        DTMSettings dtm = loadDTMSettings(mainObj.dtmSettingsIdx);
+        LineStyle outlineStyle = loadLineStyle(dtm.outlineLineStyleIdx);
+        LineStyle contourStyle = loadLineStyle(dtm.contourLineStyleIdx);
+        const float screenSpaceOutlineWidth = outlineStyle.screenSpaceLineWidth + _static_cast<float>(_static_cast<pfloat64_t>(outlineStyle.worldSpaceLineWidth) * globals.screenToWorldRatio);
+        const float sdfOutlineThickness = screenSpaceOutlineWidth * 0.5f;
+        const float screenSpaceContourLineWidth = contourStyle.screenSpaceLineWidth + _static_cast<float>(_static_cast<pfloat64_t>(contourStyle.worldSpaceLineWidth) * globals.screenToWorldRatio);
+        const float sdfContourLineThickness = screenSpaceContourLineWidth * 0.5f;
+        outV.setOutlineThickness(sdfOutlineThickness);
+        outV.setContourLineThickness(sdfContourLineThickness);
+
+        // full screen triangle (this will destroy outline, contour line and height drawing)
+#if 0
+        const uint vertexIdx = vertexID % 3;
+        if(vertexIdx == 0)
+            outV.position.xy = float2(-1.0f, -1.0f);
+        else if (vertexIdx == 1)
+            outV.position.xy = float2(-1.0f, 3.0f);
+        else if (vertexIdx == 2)
+            outV.position.xy = float2(3.0f, -1.0f);
+#endif
+    }
+    else
+    {
+        const uint vertexIdx = vertexID & 0x3u;
+        const uint objectID = vertexID >> 2;
 
-            const float2 lineVector = normalize(transformedPoints[1u] - transformedPoints[0u]);
-            const float2 normalToLine = float2(-lineVector.y, lineVector.x);
+        DrawObject drawObj = loadDrawObject(objectID);
 
-            if (vertexIdx == 0u || vertexIdx == 1u)
-            {
-                // work in screen space coordinates because of fixed pixel size
-                outV.position.xy = transformedPoints[0u]
-                    + normalToLine * (((float)vertexIdx - 0.5f) * 2.0f * antiAliasedLineThickness)
-                    - lineVector * antiAliasedLineThickness;
-            }
-            else // if (vertexIdx == 2u || vertexIdx == 3u)
-            {
-                // work in screen space coordinates because of fixed pixel size
-                outV.position.xy = transformedPoints[1u]
-                    + normalToLine * (((float)vertexIdx - 2.5f) * 2.0f * antiAliasedLineThickness)
-                    + lineVector * antiAliasedLineThickness;
-            }
-
-            outV.setLineStart(transformedPoints[0u]);
-            outV.setLineEnd(transformedPoints[1u]);
+        ObjectType objType = (ObjectType)(drawObj.type_subsectionIdx & 0x0000FFFF);
+        uint32_t subsectionIdx = drawObj.type_subsectionIdx >> 16;
+        outV.setObjType(objType);
+        outV.setMainObjectIdx(drawObj.mainObjIndex);
+    
 
-            outV.position.xy = transformFromSreenSpaceToNdc(outV.position.xy, globals.resolution).xy;
-        }
-        else if (objType == ObjectType::QUAD_BEZIER)
+        MainObject mainObj = loadMainObject(drawObj.mainObjIndex);
+        clipProjectionData = getClipProjectionData(mainObj);
+    
+        // We only need these for Outline type objects like lines and bezier curves
+        if (objType == ObjectType::LINE || objType == ObjectType::QUAD_BEZIER || objType == ObjectType::POLYLINE_CONNECTOR)
         {
-            pfloat64_t2 points[3u];
-            points[0u] = vk::RawBufferLoad<pfloat64_t2>(globals.pointers.geometryBuffer + drawObj.geometryAddress, 8u);
-            points[1u] = vk::RawBufferLoad<pfloat64_t2>(globals.pointers.geometryBuffer + drawObj.geometryAddress + sizeof(pfloat64_t2), 8u);
-            points[2u] = vk::RawBufferLoad<pfloat64_t2>(globals.pointers.geometryBuffer + drawObj.geometryAddress + sizeof(pfloat64_t2) * 2u, 8u);
-
-            const float phaseShift = vk::RawBufferLoad<float>(globals.pointers.geometryBuffer + drawObj.geometryAddress + sizeof(pfloat64_t2) * 3u, 8u);
-            const float patternStretch = vk::RawBufferLoad<float>(globals.pointers.geometryBuffer + drawObj.geometryAddress + sizeof(pfloat64_t2) * 3u + sizeof(float), 8u);
-            outV.setCurrentPhaseShift(phaseShift);
-            outV.setPatternStretch(patternStretch);
-
-            // transform these points into screen space and pass to fragment
-            float2 transformedPoints[3u];
-            for (uint i = 0u; i < 3u; ++i)
+            LineStyle lineStyle = loadLineStyle(mainObj.styleIdx);
+
+            // Width is on both sides, thickness is one one side of the curve (div by 2.0f)
+            const float screenSpaceLineWidth = lineStyle.screenSpaceLineWidth + _static_cast<float>(_static_cast<pfloat64_t>(lineStyle.worldSpaceLineWidth) * globals.screenToWorldRatio);
+            const float antiAliasedLineThickness = screenSpaceLineWidth * 0.5f + globals.antiAliasingFactor;
+            const float sdfLineThickness = screenSpaceLineWidth / 2.0f;
+            outV.setLineThickness(sdfLineThickness);
+            outV.setCurrentWorldToScreenRatio(
+                _static_cast<float>((_static_cast<pfloat64_t>(2.0f) /
+                (clipProjectionData.projectionToNDC[0].x * _static_cast<pfloat64_t>(globals.resolution.x))))
+            );
+
+            if (objType == ObjectType::LINE)
             {
-                transformedPoints[i] = transformPointScreenSpace(clipProjectionData.projectionToNDC, globals.resolution, points[i]);
-            }
+                pfloat64_t2 points[2u];
+                points[0u] = vk::RawBufferLoad<pfloat64_t2>(globals.pointers.geometryBuffer + drawObj.geometryAddress, 8u);
+                points[1u] = vk::RawBufferLoad<pfloat64_t2>(globals.pointers.geometryBuffer + drawObj.geometryAddress + sizeof(LinePointInfo), 8u);
 
-            shapes::QuadraticBezier<float> quadraticBezier = shapes::QuadraticBezier<float>::construct(transformedPoints[0u], transformedPoints[1u], transformedPoints[2u]);
-            shapes::Quadratic<float> quadratic = shapes::Quadratic<float>::constructFromBezier(quadraticBezier);
-            shapes::Quadratic<float>::ArcLengthCalculator preCompData = shapes::Quadratic<float>::ArcLengthCalculator::construct(quadratic);
-
-            outV.setQuadratic(quadratic);
-            outV.setQuadraticPrecomputedArcLenData(preCompData);
-
-            float2 Mid = (transformedPoints[0u] + transformedPoints[2u]) / 2.0f;
-            float Radius = length(Mid - transformedPoints[0u]) / 2.0f;
-
-            // https://algorithmist.wordpress.com/2010/12/01/quad-bezier-curvature/
-            float2 vectorAB = transformedPoints[1u] - transformedPoints[0u];
-            float2 vectorAC = transformedPoints[2u] - transformedPoints[1u];
-            float area = abs(vectorAB.x * vectorAC.y - vectorAB.y * vectorAC.x) * 0.5;
-            float MaxCurvature;
-            if (length(transformedPoints[1u] - lerp(transformedPoints[0u], transformedPoints[2u], 0.25f)) > Radius && length(transformedPoints[1u] - lerp(transformedPoints[0u], transformedPoints[2u], 0.75f)) > Radius)
-                MaxCurvature = pow(length(transformedPoints[1u] - Mid), 3) / (area * area);
-            else
-                MaxCurvature = max(area / pow(length(transformedPoints[0u] - transformedPoints[1u]), 3), area / pow(length(transformedPoints[2u] - transformedPoints[1u]), 3));
-
-            // We only do this adaptive thing when "MinRadiusOfOsculatingCircle = RadiusOfMaxCurvature < screenSpaceLineWidth/4" OR "MaxCurvature > 4/screenSpaceLineWidth";
-            //  which means there is a self intersection because of large lineWidth relative to the curvature (in screenspace)
-            //  the reason for division by 4.0f is 1. screenSpaceLineWidth is expanded on both sides and 2. the fact that diameter/2=radius, 
-            const bool noCurvature = abs(dot(normalize(vectorAB), normalize(vectorAC)) - 1.0f) < exp2(-10.0f);
-            if (MaxCurvature * screenSpaceLineWidth > 4.0f || noCurvature)
-            {
-                //OBB Fallback
-                float2 obbV0;
-                float2 obbV1;
-                float2 obbV2;
-                float2 obbV3;
-                quadraticBezier.computeOBB(antiAliasedLineThickness, obbV0, obbV1, obbV2, obbV3);
-                if (subsectionIdx == 0)
+                const float phaseShift = vk::RawBufferLoad<float>(globals.pointers.geometryBuffer + drawObj.geometryAddress + sizeof(pfloat64_t2), 8u);
+                const float patternStretch = vk::RawBufferLoad<float>(globals.pointers.geometryBuffer + drawObj.geometryAddress + sizeof(pfloat64_t2) + sizeof(float), 8u);
+                outV.setCurrentPhaseShift(phaseShift);
+                outV.setPatternStretch(patternStretch);
+
+                float2 transformedPoints[2u];
+                for (uint i = 0u; i < 2u; ++i)
                 {
-                    if (vertexIdx == 0u)
-                        outV.position = float4(obbV0, 0.0, 1.0f);
-                    else if (vertexIdx == 1u)
-                        outV.position = float4(obbV1, 0.0, 1.0f);
-                    else if (vertexIdx == 2u)
-                        outV.position = float4(obbV3, 0.0, 1.0f);
-                    else if (vertexIdx == 3u)
-                        outV.position = float4(obbV2, 0.0, 1.0f);
+                    transformedPoints[i] = transformPointScreenSpace(clipProjectionData.projectionToNDC, globals.resolution, points[i]);
                 }
-                else
-                    outV.position = float4(0.0f, 0.0f, 0.0f, 0.0f);
-            }
-            else
-            {
-                // this optimal value is hardcoded based on tests and benchmarks of pixel shader invocation
-                // this is the place where we use it's tangent in the bezier to form sides the cages
-                const float optimalT = 0.145f;
-
-                // Whether or not to flip the the interior cage nodes
-                int flip = cross2D(transformedPoints[0u] - transformedPoints[1u], transformedPoints[2u] - transformedPoints[1u]) > 0.0f ? -1 : 1;
 
-                const float middleT = 0.5f;
-                float2 midPos = QuadraticBezier(transformedPoints[0u], transformedPoints[1u], transformedPoints[2u], middleT);
-                float2 midTangent = normalize(BezierTangent(transformedPoints[0u], transformedPoints[1u], transformedPoints[2u], middleT));
-                float2 midNormal = float2(-midTangent.y, midTangent.x) * flip;
-
-                /*
-                            P1
-                            +
+                const float2 lineVector = normalize(transformedPoints[1u] - transformedPoints[0u]);
+                const float2 normalToLine = float2(-lineVector.y, lineVector.x);
 
+                if (vertexIdx == 0u || vertexIdx == 1u)
+                {
+                    // work in screen space coordinates because of fixed pixel size
+                    outV.position.xy = transformedPoints[0u]
+                        + normalToLine * (((float)vertexIdx - 0.5f) * 2.0f * antiAliasedLineThickness)
+                        - lineVector * antiAliasedLineThickness;
+                }
+                else // if (vertexIdx == 2u || vertexIdx == 3u)
+                {
+                    // work in screen space coordinates because of fixed pixel size
+                    outV.position.xy = transformedPoints[1u]
+                        + normalToLine * (((float)vertexIdx - 2.5f) * 2.0f * antiAliasedLineThickness)
+                        + lineVector * antiAliasedLineThickness;
+                }
 
-               exterior0              exterior1
-                  ----------------------
-                 /                      \-
-               -/    ----------------     \
-              /    -/interior0     interior1
-             /    /                    \    \-
-           -/   -/                      \-    \
-          /   -/                          \    \-
-         /   /                             \-    \
-     P0 +                                    \    + P2
-                */
+                outV.setLineStart(transformedPoints[0u]);
+                outV.setLineEnd(transformedPoints[1u]);
 
-                // Internal cage points
-                float2 interior0;
-                float2 interior1;
+                outV.position.xy = transformFromSreenSpaceToNdc(outV.position.xy, globals.resolution).xy;
+            }
+            else if (objType == ObjectType::QUAD_BEZIER)
+            {
+                pfloat64_t2 points[3u];
+                points[0u] = vk::RawBufferLoad<pfloat64_t2>(globals.pointers.geometryBuffer + drawObj.geometryAddress, 8u);
+                points[1u] = vk::RawBufferLoad<pfloat64_t2>(globals.pointers.geometryBuffer + drawObj.geometryAddress + sizeof(pfloat64_t2), 8u);
+                points[2u] = vk::RawBufferLoad<pfloat64_t2>(globals.pointers.geometryBuffer + drawObj.geometryAddress + sizeof(pfloat64_t2) * 2u, 8u);
+
+                const float phaseShift = vk::RawBufferLoad<float>(globals.pointers.geometryBuffer + drawObj.geometryAddress + sizeof(pfloat64_t2) * 3u, 8u);
+                const float patternStretch = vk::RawBufferLoad<float>(globals.pointers.geometryBuffer + drawObj.geometryAddress + sizeof(pfloat64_t2) * 3u + sizeof(float), 8u);
+                outV.setCurrentPhaseShift(phaseShift);
+                outV.setPatternStretch(patternStretch);
+
+                // transform these points into screen space and pass to fragment
+                float2 transformedPoints[3u];
+                for (uint i = 0u; i < 3u; ++i)
+                {
+                    transformedPoints[i] = transformPointScreenSpace(clipProjectionData.projectionToNDC, globals.resolution, points[i]);
+                }
 
-                float2 middleExteriorPoint = midPos - midNormal * antiAliasedLineThickness;
+                shapes::QuadraticBezier<float> quadraticBezier = shapes::QuadraticBezier<float>::construct(transformedPoints[0u], transformedPoints[1u], transformedPoints[2u]);
+                shapes::Quadratic<float> quadratic = shapes::Quadratic<float>::constructFromBezier(quadraticBezier);
+                shapes::Quadratic<float>::ArcLengthCalculator preCompData = shapes::Quadratic<float>::ArcLengthCalculator::construct(quadratic);
 
+                outV.setQuadratic(quadratic);
+                outV.setQuadraticPrecomputedArcLenData(preCompData);
 
-                float2 leftTangent = normalize(BezierTangent(transformedPoints[0u], transformedPoints[1u], transformedPoints[2u], optimalT));
-                float2 leftNormal = normalize(float2(-leftTangent.y, leftTangent.x)) * flip;
-                float2 leftExteriorPoint = QuadraticBezier(transformedPoints[0u], transformedPoints[1u], transformedPoints[2u], optimalT) - leftNormal * antiAliasedLineThickness;
-                float2 exterior0 = shapes::util::LineLineIntersection<float>(middleExteriorPoint, midTangent, leftExteriorPoint, leftTangent);
+                float2 Mid = (transformedPoints[0u] + transformedPoints[2u]) / 2.0f;
+                float Radius = length(Mid - transformedPoints[0u]) / 2.0f;
 
-                float2 rightTangent = normalize(BezierTangent(transformedPoints[0u], transformedPoints[1u], transformedPoints[2u], 1.0f - optimalT));
-                float2 rightNormal = normalize(float2(-rightTangent.y, rightTangent.x)) * flip;
-                float2 rightExteriorPoint = QuadraticBezier(transformedPoints[0u], transformedPoints[1u], transformedPoints[2u], 1.0f - optimalT) - rightNormal * antiAliasedLineThickness;
-                float2 exterior1 = shapes::util::LineLineIntersection<float>(middleExteriorPoint, midTangent, rightExteriorPoint, rightTangent);
+                // https://algorithmist.wordpress.com/2010/12/01/quad-bezier-curvature/
+                float2 vectorAB = transformedPoints[1u] - transformedPoints[0u];
+                float2 vectorAC = transformedPoints[2u] - transformedPoints[1u];
+                float area = abs(vectorAB.x * vectorAC.y - vectorAB.y * vectorAC.x) * 0.5;
+                float MaxCurvature;
+                if (length(transformedPoints[1u] - lerp(transformedPoints[0u], transformedPoints[2u], 0.25f)) > Radius && length(transformedPoints[1u] - lerp(transformedPoints[0u], transformedPoints[2u], 0.75f)) > Radius)
+                    MaxCurvature = pow(length(transformedPoints[1u] - Mid), 3) / (area * area);
+                else
+                    MaxCurvature = max(area / pow(length(transformedPoints[0u] - transformedPoints[1u]), 3), area / pow(length(transformedPoints[2u] - transformedPoints[1u]), 3));
 
-                // Interiors
+                // We only do this adaptive thing when "MinRadiusOfOsculatingCircle = RadiusOfMaxCurvature < screenSpaceLineWidth/4" OR "MaxCurvature > 4/screenSpaceLineWidth";
+                //  which means there is a self intersection because of large lineWidth relative to the curvature (in screenspace)
+                //  the reason for division by 4.0f is 1. screenSpaceLineWidth is expanded on both sides and 2. the fact that diameter/2=radius, 
+                const bool noCurvature = abs(dot(normalize(vectorAB), normalize(vectorAC)) - 1.0f) < exp2(-10.0f);
+                if (MaxCurvature * screenSpaceLineWidth > 4.0f || noCurvature)
                 {
-                    float2 tangent = normalize(BezierTangent(transformedPoints[0u], transformedPoints[1u], transformedPoints[2u], 0.286f));
-                    float2 normal = normalize(float2(-tangent.y, tangent.x)) * flip;
-                    interior0 = QuadraticBezier(transformedPoints[0u], transformedPoints[1u], transformedPoints[2u], 0.286) + normal * antiAliasedLineThickness;
+                    //OBB Fallback
+                    float2 obbV0;
+                    float2 obbV1;
+                    float2 obbV2;
+                    float2 obbV3;
+                    quadraticBezier.computeOBB(antiAliasedLineThickness, obbV0, obbV1, obbV2, obbV3);
+                    if (subsectionIdx == 0)
+                    {
+                        if (vertexIdx == 0u)
+                            outV.position = float4(obbV0, 0.0, 1.0f);
+                        else if (vertexIdx == 1u)
+                            outV.position = float4(obbV1, 0.0, 1.0f);
+                        else if (vertexIdx == 2u)
+                            outV.position = float4(obbV3, 0.0, 1.0f);
+                        else if (vertexIdx == 3u)
+                            outV.position = float4(obbV2, 0.0, 1.0f);
+                    }
+                    else
+                        outV.position = float4(0.0f, 0.0f, 0.0f, 0.0f);
                 }
+                else
                 {
-                    float2 tangent = normalize(BezierTangent(transformedPoints[0u], transformedPoints[1u], transformedPoints[2u], 0.714f));
-                    float2 normal = normalize(float2(-tangent.y, tangent.x)) * flip;
-                    interior1 = QuadraticBezier(transformedPoints[0u], transformedPoints[1u], transformedPoints[2u], 0.714f) + normal * antiAliasedLineThickness;
+                    // this optimal value is hardcoded based on tests and benchmarks of pixel shader invocation
+                    // this is the place where we use it's tangent in the bezier to form sides the cages
+                    const float optimalT = 0.145f;
+
+                    // Whether or not to flip the the interior cage nodes
+                    int flip = cross2D(transformedPoints[0u] - transformedPoints[1u], transformedPoints[2u] - transformedPoints[1u]) > 0.0f ? -1 : 1;
+
+                    const float middleT = 0.5f;
+                    float2 midPos = QuadraticBezier(transformedPoints[0u], transformedPoints[1u], transformedPoints[2u], middleT);
+                    float2 midTangent = normalize(BezierTangent(transformedPoints[0u], transformedPoints[1u], transformedPoints[2u], middleT));
+                    float2 midNormal = float2(-midTangent.y, midTangent.x) * flip;
+
+                    /*
+                                P1
+                                +
+
+
+                   exterior0              exterior1
+                      ----------------------
+                     /                      \-
+                   -/    ----------------     \
+                  /    -/interior0     interior1
+                 /    /                    \    \-
+               -/   -/                      \-    \
+              /   -/                          \    \-
+             /   /                             \-    \
+         P0 +                                    \    + P2
+                    */
+
+                    // Internal cage points
+                    float2 interior0;
+                    float2 interior1;
+
+                    float2 middleExteriorPoint = midPos - midNormal * antiAliasedLineThickness;
+
+
+                    float2 leftTangent = normalize(BezierTangent(transformedPoints[0u], transformedPoints[1u], transformedPoints[2u], optimalT));
+                    float2 leftNormal = normalize(float2(-leftTangent.y, leftTangent.x)) * flip;
+                    float2 leftExteriorPoint = QuadraticBezier(transformedPoints[0u], transformedPoints[1u], transformedPoints[2u], optimalT) - leftNormal * antiAliasedLineThickness;
+                    float2 exterior0 = shapes::util::LineLineIntersection<float>(middleExteriorPoint, midTangent, leftExteriorPoint, leftTangent);
+
+                    float2 rightTangent = normalize(BezierTangent(transformedPoints[0u], transformedPoints[1u], transformedPoints[2u], 1.0f - optimalT));
+                    float2 rightNormal = normalize(float2(-rightTangent.y, rightTangent.x)) * flip;
+                    float2 rightExteriorPoint = QuadraticBezier(transformedPoints[0u], transformedPoints[1u], transformedPoints[2u], 1.0f - optimalT) - rightNormal * antiAliasedLineThickness;
+                    float2 exterior1 = shapes::util::LineLineIntersection<float>(middleExteriorPoint, midTangent, rightExteriorPoint, rightTangent);
+
+                    // Interiors
+                    {
+                        float2 tangent = normalize(BezierTangent(transformedPoints[0u], transformedPoints[1u], transformedPoints[2u], 0.286f));
+                        float2 normal = normalize(float2(-tangent.y, tangent.x)) * flip;
+                        interior0 = QuadraticBezier(transformedPoints[0u], transformedPoints[1u], transformedPoints[2u], 0.286) + normal * antiAliasedLineThickness;
+                    }
+                    {
+                        float2 tangent = normalize(BezierTangent(transformedPoints[0u], transformedPoints[1u], transformedPoints[2u], 0.714f));
+                        float2 normal = normalize(float2(-tangent.y, tangent.x)) * flip;
+                        interior1 = QuadraticBezier(transformedPoints[0u], transformedPoints[1u], transformedPoints[2u], 0.714f) + normal * antiAliasedLineThickness;
+                    }
+
+                    if (subsectionIdx == 0u)
+                    {
+                        float2 endPointTangent = normalize(transformedPoints[1u] - transformedPoints[0u]);
+                        float2 endPointNormal = float2(-endPointTangent.y, endPointTangent.x) * flip;
+                        float2 endPointExterior = transformedPoints[0u] - endPointTangent * antiAliasedLineThickness;
+
+                        if (vertexIdx == 0u)
+                            outV.position = float4(shapes::util::LineLineIntersection<float>(leftExteriorPoint, leftTangent, endPointExterior, endPointNormal), 0.0, 1.0f);
+                        else if (vertexIdx == 1u)
+                            outV.position = float4(transformedPoints[0u] + endPointNormal * antiAliasedLineThickness - endPointTangent * antiAliasedLineThickness, 0.0, 1.0f);
+                        else if (vertexIdx == 2u)
+                            outV.position = float4(exterior0, 0.0, 1.0f);
+                        else if (vertexIdx == 3u)
+                            outV.position = float4(interior0, 0.0, 1.0f);
+                    }
+                    else if (subsectionIdx == 1u)
+                    {
+                        if (vertexIdx == 0u)
+                            outV.position = float4(exterior0, 0.0, 1.0f);
+                        else if (vertexIdx == 1u)
+                            outV.position = float4(interior0, 0.0, 1.0f);
+                        else if (vertexIdx == 2u)
+                            outV.position = float4(exterior1, 0.0, 1.0f);
+                        else if (vertexIdx == 3u)
+                            outV.position = float4(interior1, 0.0, 1.0f);
+                    }
+                    else if (subsectionIdx == 2u)
+                    {
+                        float2 endPointTangent = normalize(transformedPoints[2u] - transformedPoints[1u]);
+                        float2 endPointNormal = float2(-endPointTangent.y, endPointTangent.x) * flip;
+                        float2 endPointExterior = transformedPoints[2u] + endPointTangent * antiAliasedLineThickness;
+
+                        if (vertexIdx == 0u)
+                            outV.position = float4(shapes::util::LineLineIntersection<float>(rightExteriorPoint, rightTangent, endPointExterior, endPointNormal), 0.0, 1.0f);
+                        else if (vertexIdx == 1u)
+                            outV.position = float4(transformedPoints[2u] + endPointNormal * antiAliasedLineThickness + endPointTangent * antiAliasedLineThickness, 0.0, 1.0f);
+                        else if (vertexIdx == 2u)
+                            outV.position = float4(exterior1, 0.0, 1.0f);
+                        else if (vertexIdx == 3u)
+                            outV.position = float4(interior1, 0.0, 1.0f);
+                    }
                 }
 
-                if (subsectionIdx == 0u)
-                {
-                    float2 endPointTangent = normalize(transformedPoints[1u] - transformedPoints[0u]);
-                    float2 endPointNormal = float2(-endPointTangent.y, endPointTangent.x) * flip;
-                    float2 endPointExterior = transformedPoints[0u] - endPointTangent * antiAliasedLineThickness;
+                outV.position.xy = (outV.position.xy / globals.resolution) * 2.0f - 1.0f;
+            }
+            else if (objType == ObjectType::POLYLINE_CONNECTOR)
+            {
+                const float FLOAT_INF = numeric_limits<float>::infinity;
+                const float4 INVALID_VERTEX = float4(FLOAT_INF, FLOAT_INF, FLOAT_INF, FLOAT_INF);
 
-                    if (vertexIdx == 0u)
-                        outV.position = float4(shapes::util::LineLineIntersection<float>(leftExteriorPoint, leftTangent, endPointExterior, endPointNormal), 0.0, 1.0f);
-                    else if (vertexIdx == 1u)
-                        outV.position = float4(transformedPoints[0u] + endPointNormal * antiAliasedLineThickness - endPointTangent * antiAliasedLineThickness, 0.0, 1.0f);
-                    else if (vertexIdx == 2u)
-                        outV.position = float4(exterior0, 0.0, 1.0f);
-                    else if (vertexIdx == 3u)
-                        outV.position = float4(interior0, 0.0, 1.0f);
-                }
-                else if (subsectionIdx == 1u)
-                {
-                    if (vertexIdx == 0u)
-                        outV.position = float4(exterior0, 0.0, 1.0f);
-                    else if (vertexIdx == 1u)
-                        outV.position = float4(interior0, 0.0, 1.0f);
-                    else if (vertexIdx == 2u)
-                        outV.position = float4(exterior1, 0.0, 1.0f);
-                    else if (vertexIdx == 3u)
-                        outV.position = float4(interior1, 0.0, 1.0f);
-                }
-                else if (subsectionIdx == 2u)
+                if (lineStyle.isRoadStyleFlag)
                 {
-                    float2 endPointTangent = normalize(transformedPoints[2u] - transformedPoints[1u]);
-                    float2 endPointNormal = float2(-endPointTangent.y, endPointTangent.x) * flip;
-                    float2 endPointExterior = transformedPoints[2u] + endPointTangent * antiAliasedLineThickness;
+                    const pfloat64_t2 circleCenter = vk::RawBufferLoad<pfloat64_t2>(globals.pointers.geometryBuffer + drawObj.geometryAddress, 8u);
+                    const float2 v = vk::RawBufferLoad<float2>(globals.pointers.geometryBuffer + drawObj.geometryAddress + sizeof(pfloat64_t2), 8u);
+                    const float cosHalfAngleBetweenNormals = vk::RawBufferLoad<float>(globals.pointers.geometryBuffer + drawObj.geometryAddress + sizeof(pfloat64_t2) + sizeof(float2), 8u);
 
-                    if (vertexIdx == 0u)
-                        outV.position = float4(shapes::util::LineLineIntersection<float>(rightExteriorPoint, rightTangent, endPointExterior, endPointNormal), 0.0, 1.0f);
-                    else if (vertexIdx == 1u)
-                        outV.position = float4(transformedPoints[2u] + endPointNormal * antiAliasedLineThickness + endPointTangent * antiAliasedLineThickness, 0.0, 1.0f);
-                    else if (vertexIdx == 2u)
-                        outV.position = float4(exterior1, 0.0, 1.0f);
-                    else if (vertexIdx == 3u)
-                        outV.position = float4(interior1, 0.0, 1.0f);
-                }
-            }
+                    const float2 circleCenterScreenSpace = transformPointScreenSpace(clipProjectionData.projectionToNDC, globals.resolution, circleCenter);
+                    outV.setPolylineConnectorCircleCenter(circleCenterScreenSpace);
 
-            outV.position.xy = (outV.position.xy / globals.resolution) * 2.0f - 1.0f;
-        }
-        else if (objType == ObjectType::POLYLINE_CONNECTOR)
-        {
-            const float FLOAT_INF = numeric_limits<float>::infinity;
-            const float4 INVALID_VERTEX = float4(FLOAT_INF, FLOAT_INF, FLOAT_INF, FLOAT_INF);
+                    // Find other miter vertices
+                    const float sinHalfAngleBetweenNormals = sqrt(1.0f - (cosHalfAngleBetweenNormals * cosHalfAngleBetweenNormals));
+                    const float32_t2x2 rotationMatrix = float32_t2x2(cosHalfAngleBetweenNormals, -sinHalfAngleBetweenNormals, sinHalfAngleBetweenNormals, cosHalfAngleBetweenNormals);
 
-            if (lineStyle.isRoadStyleFlag)
-            {
-                const pfloat64_t2 circleCenter = vk::RawBufferLoad<pfloat64_t2>(globals.pointers.geometryBuffer + drawObj.geometryAddress, 8u);
-                const float2 v = vk::RawBufferLoad<float2>(globals.pointers.geometryBuffer + drawObj.geometryAddress + sizeof(pfloat64_t2), 8u);
-                const float cosHalfAngleBetweenNormals = vk::RawBufferLoad<float>(globals.pointers.geometryBuffer + drawObj.geometryAddress + sizeof(pfloat64_t2) + sizeof(float2), 8u);
+                    // Pass the precomputed trapezoid values for the sdf
+                    {
+                        float vLen = length(v);
+                        float2 intersectionDirection = v / vLen;
 
-                const float2 circleCenterScreenSpace = transformPointScreenSpace(clipProjectionData.projectionToNDC, globals.resolution, circleCenter);
-                outV.setPolylineConnectorCircleCenter(circleCenterScreenSpace);
+                        float longBase = sinHalfAngleBetweenNormals;
+                        float shortBase = max((vLen - globals.miterLimit) * cosHalfAngleBetweenNormals / sinHalfAngleBetweenNormals, 0.0);
+                        // height of the trapezoid / triangle
+                        float hLen = min(globals.miterLimit, vLen);
 
-                // Find other miter vertices
-                const float sinHalfAngleBetweenNormals = sqrt(1.0f - (cosHalfAngleBetweenNormals * cosHalfAngleBetweenNormals));
-                const float32_t2x2 rotationMatrix = float32_t2x2(cosHalfAngleBetweenNormals, -sinHalfAngleBetweenNormals, sinHalfAngleBetweenNormals, cosHalfAngleBetweenNormals);
+                        outV.setPolylineConnectorTrapezoidStart(-1.0 * intersectionDirection * sdfLineThickness);
+                        outV.setPolylineConnectorTrapezoidEnd(intersectionDirection * hLen * sdfLineThickness);
+                        outV.setPolylineConnectorTrapezoidLongBase(sinHalfAngleBetweenNormals * ((1.0 + vLen) / (vLen - cosHalfAngleBetweenNormals)) * sdfLineThickness);
+                        outV.setPolylineConnectorTrapezoidShortBase(shortBase * sdfLineThickness);
+                    }
 
-                // Pass the precomputed trapezoid values for the sdf
-                {
-                    float vLen = length(v);
-                    float2 intersectionDirection = v / vLen;
-
-                    float longBase = sinHalfAngleBetweenNormals;
-                    float shortBase = max((vLen - globals.miterLimit) * cosHalfAngleBetweenNormals / sinHalfAngleBetweenNormals, 0.0);
-                    // height of the trapezoid / triangle
-                    float hLen = min(globals.miterLimit, vLen);
-
-                    outV.setPolylineConnectorTrapezoidStart(-1.0 * intersectionDirection * sdfLineThickness);
-                    outV.setPolylineConnectorTrapezoidEnd(intersectionDirection * hLen * sdfLineThickness);
-                    outV.setPolylineConnectorTrapezoidLongBase(sinHalfAngleBetweenNormals * ((1.0 + vLen) / (vLen - cosHalfAngleBetweenNormals)) * sdfLineThickness);
-                    outV.setPolylineConnectorTrapezoidShortBase(shortBase * sdfLineThickness);
-                }
+                    if (vertexIdx == 0u)
+                    {
+                        const float2 V1 = normalize(mul(v, rotationMatrix)) * antiAliasedLineThickness * 2.0f;
+                        const float2 screenSpaceV1 = circleCenterScreenSpace + V1;
+                        outV.position = float4(screenSpaceV1, 0.0f, 1.0f);   
+                    }
+                    else if (vertexIdx == 1u)
+                    {
+                        outV.position = float4(circleCenterScreenSpace, 0.0f, 1.0f);
+                    }
+                    else if (vertexIdx == 2u)
+                    {
+                        // find intersection point vertex
+                        float2 intersectionPoint = v * antiAliasedLineThickness * 2.0f;
+                        intersectionPoint += circleCenterScreenSpace;
+                        outV.position = float4(intersectionPoint, 0.0f, 1.0f);
+                    }
+                    else if (vertexIdx == 3u)
+                    {
+                        const float2 V2 = normalize(mul(rotationMatrix, v)) * antiAliasedLineThickness * 2.0f;
+                        const float2 screenSpaceV2 = circleCenterScreenSpace + V2;
+                        outV.position = float4(screenSpaceV2, 0.0f, 1.0f);
+                    }
 
-                if (vertexIdx == 0u)
-                {
-                    const float2 V1 = normalize(mul(v, rotationMatrix)) * antiAliasedLineThickness * 2.0f;
-                    const float2 screenSpaceV1 = circleCenterScreenSpace + V1;
-                    outV.position = float4(screenSpaceV1, 0.0f, 1.0f);   
-                }
-                else if (vertexIdx == 1u)
-                {
-                    outV.position = float4(circleCenterScreenSpace, 0.0f, 1.0f);
+                    outV.position.xy = transformFromSreenSpaceToNdc(outV.position.xy, globals.resolution).xy;
                 }
-                else if (vertexIdx == 2u)
-                {
-                    // find intersection point vertex
-                    float2 intersectionPoint = v * antiAliasedLineThickness * 2.0f;
-                    intersectionPoint += circleCenterScreenSpace;
-                    outV.position = float4(intersectionPoint, 0.0f, 1.0f);
-                }
-                else if (vertexIdx == 3u)
+                else
                 {
-                    const float2 V2 = normalize(mul(rotationMatrix, v)) * antiAliasedLineThickness * 2.0f;
-                    const float2 screenSpaceV2 = circleCenterScreenSpace + V2;
-                    outV.position = float4(screenSpaceV2, 0.0f, 1.0f);
+                    outV.position = INVALID_VERTEX;
                 }
-
-                outV.position.xy = transformFromSreenSpaceToNdc(outV.position.xy, globals.resolution).xy;
-            }
-            else
-            {
-                outV.position = INVALID_VERTEX;
             }
         }
-    }
-    else if (objType == ObjectType::CURVE_BOX)
-    {
-        CurveBox curveBox;
-        curveBox.aabbMin = vk::RawBufferLoad<pfloat64_t2>(globals.pointers.geometryBuffer + drawObj.geometryAddress, 8u);
-        curveBox.aabbMax = vk::RawBufferLoad<pfloat64_t2>(globals.pointers.geometryBuffer + drawObj.geometryAddress + sizeof(pfloat64_t2), 8u);
-
-        for (uint32_t i = 0; i < 3; i ++)
+        else if (objType == ObjectType::CURVE_BOX)
         {
-            curveBox.curveMin[i] = vk::RawBufferLoad<float32_t2>(globals.pointers.geometryBuffer + drawObj.geometryAddress + sizeof(pfloat64_t2) * 2 + sizeof(float32_t2) * i, 4u);
-            curveBox.curveMax[i] = vk::RawBufferLoad<float32_t2>(globals.pointers.geometryBuffer + drawObj.geometryAddress + sizeof(pfloat64_t2) * 2 + sizeof(float32_t2) * (3 + i), 4u);
-        }
+            CurveBox curveBox;
+            curveBox.aabbMin = vk::RawBufferLoad<pfloat64_t2>(globals.pointers.geometryBuffer + drawObj.geometryAddress, 8u);
+            curveBox.aabbMax = vk::RawBufferLoad<pfloat64_t2>(globals.pointers.geometryBuffer + drawObj.geometryAddress + sizeof(pfloat64_t2), 8u);
+
+            for (uint32_t i = 0; i < 3; i ++)
+            {
+                curveBox.curveMin[i] = vk::RawBufferLoad<float32_t2>(globals.pointers.geometryBuffer + drawObj.geometryAddress + sizeof(pfloat64_t2) * 2 + sizeof(float32_t2) * i, 4u);
+                curveBox.curveMax[i] = vk::RawBufferLoad<float32_t2>(globals.pointers.geometryBuffer + drawObj.geometryAddress + sizeof(pfloat64_t2) * 2 + sizeof(float32_t2) * (3 + i), 4u);
+            }
 
-        pfloat64_t2 aabbMaxXMinY;
-        aabbMaxXMinY.x = curveBox.aabbMax.x;
-        aabbMaxXMinY.y = curveBox.aabbMin.y;
+            pfloat64_t2 aabbMaxXMinY;
+            aabbMaxXMinY.x = curveBox.aabbMax.x;
+            aabbMaxXMinY.y = curveBox.aabbMin.y;
 
-        pfloat64_t2 aabbMinXMaxY;
-        aabbMinXMaxY.x = curveBox.aabbMin.x;
-        aabbMinXMaxY.y = curveBox.aabbMax.y;
+            pfloat64_t2 aabbMinXMaxY;
+            aabbMinXMaxY.x = curveBox.aabbMin.x;
+            aabbMinXMaxY.y = curveBox.aabbMax.y;
 
-        const float2 ndcAxisU = _static_cast<float2>(transformVectorNdc(clipProjectionData.projectionToNDC, aabbMaxXMinY - curveBox.aabbMin));
-        const float2 ndcAxisV = _static_cast<float2>(transformVectorNdc(clipProjectionData.projectionToNDC, aabbMinXMaxY - curveBox.aabbMin));
+            const float2 ndcAxisU = _static_cast<float2>(transformVectorNdc(clipProjectionData.projectionToNDC, aabbMaxXMinY - curveBox.aabbMin));
+            const float2 ndcAxisV = _static_cast<float2>(transformVectorNdc(clipProjectionData.projectionToNDC, aabbMinXMaxY - curveBox.aabbMin));
 
-        const float2 screenSpaceAabbExtents = float2(length(ndcAxisU * float2(globals.resolution)) / 2.0, length(ndcAxisV * float2(globals.resolution)) / 2.0);
+            const float2 screenSpaceAabbExtents = float2(length(ndcAxisU * float2(globals.resolution)) / 2.0, length(ndcAxisV * float2(globals.resolution)) / 2.0);
 
-        // we could use something like  this to compute screen space change over minor/major change and avoid ddx(minor), ddy(major) in frag shader (the code below doesn't account for rotation)
-        outV.setCurveBoxScreenSpaceSize(float2(screenSpaceAabbExtents));
+            // we could use something like  this to compute screen space change over minor/major change and avoid ddx(minor), ddy(major) in frag shader (the code below doesn't account for rotation)
+            outV.setCurveBoxScreenSpaceSize(float2(screenSpaceAabbExtents));
         
-        const float2 undilatedCorner = float2(bool2(vertexIdx & 0x1u, vertexIdx >> 1));
-        const pfloat64_t2 undilatedCornerF64 = _static_cast<pfloat64_t2>(undilatedCorner);
+            const float2 undilatedCorner = float2(bool2(vertexIdx & 0x1u, vertexIdx >> 1));
+            const pfloat64_t2 undilatedCornerF64 = _static_cast<pfloat64_t2>(undilatedCorner);
 
-        // We don't dilate on AMD (= no fragShaderInterlock)
-        const float pixelsToIncreaseOnEachSide = globals.antiAliasingFactor + 1.0;
-        const float2 dilateRate = pixelsToIncreaseOnEachSide / screenSpaceAabbExtents; // float sufficient to hold the dilate rect? 
-        float2 dilateVec;
-        float2 dilatedUV;
-        dilateHatch<jit::device_capabilities::fragmentShaderPixelInterlock>(dilateVec, dilatedUV, undilatedCorner, dilateRate, ndcAxisU, ndcAxisV);
+            // We don't dilate on AMD (= no fragShaderInterlock)
+            const float pixelsToIncreaseOnEachSide = globals.antiAliasingFactor + 1.0;
+            const float2 dilateRate = pixelsToIncreaseOnEachSide / screenSpaceAabbExtents; // float sufficient to hold the dilate rect? 
+            float2 dilateVec;
+            float2 dilatedUV;
+            dilateHatch<jit::device_capabilities::fragmentShaderPixelInterlock>(dilateVec, dilatedUV, undilatedCorner, dilateRate, ndcAxisU, ndcAxisV);
 
-        // doing interpolation this way to ensure correct endpoints and 0 and 1, we can alternatively use branches to set current corner based on vertexIdx
-        const pfloat64_t2 currentCorner = curveBox.aabbMin * (_static_cast<pfloat64_t2>(float2(1.0f, 1.0f)) - undilatedCornerF64) +
-            curveBox.aabbMax * undilatedCornerF64;
+            // doing interpolation this way to ensure correct endpoints and 0 and 1, we can alternatively use branches to set current corner based on vertexIdx
+            const pfloat64_t2 currentCorner = curveBox.aabbMin * (_static_cast<pfloat64_t2>(float2(1.0f, 1.0f)) - undilatedCornerF64) +
+                curveBox.aabbMax * undilatedCornerF64;
 
-        const float2 coord = _static_cast<float2>(transformPointNdc(clipProjectionData.projectionToNDC, currentCorner) + _static_cast<pfloat64_t2>(dilateVec));
+            const float2 coord = _static_cast<float2>(transformPointNdc(clipProjectionData.projectionToNDC, currentCorner) + _static_cast<pfloat64_t2>(dilateVec));
 
-        outV.position = float4(coord, 0.f, 1.f);
+            outV.position = float4(coord, 0.f, 1.f);
  
-        const uint major = (uint)SelectedMajorAxis;
-        const uint minor = 1-major;
-
-        // A, B & C get converted from unorm to [0, 1]
-        // A & B get converted from [0,1] to [-2, 2]
-        shapes::Quadratic<float> curveMin = shapes::Quadratic<float>::construct(
-            curveBox.curveMin[0], curveBox.curveMin[1], curveBox.curveMin[2]);
-        shapes::Quadratic<float> curveMax = shapes::Quadratic<float>::construct(
-            curveBox.curveMax[0], curveBox.curveMax[1], curveBox.curveMax[2]);
-
-        outV.setMinorBBoxUV(dilatedUV[minor]);
-        outV.setMajorBBoxUV(dilatedUV[major]);
-
-        outV.setCurveMinMinor(math::equations::Quadratic<float>::construct(
-            curveMin.A[minor], 
-            curveMin.B[minor], 
-            curveMin.C[minor]));
-        outV.setCurveMinMajor(math::equations::Quadratic<float>::construct(
-            curveMin.A[major], 
-            curveMin.B[major], 
-            curveMin.C[major]));
-
-        outV.setCurveMaxMinor(math::equations::Quadratic<float>::construct(
-            curveMax.A[minor], 
-            curveMax.B[minor], 
-            curveMax.C[minor]));
-        outV.setCurveMaxMajor(math::equations::Quadratic<float>::construct(
-            curveMax.A[major], 
-            curveMax.B[major], 
-            curveMax.C[major]));
-
-        //math::equations::Quadratic<float> curveMinRootFinding = math::equations::Quadratic<float>::construct(
-        //    curveMin.A[major], 
-        //    curveMin.B[major], 
-        //    curveMin.C[major] - maxCorner[major]);
-        //math::equations::Quadratic<float> curveMaxRootFinding = math::equations::Quadratic<float>::construct(
-        //    curveMax.A[major], 
-        //    curveMax.B[major], 
-        //    curveMax.C[major] - maxCorner[major]);
-        //outV.setMinCurvePrecomputedRootFinders(PrecomputedRootFinder<float>::construct(curveMinRootFinding));
-        //outV.setMaxCurvePrecomputedRootFinders(PrecomputedRootFinder<float>::construct(curveMaxRootFinding));
-    }
-    else if (objType == ObjectType::FONT_GLYPH)
-    {
-        LineStyle lineStyle = loadLineStyle(mainObj.styleIdx);
-        const float italicTiltSlope = lineStyle.screenSpaceLineWidth; // aliased text style member with line style
+            const uint major = (uint)SelectedMajorAxis;
+            const uint minor = 1-major;
+
+            // A, B & C get converted from unorm to [0, 1]
+            // A & B get converted from [0,1] to [-2, 2]
+            shapes::Quadratic<float> curveMin = shapes::Quadratic<float>::construct(
+                curveBox.curveMin[0], curveBox.curveMin[1], curveBox.curveMin[2]);
+            shapes::Quadratic<float> curveMax = shapes::Quadratic<float>::construct(
+                curveBox.curveMax[0], curveBox.curveMax[1], curveBox.curveMax[2]);
+
+            outV.setMinorBBoxUV(dilatedUV[minor]);
+            outV.setMajorBBoxUV(dilatedUV[major]);
+
+            outV.setCurveMinMinor(math::equations::Quadratic<float>::construct(
+                curveMin.A[minor], 
+                curveMin.B[minor], 
+                curveMin.C[minor]));
+            outV.setCurveMinMajor(math::equations::Quadratic<float>::construct(
+                curveMin.A[major], 
+                curveMin.B[major], 
+                curveMin.C[major]));
+
+            outV.setCurveMaxMinor(math::equations::Quadratic<float>::construct(
+                curveMax.A[minor], 
+                curveMax.B[minor], 
+                curveMax.C[minor]));
+            outV.setCurveMaxMajor(math::equations::Quadratic<float>::construct(
+                curveMax.A[major], 
+                curveMax.B[major], 
+                curveMax.C[major]));
+
+            //math::equations::Quadratic<float> curveMinRootFinding = math::equations::Quadratic<float>::construct(
+            //    curveMin.A[major], 
+            //    curveMin.B[major], 
+            //    curveMin.C[major] - maxCorner[major]);
+            //math::equations::Quadratic<float> curveMaxRootFinding = math::equations::Quadratic<float>::construct(
+            //    curveMax.A[major], 
+            //    curveMax.B[major], 
+            //    curveMax.C[major] - maxCorner[major]);
+            //outV.setMinCurvePrecomputedRootFinders(PrecomputedRootFinder<float>::construct(curveMinRootFinding));
+            //outV.setMaxCurvePrecomputedRootFinders(PrecomputedRootFinder<float>::construct(curveMaxRootFinding));
+        }
+        else if (objType == ObjectType::FONT_GLYPH)
+        {
+            LineStyle lineStyle = loadLineStyle(mainObj.styleIdx);
+            const float italicTiltSlope = lineStyle.screenSpaceLineWidth; // aliased text style member with line style
         
-        GlyphInfo glyphInfo;
-        glyphInfo.topLeft = vk::RawBufferLoad<pfloat64_t2>(globals.pointers.geometryBuffer + drawObj.geometryAddress, 8u);
-        glyphInfo.dirU = vk::RawBufferLoad<float32_t2>(globals.pointers.geometryBuffer + drawObj.geometryAddress + sizeof(pfloat64_t2), 4u);
-        glyphInfo.aspectRatio = vk::RawBufferLoad<float32_t>(globals.pointers.geometryBuffer + drawObj.geometryAddress + sizeof(pfloat64_t2) + sizeof(float2), 4u);
-        glyphInfo.minUV_textureID_packed = vk::RawBufferLoad<uint32_t>(globals.pointers.geometryBuffer + drawObj.geometryAddress + sizeof(pfloat64_t2) + sizeof(float2) + sizeof(float), 4u);
-
-        float32_t2 minUV = glyphInfo.getMinUV();
-        uint16_t textureID = glyphInfo.getTextureID();
-
-        const float32_t2 dirV = float32_t2(glyphInfo.dirU.y, -glyphInfo.dirU.x) * glyphInfo.aspectRatio;
-        const float2 screenTopLeft = _static_cast<float2>(transformPointNdc(clipProjectionData.projectionToNDC, glyphInfo.topLeft));
-        const float2 screenDirU = _static_cast<float2>(transformVectorNdc(clipProjectionData.projectionToNDC, _static_cast<pfloat64_t2>(glyphInfo.dirU)));
-        const float2 screenDirV = _static_cast<float2>(transformVectorNdc(clipProjectionData.projectionToNDC, _static_cast<pfloat64_t2>(dirV)));
-
-        const float2 corner = float2(bool2(vertexIdx & 0x1u, vertexIdx >> 1)); // corners of square from (0, 0) to (1, 1)
-        const float2 undilatedCornerNDC = corner * 2.0 - 1.0; // corners of square from (-1, -1) to (1, 1)
+            GlyphInfo glyphInfo;
+            glyphInfo.topLeft = vk::RawBufferLoad<pfloat64_t2>(globals.pointers.geometryBuffer + drawObj.geometryAddress, 8u);
+            glyphInfo.dirU = vk::RawBufferLoad<float32_t2>(globals.pointers.geometryBuffer + drawObj.geometryAddress + sizeof(pfloat64_t2), 4u);
+            glyphInfo.aspectRatio = vk::RawBufferLoad<float32_t>(globals.pointers.geometryBuffer + drawObj.geometryAddress + sizeof(pfloat64_t2) + sizeof(float2), 4u);
+            glyphInfo.minUV_textureID_packed = vk::RawBufferLoad<uint32_t>(globals.pointers.geometryBuffer + drawObj.geometryAddress + sizeof(pfloat64_t2) + sizeof(float2) + sizeof(float), 4u);
+
+            float32_t2 minUV = glyphInfo.getMinUV();
+            uint16_t textureID = glyphInfo.getTextureID();
+
+            const float32_t2 dirV = float32_t2(glyphInfo.dirU.y, -glyphInfo.dirU.x) * glyphInfo.aspectRatio;
+            const float2 screenTopLeft = _static_cast<float2>(transformPointNdc(clipProjectionData.projectionToNDC, glyphInfo.topLeft));
+            const float2 screenDirU = _static_cast<float2>(transformVectorNdc(clipProjectionData.projectionToNDC, _static_cast<pfloat64_t2>(glyphInfo.dirU)));
+            const float2 screenDirV = _static_cast<float2>(transformVectorNdc(clipProjectionData.projectionToNDC, _static_cast<pfloat64_t2>(dirV)));
+
+            const float2 corner = float2(bool2(vertexIdx & 0x1u, vertexIdx >> 1)); // corners of square from (0, 0) to (1, 1)
+            const float2 undilatedCornerNDC = corner * 2.0 - 1.0; // corners of square from (-1, -1) to (1, 1)
         
-        const float2 screenSpaceAabbExtents = float2(length(screenDirU * float2(globals.resolution)) / 2.0, length(screenDirV * float2(globals.resolution)) / 2.0);
-        const float pixelsToIncreaseOnEachSide = globals.antiAliasingFactor + 1.0;
-        const float2 dilateRate = (pixelsToIncreaseOnEachSide / screenSpaceAabbExtents);
+            const float2 screenSpaceAabbExtents = float2(length(screenDirU * float2(globals.resolution)) / 2.0, length(screenDirV * float2(globals.resolution)) / 2.0);
+            const float pixelsToIncreaseOnEachSide = globals.antiAliasingFactor + 1.0;
+            const float2 dilateRate = (pixelsToIncreaseOnEachSide / screenSpaceAabbExtents);
 
-        const float2 vx = screenDirU * dilateRate.x;
-        const float2 vy = screenDirV * dilateRate.y;
-        const float2 offsetVec = vx * undilatedCornerNDC.x + vy * undilatedCornerNDC.y;
-        float2 coord = screenTopLeft + corner.x * screenDirU + corner.y * screenDirV + offsetVec;
+            const float2 vx = screenDirU * dilateRate.x;
+            const float2 vy = screenDirV * dilateRate.y;
+            const float2 offsetVec = vx * undilatedCornerNDC.x + vy * undilatedCornerNDC.y;
+            float2 coord = screenTopLeft + corner.x * screenDirU + corner.y * screenDirV + offsetVec;
 
-        if (corner.y == 0 && italicTiltSlope > 0.0f)
-            coord += normalize(screenDirU) * length(screenDirV) * italicTiltSlope * float(globals.resolution.y) / float(globals.resolution.x);
+            if (corner.y == 0 && italicTiltSlope > 0.0f)
+                coord += normalize(screenDirU) * length(screenDirV) * italicTiltSlope * float(globals.resolution.y) / float(globals.resolution.x);
         
-        // If aspect ratio of the dimensions and glyph inside the texture are the same then screenPxRangeX === screenPxRangeY
-        // but if the glyph box is stretched in any way then we won't get correct msdf
-        // in that case we need to take the max(screenPxRangeX, screenPxRangeY) to avoid blur due to underexaggerated distances
-        // We compute screenPxRange using the ratio of our screenspace extent to the texel space our glyph takes inside the texture
-        // Our glyph is centered inside the texture, so `maxUV = 1.0 - minUV` and `glyphTexelSize = (1.0-2.0*minUV) * MSDFSize
-        const float screenPxRangeX = screenSpaceAabbExtents.x / ((1.0 - 2.0 * minUV.x)); // division by MSDFSize happens after max
-        const float screenPxRangeY = screenSpaceAabbExtents.y / ((1.0 - 2.0 * minUV.y)); // division by MSDFSize happens after max
-        outV.setFontGlyphPxRange((max(max(screenPxRangeX, screenPxRangeY), 1.0) * MSDFPixelRangeHalf) / MSDFSize); // we premultuply by MSDFPixelRange/2.0, to avoid doing it in frag shader
-
-        // In order to keep the shape scale constant with any dilation values:
-        // We compute the new dilated minUV that gets us minUV when interpolated on the previous undilated top left
-        const float2 topLeftInterpolationValue = (dilateRate/(1.0+2.0*dilateRate));
-        const float2 dilatedMinUV = (topLeftInterpolationValue - minUV) / (2.0 * topLeftInterpolationValue - 1.0);
-        const float2 dilatedMaxUV = float2(1.0, 1.0) - dilatedMinUV;
+            // If aspect ratio of the dimensions and glyph inside the texture are the same then screenPxRangeX === screenPxRangeY
+            // but if the glyph box is stretched in any way then we won't get correct msdf
+            // in that case we need to take the max(screenPxRangeX, screenPxRangeY) to avoid blur due to underexaggerated distances
+            // We compute screenPxRange using the ratio of our screenspace extent to the texel space our glyph takes inside the texture
+            // Our glyph is centered inside the texture, so `maxUV = 1.0 - minUV` and `glyphTexelSize = (1.0-2.0*minUV) * MSDFSize
+            const float screenPxRangeX = screenSpaceAabbExtents.x / ((1.0 - 2.0 * minUV.x)); // division by MSDFSize happens after max
+            const float screenPxRangeY = screenSpaceAabbExtents.y / ((1.0 - 2.0 * minUV.y)); // division by MSDFSize happens after max
+            outV.setFontGlyphPxRange((max(max(screenPxRangeX, screenPxRangeY), 1.0) * MSDFPixelRangeHalf) / MSDFSize); // we premultuply by MSDFPixelRange/2.0, to avoid doing it in frag shader
+
+            // In order to keep the shape scale constant with any dilation values:
+            // We compute the new dilated minUV that gets us minUV when interpolated on the previous undilated top left
+            const float2 topLeftInterpolationValue = (dilateRate/(1.0+2.0*dilateRate));
+            const float2 dilatedMinUV = (topLeftInterpolationValue - minUV) / (2.0 * topLeftInterpolationValue - 1.0);
+            const float2 dilatedMaxUV = float2(1.0, 1.0) - dilatedMinUV;
         
-        const float2 uv = dilatedMinUV + corner * (dilatedMaxUV - dilatedMinUV);
+            const float2 uv = dilatedMinUV + corner * (dilatedMaxUV - dilatedMinUV);
 
-        outV.position = float4(coord, 0.f, 1.f);
-        outV.setFontGlyphUV(uv);
-        outV.setFontGlyphTextureId(textureID);
-    }
-    else if (objType == ObjectType::IMAGE)
-    {
-        pfloat64_t2 topLeft = vk::RawBufferLoad<pfloat64_t2>(globals.pointers.geometryBuffer + drawObj.geometryAddress, 8u);
-        float32_t2 dirU = vk::RawBufferLoad<float32_t2>(globals.pointers.geometryBuffer + drawObj.geometryAddress + sizeof(pfloat64_t2), 4u);
-        float32_t aspectRatio = vk::RawBufferLoad<float32_t>(globals.pointers.geometryBuffer + drawObj.geometryAddress + sizeof(pfloat64_t2) + sizeof(float2), 4u);
-        uint32_t textureID = vk::RawBufferLoad<uint32_t>(globals.pointers.geometryBuffer + drawObj.geometryAddress + sizeof(pfloat64_t2) + sizeof(float2) + sizeof(float), 4u);
-
-        const float32_t2 dirV = float32_t2(dirU.y, -dirU.x) * aspectRatio;
-        const float2 ndcTopLeft = _static_cast<float2>(transformPointNdc(clipProjectionData.projectionToNDC, topLeft));
-        const float2 ndcDirU = _static_cast<float2>(transformVectorNdc(clipProjectionData.projectionToNDC, _static_cast<pfloat64_t2>(dirU)));
-        const float2 ndcDirV = _static_cast<float2>(transformVectorNdc(clipProjectionData.projectionToNDC, _static_cast<pfloat64_t2>(dirV)));
-
-        float2 corner = float2(bool2(vertexIdx & 0x1u, vertexIdx >> 1));
-        float2 uv = corner; // non-dilated
+            outV.position = float4(coord, 0.f, 1.f);
+            outV.setFontGlyphUV(uv);
+            outV.setFontGlyphTextureId(textureID);
+        }
+        else if (objType == ObjectType::IMAGE)
+        {
+            pfloat64_t2 topLeft = vk::RawBufferLoad<pfloat64_t2>(globals.pointers.geometryBuffer + drawObj.geometryAddress, 8u);
+            float32_t2 dirU = vk::RawBufferLoad<float32_t2>(globals.pointers.geometryBuffer + drawObj.geometryAddress + sizeof(pfloat64_t2), 4u);
+            float32_t aspectRatio = vk::RawBufferLoad<float32_t>(globals.pointers.geometryBuffer + drawObj.geometryAddress + sizeof(pfloat64_t2) + sizeof(float2), 4u);
+            uint32_t textureID = vk::RawBufferLoad<uint32_t>(globals.pointers.geometryBuffer + drawObj.geometryAddress + sizeof(pfloat64_t2) + sizeof(float2) + sizeof(float), 4u);
+
+            const float32_t2 dirV = float32_t2(dirU.y, -dirU.x) * aspectRatio;
+            const float2 ndcTopLeft = _static_cast<float2>(transformPointNdc(clipProjectionData.projectionToNDC, topLeft));
+            const float2 ndcDirU = _static_cast<float2>(transformVectorNdc(clipProjectionData.projectionToNDC, _static_cast<pfloat64_t2>(dirU)));
+            const float2 ndcDirV = _static_cast<float2>(transformVectorNdc(clipProjectionData.projectionToNDC, _static_cast<pfloat64_t2>(dirV)));
+
+            float2 corner = float2(bool2(vertexIdx & 0x1u, vertexIdx >> 1));
+            float2 uv = corner; // non-dilated
         
-        float2 ndcCorner = ndcTopLeft + corner.x * ndcDirU + corner.y * ndcDirV;
+            float2 ndcCorner = ndcTopLeft + corner.x * ndcDirU + corner.y * ndcDirV;
         
-        outV.position = float4(ndcCorner, 0.f, 1.f);
-        outV.setImageUV(uv);
-        outV.setImageTextureId(textureID);
-    }
-
+            outV.position = float4(ndcCorner, 0.f, 1.f);
+            outV.setImageUV(uv);
+            outV.setImageTextureId(textureID);
+        }
 
-// Make the cage fullscreen for testing: 
+    // Make the cage fullscreen for testing: 
 #if 0
-    // disabled for object of POLYLINE_CONNECTOR type, since miters would cover whole screen
-    if(objType != ObjectType::POLYLINE_CONNECTOR)
-    {
-        if (vertexIdx == 0u)
-            outV.position = float4(-1, -1, 0, 1);
-        else if (vertexIdx == 1u)
-            outV.position = float4(-1, +1, 0, 1);
-        else if (vertexIdx == 2u)
-            outV.position = float4(+1, -1, 0, 1);
-        else if (vertexIdx == 3u)
-            outV.position = float4(+1, +1, 0, 1);
-    }
+        // disabled for object of POLYLINE_CONNECTOR type, since miters would cover whole screen
+        if(objType != ObjectType::POLYLINE_CONNECTOR)
+        {
+            if (vertexIdx == 0u)
+                outV.position = float4(-1, -1, 0, 1);
+            else if (vertexIdx == 1u)
+                outV.position = float4(-1, +1, 0, 1);
+            else if (vertexIdx == 2u)
+                outV.position = float4(+1, -1, 0, 1);
+            else if (vertexIdx == 3u)
+                outV.position = float4(+1, +1, 0, 1);
+        }
 #endif
-
+    }
     outV.clip = float4(outV.position.x - clipProjectionData.minClipNDC.x, outV.position.y - clipProjectionData.minClipNDC.y, clipProjectionData.maxClipNDC.x - outV.position.x, clipProjectionData.maxClipNDC.y - outV.position.y);
     return outV;
-#endif
 }

From 1963b51c27cf445b6515bbd16eb2bec3da9aa311 Mon Sep 17 00:00:00 2001
From: keptsecret <sorchon@gmail.com>
Date: Thu, 10 Apr 2025 16:00:08 +0700
Subject: [PATCH 156/529] changes in Param, Config usage

---
 .../app_resources/benchmarkSubgroup.comp.hlsl | 10 +--
 .../app_resources/shaderCommon.hlsl           | 18 ++---
 73_ArithmeticBench/main.cpp                   | 75 ++++---------------
 3 files changed, 27 insertions(+), 76 deletions(-)

diff --git a/73_ArithmeticBench/app_resources/benchmarkSubgroup.comp.hlsl b/73_ArithmeticBench/app_resources/benchmarkSubgroup.comp.hlsl
index 0b6a7e3c4..4715f0abf 100644
--- a/73_ArithmeticBench/app_resources/benchmarkSubgroup.comp.hlsl
+++ b/73_ArithmeticBench/app_resources/benchmarkSubgroup.comp.hlsl
@@ -38,7 +38,7 @@ bool canStore() {return true;}
 template<template<class> class binop, typename T, uint32_t N>
 static void subbench(NBL_CONST_REF_ARG(type_t) sourceVal)
 {
-    using config_t = nbl::hlsl::subgroup::Configuration<SUBGROUP_SIZE_LOG2>;
+    using config_t = nbl::hlsl::subgroup2::Configuration<SUBGROUP_SIZE_LOG2>;
     using params_t = nbl::hlsl::subgroup2::ArithmeticParams<config_t, typename binop<T>::base_t, N, nbl::hlsl::jit::device_capabilities>;
     type_t value = sourceVal;
 
@@ -54,15 +54,15 @@ void benchmark()
 {
     const uint32_t idx = globalIndex() * ITEMS_PER_INVOCATION;
     type_t sourceVal;
-#if ITEMS_PER_INVOCATION > 1
+// #if ITEMS_PER_INVOCATION > 1
     [unroll]
     for (uint32_t i = 0; i < ITEMS_PER_INVOCATION; i++)
     {
         sourceVal[i] = inputValue[idx + i];
     }
-#else
-    sourceVal = inputValue[idx];
-#endif
+// #else
+//     sourceVal = inputValue[idx];
+// #endif
 
     subbench<bit_and, uint32_t, ITEMS_PER_INVOCATION>(sourceVal);
     subbench<bit_xor, uint32_t, ITEMS_PER_INVOCATION>(sourceVal);
diff --git a/73_ArithmeticBench/app_resources/shaderCommon.hlsl b/73_ArithmeticBench/app_resources/shaderCommon.hlsl
index 5cb1f3cf1..3fdd3c986 100644
--- a/73_ArithmeticBench/app_resources/shaderCommon.hlsl
+++ b/73_ArithmeticBench/app_resources/shaderCommon.hlsl
@@ -27,11 +27,11 @@ bool canStore();
 //typedef uint32_t type_t;
 //typedef uint32_t4 type_t;
 
-#if ITEMS_PER_INVOCATION > 1
+// #if ITEMS_PER_INVOCATION > 1
 typedef vector<uint32_t, ITEMS_PER_INVOCATION> type_t;
-#else
-typedef uint32_t type_t;
-#endif
+// #else
+// typedef uint32_t type_t;
+// #endif
 
 
 #ifndef OPERATION
@@ -46,7 +46,7 @@ static void subtest(NBL_CONST_REF_ARG(type_t) sourceVal)
 {
     // TODO static assert vector<T, N> == type_t
     //using type_t = vector<T, N>;
-    using config_t = nbl::hlsl::subgroup::Configuration<SUBGROUP_SIZE_LOG2>;
+    using config_t = nbl::hlsl::subgroup2::Configuration<SUBGROUP_SIZE_LOG2>;
     using params_t = nbl::hlsl::subgroup2::ArithmeticParams<config_t, typename binop<T>::base_t, N, nbl::hlsl::jit::device_capabilities>;
 
     if (globalIndex()==0u)
@@ -62,15 +62,15 @@ type_t test()
 {
     const uint32_t idx = globalIndex() * ITEMS_PER_INVOCATION;
     type_t sourceVal;
-#if ITEMS_PER_INVOCATION > 1
+// #if ITEMS_PER_INVOCATION > 1
     [unroll]
     for (uint32_t i = 0; i < ITEMS_PER_INVOCATION; i++)
     {
         sourceVal[i] = inputValue[idx + i];
     }
-#else
-    sourceVal = inputValue[idx];
-#endif
+// #else
+//     sourceVal = inputValue[idx];
+// #endif
 
     subtest<bit_and, uint32_t, ITEMS_PER_INVOCATION>(sourceVal);
     subtest<bit_xor, uint32_t, ITEMS_PER_INVOCATION>(sourceVal);
diff --git a/73_ArithmeticBench/main.cpp b/73_ArithmeticBench/main.cpp
index 94983c03c..d129cfaf9 100644
--- a/73_ArithmeticBench/main.cpp
+++ b/73_ArithmeticBench/main.cpp
@@ -249,39 +249,6 @@ class ArithmeticBenchApp final : public examples::SimpleWindowedApplication, pub
 			benchPplnLayout = m_device->createPipelineLayout({}, std::move(benchLayout));
 		}
 
-		const auto spirv_isa_cache_path = localOutputCWD/"spirv_isa_cache.bin";
-		// enclose to make sure file goes out of scope and we can reopen it
-		{
-			smart_refctd_ptr<const IFile> spirv_isa_cache_input;
-			// try to load SPIR-V to ISA cache
-			{
-				ISystem::future_t<smart_refctd_ptr<IFile>> fileCreate;
-				m_system->createFile(fileCreate,spirv_isa_cache_path,IFile::ECF_READ|IFile::ECF_MAPPABLE|IFile::ECF_COHERENT);
-				if (auto lock=fileCreate.acquire())
-					spirv_isa_cache_input = *lock;
-			}
-			// create the cache
-			{
-				std::span<const uint8_t> spirv_isa_cache_data = {};
-				if (spirv_isa_cache_input)
-					spirv_isa_cache_data = {reinterpret_cast<const uint8_t*>(spirv_isa_cache_input->getMappedPointer()),spirv_isa_cache_input->getSize()};
-				else
-					m_logger->log("Failed to load SPIR-V 2 ISA cache!",ILogger::ELL_PERFORMANCE);
-				// Normally we'd deserialize a `ICPUPipelineCache` properly and pass that instead
-				m_spirv_isa_cache = m_device->createPipelineCache(spirv_isa_cache_data);
-			}
-		}
-		{
-			// TODO: rename `deleteDirectory` to just `delete`? and a `IFile::setSize()` ?
-			m_system->deleteDirectory(spirv_isa_cache_path);
-			ISystem::future_t<smart_refctd_ptr<IFile>> fileCreate;
-			m_system->createFile(fileCreate,spirv_isa_cache_path,IFile::ECF_WRITE);
-			// I can be relatively sure I'll succeed to acquire the future, the pointer to created file might be null though.
-			m_spirv_isa_cache_output=*fileCreate.acquire();
-			if (!m_spirv_isa_cache_output)
-				logFail("Failed to Create SPIR-V to ISA cache file.");
-		}
-
 		// load shader source from file
 		auto getShaderSource = [&](const char* filePath) -> auto
 		{
@@ -429,10 +396,10 @@ class ArithmeticBenchApp final : public examples::SimpleWindowedApplication, pub
 
 		const auto SubgroupSizeLog2 = hlsl::findMSB(MinSubgroupSize);
 
-		bool passed = true;
-		passed = runBenchmark(cmdbuf, benchSets[0], elementCount, SubgroupSizeLog2);
-		passed = runBenchmark(cmdbuf, benchSets[1], elementCount, SubgroupSizeLog2);
-		passed = runBenchmark(cmdbuf, benchSets[2], elementCount, SubgroupSizeLog2);
+		cmdbuf->bindDescriptorSets(EPBP_COMPUTE, benchSets[0].pipeline->getLayout(), 0u, 1u, &benchDs.get());
+
+		for (uint32_t i = 0; i < benchSets.size(); i++)
+			runBenchmark(cmdbuf, benchSets[i], elementCount, SubgroupSizeLog2);
 
 
 		// blit
@@ -633,17 +600,6 @@ class ArithmeticBenchApp final : public examples::SimpleWindowedApplication, pub
 				//	logTestOutcome(passed, itemsPerWG);
 				//}
 				m_api->endCapture();
-
-				// save cache every now and then	
-				{
-					auto cpu = m_spirv_isa_cache->convertToCPUCache();
-					// Normally we'd beautifully JSON serialize the thing, allow multiple devices & drivers + metadata
-					auto bin = cpu->getEntries().begin()->second.bin;
-					IFile::success_t success;
-					m_spirv_isa_cache_output->write(success, bin->data(), 0ull, bin->size());
-					if (!success)
-						logFail("Could not write Create SPIR-V to ISA cache to disk!");
-				}
 			}
 		}
 	}
@@ -662,7 +618,7 @@ class ArithmeticBenchApp final : public examples::SimpleWindowedApplication, pub
 			.requireFullSubgroups = true
 		};
 		core::smart_refctd_ptr<IGPUComputePipeline> pipeline;
-		if (!m_device->createComputePipelines(m_spirv_isa_cache.get(),{&params,1},&pipeline))
+		if (!m_device->createComputePipelines(nullptr,{&params,1},&pipeline))
 			return nullptr;
 		return pipeline;
 	}
@@ -689,12 +645,13 @@ class ArithmeticBenchApp final : public examples::SimpleWindowedApplication, pub
 		options.stage = IShader::E_SHADER_STAGE::ESS_COMPUTE;
 		options.targetSpirvVersion = m_device->getPhysicalDevice()->getLimits().spirvVersion;
 		options.spirvOptimizer = nullptr;
-//#ifndef _NBL_DEBUG
-//		ISPIRVOptimizer::E_OPTIMIZER_PASS optPasses = ISPIRVOptimizer::EOP_STRIP_DEBUG_INFO;
-//		auto opt = make_smart_refctd_ptr<ISPIRVOptimizer>(std::span<ISPIRVOptimizer::E_OPTIMIZER_PASS>(&optPasses, 1));
-//		options.spirvOptimizer = opt.get();
-//#endif
+#ifndef _NBL_DEBUG
+		ISPIRVOptimizer::E_OPTIMIZER_PASS optPasses = ISPIRVOptimizer::EOP_STRIP_DEBUG_INFO;
+		auto opt = make_smart_refctd_ptr<ISPIRVOptimizer>(std::span<ISPIRVOptimizer::E_OPTIMIZER_PASS>(&optPasses, 1));
+		options.spirvOptimizer = opt.get();
+#else
 		options.debugInfoFlags |= IShaderCompiler::E_DEBUG_INFO_FLAGS::EDIF_LINE_BIT;
+#endif
 		options.preprocessorOptions.sourceIdentifier = source->getFilepathHint();
 		options.preprocessorOptions.logger = m_logger.get();
 
@@ -875,12 +832,11 @@ class ArithmeticBenchApp final : public examples::SimpleWindowedApplication, pub
 	}
 
 
-	bool runBenchmark(IGPUCommandBuffer* cmdbuf, const BenchmarkSet& set, const uint32_t elementCount, const uint8_t subgroupSizeLog2)
+	void runBenchmark(IGPUCommandBuffer* cmdbuf, const BenchmarkSet& set, const uint32_t elementCount, const uint8_t subgroupSizeLog2)
 	{
 		const uint32_t workgroupCount = elementCount / (set.workgroupSize * set.itemsPerInvocation);
 
 		cmdbuf->bindComputePipeline(set.pipeline.get());
-		cmdbuf->bindDescriptorSets(EPBP_COMPUTE, set.pipeline->getLayout(), 0u, 1u, &benchDs.get());
 		cmdbuf->dispatch(workgroupCount, 1, 1);
 		{
 			IGPUCommandBuffer::SPipelineBarrierDependencyInfo::buffer_barrier_t memoryBarrier[OutputBufferCount];
@@ -902,14 +858,10 @@ class ArithmeticBenchApp final : public examples::SimpleWindowedApplication, pub
 			IGPUCommandBuffer::SPipelineBarrierDependencyInfo info = { .memBarriers = {},.bufBarriers = memoryBarrier };
 			cmdbuf->pipelineBarrier(asset::E_DEPENDENCY_FLAGS::EDF_NONE, info);
 		}
-
-		return true;
 	}
 
 	IQueue* transferDownQueue;
 	IQueue* computeQueue;
-	smart_refctd_ptr<IGPUPipelineCache> m_spirv_isa_cache;
-	smart_refctd_ptr<IFile> m_spirv_isa_cache_output;
 
 	smart_refctd_ptr<IWindow> m_window;
 	smart_refctd_ptr<CSimpleResizeSurface<ISimpleManagedSurface::ISwapchainResources>> m_surface;
@@ -923,7 +875,6 @@ class ArithmeticBenchApp final : public examples::SimpleWindowedApplication, pub
 	smart_refctd_ptr<IGPUImage> dummyImg;
 
 	std::array<BenchmarkSet, 3> benchSets;
-	smart_refctd_ptr<IGPUComputePipeline> benchPipeline;	// TODO array
 	smart_refctd_ptr<IDescriptorPool> benchPool;
 	smart_refctd_ptr<IGPUDescriptorSet> benchDs;
 
@@ -938,7 +889,7 @@ class ArithmeticBenchApp final : public examples::SimpleWindowedApplication, pub
 
 	bool b_runTests = false;
 	uint32_t* inputData = nullptr;
-	uint32_t ItemsPerInvocation = 1u;
+	uint32_t ItemsPerInvocation = 4u;
 	constexpr static inline uint32_t OutputBufferCount = 8u;
 	smart_refctd_ptr<IGPUBuffer> outputBuffers[OutputBufferCount];
 

From 2528e75062433a9e78a329da2a37434c9d92525c Mon Sep 17 00:00:00 2001
From: Przemek <minikers21@gmail.com>
Date: Fri, 11 Apr 2025 14:04:58 +0200
Subject: [PATCH 157/529] Implemented anti aliasing

---
 62_CAD/main.cpp                               |  38 ++++--
 .../main_pipeline/fragment_shader.hlsl        | 126 +++++++++++-------
 2 files changed, 103 insertions(+), 61 deletions(-)

diff --git a/62_CAD/main.cpp b/62_CAD/main.cpp
index e425dce54..77f90d13d 100644
--- a/62_CAD/main.cpp
+++ b/62_CAD/main.cpp
@@ -3170,8 +3170,7 @@ class ComputerAidedDesign final : public examples::SimpleWindowedApplication, pu
 		}
 		else if (mode == ExampleMode::CASE_9)
 		{
-			// GRID
-
+			// GRID (outdated)
 			/*core::vector<TriangleMeshVertex> vertices = {
 				{ float32_t2(-200.0f, -200.0f), 10.0f },
 				{ float32_t2(-50.0f, -200.0f), 50.0f },
@@ -3203,20 +3202,32 @@ class ComputerAidedDesign final : public examples::SimpleWindowedApplication, pu
 			};*/
 
 			// PYRAMID
-
 			core::vector<TriangleMeshVertex> vertices = {
+				//{ float64_t2(0.0, 0.0), 100.0 }, //0
+				//{ float64_t2(-200.0, -200.0), 10.0 }, //1
+				//{ float64_t2(200.0, -200.0), 10.0 }, //2
+				//{ float64_t2(200.0, 200.0), -20.0 }, //3
+				//{ float64_t2(-200.0, 200.0), 10.0 }, //4
+
 				{ float64_t2(0.0, 0.0), 100.0 },
 				{ float64_t2(-200.0, -200.0), 10.0 },
 				{ float64_t2(200.0, -200.0), 10.0 },
+				{ float64_t2(0.0, 0.0), 100.0 },
+				{ float64_t2(200.0, -200.0), 10.0 },
 				{ float64_t2(200.0, 200.0), -20.0 },
+				{ float64_t2(0.0, 0.0), 100.0 },
+				{ float64_t2(200.0, 200.0), -20.0 },
+				{ float64_t2(-200.0, 200.0), 10.0 },
+				{ float64_t2(0.0, 0.0), 100.0 },
 				{ float64_t2(-200.0, 200.0), 10.0 },
+				{ float64_t2(-200.0, -200.0), 10.0 },
 			};
 
 			core::vector<uint32_t> indices = {
 				0, 1, 2,
-				0, 2, 3,
-				0, 3, 4,
-				0, 4, 1
+				3, 4, 5,
+				6, 7, 8,
+				9, 10, 11
 			};
 
 			// SINGLE TRIANGLE
@@ -3257,17 +3268,16 @@ class ComputerAidedDesign final : public examples::SimpleWindowedApplication, pu
 			// 1 - DISCRETE_VARIABLE_LENGTH_INTERVALS
 			// 2 - DISCRETE_FIXED_LENGTH_INTERVALS
 			// 3 - CONTINOUS_INTERVALS
+			float animatedAlpha = (std::cos(m_timeElapsed * 0.0005) + 1.0) * 0.5;
 			switch (m_shadingModeExample)
 			{
 				case DTMSettingsInfo::E_HEIGHT_SHADING_MODE::DISCRETE_VARIABLE_LENGTH_INTERVALS:
 				{
 					dtmSettingsInfo.heightShadingMode = DTMSettingsInfo::E_HEIGHT_SHADING_MODE::DISCRETE_VARIABLE_LENGTH_INTERVALS;
 					
-					float animatedAlpha = (std::cos(m_timeElapsed * 0.0005) + 1.0) * 0.5;
-					dtmSettingsInfo.addHeightColorMapEntry(-10.0f, float32_t4(0.5f, 1.0f, 1.0f, 1.0f));
+					dtmSettingsInfo.addHeightColorMapEntry(-10.0f, float32_t4(0.5f, 1.0f, 1.0f, animatedAlpha));
 					dtmSettingsInfo.addHeightColorMapEntry(20.0f, float32_t4(0.0f, 1.0f, 0.0f, 1.0f));
-					//dtmSettingsInfo.addHeightColorMapEntry(25.0f, float32_t4(1.0f, 1.0f, 0.0f, animatedAlpha));
-					dtmSettingsInfo.addHeightColorMapEntry(25.0f, float32_t4(1.0f, 1.0f, 0.0f, 1.0f));
+					dtmSettingsInfo.addHeightColorMapEntry(25.0f, float32_t4(1.0f, 1.0f, 0.0f, animatedAlpha));
 					dtmSettingsInfo.addHeightColorMapEntry(70.0f, float32_t4(1.0f, 0.0f, 0.0f, 1.0f));
 					dtmSettingsInfo.addHeightColorMapEntry(90.0f, float32_t4(1.0f, 0.0f, 0.0f, 1.0f));
 					break;
@@ -3277,16 +3287,16 @@ class ComputerAidedDesign final : public examples::SimpleWindowedApplication, pu
 					dtmSettingsInfo.intervalWidth = 8.0f;
 					dtmSettingsInfo.heightShadingMode = DTMSettingsInfo::E_HEIGHT_SHADING_MODE::DISCRETE_FIXED_LENGTH_INTERVALS;
 					dtmSettingsInfo.addHeightColorMapEntry(0.0f, float32_t4(0.0f, 1.0f, 0.0f, 1.0f));
-					dtmSettingsInfo.addHeightColorMapEntry(50.0f, float32_t4(1.0f, 1.0f, 0.0f, 1.0f));
+					dtmSettingsInfo.addHeightColorMapEntry(50.0f, float32_t4(1.0f, 1.0f, 0.0f, animatedAlpha));
 					dtmSettingsInfo.addHeightColorMapEntry(100.0f, float32_t4(1.0f, 0.0f, 0.0f, 1.0f));
 					break;
 				}
 				case DTMSettingsInfo::E_HEIGHT_SHADING_MODE::CONTINOUS_INTERVALS:
 				{
 					dtmSettingsInfo.heightShadingMode = DTMSettingsInfo::E_HEIGHT_SHADING_MODE::CONTINOUS_INTERVALS;
-					dtmSettingsInfo.addHeightColorMapEntry(-10.0f, float32_t4(0.0f, 1.0f, 0.0f, 1.0f));
-					dtmSettingsInfo.addHeightColorMapEntry(30.0f, float32_t4(1.0f, 1.0f, 0.0f, 1.0f));
-					dtmSettingsInfo.addHeightColorMapEntry(90.0f, float32_t4(1.0f, 0.0f, 0.0f, 1.0f));
+					dtmSettingsInfo.addHeightColorMapEntry(-10.0f, float32_t4(0.0f, 1.0f, 0.0f, animatedAlpha));
+					dtmSettingsInfo.addHeightColorMapEntry(30.0f, float32_t4(1.0f, 1.0f, 0.0f, animatedAlpha));
+					dtmSettingsInfo.addHeightColorMapEntry(90.0f, float32_t4(1.0f, 0.0f, 0.0f, animatedAlpha));
 					break;
 				}
 			}
diff --git a/62_CAD/shaders/main_pipeline/fragment_shader.hlsl b/62_CAD/shaders/main_pipeline/fragment_shader.hlsl
index ab6388bc8..ffa94f15c 100644
--- a/62_CAD/shaders/main_pipeline/fragment_shader.hlsl
+++ b/62_CAD/shaders/main_pipeline/fragment_shader.hlsl
@@ -420,6 +420,26 @@ float dot2(in float2 vec)
     return dot(vec, vec);
 }
 
+struct DTMHeightShadingAAInfo
+{
+    float currentHeight;
+    float4 currentSegmentColor;
+    float nearestSegmentHeight;
+    float4 nearestSegmentColor;
+};
+
+void calculateBetweenHeightShadingRegionsAntiAliasing(in DTMSettings dtm, in DTMHeightShadingAAInfo aaInfo, out float3 textureColor, out float localAlpha)
+{
+    float heightDeriv = fwidth(aaInfo.currentHeight);
+
+    float pxDistanceToNearestSegment = abs(aaInfo.currentHeight - aaInfo.nearestSegmentHeight) / heightDeriv;
+    float nearestSegmentColorCoverage = smoothstep(-globals.antiAliasingFactor, globals.antiAliasingFactor, pxDistanceToNearestSegment);
+    float4 localHeightColor = lerp(aaInfo.nearestSegmentColor, aaInfo.currentSegmentColor, nearestSegmentColorCoverage);
+
+    localAlpha *= localHeightColor.a;
+    textureColor = localHeightColor.rgb * localAlpha + textureColor * (1.0f - localAlpha);
+}
+
 [[vk::spvexecutionmode(spv::ExecutionModePixelInterlockOrderedEXT)]]
 [shader("pixel")]
 float4 fragMain(PSInput input) : SV_TARGET
@@ -438,7 +458,7 @@ float4 fragMain(PSInput input) : SV_TARGET
             const float outlineThickness = input.getOutlineThickness();
             const float contourThickness = input.getContourLineThickness();
             const float phaseShift = 0.0f; // input.getCurrentPhaseShift();
-            const float stretch = 1.0f; // TODO: figure out what is it for ---> [ERFAN's REPLY: no need to give shit about this in dtms, it's for special shape styles] 
+            const float stretch = 1.0f;
             const float worldToScreenRatio = input.getCurrentWorldToScreenRatio();
 
             DTMSettings dtm = loadDTMSettings(mainObj.dtmSettingsIdx);
@@ -507,58 +527,73 @@ float4 fragMain(PSInput input) : SV_TARGET
                 if(mode == DTMSettings::E_HEIGHT_SHADING_MODE::DISCRETE_VARIABLE_LENGTH_INTERVALS)
                 {
                     DTMSettingsHeightsAccessor dtmHeightsAccessor = { dtm };
-                    uint32_t mapIndexPlus1 = nbl::hlsl::upper_bound(dtmHeightsAccessor, 0, heightMapSize, height);
-                    uint32_t mapIndex = mapIndexPlus1 == 0 ? mapIndexPlus1 : mapIndexPlus1 - 1;
+                    int upperBoundIndex = nbl::hlsl::upper_bound(dtmHeightsAccessor, 0, heightMapSize, height);
+                    int mapIndex = max(upperBoundIndex - 1, 0);
+                    int mapIndexPrev = max(mapIndex - 1, 0);
+                    int mapIndexNext = min(mapIndex + 1, heightMapSize - 1);
 
-                    float heightDeriv = fwidth(height);
-                    bool blendWithPrev = true
-                        && (mapIndex >= heightMapSize - 1 || (height * 2.0 < dtm.heightColorMapHeights[mapIndexPlus1] + dtm.heightColorMapHeights[mapIndex]));
-                
                     // logic explainer: if colorIdx is 0.0 then it means blend with next
                     // if color idx is >= length of the colours array then it means it's also > 0.0 and this blend with prev is true
                     // if color idx is > 0 and < len - 1, then it depends on the current pixel's height value and two closest height values
-                    if (blendWithPrev)
+                    bool blendWithPrev = (mapIndex > 0)
+                        && (mapIndex >= heightMapSize - 1 || (height * 2.0 < dtm.heightColorMapHeights[upperBoundIndex] + dtm.heightColorMapHeights[mapIndex]));
+
+                    DTMHeightShadingAAInfo aaInfo;
+                    aaInfo.currentHeight = height;
+                    aaInfo.currentSegmentColor = dtm.heightColorMapColors[mapIndex];
+                    aaInfo.nearestSegmentHeight = blendWithPrev ? dtm.heightColorMapHeights[mapIndex] : dtm.heightColorMapHeights[mapIndexNext];
+                    aaInfo.nearestSegmentColor = blendWithPrev ? dtm.heightColorMapColors[mapIndexPrev] : dtm.heightColorMapColors[mapIndexNext];
+
+                    calculateBetweenHeightShadingRegionsAntiAliasing(dtm, aaInfo, textureColor, localAlpha);
+                }
+                else if (mode == DTMSettings::E_HEIGHT_SHADING_MODE::DISCRETE_FIXED_LENGTH_INTERVALS)
+                {
+                    float interval = dtm.intervalWidth;
+                    float heightMinShadingHeightDiff = (height - minShadingHeight);
+                    int sectionIndex = int(heightMinShadingHeightDiff / interval);
+                    float heightTmp = minShadingHeight + float(sectionIndex) * interval;
+
+                    DTMSettingsHeightsAccessor dtmHeightsAccessor = { dtm };
+                    uint32_t upperBoundHeightIndex = nbl::hlsl::upper_bound(dtmHeightsAccessor, 0, heightMapSize, height);
+                    uint32_t lowerBoundHeightIndex = max(upperBoundHeightIndex - 1, 0);
+
+                    float upperBoundHeight = dtm.heightColorMapHeights[upperBoundHeightIndex];
+                    float lowerBoundHeight = dtm.heightColorMapHeights[lowerBoundHeightIndex];
+
+                    float4 upperBoundColor = dtm.heightColorMapColors[upperBoundHeightIndex];
+                    float4 lowerBoundColor = dtm.heightColorMapColors[lowerBoundHeightIndex];
+
+                    float interpolationVal;
+                    bool blendWithPrev;
+                    if (upperBoundHeightIndex == 0)
                     {
-                        if (mapIndex > 0)
-                        {
-                            float pxDistanceToPrevHeight = (height - dtm.heightColorMapHeights[mapIndex]) / heightDeriv;
-                            float prevColorCoverage = smoothstep(-globals.antiAliasingFactor, globals.antiAliasingFactor, pxDistanceToPrevHeight);
-                            textureColor = lerp(dtm.heightColorMapColors[mapIndex - 1].rgb, dtm.heightColorMapColors[mapIndex].rgb, prevColorCoverage);
-                        }
-                        else
-                        {
-                            textureColor = dtm.heightColorMapColors[mapIndex].rgb;
-                        }
+                        interpolationVal = 1.0f;
+                        blendWithPrev = false;
                     }
                     else
                     {
-                        if (mapIndex < heightMapSize - 1)
-                        {
-                            float pxDistanceToNextHeight = (height - dtm.heightColorMapHeights[mapIndexPlus1]) / heightDeriv;
-                            float nextColorCoverage = smoothstep(-globals.antiAliasingFactor, globals.antiAliasingFactor, pxDistanceToNextHeight);
-                            textureColor = lerp(dtm.heightColorMapColors[mapIndex].rgb, dtm.heightColorMapColors[mapIndexPlus1].rgb, nextColorCoverage);
-                        }
-                        else
-                        {
-                            textureColor = dtm.heightColorMapColors[mapIndex].rgb;
-                        }
+                        interpolationVal = (heightTmp - lowerBoundHeight) / (upperBoundHeight - lowerBoundHeight);
+                        blendWithPrev = height - interval * sectionIndex < 0.5f;
                     }
 
-                    //localAlpha = dtm.heightColorMapColors[mapIndex].a;
-                }
-                else
-                {
-                    float heightTmp;
-                    if (mode == DTMSettings::E_HEIGHT_SHADING_MODE::DISCRETE_FIXED_LENGTH_INTERVALS)
+                    DTMHeightShadingAAInfo aaInfo;
+                    aaInfo.currentHeight = height;
+                    aaInfo.currentSegmentColor = lerp(lowerBoundColor, upperBoundColor, interpolationVal);
+                    if (blendWithPrev)
                     {
-                        float interval = dtm.intervalWidth;
-                        int sectionIndex = int((height - minShadingHeight) / interval);
-                        heightTmp = minShadingHeight + float(sectionIndex) * interval;
+                        aaInfo.nearestSegmentHeight = heightTmp;
+                        aaInfo.nearestSegmentColor = lerp(lowerBoundColor, upperBoundColor, interpolationVal - 1.0f / interval);
                     }
-                    else if (mode == DTMSettings::E_HEIGHT_SHADING_MODE::CONTINOUS_INTERVALS)
+                    else
                     {
-                        heightTmp = height;
+                        aaInfo.nearestSegmentHeight = heightTmp + interval;
+                        aaInfo.nearestSegmentColor = lerp(lowerBoundColor, upperBoundColor, interpolationVal + 1.0f / interval);
                     }
+                    calculateBetweenHeightShadingRegionsAntiAliasing(dtm, aaInfo, textureColor, localAlpha);
+                }
+                else if (mode == DTMSettings::E_HEIGHT_SHADING_MODE::CONTINOUS_INTERVALS)
+                {
+                    float heightTmp = height;
 
                     DTMSettingsHeightsAccessor dtmHeightsAccessor = { dtm };
                     uint32_t upperBoundHeightIndex = nbl::hlsl::upper_bound(dtmHeightsAccessor, 0, heightMapSize, height);
@@ -575,16 +610,13 @@ float4 fragMain(PSInput input) : SV_TARGET
                         interpolationVal = 1.0f;
                     else
                         interpolationVal = (heightTmp - lowerBoundHeight) / (upperBoundHeight - lowerBoundHeight);
-                
-                    textureColor = lerp(lowerBoundColor.rgb, upperBoundColor.rgb, interpolationVal);
-                    localAlpha = lerp(lowerBoundColor.a, upperBoundColor.a, interpolationVal);;
+
+                    float4 localHeightColor = lerp(lowerBoundColor, upperBoundColor, interpolationVal);
+
+                    localAlpha *= localHeightColor.a;
+                    textureColor = localHeightColor.rgb * localAlpha + textureColor * (1.0f - localAlpha);
                 }
             }
-            //else // TODO: remove!!
-            //{
-            //    printf("WTF");
-            //    return float4(0.0f, 0.0f, 0.0f, 1.0f);
-            //}
 
             // CONTOUR
 

From fd4e576665b228ede3acd28fc32119cc42d8cf5e Mon Sep 17 00:00:00 2001
From: Przemek <minikers21@gmail.com>
Date: Sat, 12 Apr 2025 12:42:23 +0200
Subject: [PATCH 158/529] Triangle dilation

---
 62_CAD/main.cpp                               |  4 +-
 .../main_pipeline/fragment_shader.hlsl        | 16 ++++++--
 .../shaders/main_pipeline/vertex_shader.hlsl  | 38 ++++++++++++++++---
 3 files changed, 47 insertions(+), 11 deletions(-)

diff --git a/62_CAD/main.cpp b/62_CAD/main.cpp
index 77f90d13d..3ad285c46 100644
--- a/62_CAD/main.cpp
+++ b/62_CAD/main.cpp
@@ -3211,9 +3211,9 @@ class ComputerAidedDesign final : public examples::SimpleWindowedApplication, pu
 
 				{ float64_t2(0.0, 0.0), 100.0 },
 				{ float64_t2(-200.0, -200.0), 10.0 },
-				{ float64_t2(200.0, -200.0), 10.0 },
+				{ float64_t2(200.0, -100.0), 10.0 },
 				{ float64_t2(0.0, 0.0), 100.0 },
-				{ float64_t2(200.0, -200.0), 10.0 },
+				{ float64_t2(200.0, -100.0), 10.0 },
 				{ float64_t2(200.0, 200.0), -20.0 },
 				{ float64_t2(0.0, 0.0), 100.0 },
 				{ float64_t2(200.0, 200.0), -20.0 },
diff --git a/62_CAD/shaders/main_pipeline/fragment_shader.hlsl b/62_CAD/shaders/main_pipeline/fragment_shader.hlsl
index ffa94f15c..d19503ca8 100644
--- a/62_CAD/shaders/main_pipeline/fragment_shader.hlsl
+++ b/62_CAD/shaders/main_pipeline/fragment_shader.hlsl
@@ -440,6 +440,15 @@ void calculateBetweenHeightShadingRegionsAntiAliasing(in DTMSettings dtm, in DTM
     textureColor = localHeightColor.rgb * localAlpha + textureColor * (1.0f - localAlpha);
 }
 
+float3 calculateDTMTriangleBarycentrics(in float2 v1, in float2 v2, in float2 v3, in float2 p)
+{
+    float denom = (v2.x - v1.x) * (v3.y - v1.y) - (v3.x - v1.x) * (v2.y - v1.y);
+    float u = ((v2.y - v3.y) * (p.x - v3.x) + (v3.x - v2.x) * (p.y - v3.y)) / denom;
+    float v = ((v3.y - v1.y) * (p.x - v3.x) + (v1.x - v3.x) * (p.y - v3.y)) / denom;
+    float w = 1.0 - u - v;
+    return float3(u, v, w);
+}
+
 [[vk::spvexecutionmode(spv::ExecutionModePixelInterlockOrderedEXT)]]
 [shader("pixel")]
 float4 fragMain(PSInput input) : SV_TARGET
@@ -470,7 +479,8 @@ float4 fragMain(PSInput input) : SV_TARGET
             v[1] = input.getScreenSpaceVertexAttribs(1);
             v[2] = input.getScreenSpaceVertexAttribs(2);
 
-            const float3 baryCoord = nbl::hlsl::spirv::BaryCoordKHR;
+            //const float3 baryCoord = nbl::hlsl::spirv::BaryCoordKHR;
+            const float3 baryCoord = calculateDTMTriangleBarycentrics(v[0], v[1], v[2], input.position.xy);
 
             // indices of points constructing every edge
             uint2 edgePoints[3];
@@ -483,8 +493,8 @@ float4 fragMain(PSInput input) : SV_TARGET
             opposingVertexIdx[0] = 2;
             opposingVertexIdx[1] = 0;
             opposingVertexIdx[2] = 1;
-        
-            float height = input.getHeight();
+
+            float height = baryCoord.x * v[0].z + baryCoord.y * v[1].z + baryCoord.z * v[2].z;
 
             // HEIGHT SHADING
             const uint32_t heightMapSize = dtm.heightColorEntryCount;
diff --git a/62_CAD/shaders/main_pipeline/vertex_shader.hlsl b/62_CAD/shaders/main_pipeline/vertex_shader.hlsl
index b300a6958..cef5fb4c2 100644
--- a/62_CAD/shaders/main_pipeline/vertex_shader.hlsl
+++ b/62_CAD/shaders/main_pipeline/vertex_shader.hlsl
@@ -102,20 +102,46 @@ PSInput main(uint vertexID : SV_VertexID)
         outV.setMainObjectIdx(pc.triangleMeshMainObjectIndex);
     
         TriangleMeshVertex vtx = vk::RawBufferLoad<TriangleMeshVertex>(pc.triangleMeshVerticesBaseAddress + sizeof(TriangleMeshVertex) * vertexID, 8u);
-        pfloat64_t2 vtxPos;
-        vtxPos.x = _static_cast<pfloat64_t>(vtx.pos.x);
-        vtxPos.y = _static_cast<pfloat64_t>(vtx.pos.y);
 
         MainObject mainObj = loadMainObject(pc.triangleMeshMainObjectIndex);
         clipProjectionData = getClipProjectionData(mainObj);
 
-        float2 transformedPos = transformPointScreenSpace(clipProjectionData.projectionToNDC, globals.resolution, vtxPos);
+        // assuming there are 3 * N vertices, number of vertices is equal to number of indices and indices are sequential starting from 0
+        float2 transformedOriginalPos;
+        float2 transformedDilatedPos;
+        {
+            uint32_t firstVertexOfCurrentTriangleIndex = vertexID - vertexID % 3;
+            uint32_t currentVertexWithinTriangleIndex = vertexID - firstVertexOfCurrentTriangleIndex;
+
+            TriangleMeshVertex triangleVertices[3];
+            triangleVertices[0] = vk::RawBufferLoad<TriangleMeshVertex>(pc.triangleMeshVerticesBaseAddress + sizeof(TriangleMeshVertex) * firstVertexOfCurrentTriangleIndex, 8u);
+            triangleVertices[1] = vk::RawBufferLoad<TriangleMeshVertex>(pc.triangleMeshVerticesBaseAddress + sizeof(TriangleMeshVertex) * (firstVertexOfCurrentTriangleIndex + 1), 8u);
+            triangleVertices[2] = vk::RawBufferLoad<TriangleMeshVertex>(pc.triangleMeshVerticesBaseAddress + sizeof(TriangleMeshVertex) * (firstVertexOfCurrentTriangleIndex + 2), 8u);
+            transformedOriginalPos = transformPointScreenSpace(clipProjectionData.projectionToNDC, globals.resolution, triangleVertices[currentVertexWithinTriangleIndex].pos);
+
+            pfloat64_t2 triangleCentroid;
+            triangleCentroid.x = (triangleVertices[0].pos.x + triangleVertices[1].pos.x + triangleVertices[2].pos.x) / _static_cast<pfloat64_t>(3.0f);
+            triangleCentroid.y = (triangleVertices[0].pos.y + triangleVertices[1].pos.y + triangleVertices[2].pos.y) / _static_cast<pfloat64_t>(3.0f);
+
+            // move triangles to local space, with centroid at (0, 0)
+            triangleVertices[0].pos = triangleVertices[0].pos - triangleCentroid;
+            triangleVertices[1].pos = triangleVertices[1].pos - triangleCentroid;
+            triangleVertices[2].pos = triangleVertices[2].pos - triangleCentroid;
+
+            // TODO: calculate dialation factor
+            pfloat64_t dialationFactor = _static_cast<pfloat64_t>(2.0f);
+            pfloat64_t2 dialatedVertex = triangleVertices[currentVertexWithinTriangleIndex].pos * dialationFactor;
+
+            dialatedVertex = dialatedVertex + triangleCentroid;
+
+            transformedDilatedPos = transformPointScreenSpace(clipProjectionData.projectionToNDC, globals.resolution, dialatedVertex);
+        }
 
-        outV.position.xy = transformedPos;
+        outV.position.xy = transformedDilatedPos;
         outV.position = transformFromSreenSpaceToNdc(outV.position.xy, globals.resolution);
         const float heightAsFloat = nbl::hlsl::_static_cast<float>(vtx.height);
         outV.setHeight(heightAsFloat);
-        outV.setScreenSpaceVertexAttribs(float3(transformedPos, heightAsFloat));
+        outV.setScreenSpaceVertexAttribs(float3(transformedOriginalPos, heightAsFloat));
         outV.setCurrentWorldToScreenRatio(
             _static_cast<float>((_static_cast<pfloat64_t>(2.0f) /
                 (clipProjectionData.projectionToNDC[0].x * _static_cast<pfloat64_t>(globals.resolution.x))))

From 6c907e49bce8d7bde928af6b521fa2d7cc280584 Mon Sep 17 00:00:00 2001
From: Przemek <minikers21@gmail.com>
Date: Mon, 14 Apr 2025 12:54:50 +0200
Subject: [PATCH 159/529] Added few todos

---
 .../main_pipeline/fragment_shader.hlsl        | 20 +++++++++----------
 .../shaders/main_pipeline/vertex_shader.hlsl  |  4 ++--
 2 files changed, 11 insertions(+), 13 deletions(-)

diff --git a/62_CAD/shaders/main_pipeline/fragment_shader.hlsl b/62_CAD/shaders/main_pipeline/fragment_shader.hlsl
index d19503ca8..bfb267a01 100644
--- a/62_CAD/shaders/main_pipeline/fragment_shader.hlsl
+++ b/62_CAD/shaders/main_pipeline/fragment_shader.hlsl
@@ -430,6 +430,7 @@ struct DTMHeightShadingAAInfo
 
 void calculateBetweenHeightShadingRegionsAntiAliasing(in DTMSettings dtm, in DTMHeightShadingAAInfo aaInfo, out float3 textureColor, out float localAlpha)
 {
+    //TODO: move outside
     float heightDeriv = fwidth(aaInfo.currentHeight);
 
     float pxDistanceToNearestSegment = abs(aaInfo.currentHeight - aaInfo.nearestSegmentHeight) / heightDeriv;
@@ -437,7 +438,7 @@ void calculateBetweenHeightShadingRegionsAntiAliasing(in DTMSettings dtm, in DTM
     float4 localHeightColor = lerp(aaInfo.nearestSegmentColor, aaInfo.currentSegmentColor, nearestSegmentColorCoverage);
 
     localAlpha *= localHeightColor.a;
-    textureColor = localHeightColor.rgb * localAlpha + textureColor * (1.0f - localAlpha);
+    textureColor = localHeightColor.rgb;
 }
 
 float3 calculateDTMTriangleBarycentrics(in float2 v1, in float2 v2, in float2 v3, in float2 p)
@@ -529,6 +530,7 @@ float4 fragMain(PSInput input) : SV_TARGET
                 convexPolygonSdf = max(convexPolygonSdf, line3Sdf);
                 convexPolygonSdf = max(convexPolygonSdf, line4Sdf);
 
+                // TODO: separate
                 localAlpha = 1.0f - smoothstep(0.0f, globals.antiAliasingFactor * 2.0f, convexPolygonSdf);
 
                 // calculate height color
@@ -577,13 +579,13 @@ float4 fragMain(PSInput input) : SV_TARGET
                     bool blendWithPrev;
                     if (upperBoundHeightIndex == 0)
                     {
-                        interpolationVal = 1.0f;
+                        interpolationVal = 1.0f; // TODO: investigate if it is correct
                         blendWithPrev = false;
                     }
                     else
                     {
                         interpolationVal = (heightTmp - lowerBoundHeight) / (upperBoundHeight - lowerBoundHeight);
-                        blendWithPrev = height - interval * sectionIndex < 0.5f;
+                        blendWithPrev = height - interval * sectionIndex < 0.5f; // TODO: investigate if it is correct
                     }
 
                     DTMHeightShadingAAInfo aaInfo;
@@ -603,8 +605,6 @@ float4 fragMain(PSInput input) : SV_TARGET
                 }
                 else if (mode == DTMSettings::E_HEIGHT_SHADING_MODE::CONTINOUS_INTERVALS)
                 {
-                    float heightTmp = height;
-
                     DTMSettingsHeightsAccessor dtmHeightsAccessor = { dtm };
                     uint32_t upperBoundHeightIndex = nbl::hlsl::upper_bound(dtmHeightsAccessor, 0, heightMapSize, height);
                     uint32_t lowerBoundHeightIndex = upperBoundHeightIndex == 0 ? upperBoundHeightIndex : upperBoundHeightIndex - 1;
@@ -619,7 +619,7 @@ float4 fragMain(PSInput input) : SV_TARGET
                     if (upperBoundHeightIndex == 0)
                         interpolationVal = 1.0f;
                     else
-                        interpolationVal = (heightTmp - lowerBoundHeight) / (upperBoundHeight - lowerBoundHeight);
+                        interpolationVal = (height - lowerBoundHeight) / (upperBoundHeight - lowerBoundHeight);
 
                     float4 localHeightColor = lerp(lowerBoundColor, upperBoundColor, interpolationVal);
 
@@ -648,7 +648,7 @@ float4 fragMain(PSInput input) : SV_TARGET
             // TODO: case where heights we are looking for are on all three vertices
             for (int i = 0; i < 3; ++i)
             {
-                if (contourLinePointsIdx == 3)
+                if (contourLinePointsIdx == 2)
                     break;
 
                 const uint2 currentEdgePoints = edgePoints[i];
@@ -694,8 +694,6 @@ float4 fragMain(PSInput input) : SV_TARGET
                 localAlpha = max(localAlpha, contourLocalAlpha);
             }
 
-        
-
             // OUTLINE
 
             // find sdf of every edge
@@ -719,8 +717,8 @@ float4 fragMain(PSInput input) : SV_TARGET
                 float3 B = v[currentEdgePoints[1]];
                 float3 AB = B - A;
                 float ABLen = length(AB);
-
-                distances[i] = (triangleAreaTimesTwo / ABLen) * baryCoord[opposingVertexIdx[i]];
+                float triangleHeightToOpositeVertex = triangleAreaTimesTwo / ABLen;
+                distances[i] = triangleHeightToOpositeVertex * baryCoord[opposingVertexIdx[i]];
             }
 
             float minDistance = nbl::hlsl::numeric_limits<float>::max;
diff --git a/62_CAD/shaders/main_pipeline/vertex_shader.hlsl b/62_CAD/shaders/main_pipeline/vertex_shader.hlsl
index cef5fb4c2..20c29f16a 100644
--- a/62_CAD/shaders/main_pipeline/vertex_shader.hlsl
+++ b/62_CAD/shaders/main_pipeline/vertex_shader.hlsl
@@ -137,8 +137,7 @@ PSInput main(uint vertexID : SV_VertexID)
             transformedDilatedPos = transformPointScreenSpace(clipProjectionData.projectionToNDC, globals.resolution, dialatedVertex);
         }
 
-        outV.position.xy = transformedDilatedPos;
-        outV.position = transformFromSreenSpaceToNdc(outV.position.xy, globals.resolution);
+        outV.position = transformFromSreenSpaceToNdc(transformedDilatedPos, globals.resolution);
         const float heightAsFloat = nbl::hlsl::_static_cast<float>(vtx.height);
         outV.setHeight(heightAsFloat);
         outV.setScreenSpaceVertexAttribs(float3(transformedOriginalPos, heightAsFloat));
@@ -151,6 +150,7 @@ PSInput main(uint vertexID : SV_VertexID)
         DTMSettings dtm = loadDTMSettings(mainObj.dtmSettingsIdx);
         LineStyle outlineStyle = loadLineStyle(dtm.outlineLineStyleIdx);
         LineStyle contourStyle = loadLineStyle(dtm.contourLineStyleIdx);
+        // TODO: maybe move to fragment shader since we may have multiple contour styles later
         const float screenSpaceOutlineWidth = outlineStyle.screenSpaceLineWidth + _static_cast<float>(_static_cast<pfloat64_t>(outlineStyle.worldSpaceLineWidth) * globals.screenToWorldRatio);
         const float sdfOutlineThickness = screenSpaceOutlineWidth * 0.5f;
         const float screenSpaceContourLineWidth = contourStyle.screenSpaceLineWidth + _static_cast<float>(_static_cast<pfloat64_t>(contourStyle.worldSpaceLineWidth) * globals.screenToWorldRatio);

From f173c71866259189779de0edc8bb209717bbf7b2 Mon Sep 17 00:00:00 2001
From: Przemek <minikers21@gmail.com>
Date: Mon, 14 Apr 2025 16:00:51 +0200
Subject: [PATCH 160/529] Refactor

---
 62_CAD/CTriangleMesh.h                        |   4 +
 62_CAD/DrawResourcesFiller.cpp                |   4 +
 62_CAD/main.cpp                               |   4 +
 62_CAD/shaders/globals.hlsl                   |   6 +-
 .../main_pipeline/fragment_shader.hlsl        | 620 ++++++++++--------
 5 files changed, 357 insertions(+), 281 deletions(-)

diff --git a/62_CAD/CTriangleMesh.h b/62_CAD/CTriangleMesh.h
index 6c68cec27..c1dcbca68 100644
--- a/62_CAD/CTriangleMesh.h
+++ b/62_CAD/CTriangleMesh.h
@@ -25,6 +25,10 @@ struct DTMSettingsInfo
 	float intervalWidth;
 	E_HEIGHT_SHADING_MODE heightShadingMode;
 
+	bool drawHeightsFlag;
+	bool drawContoursFlag;
+	bool drawOutlineFlag;
+
 	void addHeightColorMapEntry(float height, float32_t4 color)
 	{
 		heightColorSet.emplace(height, color);
diff --git a/62_CAD/DrawResourcesFiller.cpp b/62_CAD/DrawResourcesFiller.cpp
index d28843a31..c11b0a67f 100644
--- a/62_CAD/DrawResourcesFiller.cpp
+++ b/62_CAD/DrawResourcesFiller.cpp
@@ -653,6 +653,10 @@ uint32_t DrawResourcesFiller::addDTMSettings_Internal(const DTMSettingsInfo& dtm
 	}
 	_NBL_DEBUG_BREAK_IF(!dtmSettingsInfo.fillShaderDTMSettingsHeightColorMap(dtmSettings));
 
+	dtmSettings.drawHeightsFlag = static_cast<int>(dtmSettingsInfo.drawHeightsFlag);
+	dtmSettings.drawContoursFlag = static_cast<int>(dtmSettingsInfo.drawContoursFlag);
+	dtmSettings.drawOutlineFlag = static_cast<int>(dtmSettingsInfo.drawOutlineFlag);
+
 	for (uint32_t i = 0u; i < resourcesCollection.dtmSettings.vector.size(); ++i)
 	{
 		const DTMSettings& itr = resourcesCollection.dtmSettings.vector[i];
diff --git a/62_CAD/main.cpp b/62_CAD/main.cpp
index 3ad285c46..48ca4f5a3 100644
--- a/62_CAD/main.cpp
+++ b/62_CAD/main.cpp
@@ -3264,6 +3264,10 @@ class ComputerAidedDesign final : public examples::SimpleWindowedApplication, pu
 			std::array<double, 4> contourStipplePattern = { 0.0f, -5.0f, 10.0f, -5.0f };
 			dtmSettingsInfo.contourLineStyleInfo.setStipplePatternData(contourStipplePattern);
 
+			dtmSettingsInfo.drawHeightsFlag = true;
+			dtmSettingsInfo.drawContoursFlag = true;
+			dtmSettingsInfo.drawOutlineFlag = true;
+
 			// PRESS 1, 2, 3 TO SWITCH HEIGHT SHADING MODE
 			// 1 - DISCRETE_VARIABLE_LENGTH_INTERVALS
 			// 2 - DISCRETE_FIXED_LENGTH_INTERVALS
diff --git a/62_CAD/shaders/globals.hlsl b/62_CAD/shaders/globals.hlsl
index 24a833334..bf4c06db7 100644
--- a/62_CAD/shaders/globals.hlsl
+++ b/62_CAD/shaders/globals.hlsl
@@ -359,7 +359,11 @@ struct DTMSettings
     const static uint32_t HeightColorMapMaxEntries = 16u;
     uint32_t outlineLineStyleIdx; // index into line styles
     uint32_t contourLineStyleIdx; // index into line styles
-    
+
+    int drawHeightsFlag;
+    int drawContoursFlag;
+    int drawOutlineFlag;
+
     // contour lines
     float contourLinesStartHeight;
     float contourLinesEndHeight;
diff --git a/62_CAD/shaders/main_pipeline/fragment_shader.hlsl b/62_CAD/shaders/main_pipeline/fragment_shader.hlsl
index bfb267a01..be5e0bf07 100644
--- a/62_CAD/shaders/main_pipeline/fragment_shader.hlsl
+++ b/62_CAD/shaders/main_pipeline/fragment_shader.hlsl
@@ -428,17 +428,14 @@ struct DTMHeightShadingAAInfo
     float4 nearestSegmentColor;
 };
 
-void calculateBetweenHeightShadingRegionsAntiAliasing(in DTMSettings dtm, in DTMHeightShadingAAInfo aaInfo, out float3 textureColor, out float localAlpha)
+void calculateBetweenHeightShadingRegionsAntiAliasing(in DTMSettings dtm, in DTMHeightShadingAAInfo aaInfo, in float heightDeriv, out float4 outputColor)
 {
-    //TODO: move outside
-    float heightDeriv = fwidth(aaInfo.currentHeight);
-
     float pxDistanceToNearestSegment = abs(aaInfo.currentHeight - aaInfo.nearestSegmentHeight) / heightDeriv;
     float nearestSegmentColorCoverage = smoothstep(-globals.antiAliasingFactor, globals.antiAliasingFactor, pxDistanceToNearestSegment);
     float4 localHeightColor = lerp(aaInfo.nearestSegmentColor, aaInfo.currentSegmentColor, nearestSegmentColorCoverage);
 
-    localAlpha *= localHeightColor.a;
-    textureColor = localHeightColor.rgb;
+    outputColor.a *= localHeightColor.a;
+    outputColor.rgb = localHeightColor.rgb;
 }
 
 float3 calculateDTMTriangleBarycentrics(in float2 v1, in float2 v2, in float2 v3, in float2 p)
@@ -450,322 +447,385 @@ float3 calculateDTMTriangleBarycentrics(in float2 v1, in float2 v2, in float2 v3
     return float3(u, v, w);
 }
 
-[[vk::spvexecutionmode(spv::ExecutionModePixelInterlockOrderedEXT)]]
-[shader("pixel")]
-float4 fragMain(PSInput input) : SV_TARGET
+float4 calculateDTMHeightColor(in DTMSettings dtm, in float3 v[3], in float heightDeriv, in float2 fragPos, in float height)
 {
-    float localAlpha = 0.0f;
-    float3 textureColor = float3(0, 0, 0); // color sampled from a texture
+    float4 outputColor = float4(0.0f, 0.0f, 0.0f, 1.0f);
 
-    ObjectType objType = input.getObjType();
-    const uint32_t currentMainObjectIdx = input.getMainObjectIdx();
-    const MainObject mainObj = loadMainObject(currentMainObjectIdx);
-    
-    if (pc.isDTMRendering)
-    {   
-        // TRIANGLE RENDERING
+    // HEIGHT SHADING
+    const uint32_t heightMapSize = dtm.heightColorEntryCount;
+    float minShadingHeight = dtm.heightColorMapHeights[0];
+    float maxShadingHeight = dtm.heightColorMapHeights[heightMapSize - 1];
+
+    if (heightMapSize > 0)
+    {
+        // partially based on https://www.shadertoy.com/view/XsXSz4 by Inigo Quilez
+        float2 e0 = v[1] - v[0];
+        float2 e1 = v[2] - v[1];
+        float2 e2 = v[0] - v[2];
+
+        float triangleAreaSign = -sign(e0.x * e2.y - e0.y * e2.x);
+        float2 v0 = fragPos - v[0];
+        float2 v1 = fragPos - v[1];
+        float2 v2 = fragPos - v[2];
+
+        float distanceToLine0 = sqrt(dot2(v0 - e0 * dot(v0, e0) / dot(e0, e0)));
+        float distanceToLine1 = sqrt(dot2(v1 - e1 * dot(v1, e1) / dot(e1, e1)));
+        float distanceToLine2 = sqrt(dot2(v2 - e2 * dot(v2, e2) / dot(e2, e2)));
+
+        float line0Sdf = distanceToLine0 * triangleAreaSign * (v0.x * e0.y - v0.y * e0.x);
+        float line1Sdf = distanceToLine1 * triangleAreaSign * (v1.x * e1.y - v1.y * e1.x);
+        float line2Sdf = distanceToLine2 * triangleAreaSign * (v2.x * e2.y - v2.y * e2.x);
+        float line3Sdf = (minShadingHeight - height) / heightDeriv;
+        float line4Sdf = (height - maxShadingHeight) / heightDeriv;
+
+        float convexPolygonSdf = max(line0Sdf, line1Sdf);
+        convexPolygonSdf = max(convexPolygonSdf, line2Sdf);
+        convexPolygonSdf = max(convexPolygonSdf, line3Sdf);
+        convexPolygonSdf = max(convexPolygonSdf, line4Sdf);
+
+        // TODO: separate
+        outputColor.a = 1.0f - smoothstep(0.0f, globals.antiAliasingFactor * 2.0f, convexPolygonSdf);
+
+        // calculate height color
+        DTMSettings::E_HEIGHT_SHADING_MODE mode = dtm.determineHeightShadingMode();
+
+        if (mode == DTMSettings::E_HEIGHT_SHADING_MODE::DISCRETE_VARIABLE_LENGTH_INTERVALS)
         {
-            const float outlineThickness = input.getOutlineThickness();
-            const float contourThickness = input.getContourLineThickness();
-            const float phaseShift = 0.0f; // input.getCurrentPhaseShift();
-            const float stretch = 1.0f;
-            const float worldToScreenRatio = input.getCurrentWorldToScreenRatio();
+            DTMSettingsHeightsAccessor dtmHeightsAccessor = { dtm };
+            int upperBoundIndex = nbl::hlsl::upper_bound(dtmHeightsAccessor, 0, heightMapSize, height);
+            int mapIndex = max(upperBoundIndex - 1, 0);
+            int mapIndexPrev = max(mapIndex - 1, 0);
+            int mapIndexNext = min(mapIndex + 1, heightMapSize - 1);
+
+            // logic explainer: if colorIdx is 0.0 then it means blend with next
+            // if color idx is >= length of the colours array then it means it's also > 0.0 and this blend with prev is true
+            // if color idx is > 0 and < len - 1, then it depends on the current pixel's height value and two closest height values
+            bool blendWithPrev = (mapIndex > 0)
+                && (mapIndex >= heightMapSize - 1 || (height * 2.0 < dtm.heightColorMapHeights[upperBoundIndex] + dtm.heightColorMapHeights[mapIndex]));
+
+            DTMHeightShadingAAInfo aaInfo;
+            aaInfo.currentHeight = height;
+            aaInfo.currentSegmentColor = dtm.heightColorMapColors[mapIndex];
+            aaInfo.nearestSegmentHeight = blendWithPrev ? dtm.heightColorMapHeights[mapIndex] : dtm.heightColorMapHeights[mapIndexNext];
+            aaInfo.nearestSegmentColor = blendWithPrev ? dtm.heightColorMapColors[mapIndexPrev] : dtm.heightColorMapColors[mapIndexNext];
+
+            calculateBetweenHeightShadingRegionsAntiAliasing(dtm, aaInfo, heightDeriv, outputColor);
+        }
+        else if (mode == DTMSettings::E_HEIGHT_SHADING_MODE::DISCRETE_FIXED_LENGTH_INTERVALS)
+        {
+            float interval = dtm.intervalWidth;
+            float heightMinShadingHeightDiff = (height - minShadingHeight);
+            int sectionIndex = int(heightMinShadingHeightDiff / interval);
+            float heightTmp = minShadingHeight + float(sectionIndex) * interval;
 
-            DTMSettings dtm = loadDTMSettings(mainObj.dtmSettingsIdx);
-            LineStyle outlineStyle = loadLineStyle(dtm.outlineLineStyleIdx);
-            LineStyle contourStyle = loadLineStyle(dtm.contourLineStyleIdx);
+            DTMSettingsHeightsAccessor dtmHeightsAccessor = { dtm };
+            uint32_t upperBoundHeightIndex = nbl::hlsl::upper_bound(dtmHeightsAccessor, 0, heightMapSize, height);
+            uint32_t lowerBoundHeightIndex = max(upperBoundHeightIndex - 1, 0);
 
-            float3 v[3];
-            v[0] = input.getScreenSpaceVertexAttribs(0);
-            v[1] = input.getScreenSpaceVertexAttribs(1);
-            v[2] = input.getScreenSpaceVertexAttribs(2);
+            float upperBoundHeight = dtm.heightColorMapHeights[upperBoundHeightIndex];
+            float lowerBoundHeight = dtm.heightColorMapHeights[lowerBoundHeightIndex];
 
-            //const float3 baryCoord = nbl::hlsl::spirv::BaryCoordKHR;
-            const float3 baryCoord = calculateDTMTriangleBarycentrics(v[0], v[1], v[2], input.position.xy);
-
-            // indices of points constructing every edge
-            uint2 edgePoints[3];
-            edgePoints[0] = uint2(0, 1);
-            edgePoints[1] = uint2(1, 2);
-            edgePoints[2] = uint2(2, 0);
+            float4 upperBoundColor = dtm.heightColorMapColors[upperBoundHeightIndex];
+            float4 lowerBoundColor = dtm.heightColorMapColors[lowerBoundHeightIndex];
 
-            // index of vertex opposing an edge, needed for calculation of triangle heights
-            uint opposingVertexIdx[3];
-            opposingVertexIdx[0] = 2;
-            opposingVertexIdx[1] = 0;
-            opposingVertexIdx[2] = 1;
+            float interpolationVal;
+            bool blendWithPrev;
+            if (upperBoundHeightIndex == 0)
+            {
+                interpolationVal = 1.0f; // TODO: investigate if it is correct
+                blendWithPrev = false;
+            }
+            else
+            {
+                interpolationVal = (heightTmp - lowerBoundHeight) / (upperBoundHeight - lowerBoundHeight);
+                blendWithPrev = height - interval * sectionIndex < 0.5f; // TODO: investigate if it is correct
+            }
 
-            float height = baryCoord.x * v[0].z + baryCoord.y * v[1].z + baryCoord.z * v[2].z;
+            DTMHeightShadingAAInfo aaInfo;
+            aaInfo.currentHeight = height;
+            aaInfo.currentSegmentColor = lerp(lowerBoundColor, upperBoundColor, interpolationVal);
+            if (blendWithPrev)
+            {
+                aaInfo.nearestSegmentHeight = heightTmp;
+                aaInfo.nearestSegmentColor = lerp(lowerBoundColor, upperBoundColor, interpolationVal - 1.0f / interval);
+            }
+            else
+            {
+                aaInfo.nearestSegmentHeight = heightTmp + interval;
+                aaInfo.nearestSegmentColor = lerp(lowerBoundColor, upperBoundColor, interpolationVal + 1.0f / interval);
+            }
+            calculateBetweenHeightShadingRegionsAntiAliasing(dtm, aaInfo, heightDeriv, outputColor);
+        }
+        else if (mode == DTMSettings::E_HEIGHT_SHADING_MODE::CONTINOUS_INTERVALS)
+        {
+            DTMSettingsHeightsAccessor dtmHeightsAccessor = { dtm };
+            uint32_t upperBoundHeightIndex = nbl::hlsl::upper_bound(dtmHeightsAccessor, 0, heightMapSize, height);
+            uint32_t lowerBoundHeightIndex = upperBoundHeightIndex == 0 ? upperBoundHeightIndex : upperBoundHeightIndex - 1;
 
-            // HEIGHT SHADING
-            const uint32_t heightMapSize = dtm.heightColorEntryCount;
-            float minShadingHeight = dtm.heightColorMapHeights[0];
-            float maxShadingHeight = dtm.heightColorMapHeights[heightMapSize - 1];
+            float upperBoundHeight = dtm.heightColorMapHeights[upperBoundHeightIndex];
+            float lowerBoundHeight = dtm.heightColorMapHeights[lowerBoundHeightIndex];
 
-            if (heightMapSize > 0)
-            {
-                // partially based on https://www.shadertoy.com/view/XsXSz4 by Inigo Quilez
-                float2 e0 = v[1] - v[0];
-                float2 e1 = v[2] - v[1];
-                float2 e2 = v[0] - v[2];
-            
-                float triangleAreaSign = -sign(e0.x * e2.y - e0.y * e2.x);
-                float2 v0 = input.position.xy - v[0];
-                float2 v1 = input.position.xy - v[1];
-                float2 v2 = input.position.xy - v[2];
-
-                float distanceToLine0 = sqrt(dot2(v0 - e0 * dot(v0, e0) / dot(e0, e0)));
-                float distanceToLine1 = sqrt(dot2(v1 - e1 * dot(v1, e1) / dot(e1, e1)));
-                float distanceToLine2 = sqrt(dot2(v2 - e2 * dot(v2, e2) / dot(e2, e2)));
-
-                float line0Sdf = distanceToLine0 * triangleAreaSign * (v0.x * e0.y - v0.y * e0.x);
-                float line1Sdf = distanceToLine1 * triangleAreaSign * (v1.x * e1.y - v1.y * e1.x);
-                float line2Sdf = distanceToLine2 * triangleAreaSign * (v2.x * e2.y - v2.y * e2.x);
-                float heightDeriv = fwidth(height);
-                float line3Sdf = (minShadingHeight - height) / heightDeriv;
-                float line4Sdf = (height - maxShadingHeight) / heightDeriv;
-
-                float convexPolygonSdf = max(line0Sdf, line1Sdf);
-                convexPolygonSdf = max(convexPolygonSdf, line2Sdf);
-                convexPolygonSdf = max(convexPolygonSdf, line3Sdf);
-                convexPolygonSdf = max(convexPolygonSdf, line4Sdf);
-
-                // TODO: separate
-                localAlpha = 1.0f - smoothstep(0.0f, globals.antiAliasingFactor * 2.0f, convexPolygonSdf);
-
-                // calculate height color
-                DTMSettings::E_HEIGHT_SHADING_MODE mode = dtm.determineHeightShadingMode();
-
-                if(mode == DTMSettings::E_HEIGHT_SHADING_MODE::DISCRETE_VARIABLE_LENGTH_INTERVALS)
-                {
-                    DTMSettingsHeightsAccessor dtmHeightsAccessor = { dtm };
-                    int upperBoundIndex = nbl::hlsl::upper_bound(dtmHeightsAccessor, 0, heightMapSize, height);
-                    int mapIndex = max(upperBoundIndex - 1, 0);
-                    int mapIndexPrev = max(mapIndex - 1, 0);
-                    int mapIndexNext = min(mapIndex + 1, heightMapSize - 1);
-
-                    // logic explainer: if colorIdx is 0.0 then it means blend with next
-                    // if color idx is >= length of the colours array then it means it's also > 0.0 and this blend with prev is true
-                    // if color idx is > 0 and < len - 1, then it depends on the current pixel's height value and two closest height values
-                    bool blendWithPrev = (mapIndex > 0)
-                        && (mapIndex >= heightMapSize - 1 || (height * 2.0 < dtm.heightColorMapHeights[upperBoundIndex] + dtm.heightColorMapHeights[mapIndex]));
-
-                    DTMHeightShadingAAInfo aaInfo;
-                    aaInfo.currentHeight = height;
-                    aaInfo.currentSegmentColor = dtm.heightColorMapColors[mapIndex];
-                    aaInfo.nearestSegmentHeight = blendWithPrev ? dtm.heightColorMapHeights[mapIndex] : dtm.heightColorMapHeights[mapIndexNext];
-                    aaInfo.nearestSegmentColor = blendWithPrev ? dtm.heightColorMapColors[mapIndexPrev] : dtm.heightColorMapColors[mapIndexNext];
-
-                    calculateBetweenHeightShadingRegionsAntiAliasing(dtm, aaInfo, textureColor, localAlpha);
-                }
-                else if (mode == DTMSettings::E_HEIGHT_SHADING_MODE::DISCRETE_FIXED_LENGTH_INTERVALS)
-                {
-                    float interval = dtm.intervalWidth;
-                    float heightMinShadingHeightDiff = (height - minShadingHeight);
-                    int sectionIndex = int(heightMinShadingHeightDiff / interval);
-                    float heightTmp = minShadingHeight + float(sectionIndex) * interval;
+            float4 upperBoundColor = dtm.heightColorMapColors[upperBoundHeightIndex];
+            float4 lowerBoundColor = dtm.heightColorMapColors[lowerBoundHeightIndex];
 
-                    DTMSettingsHeightsAccessor dtmHeightsAccessor = { dtm };
-                    uint32_t upperBoundHeightIndex = nbl::hlsl::upper_bound(dtmHeightsAccessor, 0, heightMapSize, height);
-                    uint32_t lowerBoundHeightIndex = max(upperBoundHeightIndex - 1, 0);
+            float interpolationVal;
+            if (upperBoundHeightIndex == 0)
+                interpolationVal = 1.0f;
+            else
+                interpolationVal = (height - lowerBoundHeight) / (upperBoundHeight - lowerBoundHeight);
 
-                    float upperBoundHeight = dtm.heightColorMapHeights[upperBoundHeightIndex];
-                    float lowerBoundHeight = dtm.heightColorMapHeights[lowerBoundHeightIndex];
+            float4 localHeightColor = lerp(lowerBoundColor, upperBoundColor, interpolationVal);
 
-                    float4 upperBoundColor = dtm.heightColorMapColors[upperBoundHeightIndex];
-                    float4 lowerBoundColor = dtm.heightColorMapColors[lowerBoundHeightIndex];
+            outputColor.a *= localHeightColor.a;
+            outputColor.rgb = localHeightColor.rgb * outputColor.a + outputColor.rgb * (1.0f - outputColor.a);
+        }
+    }
 
-                    float interpolationVal;
-                    bool blendWithPrev;
-                    if (upperBoundHeightIndex == 0)
-                    {
-                        interpolationVal = 1.0f; // TODO: investigate if it is correct
-                        blendWithPrev = false;
-                    }
-                    else
-                    {
-                        interpolationVal = (heightTmp - lowerBoundHeight) / (upperBoundHeight - lowerBoundHeight);
-                        blendWithPrev = height - interval * sectionIndex < 0.5f; // TODO: investigate if it is correct
-                    }
+    return outputColor; 
+}
 
-                    DTMHeightShadingAAInfo aaInfo;
-                    aaInfo.currentHeight = height;
-                    aaInfo.currentSegmentColor = lerp(lowerBoundColor, upperBoundColor, interpolationVal);
-                    if (blendWithPrev)
-                    {
-                        aaInfo.nearestSegmentHeight = heightTmp;
-                        aaInfo.nearestSegmentColor = lerp(lowerBoundColor, upperBoundColor, interpolationVal - 1.0f / interval);
-                    }
-                    else
-                    {
-                        aaInfo.nearestSegmentHeight = heightTmp + interval;
-                        aaInfo.nearestSegmentColor = lerp(lowerBoundColor, upperBoundColor, interpolationVal + 1.0f / interval);
-                    }
-                    calculateBetweenHeightShadingRegionsAntiAliasing(dtm, aaInfo, textureColor, localAlpha);
-                }
-                else if (mode == DTMSettings::E_HEIGHT_SHADING_MODE::CONTINOUS_INTERVALS)
-                {
-                    DTMSettingsHeightsAccessor dtmHeightsAccessor = { dtm };
-                    uint32_t upperBoundHeightIndex = nbl::hlsl::upper_bound(dtmHeightsAccessor, 0, heightMapSize, height);
-                    uint32_t lowerBoundHeightIndex = upperBoundHeightIndex == 0 ? upperBoundHeightIndex : upperBoundHeightIndex - 1;
+float4 calculateDTMContourColor(in DTMSettings dtm, in float3 v[3], in uint2 edgePoints[3], in PSInput psInput, in float height)
+{
+    float4 outputColor;
+
+    LineStyle contourStyle = loadLineStyle(dtm.contourLineStyleIdx);
+    const float contourThickness = psInput.getContourLineThickness();
+    float stretch = 1.0f;
+    float phaseShift = 0.0f;
+    const float worldToScreenRatio = psInput.getCurrentWorldToScreenRatio();
+
+    // TODO: move to ubo or push constants
+    const float startHeight = dtm.contourLinesStartHeight;
+    const float endHeight = dtm.contourLinesEndHeight;
+    const float interval = dtm.contourLinesHeightInterval;
+
+    // TODO: can be precomputed
+    const int maxContourLineIdx = (endHeight - startHeight + 1) / interval;
+
+    // TODO: it actually can output a negative number, fix
+    int contourLineIdx = nbl::hlsl::_static_cast<int>((height - startHeight + (interval * 0.5f)) / interval);
+    contourLineIdx = clamp(contourLineIdx, 0, maxContourLineIdx);
+    float contourLineHeight = startHeight + interval * contourLineIdx;
+
+    int contourLinePointsIdx = 0;
+    float2 contourLinePoints[2];
+    // TODO: case where heights we are looking for are on all three vertices
+    for (int i = 0; i < 3; ++i)
+    {
+        if (contourLinePointsIdx == 2)
+            break;
 
-                    float upperBoundHeight = dtm.heightColorMapHeights[upperBoundHeightIndex];
-                    float lowerBoundHeight = dtm.heightColorMapHeights[lowerBoundHeightIndex];
-                
-                    float4 upperBoundColor = dtm.heightColorMapColors[upperBoundHeightIndex];
-                    float4 lowerBoundColor = dtm.heightColorMapColors[lowerBoundHeightIndex];
-                
-                    float interpolationVal;
-                    if (upperBoundHeightIndex == 0)
-                        interpolationVal = 1.0f;
-                    else
-                        interpolationVal = (height - lowerBoundHeight) / (upperBoundHeight - lowerBoundHeight);
+        const uint2 currentEdgePoints = edgePoints[i];
+        float3 p0 = v[currentEdgePoints[0]];
+        float3 p1 = v[currentEdgePoints[1]];
 
-                    float4 localHeightColor = lerp(lowerBoundColor, upperBoundColor, interpolationVal);
+        if (p1.z < p0.z)
+            nbl::hlsl::swap(p0, p1);
 
-                    localAlpha *= localHeightColor.a;
-                    textureColor = localHeightColor.rgb * localAlpha + textureColor * (1.0f - localAlpha);
-                }
-            }
+        float minHeight = p0.z;
+        float maxHeight = p1.z;
 
-            // CONTOUR
+        if (height >= minHeight && height <= maxHeight)
+        {
+            float2 edge = float2(p1.x, p1.y) - float2(p0.x, p0.y);
+            float scale = (contourLineHeight - minHeight) / (maxHeight - minHeight);
 
-            // TODO: move to ubo or push constants
-            const float startHeight = dtm.contourLinesStartHeight;
-            const float endHeight = dtm.contourLinesEndHeight;
-            const float interval = dtm.contourLinesHeightInterval;
+            contourLinePoints[contourLinePointsIdx] = scale * edge + float2(p0.x, p0.y);
+            ++contourLinePointsIdx;
+        }
+    }
 
-            // TODO: can be precomputed
-            const int maxContourLineIdx = (endHeight - startHeight + 1) / interval;
+    {
+        nbl::hlsl::shapes::Line<float> lineSegment = nbl::hlsl::shapes::Line<float>::construct(contourLinePoints[0], contourLinePoints[1]);
 
-            // TODO: it actually can output a negative number, fix
-            int contourLineIdx = nbl::hlsl::_static_cast<int>((height - startHeight + (interval * 0.5f)) / interval);
-            contourLineIdx = clamp(contourLineIdx, 0, maxContourLineIdx);
-            float contourLineHeight = startHeight + interval * contourLineIdx;
+        float distance = nbl::hlsl::numeric_limits<float>::max;
+        if (!contourStyle.hasStipples() || stretch == InvalidStyleStretchValue)
+        {
+            distance = ClippedSignedDistance< nbl::hlsl::shapes::Line<float> >::sdf(lineSegment, psInput.position.xy, contourThickness, contourStyle.isRoadStyleFlag);
+        }
+        else
+        {
+            // TODO:
+            // It might be beneficial to calculate distance between pixel and contour line to early out some pixels and save yourself from stipple sdf computations!
+            // where you only compute the complex sdf if abs((height - contourVal) / heightDeriv) <= aaFactor
+            nbl::hlsl::shapes::Line<float>::ArcLengthCalculator arcLenCalc = nbl::hlsl::shapes::Line<float>::ArcLengthCalculator::construct(lineSegment);
+            LineStyleClipper clipper = LineStyleClipper::construct(contourStyle, lineSegment, arcLenCalc, phaseShift, stretch, worldToScreenRatio);
+            distance = ClippedSignedDistance<nbl::hlsl::shapes::Line<float>, LineStyleClipper>::sdf(lineSegment, psInput.position.xy, contourThickness, contourStyle.isRoadStyleFlag, clipper);
+        }
+        
+        outputColor.a = smoothstep(+globals.antiAliasingFactor, -globals.antiAliasingFactor, distance) * contourStyle.color.a;
+        outputColor.rgb = contourStyle.color.rgb;
+    }
 
-            int contourLinePointsIdx = 0;
-            float2 contourLinePoints[2];
-            // TODO: case where heights we are looking for are on all three vertices
-            for (int i = 0; i < 3; ++i)
-            {
-                if (contourLinePointsIdx == 2)
-                    break;
+    return outputColor;
+}
 
-                const uint2 currentEdgePoints = edgePoints[i];
-                float3 p0 = v[currentEdgePoints[0]];
-                float3 p1 = v[currentEdgePoints[1]];
+float4 calculateDTMOutlineColor(in DTMSettings dtm, in float3 v[3], in uint2 edgePoints[3], in PSInput psInput, in float3 baryCoord, in float height)
+{
+    float4 outputColor;
+
+    LineStyle outlineStyle = loadLineStyle(dtm.outlineLineStyleIdx);
+    const float outlineThickness = psInput.getOutlineThickness();
+    const float phaseShift = 0.0f; // input.getCurrentPhaseShift();
+    const float worldToScreenRatio = psInput.getCurrentWorldToScreenRatio();
+    const float stretch = 1.0f;
+
+    // index of vertex opposing an edge, needed for calculation of triangle heights
+    uint opposingVertexIdx[3];
+    opposingVertexIdx[0] = 2;
+    opposingVertexIdx[1] = 0;
+    opposingVertexIdx[2] = 1;
+
+    // find sdf of every edge
+    float triangleAreaTimesTwo;
+    {
+        float3 AB = v[0] - v[1];
+        float3 AC = v[0] - v[2];
+        AB.z = 0.0f;
+        AC.z = 0.0f;
 
-                if (p1.z < p0.z)
-                    nbl::hlsl::swap(p0, p1);
+        // TODO: figure out if there is a faster solution
+        triangleAreaTimesTwo = length(cross(AB, AC));
+    }
 
-                float minHeight = p0.z;
-                float maxHeight = p1.z;
+    // calculate sdf of every edge as it wasn't stippled
+    float distances[3];
+    for (int i = 0; i < 3; ++i)
+    {
+        const uint2 currentEdgePoints = edgePoints[i];
+        float3 A = v[currentEdgePoints[0]];
+        float3 B = v[currentEdgePoints[1]];
+        float3 AB = B - A;
+        float ABLen = length(AB);
+        float triangleHeightToOpositeVertex = triangleAreaTimesTwo / ABLen;
+
+        distances[i] = triangleHeightToOpositeVertex * baryCoord[opposingVertexIdx[i]];
+    }
 
-                if (height >= minHeight && height <= maxHeight)
-                {
-                    float2 edge = float2(p1.x, p1.y) - float2(p0.x, p0.y);
-                    float scale = (contourLineHeight - minHeight) / (maxHeight - minHeight);
+    float minDistance = nbl::hlsl::numeric_limits<float>::max;
+    if (!outlineStyle.hasStipples() || stretch == InvalidStyleStretchValue)
+    {
+        for (uint i = 0; i < 3; ++i)
+            distances[i] -= outlineThickness;
 
-                    contourLinePoints[contourLinePointsIdx] = scale * edge + float2(p0.x, p0.y);
-                    ++contourLinePointsIdx;
-                }
-            }
+        minDistance = min(distances[0], min(distances[1], distances[2]));
+    }
+    else
+    {
+        for (int i = 0; i < 3; ++i)
+        {
+            if (distances[i] > outlineThickness)
+                continue;
 
-            {
-                nbl::hlsl::shapes::Line<float> lineSegment = nbl::hlsl::shapes::Line<float>::construct(contourLinePoints[0], contourLinePoints[1]);
+            const uint2 currentEdgePoints = edgePoints[i];
+            float3 p0 = v[currentEdgePoints[0]];
+            float3 p1 = v[currentEdgePoints[1]];
 
-                float distance = nbl::hlsl::numeric_limits<float>::max;
-                if (!contourStyle.hasStipples() || stretch == InvalidStyleStretchValue)
-                {
-                    distance = ClippedSignedDistance< nbl::hlsl::shapes::Line<float> >::sdf(lineSegment, input.position.xy, contourThickness, contourStyle.isRoadStyleFlag);
-                }
-                else
-                {
-                    // TODO:
-                    // It might be beneficial to calculate distance between pixel and contour line to early out some pixels and save yourself from stipple sdf computations!
-                    // where you only compute the complex sdf if abs((height - contourVal) / heightDeriv) <= aaFactor
-                    nbl::hlsl::shapes::Line<float>::ArcLengthCalculator arcLenCalc = nbl::hlsl::shapes::Line<float>::ArcLengthCalculator::construct(lineSegment);
-                    LineStyleClipper clipper = LineStyleClipper::construct(contourStyle, lineSegment, arcLenCalc, phaseShift, stretch, worldToScreenRatio);
-                    distance = ClippedSignedDistance<nbl::hlsl::shapes::Line<float>, LineStyleClipper>::sdf(lineSegment, input.position.xy, contourThickness, contourStyle.isRoadStyleFlag, clipper);
-                }
+            // long story short, in order for stipple patterns to be consistent:
+            // - point with lesser x coord should be starting point
+            // - if x coord of both points are equal then point with lesser y value should be starting point
+            if (p1.x < p0.x)
+                nbl::hlsl::swap(p0, p1);
+            else if (p1.x == p0.x && p1.y < p0.y)
+                nbl::hlsl::swap(p0, p1);
 
-                float contourLocalAlpha = smoothstep(+globals.antiAliasingFactor, -globals.antiAliasingFactor, distance) * contourStyle.color.a;
-                textureColor = lerp(textureColor, contourStyle.color.rgb, contourLocalAlpha);
-                localAlpha = max(localAlpha, contourLocalAlpha);
-            }
+            nbl::hlsl::shapes::Line<float> lineSegment = nbl::hlsl::shapes::Line<float>::construct(float2(p0.x, p0.y), float2(p1.x, p1.y));
 
-            // OUTLINE
+            float distance = nbl::hlsl::numeric_limits<float>::max;
+            nbl::hlsl::shapes::Line<float>::ArcLengthCalculator arcLenCalc = nbl::hlsl::shapes::Line<float>::ArcLengthCalculator::construct(lineSegment);
+            LineStyleClipper clipper = LineStyleClipper::construct(outlineStyle, lineSegment, arcLenCalc, phaseShift, stretch, worldToScreenRatio);
+            distance = ClippedSignedDistance<nbl::hlsl::shapes::Line<float>, LineStyleClipper>::sdf(lineSegment, psInput.position.xy, outlineThickness, outlineStyle.isRoadStyleFlag, clipper);
 
-            // find sdf of every edge
-            float triangleAreaTimesTwo;
-            {
-                float3 AB = v[0] - v[1];
-                float3 AC = v[0] - v[2];
-                AB.z = 0.0f;
-                AC.z = 0.0f;
+            minDistance = min(minDistance, distance);
+        }
 
-                // TODO: figure out if there is a faster solution
-                triangleAreaTimesTwo = length(cross(AB, AC));
-            }
+    }
 
-            // calculate sdf of every edge as it wasn't stippled
-            float distances[3];
-            for (int i = 0; i < 3; ++i)
-            {
-                const uint2 currentEdgePoints = edgePoints[i];
-                float3 A = v[currentEdgePoints[0]];
-                float3 B = v[currentEdgePoints[1]];
-                float3 AB = B - A;
-                float ABLen = length(AB);
-                float triangleHeightToOpositeVertex = triangleAreaTimesTwo / ABLen;
-                distances[i] = triangleHeightToOpositeVertex * baryCoord[opposingVertexIdx[i]];
-            }
+    outputColor.a = smoothstep(+globals.antiAliasingFactor, -globals.antiAliasingFactor, minDistance) * outlineStyle.color.a;
+    outputColor.rgb = outlineStyle.color.rgb;
 
-            float minDistance = nbl::hlsl::numeric_limits<float>::max;
-            if (!outlineStyle.hasStipples() || stretch == InvalidStyleStretchValue)
-            {
-                for (uint i = 0; i < 3; ++i)
-                    distances[i] -= outlineThickness;
+    return outputColor;
+}
 
-                minDistance = min(distances[0], min(distances[1], distances[2]));
-            }
-            else
-            {
-                for (int i = 0; i < 3; ++i)
-                {
-                    if (distances[i] > outlineThickness)
-                        continue;
-
-                    const uint2 currentEdgePoints = edgePoints[i];
-                    float3 p0 = v[currentEdgePoints[0]];
-                    float3 p1 = v[currentEdgePoints[1]];
-
-                    // long story short, in order for stipple patterns to be consistent:
-                    // - point with lesser x coord should be starting point
-                    // - if x coord of both points are equal then point with lesser y value should be starting point
-                    if (p1.x < p0.x)
-                        nbl::hlsl::swap(p0, p1);
-                    else if (p1.x == p0.x && p1.y < p0.y)
-                        nbl::hlsl::swap(p0, p1);
-
-                    nbl::hlsl::shapes::Line<float> lineSegment = nbl::hlsl::shapes::Line<float>::construct(float2(p0.x, p0.y), float2(p1.x, p1.y));
-                
-                    float distance = nbl::hlsl::numeric_limits<float>::max;
-                    nbl::hlsl::shapes::Line<float>::ArcLengthCalculator arcLenCalc = nbl::hlsl::shapes::Line<float>::ArcLengthCalculator::construct(lineSegment);
-                    LineStyleClipper clipper = LineStyleClipper::construct(outlineStyle, lineSegment, arcLenCalc, phaseShift, stretch, worldToScreenRatio);
-                    distance = ClippedSignedDistance<nbl::hlsl::shapes::Line<float>, LineStyleClipper>::sdf(lineSegment, input.position.xy, outlineThickness, outlineStyle.isRoadStyleFlag, clipper);
+struct DTMColorBlender
+{
+    void init()
+    {
+        colorCount = 0;
+    }
 
-                    minDistance = min(minDistance, distance);
-                }
+    void addColorOnTop(in float4 color)
+    {
+        colors[colorCount] = color;
+        colorCount++;
+    }
 
-            }
+    float4 blend()
+    {
+        if (colorCount == 0)
+            return float4(0.0f, 0.0f, 0.0f, 1.0f);
 
-            float outlineLocalAlpha = smoothstep(+globals.antiAliasingFactor, -globals.antiAliasingFactor, minDistance) * outlineStyle.color.a;
-            textureColor = lerp(textureColor, outlineStyle.color.rgb, outlineLocalAlpha);
-            localAlpha = max(localAlpha, outlineLocalAlpha);
+        float4 outputColor = colors[0];
+        for (int i = 1; i < colorCount; ++i)
+        {
+            outputColor.rgb = colors[i].rgb * colors[i].a + outputColor.rgb * outputColor.a * (1.0f - colors[i].a);
+            outputColor.a = colors[i].a + outputColor.a * (1.0f - colors[i].a);
         }
 
-        return calculateFinalColor<nbl::hlsl::jit::device_capabilities::fragmentShaderPixelInterlock>(uint2(input.position.xy), localAlpha, currentMainObjectIdx, textureColor, true);
+        return outputColor;
+    }
+
+    int colorCount;
+    float4 colors[3];
+};
+
+[[vk::spvexecutionmode(spv::ExecutionModePixelInterlockOrderedEXT)]]
+[shader("pixel")]
+float4 fragMain(PSInput input) : SV_TARGET
+{
+    float localAlpha = 0.0f;
+    float3 textureColor = float3(0, 0, 0); // color sampled from a texture
+
+    ObjectType objType = input.getObjType();
+    const uint32_t currentMainObjectIdx = input.getMainObjectIdx();
+    const MainObject mainObj = loadMainObject(currentMainObjectIdx);
+    
+    if (pc.isDTMRendering)
+    {   
+        // TRIANGLE RENDERING
+        {
+            DTMSettings dtm = loadDTMSettings(mainObj.dtmSettingsIdx);
+
+            float3 v[3];
+            v[0] = input.getScreenSpaceVertexAttribs(0);
+            v[1] = input.getScreenSpaceVertexAttribs(1);
+            v[2] = input.getScreenSpaceVertexAttribs(2);
+
+            // indices of points constructing every edge
+            uint2 edgePoints[3];
+            edgePoints[0] = uint2(0, 1);
+            edgePoints[1] = uint2(1, 2);
+            edgePoints[2] = uint2(2, 0);
+
+            const float3 baryCoord = calculateDTMTriangleBarycentrics(v[0], v[1], v[2], input.position.xy);
+            float height = baryCoord.x * v[0].z + baryCoord.y * v[1].z + baryCoord.z * v[2].z;
+            float heightDeriv = fwidth(height);
+
+            DTMColorBlender blender;
+            blender.init();
+            if(dtm.drawHeightsFlag)
+                blender.addColorOnTop(calculateDTMHeightColor(dtm, v, heightDeriv, input.position.xy, height));
+            if (dtm.drawContoursFlag)
+                blender.addColorOnTop(calculateDTMContourColor(dtm, v, edgePoints, input, height));
+            if (dtm.drawOutlineFlag)
+                blender.addColorOnTop(calculateDTMOutlineColor(dtm, v, edgePoints, input, baryCoord, height));
+            float4 dtmColor = blender.blend();
+
+            textureColor = dtmColor.rgb;
+            localAlpha = dtmColor.a;
+
+            return calculateFinalColor<nbl::hlsl::jit::device_capabilities::fragmentShaderPixelInterlock>(uint2(input.position.xy), localAlpha, currentMainObjectIdx, textureColor, true);
+        }
     }
     else
     {

From 1db627c52e32644bdf276bfcdec4afbf92cf16c8 Mon Sep 17 00:00:00 2001
From: Erfan Ahmadi <ahmadierfan99@gmail.com>
Date: Tue, 15 Apr 2025 12:46:56 +0330
Subject: [PATCH 161/529] Fix DISCRETE_FIXED_LENGTH_INTERVALS + Refactors

---
 62_CAD/CTriangleMesh.h                        |  31 +-
 62_CAD/DrawResourcesFiller.cpp                |  22 +-
 62_CAD/main.cpp                               |  46 +-
 62_CAD/shaders/globals.hlsl                   |  48 +-
 .../main_pipeline/fragment_shader.hlsl        | 618 ++++++++++--------
 5 files changed, 418 insertions(+), 347 deletions(-)

diff --git a/62_CAD/CTriangleMesh.h b/62_CAD/CTriangleMesh.h
index c1dcbca68..1860dedc9 100644
--- a/62_CAD/CTriangleMesh.h
+++ b/62_CAD/CTriangleMesh.h
@@ -8,26 +8,33 @@ using namespace nbl;
 
 struct DTMSettingsInfo
 {
-	enum E_HEIGHT_SHADING_MODE
-	{
-		DISCRETE_VARIABLE_LENGTH_INTERVALS,
-		DISCRETE_FIXED_LENGTH_INTERVALS,
-		CONTINOUS_INTERVALS
-	};
-
 	LineStyleInfo outlineLineStyleInfo;
 	LineStyleInfo contourLineStyleInfo;
 	
+	uint32_t mode; // E_DTM_MODE
+
 	float contourLinesStartHeight;
 	float contourLinesEndHeight;
 	float contourLinesHeightInterval;
-
-	float intervalWidth;
+	
+	// Height Shading Mode
 	E_HEIGHT_SHADING_MODE heightShadingMode;
 
-	bool drawHeightsFlag;
-	bool drawContoursFlag;
-	bool drawOutlineFlag;
+	// Used as fixed interval length for "DISCRETE_FIXED_LENGTH_INTERVALS" shading mode
+	float intervalLength;
+
+	// Converts an interval index to its corresponding height value
+	// For example, if this value is 10.0, then an interval index of 2 corresponds to a height of 20.0.
+	// This computed height is later used to determine the interpolated color for shading.
+	// It makes sense for this variable to be always equal to `intervalLength` but sometimes it's a different scaling so that last index corresponds to largestHeight
+	float intervalIndexToHeightMultiplier;
+	
+	// Used for "DISCRETE_FIXED_LENGTH_INTERVALS" shading mode
+	// If `isCenteredShading` is true, the intervals are centered around `minHeight`, meaning the
+	// first interval spans [minHeight - intervalLength / 2.0, minHeight + intervalLength / 2.0].
+	// Otherwise, intervals are aligned from `minHeight` upward, so the first interval spans
+	// [minHeight, minHeight + intervalLength].
+	bool isCenteredShading;
 
 	void addHeightColorMapEntry(float height, float32_t4 color)
 	{
diff --git a/62_CAD/DrawResourcesFiller.cpp b/62_CAD/DrawResourcesFiller.cpp
index c11b0a67f..ad2f160c8 100644
--- a/62_CAD/DrawResourcesFiller.cpp
+++ b/62_CAD/DrawResourcesFiller.cpp
@@ -632,6 +632,9 @@ uint32_t DrawResourcesFiller::addDTMSettings_Internal(const DTMSettingsInfo& dtm
 	// TODO: Maybe constraint by a max size? and return InvalidIdx if it would exceed
 
 	DTMSettings dtmSettings;
+
+	dtmSettings.mode = dtmSettingsInfo.mode;
+
 	dtmSettings.contourLinesStartHeight = dtmSettingsInfo.contourLinesStartHeight;
 	dtmSettings.contourLinesEndHeight = dtmSettingsInfo.contourLinesEndHeight;
 	dtmSettings.contourLinesHeightInterval = dtmSettingsInfo.contourLinesHeightInterval;
@@ -639,24 +642,23 @@ uint32_t DrawResourcesFiller::addDTMSettings_Internal(const DTMSettingsInfo& dtm
 	dtmSettings.outlineLineStyleIdx = addLineStyle_Internal(dtmSettingsInfo.outlineLineStyleInfo);
 	dtmSettings.contourLineStyleIdx = addLineStyle_Internal(dtmSettingsInfo.contourLineStyleInfo);
 
+
 	switch (dtmSettingsInfo.heightShadingMode)
 	{
-	case DTMSettingsInfo::E_HEIGHT_SHADING_MODE::DISCRETE_VARIABLE_LENGTH_INTERVALS:
-		dtmSettings.intervalWidth = std::numeric_limits<float>::infinity();
+	case E_HEIGHT_SHADING_MODE::DISCRETE_VARIABLE_LENGTH_INTERVALS:
+		dtmSettings.intervalLength = std::numeric_limits<float>::infinity();
 		break;
-	case DTMSettingsInfo::E_HEIGHT_SHADING_MODE::DISCRETE_FIXED_LENGTH_INTERVALS:
-		dtmSettings.intervalWidth = dtmSettingsInfo.intervalWidth;
+	case E_HEIGHT_SHADING_MODE::DISCRETE_FIXED_LENGTH_INTERVALS:
+		dtmSettings.intervalLength = dtmSettingsInfo.intervalLength;
 		break;
-	case DTMSettingsInfo::E_HEIGHT_SHADING_MODE::CONTINOUS_INTERVALS:
-		dtmSettings.intervalWidth = 0.0f;
+	case E_HEIGHT_SHADING_MODE::CONTINOUS_INTERVALS:
+		dtmSettings.intervalLength = 0.0f;
 		break;
 	}
+	dtmSettings.intervalIndexToHeightMultiplier = dtmSettingsInfo.intervalIndexToHeightMultiplier;
+	dtmSettings.isCenteredShading = static_cast<int>(dtmSettingsInfo.isCenteredShading);
 	_NBL_DEBUG_BREAK_IF(!dtmSettingsInfo.fillShaderDTMSettingsHeightColorMap(dtmSettings));
 
-	dtmSettings.drawHeightsFlag = static_cast<int>(dtmSettingsInfo.drawHeightsFlag);
-	dtmSettings.drawContoursFlag = static_cast<int>(dtmSettingsInfo.drawContoursFlag);
-	dtmSettings.drawOutlineFlag = static_cast<int>(dtmSettingsInfo.drawOutlineFlag);
-
 	for (uint32_t i = 0u; i < resourcesCollection.dtmSettings.vector.size(); ++i)
 	{
 		const DTMSettings& itr = resourcesCollection.dtmSettings.vector[i];
diff --git a/62_CAD/main.cpp b/62_CAD/main.cpp
index 48ca4f5a3..16532cba7 100644
--- a/62_CAD/main.cpp
+++ b/62_CAD/main.cpp
@@ -625,7 +625,7 @@ class ComputerAidedDesign final : public examples::SimpleWindowedApplication, pu
 	double m_timeElapsed = 0.0;
 	std::chrono::steady_clock::time_point lastTime;
 	uint32_t m_hatchDebugStep = 0u;
-	DTMSettingsInfo::E_HEIGHT_SHADING_MODE m_shadingModeExample = DTMSettingsInfo::E_HEIGHT_SHADING_MODE::DISCRETE_VARIABLE_LENGTH_INTERVALS;
+	E_HEIGHT_SHADING_MODE m_shadingModeExample = E_HEIGHT_SHADING_MODE::DISCRETE_VARIABLE_LENGTH_INTERVALS;
 
 	inline bool onAppInitialized(smart_refctd_ptr<ISystem>&& system) override
 	{
@@ -1073,15 +1073,15 @@ class ComputerAidedDesign final : public examples::SimpleWindowedApplication, pu
 					}
 					if (ev.action == nbl::ui::SKeyboardEvent::E_KEY_ACTION::ECA_PRESSED && ev.keyCode == nbl::ui::E_KEY_CODE::EKC_1)
 					{
-						m_shadingModeExample = DTMSettingsInfo::E_HEIGHT_SHADING_MODE::DISCRETE_VARIABLE_LENGTH_INTERVALS;
+						m_shadingModeExample = E_HEIGHT_SHADING_MODE::DISCRETE_VARIABLE_LENGTH_INTERVALS;
 					}
 					if (ev.action == nbl::ui::SKeyboardEvent::E_KEY_ACTION::ECA_PRESSED && ev.keyCode == nbl::ui::E_KEY_CODE::EKC_2)
 					{
-						m_shadingModeExample = DTMSettingsInfo::E_HEIGHT_SHADING_MODE::DISCRETE_FIXED_LENGTH_INTERVALS;
+						m_shadingModeExample = E_HEIGHT_SHADING_MODE::DISCRETE_FIXED_LENGTH_INTERVALS;
 					}
 					if (ev.action == nbl::ui::SKeyboardEvent::E_KEY_ACTION::ECA_PRESSED && ev.keyCode == nbl::ui::E_KEY_CODE::EKC_3)
 					{
-						m_shadingModeExample = DTMSettingsInfo::E_HEIGHT_SHADING_MODE::CONTINOUS_INTERVALS;
+						m_shadingModeExample = E_HEIGHT_SHADING_MODE::CONTINOUS_INTERVALS;
 					}
 				}
 			}
@@ -3246,6 +3246,9 @@ class ComputerAidedDesign final : public examples::SimpleWindowedApplication, pu
 			mesh.setIndices(std::move(indices));
 
 			DTMSettingsInfo dtmSettingsInfo;
+			
+			dtmSettingsInfo.mode = E_DTM_MODE::HEIGHT_SHADING;
+
 			dtmSettingsInfo.contourLinesStartHeight = 20;
 			dtmSettingsInfo.contourLinesEndHeight = 90;
 			dtmSettingsInfo.contourLinesHeightInterval = 10;
@@ -3264,9 +3267,6 @@ class ComputerAidedDesign final : public examples::SimpleWindowedApplication, pu
 			std::array<double, 4> contourStipplePattern = { 0.0f, -5.0f, 10.0f, -5.0f };
 			dtmSettingsInfo.contourLineStyleInfo.setStipplePatternData(contourStipplePattern);
 
-			dtmSettingsInfo.drawHeightsFlag = true;
-			dtmSettingsInfo.drawContoursFlag = true;
-			dtmSettingsInfo.drawOutlineFlag = true;
 
 			// PRESS 1, 2, 3 TO SWITCH HEIGHT SHADING MODE
 			// 1 - DISCRETE_VARIABLE_LENGTH_INTERVALS
@@ -3275,9 +3275,9 @@ class ComputerAidedDesign final : public examples::SimpleWindowedApplication, pu
 			float animatedAlpha = (std::cos(m_timeElapsed * 0.0005) + 1.0) * 0.5;
 			switch (m_shadingModeExample)
 			{
-				case DTMSettingsInfo::E_HEIGHT_SHADING_MODE::DISCRETE_VARIABLE_LENGTH_INTERVALS:
+				case E_HEIGHT_SHADING_MODE::DISCRETE_VARIABLE_LENGTH_INTERVALS:
 				{
-					dtmSettingsInfo.heightShadingMode = DTMSettingsInfo::E_HEIGHT_SHADING_MODE::DISCRETE_VARIABLE_LENGTH_INTERVALS;
+					dtmSettingsInfo.heightShadingMode = E_HEIGHT_SHADING_MODE::DISCRETE_VARIABLE_LENGTH_INTERVALS;
 					
 					dtmSettingsInfo.addHeightColorMapEntry(-10.0f, float32_t4(0.5f, 1.0f, 1.0f, animatedAlpha));
 					dtmSettingsInfo.addHeightColorMapEntry(20.0f, float32_t4(0.0f, 1.0f, 0.0f, 1.0f));
@@ -3286,21 +3286,27 @@ class ComputerAidedDesign final : public examples::SimpleWindowedApplication, pu
 					dtmSettingsInfo.addHeightColorMapEntry(90.0f, float32_t4(1.0f, 0.0f, 0.0f, 1.0f));
 					break;
 				}
-				case DTMSettingsInfo::E_HEIGHT_SHADING_MODE::DISCRETE_FIXED_LENGTH_INTERVALS:
+				case E_HEIGHT_SHADING_MODE::DISCRETE_FIXED_LENGTH_INTERVALS:
 				{
-					dtmSettingsInfo.intervalWidth = 8.0f;
-					dtmSettingsInfo.heightShadingMode = DTMSettingsInfo::E_HEIGHT_SHADING_MODE::DISCRETE_FIXED_LENGTH_INTERVALS;
-					dtmSettingsInfo.addHeightColorMapEntry(0.0f, float32_t4(0.0f, 1.0f, 0.0f, 1.0f));
-					dtmSettingsInfo.addHeightColorMapEntry(50.0f, float32_t4(1.0f, 1.0f, 0.0f, animatedAlpha));
+					dtmSettingsInfo.intervalLength = 10.0f;
+					dtmSettingsInfo.intervalIndexToHeightMultiplier = dtmSettingsInfo.intervalLength;
+					dtmSettingsInfo.isCenteredShading = false;
+					dtmSettingsInfo.heightShadingMode = E_HEIGHT_SHADING_MODE::DISCRETE_FIXED_LENGTH_INTERVALS;
+					dtmSettingsInfo.addHeightColorMapEntry(0.0f,   float32_t4(0.0f, 0.0f, 1.0f, 1.0f));
+					dtmSettingsInfo.addHeightColorMapEntry(25.0f,  float32_t4(0.0f, 1.0f, 1.0f, 1.0));
+					dtmSettingsInfo.addHeightColorMapEntry(50.0f,  float32_t4(0.0f, 1.0f, 0.0f, 1.0));
+					dtmSettingsInfo.addHeightColorMapEntry(75.0f,  float32_t4(1.0f, 1.0f, 0.0f, 1.0));
 					dtmSettingsInfo.addHeightColorMapEntry(100.0f, float32_t4(1.0f, 0.0f, 0.0f, 1.0f));
 					break;
 				}
-				case DTMSettingsInfo::E_HEIGHT_SHADING_MODE::CONTINOUS_INTERVALS:
+				case E_HEIGHT_SHADING_MODE::CONTINOUS_INTERVALS:
 				{
-					dtmSettingsInfo.heightShadingMode = DTMSettingsInfo::E_HEIGHT_SHADING_MODE::CONTINOUS_INTERVALS;
-					dtmSettingsInfo.addHeightColorMapEntry(-10.0f, float32_t4(0.0f, 1.0f, 0.0f, animatedAlpha));
-					dtmSettingsInfo.addHeightColorMapEntry(30.0f, float32_t4(1.0f, 1.0f, 0.0f, animatedAlpha));
-					dtmSettingsInfo.addHeightColorMapEntry(90.0f, float32_t4(1.0f, 0.0f, 0.0f, animatedAlpha));
+					dtmSettingsInfo.heightShadingMode = E_HEIGHT_SHADING_MODE::CONTINOUS_INTERVALS;
+					dtmSettingsInfo.addHeightColorMapEntry(0.0f,   float32_t4(0.0f, 0.0f, 1.0f, 1.0f));
+					dtmSettingsInfo.addHeightColorMapEntry(25.0f,  float32_t4(0.0f, 1.0f, 1.0f, 1.0));
+					dtmSettingsInfo.addHeightColorMapEntry(50.0f,  float32_t4(0.0f, 1.0f, 0.0f, 1.0));
+					dtmSettingsInfo.addHeightColorMapEntry(75.0f,  float32_t4(1.0f, 1.0f, 0.0f, 1.0));
+					dtmSettingsInfo.addHeightColorMapEntry(100.0f, float32_t4(1.0f, 0.0f, 0.0f, 1.0f));
 					break;
 				}
 			}
@@ -3311,7 +3317,7 @@ class ComputerAidedDesign final : public examples::SimpleWindowedApplication, pu
 			dtmSettingsInfo.outlineLineStyleInfo.color = float32_t4(0.0f, 0.39f, 1.0f, 1.0f);
 			for (auto& v : mesh.m_vertices)
 			{
-				v.pos += float64_t2(400.0, 200.0);
+				v.pos += float64_t2(450.0, 200.0);
 				v.height -= 10.0;
 			}
 
diff --git a/62_CAD/shaders/globals.hlsl b/62_CAD/shaders/globals.hlsl
index bf4c06db7..a83acb094 100644
--- a/62_CAD/shaders/globals.hlsl
+++ b/62_CAD/shaders/globals.hlsl
@@ -354,15 +354,28 @@ struct LineStyle
     }
 };
 
+enum E_DTM_MODE
+{
+    OUTLINE         = 1 << 0,
+    CONTOUR         = 1 << 1,
+    HEIGHT_SHADING  = 1 << 2,
+};
+
+enum class E_HEIGHT_SHADING_MODE : uint32_t
+{
+    DISCRETE_VARIABLE_LENGTH_INTERVALS,
+    DISCRETE_FIXED_LENGTH_INTERVALS,
+    CONTINOUS_INTERVALS
+};
+    
+// Documentation and explanation of variables in DTMSettingsInfo
 struct DTMSettings
 {
     const static uint32_t HeightColorMapMaxEntries = 16u;
     uint32_t outlineLineStyleIdx; // index into line styles
     uint32_t contourLineStyleIdx; // index into line styles
 
-    int drawHeightsFlag;
-    int drawContoursFlag;
-    int drawOutlineFlag;
+    uint32_t mode; // E_DTM_MODE
 
     // contour lines
     float contourLinesStartHeight;
@@ -370,27 +383,26 @@ struct DTMSettings
     float contourLinesHeightInterval;
 
     // height-color map
-    float intervalWidth;
+    float intervalLength;
+	float intervalIndexToHeightMultiplier;
+    int isCenteredShading;
+    
     uint32_t heightColorEntryCount;
     float heightColorMapHeights[HeightColorMapMaxEntries];
     float32_t4 heightColorMapColors[HeightColorMapMaxEntries];
-
-    enum E_HEIGHT_SHADING_MODE
-    {
-        DISCRETE_VARIABLE_LENGTH_INTERVALS,
-        DISCRETE_FIXED_LENGTH_INTERVALS,
-        CONTINOUS_INTERVALS
-    };
-
+    
     E_HEIGHT_SHADING_MODE determineHeightShadingMode()
     {
-        if (nbl::hlsl::isinf(intervalWidth))
-            return DISCRETE_VARIABLE_LENGTH_INTERVALS;
-        if (intervalWidth == 0.0f)
-            return CONTINOUS_INTERVALS;
-
-        return DISCRETE_FIXED_LENGTH_INTERVALS;
+        if (nbl::hlsl::isinf(intervalLength))
+            return E_HEIGHT_SHADING_MODE::DISCRETE_VARIABLE_LENGTH_INTERVALS;
+        if (intervalLength == 0.0f)
+            return E_HEIGHT_SHADING_MODE::CONTINOUS_INTERVALS;
+        return E_HEIGHT_SHADING_MODE::DISCRETE_FIXED_LENGTH_INTERVALS;
     }
+    
+    bool drawOutlineEnabled() { return  (mode & E_DTM_MODE::OUTLINE) != 0u; } 
+    bool drawContourEnabled() { return (mode & E_DTM_MODE::CONTOUR) != 0u; } 
+    bool drawHeightShadingEnabled() { return (mode & E_DTM_MODE::HEIGHT_SHADING) != 0u; } 
 };
 
 #ifndef __HLSL_VERSION
diff --git a/62_CAD/shaders/main_pipeline/fragment_shader.hlsl b/62_CAD/shaders/main_pipeline/fragment_shader.hlsl
index be5e0bf07..dc5262568 100644
--- a/62_CAD/shaders/main_pipeline/fragment_shader.hlsl
+++ b/62_CAD/shaders/main_pipeline/fragment_shader.hlsl
@@ -420,22 +420,80 @@ float dot2(in float2 vec)
     return dot(vec, vec);
 }
 
-struct DTMHeightShadingAAInfo
+
+// TODO: Later move these functions and structs to dtmSettings.hlsl and a namespace like dtmSettings::height_shading or dtmSettings::contours, etc..
+
+struct HeightSegmentTransitionData
 {
     float currentHeight;
     float4 currentSegmentColor;
-    float nearestSegmentHeight;
-    float4 nearestSegmentColor;
+    float boundaryHeight;
+    float4 otherSegmentColor;
 };
 
-void calculateBetweenHeightShadingRegionsAntiAliasing(in DTMSettings dtm, in DTMHeightShadingAAInfo aaInfo, in float heightDeriv, out float4 outputColor)
+// NOTE[Erfan to Przemek][REMOVE WHEN READ]: I renamed to `smoothHeightSegmentTransition` and made it return value instead of take `out` param + removed applying it to final output color (it's responsibility of the caller now)
+// Now the resposibility of this  function is just to "Figure out what the interpolated color at the transition is." and doesn't assume how it's gonna be applied to the final color
+// that's more predictible and atomic. Additionally I think `out` functions make the code a little bit more unreadable as well
+
+// This function interpolates between the current and nearest segment colors based on the
+// screen-space distance to the segment boundary. The result is a smoothly blended color
+// useful for visualizing discrete height levels without harsh edges.
+float4 smoothHeightSegmentTransition(in HeightSegmentTransitionData transitionInfo, in float heightDeriv)
 {
-    float pxDistanceToNearestSegment = abs(aaInfo.currentHeight - aaInfo.nearestSegmentHeight) / heightDeriv;
+    float pxDistanceToNearestSegment = abs((transitionInfo.currentHeight - transitionInfo.boundaryHeight) / heightDeriv);
     float nearestSegmentColorCoverage = smoothstep(-globals.antiAliasingFactor, globals.antiAliasingFactor, pxDistanceToNearestSegment);
-    float4 localHeightColor = lerp(aaInfo.nearestSegmentColor, aaInfo.currentSegmentColor, nearestSegmentColorCoverage);
+    float4 localHeightColor = lerp(transitionInfo.otherSegmentColor, transitionInfo.currentSegmentColor, nearestSegmentColorCoverage);
+    return localHeightColor;
+}
 
-    outputColor.a *= localHeightColor.a;
-    outputColor.rgb = localHeightColor.rgb;
+// Computes the continuous position of a height value within uniform intervals.
+// flooring this value will give the interval index
+//
+// If `isCenteredShading` is true, the intervals are centered around `minHeight`, meaning the
+// first interval spans [minHeight - intervalLength / 2.0, minHeight + intervalLength / 2.0].
+// Otherwise, intervals are aligned from `minHeight` upward, so the first interval spans
+// [minHeight, minHeight + intervalLength].
+//
+// Parameters:
+// - height: The height value to classify.
+// - minHeight: The reference starting height for interval calculation.
+// - intervalLength: The length of each interval segment.
+// - isCenteredShading: Whether to center the shading intervals around minHeight.
+//
+// Returns:
+// - A float representing the continuous position within the interval grid.
+float getIntervalPosition(in float height, in float minHeight, in float intervalLength, in bool isCenteredShading)
+{
+    if (isCenteredShading)
+        return ( (height - minHeight) / intervalLength + 0.5f);
+    else
+        return ( (height - minHeight) / intervalLength );
+}
+
+void getIntervalHeightAndColor(in int intervalIndex, in DTMSettings dtmSettings, out float4 outIntervalColor, out float outIntervalHeight)
+{
+    float minShadingHeight = dtmSettings.heightColorMapHeights[0];
+    outIntervalHeight = minShadingHeight + float(intervalIndex) * dtmSettings.intervalIndexToHeightMultiplier;
+
+    DTMSettingsHeightsAccessor dtmHeightsAccessor = { dtmSettings };
+    uint32_t upperBoundHeightIndex = min(nbl::hlsl::upper_bound(dtmHeightsAccessor, 0, dtmSettings.heightColorEntryCount, outIntervalHeight), dtmSettings.heightColorEntryCount-1u);
+    uint32_t lowerBoundHeightIndex = max(upperBoundHeightIndex - 1, 0);
+
+    float upperBoundHeight = dtmSettings.heightColorMapHeights[upperBoundHeightIndex];
+    float lowerBoundHeight = dtmSettings.heightColorMapHeights[lowerBoundHeightIndex];
+
+    float4 upperBoundColor = dtmSettings.heightColorMapColors[upperBoundHeightIndex];
+    float4 lowerBoundColor = dtmSettings.heightColorMapColors[lowerBoundHeightIndex];
+    
+    if (upperBoundHeight == lowerBoundHeight)
+    {
+        outIntervalColor = upperBoundColor;
+    }
+    else
+    {
+        float interpolationVal = (outIntervalHeight - lowerBoundHeight) / (upperBoundHeight - lowerBoundHeight);
+        outIntervalColor = lerp(lowerBoundColor, upperBoundColor, interpolationVal);
+    }
 }
 
 float3 calculateDTMTriangleBarycentrics(in float2 v1, in float2 v2, in float2 v3, in float2 p)
@@ -447,14 +505,14 @@ float3 calculateDTMTriangleBarycentrics(in float2 v1, in float2 v2, in float2 v3
     return float3(u, v, w);
 }
 
-float4 calculateDTMHeightColor(in DTMSettings dtm, in float3 v[3], in float heightDeriv, in float2 fragPos, in float height)
+float4 calculateDTMHeightColor(in DTMSettings dtmSettings, in float3 v[3], in float heightDeriv, in float2 fragPos, in float height)
 {
     float4 outputColor = float4(0.0f, 0.0f, 0.0f, 1.0f);
 
     // HEIGHT SHADING
-    const uint32_t heightMapSize = dtm.heightColorEntryCount;
-    float minShadingHeight = dtm.heightColorMapHeights[0];
-    float maxShadingHeight = dtm.heightColorMapHeights[heightMapSize - 1];
+    const uint32_t heightMapSize = dtmSettings.heightColorEntryCount;
+    float minShadingHeight = dtmSettings.heightColorMapHeights[0];
+    float maxShadingHeight = dtmSettings.heightColorMapHeights[heightMapSize - 1];
 
     if (heightMapSize > 0)
     {
@@ -487,11 +545,11 @@ float4 calculateDTMHeightColor(in DTMSettings dtm, in float3 v[3], in float heig
         outputColor.a = 1.0f - smoothstep(0.0f, globals.antiAliasingFactor * 2.0f, convexPolygonSdf);
 
         // calculate height color
-        DTMSettings::E_HEIGHT_SHADING_MODE mode = dtm.determineHeightShadingMode();
+        E_HEIGHT_SHADING_MODE mode = dtmSettings.determineHeightShadingMode();
 
-        if (mode == DTMSettings::E_HEIGHT_SHADING_MODE::DISCRETE_VARIABLE_LENGTH_INTERVALS)
+        if (mode == E_HEIGHT_SHADING_MODE::DISCRETE_VARIABLE_LENGTH_INTERVALS)
         {
-            DTMSettingsHeightsAccessor dtmHeightsAccessor = { dtm };
+            DTMSettingsHeightsAccessor dtmHeightsAccessor = { dtmSettings };
             int upperBoundIndex = nbl::hlsl::upper_bound(dtmHeightsAccessor, 0, heightMapSize, height);
             int mapIndex = max(upperBoundIndex - 1, 0);
             int mapIndexPrev = max(mapIndex - 1, 0);
@@ -501,72 +559,61 @@ float4 calculateDTMHeightColor(in DTMSettings dtm, in float3 v[3], in float heig
             // if color idx is >= length of the colours array then it means it's also > 0.0 and this blend with prev is true
             // if color idx is > 0 and < len - 1, then it depends on the current pixel's height value and two closest height values
             bool blendWithPrev = (mapIndex > 0)
-                && (mapIndex >= heightMapSize - 1 || (height * 2.0 < dtm.heightColorMapHeights[upperBoundIndex] + dtm.heightColorMapHeights[mapIndex]));
+                && (mapIndex >= heightMapSize - 1 || (height * 2.0 < dtmSettings.heightColorMapHeights[upperBoundIndex] + dtmSettings.heightColorMapHeights[mapIndex]));
 
-            DTMHeightShadingAAInfo aaInfo;
-            aaInfo.currentHeight = height;
-            aaInfo.currentSegmentColor = dtm.heightColorMapColors[mapIndex];
-            aaInfo.nearestSegmentHeight = blendWithPrev ? dtm.heightColorMapHeights[mapIndex] : dtm.heightColorMapHeights[mapIndexNext];
-            aaInfo.nearestSegmentColor = blendWithPrev ? dtm.heightColorMapColors[mapIndexPrev] : dtm.heightColorMapColors[mapIndexNext];
+            HeightSegmentTransitionData transitionInfo;
+            transitionInfo.currentHeight = height;
+            transitionInfo.currentSegmentColor = dtmSettings.heightColorMapColors[mapIndex];
+            transitionInfo.boundaryHeight = blendWithPrev ? dtmSettings.heightColorMapHeights[mapIndex] : dtmSettings.heightColorMapHeights[mapIndexNext];
+            transitionInfo.otherSegmentColor = blendWithPrev ? dtmSettings.heightColorMapColors[mapIndexPrev] : dtmSettings.heightColorMapColors[mapIndexNext];
 
-            calculateBetweenHeightShadingRegionsAntiAliasing(dtm, aaInfo, heightDeriv, outputColor);
+            float4 localHeightColor = smoothHeightSegmentTransition(transitionInfo, heightDeriv);
+            outputColor.rgb = localHeightColor.rgb;
+            outputColor.a *= localHeightColor.a;
         }
-        else if (mode == DTMSettings::E_HEIGHT_SHADING_MODE::DISCRETE_FIXED_LENGTH_INTERVALS)
+        else if (mode == E_HEIGHT_SHADING_MODE::DISCRETE_FIXED_LENGTH_INTERVALS)
         {
-            float interval = dtm.intervalWidth;
-            float heightMinShadingHeightDiff = (height - minShadingHeight);
-            int sectionIndex = int(heightMinShadingHeightDiff / interval);
-            float heightTmp = minShadingHeight + float(sectionIndex) * interval;
-
-            DTMSettingsHeightsAccessor dtmHeightsAccessor = { dtm };
-            uint32_t upperBoundHeightIndex = nbl::hlsl::upper_bound(dtmHeightsAccessor, 0, heightMapSize, height);
-            uint32_t lowerBoundHeightIndex = max(upperBoundHeightIndex - 1, 0);
-
-            float upperBoundHeight = dtm.heightColorMapHeights[upperBoundHeightIndex];
-            float lowerBoundHeight = dtm.heightColorMapHeights[lowerBoundHeightIndex];
-
-            float4 upperBoundColor = dtm.heightColorMapColors[upperBoundHeightIndex];
-            float4 lowerBoundColor = dtm.heightColorMapColors[lowerBoundHeightIndex];
-
-            float interpolationVal;
-            bool blendWithPrev;
-            if (upperBoundHeightIndex == 0)
-            {
-                interpolationVal = 1.0f; // TODO: investigate if it is correct
-                blendWithPrev = false;
-            }
-            else
-            {
-                interpolationVal = (heightTmp - lowerBoundHeight) / (upperBoundHeight - lowerBoundHeight);
-                blendWithPrev = height - interval * sectionIndex < 0.5f; // TODO: investigate if it is correct
-            }
+            float intervalPosition = getIntervalPosition(height, minShadingHeight, dtmSettings.intervalLength, dtmSettings.isCenteredShading);
+            float positionWithinInterval = frac(intervalPosition);
+            int intervalIndex = nbl::hlsl::_static_cast<int>(intervalPosition);
 
-            DTMHeightShadingAAInfo aaInfo;
-            aaInfo.currentHeight = height;
-            aaInfo.currentSegmentColor = lerp(lowerBoundColor, upperBoundColor, interpolationVal);
+            float4 currentIntervalColor;
+            float currentIntervalHeight;
+            getIntervalHeightAndColor(intervalIndex, dtmSettings, currentIntervalColor, currentIntervalHeight);
+            
+            bool blendWithPrev = (positionWithinInterval < 0.5f);
+            
+            HeightSegmentTransitionData transitionInfo;
+            transitionInfo.currentHeight = height;
+            transitionInfo.currentSegmentColor = currentIntervalColor;
             if (blendWithPrev)
             {
-                aaInfo.nearestSegmentHeight = heightTmp;
-                aaInfo.nearestSegmentColor = lerp(lowerBoundColor, upperBoundColor, interpolationVal - 1.0f / interval);
+                int prevIntervalIdx = max(intervalIndex - 1, 0);
+                float prevIntervalHeight; // unused, the currentIntervalHeight is the boundary height between current and prev
+                getIntervalHeightAndColor(prevIntervalIdx, dtmSettings, transitionInfo.otherSegmentColor, prevIntervalHeight);
+                transitionInfo.boundaryHeight = currentIntervalHeight;
             }
             else
             {
-                aaInfo.nearestSegmentHeight = heightTmp + interval;
-                aaInfo.nearestSegmentColor = lerp(lowerBoundColor, upperBoundColor, interpolationVal + 1.0f / interval);
+                int nextIntervalIdx = intervalIndex + 1;
+                getIntervalHeightAndColor(nextIntervalIdx, dtmSettings, transitionInfo.otherSegmentColor, transitionInfo.boundaryHeight);
             }
-            calculateBetweenHeightShadingRegionsAntiAliasing(dtm, aaInfo, heightDeriv, outputColor);
+            
+            float4 localHeightColor = smoothHeightSegmentTransition(transitionInfo, heightDeriv);
+            outputColor.rgb = localHeightColor.rgb;
+            outputColor.a *= localHeightColor.a;
         }
-        else if (mode == DTMSettings::E_HEIGHT_SHADING_MODE::CONTINOUS_INTERVALS)
+        else if (mode == E_HEIGHT_SHADING_MODE::CONTINOUS_INTERVALS)
         {
-            DTMSettingsHeightsAccessor dtmHeightsAccessor = { dtm };
+            DTMSettingsHeightsAccessor dtmHeightsAccessor = { dtmSettings };
             uint32_t upperBoundHeightIndex = nbl::hlsl::upper_bound(dtmHeightsAccessor, 0, heightMapSize, height);
             uint32_t lowerBoundHeightIndex = upperBoundHeightIndex == 0 ? upperBoundHeightIndex : upperBoundHeightIndex - 1;
 
-            float upperBoundHeight = dtm.heightColorMapHeights[upperBoundHeightIndex];
-            float lowerBoundHeight = dtm.heightColorMapHeights[lowerBoundHeightIndex];
+            float upperBoundHeight = dtmSettings.heightColorMapHeights[upperBoundHeightIndex];
+            float lowerBoundHeight = dtmSettings.heightColorMapHeights[lowerBoundHeightIndex];
 
-            float4 upperBoundColor = dtm.heightColorMapColors[upperBoundHeightIndex];
-            float4 lowerBoundColor = dtm.heightColorMapColors[lowerBoundHeightIndex];
+            float4 upperBoundColor = dtmSettings.heightColorMapColors[upperBoundHeightIndex];
+            float4 lowerBoundColor = dtmSettings.heightColorMapColors[lowerBoundHeightIndex];
 
             float interpolationVal;
             if (upperBoundHeightIndex == 0)
@@ -584,20 +631,20 @@ float4 calculateDTMHeightColor(in DTMSettings dtm, in float3 v[3], in float heig
     return outputColor; 
 }
 
-float4 calculateDTMContourColor(in DTMSettings dtm, in float3 v[3], in uint2 edgePoints[3], in PSInput psInput, in float height)
+float4 calculateDTMContourColor(in DTMSettings dtmSettings, in float3 v[3], in uint2 edgePoints[3], in PSInput psInput, in float height)
 {
     float4 outputColor;
 
-    LineStyle contourStyle = loadLineStyle(dtm.contourLineStyleIdx);
+    LineStyle contourStyle = loadLineStyle(dtmSettings.contourLineStyleIdx);
     const float contourThickness = psInput.getContourLineThickness();
     float stretch = 1.0f;
     float phaseShift = 0.0f;
     const float worldToScreenRatio = psInput.getCurrentWorldToScreenRatio();
 
     // TODO: move to ubo or push constants
-    const float startHeight = dtm.contourLinesStartHeight;
-    const float endHeight = dtm.contourLinesEndHeight;
-    const float interval = dtm.contourLinesHeightInterval;
+    const float startHeight = dtmSettings.contourLinesStartHeight;
+    const float endHeight = dtmSettings.contourLinesEndHeight;
+    const float interval = dtmSettings.contourLinesHeightInterval;
 
     // TODO: can be precomputed
     const int maxContourLineIdx = (endHeight - startHeight + 1) / interval;
@@ -660,11 +707,11 @@ float4 calculateDTMContourColor(in DTMSettings dtm, in float3 v[3], in uint2 edg
     return outputColor;
 }
 
-float4 calculateDTMOutlineColor(in DTMSettings dtm, in float3 v[3], in uint2 edgePoints[3], in PSInput psInput, in float3 baryCoord, in float height)
+float4 calculateDTMOutlineColor(in DTMSettings dtmSettings, in float3 v[3], in uint2 edgePoints[3], in PSInput psInput, in float3 baryCoord, in float height)
 {
     float4 outputColor;
 
-    LineStyle outlineStyle = loadLineStyle(dtm.outlineLineStyleIdx);
+    LineStyle outlineStyle = loadLineStyle(dtmSettings.outlineLineStyleIdx);
     const float outlineThickness = psInput.getOutlineThickness();
     const float phaseShift = 0.0f; // input.getCurrentPhaseShift();
     const float worldToScreenRatio = psInput.getCurrentWorldToScreenRatio();
@@ -792,270 +839,267 @@ float4 fragMain(PSInput input) : SV_TARGET
     
     if (pc.isDTMRendering)
     {   
-        // TRIANGLE RENDERING
-        {
-            DTMSettings dtm = loadDTMSettings(mainObj.dtmSettingsIdx);
-
-            float3 v[3];
-            v[0] = input.getScreenSpaceVertexAttribs(0);
-            v[1] = input.getScreenSpaceVertexAttribs(1);
-            v[2] = input.getScreenSpaceVertexAttribs(2);
-
-            // indices of points constructing every edge
-            uint2 edgePoints[3];
-            edgePoints[0] = uint2(0, 1);
-            edgePoints[1] = uint2(1, 2);
-            edgePoints[2] = uint2(2, 0);
-
-            const float3 baryCoord = calculateDTMTriangleBarycentrics(v[0], v[1], v[2], input.position.xy);
-            float height = baryCoord.x * v[0].z + baryCoord.y * v[1].z + baryCoord.z * v[2].z;
-            float heightDeriv = fwidth(height);
-
-            DTMColorBlender blender;
-            blender.init();
-            if(dtm.drawHeightsFlag)
-                blender.addColorOnTop(calculateDTMHeightColor(dtm, v, heightDeriv, input.position.xy, height));
-            if (dtm.drawContoursFlag)
-                blender.addColorOnTop(calculateDTMContourColor(dtm, v, edgePoints, input, height));
-            if (dtm.drawOutlineFlag)
-                blender.addColorOnTop(calculateDTMOutlineColor(dtm, v, edgePoints, input, baryCoord, height));
-            float4 dtmColor = blender.blend();
-
-            textureColor = dtmColor.rgb;
-            localAlpha = dtmColor.a;
-
-            return calculateFinalColor<nbl::hlsl::jit::device_capabilities::fragmentShaderPixelInterlock>(uint2(input.position.xy), localAlpha, currentMainObjectIdx, textureColor, true);
-        }
+        DTMSettings dtmSettings = loadDTMSettings(mainObj.dtmSettingsIdx);
+
+        float3 v[3];
+        v[0] = input.getScreenSpaceVertexAttribs(0);
+        v[1] = input.getScreenSpaceVertexAttribs(1);
+        v[2] = input.getScreenSpaceVertexAttribs(2);
+
+        // indices of points constructing every edge
+        uint2 edgePoints[3];
+        edgePoints[0] = uint2(0, 1);
+        edgePoints[1] = uint2(1, 2);
+        edgePoints[2] = uint2(2, 0);
+
+        const float3 baryCoord = calculateDTMTriangleBarycentrics(v[0], v[1], v[2], input.position.xy);
+        float height = baryCoord.x * v[0].z + baryCoord.y * v[1].z + baryCoord.z * v[2].z;
+        float heightDeriv = fwidth(height);
+
+        DTMColorBlender blender;
+        blender.init();
+        if(dtmSettings.drawHeightShadingEnabled())
+            blender.addColorOnTop(calculateDTMHeightColor(dtmSettings, v, heightDeriv, input.position.xy, height));
+        if (dtmSettings.drawContourEnabled())
+            blender.addColorOnTop(calculateDTMContourColor(dtmSettings, v, edgePoints, input, height));
+        if (dtmSettings.drawOutlineEnabled())
+            blender.addColorOnTop(calculateDTMOutlineColor(dtmSettings, v, edgePoints, input, baryCoord, height));
+        float4 dtmColor = blender.blend();
+
+        textureColor = dtmColor.rgb;
+        localAlpha = dtmColor.a;
+
+        return calculateFinalColor<nbl::hlsl::jit::device_capabilities::fragmentShaderPixelInterlock>(uint2(input.position.xy), localAlpha, currentMainObjectIdx, textureColor, true);
     }
     else
     {
         // figure out local alpha with sdf
         if (objType == ObjectType::LINE || objType == ObjectType::QUAD_BEZIER || objType == ObjectType::POLYLINE_CONNECTOR)
-    {
-        float distance = nbl::hlsl::numeric_limits<float>::max;
-        if (objType == ObjectType::LINE)
         {
-            const float2 start = input.getLineStart();
-            const float2 end = input.getLineEnd();
-            const uint32_t styleIdx = mainObj.styleIdx;
-            const float thickness = input.getLineThickness();
-            const float phaseShift = input.getCurrentPhaseShift();
-            const float stretch = input.getPatternStretch();
-            const float worldToScreenRatio = input.getCurrentWorldToScreenRatio();
+            float distance = nbl::hlsl::numeric_limits<float>::max;
+            if (objType == ObjectType::LINE)
+            {
+                const float2 start = input.getLineStart();
+                const float2 end = input.getLineEnd();
+                const uint32_t styleIdx = mainObj.styleIdx;
+                const float thickness = input.getLineThickness();
+                const float phaseShift = input.getCurrentPhaseShift();
+                const float stretch = input.getPatternStretch();
+                const float worldToScreenRatio = input.getCurrentWorldToScreenRatio();
 
-            nbl::hlsl::shapes::Line<float> lineSegment = nbl::hlsl::shapes::Line<float>::construct(start, end);
+                nbl::hlsl::shapes::Line<float> lineSegment = nbl::hlsl::shapes::Line<float>::construct(start, end);
 
-            LineStyle style = loadLineStyle(styleIdx);
+                LineStyle style = loadLineStyle(styleIdx);
 
-            if (!style.hasStipples() || stretch == InvalidStyleStretchValue)
-            {
-                distance = ClippedSignedDistance< nbl::hlsl::shapes::Line<float> >::sdf(lineSegment, input.position.xy, thickness, style.isRoadStyleFlag);
+                if (!style.hasStipples() || stretch == InvalidStyleStretchValue)
+                {
+                    distance = ClippedSignedDistance< nbl::hlsl::shapes::Line<float> >::sdf(lineSegment, input.position.xy, thickness, style.isRoadStyleFlag);
+                }
+                else
+                {
+                    nbl::hlsl::shapes::Line<float>::ArcLengthCalculator arcLenCalc = nbl::hlsl::shapes::Line<float>::ArcLengthCalculator::construct(lineSegment);
+                    LineStyleClipper clipper = LineStyleClipper::construct(loadLineStyle(styleIdx), lineSegment, arcLenCalc, phaseShift, stretch, worldToScreenRatio);
+                    distance = ClippedSignedDistance<nbl::hlsl::shapes::Line<float>, LineStyleClipper>::sdf(lineSegment, input.position.xy, thickness, style.isRoadStyleFlag, clipper);
+                }
             }
-            else
+            else if (objType == ObjectType::QUAD_BEZIER)
             {
-                nbl::hlsl::shapes::Line<float>::ArcLengthCalculator arcLenCalc = nbl::hlsl::shapes::Line<float>::ArcLengthCalculator::construct(lineSegment);
-                LineStyleClipper clipper = LineStyleClipper::construct(loadLineStyle(styleIdx), lineSegment, arcLenCalc, phaseShift, stretch, worldToScreenRatio);
-                distance = ClippedSignedDistance<nbl::hlsl::shapes::Line<float>, LineStyleClipper>::sdf(lineSegment, input.position.xy, thickness, style.isRoadStyleFlag, clipper);
-            }
-        }
-        else if (objType == ObjectType::QUAD_BEZIER)
-        {
-            nbl::hlsl::shapes::Quadratic<float> quadratic = input.getQuadratic();
-            nbl::hlsl::shapes::Quadratic<float>::ArcLengthCalculator arcLenCalc = input.getQuadraticArcLengthCalculator();
+                nbl::hlsl::shapes::Quadratic<float> quadratic = input.getQuadratic();
+                nbl::hlsl::shapes::Quadratic<float>::ArcLengthCalculator arcLenCalc = input.getQuadraticArcLengthCalculator();
 
-            const uint32_t styleIdx = mainObj.styleIdx;
-            const float thickness = input.getLineThickness();
-            const float phaseShift = input.getCurrentPhaseShift();
-            const float stretch = input.getPatternStretch();
-            const float worldToScreenRatio = input.getCurrentWorldToScreenRatio();
+                const uint32_t styleIdx = mainObj.styleIdx;
+                const float thickness = input.getLineThickness();
+                const float phaseShift = input.getCurrentPhaseShift();
+                const float stretch = input.getPatternStretch();
+                const float worldToScreenRatio = input.getCurrentWorldToScreenRatio();
 
-            LineStyle style = loadLineStyle(styleIdx);
-            if (!style.hasStipples() || stretch == InvalidStyleStretchValue)
-            {
-                distance = ClippedSignedDistance< nbl::hlsl::shapes::Quadratic<float> >::sdf(quadratic, input.position.xy, thickness, style.isRoadStyleFlag);
+                LineStyle style = loadLineStyle(styleIdx);
+                if (!style.hasStipples() || stretch == InvalidStyleStretchValue)
+                {
+                    distance = ClippedSignedDistance< nbl::hlsl::shapes::Quadratic<float> >::sdf(quadratic, input.position.xy, thickness, style.isRoadStyleFlag);
+                }
+                else
+                {
+                    BezierStyleClipper clipper = BezierStyleClipper::construct(loadLineStyle(styleIdx), quadratic, arcLenCalc, phaseShift, stretch, worldToScreenRatio);
+                    distance = ClippedSignedDistance<nbl::hlsl::shapes::Quadratic<float>, BezierStyleClipper>::sdf(quadratic, input.position.xy, thickness, style.isRoadStyleFlag, clipper);
+                }
             }
-            else
+            else if (objType == ObjectType::POLYLINE_CONNECTOR)
             {
-                BezierStyleClipper clipper = BezierStyleClipper::construct(loadLineStyle(styleIdx), quadratic, arcLenCalc, phaseShift, stretch, worldToScreenRatio);
-                distance = ClippedSignedDistance<nbl::hlsl::shapes::Quadratic<float>, BezierStyleClipper>::sdf(quadratic, input.position.xy, thickness, style.isRoadStyleFlag, clipper);
-            }
-        }
-        else if (objType == ObjectType::POLYLINE_CONNECTOR)
-        {
-            const float2 P = input.position.xy - input.getPolylineConnectorCircleCenter();
-            distance = miterSDF(
-                P,
-                input.getLineThickness(),
-                input.getPolylineConnectorTrapezoidStart(),
-                input.getPolylineConnectorTrapezoidEnd(),
-                input.getPolylineConnectorTrapezoidLongBase(),
-                input.getPolylineConnectorTrapezoidShortBase());
+                const float2 P = input.position.xy - input.getPolylineConnectorCircleCenter();
+                distance = miterSDF(
+                    P,
+                    input.getLineThickness(),
+                    input.getPolylineConnectorTrapezoidStart(),
+                    input.getPolylineConnectorTrapezoidEnd(),
+                    input.getPolylineConnectorTrapezoidLongBase(),
+                    input.getPolylineConnectorTrapezoidShortBase());
 
+            }
+            localAlpha = smoothstep(+globals.antiAliasingFactor, -globals.antiAliasingFactor, distance);
         }
-        localAlpha = smoothstep(+globals.antiAliasingFactor, -globals.antiAliasingFactor, distance);
-    }
         else if (objType == ObjectType::CURVE_BOX) 
-    {
-        const float minorBBoxUV = input.getMinorBBoxUV();
-        const float majorBBoxUV = input.getMajorBBoxUV();
+        {
+            const float minorBBoxUV = input.getMinorBBoxUV();
+            const float majorBBoxUV = input.getMajorBBoxUV();
 
-        nbl::hlsl::math::equations::Quadratic<float> curveMinMinor = input.getCurveMinMinor();
-        nbl::hlsl::math::equations::Quadratic<float> curveMinMajor = input.getCurveMinMajor();
-        nbl::hlsl::math::equations::Quadratic<float> curveMaxMinor = input.getCurveMaxMinor();
-        nbl::hlsl::math::equations::Quadratic<float> curveMaxMajor = input.getCurveMaxMajor();
+            nbl::hlsl::math::equations::Quadratic<float> curveMinMinor = input.getCurveMinMinor();
+            nbl::hlsl::math::equations::Quadratic<float> curveMinMajor = input.getCurveMinMajor();
+            nbl::hlsl::math::equations::Quadratic<float> curveMaxMinor = input.getCurveMaxMinor();
+            nbl::hlsl::math::equations::Quadratic<float> curveMaxMajor = input.getCurveMaxMajor();
 
-        //  TODO(Optimization): Can we ignore this majorBBoxUV clamp and rely on the t clamp that happens next? then we can pass `PrecomputedRootFinder`s instead of computing the values per pixel.
-        nbl::hlsl::math::equations::Quadratic<float> minCurveEquation = nbl::hlsl::math::equations::Quadratic<float>::construct(curveMinMajor.a, curveMinMajor.b, curveMinMajor.c - clamp(majorBBoxUV, 0.0, 1.0));
-        nbl::hlsl::math::equations::Quadratic<float> maxCurveEquation = nbl::hlsl::math::equations::Quadratic<float>::construct(curveMaxMajor.a, curveMaxMajor.b, curveMaxMajor.c - clamp(majorBBoxUV, 0.0, 1.0));
+            //  TODO(Optimization): Can we ignore this majorBBoxUV clamp and rely on the t clamp that happens next? then we can pass `PrecomputedRootFinder`s instead of computing the values per pixel.
+            nbl::hlsl::math::equations::Quadratic<float> minCurveEquation = nbl::hlsl::math::equations::Quadratic<float>::construct(curveMinMajor.a, curveMinMajor.b, curveMinMajor.c - clamp(majorBBoxUV, 0.0, 1.0));
+            nbl::hlsl::math::equations::Quadratic<float> maxCurveEquation = nbl::hlsl::math::equations::Quadratic<float>::construct(curveMaxMajor.a, curveMaxMajor.b, curveMaxMajor.c - clamp(majorBBoxUV, 0.0, 1.0));
 
-        const float minT = clamp(PrecomputedRootFinder<float>::construct(minCurveEquation).computeRoots(), 0.0, 1.0);
-        const float minEv = curveMinMinor.evaluate(minT);
+            const float minT = clamp(PrecomputedRootFinder<float>::construct(minCurveEquation).computeRoots(), 0.0, 1.0);
+            const float minEv = curveMinMinor.evaluate(minT);
 
-        const float maxT = clamp(PrecomputedRootFinder<float>::construct(maxCurveEquation).computeRoots(), 0.0, 1.0);
-        const float maxEv = curveMaxMinor.evaluate(maxT);
+            const float maxT = clamp(PrecomputedRootFinder<float>::construct(maxCurveEquation).computeRoots(), 0.0, 1.0);
+            const float maxEv = curveMaxMinor.evaluate(maxT);
 
-        const bool insideMajor = majorBBoxUV >= 0.0 && majorBBoxUV <= 1.0;
-        const bool insideMinor = minorBBoxUV >= minEv && minorBBoxUV <= maxEv;
+            const bool insideMajor = majorBBoxUV >= 0.0 && majorBBoxUV <= 1.0;
+            const bool insideMinor = minorBBoxUV >= minEv && minorBBoxUV <= maxEv;
 
-        if (insideMinor && insideMajor)
-        {
-            localAlpha = 1.0;
-        }
-        else
-        {
-            // Find the true SDF of a hatch box boundary which is bounded by two curves, It requires knowing the distance from the current UV to the closest point on bounding curves and the limiting lines (in major direction)
-            // We also keep track of distance vector (minor, major) to convert to screenspace distance for anti-aliasing with screenspace aaFactor
-            const float InvalidT = nbl::hlsl::numeric_limits<float32_t>::max;
-            const float MAX_DISTANCE_SQUARED = nbl::hlsl::numeric_limits<float32_t>::max;
+            if (insideMinor && insideMajor)
+            {
+                localAlpha = 1.0;
+            }
+            else
+            {
+                // Find the true SDF of a hatch box boundary which is bounded by two curves, It requires knowing the distance from the current UV to the closest point on bounding curves and the limiting lines (in major direction)
+                // We also keep track of distance vector (minor, major) to convert to screenspace distance for anti-aliasing with screenspace aaFactor
+                const float InvalidT = nbl::hlsl::numeric_limits<float32_t>::max;
+                const float MAX_DISTANCE_SQUARED = nbl::hlsl::numeric_limits<float32_t>::max;
 
-            const float2 boxScreenSpaceSize = input.getCurveBoxScreenSpaceSize();
+                const float2 boxScreenSpaceSize = input.getCurveBoxScreenSpaceSize();
 
 
-            float closestDistanceSquared = MAX_DISTANCE_SQUARED;
-            const float2 pos = float2(minorBBoxUV, majorBBoxUV) * boxScreenSpaceSize;
+                float closestDistanceSquared = MAX_DISTANCE_SQUARED;
+                const float2 pos = float2(minorBBoxUV, majorBBoxUV) * boxScreenSpaceSize;
 
-            if (minorBBoxUV < minEv)
-            {
-                // DO SDF of Min Curve
-                nbl::hlsl::shapes::Quadratic<float> minCurve = nbl::hlsl::shapes::Quadratic<float>::construct(
-                    float2(curveMinMinor.a, curveMinMajor.a) * boxScreenSpaceSize,
-                    float2(curveMinMinor.b, curveMinMajor.b) * boxScreenSpaceSize,
-                    float2(curveMinMinor.c, curveMinMajor.c) * boxScreenSpaceSize);
-
-                nbl::hlsl::shapes::Quadratic<float>::Candidates candidates = minCurve.getClosestCandidates(pos);
-                [[unroll(nbl::hlsl::shapes::Quadratic<float>::MaxCandidates)]]
-                for (uint32_t i = 0; i < nbl::hlsl::shapes::Quadratic<float>::MaxCandidates; i++)
+                if (minorBBoxUV < minEv)
                 {
-                    candidates[i] = clamp(candidates[i], 0.0, 1.0);
-                    const float2 distVector = minCurve.evaluate(candidates[i]) - pos;
-                    const float candidateDistanceSquared = dot(distVector, distVector);
-                    if (candidateDistanceSquared < closestDistanceSquared)
-                        closestDistanceSquared = candidateDistanceSquared;
+                    // DO SDF of Min Curve
+                    nbl::hlsl::shapes::Quadratic<float> minCurve = nbl::hlsl::shapes::Quadratic<float>::construct(
+                        float2(curveMinMinor.a, curveMinMajor.a) * boxScreenSpaceSize,
+                        float2(curveMinMinor.b, curveMinMajor.b) * boxScreenSpaceSize,
+                        float2(curveMinMinor.c, curveMinMajor.c) * boxScreenSpaceSize);
+
+                    nbl::hlsl::shapes::Quadratic<float>::Candidates candidates = minCurve.getClosestCandidates(pos);
+                    [[unroll(nbl::hlsl::shapes::Quadratic<float>::MaxCandidates)]]
+                    for (uint32_t i = 0; i < nbl::hlsl::shapes::Quadratic<float>::MaxCandidates; i++)
+                    {
+                        candidates[i] = clamp(candidates[i], 0.0, 1.0);
+                        const float2 distVector = minCurve.evaluate(candidates[i]) - pos;
+                        const float candidateDistanceSquared = dot(distVector, distVector);
+                        if (candidateDistanceSquared < closestDistanceSquared)
+                            closestDistanceSquared = candidateDistanceSquared;
+                    }
                 }
-            }
-            else if (minorBBoxUV > maxEv)
-            {
-                // Do SDF of Max Curve
-                nbl::hlsl::shapes::Quadratic<float> maxCurve = nbl::hlsl::shapes::Quadratic<float>::construct(
-                    float2(curveMaxMinor.a, curveMaxMajor.a) * boxScreenSpaceSize,
-                    float2(curveMaxMinor.b, curveMaxMajor.b) * boxScreenSpaceSize,
-                    float2(curveMaxMinor.c, curveMaxMajor.c) * boxScreenSpaceSize);
-                nbl::hlsl::shapes::Quadratic<float>::Candidates candidates = maxCurve.getClosestCandidates(pos);
-                [[unroll(nbl::hlsl::shapes::Quadratic<float>::MaxCandidates)]]
-                for (uint32_t i = 0; i < nbl::hlsl::shapes::Quadratic<float>::MaxCandidates; i++)
+                else if (minorBBoxUV > maxEv)
                 {
-                    candidates[i] = clamp(candidates[i], 0.0, 1.0);
-                    const float2 distVector = maxCurve.evaluate(candidates[i]) - pos;
-                    const float candidateDistanceSquared = dot(distVector, distVector);
-                    if (candidateDistanceSquared < closestDistanceSquared)
-                        closestDistanceSquared = candidateDistanceSquared;
+                    // Do SDF of Max Curve
+                    nbl::hlsl::shapes::Quadratic<float> maxCurve = nbl::hlsl::shapes::Quadratic<float>::construct(
+                        float2(curveMaxMinor.a, curveMaxMajor.a) * boxScreenSpaceSize,
+                        float2(curveMaxMinor.b, curveMaxMajor.b) * boxScreenSpaceSize,
+                        float2(curveMaxMinor.c, curveMaxMajor.c) * boxScreenSpaceSize);
+                    nbl::hlsl::shapes::Quadratic<float>::Candidates candidates = maxCurve.getClosestCandidates(pos);
+                    [[unroll(nbl::hlsl::shapes::Quadratic<float>::MaxCandidates)]]
+                    for (uint32_t i = 0; i < nbl::hlsl::shapes::Quadratic<float>::MaxCandidates; i++)
+                    {
+                        candidates[i] = clamp(candidates[i], 0.0, 1.0);
+                        const float2 distVector = maxCurve.evaluate(candidates[i]) - pos;
+                        const float candidateDistanceSquared = dot(distVector, distVector);
+                        if (candidateDistanceSquared < closestDistanceSquared)
+                            closestDistanceSquared = candidateDistanceSquared;
+                    }
                 }
-            }
 
-            if (!insideMajor)
-            {
-                const bool minLessThanMax = minEv < maxEv;
-                float2 majorDistVector = float2(MAX_DISTANCE_SQUARED, MAX_DISTANCE_SQUARED);
-                if (majorBBoxUV > 1.0)
+                if (!insideMajor)
                 {
-                    const float2 minCurveEnd = float2(minEv, 1.0) * boxScreenSpaceSize;
-                    if (minLessThanMax)
-                        majorDistVector = sdLineDstVec(pos, minCurveEnd, float2(maxEv, 1.0) * boxScreenSpaceSize);
-                    else
-                        majorDistVector = pos - minCurveEnd;
-                }
-                else
-                {
-                    const float2 minCurveStart = float2(minEv, 0.0) * boxScreenSpaceSize;
-                    if (minLessThanMax)
-                        majorDistVector = sdLineDstVec(pos, minCurveStart, float2(maxEv, 0.0) * boxScreenSpaceSize);
+                    const bool minLessThanMax = minEv < maxEv;
+                    float2 majorDistVector = float2(MAX_DISTANCE_SQUARED, MAX_DISTANCE_SQUARED);
+                    if (majorBBoxUV > 1.0)
+                    {
+                        const float2 minCurveEnd = float2(minEv, 1.0) * boxScreenSpaceSize;
+                        if (minLessThanMax)
+                            majorDistVector = sdLineDstVec(pos, minCurveEnd, float2(maxEv, 1.0) * boxScreenSpaceSize);
+                        else
+                            majorDistVector = pos - minCurveEnd;
+                    }
                     else
-                        majorDistVector = pos - minCurveStart;
+                    {
+                        const float2 minCurveStart = float2(minEv, 0.0) * boxScreenSpaceSize;
+                        if (minLessThanMax)
+                            majorDistVector = sdLineDstVec(pos, minCurveStart, float2(maxEv, 0.0) * boxScreenSpaceSize);
+                        else
+                            majorDistVector = pos - minCurveStart;
+                    }
+
+                    const float majorDistSq = dot(majorDistVector, majorDistVector);
+                    if (majorDistSq < closestDistanceSquared)
+                        closestDistanceSquared = majorDistSq;
                 }
 
-                const float majorDistSq = dot(majorDistVector, majorDistVector);
-                if (majorDistSq < closestDistanceSquared)
-                    closestDistanceSquared = majorDistSq;
+                const float dist = sqrt(closestDistanceSquared);
+                localAlpha = 1.0f - smoothstep(0.0, globals.antiAliasingFactor, dist);
             }
 
-            const float dist = sqrt(closestDistanceSquared);
-            localAlpha = 1.0f - smoothstep(0.0, globals.antiAliasingFactor, dist);
-        }
-
-        LineStyle style = loadLineStyle(mainObj.styleIdx);
-        uint32_t textureId = asuint(style.screenSpaceLineWidth);
-        if (textureId != InvalidTextureIdx)
-        {
-            // For Hatch fiils we sample the first mip as we don't fill the others, because they are constant in screenspace and render as expected
-            // If later on we decided that we can have different sizes here, we should do computations similar to FONT_GLYPH
-            float3 msdfSample = msdfTextures.SampleLevel(msdfSampler, float3(frac(input.position.xy / HatchFillMSDFSceenSpaceSize), float(textureId)), 0.0).xyz;
-            float msdf = nbl::hlsl::text::msdfDistance(msdfSample, MSDFPixelRange * HatchFillMSDFSceenSpaceSize / MSDFSize);
-            localAlpha *= smoothstep(+globals.antiAliasingFactor / 2.0, -globals.antiAliasingFactor / 2.0f, msdf);
+            LineStyle style = loadLineStyle(mainObj.styleIdx);
+            uint32_t textureId = asuint(style.screenSpaceLineWidth);
+            if (textureId != InvalidTextureIdx)
+            {
+                // For Hatch fiils we sample the first mip as we don't fill the others, because they are constant in screenspace and render as expected
+                // If later on we decided that we can have different sizes here, we should do computations similar to FONT_GLYPH
+                float3 msdfSample = msdfTextures.SampleLevel(msdfSampler, float3(frac(input.position.xy / HatchFillMSDFSceenSpaceSize), float(textureId)), 0.0).xyz;
+                float msdf = nbl::hlsl::text::msdfDistance(msdfSample, MSDFPixelRange * HatchFillMSDFSceenSpaceSize / MSDFSize);
+                localAlpha *= smoothstep(+globals.antiAliasingFactor / 2.0, -globals.antiAliasingFactor / 2.0f, msdf);
+            }
         }
-    }
         else if (objType == ObjectType::FONT_GLYPH) 
-    {
-        const float2 uv = input.getFontGlyphUV();
-        const uint32_t textureId = input.getFontGlyphTextureId();
-
-        if (textureId != InvalidTextureIdx)
         {
-            float mipLevel = msdfTextures.CalculateLevelOfDetail(msdfSampler, uv);
-            float3 msdfSample = msdfTextures.SampleLevel(msdfSampler, float3(uv, float(textureId)), mipLevel);
-            float msdf = nbl::hlsl::text::msdfDistance(msdfSample, input.getFontGlyphPxRange());
-            /*
-                explaining "*= exp2(max(mipLevel,0.0))"
-                Each mip level has constant MSDFPixelRange
-                Which essentially makes the msdfSamples here (Harware Sampled) have different scales per mip
-                As we go up 1 mip level, the msdf distance should be multiplied by 2.0
-                While this makes total sense for NEAREST mip sampling when mipLevel is an integer and only one mip is being sampled.
-                It's a bit complex when it comes to trilinear filtering (LINEAR mip sampling), but it works in practice!
+            const float2 uv = input.getFontGlyphUV();
+            const uint32_t textureId = input.getFontGlyphTextureId();
+
+            if (textureId != InvalidTextureIdx)
+            {
+                float mipLevel = msdfTextures.CalculateLevelOfDetail(msdfSampler, uv);
+                float3 msdfSample = msdfTextures.SampleLevel(msdfSampler, float3(uv, float(textureId)), mipLevel);
+                float msdf = nbl::hlsl::text::msdfDistance(msdfSample, input.getFontGlyphPxRange());
+                /*
+                    explaining "*= exp2(max(mipLevel,0.0))"
+                    Each mip level has constant MSDFPixelRange
+                    Which essentially makes the msdfSamples here (Harware Sampled) have different scales per mip
+                    As we go up 1 mip level, the msdf distance should be multiplied by 2.0
+                    While this makes total sense for NEAREST mip sampling when mipLevel is an integer and only one mip is being sampled.
+                    It's a bit complex when it comes to trilinear filtering (LINEAR mip sampling), but it works in practice!
                 
-                Alternatively you can think of it as doing this instead:
-                localAlpha = smoothstep(+globals.antiAliasingFactor / exp2(max(mipLevel,0.0)), 0.0, msdf);
-                Which is reducing the aa feathering as we go up the mip levels. 
-                to avoid aa feathering of the MAX_MSDF_DISTANCE_VALUE to be less than aa factor and eventually color it and cause greyed out area around the main glyph
-            */
-            msdf *= exp2(max(mipLevel,0.0));
+                    Alternatively you can think of it as doing this instead:
+                    localAlpha = smoothstep(+globals.antiAliasingFactor / exp2(max(mipLevel,0.0)), 0.0, msdf);
+                    Which is reducing the aa feathering as we go up the mip levels. 
+                    to avoid aa feathering of the MAX_MSDF_DISTANCE_VALUE to be less than aa factor and eventually color it and cause greyed out area around the main glyph
+                */
+                msdf *= exp2(max(mipLevel,0.0));
             
-            LineStyle style = loadLineStyle(mainObj.styleIdx);
-            const float screenPxRange = input.getFontGlyphPxRange() / MSDFPixelRangeHalf;
-            const float bolden = style.worldSpaceLineWidth * screenPxRange; // worldSpaceLineWidth is actually boldenInPixels, aliased TextStyle with LineStyle
-            localAlpha = smoothstep(+globals.antiAliasingFactor / 2.0f + bolden, -globals.antiAliasingFactor / 2.0f + bolden, msdf);
+                LineStyle style = loadLineStyle(mainObj.styleIdx);
+                const float screenPxRange = input.getFontGlyphPxRange() / MSDFPixelRangeHalf;
+                const float bolden = style.worldSpaceLineWidth * screenPxRange; // worldSpaceLineWidth is actually boldenInPixels, aliased TextStyle with LineStyle
+                localAlpha = smoothstep(+globals.antiAliasingFactor / 2.0f + bolden, -globals.antiAliasingFactor / 2.0f + bolden, msdf);
+            }
         }
-    }
         else if (objType == ObjectType::IMAGE) 
-    {
-        const float2 uv = input.getImageUV();
-        const uint32_t textureId = input.getImageTextureId();
-
-        if (textureId != InvalidTextureIdx)
         {
-            float4 colorSample = textures[NonUniformResourceIndex(textureId)].Sample(textureSampler, float2(uv.x, uv.y));
-            textureColor = colorSample.rgb;
-            localAlpha = colorSample.a;
+            const float2 uv = input.getImageUV();
+            const uint32_t textureId = input.getImageTextureId();
+
+            if (textureId != InvalidTextureIdx)
+            {
+                float4 colorSample = textures[NonUniformResourceIndex(textureId)].Sample(textureSampler, float2(uv.x, uv.y));
+                textureColor = colorSample.rgb;
+                localAlpha = colorSample.a;
+            }
         }
-    }
 
         uint2 fragCoord = uint2(input.position.xy);
         

From 432b931caa1192bf6982178af4a0c9686d7222ba Mon Sep 17 00:00:00 2001
From: Erfan Ahmadi <ahmadierfan99@gmail.com>
Date: Tue, 15 Apr 2025 12:55:45 +0330
Subject: [PATCH 162/529] clear function for Mesh

---
 62_CAD/CTriangleMesh.h | 6 ++++++
 1 file changed, 6 insertions(+)

diff --git a/62_CAD/CTriangleMesh.h b/62_CAD/CTriangleMesh.h
index 1860dedc9..1753687b2 100644
--- a/62_CAD/CTriangleMesh.h
+++ b/62_CAD/CTriangleMesh.h
@@ -110,6 +110,12 @@ class CTriangleMesh final
 	{
 		return m_indices.size();
 	}
+	
+	inline void clear()
+	{
+		m_vertices.clear();
+		m_indices.clear();
+	}
 
 	core::vector<vertex_t> m_vertices;
 	core::vector<index_t> m_indices;

From 88dcf44da0465b3e04a7255ba26a67fa342b5552 Mon Sep 17 00:00:00 2001
From: Przemek <minikers21@gmail.com>
Date: Tue, 15 Apr 2025 13:04:24 +0200
Subject: [PATCH 163/529] Fixed non-stippled lines

---
 62_CAD/main.cpp                               | 28 ++++----
 .../main_pipeline/fragment_shader.hlsl        | 71 +++++++------------
 2 files changed, 41 insertions(+), 58 deletions(-)

diff --git a/62_CAD/main.cpp b/62_CAD/main.cpp
index 16532cba7..c16f17c2d 100644
--- a/62_CAD/main.cpp
+++ b/62_CAD/main.cpp
@@ -3247,7 +3247,7 @@ class ComputerAidedDesign final : public examples::SimpleWindowedApplication, pu
 
 			DTMSettingsInfo dtmSettingsInfo;
 			
-			dtmSettingsInfo.mode = E_DTM_MODE::HEIGHT_SHADING;
+			dtmSettingsInfo.mode = E_DTM_MODE::HEIGHT_SHADING | E_DTM_MODE::CONTOUR | E_DTM_MODE::OUTLINE;
 
 			dtmSettingsInfo.contourLinesStartHeight = 20;
 			dtmSettingsInfo.contourLinesEndHeight = 90;
@@ -3256,9 +3256,9 @@ class ComputerAidedDesign final : public examples::SimpleWindowedApplication, pu
 			LineStyleInfo outlineStyle = {};
 			dtmSettingsInfo.outlineLineStyleInfo.screenSpaceLineWidth = 0.0f;
 			dtmSettingsInfo.outlineLineStyleInfo.worldSpaceLineWidth = 3.0f;
-			dtmSettingsInfo.outlineLineStyleInfo.color = float32_t4(0.0f, 0.39f, 0.0f, 0.5f);
-			std::array<double, 4> outlineStipplePattern = { 0.0f, -5.0f, 2.0f, -5.0f };
-			dtmSettingsInfo.outlineLineStyleInfo.setStipplePatternData(outlineStipplePattern);
+			dtmSettingsInfo.outlineLineStyleInfo.color = float32_t4(0.0f, 0.39f, 0.0f, 1.0f);
+			//std::array<double, 4> outlineStipplePattern = { 0.0f, -5.0f, 2.0f, -5.0f };
+			//dtmSettingsInfo.outlineLineStyleInfo.setStipplePatternData(outlineStipplePattern);
 
 			LineStyleInfo contourStyle = {};
 			dtmSettingsInfo.contourLineStyleInfo.screenSpaceLineWidth = 0.0f;
@@ -3292,21 +3292,21 @@ class ComputerAidedDesign final : public examples::SimpleWindowedApplication, pu
 					dtmSettingsInfo.intervalIndexToHeightMultiplier = dtmSettingsInfo.intervalLength;
 					dtmSettingsInfo.isCenteredShading = false;
 					dtmSettingsInfo.heightShadingMode = E_HEIGHT_SHADING_MODE::DISCRETE_FIXED_LENGTH_INTERVALS;
-					dtmSettingsInfo.addHeightColorMapEntry(0.0f,   float32_t4(0.0f, 0.0f, 1.0f, 1.0f));
-					dtmSettingsInfo.addHeightColorMapEntry(25.0f,  float32_t4(0.0f, 1.0f, 1.0f, 1.0));
-					dtmSettingsInfo.addHeightColorMapEntry(50.0f,  float32_t4(0.0f, 1.0f, 0.0f, 1.0));
-					dtmSettingsInfo.addHeightColorMapEntry(75.0f,  float32_t4(1.0f, 1.0f, 0.0f, 1.0));
-					dtmSettingsInfo.addHeightColorMapEntry(100.0f, float32_t4(1.0f, 0.0f, 0.0f, 1.0f));
+					dtmSettingsInfo.addHeightColorMapEntry(0.0f,   float32_t4(0.0f, 0.0f, 1.0f, animatedAlpha));
+					dtmSettingsInfo.addHeightColorMapEntry(25.0f,  float32_t4(0.0f, 1.0f, 1.0f, animatedAlpha));
+					dtmSettingsInfo.addHeightColorMapEntry(50.0f,  float32_t4(0.0f, 1.0f, 0.0f, animatedAlpha));
+					dtmSettingsInfo.addHeightColorMapEntry(75.0f,  float32_t4(1.0f, 1.0f, 0.0f, animatedAlpha));
+					dtmSettingsInfo.addHeightColorMapEntry(100.0f, float32_t4(1.0f, 0.0f, 0.0f, animatedAlpha));
 					break;
 				}
 				case E_HEIGHT_SHADING_MODE::CONTINOUS_INTERVALS:
 				{
 					dtmSettingsInfo.heightShadingMode = E_HEIGHT_SHADING_MODE::CONTINOUS_INTERVALS;
-					dtmSettingsInfo.addHeightColorMapEntry(0.0f,   float32_t4(0.0f, 0.0f, 1.0f, 1.0f));
-					dtmSettingsInfo.addHeightColorMapEntry(25.0f,  float32_t4(0.0f, 1.0f, 1.0f, 1.0));
-					dtmSettingsInfo.addHeightColorMapEntry(50.0f,  float32_t4(0.0f, 1.0f, 0.0f, 1.0));
-					dtmSettingsInfo.addHeightColorMapEntry(75.0f,  float32_t4(1.0f, 1.0f, 0.0f, 1.0));
-					dtmSettingsInfo.addHeightColorMapEntry(100.0f, float32_t4(1.0f, 0.0f, 0.0f, 1.0f));
+					dtmSettingsInfo.addHeightColorMapEntry(0.0f,   float32_t4(0.0f, 0.0f, 1.0f, animatedAlpha));
+					dtmSettingsInfo.addHeightColorMapEntry(25.0f,  float32_t4(0.0f, 1.0f, 1.0f, animatedAlpha));
+					dtmSettingsInfo.addHeightColorMapEntry(50.0f, float32_t4(0.0f, 1.0f, 0.0f, animatedAlpha));
+					dtmSettingsInfo.addHeightColorMapEntry(75.0f, float32_t4(1.0f, 1.0f, 0.0f, animatedAlpha));
+					dtmSettingsInfo.addHeightColorMapEntry(90.0f, float32_t4(1.0f, 0.0f, 0.0f, animatedAlpha));
 					break;
 				}
 			}
diff --git a/62_CAD/shaders/main_pipeline/fragment_shader.hlsl b/62_CAD/shaders/main_pipeline/fragment_shader.hlsl
index dc5262568..d7a9493b2 100644
--- a/62_CAD/shaders/main_pipeline/fragment_shader.hlsl
+++ b/62_CAD/shaders/main_pipeline/fragment_shader.hlsl
@@ -431,10 +431,6 @@ struct HeightSegmentTransitionData
     float4 otherSegmentColor;
 };
 
-// NOTE[Erfan to Przemek][REMOVE WHEN READ]: I renamed to `smoothHeightSegmentTransition` and made it return value instead of take `out` param + removed applying it to final output color (it's responsibility of the caller now)
-// Now the resposibility of this  function is just to "Figure out what the interpolated color at the transition is." and doesn't assume how it's gonna be applied to the final color
-// that's more predictible and atomic. Additionally I think `out` functions make the code a little bit more unreadable as well
-
 // This function interpolates between the current and nearest segment colors based on the
 // screen-space distance to the segment boundary. The result is a smoothly blended color
 // useful for visualizing discrete height levels without harsh edges.
@@ -682,6 +678,7 @@ float4 calculateDTMContourColor(in DTMSettings dtmSettings, in float3 v[3], in u
         }
     }
 
+    if(contourLinePointsIdx == 2)
     {
         nbl::hlsl::shapes::Line<float> lineSegment = nbl::hlsl::shapes::Line<float>::construct(contourLinePoints[0], contourLinePoints[1]);
 
@@ -746,16 +743,27 @@ float4 calculateDTMOutlineColor(in DTMSettings dtmSettings, in float3 v[3], in u
         float ABLen = length(AB);
         float triangleHeightToOpositeVertex = triangleAreaTimesTwo / ABLen;
 
-        distances[i] = triangleHeightToOpositeVertex * baryCoord[opposingVertexIdx[i]];
+        distances[i] = abs(triangleHeightToOpositeVertex * baryCoord[opposingVertexIdx[i]]);
     }
 
     float minDistance = nbl::hlsl::numeric_limits<float>::max;
     if (!outlineStyle.hasStipples() || stretch == InvalidStyleStretchValue)
     {
-        for (uint i = 0; i < 3; ++i)
-            distances[i] -= outlineThickness;
+        for (int i = 0; i < 3; ++i)
+        {
+            if (distances[i] > outlineThickness)
+                continue;
 
-        minDistance = min(distances[0], min(distances[1], distances[2]));
+            const uint2 currentEdgePoints = edgePoints[i];
+            float3 p0 = v[currentEdgePoints[0]];
+            float3 p1 = v[currentEdgePoints[1]];
+
+            float distance = nbl::hlsl::numeric_limits<float>::max;
+            nbl::hlsl::shapes::Line<float> lineSegment = nbl::hlsl::shapes::Line<float>::construct(float2(p0.x, p0.y), float2(p1.x, p1.y));
+            distance = ClippedSignedDistance<nbl::hlsl::shapes::Line<float> >::sdf(lineSegment, psInput.position.xy, outlineThickness, outlineStyle.isRoadStyleFlag);
+
+            minDistance = min(minDistance, distance);
+        }
     }
     else
     {
@@ -794,37 +802,14 @@ float4 calculateDTMOutlineColor(in DTMSettings dtmSettings, in float3 v[3], in u
     return outputColor;
 }
 
-struct DTMColorBlender
+float4 blendColorOnTop(in float4 colorBelow, in float4 colorAbove)
 {
-    void init()
-    {
-        colorCount = 0;
-    }
-
-    void addColorOnTop(in float4 color)
-    {
-        colors[colorCount] = color;
-        colorCount++;
-    }
-
-    float4 blend()
-    {
-        if (colorCount == 0)
-            return float4(0.0f, 0.0f, 0.0f, 1.0f);
-
-        float4 outputColor = colors[0];
-        for (int i = 1; i < colorCount; ++i)
-        {
-            outputColor.rgb = colors[i].rgb * colors[i].a + outputColor.rgb * outputColor.a * (1.0f - colors[i].a);
-            outputColor.a = colors[i].a + outputColor.a * (1.0f - colors[i].a);
-        }
-
-        return outputColor;
-    }
+    float4 outputColor = colorBelow;
+    outputColor.rgb = colorAbove.rgb * colorAbove.a + outputColor.rgb * outputColor.a * (1.0f - colorAbove.a);
+    outputColor.a = colorAbove.a + outputColor.a * (1.0f - colorAbove.a);
 
-    int colorCount;
-    float4 colors[3];
-};
+    return outputColor;
+}
 
 [[vk::spvexecutionmode(spv::ExecutionModePixelInterlockOrderedEXT)]]
 [shader("pixel")]
@@ -856,15 +841,13 @@ float4 fragMain(PSInput input) : SV_TARGET
         float height = baryCoord.x * v[0].z + baryCoord.y * v[1].z + baryCoord.z * v[2].z;
         float heightDeriv = fwidth(height);
 
-        DTMColorBlender blender;
-        blender.init();
-        if(dtmSettings.drawHeightShadingEnabled())
-            blender.addColorOnTop(calculateDTMHeightColor(dtmSettings, v, heightDeriv, input.position.xy, height));
+        float4 dtmColor = float4(0.0f, 0.0f, 0.0f, 0.0f);
+        if (dtmSettings.drawHeightShadingEnabled())
+            dtmColor = blendColorOnTop(dtmColor, calculateDTMHeightColor(dtmSettings, v, heightDeriv, input.position.xy, height));
         if (dtmSettings.drawContourEnabled())
-            blender.addColorOnTop(calculateDTMContourColor(dtmSettings, v, edgePoints, input, height));
+            dtmColor = blendColorOnTop(dtmColor, calculateDTMContourColor(dtmSettings, v, edgePoints, input, height));
         if (dtmSettings.drawOutlineEnabled())
-            blender.addColorOnTop(calculateDTMOutlineColor(dtmSettings, v, edgePoints, input, baryCoord, height));
-        float4 dtmColor = blender.blend();
+            dtmColor = blendColorOnTop(dtmColor, calculateDTMOutlineColor(dtmSettings, v, edgePoints, input, baryCoord, height));
 
         textureColor = dtmColor.rgb;
         localAlpha = dtmColor.a;

From 906443947fb830c1d1d75ed48f3d47e97b6b335c Mon Sep 17 00:00:00 2001
From: Erfan Ahmadi <ahmadierfan99@gmail.com>
Date: Tue, 15 Apr 2025 14:44:04 +0330
Subject: [PATCH 164/529] getIntervalHeightAndColor small fix

---
 62_CAD/shaders/main_pipeline/fragment_shader.hlsl | 11 ++++++++---
 1 file changed, 8 insertions(+), 3 deletions(-)

diff --git a/62_CAD/shaders/main_pipeline/fragment_shader.hlsl b/62_CAD/shaders/main_pipeline/fragment_shader.hlsl
index d7a9493b2..f9cd52ec3 100644
--- a/62_CAD/shaders/main_pipeline/fragment_shader.hlsl
+++ b/62_CAD/shaders/main_pipeline/fragment_shader.hlsl
@@ -469,10 +469,15 @@ float getIntervalPosition(in float height, in float minHeight, in float interval
 void getIntervalHeightAndColor(in int intervalIndex, in DTMSettings dtmSettings, out float4 outIntervalColor, out float outIntervalHeight)
 {
     float minShadingHeight = dtmSettings.heightColorMapHeights[0];
-    outIntervalHeight = minShadingHeight + float(intervalIndex) * dtmSettings.intervalIndexToHeightMultiplier;
+    float heightForColor = minShadingHeight + float(intervalIndex) * dtmSettings.intervalIndexToHeightMultiplier;
+    
+    if (dtmSettings.isCenteredShading)
+        outIntervalHeight = minShadingHeight + (float(intervalIndex) - 0.5) * dtmSettings.intervalLength;
+    else
+        outIntervalHeight = minShadingHeight + (float(intervalIndex)) * dtmSettings.intervalLength;
 
     DTMSettingsHeightsAccessor dtmHeightsAccessor = { dtmSettings };
-    uint32_t upperBoundHeightIndex = min(nbl::hlsl::upper_bound(dtmHeightsAccessor, 0, dtmSettings.heightColorEntryCount, outIntervalHeight), dtmSettings.heightColorEntryCount-1u);
+    uint32_t upperBoundHeightIndex = min(nbl::hlsl::upper_bound(dtmHeightsAccessor, 0, dtmSettings.heightColorEntryCount, heightForColor), dtmSettings.heightColorEntryCount-1u);
     uint32_t lowerBoundHeightIndex = max(upperBoundHeightIndex - 1, 0);
 
     float upperBoundHeight = dtmSettings.heightColorMapHeights[upperBoundHeightIndex];
@@ -487,7 +492,7 @@ void getIntervalHeightAndColor(in int intervalIndex, in DTMSettings dtmSettings,
     }
     else
     {
-        float interpolationVal = (outIntervalHeight - lowerBoundHeight) / (upperBoundHeight - lowerBoundHeight);
+        float interpolationVal = (heightForColor - lowerBoundHeight) / (upperBoundHeight - lowerBoundHeight);
         outIntervalColor = lerp(lowerBoundColor, upperBoundColor, interpolationVal);
     }
 }

From 9707b33b35b9f2cf4ddacecd64d55434144de7b3 Mon Sep 17 00:00:00 2001
From: Przemek <minikers21@gmail.com>
Date: Tue, 15 Apr 2025 15:43:44 +0200
Subject: [PATCH 165/529] Separated DTMSettingsInfo struct

---
 62_CAD/CTriangleMesh.h                        |  39 ++++--
 62_CAD/DrawResourcesFiller.cpp                |  71 +++++++----
 62_CAD/DrawResourcesFiller.h                  |   9 +-
 62_CAD/main.cpp                               | 114 +++++++++++-------
 .../shaders/main_pipeline/vertex_shader.hlsl  |   1 -
 5 files changed, 149 insertions(+), 85 deletions(-)

diff --git a/62_CAD/CTriangleMesh.h b/62_CAD/CTriangleMesh.h
index 1753687b2..0740cf114 100644
--- a/62_CAD/CTriangleMesh.h
+++ b/62_CAD/CTriangleMesh.h
@@ -6,17 +6,9 @@
 
 using namespace nbl;
 
-struct DTMSettingsInfo
+struct DTMHeightShadingInfo
 {
-	LineStyleInfo outlineLineStyleInfo;
-	LineStyleInfo contourLineStyleInfo;
-	
-	uint32_t mode; // E_DTM_MODE
-
-	float contourLinesStartHeight;
-	float contourLinesEndHeight;
-	float contourLinesHeightInterval;
-	
+	bool enabled;
 	// Height Shading Mode
 	E_HEIGHT_SHADING_MODE heightShadingMode;
 
@@ -28,7 +20,7 @@ struct DTMSettingsInfo
 	// This computed height is later used to determine the interpolated color for shading.
 	// It makes sense for this variable to be always equal to `intervalLength` but sometimes it's a different scaling so that last index corresponds to largestHeight
 	float intervalIndexToHeightMultiplier;
-	
+
 	// Used for "DISCRETE_FIXED_LENGTH_INTERVALS" shading mode
 	// If `isCenteredShading` is true, the intervals are centered around `minHeight`, meaning the
 	// first interval spans [minHeight - intervalLength / 2.0, minHeight + intervalLength / 2.0].
@@ -58,7 +50,7 @@ struct DTMSettingsInfo
 
 		return true;
 	}
-
+	
 private:
 	struct HeightColor
 	{
@@ -74,6 +66,29 @@ struct DTMSettingsInfo
 	std::set<HeightColor> heightColorSet;
 };
 
+struct DTMContourInfo
+{
+	bool enabled;
+	LineStyleInfo lineStyleInfo;
+
+	float startHeight;
+	float endHeight;
+	float heightInterval;
+};
+
+struct DTMOutlineInfo
+{
+	bool enabled;
+	LineStyleInfo lineStyleInfo;
+};
+
+struct DTMSettingsInfo
+{
+	DTMHeightShadingInfo heightShadingInfo;
+	DTMContourInfo contourInfo;
+	DTMOutlineInfo outlineInfo;
+};
+
 class CTriangleMesh final
 {
 public:
diff --git a/62_CAD/DrawResourcesFiller.cpp b/62_CAD/DrawResourcesFiller.cpp
index ad2f160c8..4085b4d30 100644
--- a/62_CAD/DrawResourcesFiller.cpp
+++ b/62_CAD/DrawResourcesFiller.cpp
@@ -134,11 +134,16 @@ void DrawResourcesFiller::drawPolyline(const CPolylineBase& polyline, SIntendedS
 	}
 }
 
-void DrawResourcesFiller::drawTriangleMesh(const CTriangleMesh& mesh, const DTMSettingsInfo& dtmSettingsInfo, SIntendedSubmitInfo& intendedNextSubmit)
+void DrawResourcesFiller::drawTriangleMesh(
+	const CTriangleMesh& mesh,
+	const DTMHeightShadingInfo& dtmHeightShadingInfo,
+	const DTMContourInfo& dtmContourInfo,
+	const DTMOutlineInfo& dtmOutlineInfo,
+	SIntendedSubmitInfo& intendedNextSubmit)
 {
 	flushDrawObjects(); // flushes draw call construction of any possible draw objects before dtm, because currently we're sepaerating dtm draw calls from drawObj draw calls
 
-	setActiveDTMSettings(dtmSettingsInfo);
+	setActiveDTMSettings(dtmHeightShadingInfo, dtmContourInfo, dtmOutlineInfo); // TODO !!!!
 	beginMainObject(MainObjectType::DTM);
 
 	DrawCallData drawCallData = {}; 
@@ -352,9 +357,14 @@ void DrawResourcesFiller::setActiveLineStyle(const LineStyleInfo& lineStyle)
 	activeLineStyleIndex = InvalidStyleIdx;
 }
 
-void DrawResourcesFiller::setActiveDTMSettings(const DTMSettingsInfo& dtmSettings)
+void DrawResourcesFiller::setActiveDTMSettings(const DTMHeightShadingInfo& heightShadingInfo, const DTMContourInfo& contourInfo, const DTMOutlineInfo& outlineInfo)
 {
-	activeDTMSettings = dtmSettings;
+	DTMSettingsInfo dtmSettingsInfo;
+	dtmSettingsInfo.heightShadingInfo = heightShadingInfo;
+	dtmSettingsInfo.contourInfo = contourInfo;
+	dtmSettingsInfo.outlineInfo = outlineInfo;
+
+	activeDTMSettings = dtmSettingsInfo;
 	activeDTMSettingsIndex = InvalidDTMSettingsIdx;
 }
 
@@ -633,31 +643,42 @@ uint32_t DrawResourcesFiller::addDTMSettings_Internal(const DTMSettingsInfo& dtm
 
 	DTMSettings dtmSettings;
 
-	dtmSettings.mode = dtmSettingsInfo.mode;
-
-	dtmSettings.contourLinesStartHeight = dtmSettingsInfo.contourLinesStartHeight;
-	dtmSettings.contourLinesEndHeight = dtmSettingsInfo.contourLinesEndHeight;
-	dtmSettings.contourLinesHeightInterval = dtmSettingsInfo.contourLinesHeightInterval;
-
-	dtmSettings.outlineLineStyleIdx = addLineStyle_Internal(dtmSettingsInfo.outlineLineStyleInfo);
-	dtmSettings.contourLineStyleIdx = addLineStyle_Internal(dtmSettingsInfo.contourLineStyleInfo);
+	////dtmSettingsInfo.mode = E_DTM_MODE::HEIGHT_SHADING | E_DTM_MODE::CONTOUR | E_DTM_MODE::OUTLINE;
 
+	dtmSettings.mode = 0u;
+	if (dtmSettingsInfo.heightShadingInfo.enabled)
+	{
+		dtmSettings.mode |= E_DTM_MODE::HEIGHT_SHADING;
 
-	switch (dtmSettingsInfo.heightShadingMode)
+		switch (dtmSettingsInfo.heightShadingInfo.heightShadingMode)
+		{
+		case E_HEIGHT_SHADING_MODE::DISCRETE_VARIABLE_LENGTH_INTERVALS:
+			dtmSettings.intervalLength = std::numeric_limits<float>::infinity();
+			break;
+		case E_HEIGHT_SHADING_MODE::DISCRETE_FIXED_LENGTH_INTERVALS:
+			dtmSettings.intervalLength = dtmSettingsInfo.heightShadingInfo.intervalLength;
+			break;
+		case E_HEIGHT_SHADING_MODE::CONTINOUS_INTERVALS:
+			dtmSettings.intervalLength = 0.0f;
+			break;
+		}
+		dtmSettings.intervalIndexToHeightMultiplier = dtmSettingsInfo.heightShadingInfo.intervalIndexToHeightMultiplier;
+		dtmSettings.isCenteredShading = static_cast<int>(dtmSettingsInfo.heightShadingInfo.isCenteredShading);
+		_NBL_DEBUG_BREAK_IF(!dtmSettingsInfo.heightShadingInfo.fillShaderDTMSettingsHeightColorMap(dtmSettings));
+	}
+	if (dtmSettingsInfo.contourInfo.enabled)
+	{
+		dtmSettings.mode |= E_DTM_MODE::CONTOUR;
+		dtmSettings.contourLinesStartHeight = dtmSettingsInfo.contourInfo.startHeight;
+		dtmSettings.contourLinesEndHeight = dtmSettingsInfo.contourInfo.endHeight;
+		dtmSettings.contourLinesHeightInterval = dtmSettingsInfo.contourInfo.heightInterval;
+		dtmSettings.contourLineStyleIdx = addLineStyle_Internal(dtmSettingsInfo.contourInfo.lineStyleInfo);
+	}
+	if (dtmSettingsInfo.outlineInfo.enabled)
 	{
-	case E_HEIGHT_SHADING_MODE::DISCRETE_VARIABLE_LENGTH_INTERVALS:
-		dtmSettings.intervalLength = std::numeric_limits<float>::infinity();
-		break;
-	case E_HEIGHT_SHADING_MODE::DISCRETE_FIXED_LENGTH_INTERVALS:
-		dtmSettings.intervalLength = dtmSettingsInfo.intervalLength;
-		break;
-	case E_HEIGHT_SHADING_MODE::CONTINOUS_INTERVALS:
-		dtmSettings.intervalLength = 0.0f;
-		break;
+		dtmSettings.mode |= E_DTM_MODE::OUTLINE;
+		dtmSettings.outlineLineStyleIdx = addLineStyle_Internal(dtmSettingsInfo.outlineInfo.lineStyleInfo);
 	}
-	dtmSettings.intervalIndexToHeightMultiplier = dtmSettingsInfo.intervalIndexToHeightMultiplier;
-	dtmSettings.isCenteredShading = static_cast<int>(dtmSettingsInfo.isCenteredShading);
-	_NBL_DEBUG_BREAK_IF(!dtmSettingsInfo.fillShaderDTMSettingsHeightColorMap(dtmSettings));
 
 	for (uint32_t i = 0u; i < resourcesCollection.dtmSettings.vector.size(); ++i)
 	{
diff --git a/62_CAD/DrawResourcesFiller.h b/62_CAD/DrawResourcesFiller.h
index 846046a43..f0618fd27 100644
--- a/62_CAD/DrawResourcesFiller.h
+++ b/62_CAD/DrawResourcesFiller.h
@@ -155,7 +155,12 @@ struct DrawResourcesFiller
 	/// WARNING: make sure this function  is called within begin/endMainObject scope
 	void drawPolyline(const CPolylineBase& polyline, SIntendedSubmitInfo& intendedNextSubmit);
 	
-	void drawTriangleMesh(const CTriangleMesh& mesh, const DTMSettingsInfo& dtmSettings, SIntendedSubmitInfo& intendedNextSubmit);
+	void drawTriangleMesh(
+		const CTriangleMesh& mesh,
+		const DTMHeightShadingInfo& dtmHeightShadingInfo,
+		const DTMContourInfo& dtmContourInfo,
+		const DTMOutlineInfo& dtmOutlineInfo,
+		SIntendedSubmitInfo& intendedNextSubmit);
 
 	// ! Convinience function for Hatch with MSDF Pattern and a solid background
 	void drawHatch(
@@ -223,7 +228,7 @@ struct DrawResourcesFiller
 
 	// Setting Active Resources:
 	void setActiveLineStyle(const LineStyleInfo& lineStyle);
-	void setActiveDTMSettings(const DTMSettingsInfo& dtmSettings);
+	void setActiveDTMSettings(const DTMHeightShadingInfo& heightShadingInfo, const DTMContourInfo& contourInfo, const DTMOutlineInfo& outlineInfo);
 
 	void beginMainObject(MainObjectType type);
 	void endMainObject();
diff --git a/62_CAD/main.cpp b/62_CAD/main.cpp
index c16f17c2d..da7ceb275 100644
--- a/62_CAD/main.cpp
+++ b/62_CAD/main.cpp
@@ -3245,83 +3245,107 @@ class ComputerAidedDesign final : public examples::SimpleWindowedApplication, pu
 			mesh.setVertices(std::move(vertices));
 			mesh.setIndices(std::move(indices));
 
-			DTMSettingsInfo dtmSettingsInfo;
-			
-			dtmSettingsInfo.mode = E_DTM_MODE::HEIGHT_SHADING | E_DTM_MODE::CONTOUR | E_DTM_MODE::OUTLINE;
-
-			dtmSettingsInfo.contourLinesStartHeight = 20;
-			dtmSettingsInfo.contourLinesEndHeight = 90;
-			dtmSettingsInfo.contourLinesHeightInterval = 10;
-
-			LineStyleInfo outlineStyle = {};
-			dtmSettingsInfo.outlineLineStyleInfo.screenSpaceLineWidth = 0.0f;
-			dtmSettingsInfo.outlineLineStyleInfo.worldSpaceLineWidth = 3.0f;
-			dtmSettingsInfo.outlineLineStyleInfo.color = float32_t4(0.0f, 0.39f, 0.0f, 1.0f);
-			//std::array<double, 4> outlineStipplePattern = { 0.0f, -5.0f, 2.0f, -5.0f };
+			// TODO: remove
+			//DTMSettingsInfo dtmSettingsInfo;
+			//
+			////dtmSettingsInfo.mode = E_DTM_MODE::HEIGHT_SHADING | E_DTM_MODE::CONTOUR | E_DTM_MODE::OUTLINE;
+			//dtmSettingsInfo.mode = E_DTM_MODE::CONTOUR;
+			//dtmSettingsInfo.contourLinesStartHeight = 20;
+			//dtmSettingsInfo.contourLinesEndHeight = 90;
+			//dtmSettingsInfo.contourLinesHeightInterval = 10;
+
+			//LineStyleInfo outlineStyle = {};
+			//dtmSettingsInfo.outlineLineStyleInfo.screenSpaceLineWidth = 3.0f;
+			//dtmSettingsInfo.outlineLineStyleInfo.worldSpaceLineWidth = 0.0f;
+			//dtmSettingsInfo.outlineLineStyleInfo.color = float32_t4(0.0f, 0.39f, 0.0f, 1.0f);
+			//std::array<double, 4> outlineStipplePattern = { 0.0f, -5.0f, 20.0f, -5.0f };
 			//dtmSettingsInfo.outlineLineStyleInfo.setStipplePatternData(outlineStipplePattern);
 
-			LineStyleInfo contourStyle = {};
-			dtmSettingsInfo.contourLineStyleInfo.screenSpaceLineWidth = 0.0f;
-			dtmSettingsInfo.contourLineStyleInfo.worldSpaceLineWidth = 1.0f;
-			dtmSettingsInfo.contourLineStyleInfo.color = float32_t4(0.0f, 0.0f, 1.0f, 0.7f);
+			//LineStyleInfo contourStyle = {};
+			//dtmSettingsInfo.contourLineStyleInfo.screenSpaceLineWidth = 0.0f;
+			//dtmSettingsInfo.contourLineStyleInfo.worldSpaceLineWidth = 1.0f;
+			//dtmSettingsInfo.contourLineStyleInfo.color = float32_t4(0.0f, 0.0f, 1.0f, 0.7f);
+			//std::array<double, 4> contourStipplePattern = { 0.0f, -5.0f, 10.0f, -5.0f };
+			//dtmSettingsInfo.contourLineStyleInfo.setStipplePatternData(contourStipplePattern);
+
+			DTMOutlineInfo outlineInfo;
+			outlineInfo.enabled = true;
+			outlineInfo.lineStyleInfo.screenSpaceLineWidth = 3.0f;
+			outlineInfo.lineStyleInfo.worldSpaceLineWidth = 0.0f;
+			outlineInfo.lineStyleInfo.color = float32_t4(0.0f, 0.39f, 0.0f, 1.0f);
+			std::array<double, 4> outlineStipplePattern = { 0.0f, -5.0f, 20.0f, -5.0f };
+			outlineInfo.lineStyleInfo.setStipplePatternData(outlineStipplePattern);
+
+			DTMContourInfo contourInfo;
+			contourInfo.enabled = true;
+			contourInfo.startHeight = 20;
+			contourInfo.endHeight = 90;
+			contourInfo.heightInterval = 10;
+			contourInfo.lineStyleInfo.screenSpaceLineWidth = 0.0f;
+			contourInfo.lineStyleInfo.worldSpaceLineWidth = 1.0f;
+			contourInfo.lineStyleInfo.color = float32_t4(0.0f, 0.0f, 1.0f, 0.7f);
 			std::array<double, 4> contourStipplePattern = { 0.0f, -5.0f, 10.0f, -5.0f };
-			dtmSettingsInfo.contourLineStyleInfo.setStipplePatternData(contourStipplePattern);
-
+			contourInfo.lineStyleInfo.setStipplePatternData(contourStipplePattern);
 
 			// PRESS 1, 2, 3 TO SWITCH HEIGHT SHADING MODE
 			// 1 - DISCRETE_VARIABLE_LENGTH_INTERVALS
 			// 2 - DISCRETE_FIXED_LENGTH_INTERVALS
 			// 3 - CONTINOUS_INTERVALS
 			float animatedAlpha = (std::cos(m_timeElapsed * 0.0005) + 1.0) * 0.5;
+			DTMHeightShadingInfo heightShadingInfo;
+			heightShadingInfo.enabled = true;
 			switch (m_shadingModeExample)
 			{
 				case E_HEIGHT_SHADING_MODE::DISCRETE_VARIABLE_LENGTH_INTERVALS:
 				{
-					dtmSettingsInfo.heightShadingMode = E_HEIGHT_SHADING_MODE::DISCRETE_VARIABLE_LENGTH_INTERVALS;
-					
-					dtmSettingsInfo.addHeightColorMapEntry(-10.0f, float32_t4(0.5f, 1.0f, 1.0f, animatedAlpha));
-					dtmSettingsInfo.addHeightColorMapEntry(20.0f, float32_t4(0.0f, 1.0f, 0.0f, 1.0f));
-					dtmSettingsInfo.addHeightColorMapEntry(25.0f, float32_t4(1.0f, 1.0f, 0.0f, animatedAlpha));
-					dtmSettingsInfo.addHeightColorMapEntry(70.0f, float32_t4(1.0f, 0.0f, 0.0f, 1.0f));
-					dtmSettingsInfo.addHeightColorMapEntry(90.0f, float32_t4(1.0f, 0.0f, 0.0f, 1.0f));
+					heightShadingInfo.heightShadingMode = E_HEIGHT_SHADING_MODE::DISCRETE_VARIABLE_LENGTH_INTERVALS;
+
+					heightShadingInfo.addHeightColorMapEntry(-10.0f, float32_t4(0.5f, 1.0f, 1.0f, animatedAlpha));
+					heightShadingInfo.addHeightColorMapEntry(20.0f, float32_t4(0.0f, 1.0f, 0.0f, 1.0f));
+					heightShadingInfo.addHeightColorMapEntry(25.0f, float32_t4(1.0f, 1.0f, 0.0f, animatedAlpha));
+					heightShadingInfo.addHeightColorMapEntry(70.0f, float32_t4(1.0f, 0.0f, 0.0f, 1.0f));
+					heightShadingInfo.addHeightColorMapEntry(90.0f, float32_t4(1.0f, 0.0f, 0.0f, 1.0f));
+
 					break;
 				}
 				case E_HEIGHT_SHADING_MODE::DISCRETE_FIXED_LENGTH_INTERVALS:
 				{
-					dtmSettingsInfo.intervalLength = 10.0f;
-					dtmSettingsInfo.intervalIndexToHeightMultiplier = dtmSettingsInfo.intervalLength;
-					dtmSettingsInfo.isCenteredShading = false;
-					dtmSettingsInfo.heightShadingMode = E_HEIGHT_SHADING_MODE::DISCRETE_FIXED_LENGTH_INTERVALS;
-					dtmSettingsInfo.addHeightColorMapEntry(0.0f,   float32_t4(0.0f, 0.0f, 1.0f, animatedAlpha));
-					dtmSettingsInfo.addHeightColorMapEntry(25.0f,  float32_t4(0.0f, 1.0f, 1.0f, animatedAlpha));
-					dtmSettingsInfo.addHeightColorMapEntry(50.0f,  float32_t4(0.0f, 1.0f, 0.0f, animatedAlpha));
-					dtmSettingsInfo.addHeightColorMapEntry(75.0f,  float32_t4(1.0f, 1.0f, 0.0f, animatedAlpha));
-					dtmSettingsInfo.addHeightColorMapEntry(100.0f, float32_t4(1.0f, 0.0f, 0.0f, animatedAlpha));
+					heightShadingInfo.intervalLength = 10.0f;
+					heightShadingInfo.intervalIndexToHeightMultiplier = heightShadingInfo.intervalLength;
+					heightShadingInfo.isCenteredShading = false;
+					heightShadingInfo.heightShadingMode = E_HEIGHT_SHADING_MODE::DISCRETE_FIXED_LENGTH_INTERVALS;
+					heightShadingInfo.addHeightColorMapEntry(0.0f, float32_t4(0.0f, 0.0f, 1.0f, animatedAlpha));
+					heightShadingInfo.addHeightColorMapEntry(25.0f, float32_t4(0.0f, 1.0f, 1.0f, animatedAlpha));
+					heightShadingInfo.addHeightColorMapEntry(50.0f, float32_t4(0.0f, 1.0f, 0.0f, animatedAlpha));
+					heightShadingInfo.addHeightColorMapEntry(75.0f, float32_t4(1.0f, 1.0f, 0.0f, animatedAlpha));
+					heightShadingInfo.addHeightColorMapEntry(100.0f, float32_t4(1.0f, 0.0f, 0.0f, animatedAlpha));
+
 					break;
 				}
 				case E_HEIGHT_SHADING_MODE::CONTINOUS_INTERVALS:
 				{
-					dtmSettingsInfo.heightShadingMode = E_HEIGHT_SHADING_MODE::CONTINOUS_INTERVALS;
-					dtmSettingsInfo.addHeightColorMapEntry(0.0f,   float32_t4(0.0f, 0.0f, 1.0f, animatedAlpha));
-					dtmSettingsInfo.addHeightColorMapEntry(25.0f,  float32_t4(0.0f, 1.0f, 1.0f, animatedAlpha));
-					dtmSettingsInfo.addHeightColorMapEntry(50.0f, float32_t4(0.0f, 1.0f, 0.0f, animatedAlpha));
-					dtmSettingsInfo.addHeightColorMapEntry(75.0f, float32_t4(1.0f, 1.0f, 0.0f, animatedAlpha));
-					dtmSettingsInfo.addHeightColorMapEntry(90.0f, float32_t4(1.0f, 0.0f, 0.0f, animatedAlpha));
+					heightShadingInfo.heightShadingMode = E_HEIGHT_SHADING_MODE::CONTINOUS_INTERVALS;
+					heightShadingInfo.addHeightColorMapEntry(0.0f, float32_t4(0.0f, 0.0f, 1.0f, animatedAlpha));
+					heightShadingInfo.addHeightColorMapEntry(25.0f, float32_t4(0.0f, 1.0f, 1.0f, animatedAlpha));
+					heightShadingInfo.addHeightColorMapEntry(50.0f, float32_t4(0.0f, 1.0f, 0.0f, animatedAlpha));
+					heightShadingInfo.addHeightColorMapEntry(75.0f, float32_t4(1.0f, 1.0f, 0.0f, animatedAlpha));
+					heightShadingInfo.addHeightColorMapEntry(90.0f, float32_t4(1.0f, 0.0f, 0.0f, animatedAlpha));
+
 					break;
 				}
 			}
 
-			drawResourcesFiller.drawTriangleMesh(mesh, dtmSettingsInfo, intendedNextSubmit);
+			drawResourcesFiller.drawTriangleMesh(mesh, heightShadingInfo, contourInfo, outlineInfo, intendedNextSubmit);
 
-			dtmSettingsInfo.contourLineStyleInfo.color = float32_t4(1.0f, 0.39f, 0.0f, 1.0f);
-			dtmSettingsInfo.outlineLineStyleInfo.color = float32_t4(0.0f, 0.39f, 1.0f, 1.0f);
+			contourInfo.lineStyleInfo.color = float32_t4(1.0f, 0.39f, 0.0f, 1.0f);
+			outlineInfo.lineStyleInfo.color = float32_t4(0.0f, 0.39f, 1.0f, 1.0f);
 			for (auto& v : mesh.m_vertices)
 			{
 				v.pos += float64_t2(450.0, 200.0);
 				v.height -= 10.0;
 			}
 
-			drawResourcesFiller.drawTriangleMesh(mesh, dtmSettingsInfo, intendedNextSubmit);
+			drawResourcesFiller.drawTriangleMesh(mesh, heightShadingInfo, contourInfo, outlineInfo, intendedNextSubmit);
 		}
 
 		drawResourcesFiller.finalizeAllCopiesToGPU(intendedNextSubmit);
diff --git a/62_CAD/shaders/main_pipeline/vertex_shader.hlsl b/62_CAD/shaders/main_pipeline/vertex_shader.hlsl
index 20c29f16a..f726104b5 100644
--- a/62_CAD/shaders/main_pipeline/vertex_shader.hlsl
+++ b/62_CAD/shaders/main_pipeline/vertex_shader.hlsl
@@ -146,7 +146,6 @@ PSInput main(uint vertexID : SV_VertexID)
                 (clipProjectionData.projectionToNDC[0].x * _static_cast<pfloat64_t>(globals.resolution.x))))
         );
 
-        // TODO: line style of contour line has to be set too!
         DTMSettings dtm = loadDTMSettings(mainObj.dtmSettingsIdx);
         LineStyle outlineStyle = loadLineStyle(dtm.outlineLineStyleIdx);
         LineStyle contourStyle = loadLineStyle(dtm.contourLineStyleIdx);

From fdbf20c7d701063ab93a4c00f575d3aceb319700 Mon Sep 17 00:00:00 2001
From: Przemek <minikers21@gmail.com>
Date: Thu, 17 Apr 2025 11:38:23 +0200
Subject: [PATCH 166/529] Fixes

---
 62_CAD/CTriangleMesh.h         |  12 +---
 62_CAD/DrawResourcesFiller.cpp |  27 +++------
 62_CAD/DrawResourcesFiller.h   |   6 +-
 62_CAD/main.cpp                | 106 +++++++++++++--------------------
 4 files changed, 53 insertions(+), 98 deletions(-)

diff --git a/62_CAD/CTriangleMesh.h b/62_CAD/CTriangleMesh.h
index 0740cf114..16995c28a 100644
--- a/62_CAD/CTriangleMesh.h
+++ b/62_CAD/CTriangleMesh.h
@@ -8,7 +8,6 @@ using namespace nbl;
 
 struct DTMHeightShadingInfo
 {
-	bool enabled;
 	// Height Shading Mode
 	E_HEIGHT_SHADING_MODE heightShadingMode;
 
@@ -68,7 +67,6 @@ struct DTMHeightShadingInfo
 
 struct DTMContourInfo
 {
-	bool enabled;
 	LineStyleInfo lineStyleInfo;
 
 	float startHeight;
@@ -76,17 +74,13 @@ struct DTMContourInfo
 	float heightInterval;
 };
 
-struct DTMOutlineInfo
-{
-	bool enabled;
-	LineStyleInfo lineStyleInfo;
-};
-
 struct DTMSettingsInfo
 {
+	uint32_t mode = 0u;
+
 	DTMHeightShadingInfo heightShadingInfo;
 	DTMContourInfo contourInfo;
-	DTMOutlineInfo outlineInfo;
+	LineStyleInfo outlineStyleInfo;
 };
 
 class CTriangleMesh final
diff --git a/62_CAD/DrawResourcesFiller.cpp b/62_CAD/DrawResourcesFiller.cpp
index 4085b4d30..a255bc700 100644
--- a/62_CAD/DrawResourcesFiller.cpp
+++ b/62_CAD/DrawResourcesFiller.cpp
@@ -136,14 +136,12 @@ void DrawResourcesFiller::drawPolyline(const CPolylineBase& polyline, SIntendedS
 
 void DrawResourcesFiller::drawTriangleMesh(
 	const CTriangleMesh& mesh,
-	const DTMHeightShadingInfo& dtmHeightShadingInfo,
-	const DTMContourInfo& dtmContourInfo,
-	const DTMOutlineInfo& dtmOutlineInfo,
+	const DTMSettingsInfo& dtmSettingsInfo,
 	SIntendedSubmitInfo& intendedNextSubmit)
 {
 	flushDrawObjects(); // flushes draw call construction of any possible draw objects before dtm, because currently we're sepaerating dtm draw calls from drawObj draw calls
 
-	setActiveDTMSettings(dtmHeightShadingInfo, dtmContourInfo, dtmOutlineInfo); // TODO !!!!
+	setActiveDTMSettings(dtmSettingsInfo); // TODO !!!!
 	beginMainObject(MainObjectType::DTM);
 
 	DrawCallData drawCallData = {}; 
@@ -357,13 +355,8 @@ void DrawResourcesFiller::setActiveLineStyle(const LineStyleInfo& lineStyle)
 	activeLineStyleIndex = InvalidStyleIdx;
 }
 
-void DrawResourcesFiller::setActiveDTMSettings(const DTMHeightShadingInfo& heightShadingInfo, const DTMContourInfo& contourInfo, const DTMOutlineInfo& outlineInfo)
+void DrawResourcesFiller::setActiveDTMSettings(const DTMSettingsInfo& dtmSettingsInfo)
 {
-	DTMSettingsInfo dtmSettingsInfo;
-	dtmSettingsInfo.heightShadingInfo = heightShadingInfo;
-	dtmSettingsInfo.contourInfo = contourInfo;
-	dtmSettingsInfo.outlineInfo = outlineInfo;
-
 	activeDTMSettings = dtmSettingsInfo;
 	activeDTMSettingsIndex = InvalidDTMSettingsIdx;
 }
@@ -645,11 +638,9 @@ uint32_t DrawResourcesFiller::addDTMSettings_Internal(const DTMSettingsInfo& dtm
 
 	////dtmSettingsInfo.mode = E_DTM_MODE::HEIGHT_SHADING | E_DTM_MODE::CONTOUR | E_DTM_MODE::OUTLINE;
 
-	dtmSettings.mode = 0u;
-	if (dtmSettingsInfo.heightShadingInfo.enabled)
+	dtmSettings.mode = dtmSettingsInfo.mode;
+	if (dtmSettings.mode & E_DTM_MODE::HEIGHT_SHADING)
 	{
-		dtmSettings.mode |= E_DTM_MODE::HEIGHT_SHADING;
-
 		switch (dtmSettingsInfo.heightShadingInfo.heightShadingMode)
 		{
 		case E_HEIGHT_SHADING_MODE::DISCRETE_VARIABLE_LENGTH_INTERVALS:
@@ -666,18 +657,16 @@ uint32_t DrawResourcesFiller::addDTMSettings_Internal(const DTMSettingsInfo& dtm
 		dtmSettings.isCenteredShading = static_cast<int>(dtmSettingsInfo.heightShadingInfo.isCenteredShading);
 		_NBL_DEBUG_BREAK_IF(!dtmSettingsInfo.heightShadingInfo.fillShaderDTMSettingsHeightColorMap(dtmSettings));
 	}
-	if (dtmSettingsInfo.contourInfo.enabled)
+	if (dtmSettings.mode & E_DTM_MODE::CONTOUR)
 	{
-		dtmSettings.mode |= E_DTM_MODE::CONTOUR;
 		dtmSettings.contourLinesStartHeight = dtmSettingsInfo.contourInfo.startHeight;
 		dtmSettings.contourLinesEndHeight = dtmSettingsInfo.contourInfo.endHeight;
 		dtmSettings.contourLinesHeightInterval = dtmSettingsInfo.contourInfo.heightInterval;
 		dtmSettings.contourLineStyleIdx = addLineStyle_Internal(dtmSettingsInfo.contourInfo.lineStyleInfo);
 	}
-	if (dtmSettingsInfo.outlineInfo.enabled)
+	if (dtmSettings.mode & E_DTM_MODE::OUTLINE)
 	{
-		dtmSettings.mode |= E_DTM_MODE::OUTLINE;
-		dtmSettings.outlineLineStyleIdx = addLineStyle_Internal(dtmSettingsInfo.outlineInfo.lineStyleInfo);
+		dtmSettings.outlineLineStyleIdx = addLineStyle_Internal(dtmSettingsInfo.outlineStyleInfo);
 	}
 
 	for (uint32_t i = 0u; i < resourcesCollection.dtmSettings.vector.size(); ++i)
diff --git a/62_CAD/DrawResourcesFiller.h b/62_CAD/DrawResourcesFiller.h
index f0618fd27..196ba6885 100644
--- a/62_CAD/DrawResourcesFiller.h
+++ b/62_CAD/DrawResourcesFiller.h
@@ -157,9 +157,7 @@ struct DrawResourcesFiller
 	
 	void drawTriangleMesh(
 		const CTriangleMesh& mesh,
-		const DTMHeightShadingInfo& dtmHeightShadingInfo,
-		const DTMContourInfo& dtmContourInfo,
-		const DTMOutlineInfo& dtmOutlineInfo,
+		const DTMSettingsInfo& dtmSettingsInfo,
 		SIntendedSubmitInfo& intendedNextSubmit);
 
 	// ! Convinience function for Hatch with MSDF Pattern and a solid background
@@ -228,7 +226,7 @@ struct DrawResourcesFiller
 
 	// Setting Active Resources:
 	void setActiveLineStyle(const LineStyleInfo& lineStyle);
-	void setActiveDTMSettings(const DTMHeightShadingInfo& heightShadingInfo, const DTMContourInfo& contourInfo, const DTMOutlineInfo& outlineInfo);
+	void setActiveDTMSettings(const DTMSettingsInfo& dtmSettingsInfo);
 
 	void beginMainObject(MainObjectType type);
 	void endMainObject();
diff --git a/62_CAD/main.cpp b/62_CAD/main.cpp
index da7ceb275..6d3a2b431 100644
--- a/62_CAD/main.cpp
+++ b/62_CAD/main.cpp
@@ -3245,107 +3245,81 @@ class ComputerAidedDesign final : public examples::SimpleWindowedApplication, pu
 			mesh.setVertices(std::move(vertices));
 			mesh.setIndices(std::move(indices));
 
-			// TODO: remove
-			//DTMSettingsInfo dtmSettingsInfo;
-			//
-			////dtmSettingsInfo.mode = E_DTM_MODE::HEIGHT_SHADING | E_DTM_MODE::CONTOUR | E_DTM_MODE::OUTLINE;
-			//dtmSettingsInfo.mode = E_DTM_MODE::CONTOUR;
-			//dtmSettingsInfo.contourLinesStartHeight = 20;
-			//dtmSettingsInfo.contourLinesEndHeight = 90;
-			//dtmSettingsInfo.contourLinesHeightInterval = 10;
-
-			//LineStyleInfo outlineStyle = {};
-			//dtmSettingsInfo.outlineLineStyleInfo.screenSpaceLineWidth = 3.0f;
-			//dtmSettingsInfo.outlineLineStyleInfo.worldSpaceLineWidth = 0.0f;
-			//dtmSettingsInfo.outlineLineStyleInfo.color = float32_t4(0.0f, 0.39f, 0.0f, 1.0f);
-			//std::array<double, 4> outlineStipplePattern = { 0.0f, -5.0f, 20.0f, -5.0f };
-			//dtmSettingsInfo.outlineLineStyleInfo.setStipplePatternData(outlineStipplePattern);
-
-			//LineStyleInfo contourStyle = {};
-			//dtmSettingsInfo.contourLineStyleInfo.screenSpaceLineWidth = 0.0f;
-			//dtmSettingsInfo.contourLineStyleInfo.worldSpaceLineWidth = 1.0f;
-			//dtmSettingsInfo.contourLineStyleInfo.color = float32_t4(0.0f, 0.0f, 1.0f, 0.7f);
-			//std::array<double, 4> contourStipplePattern = { 0.0f, -5.0f, 10.0f, -5.0f };
-			//dtmSettingsInfo.contourLineStyleInfo.setStipplePatternData(contourStipplePattern);
-
-			DTMOutlineInfo outlineInfo;
-			outlineInfo.enabled = true;
-			outlineInfo.lineStyleInfo.screenSpaceLineWidth = 3.0f;
-			outlineInfo.lineStyleInfo.worldSpaceLineWidth = 0.0f;
-			outlineInfo.lineStyleInfo.color = float32_t4(0.0f, 0.39f, 0.0f, 1.0f);
+			DTMSettingsInfo dtmInfo;
+			dtmInfo.mode = E_DTM_MODE::HEIGHT_SHADING | E_DTM_MODE::CONTOUR | E_DTM_MODE::OUTLINE;
+
+			dtmInfo.outlineStyleInfo.screenSpaceLineWidth = 3.0f;
+			dtmInfo.outlineStyleInfo.worldSpaceLineWidth = 0.0f;
+			dtmInfo.outlineStyleInfo.color = float32_t4(0.0f, 0.39f, 0.0f, 1.0f);
 			std::array<double, 4> outlineStipplePattern = { 0.0f, -5.0f, 20.0f, -5.0f };
-			outlineInfo.lineStyleInfo.setStipplePatternData(outlineStipplePattern);
-
-			DTMContourInfo contourInfo;
-			contourInfo.enabled = true;
-			contourInfo.startHeight = 20;
-			contourInfo.endHeight = 90;
-			contourInfo.heightInterval = 10;
-			contourInfo.lineStyleInfo.screenSpaceLineWidth = 0.0f;
-			contourInfo.lineStyleInfo.worldSpaceLineWidth = 1.0f;
-			contourInfo.lineStyleInfo.color = float32_t4(0.0f, 0.0f, 1.0f, 0.7f);
+			dtmInfo.outlineStyleInfo.setStipplePatternData(outlineStipplePattern);
+
+			dtmInfo.contourInfo.startHeight = 20;
+			dtmInfo.contourInfo.endHeight = 90;
+			dtmInfo.contourInfo.heightInterval = 10;
+			dtmInfo.contourInfo.lineStyleInfo.screenSpaceLineWidth = 0.0f;
+			dtmInfo.contourInfo.lineStyleInfo.worldSpaceLineWidth = 1.0f;
+			dtmInfo.contourInfo.lineStyleInfo.color = float32_t4(0.0f, 0.0f, 1.0f, 0.7f);
 			std::array<double, 4> contourStipplePattern = { 0.0f, -5.0f, 10.0f, -5.0f };
-			contourInfo.lineStyleInfo.setStipplePatternData(contourStipplePattern);
+			dtmInfo.contourInfo.lineStyleInfo.setStipplePatternData(contourStipplePattern);
 
 			// PRESS 1, 2, 3 TO SWITCH HEIGHT SHADING MODE
 			// 1 - DISCRETE_VARIABLE_LENGTH_INTERVALS
 			// 2 - DISCRETE_FIXED_LENGTH_INTERVALS
 			// 3 - CONTINOUS_INTERVALS
 			float animatedAlpha = (std::cos(m_timeElapsed * 0.0005) + 1.0) * 0.5;
-			DTMHeightShadingInfo heightShadingInfo;
-			heightShadingInfo.enabled = true;
 			switch (m_shadingModeExample)
 			{
 				case E_HEIGHT_SHADING_MODE::DISCRETE_VARIABLE_LENGTH_INTERVALS:
 				{
-					heightShadingInfo.heightShadingMode = E_HEIGHT_SHADING_MODE::DISCRETE_VARIABLE_LENGTH_INTERVALS;
+					dtmInfo.heightShadingInfo.heightShadingMode = E_HEIGHT_SHADING_MODE::DISCRETE_VARIABLE_LENGTH_INTERVALS;
 
-					heightShadingInfo.addHeightColorMapEntry(-10.0f, float32_t4(0.5f, 1.0f, 1.0f, animatedAlpha));
-					heightShadingInfo.addHeightColorMapEntry(20.0f, float32_t4(0.0f, 1.0f, 0.0f, 1.0f));
-					heightShadingInfo.addHeightColorMapEntry(25.0f, float32_t4(1.0f, 1.0f, 0.0f, animatedAlpha));
-					heightShadingInfo.addHeightColorMapEntry(70.0f, float32_t4(1.0f, 0.0f, 0.0f, 1.0f));
-					heightShadingInfo.addHeightColorMapEntry(90.0f, float32_t4(1.0f, 0.0f, 0.0f, 1.0f));
+					dtmInfo.heightShadingInfo.addHeightColorMapEntry(-10.0f, float32_t4(0.5f, 1.0f, 1.0f, animatedAlpha));
+					dtmInfo.heightShadingInfo.addHeightColorMapEntry(20.0f, float32_t4(0.0f, 1.0f, 0.0f, 1.0f));
+					dtmInfo.heightShadingInfo.addHeightColorMapEntry(25.0f, float32_t4(1.0f, 1.0f, 0.0f, animatedAlpha));
+					dtmInfo.heightShadingInfo.addHeightColorMapEntry(70.0f, float32_t4(1.0f, 0.0f, 0.0f, 1.0f));
+					dtmInfo.heightShadingInfo.addHeightColorMapEntry(90.0f, float32_t4(1.0f, 0.0f, 0.0f, 1.0f));
 
 					break;
 				}
 				case E_HEIGHT_SHADING_MODE::DISCRETE_FIXED_LENGTH_INTERVALS:
 				{
-					heightShadingInfo.intervalLength = 10.0f;
-					heightShadingInfo.intervalIndexToHeightMultiplier = heightShadingInfo.intervalLength;
-					heightShadingInfo.isCenteredShading = false;
-					heightShadingInfo.heightShadingMode = E_HEIGHT_SHADING_MODE::DISCRETE_FIXED_LENGTH_INTERVALS;
-					heightShadingInfo.addHeightColorMapEntry(0.0f, float32_t4(0.0f, 0.0f, 1.0f, animatedAlpha));
-					heightShadingInfo.addHeightColorMapEntry(25.0f, float32_t4(0.0f, 1.0f, 1.0f, animatedAlpha));
-					heightShadingInfo.addHeightColorMapEntry(50.0f, float32_t4(0.0f, 1.0f, 0.0f, animatedAlpha));
-					heightShadingInfo.addHeightColorMapEntry(75.0f, float32_t4(1.0f, 1.0f, 0.0f, animatedAlpha));
-					heightShadingInfo.addHeightColorMapEntry(100.0f, float32_t4(1.0f, 0.0f, 0.0f, animatedAlpha));
+					dtmInfo.heightShadingInfo.intervalLength = 10.0f;
+					dtmInfo.heightShadingInfo.intervalIndexToHeightMultiplier = dtmInfo.heightShadingInfo.intervalLength;
+					dtmInfo.heightShadingInfo.isCenteredShading = false;
+					dtmInfo.heightShadingInfo.heightShadingMode = E_HEIGHT_SHADING_MODE::DISCRETE_FIXED_LENGTH_INTERVALS;
+					dtmInfo.heightShadingInfo.addHeightColorMapEntry(0.0f, float32_t4(0.0f, 0.0f, 1.0f, animatedAlpha));
+					dtmInfo.heightShadingInfo.addHeightColorMapEntry(25.0f, float32_t4(0.0f, 1.0f, 1.0f, animatedAlpha));
+					dtmInfo.heightShadingInfo.addHeightColorMapEntry(50.0f, float32_t4(0.0f, 1.0f, 0.0f, animatedAlpha));
+					dtmInfo.heightShadingInfo.addHeightColorMapEntry(75.0f, float32_t4(1.0f, 1.0f, 0.0f, animatedAlpha));
+					dtmInfo.heightShadingInfo.addHeightColorMapEntry(100.0f, float32_t4(1.0f, 0.0f, 0.0f, animatedAlpha));
 
 					break;
 				}
 				case E_HEIGHT_SHADING_MODE::CONTINOUS_INTERVALS:
 				{
-					heightShadingInfo.heightShadingMode = E_HEIGHT_SHADING_MODE::CONTINOUS_INTERVALS;
-					heightShadingInfo.addHeightColorMapEntry(0.0f, float32_t4(0.0f, 0.0f, 1.0f, animatedAlpha));
-					heightShadingInfo.addHeightColorMapEntry(25.0f, float32_t4(0.0f, 1.0f, 1.0f, animatedAlpha));
-					heightShadingInfo.addHeightColorMapEntry(50.0f, float32_t4(0.0f, 1.0f, 0.0f, animatedAlpha));
-					heightShadingInfo.addHeightColorMapEntry(75.0f, float32_t4(1.0f, 1.0f, 0.0f, animatedAlpha));
-					heightShadingInfo.addHeightColorMapEntry(90.0f, float32_t4(1.0f, 0.0f, 0.0f, animatedAlpha));
+					dtmInfo.heightShadingInfo.heightShadingMode = E_HEIGHT_SHADING_MODE::CONTINOUS_INTERVALS;
+					dtmInfo.heightShadingInfo.addHeightColorMapEntry(0.0f, float32_t4(0.0f, 0.0f, 1.0f, animatedAlpha));
+					dtmInfo.heightShadingInfo.addHeightColorMapEntry(25.0f, float32_t4(0.0f, 1.0f, 1.0f, animatedAlpha));
+					dtmInfo.heightShadingInfo.addHeightColorMapEntry(50.0f, float32_t4(0.0f, 1.0f, 0.0f, animatedAlpha));
+					dtmInfo.heightShadingInfo.addHeightColorMapEntry(75.0f, float32_t4(1.0f, 1.0f, 0.0f, animatedAlpha));
+					dtmInfo.heightShadingInfo.addHeightColorMapEntry(90.0f, float32_t4(1.0f, 0.0f, 0.0f, animatedAlpha));
 
 					break;
 				}
 			}
 
-			drawResourcesFiller.drawTriangleMesh(mesh, heightShadingInfo, contourInfo, outlineInfo, intendedNextSubmit);
+			drawResourcesFiller.drawTriangleMesh(mesh, dtmInfo, intendedNextSubmit);
 
-			contourInfo.lineStyleInfo.color = float32_t4(1.0f, 0.39f, 0.0f, 1.0f);
-			outlineInfo.lineStyleInfo.color = float32_t4(0.0f, 0.39f, 1.0f, 1.0f);
+			dtmInfo.contourInfo.lineStyleInfo.color = float32_t4(1.0f, 0.39f, 0.0f, 1.0f);
+			dtmInfo.outlineStyleInfo.color = float32_t4(0.0f, 0.39f, 1.0f, 1.0f);
 			for (auto& v : mesh.m_vertices)
 			{
 				v.pos += float64_t2(450.0, 200.0);
 				v.height -= 10.0;
 			}
 
-			drawResourcesFiller.drawTriangleMesh(mesh, heightShadingInfo, contourInfo, outlineInfo, intendedNextSubmit);
+			drawResourcesFiller.drawTriangleMesh(mesh, dtmInfo, intendedNextSubmit);
 		}
 
 		drawResourcesFiller.finalizeAllCopiesToGPU(intendedNextSubmit);

From 08b2442e2f01ff3928a601f02bc1b2189add6ef5 Mon Sep 17 00:00:00 2001
From: devsh <devsh@devsh.eu>
Date: Fri, 18 Apr 2025 22:40:55 +0200
Subject: [PATCH 167/529] make example 07 run again after slight updates to API

---
 07_StagingAndMultipleQueues/main.cpp | 7 ++++---
 1 file changed, 4 insertions(+), 3 deletions(-)

diff --git a/07_StagingAndMultipleQueues/main.cpp b/07_StagingAndMultipleQueues/main.cpp
index 658a28a35..875053d60 100644
--- a/07_StagingAndMultipleQueues/main.cpp
+++ b/07_StagingAndMultipleQueues/main.cpp
@@ -432,15 +432,16 @@ class StagingAndMultipleQueuesApp final : public application_templates::BasicMul
 			submitInfo[0].waitSemaphores = waitSemaphoreSubmitInfo;
 			// there's no save to wait on, or need to prevent signal-after-submit because Renderdoc freezes because it
 			// starts capturing immediately upon a submit and can't defer a capture till semaphores signal.
-			if (imageToProcessId<SUBMITS_IN_FLIGHT || m_api->isRunningInRenderdoc())
+			const bool isRunningInRenderdoc = m_api->runningInGraphicsDebugger()==IAPIConnection::EDebuggerType::Renderdoc;
+			if (imageToProcessId<SUBMITS_IN_FLIGHT || isRunningInRenderdoc)
 				submitInfo[0].waitSemaphores = {waitSemaphoreSubmitInfo,1};
-			if (m_api->isRunningInRenderdoc() && imageToProcessId>=SUBMITS_IN_FLIGHT)
+			if (isRunningInRenderdoc && imageToProcessId>=SUBMITS_IN_FLIGHT)
 			for (auto old = histogramsSaved.load(); old < histogramSaveWaitSemaphoreValue; old = histogramsSaved.load())
 				histogramsSaved.wait(old);
 			// Some Devices like all of the Intel GPUs do not have enough queues for us to allocate different queues to compute and transfers,
 			// so our `BasicMultiQueueApplication` will "alias" a single queue to both usages. Normally you don't need to care, but here we're
 			// attempting to do "out-of-order" "submit-before-signal" so we need to "hold back" submissions if the queues are aliased!
-			if (getTransferUpQueue()==computeQueue || m_api->isRunningInRenderdoc())
+			if (getTransferUpQueue()==computeQueue || isRunningInRenderdoc)
 			for (auto old = transfersSubmitted.load(); old <= imageToProcessId; old = transfersSubmitted.load())
 				transfersSubmitted.wait(old);
 			computeQueue->submit(submitInfo);

From 32ec0af7bd1a1e6be1020b5790ebba864b041a9a Mon Sep 17 00:00:00 2001
From: devsh <devsh@devsh.eu>
Date: Sat, 19 Apr 2025 12:43:38 +0200
Subject: [PATCH 168/529] FFT examples now fully deprecate the old, and move
 Clustered Rendering to old

---
 CMakeLists.txt                                |   8 +-
 old_to_refactor/49_ComputeFFT/CMakeLists.txt  |  11 -
 .../49_ComputeFFT/config.json.template        |  28 -
 .../49_ComputeFFT/extra_parameters.glsl       |  16 -
 .../49_ComputeFFT/fft_convolve_ifft.comp      | 109 ---
 .../49_ComputeFFT/image_first_fft.comp        |  56 --
 old_to_refactor/49_ComputeFFT/last_fft.comp   |  72 --
 old_to_refactor/49_ComputeFFT/main.cpp        | 753 ------------------
 .../49_ComputeFFT/normalization.comp          |  34 -
 old_to_refactor/49_ComputeFFT/pipeline.groovy |  50 --
 .../60_ClusteredRendering}/CMakeLists.txt     |   0
 .../config.json.template                      |   0
 .../60_ClusteredRendering}/main.cpp           |   0
 .../60_ClusteredRendering}/pipeline.groovy    |   0
 14 files changed, 1 insertion(+), 1136 deletions(-)
 delete mode 100644 old_to_refactor/49_ComputeFFT/CMakeLists.txt
 delete mode 100644 old_to_refactor/49_ComputeFFT/config.json.template
 delete mode 100644 old_to_refactor/49_ComputeFFT/extra_parameters.glsl
 delete mode 100644 old_to_refactor/49_ComputeFFT/fft_convolve_ifft.comp
 delete mode 100644 old_to_refactor/49_ComputeFFT/image_first_fft.comp
 delete mode 100644 old_to_refactor/49_ComputeFFT/last_fft.comp
 delete mode 100644 old_to_refactor/49_ComputeFFT/main.cpp
 delete mode 100644 old_to_refactor/49_ComputeFFT/normalization.comp
 delete mode 100644 old_to_refactor/49_ComputeFFT/pipeline.groovy
 rename {60_ClusteredRendering => old_to_refactor/60_ClusteredRendering}/CMakeLists.txt (100%)
 rename {60_ClusteredRendering => old_to_refactor/60_ClusteredRendering}/config.json.template (100%)
 rename {60_ClusteredRendering => old_to_refactor/60_ClusteredRendering}/main.cpp (100%)
 rename {60_ClusteredRendering => old_to_refactor/60_ClusteredRendering}/pipeline.groovy (100%)

diff --git a/CMakeLists.txt b/CMakeLists.txt
index f358d962d..24fb7fad8 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -52,7 +52,6 @@ if(NBL_BUILD_EXAMPLES)
 
 	# Waiting for a refactor
 	#add_subdirectory(27_PLYSTLDemo EXCLUDE_FROM_ALL)
-	#add_subdirectory(29_SpecializationConstants EXCLUDE_FROM_ALL)
 	#add_subdirectory(33_Draw3DLine EXCLUDE_FROM_ALL)
 
 	# Unit Test Examples
@@ -75,16 +74,11 @@ if(NBL_BUILD_EXAMPLES)
 	#	add_subdirectory(39_DenoiserTonemapper EXCLUDE_FROM_ALL)
 	# endif()
 
-	add_subdirectory(42_FragmentShaderPathTracer EXCLUDE_FROM_ALL)
 	#add_subdirectory(43_SumAndCDFFilters EXCLUDE_FROM_ALL)
-	#add_subdirectory(45_BRDFEvalTest EXCLUDE_FROM_ALL)
-	#add_subdirectory(46_SamplingValidation EXCLUDE_FROM_ALL)
 	add_subdirectory(47_DerivMapTest EXCLUDE_FROM_ALL)
 	add_subdirectory(53_ComputeShaders EXCLUDE_FROM_ALL)
 	add_subdirectory(54_Transformations EXCLUDE_FROM_ALL)
 	add_subdirectory(55_RGB18E7S3 EXCLUDE_FROM_ALL)
-	add_subdirectory(56_RayQuery EXCLUDE_FROM_ALL)
-	add_subdirectory(60_ClusteredRendering EXCLUDE_FROM_ALL)
 	add_subdirectory(61_UI EXCLUDE_FROM_ALL)
 	add_subdirectory(62_CAD EXCLUDE_FROM_ALL)
 	add_subdirectory(62_SchusslerTest EXCLUDE_FROM_ALL)
@@ -95,7 +89,7 @@ if(NBL_BUILD_EXAMPLES)
 	add_subdirectory(67_RayQueryGeometry EXCLUDE_FROM_ALL)
 	add_subdirectory(68_JpegLoading EXCLUDE_FROM_ALL)
 
-  add_subdirectory(70_FLIPFluids EXCLUDE_FROM_ALL)
+  	add_subdirectory(70_FLIPFluids EXCLUDE_FROM_ALL)
 	add_subdirectory(71_RayTracingPipeline EXCLUDE_FROM_ALL)
 
 	NBL_HOOK_COMMON_API("${NBL_COMMON_API_TARGETS}")
diff --git a/old_to_refactor/49_ComputeFFT/CMakeLists.txt b/old_to_refactor/49_ComputeFFT/CMakeLists.txt
deleted file mode 100644
index b591db9e9..000000000
--- a/old_to_refactor/49_ComputeFFT/CMakeLists.txt
+++ /dev/null
@@ -1,11 +0,0 @@
-
-include(common RESULT_VARIABLE RES)
-if(NOT RES)
-	message(FATAL_ERROR "common.cmake not found. Should be in {repo_root}/cmake directory")
-endif()
-
-set(EXAMPLE_SOURCES
-	../../src/nbl/ext/FFT/FFT.cpp
-)
-
-nbl_create_executable_project("${EXAMPLE_SOURCES}" "" "" "" "${NBL_EXECUTABLE_PROJECT_CREATION_PCH_TARGET}")
\ No newline at end of file
diff --git a/old_to_refactor/49_ComputeFFT/config.json.template b/old_to_refactor/49_ComputeFFT/config.json.template
deleted file mode 100644
index f961745c1..000000000
--- a/old_to_refactor/49_ComputeFFT/config.json.template
+++ /dev/null
@@ -1,28 +0,0 @@
-{
-  "enableParallelBuild": true,
-  "threadsPerBuildProcess" : 2,
-  "isExecuted": false,
-  "scriptPath": "",
-  "cmake": {
-    "configurations": [ "Release", "Debug", "RelWithDebInfo" ],
-    "buildModes": [],
-    "requiredOptions": []
-  }, 
-  "profiles": [
-    {
-      "backend": "vulkan",
-      "platform": "windows",
-      "buildModes": [],
-      "runConfiguration": "Release",
-      "gpuArchitectures": []
-    }
-  ],
-  "dependencies": [],
-  "data": [
-    {
-      "dependencies": [],
-      "command": [""],
-      "outputs": []
-    }
-  ]
-}
\ No newline at end of file
diff --git a/old_to_refactor/49_ComputeFFT/extra_parameters.glsl b/old_to_refactor/49_ComputeFFT/extra_parameters.glsl
deleted file mode 100644
index 032f4c363..000000000
--- a/old_to_refactor/49_ComputeFFT/extra_parameters.glsl
+++ /dev/null
@@ -1,16 +0,0 @@
-// Copyright (C) 2018-2020 - DevSH Graphics Programming Sp. z O.O.
-// This file is part of the "Nabla Engine".
-// For conditions of distribution and use, see copyright notice in nabla.h
-
-#include "nbl/builtin/glsl/ext/FFT/parameters_struct.glsl"
-struct convolve_parameters_t
-{
-    nbl_glsl_ext_FFT_Parameters_t fft;
-    vec2    kernel_half_pixel_size;
-};
-
-struct image_store_parameters_t
-{
-    nbl_glsl_ext_FFT_Parameters_t fft;
-    ivec2   unpad_offset;
-};
\ No newline at end of file
diff --git a/old_to_refactor/49_ComputeFFT/fft_convolve_ifft.comp b/old_to_refactor/49_ComputeFFT/fft_convolve_ifft.comp
deleted file mode 100644
index 18702fe81..000000000
--- a/old_to_refactor/49_ComputeFFT/fft_convolve_ifft.comp
+++ /dev/null
@@ -1,109 +0,0 @@
-layout(local_size_x=_NBL_GLSL_WORKGROUP_SIZE_, local_size_y=1, local_size_z=1) in;
-
-layout(set=0, binding=2) uniform sampler2D NormalizedKernel[3];
-
-/* TODO: Hardcode the parameters for the frequent FFTs
-uvec3 nbl_glsl_ext_FFT_Parameters_t_getDimensions()
-{
-	return uvec3(1280u,1024u,1u);
-}
-bool nbl_glsl_ext_FFT_Parameters_t_getIsInverse()
-{
-	return false;
-}
-uint nbl_glsl_ext_FFT_Parameters_t_getDirection()
-{
-	return 0u;
-}
-uint nbl_glsl_ext_FFT_Parameters_t_getMaxChannel()
-{
-    return 2u;
-}
-uint nbl_glsl_ext_FFT_Parameters_t_getLog2FFTSize()
-{
-    return 11u;
-}
-uint nbl_glsl_ext_FFT_Parameters_t_getPaddingType()
-{
-    return 3u; // _NBL_GLSL_EXT_FFT_PAD_MIRROR_;
-}
-uvec4 nbl_glsl_ext_FFT_Parameters_t_getInputStrides()
-{
-	return uvec4(1024u,1u,0u,1024u*1280u);
-}
-uvec4 nbl_glsl_ext_FFT_Parameters_t_getOutputStrides()
-{
-	return uvec4(1u,1280u,0u,1280u*1024u);
-}
-#define _NBL_GLSL_EXT_FFT_PARAMETERS_METHODS_DECLARED_
-*/
-
-#include "extra_parameters.glsl"
-layout(push_constant) uniform PushConstants
-{
-	convolve_parameters_t params;
-} pc;
-#define _NBL_GLSL_EXT_FFT_PUSH_CONSTANTS_DEFINED_
-
-nbl_glsl_ext_FFT_Parameters_t nbl_glsl_ext_FFT_getParameters()
-{
-	return pc.params.fft;
-}
-#define _NBL_GLSL_EXT_FFT_GET_PARAMETERS_DEFINED_
-
-#define _NBL_GLSL_EXT_FFT_MAIN_DEFINED_
-#include "nbl/builtin/glsl/ext/FFT/default_compute_fft.comp"
-
-void convolve(in uint item_per_thread_count, in uint ch) 
-{
-	// TODO: decouple kernel size from image size (can't get the math to work in my head)
-	for(uint t=0u; t<item_per_thread_count; t++)
-	{
-		const uint tid = _NBL_GLSL_WORKGROUP_SIZE_*t+gl_LocalInvocationIndex;
-
-		nbl_glsl_complex sourceSpectrum = nbl_glsl_ext_FFT_impl_values[t];
-		
-		//
-		const uvec3 coords = nbl_glsl_ext_FFT_getCoordinates(tid);
-        vec2 uv = vec2(bitfieldReverse(coords.xy))/vec2(4294967296.f)+pc.params.kernel_half_pixel_size;
-		//
-		nbl_glsl_complex convSpectrum = textureLod(NormalizedKernel[ch],uv,0).xy;
-		nbl_glsl_ext_FFT_impl_values[t] = nbl_glsl_complex_mul(sourceSpectrum,convSpectrum);
-	}
-}
-
-void main()
-{
-    const uint log2FFTSize = nbl_glsl_ext_FFT_Parameters_t_getLog2FFTSize();
-    const uint item_per_thread_count = 0x1u<<(log2FFTSize-_NBL_GLSL_WORKGROUP_SIZE_LOG2_);
-	
-	for(uint ch=0u; ch<=nbl_glsl_ext_FFT_Parameters_t_getMaxChannel(); ++ch)
-	{
-		// Load Values into local memory
-		for(uint t=0u; t<item_per_thread_count; t++)
-		{
-			const uint tid = (t<<_NBL_GLSL_WORKGROUP_SIZE_LOG2_)|gl_LocalInvocationIndex;
-			const uint trueDim = nbl_glsl_ext_FFT_Parameters_t_getDimensions()[nbl_glsl_ext_FFT_Parameters_t_getDirection()];
-			nbl_glsl_ext_FFT_impl_values[t] = nbl_glsl_ext_FFT_getPaddedData(nbl_glsl_ext_FFT_getPaddedCoordinates(tid,log2FFTSize,trueDim),ch);
-		}
-		nbl_glsl_ext_FFT_preloaded(false,log2FFTSize);
-		barrier();
-
-		convolve(item_per_thread_count,ch);
-	
-		barrier();
-		nbl_glsl_ext_FFT_preloaded(true,log2FFTSize);
-		// write out to main memory
-		// we override the setting to happen with padded coordinates because we dont want the padding to be written at all
-		for(uint t=0u; t<item_per_thread_count; t++)
-		{
-			const uint tid = (t<<_NBL_GLSL_WORKGROUP_SIZE_LOG2_)|gl_LocalInvocationIndex;
-			const uint trueDim = nbl_glsl_ext_FFT_Parameters_t_getDimensions()[nbl_glsl_ext_FFT_Parameters_t_getDirection()];
-			// we also prevent certain threads from writing the memory out
-			const uint padding = ((0x1u<<log2FFTSize)-trueDim)>>1u;
-			const uint shifted = tid-padding;
-			if (tid>=padding && shifted<trueDim)
-				nbl_glsl_ext_FFT_setData(ivec3(nbl_glsl_ext_FFT_getCoordinates(shifted)),ch,nbl_glsl_ext_FFT_impl_values[t]);
-		}
-	}
-}
\ No newline at end of file
diff --git a/old_to_refactor/49_ComputeFFT/image_first_fft.comp b/old_to_refactor/49_ComputeFFT/image_first_fft.comp
deleted file mode 100644
index d849669ca..000000000
--- a/old_to_refactor/49_ComputeFFT/image_first_fft.comp
+++ /dev/null
@@ -1,56 +0,0 @@
-layout(local_size_x=_NBL_GLSL_WORKGROUP_SIZE_, local_size_y=1, local_size_z=1) in;
-
-// Input Descriptor
-layout(set=0, binding=0) uniform sampler2D inputImage;
-#define _NBL_GLSL_EXT_FFT_INPUT_DESCRIPTOR_DEFINED_
-
-#include <nbl/builtin/glsl/math/complex.glsl>
-nbl_glsl_complex nbl_glsl_ext_FFT_getPaddedData(in ivec3 coordinate, in uint channel) 
-{
-	ivec2 inputImageSize = textureSize(inputImage, 0);
-	vec2 normalizedCoords = (vec2(coordinate.xy)+vec2(0.5f))/(vec2(inputImageSize)*KERNEL_SCALE);
-	vec4 texelValue = textureLod(inputImage, normalizedCoords+vec2(0.5-0.5/KERNEL_SCALE), -log2(KERNEL_SCALE));
-	return nbl_glsl_complex(texelValue[channel], 0.0f);
-}
-#define _NBL_GLSL_EXT_FFT_GET_PADDED_DATA_DEFINED_
-
-
-/* TODO: Hardcode the parameters for the frequent FFTs
-#if _NBL_GLSL_EXT_FFT_MAX_DIM_SIZE_>512
-uvec3 nbl_glsl_ext_FFT_Parameters_t_getDimensions()
-{
-	return uvec3(1280u,720u,1u);
-}
-bool nbl_glsl_ext_FFT_Parameters_t_getIsInverse()
-{
-	return false;
-}
-uint nbl_glsl_ext_FFT_Parameters_t_getDirection()
-{
-	return 1u;
-}
-uint nbl_glsl_ext_FFT_Parameters_t_getMaxChannel()
-{
-    return 2u;
-}
-uint nbl_glsl_ext_FFT_Parameters_t_getLog2FFTSize()
-{
-    return 10u;
-}
-uint nbl_glsl_ext_FFT_Parameters_t_getPaddingType()
-{
-    return 3u; // _NBL_GLSL_EXT_FFT_PAD_MIRROR_;
-}
-uvec4 nbl_glsl_ext_FFT_Parameters_t_getInputStrides()
-{
-	return uvec4(0xdeadbeefu);
-}
-uvec4 nbl_glsl_ext_FFT_Parameters_t_getOutputStrides()
-{
-	return uvec4(1024u,1u,0u,1024u*1280u);
-}
-#define _NBL_GLSL_EXT_FFT_PARAMETERS_METHODS_DECLARED_
-#endif
-*/
-
-#include "nbl/builtin/glsl/ext/FFT/default_compute_fft.comp"
\ No newline at end of file
diff --git a/old_to_refactor/49_ComputeFFT/last_fft.comp b/old_to_refactor/49_ComputeFFT/last_fft.comp
deleted file mode 100644
index 2183ef63c..000000000
--- a/old_to_refactor/49_ComputeFFT/last_fft.comp
+++ /dev/null
@@ -1,72 +0,0 @@
-layout(local_size_x=_NBL_GLSL_WORKGROUP_SIZE_, local_size_y=1, local_size_z=1) in;
- 
-// Output Descriptor
-layout(set=0, binding=1, rgba16f) uniform image2D outImage;
-#define _NBL_GLSL_EXT_FFT_OUTPUT_DESCRIPTOR_DEFINED_
-
-/* TODO: Hardcode the parameters for the frequent FFTs
-uvec3 nbl_glsl_ext_FFT_Parameters_t_getDimensions()
-{
-	return uvec3(1280u,1024u,1u);
-}
-bool nbl_glsl_ext_FFT_Parameters_t_getIsInverse()
-{
-	return true;
-}
-uint nbl_glsl_ext_FFT_Parameters_t_getDirection()
-{
-	return 1u;
-}
-uint nbl_glsl_ext_FFT_Parameters_t_getMaxChannel()
-{
-    return 2u;
-}
-uint nbl_glsl_ext_FFT_Parameters_t_getLog2FFTSize()
-{
-    return 10u;
-}
-uint nbl_glsl_ext_FFT_Parameters_t_getPaddingType()
-{
-    return 3u; // _NBL_GLSL_EXT_FFT_PAD_MIRROR_;
-}
-uvec4 nbl_glsl_ext_FFT_Parameters_t_getInputStrides()
-{
-	return uvec4(1u,1280u,0u,1280u*1024u);
-}
-uvec4 nbl_glsl_ext_FFT_Parameters_t_getOutputStrides()
-{
-	return uvec4(0xdeadbeefu);
-}
-#define _NBL_GLSL_EXT_FFT_PARAMETERS_METHODS_DECLARED_
-*/
-
-#include "extra_parameters.glsl"
-layout(push_constant) uniform PushConstants
-{
-	image_store_parameters_t params;
-} pc;
-#define _NBL_GLSL_EXT_FFT_PUSH_CONSTANTS_DEFINED_
-
-nbl_glsl_ext_FFT_Parameters_t nbl_glsl_ext_FFT_getParameters()
-{
-	return pc.params.fft;
-}
-#define _NBL_GLSL_EXT_FFT_GET_PARAMETERS_DEFINED_
-
-
-#include <nbl/builtin/glsl/math/complex.glsl>
-void nbl_glsl_ext_FFT_setData(in uvec3 coordinate, in uint channel, in nbl_glsl_complex complex_value)
-{
-	const ivec2 coords = ivec2(coordinate.xy)-pc.params.unpad_offset;
-
-    if (all(lessThanEqual(ivec2(0),coords)) && all(greaterThan(imageSize(outImage),coords)))
-    {
-        vec4 color_value = imageLoad(outImage, coords);
-        color_value[channel] = complex_value.x;
-        imageStore(outImage, coords, color_value);
-    }
-}
-#define _NBL_GLSL_EXT_FFT_SET_DATA_DEFINED_
-
-
-#include "nbl/builtin/glsl/ext/FFT/default_compute_fft.comp"
\ No newline at end of file
diff --git a/old_to_refactor/49_ComputeFFT/main.cpp b/old_to_refactor/49_ComputeFFT/main.cpp
deleted file mode 100644
index ba2b7e33e..000000000
--- a/old_to_refactor/49_ComputeFFT/main.cpp
+++ /dev/null
@@ -1,753 +0,0 @@
-// Copyright (C) 2018-2020 - DevSH Graphics Programming Sp. z O.O.
-// This file is part of the "Nabla Engine".
-// For conditions of distribution and use, see copyright notice in nabla.h
-
-#define _NBL_STATIC_LIB_
-#include <nabla.h>
-#include <iostream>
-#include <cstdio>
-
-#include "nbl/ext/FFT/FFT.h"
-#include "../common/QToQuitEventReceiver.h"
-
-using namespace nbl;
-using namespace nbl::core;
-using namespace nbl::asset;
-using namespace nbl::video;
-
-using FFTClass = ext::FFT::FFT;
-
-constexpr uint32_t channelCountOverride = 3u;
-
-inline core::smart_refctd_ptr<video::IGPUSpecializedShader> createShader(
-	video::IVideoDriver* driver,
-	const uint32_t maxFFTlen,
-	const bool useHalfStorage,
-	const char* includeMainName,
-	float kernelScale = 1.f)
-{
-	const char* sourceFmt =
-R"===(#version 430 core
-
-#define _NBL_GLSL_WORKGROUP_SIZE_ %u
-#define _NBL_GLSL_EXT_FFT_MAX_DIM_SIZE_ %u
-#define _NBL_GLSL_EXT_FFT_HALF_STORAGE_ %u
-
-#define KERNEL_SCALE %f
- 
-#include "%s"
-
-)===";
-
-	const size_t extraSize = 4u+8u+8u+128u;
-	
-	constexpr uint32_t DEFAULT_WORK_GROUP_SIZE = FFTClass::DEFAULT_WORK_GROUP_SIZE;
-	auto shader = core::make_smart_refctd_ptr<ICPUBuffer>(strlen(sourceFmt)+extraSize+1u);
-	snprintf(
-		reinterpret_cast<char*>(shader->getPointer()),shader->getSize(), sourceFmt,
-		DEFAULT_WORK_GROUP_SIZE,
-		maxFFTlen,
-		useHalfStorage ? 1u:0u,
-		kernelScale,
-		includeMainName
-	);
-
-	auto cpuSpecializedShader = core::make_smart_refctd_ptr<ICPUSpecializedShader>(
-		core::make_smart_refctd_ptr<ICPUShader>(std::move(shader),ICPUShader::buffer_contains_glsl),
-		ISpecializedShader::SInfo{nullptr, nullptr, "main", asset::ISpecializedShader::ESS_COMPUTE}
-	);
-	
-	auto gpuShader = driver->createShader(nbl::core::smart_refctd_ptr<const ICPUShader>(cpuSpecializedShader->getUnspecialized()));
-	
-	auto gpuSpecializedShader = driver->createSpecializedShader(gpuShader.get(), cpuSpecializedShader->getSpecializationInfo());
-
-	return gpuSpecializedShader;
-}
-
-
-
-inline void updateDescriptorSet_Convolution (
-	video::IVideoDriver * driver,
-	video::IGPUDescriptorSet * set,
-	core::smart_refctd_ptr<video::IGPUBuffer> inputBufferDescriptor,
-	core::smart_refctd_ptr<video::IGPUBuffer> outputBufferDescriptor,
-	const core::smart_refctd_ptr<video::IGPUImageView>* kernelNormalizedSpectrumImageDescriptors)
-{
-	constexpr uint32_t descCount = 3u;
-	video::IGPUDescriptorSet::SDescriptorInfo pInfos[descCount-1u+channelCountOverride];
-	video::IGPUDescriptorSet::SWriteDescriptorSet pWrites[descCount];
-
-	for (auto i = 0; i < descCount; i++)
-	{
-		pWrites[i].binding = i;
-		pWrites[i].dstSet = set;
-		pWrites[i].arrayElement = 0u;
-		pWrites[i].info = pInfos+i;
-	}
-
-	// Input Buffer 
-	pWrites[0].descriptorType = asset::EDT_STORAGE_BUFFER;
-	pWrites[0].count = 1;
-	pInfos[0].desc = inputBufferDescriptor;
-	pInfos[0].buffer.size = inputBufferDescriptor->getSize();
-	pInfos[0].buffer.offset = 0u;
-	
-	//
-	pWrites[1].descriptorType = asset::EDT_STORAGE_BUFFER;
-	pWrites[1].count = 1;
-	pInfos[1].desc = outputBufferDescriptor;
-	pInfos[1].buffer.size = outputBufferDescriptor->getSize();
-	pInfos[1].buffer.offset = 0u;
-
-	// Kernel Buffer 
-	pWrites[2].descriptorType = asset::EDT_COMBINED_IMAGE_SAMPLER;
-	pWrites[2].count = channelCountOverride;
-	for (uint32_t i=0u; i<channelCountOverride; i++)
-	{
-		auto& info = pInfos[2u+i];
-		info.desc = kernelNormalizedSpectrumImageDescriptors[i];
-		//info.image.imageLayout = ;
-		info.image.sampler = nullptr;
-	}
-
-	driver->updateDescriptorSets(descCount, pWrites, 0u, nullptr);
-}
-inline void updateDescriptorSet_LastFFT (
-	video::IVideoDriver * driver,
-	video::IGPUDescriptorSet * set,
-	core::smart_refctd_ptr<video::IGPUBuffer> inputBufferDescriptor,
-	core::smart_refctd_ptr<video::IGPUImageView> outputImageDescriptor)
-{
-	video::IGPUDescriptorSet::SDescriptorInfo pInfos[2];
-	video::IGPUDescriptorSet::SWriteDescriptorSet pWrites[2];
-
-	for (auto i = 0; i< 2; i++)
-	{
-		pWrites[i].dstSet = set;
-		pWrites[i].arrayElement = 0u;
-		pWrites[i].count = 1u;
-		pWrites[i].info = pInfos+i;
-	}
-
-	// Input Buffer 
-	pWrites[0].binding = 0;
-	pWrites[0].descriptorType = asset::EDT_STORAGE_BUFFER;
-	pWrites[0].count = 1;
-	pInfos[0].desc = inputBufferDescriptor;
-	pInfos[0].buffer.size = inputBufferDescriptor->getSize();
-	pInfos[0].buffer.offset = 0u;
-
-	// Output Buffer 
-	pWrites[1].binding = 1;
-	pWrites[1].descriptorType = asset::EDT_STORAGE_IMAGE;
-	pWrites[1].count = 1;
-	pInfos[1].desc = outputImageDescriptor;
-	pInfos[1].image.sampler = nullptr;
-	pInfos[1].image.imageLayout = static_cast<asset::E_IMAGE_LAYOUT>(0u);;
-
-	driver->updateDescriptorSets(2u, pWrites, 0u, nullptr);
-}
-
-using nbl_glsl_ext_FFT_Parameters_t = ext::FFT::FFT::Parameters_t;
-struct vec2
-{
-	float x,y;
-};
-struct ivec2
-{
-	int32_t x,y;
-};
-#include "extra_parameters.glsl"
-
-
-int main()
-{
-	nbl::SIrrlichtCreationParameters deviceParams;
-	deviceParams.Bits = 24; //may have to set to 32bit for some platforms
-	deviceParams.ZBufferBits = 24; //we'd like 32bit here
-	deviceParams.DriverType = EDT_OPENGL; //! Only Well functioning driver, software renderer left for sake of 2D image drawing
-	deviceParams.WindowSize = dimension2d<uint32_t>(1280, 720);
-	deviceParams.Fullscreen = false;
-	deviceParams.Vsync = true; //! If supported by target platform
-	deviceParams.Doublebuffer = true;
-	deviceParams.Stencilbuffer = false; //! This will not even be a choice soon
-
-	auto device = createDeviceEx(deviceParams);
-	if (!device)
-		return 1; // could not create selected driver.
-
-	QToQuitEventReceiver receiver;
-	device->setEventReceiver(&receiver);
-
-	IVideoDriver* driver = device->getVideoDriver();
-	
-	nbl::io::IFileSystem* filesystem = device->getFileSystem();
-	IAssetManager* am = device->getAssetManager();
-	// Loading SrcImage and Kernel Image from File
-
-	IAssetLoader::SAssetLoadParams lp;
-	auto srcImageBundle = am->getAsset("../../media/colorexr.exr", lp);
-	auto kerImageBundle = am->getAsset("../../media/kernels/physical_flare_256.exr", lp);
-
-	// get GPU image views
-	smart_refctd_ptr<IGPUImageView> srcImageView;
-	{
-		auto srcGpuImages = driver->getGPUObjectsFromAssets<ICPUImage>(srcImageBundle.getContents());
-
-		IGPUImageView::SCreationParams srcImgViewInfo;
-		srcImgViewInfo.flags = static_cast<IGPUImageView::E_CREATE_FLAGS>(0u);
-		srcImgViewInfo.image = srcGpuImages->operator[](0u);
-		srcImgViewInfo.viewType = IGPUImageView::ET_2D;
-		srcImgViewInfo.format = srcImgViewInfo.image->getCreationParameters().format;
-		srcImgViewInfo.subresourceRange.aspectMask = static_cast<IImage::E_ASPECT_FLAGS>(0u);
-		srcImgViewInfo.subresourceRange.baseMipLevel = 0;
-		srcImgViewInfo.subresourceRange.levelCount = 1;
-		srcImgViewInfo.subresourceRange.baseArrayLayer = 0;
-		srcImgViewInfo.subresourceRange.layerCount = 1;
-		srcImageView = driver->createImageView(std::move(srcImgViewInfo));
-	}
-	smart_refctd_ptr<IGPUImageView> kerImageView;
-	{
-		auto kerGpuImages = driver->getGPUObjectsFromAssets<ICPUImage>(kerImageBundle.getContents());
-
-		IGPUImageView::SCreationParams kerImgViewInfo;
-		kerImgViewInfo.flags = static_cast<IGPUImageView::E_CREATE_FLAGS>(0u);
-		kerImgViewInfo.image = kerGpuImages->operator[](0u);
-		kerImgViewInfo.viewType = IGPUImageView::ET_2D;
-		kerImgViewInfo.format = kerImgViewInfo.image->getCreationParameters().format;
-		kerImgViewInfo.subresourceRange.aspectMask = static_cast<IImage::E_ASPECT_FLAGS>(0u);
-		kerImgViewInfo.subresourceRange.baseMipLevel = 0;
-		kerImgViewInfo.subresourceRange.levelCount = kerImgViewInfo.image->getCreationParameters().mipLevels;
-		kerImgViewInfo.subresourceRange.baseArrayLayer = 0;
-		kerImgViewInfo.subresourceRange.layerCount = 1;
-		kerImageView = driver->createImageView(std::move(kerImgViewInfo));
-	}
-
-	// agree on formats
-	const E_FORMAT srcFormat = srcImageView->getCreationParameters().format;
-	uint32_t srcNumChannels = getFormatChannelCount(srcFormat);
-	uint32_t kerNumChannels = getFormatChannelCount(kerImageView->getCreationParameters().format);
-	//! OVERRIDE (we dont need alpha)
-	srcNumChannels = channelCountOverride;
-	kerNumChannels = channelCountOverride;
-	assert(srcNumChannels == kerNumChannels); // Just to make sure, because the other case is not handled in this example
-
-	// Create Out Image
-	smart_refctd_ptr<IGPUImage> outImg;
-	smart_refctd_ptr<IGPUImageView> outImgView;
-	{
-		auto dstImgViewInfo = srcImageView->getCreationParameters();
-
-		auto dstImgInfo = dstImgViewInfo.image->getCreationParameters();
-		outImg = driver->createDeviceLocalGPUImageOnDedMem(std::move(dstImgInfo));
-
-		dstImgViewInfo.image = outImg;
-		outImgView = driver->createImageView(IGPUImageView::SCreationParams(dstImgViewInfo));
-	}
-
-	// input pipeline
-	auto imageFirstFFTPipelineLayout = [driver]() -> auto
-	{
-		IGPUDescriptorSetLayout::SBinding bnd[] =
-		{
-			{
-				0u,
-				EDT_COMBINED_IMAGE_SAMPLER,
-				1u,
-				ISpecializedShader::ESS_COMPUTE,
-				nullptr
-			},
-			{
-				1u,
-				EDT_STORAGE_BUFFER,
-				1u,
-				ISpecializedShader::ESS_COMPUTE,
-				nullptr
-			}
-		};
-	
-		core::SRange<const asset::SPushConstantRange> pcRange = FFTClass::getDefaultPushConstantRanges();
-		core::SRange<const video::IGPUDescriptorSetLayout::SBinding> bindings = {bnd,bnd+sizeof(bnd)/sizeof(IGPUDescriptorSetLayout::SBinding)};
-
-		return driver->createPipelineLayout(
-			pcRange.begin(),pcRange.end(),
-			driver->createDescriptorSetLayout(bindings.begin(),bindings.end()),nullptr,nullptr,nullptr
-		);
-	}();
-	auto convolvePipelineLayout = [driver]() -> auto
-	{
-		IGPUSampler::SParams params =
-		{
-			{
-				ISampler::ETC_REPEAT,
-				ISampler::ETC_REPEAT,
-				ISampler::ETC_REPEAT,
-				ISampler::ETBC_FLOAT_OPAQUE_BLACK,
-				ISampler::ETF_LINEAR, // is it needed?
-				ISampler::ETF_LINEAR,
-				ISampler::ESMM_NEAREST,
-				0u,
-				0u,
-				ISampler::ECO_ALWAYS
-			}
-		};
-		auto sampler = driver->createSampler(std::move(params));
-		smart_refctd_ptr<IGPUSampler> samplers[channelCountOverride];
-		std::fill_n(samplers,channelCountOverride,sampler);
-
-		IGPUDescriptorSetLayout::SBinding bnd[] =
-		{
-			{
-				0u,
-				EDT_STORAGE_BUFFER,
-				1u,
-				ISpecializedShader::ESS_COMPUTE,
-				nullptr
-			},
-			{
-				1u,
-				EDT_STORAGE_BUFFER,
-				1u,
-				ISpecializedShader::ESS_COMPUTE,
-				nullptr
-			},
-			{
-				2u,
-				EDT_COMBINED_IMAGE_SAMPLER,
-				channelCountOverride,
-				ISpecializedShader::ESS_COMPUTE,
-				samplers
-			}
-		};
-	
-		const asset::SPushConstantRange pcRange = {ISpecializedShader::ESS_COMPUTE,0u,sizeof(convolve_parameters_t)};
-		core::SRange<const video::IGPUDescriptorSetLayout::SBinding> bindings = {bnd,bnd+sizeof(bnd)/sizeof(IGPUDescriptorSetLayout::SBinding)};
-
-		return driver->createPipelineLayout(
-			&pcRange,&pcRange+1,
-			driver->createDescriptorSetLayout(bindings.begin(),bindings.end()),nullptr,nullptr,nullptr
-		);
-	}();
-	auto lastFFTPipelineLayout = [driver]() -> auto
-	{
-		IGPUDescriptorSetLayout::SBinding bnd[] =
-		{
-			{
-				0u,
-				EDT_STORAGE_BUFFER,
-				1u,
-				ISpecializedShader::ESS_COMPUTE,
-				nullptr
-			},
-			{
-				1u,
-				EDT_STORAGE_IMAGE,
-				1u,
-				ISpecializedShader::ESS_COMPUTE,
-				nullptr
-			},
-		};
-		
-		const asset::SPushConstantRange pcRange = {ISpecializedShader::ESS_COMPUTE,0u,sizeof(image_store_parameters_t)};
-		core::SRange<const video::IGPUDescriptorSetLayout::SBinding> bindings = {bnd, bnd+sizeof(bnd)/sizeof(IGPUDescriptorSetLayout::SBinding)};;
-
-		return driver->createPipelineLayout(
-			&pcRange,&pcRange+1,
-			driver->createDescriptorSetLayout(bindings.begin(),bindings.end()),nullptr,nullptr,nullptr
-		);
-	}();
-
-	const float bloomRelativeScale = 0.25f;
-	const auto kerDim = kerImageView->getCreationParameters().image->getCreationParameters().extent;
-	const auto srcDim = srcImageView->getCreationParameters().image->getCreationParameters().extent;
-	const float bloomScale = core::min(float(srcDim.width)/float(kerDim.width),float(srcDim.height)/float(kerDim.height))*bloomRelativeScale;
-	if (bloomScale>1.f)
-		std::cout << "WARNING: Bloom Kernel will Clip and loose sharpness, increase resolution of bloom kernel!" << std::endl;
-	const auto marginSrcDim = [srcDim,kerDim,bloomScale]() -> auto
-	{
-		auto tmp = srcDim;
-		for (auto i=0u; i<3u; i++)
-		{
-			const auto coord = (&kerDim.width)[i];
-			if (coord>1u)
-				(&tmp.width)[i] += core::max(coord*bloomScale,1u)-1u;
-		}
-		return tmp;
-	}();
-	constexpr bool useHalfFloats = true;
-	// Allocate Output Buffer
-	auto fftOutputBuffer_0 = driver->createDeviceLocalGPUBufferOnDedMem(FFTClass::getOutputBufferSize(useHalfFloats,marginSrcDim,srcNumChannels));
-	auto fftOutputBuffer_1 = driver->createDeviceLocalGPUBufferOnDedMem(FFTClass::getOutputBufferSize(useHalfFloats,marginSrcDim,srcNumChannels));
-	core::smart_refctd_ptr<IGPUImageView> kernelNormalizedSpectrums[channelCountOverride];
-
-	auto updateDescriptorSet = [driver](video::IGPUDescriptorSet* set, core::smart_refctd_ptr<IGPUImageView> inputImageDescriptor, asset::ISampler::E_TEXTURE_CLAMP textureWrap, core::smart_refctd_ptr<IGPUBuffer> outputBufferDescriptor) -> void
-	{
-		IGPUSampler::SParams params =
-		{
-			{
-				textureWrap,
-				textureWrap,
-				textureWrap,
-				ISampler::ETBC_FLOAT_OPAQUE_BLACK,
-				ISampler::ETF_LINEAR,
-				ISampler::ETF_LINEAR,
-				ISampler::ESMM_LINEAR,
-				8u,
-				0u,
-				ISampler::ECO_ALWAYS
-			}
-		};
-		auto sampler = driver->createSampler(std::move(params));
-		
-		constexpr auto kDescriptorCount = 2u;
-		video::IGPUDescriptorSet::SDescriptorInfo pInfos[kDescriptorCount];
-		video::IGPUDescriptorSet::SWriteDescriptorSet pWrites[kDescriptorCount];
-
-		for (auto i=0; i<kDescriptorCount; i++)
-		{
-			pWrites[i].dstSet = set;
-			pWrites[i].arrayElement = 0u;
-			pWrites[i].count = 1u;
-			pWrites[i].info = pInfos+i;
-		}
-
-		// Input Buffer 
-		pWrites[0].binding = 0;
-		pWrites[0].descriptorType = asset::EDT_COMBINED_IMAGE_SAMPLER;
-		pWrites[0].count = 1;
-		pInfos[0].desc = inputImageDescriptor;
-		pInfos[0].image.sampler = sampler;
-		pInfos[0].image.imageLayout = static_cast<asset::E_IMAGE_LAYOUT>(0u);
-
-		// Output Buffer 
-		pWrites[1].binding = 1;
-		pWrites[1].descriptorType = asset::EDT_STORAGE_BUFFER;
-		pWrites[1].count = 1;
-		pInfos[1].desc = outputBufferDescriptor;
-		pInfos[1].buffer.size = outputBufferDescriptor->getSize();
-		pInfos[1].buffer.offset = 0u;
-
-		driver->updateDescriptorSets(2u, pWrites, 0u, nullptr);
-	};
-
-	// Precompute Kernel FFT
-	{
-		const VkExtent3D paddedKerDim = FFTClass::padDimensions(kerDim);
-
-		// create kernel spectrums
-		auto createKernelSpectrum = [&]() -> auto
-		{
-			video::IGPUImage::SCreationParams imageParams;
-			imageParams.flags = static_cast<asset::IImage::E_CREATE_FLAGS>(0u);
-			imageParams.type = asset::IImage::ET_2D;
-			imageParams.format = useHalfFloats ? EF_R16G16_SFLOAT:EF_R32G32_SFLOAT;
-			imageParams.extent = { paddedKerDim.width,paddedKerDim.height,1u};
-			imageParams.mipLevels = 1u;
-			imageParams.arrayLayers = 1u;
-			imageParams.samples = asset::IImage::ESCF_1_BIT;
-
-			video::IGPUImageView::SCreationParams viewParams;
-			viewParams.flags = static_cast<video::IGPUImageView::E_CREATE_FLAGS>(0u);
-			viewParams.image = driver->createGPUImageOnDedMem(std::move(imageParams),driver->getDeviceLocalGPUMemoryReqs());
-			viewParams.viewType = video::IGPUImageView::ET_2D;
-			viewParams.format = useHalfFloats ? EF_R16G16_SFLOAT:EF_R32G32_SFLOAT;
-			viewParams.components = {};
-			viewParams.subresourceRange = {};
-			viewParams.subresourceRange.levelCount = 1u;
-			viewParams.subresourceRange.layerCount = 1u;
-			return driver->createImageView(std::move(viewParams));
-		};
-		for (uint32_t i=0u; i<channelCountOverride; i++)
-			kernelNormalizedSpectrums[i] = createKernelSpectrum();
-		
-
-		FFTClass::Parameters_t fftPushConstants[2];
-		FFTClass::DispatchInfo_t fftDispatchInfo[2];
-		const ISampler::E_TEXTURE_CLAMP fftPadding[2] = {ISampler::ETC_CLAMP_TO_BORDER,ISampler::ETC_CLAMP_TO_BORDER};
-		const auto passes = FFTClass::buildParameters(false,srcNumChannels,kerDim,fftPushConstants,fftDispatchInfo,fftPadding);
-		assert(passes==2u);
-		// last FFT pipeline
-		core::smart_refctd_ptr<IGPUComputePipeline> fftPipeline_SSBOInput(core::make_smart_refctd_ptr<FFTClass>(driver,0x1u<<fftPushConstants[1].getLog2FFTSize(),useHalfFloats)->getDefaultPipeline());
-
-		// descriptor sets
-		core::smart_refctd_ptr<IGPUDescriptorSet> fftDescriptorSet_Ker_FFT[2] =
-		{
-			driver->createDescriptorSet(core::smart_refctd_ptr<const IGPUDescriptorSetLayout>(imageFirstFFTPipelineLayout->getDescriptorSetLayout(0u))),
-			driver->createDescriptorSet(core::smart_refctd_ptr<const IGPUDescriptorSetLayout>(fftPipeline_SSBOInput->getLayout()->getDescriptorSetLayout(0u)))
-		};
-		updateDescriptorSet(fftDescriptorSet_Ker_FFT[0].get(), kerImageView, ISampler::ETC_CLAMP_TO_BORDER, fftOutputBuffer_0);
-		FFTClass::updateDescriptorSet(driver,fftDescriptorSet_Ker_FFT[1].get(), fftOutputBuffer_0, fftOutputBuffer_1);
-		
-		// Normalization of FFT spectrum
-		struct NormalizationPushConstants
-		{
-			ext::FFT::uvec4 stride;
-			ext::FFT::uvec4 bitreverse_shift;
-		};
-		auto fftPipelineLayout_KernelNormalization = [&]() -> auto
-		{
-			IGPUDescriptorSetLayout::SBinding bnd[] =
-			{
-				{
-					0u,
-					EDT_STORAGE_BUFFER,
-					1u,
-					ISpecializedShader::ESS_COMPUTE,
-					nullptr
-				},
-				{
-					1u,
-					EDT_STORAGE_IMAGE,
-					channelCountOverride,
-					ISpecializedShader::ESS_COMPUTE,
-					nullptr
-				},
-			};
-			SPushConstantRange pc_rng;
-			pc_rng.offset = 0u;
-			pc_rng.size = sizeof(NormalizationPushConstants);
-			pc_rng.stageFlags = ISpecializedShader::ESS_COMPUTE;
-			return driver->createPipelineLayout(
-				&pc_rng,&pc_rng+1u,
-				driver->createDescriptorSetLayout(bnd,bnd+2),nullptr,nullptr,nullptr
-			);
-		}();
-		auto fftDescriptorSet_KernelNormalization = [&]() -> auto
-		{
-			auto dset = driver->createDescriptorSet(core::smart_refctd_ptr<const IGPUDescriptorSetLayout>(fftPipelineLayout_KernelNormalization->getDescriptorSetLayout(0u)));
-
-			video::IGPUDescriptorSet::SDescriptorInfo pInfos[1+channelCountOverride];
-			video::IGPUDescriptorSet::SWriteDescriptorSet pWrites[2];
-
-			for (auto i = 0; i < 2; i++)
-			{
-				pWrites[i].dstSet = dset.get();
-				pWrites[i].arrayElement = 0u;
-				pWrites[i].count = 1u;
-				pWrites[i].info = pInfos + i;
-			}
-
-			// In Buffer 
-			pWrites[0].binding = 0;
-			pWrites[0].descriptorType = asset::EDT_STORAGE_BUFFER;
-			pWrites[0].count = 1;
-			pInfos[0].desc = fftOutputBuffer_1;
-			pInfos[0].buffer.size = fftOutputBuffer_1->getSize();
-			pInfos[0].buffer.offset = 0u;
-
-			// Out Buffer 
-			pWrites[1].binding = 1;
-			pWrites[1].descriptorType = asset::EDT_STORAGE_IMAGE;
-			pWrites[1].count = channelCountOverride;
-			for (uint32_t i=0u; i<channelCountOverride; i++)
-			{
-				auto& info = pInfos[1u+i];
-				info.desc = kernelNormalizedSpectrums[i];
-				//info.image.imageLayout = ;
-				info.image.sampler = nullptr;
-			}
-
-			driver->updateDescriptorSets(2u, pWrites, 0u, nullptr);
-			return dset;
-		}();
-
-		// Ker Image First Axis FFT
-		{
-			auto fftPipeline_ImageInput = driver->createComputePipeline(nullptr,core::smart_refctd_ptr(imageFirstFFTPipelineLayout),createShader(driver,0x1u<<fftPushConstants[0].getLog2FFTSize(),useHalfFloats,"../image_first_fft.comp",bloomScale));
-			driver->bindComputePipeline(fftPipeline_ImageInput.get());
-			driver->bindDescriptorSets(EPBP_COMPUTE, imageFirstFFTPipelineLayout.get(), 0u, 1u, &fftDescriptorSet_Ker_FFT[0].get(), nullptr);
-			FFTClass::dispatchHelper(driver, imageFirstFFTPipelineLayout.get(), fftPushConstants[0], fftDispatchInfo[0]);
-		}
-
-		// Ker Image Last Axis FFT
-		driver->bindComputePipeline(fftPipeline_SSBOInput.get());
-		driver->bindDescriptorSets(EPBP_COMPUTE, fftPipeline_SSBOInput->getLayout(), 0u, 1u, &fftDescriptorSet_Ker_FFT[1].get(), nullptr);
-		FFTClass::dispatchHelper(driver, fftPipeline_SSBOInput->getLayout(), fftPushConstants[1], fftDispatchInfo[1]);
-		
-		// Ker Normalization
-		auto fftPipeline_KernelNormalization = driver->createComputePipeline(nullptr,core::smart_refctd_ptr(fftPipelineLayout_KernelNormalization),createShader(driver,0xdeadbeefu,useHalfFloats,"../normalization.comp"));
-		driver->bindComputePipeline(fftPipeline_KernelNormalization.get());
-		driver->bindDescriptorSets(EPBP_COMPUTE, fftPipelineLayout_KernelNormalization.get(), 0u, 1u, &fftDescriptorSet_KernelNormalization.get(), nullptr);
-		{
-			NormalizationPushConstants normalizationPC;
-			normalizationPC.stride = fftPushConstants[1].output_strides;
-			normalizationPC.bitreverse_shift.x = 32-core::findMSB(paddedKerDim.width);
-			normalizationPC.bitreverse_shift.y = 32-core::findMSB(paddedKerDim.height);
-			normalizationPC.bitreverse_shift.z = 0;
-			driver->pushConstants(fftPipelineLayout_KernelNormalization.get(),ICPUSpecializedShader::ESS_COMPUTE,0u,sizeof(normalizationPC),&normalizationPC);
-		}
-		{
-			const uint32_t dispatchSizeX = (paddedKerDim.width-1u)/16u+1u;
-			const uint32_t dispatchSizeY = (paddedKerDim.height-1u)/16u+1u;
-			driver->dispatch(dispatchSizeX,dispatchSizeY,kerNumChannels);
-			FFTClass::defaultBarrier();
-		}
-	}
-	
-	FFTClass::Parameters_t fftPushConstants[3];
-	FFTClass::DispatchInfo_t fftDispatchInfo[3];
-	const ISampler::E_TEXTURE_CLAMP fftPadding[2] = {ISampler::ETC_MIRROR,ISampler::ETC_MIRROR};
-	const auto passes = FFTClass::buildParameters(false,srcNumChannels,srcDim,fftPushConstants,fftDispatchInfo,fftPadding,marginSrcDim);
-	{
-		// override for less work and storage (dont need to store the extra padding of the last axis after iFFT)
-		fftPushConstants[1].output_strides.x = fftPushConstants[0].input_strides.x;
-		fftPushConstants[1].output_strides.y = fftPushConstants[0].input_strides.y;
-		fftPushConstants[1].output_strides.z = fftPushConstants[1].input_strides.z;
-		fftPushConstants[1].output_strides.w = fftPushConstants[1].input_strides.w;
-		// iFFT
-		fftPushConstants[2].input_dimensions = fftPushConstants[1].input_dimensions;
-		{
-			fftPushConstants[2].input_dimensions.w = fftPushConstants[0].input_dimensions.w^0x80000000u;
-			fftPushConstants[2].input_strides = fftPushConstants[1].output_strides;
-			fftPushConstants[2].output_strides = fftPushConstants[0].input_strides;
-		}
-		fftDispatchInfo[2] = fftDispatchInfo[0];
-	}
-	assert(passes==2);
-	// pipelines
-	auto fftPipeline_ImageInput = driver->createComputePipeline(nullptr,core::smart_refctd_ptr(imageFirstFFTPipelineLayout),createShader(driver,0x1u<<fftPushConstants[0].getLog2FFTSize(),useHalfFloats,"../image_first_fft.comp"));
-	auto convolvePipeline = driver->createComputePipeline(nullptr, std::move(convolvePipelineLayout), createShader(driver,0x1u<<fftPushConstants[1].getLog2FFTSize(),useHalfFloats, "../fft_convolve_ifft.comp"));
-	auto lastFFTPipeline = driver->createComputePipeline(nullptr, std::move(lastFFTPipelineLayout), createShader(driver,0x1u<<fftPushConstants[0].getLog2FFTSize(),useHalfFloats,"../last_fft.comp"));
-
-	// Src First Axis FFT
-	auto fftDescriptorSet_Src_FirstFFT = driver->createDescriptorSet(core::smart_refctd_ptr<const IGPUDescriptorSetLayout>(imageFirstFFTPipelineLayout->getDescriptorSetLayout(0u)));
-	updateDescriptorSet(fftDescriptorSet_Src_FirstFFT.get(), srcImageView, ISampler::ETC_MIRROR, fftOutputBuffer_0);
-
-	// Convolution
-	auto convolveDescriptorSet = driver->createDescriptorSet(core::smart_refctd_ptr<const IGPUDescriptorSetLayout>(convolvePipeline->getLayout()->getDescriptorSetLayout(0u)));
-	updateDescriptorSet_Convolution(driver, convolveDescriptorSet.get(), fftOutputBuffer_0, fftOutputBuffer_1, kernelNormalizedSpectrums);
-
-	// Last Axis IFFT
-	auto lastFFTDescriptorSet = driver->createDescriptorSet(core::smart_refctd_ptr<const IGPUDescriptorSetLayout>(lastFFTPipeline->getLayout()->getDescriptorSetLayout(0u)));
-	updateDescriptorSet_LastFFT(driver, lastFFTDescriptorSet.get(), fftOutputBuffer_1, outImgView);
-
-	uint32_t outBufferIx = 0u;
-	auto lastPresentStamp = std::chrono::high_resolution_clock::now();
-	bool savedToFile = false;
-	
-	auto downloadStagingArea = driver->getDefaultDownStreamingBuffer();
-	
-	auto blitFBO = driver->addFrameBuffer();
-	blitFBO->attach(video::EFAP_COLOR_ATTACHMENT0, std::move(outImgView));
-
-	while (device->run() && receiver.keepOpen())
-	{
-		driver->beginScene(false, false);
-
-		// Src Image First Axis FFT
-		driver->bindComputePipeline(fftPipeline_ImageInput.get());
-		driver->bindDescriptorSets(EPBP_COMPUTE, imageFirstFFTPipelineLayout.get(), 0u, 1u, &fftDescriptorSet_Src_FirstFFT.get(), nullptr);
-		FFTClass::dispatchHelper(driver, imageFirstFFTPipelineLayout.get(), fftPushConstants[0], fftDispatchInfo[0]);
-
-		// Src Image Last Axis FFT + Convolution + Convolved Last Axis IFFT Y
-		driver->bindComputePipeline(convolvePipeline.get());
-		driver->bindDescriptorSets(EPBP_COMPUTE, convolvePipeline->getLayout(), 0u, 1u, &convolveDescriptorSet.get(), nullptr);
-		{
-			const auto& kernelImgExtent = kernelNormalizedSpectrums[0]->getCreationParameters().image->getCreationParameters().extent;
-			vec2 kernel_half_pixel_size{0.5f,0.5f};
-			kernel_half_pixel_size.x /= kernelImgExtent.width;
-			kernel_half_pixel_size.y /= kernelImgExtent.height;
-			driver->pushConstants(convolvePipeline->getLayout(),ISpecializedShader::ESS_COMPUTE,offsetof(convolve_parameters_t,kernel_half_pixel_size),sizeof(convolve_parameters_t::kernel_half_pixel_size),&kernel_half_pixel_size);
-		}
-		FFTClass::dispatchHelper(driver, convolvePipeline->getLayout(), fftPushConstants[1], fftDispatchInfo[1]);
-
-		// Last FFT Padding and Copy to GPU Image
-		driver->bindComputePipeline(lastFFTPipeline.get());
-		driver->bindDescriptorSets(EPBP_COMPUTE, lastFFTPipeline->getLayout(), 0u, 1u, &lastFFTDescriptorSet.get(), nullptr);
-		{
-			const auto paddedSrcDim = FFTClass::padDimensions(marginSrcDim);
-			ivec2 unpad_offset = { 0,0 };
-			for (auto i=0u; i<2u; i++)
-			if (fftDispatchInfo[2].workGroupCount[i]==1u)
-				(&unpad_offset.x)[i] = ((&paddedSrcDim.width)[i]-(&srcDim.width)[i])>>1u;
-			driver->pushConstants(lastFFTPipeline->getLayout(),ISpecializedShader::ESS_COMPUTE,offsetof(image_store_parameters_t,unpad_offset),sizeof(image_store_parameters_t::unpad_offset),&unpad_offset);
-		}
-		FFTClass::dispatchHelper(driver, lastFFTPipeline->getLayout(), fftPushConstants[2], fftDispatchInfo[2]);
-		
-		if(!savedToFile) 
-		{
-			savedToFile = true;
-			
-			core::smart_refctd_ptr<ICPUImageView> imageView;
-			const uint32_t colorBufferBytesize = srcDim.height * srcDim.width * asset::getTexelOrBlockBytesize(srcFormat);
-
-			// create image
-			ICPUImage::SCreationParams imgParams;
-			imgParams.flags = static_cast<ICPUImage::E_CREATE_FLAGS>(0u); // no flags
-			imgParams.type = ICPUImage::ET_2D;
-			imgParams.format = srcFormat;
-			imgParams.extent = srcDim;
-			imgParams.mipLevels = 1u;
-			imgParams.arrayLayers = 1u;
-			imgParams.samples = ICPUImage::ESCF_1_BIT;
-			auto image = ICPUImage::create(std::move(imgParams));
-
-			constexpr uint64_t timeoutInNanoSeconds = 300000000000u;
-			const auto waitPoint = std::chrono::high_resolution_clock::now()+std::chrono::nanoseconds(timeoutInNanoSeconds);
-
-			uint32_t address = std::remove_pointer<decltype(downloadStagingArea)>::type::invalid_address; // remember without initializing the address to be allocated to invalid_address you won't get an allocation!
-			const uint32_t alignment = 4096u; // common page size
-			auto unallocatedSize = downloadStagingArea->multi_alloc(waitPoint, 1u, &address, &colorBufferBytesize, &alignment);
-			if (unallocatedSize)
-			{
-				os::Printer::log("Could not download the buffer from the GPU!", ELL_ERROR);
-			}
-
-			// set up regions
-			auto regions = core::make_refctd_dynamic_array<core::smart_refctd_dynamic_array<IImage::SBufferCopy> >(1u);
-			{
-				auto& region = regions->front();
-
-				region.bufferOffset = 0u;
-				region.bufferRowLength = 0u;
-				region.bufferImageHeight = 0u;
-				//region.imageSubresource.aspectMask = wait for Vulkan;
-				region.imageSubresource.mipLevel = 0u;
-				region.imageSubresource.baseArrayLayer = 0u;
-				region.imageSubresource.layerCount = 1u;
-				region.imageOffset = { 0u,0u,0u };
-				region.imageExtent = imgParams.extent;
-			}
-
-			driver->copyImageToBuffer(outImg.get(), downloadStagingArea->getBuffer(), 1, &regions->front());
-
-			auto downloadFence = driver->placeFence(true);
-
-			auto* data = reinterpret_cast<uint8_t*>(downloadStagingArea->getBufferPointer()) + address;
-			auto cpubufferalias = core::make_smart_refctd_ptr<asset::CCustomAllocatorCPUBuffer<core::null_allocator<uint8_t> > >(colorBufferBytesize, data, core::adopt_memory);
-			image->setBufferAndRegions(std::move(cpubufferalias),regions);
-			
-			// wait for download fence and then invalidate the CPU cache
-			{
-				auto result = downloadFence->waitCPU(timeoutInNanoSeconds,true);
-				if (result==E_DRIVER_FENCE_RETVAL::EDFR_TIMEOUT_EXPIRED||result==E_DRIVER_FENCE_RETVAL::EDFR_FAIL)
-				{
-					os::Printer::log("Could not download the buffer from the GPU, fence not signalled!", ELL_ERROR);
-					downloadStagingArea->multi_free(1u, &address, &colorBufferBytesize, nullptr);
-					continue;
-				}
-				if (downloadStagingArea->needsManualFlushOrInvalidate())
-					driver->invalidateMappedMemoryRanges({{downloadStagingArea->getBuffer()->getBoundMemory(),address,colorBufferBytesize}});
-			}
-
-			// create image view
-			ICPUImageView::SCreationParams imgViewParams;
-			imgViewParams.flags = static_cast<ICPUImageView::E_CREATE_FLAGS>(0u);
-			imgViewParams.format = image->getCreationParameters().format;
-			imgViewParams.image = std::move(image);
-			imgViewParams.viewType = ICPUImageView::ET_2D;
-			imgViewParams.subresourceRange = {static_cast<IImage::E_ASPECT_FLAGS>(0u),0u,1u,0u,1u};
-			imageView = ICPUImageView::create(std::move(imgViewParams));
-
-			IAssetWriter::SAssetWriteParams wp(imageView.get());
-			volatile bool success = am->writeAsset("convolved_exr.exr", wp);
-			assert(success);
-		}
-		
-		driver->blitRenderTargets(blitFBO, nullptr, false, false);
-
-		driver->endScene();
-	}
-
-	return 0;
-}
\ No newline at end of file
diff --git a/old_to_refactor/49_ComputeFFT/normalization.comp b/old_to_refactor/49_ComputeFFT/normalization.comp
deleted file mode 100644
index b3926090d..000000000
--- a/old_to_refactor/49_ComputeFFT/normalization.comp
+++ /dev/null
@@ -1,34 +0,0 @@
-layout(local_size_x=16, local_size_y=16, local_size_z=1) in;
- 
-#include <nbl/builtin/glsl/ext/FFT/types.glsl>
-
-layout(set=0, binding=0) restrict readonly buffer InBuffer
-{
-	nbl_glsl_ext_FFT_storage_t in_data[];
-};
-
-layout(set=0, binding=1, rg16f) uniform image2D NormalizedKernel[3];
-
-layout(push_constant) uniform PushConstants
-{
-	uvec4 strides;
-	uvec4 bitreverse_shift;
-} pc;
-
-#include <nbl/builtin/glsl/colorspace/encodeCIEXYZ.glsl>
-
-void main()
-{
-	nbl_glsl_complex value = nbl_glsl_ext_FFT_storage_t_get(in_data[nbl_glsl_dot(gl_GlobalInvocationID,pc.strides.xyz)]);
-	
-	// imaginary component will be 0, image shall be positive
-	vec3 avg;
-	for (uint i=0u; i<3u; i++)
-		avg[i] = nbl_glsl_ext_FFT_storage_t_get(in_data[pc.strides.z*i]).x;
-	const float power = (nbl_glsl_scRGBtoXYZ*avg).y;
-
-	const uvec2 coord = bitfieldReverse(gl_GlobalInvocationID.xy)>>pc.bitreverse_shift.xy;
-	const nbl_glsl_complex shift = nbl_glsl_expImaginary(-nbl_glsl_PI*float(coord.x+coord.y));
-	value = nbl_glsl_complex_mul(value,shift)/power;
-	imageStore(NormalizedKernel[gl_WorkGroupID.z],ivec2(coord),vec4(value,0.0,0.0));
-}
\ No newline at end of file
diff --git a/old_to_refactor/49_ComputeFFT/pipeline.groovy b/old_to_refactor/49_ComputeFFT/pipeline.groovy
deleted file mode 100644
index 64874da2a..000000000
--- a/old_to_refactor/49_ComputeFFT/pipeline.groovy
+++ /dev/null
@@ -1,50 +0,0 @@
-import org.DevshGraphicsProgramming.Agent
-import org.DevshGraphicsProgramming.BuilderInfo
-import org.DevshGraphicsProgramming.IBuilder
-
-class CComputeFFTBuilder extends IBuilder
-{
-	public CComputeFFTBuilder(Agent _agent, _info)
-	{
-		super(_agent, _info)
-	}
-	
-	@Override
-	public boolean prepare(Map axisMapping)
-	{
-		return true
-	}
-	
-	@Override
-  	public boolean build(Map axisMapping)
-	{
-		IBuilder.CONFIGURATION config = axisMapping.get("CONFIGURATION")
-		IBuilder.BUILD_TYPE buildType = axisMapping.get("BUILD_TYPE")
-		
-		def nameOfBuildDirectory = getNameOfBuildDirectory(buildType)
-		def nameOfConfig = getNameOfConfig(config)
-		
-		agent.execute("cmake --build ${info.rootProjectPath}/${nameOfBuildDirectory}/${info.targetProjectPathRelativeToRoot} --target ${info.targetBaseName} --config ${nameOfConfig} -j12 -v")
-		
-		return true
-	}
-	
-	@Override
-  	public boolean test(Map axisMapping)
-	{
-		return true
-	}
-	
-	@Override
-	public boolean install(Map axisMapping)
-	{
-		return true
-	}
-}
-
-def create(Agent _agent, _info)
-{
-	return new CComputeFFTBuilder(_agent, _info)
-}
-
-return this
\ No newline at end of file
diff --git a/60_ClusteredRendering/CMakeLists.txt b/old_to_refactor/60_ClusteredRendering/CMakeLists.txt
similarity index 100%
rename from 60_ClusteredRendering/CMakeLists.txt
rename to old_to_refactor/60_ClusteredRendering/CMakeLists.txt
diff --git a/60_ClusteredRendering/config.json.template b/old_to_refactor/60_ClusteredRendering/config.json.template
similarity index 100%
rename from 60_ClusteredRendering/config.json.template
rename to old_to_refactor/60_ClusteredRendering/config.json.template
diff --git a/60_ClusteredRendering/main.cpp b/old_to_refactor/60_ClusteredRendering/main.cpp
similarity index 100%
rename from 60_ClusteredRendering/main.cpp
rename to old_to_refactor/60_ClusteredRendering/main.cpp
diff --git a/60_ClusteredRendering/pipeline.groovy b/old_to_refactor/60_ClusteredRendering/pipeline.groovy
similarity index 100%
rename from 60_ClusteredRendering/pipeline.groovy
rename to old_to_refactor/60_ClusteredRendering/pipeline.groovy

From 24f952d2c5baa9bd890335591654b4c5e8b7669a Mon Sep 17 00:00:00 2001
From: devsh <devsh@devsh.eu>
Date: Sat, 19 Apr 2025 13:10:10 +0200
Subject: [PATCH 169/529] count path depth properly, bug from unremoved testing
 code caused MIS=off at any bounce > 2

added a wishlist of TODOs
---
 .../app_resources/common.glsl                    | 16 +++++++++-------
 30_ComputeShaderPathTracer/main.cpp              |  5 +++--
 2 files changed, 12 insertions(+), 9 deletions(-)

diff --git a/30_ComputeShaderPathTracer/app_resources/common.glsl b/30_ComputeShaderPathTracer/app_resources/common.glsl
index 2463f82cf..aaadae4a8 100644
--- a/30_ComputeShaderPathTracer/app_resources/common.glsl
+++ b/30_ComputeShaderPathTracer/app_resources/common.glsl
@@ -352,9 +352,9 @@ struct Payload_t
     vec3 accumulation;
     float otherTechniqueHeuristic;
     vec3 throughput;
-    #ifdef KILL_DIFFUSE_SPECULAR_PATHS
+#ifdef KILL_DIFFUSE_SPECULAR_PATHS
     bool hasDiffuse;
-    #endif
+#endif
 };
 
 struct Ray_t
@@ -491,6 +491,7 @@ layout (constant_id = 1) const int MAX_SAMPLES_LOG2 = 10;
 
 #include <nbl/builtin/glsl/random/xoroshiro.glsl>
 
+// TODO: use PCG hash + XOROSHIRO and don't read any textures
 mat2x3 rand3d(in uint protoDimension, in uint _sample, inout nbl_glsl_xoroshiro64star_state_t scramble_state)
 {
     mat2x3 retval;
@@ -552,6 +553,7 @@ nbl_glsl_LightSample nbl_glsl_light_generate_and_remainder_and_pdf(out vec3 rema
 }
 
 uint getBSDFLightIDAndDetermineNormal(out vec3 normal, in uint objectID, in vec3 intersection);
+// returns whether to stop tracing
 bool closestHitProgram(in uint depth, in uint _sample, inout Ray_t ray, inout nbl_glsl_xoroshiro64star_state_t scramble_state)
 {
     const MutableRay_t _mutable = ray._mutable;
@@ -602,7 +604,7 @@ bool closestHitProgram(in uint depth, in uint _sample, inout Ray_t ray, inout nb
 
         const bool isBSDF = BSDFNode_isBSDF(bsdf);
         //rand
-        mat2x3 epsilon = rand3d(depth,_sample,scramble_state);
+        mat2x3 epsilon = rand3d(depth*2,_sample,scramble_state);
 
         // thresholds
         const float bsdfPdfThreshold = 0.0001;
@@ -613,7 +615,7 @@ bool closestHitProgram(in uint depth, in uint _sample, inout Ray_t ray, inout nb
         // do NEE
         const float neeProbability = 1.0;// BSDFNode_getNEEProb(bsdf);
         float rcpChoiceProb;
-        if (!nbl_glsl_partitionRandVariable(neeProbability,epsilon[0].z,rcpChoiceProb) && depth<2u)
+        if (!nbl_glsl_partitionRandVariable(neeProbability,epsilon[0].z,rcpChoiceProb))
         {
             vec3 neeContrib; float lightPdf, t;
             nbl_glsl_LightSample nee_sample = nbl_glsl_light_generate_and_remainder_and_pdf(
@@ -748,15 +750,15 @@ void main()
             ray._payload.accumulation = vec3(0.0);
             ray._payload.otherTechniqueHeuristic = 0.0; // needed for direct eye-light paths
             ray._payload.throughput = vec3(1.0);
-            #ifdef KILL_DIFFUSE_SPECULAR_PATHS
+#ifdef KILL_DIFFUSE_SPECULAR_PATHS
             ray._payload.hasDiffuse = false;
-            #endif
+#endif
         }
 
         // bounces
         {
             bool hit = true; bool rayAlive = true;
-            for (int d=1; d<=PTPushConstant.depth && hit && rayAlive; d+=2)
+            for (int d=1; d<=PTPushConstant.depth && hit && rayAlive; d++)
             {
                 ray._mutable.intersectionT = nbl_glsl_FLT_MAX;
                 ray._mutable.objectID = traceRay(ray._mutable.intersectionT,ray._immutable.origin,ray._immutable.direction);
diff --git a/30_ComputeShaderPathTracer/main.cpp b/30_ComputeShaderPathTracer/main.cpp
index 26d673002..ed93cf81f 100644
--- a/30_ComputeShaderPathTracer/main.cpp
+++ b/30_ComputeShaderPathTracer/main.cpp
@@ -15,13 +15,14 @@ using namespace asset;
 using namespace ui;
 using namespace video;
 
+// TODO: share push constants
 struct PTPushConstant {
 	matrix4SIMD invMVP;
 	int sampleCount;
 	int depth;
 };
 
-// TODO: Add a QueryPool for timestamping once its ready
+// TODO: Add a QueryPool for timestamping once its ready (actually add IMGUI mspf plotter)
 // TODO: Do buffer creation using assConv
 class ComputeShaderPathtracer final : public examples::SimpleWindowedApplication, public application_templates::MonoAssetManagerAndBuiltinResourceApplication
 {
@@ -859,7 +860,7 @@ class ComputeShaderPathtracer final : public examples::SimpleWindowedApplication
 					ImGui::SliderFloat("zFar", &zFar, 110.f, 10000.f);
 					ImGui::ListBox("Shader", &PTPipline, shaderNames, E_LIGHT_GEOMETRY::ELG_COUNT);
 					ImGui::SliderInt("SPP", &spp, 1, MaxBufferSamples);
-					ImGui::SliderInt("Depth", &depth, 1, MaxBufferDimensions / 3);
+					ImGui::SliderInt("Depth", &depth, 1, MaxBufferDimensions / 6);
 
 					ImGui::Text("X: %f Y: %f", io.MousePos.x, io.MousePos.y);
 

From 3253b6f7c43042562158acde5924a7fdbef8f7cc Mon Sep 17 00:00:00 2001
From: devsh <devsh@devsh.eu>
Date: Sat, 19 Apr 2025 13:39:06 +0200
Subject: [PATCH 170/529] fix NEE_ONLY setting

---
 .../app_resources/common.glsl                 | 55 +++++++++++--------
 1 file changed, 32 insertions(+), 23 deletions(-)

diff --git a/30_ComputeShaderPathTracer/app_resources/common.glsl b/30_ComputeShaderPathTracer/app_resources/common.glsl
index aaadae4a8..65ed0609e 100644
--- a/30_ComputeShaderPathTracer/app_resources/common.glsl
+++ b/30_ComputeShaderPathTracer/app_resources/common.glsl
@@ -596,7 +596,7 @@ bool closestHitProgram(in uint depth, in uint _sample, inout Ray_t ray, inout nb
         if (BSDFNode_isNotDiffuse(bsdf))
         {
             if (ray._payload.hasDiffuse)
-                return true;
+                return false;
         }
         else
             ray._payload.hasDiffuse = true;
@@ -613,47 +613,55 @@ bool closestHitProgram(in uint depth, in uint _sample, inout Ray_t ray, inout nb
         const float monochromeEta = dot(throughputCIE_Y,BSDFNode_getEta(bsdf)[0])/(throughputCIE_Y.r+throughputCIE_Y.g+throughputCIE_Y.b);
 
         // do NEE
-        const float neeProbability = 1.0;// BSDFNode_getNEEProb(bsdf);
+#ifndef NEE_ONLY
+        // to turn off NEE, set this to 0
+        const float neeProbability = BSDFNode_getNEEProb(bsdf);
         float rcpChoiceProb;
         if (!nbl_glsl_partitionRandVariable(neeProbability,epsilon[0].z,rcpChoiceProb))
         {
+#endif
             vec3 neeContrib; float lightPdf, t;
             nbl_glsl_LightSample nee_sample = nbl_glsl_light_generate_and_remainder_and_pdf(
                 neeContrib, lightPdf, t,
                 intersection, interaction,
                 isBSDF, epsilon[0], depth
             );
-            // We don't allow non watertight transmitters in this renderer
+            // We don't allow non watertight transmitters in this renderer & scene, one cannot reach a light from the backface (optimization)
             bool validPath = nee_sample.NdotL>nbl_glsl_FLT_MIN;
             // but if we allowed non-watertight transmitters (single water surface), it would make sense just to apply this line by itself
             nbl_glsl_AnisotropicMicrofacetCache _cache;
             validPath = validPath && nbl_glsl_calcAnisotropicMicrofacetCache(_cache, interaction, nee_sample, monochromeEta);
+            // infinite PDF would mean a point light or a thin line, but our lights have finite radiance per steradian (area lights)
             if (lightPdf<nbl_glsl_FLT_MAX)
             {
-            if (any(isnan(nee_sample.L)))
-                ray._payload.accumulation += vec3(1000.f,0.f,0.f);
-            else
-            if (all(equal(vec3(69.f),nee_sample.L)))
-                ray._payload.accumulation += vec3(0.f,1000.f,0.f);
-            else
-            if (validPath)
-            {
-                float bsdfPdf;
-                neeContrib *= nbl_glsl_bsdf_cos_remainder_and_pdf(bsdfPdf,nee_sample,interaction,bsdf,monochromeEta,_cache)*throughput;
-                const float otherGenOverChoice = bsdfPdf*rcpChoiceProb;
+                // debug coloring
+                if (any(isnan(nee_sample.L)))
+                    ray._payload.accumulation += vec3(1000.f,0.f,0.f);
+                else
+                if (all(equal(vec3(69.f),nee_sample.L)))
+                    ray._payload.accumulation += vec3(0.f,1000.f,0.f);
+                else
+                if (validPath) // normally one would check for a valid path first, because zero solid angle light is less likely
+                {
+                    float bsdfPdf;
+                    // this is kinda the wrong fuction to use, we should use eval_and_pdf instead (because eval returns 0 for directions accidentally coincident with dirac delta)
+                    neeContrib *= nbl_glsl_bsdf_cos_remainder_and_pdf(bsdfPdf,nee_sample,interaction,bsdf,monochromeEta,_cache)*throughput;
+                    // this is why we need to multiply `bsdfPdf` back in, and why we have a check for the BxDF PDF not being INF
 #ifndef NEE_ONLY
-                const float otherGenOverLightAndChoice = otherGenOverChoice/lightPdf;
-                neeContrib *= otherGenOverChoice/(1.f+otherGenOverLightAndChoice*otherGenOverLightAndChoice); // MIS weight
+                    const float otherGenOverChoice = bsdfPdf*rcpChoiceProb;
+                    const float otherGenOverLightAndChoice = otherGenOverChoice/lightPdf;
+                    // MIS weight (TODO: is it correct? should `otherGenOverLightAndChoice` contain the `rcpChoiceProb` ?)
+                    neeContrib *= otherGenOverChoice/(1.f+otherGenOverLightAndChoice*otherGenOverLightAndChoice);
 #else
-                neeContrib *= otherGenOverChoice;
+                    neeContrib *= bsdfPdf;
 #endif
-                if (bsdfPdf<nbl_glsl_FLT_MAX && getLuma(neeContrib)>lumaContributionThreshold && traceRay(t,intersection+nee_sample.L*t*getStartTolerance(depth),nee_sample.L)==-1)
-                    ray._payload.accumulation += neeContrib;
-            }}
+                    if (bsdfPdf<nbl_glsl_FLT_MAX && getLuma(neeContrib)>lumaContributionThreshold && traceRay(t,intersection+nee_sample.L*t*getStartTolerance(depth),nee_sample.L)==-1)
+                        ray._payload.accumulation += neeContrib;
+                }
+            }
+#ifndef NEE_ONLY
         }
-#if NEE_ONLY
-        return false;
-#endif
+
         // sample BSDF
         float bsdfPdf; vec3 bsdfSampleL;
         {
@@ -682,6 +690,7 @@ bool closestHitProgram(in uint depth, in uint _sample, inout Ray_t ray, inout nb
             #endif
             return true;
         }
+#endif
     }
     return false;
 }

From c699bd086acd886745943cf478e9bf1160d36212 Mon Sep 17 00:00:00 2001
From: devsh <devsh@devsh.eu>
Date: Sat, 19 Apr 2025 13:45:29 +0200
Subject: [PATCH 171/529] Remove legacy deprecated/reimplemented examples

---
 42_FragmentShaderPathTracer/CMakeLists.txt    |    7 -
 42_FragmentShaderPathTracer/common.glsl       |  812 ------------
 .../config.json.template                      |   28 -
 .../litByRectangle.comp                       |  182 ---
 42_FragmentShaderPathTracer/litBySphere.comp  |   60 -
 .../litByTriangle.comp                        |  105 --
 42_FragmentShaderPathTracer/main.cpp          |  693 ----------
 42_FragmentShaderPathTracer/pipeline.groovy   |   50 -
 53_ComputeShaders/CMakeLists.txt              |    6 -
 53_ComputeShaders/computeShader.comp          |   95 --
 53_ComputeShaders/config.json.template        |   28 -
 53_ComputeShaders/fragmentShader.frag         |   12 -
 53_ComputeShaders/geometryShader.geom         |   27 -
 53_ComputeShaders/main.cpp                    |  694 ----------
 53_ComputeShaders/pipeline.groovy             |   50 -
 53_ComputeShaders/shaderCommon.glsl           |    6 -
 53_ComputeShaders/vertexShader.vert           |   23 -
 56_RayQuery/CMakeLists.txt                    |    7 -
 56_RayQuery/common.glsl                       |  793 -----------
 56_RayQuery/config.json.template              |   28 -
 56_RayQuery/litByRectangle.comp               |  106 --
 56_RayQuery/litBySphere.comp                  |   61 -
 56_RayQuery/litByTriangle.comp                |  105 --
 56_RayQuery/main.cpp                          | 1156 -----------------
 56_RayQuery/pipeline.groovy                   |   50 -
 CMakeLists.txt                                |    1 -
 26 files changed, 5185 deletions(-)
 delete mode 100644 42_FragmentShaderPathTracer/CMakeLists.txt
 delete mode 100644 42_FragmentShaderPathTracer/common.glsl
 delete mode 100644 42_FragmentShaderPathTracer/config.json.template
 delete mode 100644 42_FragmentShaderPathTracer/litByRectangle.comp
 delete mode 100644 42_FragmentShaderPathTracer/litBySphere.comp
 delete mode 100644 42_FragmentShaderPathTracer/litByTriangle.comp
 delete mode 100644 42_FragmentShaderPathTracer/main.cpp
 delete mode 100644 42_FragmentShaderPathTracer/pipeline.groovy
 delete mode 100644 53_ComputeShaders/CMakeLists.txt
 delete mode 100644 53_ComputeShaders/computeShader.comp
 delete mode 100644 53_ComputeShaders/config.json.template
 delete mode 100644 53_ComputeShaders/fragmentShader.frag
 delete mode 100644 53_ComputeShaders/geometryShader.geom
 delete mode 100644 53_ComputeShaders/main.cpp
 delete mode 100644 53_ComputeShaders/pipeline.groovy
 delete mode 100644 53_ComputeShaders/shaderCommon.glsl
 delete mode 100644 53_ComputeShaders/vertexShader.vert
 delete mode 100644 56_RayQuery/CMakeLists.txt
 delete mode 100644 56_RayQuery/common.glsl
 delete mode 100644 56_RayQuery/config.json.template
 delete mode 100644 56_RayQuery/litByRectangle.comp
 delete mode 100644 56_RayQuery/litBySphere.comp
 delete mode 100644 56_RayQuery/litByTriangle.comp
 delete mode 100644 56_RayQuery/main.cpp
 delete mode 100644 56_RayQuery/pipeline.groovy

diff --git a/42_FragmentShaderPathTracer/CMakeLists.txt b/42_FragmentShaderPathTracer/CMakeLists.txt
deleted file mode 100644
index a476b6203..000000000
--- a/42_FragmentShaderPathTracer/CMakeLists.txt
+++ /dev/null
@@ -1,7 +0,0 @@
-
-include(common RESULT_VARIABLE RES)
-if(NOT RES)
-	message(FATAL_ERROR "common.cmake not found. Should be in {repo_root}/cmake directory")
-endif()
-
-nbl_create_executable_project("" "" "" "" "${NBL_EXECUTABLE_PROJECT_CREATION_PCH_TARGET}")
\ No newline at end of file
diff --git a/42_FragmentShaderPathTracer/common.glsl b/42_FragmentShaderPathTracer/common.glsl
deleted file mode 100644
index 20f7a7359..000000000
--- a/42_FragmentShaderPathTracer/common.glsl
+++ /dev/null
@@ -1,812 +0,0 @@
-// Copyright (C) 2018-2020 - DevSH Graphics Programming Sp. z O.O.
-// This file is part of the "Nabla Engine".
-// For conditions of distribution and use, see copyright notice in nabla.h
-
-// basic settings
-#define MAX_DEPTH 3
-#define SAMPLES 128
-
-// firefly and variance reduction techniques
-//#define KILL_DIFFUSE_SPECULAR_PATHS
-//#define VISUALIZE_HIGH_VARIANCE
-
-layout(set = 2, binding = 0) uniform sampler2D envMap; 
-layout(set = 2, binding = 1) uniform usamplerBuffer sampleSequence;
-layout(set = 2, binding = 2) uniform usampler2D scramblebuf;
-
-layout(set=0, binding=0, rgba16f) uniform image2D outImage;
-
-#ifndef _NBL_GLSL_WORKGROUP_SIZE_
-#define _NBL_GLSL_WORKGROUP_SIZE_ 16
-layout(local_size_x=_NBL_GLSL_WORKGROUP_SIZE_, local_size_y=_NBL_GLSL_WORKGROUP_SIZE_, local_size_z=1) in;
-#endif
-
-ivec2 getCoordinates() {
-    return ivec2(gl_GlobalInvocationID.xy);
-}
-
-vec2 getTexCoords() {
-    ivec2 imageSize = imageSize(outImage);
-    ivec2 iCoords = getCoordinates();
-    return vec2(float(iCoords.x) / imageSize.x, 1.0 - float(iCoords.y) / imageSize.y);
-}
-
-
-#include <nbl/builtin/glsl/limits/numeric.glsl>
-#include <nbl/builtin/glsl/math/constants.glsl>
-#include <nbl/builtin/glsl/utils/common.glsl>
-#include <nbl/builtin/glsl/utils/surface_transform.glsl>
-
-#include <nbl/builtin/glsl/sampling/box_muller_transform.glsl>
-
-layout(set = 1, binding = 0, row_major, std140) uniform UBO
-{
-	nbl_glsl_SBasicViewParameters params;
-} cameraData;
-
-
-#define INVALID_ID_16BIT 0xffffu
-struct Sphere
-{
-    vec3 position;
-    float radius2;
-    uint bsdfLightIDs;
-}; 
-
-Sphere Sphere_Sphere(in vec3 position, in float radius, in uint bsdfID, in uint lightID)
-{
-    Sphere sphere;
-    sphere.position = position;
-    sphere.radius2 = radius*radius;
-    sphere.bsdfLightIDs = bitfieldInsert(bsdfID,lightID,16,16);
-    return sphere;
-}
-
-// return intersection distance if found, nbl_glsl_FLT_NAN otherwise
-float Sphere_intersect(in Sphere sphere, in vec3 origin, in vec3 direction)
-{
-    vec3 relOrigin = origin-sphere.position;
-    float relOriginLen2 = dot(relOrigin,relOrigin);
-    const float radius2 = sphere.radius2;
-
-    float dirDotRelOrigin = dot(direction,relOrigin);
-    float det = radius2-relOriginLen2+dirDotRelOrigin*dirDotRelOrigin;
-
-    // do some speculative math here
-    float detsqrt = sqrt(det);
-    return -dirDotRelOrigin+(relOriginLen2>radius2 ? (-detsqrt):detsqrt);
-}
-
-vec3 Sphere_getNormal(in Sphere sphere, in vec3 position)
-{
-    const float radiusRcp = inversesqrt(sphere.radius2);
-    return (position-sphere.position)*radiusRcp;
-}
-
-float Sphere_getSolidAngle_impl(in float cosThetaMax)
-{
-    return 2.0*nbl_glsl_PI*(1.0-cosThetaMax);
-}
-float Sphere_getSolidAngle(in Sphere sphere, in vec3 origin)
-{
-    float cosThetaMax = sqrt(1.0-sphere.radius2/nbl_glsl_lengthSq(sphere.position-origin));
-    return Sphere_getSolidAngle_impl(cosThetaMax);
-}
-
-
-Sphere spheres[SPHERE_COUNT] = {
-    Sphere_Sphere(vec3(0.0,-100.5,-1.0),100.0,0u,INVALID_ID_16BIT),
-    Sphere_Sphere(vec3(2.0,0.0,-1.0),0.5,1u,INVALID_ID_16BIT),
-    Sphere_Sphere(vec3(0.0,0.0,-1.0),0.5,2u,INVALID_ID_16BIT),
-    Sphere_Sphere(vec3(-2.0,0.0,-1.0),0.5,3u,INVALID_ID_16BIT),
-    Sphere_Sphere(vec3(2.0,0.0,1.0),0.5,4u,INVALID_ID_16BIT),
-    Sphere_Sphere(vec3(0.0,0.0,1.0),0.5,4u,INVALID_ID_16BIT),
-    Sphere_Sphere(vec3(-2.0,0.0,1.0),0.5,5u,INVALID_ID_16BIT),
-    Sphere_Sphere(vec3(0.5,1.0,0.5),0.5,6u,INVALID_ID_16BIT)
-#if SPHERE_COUNT>8
-    ,Sphere_Sphere(vec3(-1.5,1.5,0.0),0.3,INVALID_ID_16BIT,0u)
-#endif
-};
-
-
-struct Triangle
-{
-    vec3 vertex0;
-    uint bsdfLightIDs;
-    vec3 vertex1;
-    uint padding0;
-    vec3 vertex2;
-    uint padding1;
-};
-
-Triangle Triangle_Triangle(in mat3 vertices, in uint bsdfID, in uint lightID)
-{
-    Triangle tri;
-    tri.vertex0 = vertices[0];
-    tri.vertex1 = vertices[1];
-    tri.vertex2 = vertices[2];
-    //
-    tri.bsdfLightIDs = bitfieldInsert(bsdfID, lightID, 16, 16);
-    return tri;
-}
-
-// return intersection distance if found, nbl_glsl_FLT_NAN otherwise
-float Triangle_intersect(in Triangle tri, in vec3 origin, in vec3 direction)
-{
-    const vec3 edges[2] = vec3[2](tri.vertex1-tri.vertex0,tri.vertex2-tri.vertex0);
-
-    const vec3 h = cross(direction,edges[1]);
-    const float a = dot(edges[0],h);
-
-    const vec3 relOrigin = origin-tri.vertex0;
-
-    const float u = dot(relOrigin,h)/a;
-
-    const vec3 q = cross(relOrigin,edges[0]);
-    const float v = dot(direction,q)/a;
-
-    const float t = dot(edges[1],q)/a;
-
-    return t>0.f&&u>=0.f&&v>=0.f&&(u+v)<=1.f ? t:nbl_glsl_FLT_NAN;
-}
-
-vec3 Triangle_getNormalTimesArea_impl(in mat2x3 edges)
-{
-    return cross(edges[0],edges[1])*0.5;
-}
-vec3 Triangle_getNormalTimesArea(in Triangle tri)
-{
-    return Triangle_getNormalTimesArea_impl(mat2x3(tri.vertex1-tri.vertex0,tri.vertex2-tri.vertex0));
-}
-
-
-
-struct Rectangle
-{
-    vec3 offset;
-    uint bsdfLightIDs;
-    vec3 edge0;
-    uint padding0;
-    vec3 edge1;
-    uint padding1;
-};
-
-Rectangle Rectangle_Rectangle(in vec3 offset, in vec3 edge0, in vec3 edge1, in uint bsdfID, in uint lightID)
-{
-    Rectangle rect;
-    rect.offset = offset;
-    rect.edge0 = edge0;
-    rect.edge1 = edge1;
-    //
-    rect.bsdfLightIDs = bitfieldInsert(bsdfID, lightID, 16, 16);
-    return rect;
-}
-
-void Rectangle_getNormalBasis(in Rectangle rect, out mat3 basis, out vec2 extents)
-{
-    extents = vec2(length(rect.edge0), length(rect.edge1));
-    basis[0] = rect.edge0/extents[0];
-    basis[1] = rect.edge1/extents[1];
-    basis[2] = normalize(cross(basis[0],basis[1]));
-}        
-
-// return intersection distance if found, nbl_glsl_FLT_NAN otherwise
-float Rectangle_intersect(in Rectangle rect, in vec3 origin, in vec3 direction)
-{
-    const vec3 h = cross(direction,rect.edge1);
-    const float a = dot(rect.edge0,h);
-
-    const vec3 relOrigin = origin-rect.offset;
-
-    const float u = dot(relOrigin,h)/a;
-
-    const vec3 q = cross(relOrigin,rect.edge0);
-    const float v = dot(direction,q)/a;
-
-    const float t = dot(rect.edge1,q)/a;
-
-    const bool intersection = t>0.f&&u>=0.f&&v>=0.f&&u<=1.f&&v<=1.f;
-    return intersection ? t:nbl_glsl_FLT_NAN;
-}
-
-vec3 Rectangle_getNormalTimesArea(in Rectangle rect)
-{
-    return cross(rect.edge0,rect.edge1);
-}
-
-
-
-#define DIFFUSE_OP 0u
-#define CONDUCTOR_OP 1u
-#define DIELECTRIC_OP 2u
-#define OP_BITS_OFFSET 0
-#define OP_BITS_SIZE 2
-struct BSDFNode
-{ 
-    uvec4 data[2];
-};
-
-uint BSDFNode_getType(in BSDFNode node)
-{
-    return bitfieldExtract(node.data[0].w,OP_BITS_OFFSET,OP_BITS_SIZE);
-}
-bool BSDFNode_isBSDF(in BSDFNode node)
-{
-    return BSDFNode_getType(node)==DIELECTRIC_OP;
-}
-bool BSDFNode_isNotDiffuse(in BSDFNode node)
-{
-    return BSDFNode_getType(node)!=DIFFUSE_OP;
-}
-float BSDFNode_getRoughness(in BSDFNode node)
-{
-    return uintBitsToFloat(node.data[1].w);
-}
-vec3 BSDFNode_getRealEta(in BSDFNode node)
-{
-    return uintBitsToFloat(node.data[0].rgb);
-}
-vec3 BSDFNode_getImaginaryEta(in BSDFNode node)
-{
-    return uintBitsToFloat(node.data[1].rgb);
-}
-mat2x3 BSDFNode_getEta(in BSDFNode node)
-{
-    return mat2x3(BSDFNode_getRealEta(node),BSDFNode_getImaginaryEta(node));
-}
-#include <nbl/builtin/glsl/bxdf/fresnel.glsl>
-vec3 BSDFNode_getReflectance(in BSDFNode node, in float VdotH)
-{
-    const vec3 albedoOrRealIoR = uintBitsToFloat(node.data[0].rgb);
-    if (BSDFNode_isNotDiffuse(node))
-        return nbl_glsl_fresnel_conductor(albedoOrRealIoR, BSDFNode_getImaginaryEta(node), VdotH);
-    else
-        return albedoOrRealIoR;
-}
-
-float BSDFNode_getNEEProb(in BSDFNode bsdf)
-{
-    const float alpha = BSDFNode_isNotDiffuse(bsdf) ? BSDFNode_getRoughness(bsdf):1.0;
-    return min(8.0*alpha,1.0);
-}
-
-#include <nbl/builtin/glsl/colorspace/EOTF.glsl>
-#include <nbl/builtin/glsl/colorspace/encodeCIEXYZ.glsl>
-float getLuma(in vec3 col)
-{
-    return dot(transpose(nbl_glsl_scRGBtoXYZ)[1],col);
-}
-
-#define BSDF_COUNT 7
-BSDFNode bsdfs[BSDF_COUNT] = {
-    {{uvec4(floatBitsToUint(vec3(0.8,0.8,0.8)),DIFFUSE_OP),floatBitsToUint(vec4(0.0,0.0,0.0,0.0))}},
-    {{uvec4(floatBitsToUint(vec3(0.8,0.4,0.4)),DIFFUSE_OP),floatBitsToUint(vec4(0.0,0.0,0.0,0.0))}},
-    {{uvec4(floatBitsToUint(vec3(0.4,0.8,0.4)),DIFFUSE_OP),floatBitsToUint(vec4(0.0,0.0,0.0,0.0))}},
-    {{uvec4(floatBitsToUint(vec3(1.02,1.02,1.3)),CONDUCTOR_OP),floatBitsToUint(vec4(1.0,1.0,2.0,0.0))}},
-    {{uvec4(floatBitsToUint(vec3(1.02,1.3,1.02)),CONDUCTOR_OP),floatBitsToUint(vec4(1.0,2.0,1.0,0.0))}},
-    {{uvec4(floatBitsToUint(vec3(1.02,1.3,1.02)),CONDUCTOR_OP),floatBitsToUint(vec4(1.0,2.0,1.0,0.15))}},
-    {{uvec4(floatBitsToUint(vec3(1.4,1.45,1.5)),DIELECTRIC_OP),floatBitsToUint(vec4(0.0,0.0,0.0,0.0625))}}
-};
-
-
-struct Light
-{
-    vec3 radiance;
-    uint objectID;
-};
-
-vec3 Light_getRadiance(in Light light)
-{
-    return light.radiance;
-}
-uint Light_getObjectID(in Light light)
-{
-    return light.objectID;
-}
-
-
-#define LIGHT_COUNT 1
-float scene_getLightChoicePdf(in Light light)
-{
-    return 1.0/float(LIGHT_COUNT);
-}
-
-
-#define LIGHT_COUNT 1
-Light lights[LIGHT_COUNT] =
-{
-    {
-        vec3(30.0,25.0,15.0),
-#ifdef POLYGON_METHOD
-        0u
-#else
-        8u
-#endif
-    }
-};
-
-
-
-#define ANY_HIT_FLAG (-2147483648)
-#define DEPTH_BITS_COUNT 8
-#define DEPTH_BITS_OFFSET (31-DEPTH_BITS_COUNT)
-struct ImmutableRay_t
-{
-    vec3 origin;
-    vec3 direction;
-#if POLYGON_METHOD==2
-    vec3 normalAtOrigin;
-    bool wasBSDFAtOrigin;
-#endif
-};
-struct MutableRay_t
-{
-    float intersectionT;
-    uint objectID;
-    /* irrelevant here
-    uint triangleID;
-    vec2 barycentrics;
-    */
-};
-struct Payload_t
-{
-    vec3 accumulation;
-    float otherTechniqueHeuristic;
-    vec3 throughput;
-    #ifdef KILL_DIFFUSE_SPECULAR_PATHS
-    bool hasDiffuse;
-    #endif
-};
-
-struct Ray_t
-{
-    ImmutableRay_t _immutable;
-    MutableRay_t _mutable;
-    Payload_t _payload;
-};
-
-
-#define INTERSECTION_ERROR_BOUND_LOG2 (-8.0)
-float getTolerance_common(in uint depth)
-{
-    float depthRcp = 1.0/float(depth);
-    return INTERSECTION_ERROR_BOUND_LOG2;// *depthRcp*depthRcp;
-}
-float getStartTolerance(in uint depth)
-{
-    return exp2(getTolerance_common(depth));
-}
-float getEndTolerance(in uint depth)
-{
-    return 1.0-exp2(getTolerance_common(depth)+1.0);
-}
-
-
-vec2 SampleSphericalMap(vec3 v)
-{
-    vec2 uv = vec2(atan(v.z, v.x), asin(v.y));
-    uv *= nbl_glsl_RECIPROCAL_PI*0.5;
-    uv += 0.5; 
-    return uv;
-}
-
-void missProgram(in ImmutableRay_t _immutable, inout Payload_t _payload)
-{
-    vec3 finalContribution = _payload.throughput; 
-    // #define USE_ENVMAP
-#ifdef USE_ENVMAP
-	vec2 uv = SampleSphericalMap(_immutable.direction);
-    finalContribution *= textureLod(envMap, uv, 0.0).rgb;
-#else
-    const vec3 kConstantEnvLightRadiance = vec3(0.15, 0.21, 0.3);
-    finalContribution *= kConstantEnvLightRadiance;
-    _payload.accumulation += finalContribution;
-#endif
-}
-
-#include <nbl/builtin/glsl/bxdf/brdf/diffuse/oren_nayar.glsl>
-#include <nbl/builtin/glsl/bxdf/brdf/specular/beckmann.glsl>
-#include <nbl/builtin/glsl/bxdf/brdf/specular/ggx.glsl>
-#include <nbl/builtin/glsl/bxdf/bsdf/diffuse/lambert.glsl>
-#include <nbl/builtin/glsl/bxdf/bsdf/specular/dielectric.glsl>
-#include <nbl/builtin/glsl/bxdf/bsdf/specular/beckmann.glsl>
-#include <nbl/builtin/glsl/bxdf/bsdf/specular/ggx.glsl>
-nbl_glsl_LightSample nbl_glsl_bsdf_cos_generate(in nbl_glsl_AnisotropicViewSurfaceInteraction interaction, in vec3 u, in BSDFNode bsdf, in float monochromeEta, out nbl_glsl_AnisotropicMicrofacetCache _cache)
-{
-    const float a = BSDFNode_getRoughness(bsdf);
-    const mat2x3 ior = BSDFNode_getEta(bsdf);
-    
-    // fresnel stuff for dielectrics
-    float orientedEta, rcpOrientedEta;
-    const bool viewerInsideMedium = nbl_glsl_getOrientedEtas(orientedEta,rcpOrientedEta,interaction.isotropic.NdotV,monochromeEta);
-
-    nbl_glsl_LightSample smpl;
-    nbl_glsl_AnisotropicMicrofacetCache dummy;
-    switch (BSDFNode_getType(bsdf))
-    {
-        case DIFFUSE_OP:
-            smpl = nbl_glsl_oren_nayar_cos_generate(interaction,u.xy,a*a);
-            break;
-        case CONDUCTOR_OP:
-            smpl = nbl_glsl_ggx_cos_generate(interaction,u.xy,a,a,_cache);
-            break;
-        default:
-            smpl = nbl_glsl_ggx_dielectric_cos_generate(interaction,u,a,a,monochromeEta,_cache);
-            break;
-    }
-    return smpl;
-}
-
-vec3 nbl_glsl_bsdf_cos_remainder_and_pdf(out float pdf, in nbl_glsl_LightSample _sample, in nbl_glsl_AnisotropicViewSurfaceInteraction interaction, in BSDFNode bsdf, in float monochromeEta, in nbl_glsl_AnisotropicMicrofacetCache _cache)
-{
-    // are V and L on opposite sides of the surface?
-    const bool transmitted = nbl_glsl_isTransmissionPath(interaction.isotropic.NdotV,_sample.NdotL);
-
-    // is the BSDF or BRDF, if it is then we make the dot products `abs` before `max(,0.0)`
-    const bool transmissive = BSDFNode_isBSDF(bsdf);
-    const float clampedNdotL = nbl_glsl_conditionalAbsOrMax(transmissive,_sample.NdotL,0.0);
-    const float clampedNdotV = nbl_glsl_conditionalAbsOrMax(transmissive,interaction.isotropic.NdotV,0.0);
-
-    vec3 remainder;
-
-    const float minimumProjVectorLen = 0.00000001;
-    if (clampedNdotV>minimumProjVectorLen && clampedNdotL>minimumProjVectorLen)
-    {
-        // fresnel stuff for conductors (but reflectance also doubles as albedo)
-        const mat2x3 ior = BSDFNode_getEta(bsdf);
-        const vec3 reflectance = BSDFNode_getReflectance(bsdf,_cache.isotropic.VdotH);
-
-        // fresnel stuff for dielectrics
-        float orientedEta, rcpOrientedEta;
-        const bool viewerInsideMedium = nbl_glsl_getOrientedEtas(orientedEta,rcpOrientedEta,interaction.isotropic.NdotV,monochromeEta);
-
-        //
-        const float VdotL = dot(interaction.isotropic.V.dir,_sample.L);
-
-        //
-        const float a = max(BSDFNode_getRoughness(bsdf),0.0001); // TODO: @Crisspl 0-roughness still doesn't work! Also Beckmann has a weird dark rim instead as fresnel!?
-        const float a2 = a*a;
-
-        // TODO: refactor into Material Compiler-esque thing
-        switch (BSDFNode_getType(bsdf))
-        {
-            case DIFFUSE_OP:
-                remainder = reflectance*nbl_glsl_oren_nayar_cos_remainder_and_pdf_wo_clamps(pdf,a*a,VdotL,clampedNdotL,clampedNdotV);
-                break;
-            case CONDUCTOR_OP:
-                remainder = nbl_glsl_ggx_cos_remainder_and_pdf_wo_clamps(pdf,nbl_glsl_ggx_trowbridge_reitz(a2,_cache.isotropic.NdotH2),clampedNdotL,_sample.NdotL2,clampedNdotV,interaction.isotropic.NdotV_squared,reflectance,a2);
-                break;
-            default:
-                remainder = vec3(nbl_glsl_ggx_dielectric_cos_remainder_and_pdf(pdf, _sample, interaction.isotropic, _cache.isotropic, monochromeEta, a*a));
-                break;
-        }
-    }
-    else
-        remainder = vec3(0.0);
-    return remainder;
-}
-
-layout (constant_id = 0) const int MAX_DEPTH_LOG2 = 4;
-layout (constant_id = 1) const int MAX_SAMPLES_LOG2 = 10;
-
-
-#include <nbl/builtin/glsl/random/xoroshiro.glsl>
-
-mat2x3 rand3d(in uint protoDimension, in uint _sample, inout nbl_glsl_xoroshiro64star_state_t scramble_state)
-{
-    mat2x3 retval;
-    uint address = bitfieldInsert(protoDimension,_sample,MAX_DEPTH_LOG2,MAX_SAMPLES_LOG2);
-    for (int i=0; i<2u; i++)
-    {
-	    uvec3 seqVal = texelFetch(sampleSequence,int(address)+i).xyz;
-	    seqVal ^= uvec3(nbl_glsl_xoroshiro64star(scramble_state),nbl_glsl_xoroshiro64star(scramble_state),nbl_glsl_xoroshiro64star(scramble_state));
-        retval[i] = vec3(seqVal)*uintBitsToFloat(0x2f800004u);
-    }
-    return retval;
-}
-
-
-void traceRay_extraShape(inout int objectID, inout float intersectionT, in vec3 origin, in vec3 direction);
-int traceRay(inout float intersectionT, in vec3 origin, in vec3 direction)
-{
-    const bool anyHit = intersectionT!=nbl_glsl_FLT_MAX;
-
-	int objectID = -1;
-	for (int i=0; i<SPHERE_COUNT; i++)
-    {
-        float t = Sphere_intersect(spheres[i],origin,direction);
-        bool closerIntersection = t>0.0 && t<intersectionT;
-
-        intersectionT = closerIntersection ? t : intersectionT;
-		objectID = closerIntersection ? i:objectID;
-        
-        // allowing early out results in a performance regression, WTF!?
-        //if (anyHit && closerIntersection)
-           //break;
-    }
-    traceRay_extraShape(objectID,intersectionT,origin,direction);
-    return objectID;
-}
-
-//
-float nbl_glsl_light_deferred_pdf(in Light light, in Ray_t ray);
-vec3 nbl_glsl_light_deferred_eval_and_prob(out float pdf, in Light light, in Ray_t ray)
-{
-    // we don't have to worry about solid angle of the light w.r.t. surface of the light because this function only ever gets called from closestHit routine, so such ray cannot be produced (because lights have no BSDFs here)
-    pdf = scene_getLightChoicePdf(light);
-    pdf *= nbl_glsl_light_deferred_pdf(light,ray);
-    return Light_getRadiance(light);
-}
-
-vec3 nbl_glsl_light_generate_and_pdf(out float pdf, out float newRayMaxT, in vec3 origin, in nbl_glsl_AnisotropicViewSurfaceInteraction interaction, in bool isBSDF, in vec3 xi, in uint objectID);
-nbl_glsl_LightSample nbl_glsl_light_generate_and_remainder_and_pdf(out vec3 remainder, out float pdf, out float newRayMaxT, in vec3 origin, in nbl_glsl_AnisotropicViewSurfaceInteraction interaction, in bool isBSDF, in vec3 xi, in uint depth)
-{
-    // normally we'd pick from set of lights, using `xi.z`
-    const Light light = lights[0];
-    
-    vec3 L = nbl_glsl_light_generate_and_pdf(pdf,newRayMaxT,origin,interaction,isBSDF,xi,Light_getObjectID(light));
-
-    newRayMaxT *= getEndTolerance(depth);
-    pdf *= scene_getLightChoicePdf(light);
-    remainder = Light_getRadiance(light)/pdf;
-    return nbl_glsl_createLightSample(L,interaction);
-}
-
-uint getBSDFLightIDAndDetermineNormal(out vec3 normal, in uint objectID, in vec3 intersection);
-bool closestHitProgram(in uint depth, in uint _sample, inout Ray_t ray, inout nbl_glsl_xoroshiro64star_state_t scramble_state)
-{
-    const MutableRay_t _mutable = ray._mutable;
-    const uint objectID = _mutable.objectID;
-
-    // interaction stuffs
-    const ImmutableRay_t _immutable = ray._immutable;
-    const vec3 intersection = _immutable.origin+_immutable.direction*_mutable.intersectionT;
-
-    uint bsdfLightIDs;
-    nbl_glsl_AnisotropicViewSurfaceInteraction interaction;
-    {
-        nbl_glsl_IsotropicViewSurfaceInteraction isotropic;
-        bsdfLightIDs = getBSDFLightIDAndDetermineNormal(isotropic.N,objectID,intersection);
-
-        isotropic.V.dir = -_immutable.direction;
-        isotropic.NdotV = dot(isotropic.V.dir,isotropic.N);
-        isotropic.NdotV_squared = isotropic.NdotV*isotropic.NdotV;
-
-        interaction = nbl_glsl_calcAnisotropicInteraction(isotropic);
-    }
-
-    //
-    vec3 throughput = ray._payload.throughput;
-
-    // add emissive and finish MIS
-    const uint lightID = bitfieldExtract(bsdfLightIDs,16,16);
-    if (lightID != INVALID_ID_16BIT) // has emissive
-    {
-        float lightPdf;
-        ray._payload.accumulation += nbl_glsl_light_deferred_eval_and_prob(lightPdf,lights[lightID],ray)*throughput/(1.0+lightPdf*lightPdf*ray._payload.otherTechniqueHeuristic);
-    }
-
-    // check if we even have a BSDF at all
-    uint bsdfID = bitfieldExtract(bsdfLightIDs, 0, 16);
-    if (bsdfID != INVALID_ID_16BIT)
-    {
-        BSDFNode bsdf = bsdfs[bsdfID];
-#ifdef KILL_DIFFUSE_SPECULAR_PATHS
-        if (BSDFNode_isNotDiffuse(bsdf))
-        {
-            if (ray._payload.hasDiffuse)
-                return true;
-        }
-        else
-            ray._payload.hasDiffuse = true;
-#endif
-
-        const bool isBSDF = BSDFNode_isBSDF(bsdf);
-        //rand
-        mat2x3 epsilon = rand3d(depth,_sample,scramble_state);
-
-        // thresholds
-        const float bsdfPdfThreshold = 0.0001;
-        const float lumaContributionThreshold = getLuma(nbl_glsl_eotf_sRGB(vec3(1.0)/255.0)); // OETF smallest perceptible value
-        const vec3 throughputCIE_Y = transpose(nbl_glsl_sRGBtoXYZ)[1]*throughput;
-        const float monochromeEta = dot(throughputCIE_Y,BSDFNode_getEta(bsdf)[0])/(throughputCIE_Y.r+throughputCIE_Y.g+throughputCIE_Y.b);
-
-        // do NEE
-        const float neeProbability = 1.0;// BSDFNode_getNEEProb(bsdf);
-        float rcpChoiceProb;
-        if (!nbl_glsl_partitionRandVariable(neeProbability,epsilon[0].z,rcpChoiceProb) && depth<2u)
-        {
-            vec3 neeContrib; float lightPdf, t;
-            nbl_glsl_LightSample nee_sample = nbl_glsl_light_generate_and_remainder_and_pdf(
-                neeContrib, lightPdf, t,
-                intersection, interaction,
-                isBSDF, epsilon[0], depth
-            );
-            // We don't allow non watertight transmitters in this renderer
-            bool validPath = nee_sample.NdotL>nbl_glsl_FLT_MIN;
-            // but if we allowed non-watertight transmitters (single water surface), it would make sense just to apply this line by itself
-            nbl_glsl_AnisotropicMicrofacetCache _cache;
-            validPath = validPath && nbl_glsl_calcAnisotropicMicrofacetCache(_cache, interaction, nee_sample, monochromeEta);
-            if (lightPdf<nbl_glsl_FLT_MAX)
-            {
-            if (any(isnan(nee_sample.L)))
-                ray._payload.accumulation += vec3(1000.f,0.f,0.f);
-            else
-            if (all(equal(vec3(69.f),nee_sample.L)))
-                ray._payload.accumulation += vec3(0.f,1000.f,0.f);
-            else
-            if (validPath)
-            {
-                float bsdfPdf;
-                neeContrib *= nbl_glsl_bsdf_cos_remainder_and_pdf(bsdfPdf,nee_sample,interaction,bsdf,monochromeEta,_cache)*throughput;
-                const float otherGenOverChoice = bsdfPdf*rcpChoiceProb;
-#if 0
-                const float otherGenOverLightAndChoice = otherGenOverChoice/lightPdf;
-                neeContrib *= otherGenOverChoice/(1.f+otherGenOverLightAndChoice*otherGenOverLightAndChoice); // MIS weight
-#else
-                neeContrib *= otherGenOverChoice;
-#endif
-                if (bsdfPdf<nbl_glsl_FLT_MAX && getLuma(neeContrib)>lumaContributionThreshold && traceRay(t,intersection+nee_sample.L*t*getStartTolerance(depth),nee_sample.L)==-1)
-                    ray._payload.accumulation += neeContrib;
-            }}
-        }
-#if 1
-        return false;
-#endif
-        // sample BSDF
-        float bsdfPdf; vec3 bsdfSampleL;
-        {
-            nbl_glsl_AnisotropicMicrofacetCache _cache;
-            nbl_glsl_LightSample bsdf_sample = nbl_glsl_bsdf_cos_generate(interaction,epsilon[1],bsdf,monochromeEta,_cache);
-            // the value of the bsdf divided by the probability of the sample being generated
-            throughput *= nbl_glsl_bsdf_cos_remainder_and_pdf(bsdfPdf,bsdf_sample,interaction,bsdf,monochromeEta,_cache);
-            //
-            bsdfSampleL = bsdf_sample.L;
-        }
-        
-        // additional threshold
-        const float lumaThroughputThreshold = lumaContributionThreshold;
-        if (bsdfPdf>bsdfPdfThreshold && getLuma(throughput)>lumaThroughputThreshold)
-        {
-            ray._payload.throughput = throughput;
-            ray._payload.otherTechniqueHeuristic = neeProbability/bsdfPdf; // numerically stable, don't touch
-            ray._payload.otherTechniqueHeuristic *= ray._payload.otherTechniqueHeuristic;
-                    
-            // trace new ray
-            ray._immutable.origin = intersection+bsdfSampleL*(1.0/*kSceneSize*/)*getStartTolerance(depth);
-            ray._immutable.direction = bsdfSampleL;
-            #if POLYGON_METHOD==2
-            ray._immutable.normalAtOrigin = interaction.isotropic.N;
-            ray._immutable.wasBSDFAtOrigin = isBSDF;
-            #endif
-            return true;
-        }
-    }
-    return false;
-}
-
-void main()
-{
-    const ivec2 imageExtents = imageSize(outImage);
-    const ivec2 coords = getCoordinates();
-    vec2 texCoord = vec2(coords) / vec2(imageExtents);
-    texCoord.y = 1.0 - texCoord.y;
-
-    if (false == (all(lessThanEqual(ivec2(0),coords)) && all(greaterThan(imageExtents,coords)))) {
-        return;
-    }
-
-    if (((MAX_DEPTH-1)>>MAX_DEPTH_LOG2)>0 || ((SAMPLES-1)>>MAX_SAMPLES_LOG2)>0)
-    {
-        vec4 pixelCol = vec4(1.0,0.0,0.0,1.0);
-        imageStore(outImage, coords, pixelCol);
-        return;
-    }
-
-	nbl_glsl_xoroshiro64star_state_t scramble_start_state = texelFetch(scramblebuf,coords,0).rg;
-    const vec2 pixOffsetParam = vec2(1.0)/vec2(textureSize(scramblebuf,0));
-
-
-    const mat4 invMVP = inverse(cameraData.params.MVP);
-    
-    vec4 NDC = vec4(texCoord*vec2(2.0,-2.0)+vec2(-1.0,1.0),0.0,1.0);
-    vec3 camPos;
-    {
-        vec4 tmp = invMVP*NDC;
-        camPos = tmp.xyz/tmp.w;
-        NDC.z = 1.0;
-    }
-
-    vec3 color = vec3(0.0);
-    float meanLumaSquared = 0.0;
-    // TODO: if we collapse the nested for loop, then all GPUs will get `MAX_DEPTH` factor speedup, not just NV with separate PC
-    for (int i=0; i<SAMPLES; i++)
-    {
-        nbl_glsl_xoroshiro64star_state_t scramble_state = scramble_start_state;
-
-        Ray_t ray;
-        // raygen
-        {
-            ray._immutable.origin = camPos;
-
-            vec4 tmp = NDC;
-            // apply stochastic reconstruction filter
-            const float gaussianFilterCutoff = 2.5;
-            const float truncation = exp(-0.5*gaussianFilterCutoff*gaussianFilterCutoff);
-            vec2 remappedRand = rand3d(0u,i,scramble_state)[0].xy;
-            remappedRand.x *= 1.0-truncation;
-            remappedRand.x += truncation;
-            tmp.xy += pixOffsetParam*nbl_glsl_BoxMullerTransform(remappedRand,1.5);
-            // for depth of field we could do another stochastic point-pick
-            tmp = invMVP*tmp;
-            ray._immutable.direction = normalize(tmp.xyz/tmp.w-camPos);
-
-            #if POLYGON_METHOD==2
-                ray._immutable.normalAtOrigin = vec3(0.0,0.0,0.0);
-                ray._immutable.wasBSDFAtOrigin = false;
-            #endif
-
-            ray._payload.accumulation = vec3(0.0);
-            ray._payload.otherTechniqueHeuristic = 0.0; // needed for direct eye-light paths
-            ray._payload.throughput = vec3(1.0);
-            #ifdef KILL_DIFFUSE_SPECULAR_PATHS
-            ray._payload.hasDiffuse = false;
-            #endif
-        }
-
-        // bounces
-        {
-            bool hit = true; bool rayAlive = true;
-            for (int d=1; d<=MAX_DEPTH && hit && rayAlive; d+=2)
-            {
-                ray._mutable.intersectionT = nbl_glsl_FLT_MAX;
-                ray._mutable.objectID = traceRay(ray._mutable.intersectionT,ray._immutable.origin,ray._immutable.direction);
-                hit = ray._mutable.objectID!=-1;
-                if (hit)
-                    rayAlive = closestHitProgram(d, i, ray, scramble_state);
-            }
-            // was last trace a miss?
-            if (!hit)
-                missProgram(ray._immutable,ray._payload);
-        }
-
-        vec3 accumulation = ray._payload.accumulation;
-
-        float rcpSampleSize = 1.0/float(i+1);
-        color += (accumulation-color)*rcpSampleSize;
-        
-        #ifdef VISUALIZE_HIGH_VARIANCE
-            float luma = getLuma(accumulation);
-            meanLumaSquared += (luma*luma-meanLumaSquared)*rcpSampleSize;
-        #endif
-    }
-
-    #ifdef VISUALIZE_HIGH_VARIANCE
-        float variance = getLuma(color);
-        variance *= variance;
-        variance = meanLumaSquared-variance;
-        if (variance>5.0)
-            color = vec3(1.0,0.0,0.0);
-    #endif
-
-    vec4 pixelCol = vec4(color, 1.0);
-    imageStore(outImage, coords, pixelCol);
-}
-/** TODO: Improving Rendering
-
-Now:
-- Always MIS (path correlated reuse)
-- Test MIS alpha (roughness) scheme
-
-Many Lights:
-- Path Guiding
-- Light Importance Lists/Classification
-- Spatio-Temporal Reservoir Sampling
-
-Indirect Light:
-- Bidirectional Path Tracing
-- Uniform Path Sampling / Vertex Connection and Merging / Path Space Regularization
-
-Animations:
-- A-SVGF / BMFR
-**/
\ No newline at end of file
diff --git a/42_FragmentShaderPathTracer/config.json.template b/42_FragmentShaderPathTracer/config.json.template
deleted file mode 100644
index f961745c1..000000000
--- a/42_FragmentShaderPathTracer/config.json.template
+++ /dev/null
@@ -1,28 +0,0 @@
-{
-  "enableParallelBuild": true,
-  "threadsPerBuildProcess" : 2,
-  "isExecuted": false,
-  "scriptPath": "",
-  "cmake": {
-    "configurations": [ "Release", "Debug", "RelWithDebInfo" ],
-    "buildModes": [],
-    "requiredOptions": []
-  }, 
-  "profiles": [
-    {
-      "backend": "vulkan",
-      "platform": "windows",
-      "buildModes": [],
-      "runConfiguration": "Release",
-      "gpuArchitectures": []
-    }
-  ],
-  "dependencies": [],
-  "data": [
-    {
-      "dependencies": [],
-      "command": [""],
-      "outputs": []
-    }
-  ]
-}
\ No newline at end of file
diff --git a/42_FragmentShaderPathTracer/litByRectangle.comp b/42_FragmentShaderPathTracer/litByRectangle.comp
deleted file mode 100644
index 300cef559..000000000
--- a/42_FragmentShaderPathTracer/litByRectangle.comp
+++ /dev/null
@@ -1,182 +0,0 @@
-// Copyright (C) 2018-2020 - DevSH Graphics Programming Sp. z O.O.
-// This file is part of the "Nabla Engine".
-// For conditions of distribution and use, see copyright notice in nabla.h
-
-#version 430 core
-#extension GL_GOOGLE_include_directive : require
-
-#define SPHERE_COUNT 8
-#define POLYGON_METHOD 1 // 0 area sampling, 1 solid angle sampling, 2 approximate projected solid angle sampling
-#include "common.glsl"
-
-#define RECTANGLE_COUNT 1
-const vec3 edge0 = normalize(vec3(2,0,-1));
-const vec3 edge1 = normalize(vec3(2,-5,4));
-Rectangle rectangles[RECTANGLE_COUNT] = {
-    Rectangle_Rectangle(vec3(-3.8,0.35,1.3),edge0*7.0,edge1*0.1,INVALID_ID_16BIT,0u)
-};
-
-
-void traceRay_extraShape(inout int objectID, inout float intersectionT, in vec3 origin, in vec3 direction)
-{
-	for (int i=0; i<RECTANGLE_COUNT; i++)
-    {
-        float t = Rectangle_intersect(rectangles[i],origin,direction);
-        bool closerIntersection = t>0.0 && t<intersectionT;
-
-		objectID = closerIntersection ? (i+SPHERE_COUNT):objectID;
-        intersectionT = closerIntersection ? t:intersectionT;
-    }
-}
-
-#include <nbl/builtin/glsl/sampling/projected_spherical_triangle.glsl>
-#include <nbl/builtin/glsl/barycentric/utils.glsl>
-#include <nbl/builtin/glsl/sampling/spherical_rectangle.glsl>
-
-float nbl_glsl_light_deferred_pdf(in Light light, in Ray_t ray)
-{
-    const Rectangle rect = rectangles[Light_getObjectID(light)];
-    
-    const ImmutableRay_t _immutable = ray._immutable;
-    const vec3 L = _immutable.direction;
-#if POLYGON_METHOD==0
-    const float dist = ray._mutable.intersectionT;
-    return dist*dist/abs(dot(Rectangle_getNormalTimesArea(rect),L));
-#else
-    #ifdef TRIANGLE_REFERENCE
-        const mat3 sphericalVertices[2] = 
-        {
-            nbl_glsl_shapes_getSphericalTriangle(mat3(rect.offset,rect.offset+rect.edge0,rect.offset+rect.edge1),_immutable.origin),
-            nbl_glsl_shapes_getSphericalTriangle(mat3(rect.offset+rect.edge1,rect.offset+rect.edge0,rect.offset+rect.edge0+rect.edge1),_immutable.origin)
-        };
-        float solidAngle[2];
-        vec3 cos_vertices[2],sin_vertices[2];
-        float cos_a[2],cos_c[2],csc_b[2],csc_c[2];
-        for (uint i=0u; i<2u; i++)
-            solidAngle[i] = nbl_glsl_shapes_SolidAngleOfTriangle(sphericalVertices[i],cos_vertices[i],sin_vertices[i],cos_a[i],cos_c[i],csc_b[i],csc_c[i]);
-        const float rectSolidAngle = solidAngle[0]+solidAngle[1];
-        #if POLYGON_METHOD==1
-            return 1.f/rectSolidAngle;
-        #elif POLYGON_METHOD==2
-            // TODO: figure out what breaks for a directly visible light under MIS
-            if (rectSolidAngle > nbl_glsl_FLT_MIN)
-            {
-                const vec2 bary = nbl_glsl_barycentric_reconstructBarycentrics(L*ray._mutable.intersectionT+_immutable.origin-rect.offset,mat2x3(rect.edge0,rect.edge1));
-                const uint i = bary.x>=0.f&&bary.y>=0.f&&(bary.x+bary.y)<=1.f ? 0u:1u;
-
-                float pdf = nbl_glsl_sampling_probProjectedSphericalTriangleSample(solidAngle[i],cos_vertices[i],sin_vertices[i],cos_a[i],cos_c[i],csc_b[i],csc_c[i],sphericalVertices[i],_immutable.normalAtOrigin,_immutable.wasBSDFAtOrigin,L);
-                pdf *= solidAngle[i]/rectSolidAngle;
-                return pdf;
-            }
-            else
-                return nbl_glsl_FLT_INF;
-        #endif
-    #else
-        float pdf;
-        mat3 rectNormalBasis;
-        vec2 rectExtents;
-        Rectangle_getNormalBasis(rect, rectNormalBasis, rectExtents);
-        vec3 sphR0 = nbl_glsl_shapes_getSphericalRectangle(_immutable.origin, rect.offset, rectNormalBasis);
-        float solidAngle = nbl_glsl_shapes_SolidAngleOfRectangle(sphR0, rectExtents);
-        if (solidAngle > nbl_glsl_FLT_MIN)
-        {
-            #if POLYGON_METHOD==1
-            pdf = 1.f/solidAngle;
-            #else
-                #error
-            #endif  
-        }
-        else
-            pdf = nbl_glsl_FLT_INF;
-        return pdf;
-    #endif
-#endif
-}
-
-vec3 nbl_glsl_light_generate_and_pdf(out float pdf, out float newRayMaxT, in vec3 origin, in nbl_glsl_AnisotropicViewSurfaceInteraction interaction, in bool isBSDF, in vec3 xi, in uint objectID)
-{
-    const Rectangle rect = rectangles[objectID];
-    const vec3 N = Rectangle_getNormalTimesArea(rect);
-
-    const vec3 origin2origin = rect.offset-origin;
-#if POLYGON_METHOD==0
-    vec3 L = origin2origin+rect.edge0*xi.x+rect.edge1*xi.y; // TODO: refactor
-    
-    const float distanceSq = dot(L,L);
-    const float rcpDistance = inversesqrt(distanceSq);
-    L *= rcpDistance;
-    
-    pdf = distanceSq/abs(dot(N,L));
-    newRayMaxT = 1.0/rcpDistance;
-    return L;
-#else 
-    #ifdef TRIANGLE_REFERENCE
-        const mat3 sphericalVertices[2] = 
-        {
-            nbl_glsl_shapes_getSphericalTriangle(mat3(rect.offset,rect.offset+rect.edge0,rect.offset+rect.edge1),origin),
-            nbl_glsl_shapes_getSphericalTriangle(mat3(rect.offset+rect.edge1,rect.offset+rect.edge0,rect.offset+rect.edge0+rect.edge1),origin)
-        };
-        float solidAngle[2];
-        vec3 cos_vertices[2],sin_vertices[2];
-        float cos_a[2],cos_c[2],csc_b[2],csc_c[2];
-        for (uint i=0u; i<2u; i++)
-            solidAngle[i] = nbl_glsl_shapes_SolidAngleOfTriangle(sphericalVertices[i],cos_vertices[i],sin_vertices[i],cos_a[i],cos_c[i],csc_b[i],csc_c[i]);
-        vec3 L = vec3(0.f,0.f,0.f);
-        const float rectangleSolidAngle = solidAngle[0]+solidAngle[1];
-        if (rectangleSolidAngle > nbl_glsl_FLT_MIN)
-        {
-            float rcpTriangleChoiceProb;
-            const uint i = nbl_glsl_partitionRandVariable(solidAngle[0]/rectangleSolidAngle,xi.z,rcpTriangleChoiceProb) ? 1u:0u;
-        #if POLYGON_METHOD==1
-            L = nbl_glsl_sampling_generateSphericalTriangleSample(solidAngle[i],cos_vertices[i],sin_vertices[i],cos_a[i],cos_c[i],csc_b[i],csc_c[i],sphericalVertices[i],xi.xy);
-            pdf = 1.f/rectangleSolidAngle;
-        #elif POLYGON_METHOD==2
-            float rcpPdf;
-            L = nbl_glsl_sampling_generateProjectedSphericalTriangleSample(rcpPdf,solidAngle[i],cos_vertices[i],sin_vertices[i],cos_a[i],cos_c[i],csc_b[i],csc_c[i],sphericalVertices[i],interaction.isotropic.N,isBSDF,xi.xy);
-            pdf = 1.f/(rcpPdf*rcpTriangleChoiceProb);
-        #endif
-        }
-        else
-            pdf = nbl_glsl_FLT_INF;
-    #else
-        mat3 rectNormalBasis;
-        vec2 rectExtents;
-        Rectangle_getNormalBasis(rect, rectNormalBasis, rectExtents);
-        vec3 sphR0 = nbl_glsl_shapes_getSphericalRectangle(origin, rect.offset, rectNormalBasis);
-        vec3 L = vec3(0.f,0.f,0.f);
-        float solidAngle;
-        vec2 sphUv = nbl_glsl_sampling_generateSphericalRectangleSample(sphR0, rectExtents, xi.xy, solidAngle);
-        if (solidAngle > nbl_glsl_FLT_MIN)
-        {
-            #if POLYGON_METHOD==1
-            vec3 sph_sample = sphUv[0] * rect.edge0 + sphUv[1] * rect.edge1 + rect.offset;
-            L = normalize(sph_sample - origin);
-            pdf = 1.f/solidAngle;
-            #else
-                #error
-            #endif  
-        }
-        else
-            pdf = nbl_glsl_FLT_INF;
-    #endif
-    newRayMaxT = dot(N,origin2origin)/dot(N,L);
-    return L;
-#endif
-}
-
-
-uint getBSDFLightIDAndDetermineNormal(out vec3 normal, in uint objectID, in vec3 intersection)
-{
-    if (objectID<SPHERE_COUNT)
-    {
-        Sphere sphere = spheres[objectID];
-        normal = Sphere_getNormal(sphere,intersection);
-        return sphere.bsdfLightIDs;
-    }
-    else
-    {
-        Rectangle rect = rectangles[objectID-SPHERE_COUNT];
-        normal = normalize(Rectangle_getNormalTimesArea(rect));
-        return rect.bsdfLightIDs;
-    }
-}
\ No newline at end of file
diff --git a/42_FragmentShaderPathTracer/litBySphere.comp b/42_FragmentShaderPathTracer/litBySphere.comp
deleted file mode 100644
index bd1a48575..000000000
--- a/42_FragmentShaderPathTracer/litBySphere.comp
+++ /dev/null
@@ -1,60 +0,0 @@
-// Copyright (C) 2018-2020 - DevSH Graphics Programming Sp. z O.O.
-// This file is part of the "Nabla Engine".
-// For conditions of distribution and use, see copyright notice in nabla.h
-
-#version 430 core
-#extension GL_GOOGLE_include_directive : require
-
-#define SPHERE_COUNT 9
-#include "common.glsl"
-
-
-void traceRay_extraShape(inout int objectID, inout float intersectionT, in vec3 origin, in vec3 direction)
-{
-}
-
-float nbl_glsl_light_deferred_pdf(in Light light, in Ray_t ray)
-{
-    const Sphere sphere = spheres[ray._mutable.objectID];
-    return 1.0/Sphere_getSolidAngle(sphere,ray._immutable.origin);
-}
-
-vec3 nbl_glsl_light_generate_and_pdf(out float pdf, out float newRayMaxT, in vec3 origin, in nbl_glsl_AnisotropicViewSurfaceInteraction interaction, in bool isBSDF, in vec3 xi, in uint objectID)
-{
-    const Sphere sphere = spheres[objectID];
-
-    vec3 Z = sphere.position-origin;
-    const float distanceSQ = dot(Z,Z);
-    const float cosThetaMax2 = 1.0-sphere.radius2/distanceSQ;
-    if (cosThetaMax2>0.0)
-    {
-        const float rcpDistance = inversesqrt(distanceSQ);
-        Z *= rcpDistance;
-    
-        const float cosThetaMax = sqrt(cosThetaMax2);
-        const float cosTheta = mix(1.0,cosThetaMax,xi.x);
-
-        vec3 L = Z*cosTheta;
-
-        const float cosTheta2 = cosTheta*cosTheta;
-        const float sinTheta = sqrt(1.0-cosTheta2);
-        float sinPhi,cosPhi;
-        nbl_glsl_sincos(2.0*nbl_glsl_PI*xi.y-nbl_glsl_PI,sinPhi,cosPhi);
-        mat2x3 XY = nbl_glsl_frisvad(Z);
-    
-        L += (XY[0]*cosPhi+XY[1]*sinPhi)*sinTheta;
-    
-        newRayMaxT = (cosTheta-sqrt(cosTheta2-cosThetaMax2))/rcpDistance;
-        pdf = 1.0/Sphere_getSolidAngle_impl(cosThetaMax);
-        return L;
-    }
-    pdf = 0.0;
-    return vec3(0.0,0.0,0.0);
-}
-
-uint getBSDFLightIDAndDetermineNormal(out vec3 normal, in uint objectID, in vec3 intersection)
-{
-    Sphere sphere = spheres[objectID];
-    normal = Sphere_getNormal(sphere,intersection);
-    return sphere.bsdfLightIDs;
-}
\ No newline at end of file
diff --git a/42_FragmentShaderPathTracer/litByTriangle.comp b/42_FragmentShaderPathTracer/litByTriangle.comp
deleted file mode 100644
index ba23c82e5..000000000
--- a/42_FragmentShaderPathTracer/litByTriangle.comp
+++ /dev/null
@@ -1,105 +0,0 @@
-// Copyright (C) 2018-2020 - DevSH Graphics Programming Sp. z O.O.
-// This file is part of the "Nabla Engine".
-// For conditions of distribution and use, see copyright notice in nabla.h
-
-#version 430 core
-#extension GL_GOOGLE_include_directive : require
-
-#define SPHERE_COUNT 8
-#define POLYGON_METHOD 1 // 0 area sampling, 1 solid angle sampling, 2 approximate projected solid angle sampling
-#include "common.glsl"
-
-#define TRIANGLE_COUNT 1
-Triangle triangles[TRIANGLE_COUNT] = {
-    Triangle_Triangle(mat3(vec3(-1.8,0.35,0.3),vec3(-1.2,0.35,0.0),vec3(-1.5,0.8,-0.3))*10.0,INVALID_ID_16BIT,0u)
-};
-
-void traceRay_extraShape(inout int objectID, inout float intersectionT, in vec3 origin, in vec3 direction)
-{
-	for (int i=0; i<TRIANGLE_COUNT; i++)
-    {
-        float t = Triangle_intersect(triangles[i],origin,direction);
-        bool closerIntersection = t>0.0 && t<intersectionT;
-
-		objectID = closerIntersection ? (i+SPHERE_COUNT):objectID;
-        intersectionT = closerIntersection ? t:intersectionT;
-    }
-}
-
-
-#include <nbl/builtin/glsl/sampling/projected_spherical_triangle.glsl>
-float nbl_glsl_light_deferred_pdf(in Light light, in Ray_t ray)
-{
-    const Triangle tri = triangles[Light_getObjectID(light)];
-
-    const vec3 L = ray._immutable.direction;
-#if POLYGON_METHOD==0
-    const float dist = ray._mutable.intersectionT;
-    return dist*dist/abs(dot(Triangle_getNormalTimesArea(tri),L));
-#else
-    const ImmutableRay_t _immutable = ray._immutable;
-    const mat3 sphericalVertices = nbl_glsl_shapes_getSphericalTriangle(mat3(tri.vertex0,tri.vertex1,tri.vertex2),_immutable.origin);
-    #if POLYGON_METHOD==1
-        const float rcpProb = nbl_glsl_shapes_SolidAngleOfTriangle(sphericalVertices);
-        // if `rcpProb` is NAN then the triangle's solid angle was close to 0.0 
-        return rcpProb>nbl_glsl_FLT_MIN ? (1.0/rcpProb):nbl_glsl_FLT_MAX;
-    #elif POLYGON_METHOD==2
-        const float pdf = nbl_glsl_sampling_probProjectedSphericalTriangleSample(sphericalVertices,_immutable.normalAtOrigin,_immutable.wasBSDFAtOrigin,L);
-        // if `pdf` is NAN then the triangle's projected solid angle was close to 0.0, if its close to INF then the triangle was very small
-        return pdf<nbl_glsl_FLT_MAX ? pdf:0.0;
-    #endif
-#endif
-}
-
-vec3 nbl_glsl_light_generate_and_pdf(out float pdf, out float newRayMaxT, in vec3 origin, in nbl_glsl_AnisotropicViewSurfaceInteraction interaction, in bool isBSDF, in vec3 xi, in uint objectID)
-{
-    const Triangle tri = triangles[objectID];
-    
-#if POLYGON_METHOD==0
-    const mat2x3 edges = mat2x3(tri.vertex1-tri.vertex0,tri.vertex2-tri.vertex0);
-    const float sqrtU = sqrt(xi.x);
-    vec3 point = tri.vertex0+edges[0]*(1.0-sqrtU)+edges[1]*sqrtU*xi.y;
-    vec3 L = point-origin;
-    
-    const float distanceSq = dot(L,L);
-    const float rcpDistance = inversesqrt(distanceSq);
-    L *= rcpDistance;
-    
-    pdf = distanceSq/abs(dot(Triangle_getNormalTimesArea_impl(edges),L));
-    newRayMaxT = 1.0/rcpDistance;
-    return L;
-#else 
-    float rcpPdf;
-
-    const mat3 sphericalVertices = nbl_glsl_shapes_getSphericalTriangle(mat3(tri.vertex0,tri.vertex1,tri.vertex2),origin);
-#if POLYGON_METHOD==1
-    const vec3 L = nbl_glsl_sampling_generateSphericalTriangleSample(rcpPdf,sphericalVertices,xi.xy);
-#elif POLYGON_METHOD==2
-    const vec3 L = nbl_glsl_sampling_generateProjectedSphericalTriangleSample(rcpPdf,sphericalVertices,interaction.isotropic.N,isBSDF,xi.xy);
-#endif
-
-    // if `rcpProb` is NAN or negative then the triangle's solidAngle or projectedSolidAngle was close to 0.0 
-    pdf = rcpPdf>nbl_glsl_FLT_MIN ? (1.0/rcpPdf):0.0;
-
-    const vec3 N = Triangle_getNormalTimesArea(tri);
-    newRayMaxT = dot(N,tri.vertex0-origin)/dot(N,L);
-    return L;
-#endif
-}
-
-
-uint getBSDFLightIDAndDetermineNormal(out vec3 normal, in uint objectID, in vec3 intersection)
-{
-    if (objectID<SPHERE_COUNT)
-    {
-        Sphere sphere = spheres[objectID];
-        normal = Sphere_getNormal(sphere,intersection);
-        return sphere.bsdfLightIDs;
-    }
-    else
-    {
-        Triangle tri = triangles[objectID-SPHERE_COUNT];
-        normal = normalize(Triangle_getNormalTimesArea(tri));
-        return tri.bsdfLightIDs;
-    }
-}
\ No newline at end of file
diff --git a/42_FragmentShaderPathTracer/main.cpp b/42_FragmentShaderPathTracer/main.cpp
deleted file mode 100644
index f8505b8d1..000000000
--- a/42_FragmentShaderPathTracer/main.cpp
+++ /dev/null
@@ -1,693 +0,0 @@
-// Copyright (C) 2018-2020 - DevSH Graphics Programming Sp. z O.O.
-// This file is part of the "Nabla Engine".
-// For conditions of distribution and use, see copyright notice in nabla.h
-
-#define _NBL_STATIC_LIB_
-#include <nabla.h>
-
-#include "../common/CommonAPI.h"
-#include "CCamera.hpp"
-#include "nbl/ext/ScreenShot/ScreenShot.h"
-#include "nbl/video/utilities/CDumbPresentationOracle.h"
-
-using namespace nbl;
-using namespace core;
-using namespace ui;
-
-
-using namespace nbl;
-using namespace core;
-using namespace asset;
-using namespace video;
-
-smart_refctd_ptr<IGPUImageView> createHDRImageView(nbl::core::smart_refctd_ptr<nbl::video::ILogicalDevice> device, asset::E_FORMAT colorFormat, uint32_t width, uint32_t height)
-{
-	smart_refctd_ptr<IGPUImageView> gpuImageViewColorBuffer;
-	{
-		IGPUImage::SCreationParams imgInfo;
-		imgInfo.format = colorFormat;
-		imgInfo.type = IGPUImage::ET_2D;
-		imgInfo.extent.width = width;
-		imgInfo.extent.height = height;
-		imgInfo.extent.depth = 1u;
-		imgInfo.mipLevels = 1u;
-		imgInfo.arrayLayers = 1u;
-		imgInfo.samples = asset::ICPUImage::ESCF_1_BIT;
-		imgInfo.flags = static_cast<asset::IImage::E_CREATE_FLAGS>(0u);
-		imgInfo.usage = core::bitflag(asset::IImage::EUF_STORAGE_BIT) | asset::IImage::EUF_TRANSFER_SRC_BIT;
-
-		auto image = device->createImage(std::move(imgInfo));
-		auto imageMemReqs = image->getMemoryReqs();
-		imageMemReqs.memoryTypeBits &= device->getPhysicalDevice()->getDeviceLocalMemoryTypeBits();
-		device->allocate(imageMemReqs, image.get());
-
-		IGPUImageView::SCreationParams imgViewInfo;
-		imgViewInfo.image = std::move(image);
-		imgViewInfo.format = colorFormat;
-		imgViewInfo.viewType = IGPUImageView::ET_2D;
-		imgViewInfo.flags = static_cast<IGPUImageView::E_CREATE_FLAGS>(0u);
-		imgViewInfo.subresourceRange.aspectMask = IImage::E_ASPECT_FLAGS::EAF_COLOR_BIT;
-		imgViewInfo.subresourceRange.baseArrayLayer = 0u;
-		imgViewInfo.subresourceRange.baseMipLevel = 0u;
-		imgViewInfo.subresourceRange.layerCount = 1u;
-		imgViewInfo.subresourceRange.levelCount = 1u;
-
-		gpuImageViewColorBuffer = device->createImageView(std::move(imgViewInfo));
-	}
-
-	return gpuImageViewColorBuffer;
-}
-
-struct ShaderParameters
-{
-	const uint32_t MaxDepthLog2 = 4; //5
-	const uint32_t MaxSamplesLog2 = 10; //18
-} kShaderParameters;
-
-enum E_LIGHT_GEOMETRY
-{
-	ELG_SPHERE,
-	ELG_TRIANGLE,
-	ELG_RECTANGLE
-};
-
-struct DispatchInfo_t
-{
-	uint32_t workGroupCount[3];
-};
-
-_NBL_STATIC_INLINE_CONSTEXPR uint32_t DEFAULT_WORK_GROUP_SIZE = 16u;
-
-DispatchInfo_t getDispatchInfo(uint32_t imgWidth, uint32_t imgHeight) {
-	DispatchInfo_t ret = {};
-	ret.workGroupCount[0] = (uint32_t)core::ceil<float>((float)imgWidth / (float)DEFAULT_WORK_GROUP_SIZE);
-	ret.workGroupCount[1] = (uint32_t)core::ceil<float>((float)imgHeight / (float)DEFAULT_WORK_GROUP_SIZE);
-	ret.workGroupCount[2] = 1;
-	return ret;
-}
-
-int main()
-{
-	system::IApplicationFramework::GlobalsInit();
-
-	constexpr uint32_t WIN_W = 1280;
-	constexpr uint32_t WIN_H = 720;
-	constexpr uint32_t FBO_COUNT = 2u;
-	constexpr uint32_t FRAMES_IN_FLIGHT = 5u;
-	constexpr bool LOG_TIMESTAMP = false;
-	static_assert(FRAMES_IN_FLIGHT>FBO_COUNT);
-	
-	const auto swapchainImageUsage = static_cast<asset::IImage::E_USAGE_FLAGS>(asset::IImage::EUF_COLOR_ATTACHMENT_BIT | asset::IImage::EUF_TRANSFER_DST_BIT);
-	CommonAPI::InitParams initParams;
-	initParams.apiType = video::EAT_VULKAN;
-	initParams.appName = { "Compute Shader PathTracer" };
-	initParams.framesInFlight = FRAMES_IN_FLIGHT;
-	initParams.windowWidth = WIN_W;
-	initParams.windowHeight = WIN_H;
-	initParams.swapchainImageCount = FBO_COUNT;
-	initParams.swapchainImageUsage = swapchainImageUsage;
-	initParams.depthFormat = asset::EF_D32_SFLOAT;
-	auto initOutput = CommonAPI::InitWithDefaultExt(std::move(initParams));
-
-	auto system = std::move(initOutput.system);
-	auto window = std::move(initParams.window);
-	auto windowCb = std::move(initParams.windowCb);
-	auto gl = std::move(initOutput.apiConnection);
-	auto surface = std::move(initOutput.surface);
-	auto gpuPhysicalDevice = std::move(initOutput.physicalDevice);
-	auto device = std::move(initOutput.logicalDevice);
-	auto queues = std::move(initOutput.queues);
-	auto graphicsQueue = queues[CommonAPI::InitOutput::EQT_GRAPHICS];
-	auto transferUpQueue = queues[CommonAPI::InitOutput::EQT_TRANSFER_UP];
-	auto computeQueue = queues[CommonAPI::InitOutput::EQT_COMPUTE];
-	auto renderpass = std::move(initOutput.renderToSwapchainRenderpass);
-	auto assetManager = std::move(initOutput.assetManager);
-	auto cpu2gpuParams = std::move(initOutput.cpu2gpuParams);
-	auto logger = std::move(initOutput.logger);
-	auto inputSystem = std::move(initOutput.inputSystem);
-	auto utilities = std::move(initOutput.utilities);
-	auto graphicsCommandPools = std::move(initOutput.commandPools[CommonAPI::InitOutput::EQT_GRAPHICS]);
-	auto computeCommandPools = std::move(initOutput.commandPools[CommonAPI::InitOutput::EQT_COMPUTE]);
-	auto swapchainCreationParams = std::move(initOutput.swapchainCreationParams);
-
-	core::smart_refctd_ptr<video::ISwapchain> swapchain = nullptr;
-	CommonAPI::createSwapchain(std::move(device), swapchainCreationParams, WIN_W, WIN_H, swapchain);
-	assert(swapchain);
-	auto fbo = CommonAPI::createFBOWithSwapchainImages(
-		swapchain->getImageCount(), WIN_W, WIN_H,
-		device, swapchain, renderpass,
-		asset::EF_D32_SFLOAT
-	);
-
-	auto graphicsCmdPoolQueueFamIdx = graphicsQueue->getFamilyIndex();
-
-	nbl::video::IGPUObjectFromAssetConverter CPU2GPU;
-	
-	core::smart_refctd_ptr<nbl::video::IGPUCommandBuffer> cmdbuf[FRAMES_IN_FLIGHT];
-	for (uint32_t i = 0u; i < FRAMES_IN_FLIGHT; i++)
-		device->createCommandBuffers(graphicsCommandPools[i].get(), video::IGPUCommandBuffer::EL_PRIMARY, 1, cmdbuf+i);	
-
-	constexpr uint32_t maxDescriptorCount = 256u;
-	constexpr uint32_t PoolSizesCount = 5u;
-
-	nbl::video::IDescriptorPool::SCreateInfo createInfo;
-	createInfo.maxDescriptorCount[static_cast<uint32_t>(nbl::asset::IDescriptor::E_TYPE::ET_STORAGE_BUFFER)] = maxDescriptorCount * 1;
-	createInfo.maxDescriptorCount[static_cast<uint32_t>(nbl::asset::IDescriptor::E_TYPE::ET_STORAGE_IMAGE)] = maxDescriptorCount * 8;
-	createInfo.maxDescriptorCount[static_cast<uint32_t>(nbl::asset::IDescriptor::E_TYPE::ET_COMBINED_IMAGE_SAMPLER)] = maxDescriptorCount * 2;
-	createInfo.maxDescriptorCount[static_cast<uint32_t>(nbl::asset::IDescriptor::E_TYPE::ET_UNIFORM_TEXEL_BUFFER)] = maxDescriptorCount * 1;
-	createInfo.maxDescriptorCount[static_cast<uint32_t>(nbl::asset::IDescriptor::E_TYPE::ET_UNIFORM_BUFFER)] = maxDescriptorCount * 1;
-	createInfo.maxSets = maxDescriptorCount;
-
-	auto descriptorPool = device->createDescriptorPool(std::move(createInfo));
-
-	const auto timestampQueryPool = device->createQueryPool({
-		.queryType = video::IQueryPool::EQT_TIMESTAMP,
-		.queryCount = 2u
-	});
-
-	// Camera 
-	core::vectorSIMDf cameraPosition(0, 5, -10);
-	matrix4SIMD proj = matrix4SIMD::buildProjectionMatrixPerspectiveFovRH(core::radians(60.0f), video::ISurface::getTransformedAspectRatio(swapchain->getPreTransform(), WIN_W, WIN_H), 0.01f, 500.0f);
-	Camera cam = Camera(cameraPosition, core::vectorSIMDf(0, 0, 0), proj);
-
-	IGPUDescriptorSetLayout::SBinding descriptorSet0Bindings[] = {
-		{ 0u, nbl::asset::IDescriptor::E_TYPE::ET_STORAGE_IMAGE, IGPUDescriptorSetLayout::SBinding::E_CREATE_FLAGS::ECF_NONE, IShader::ESS_COMPUTE, 1u, nullptr },
-	};
-	IGPUDescriptorSetLayout::SBinding uboBinding
-	{ 0u, nbl::asset::IDescriptor::E_TYPE::ET_UNIFORM_BUFFER, IGPUDescriptorSetLayout::SBinding::E_CREATE_FLAGS::ECF_NONE, IShader::ESS_COMPUTE, 1u, nullptr };
-	IGPUDescriptorSetLayout::SBinding descriptorSet3Bindings[] = {
-		{ 0u, nbl::asset::IDescriptor::E_TYPE::ET_COMBINED_IMAGE_SAMPLER, IGPUDescriptorSetLayout::SBinding::E_CREATE_FLAGS::ECF_NONE, IShader::ESS_COMPUTE, 1u, nullptr },
-		{ 1u, nbl::asset::IDescriptor::E_TYPE::ET_UNIFORM_TEXEL_BUFFER, IGPUDescriptorSetLayout::SBinding::E_CREATE_FLAGS::ECF_NONE, IShader::ESS_COMPUTE, 1u, nullptr },
-		{ 2u, nbl::asset::IDescriptor::E_TYPE::ET_COMBINED_IMAGE_SAMPLER, IGPUDescriptorSetLayout::SBinding::E_CREATE_FLAGS::ECF_NONE, IShader::ESS_COMPUTE, 1u, nullptr },
-	};
-	
-	auto gpuDescriptorSetLayout0 = device->createDescriptorSetLayout(descriptorSet0Bindings, descriptorSet0Bindings + 1u);
-	auto gpuDescriptorSetLayout1 = device->createDescriptorSetLayout(&uboBinding, &uboBinding + 1u);
-	auto gpuDescriptorSetLayout2 = device->createDescriptorSetLayout(descriptorSet3Bindings, descriptorSet3Bindings+3u);
-
-	auto createGpuResources = [&](std::string pathToShader) -> core::smart_refctd_ptr<video::IGPUComputePipeline>
-	{
-		asset::IAssetLoader::SAssetLoadParams params{};
-		params.logger = logger.get();
-		//params.relativeDir = tmp.c_str();
-		auto spec = assetManager->getAsset(pathToShader,params).getContents();
-		
-		if (spec.empty())
-			assert(false);
-
-		auto cpuComputeSpecializedShader = core::smart_refctd_ptr_static_cast<asset::ICPUSpecializedShader>(*spec.begin());
-
-		ISpecializedShader::SInfo info = cpuComputeSpecializedShader->getSpecializationInfo();
-		info.m_backingBuffer = ICPUBuffer::create({ sizeof(ShaderParameters) });
-		memcpy(info.m_backingBuffer->getPointer(),&kShaderParameters,sizeof(ShaderParameters));
-		info.m_entries = core::make_refctd_dynamic_array<core::smart_refctd_dynamic_array<ISpecializedShader::SInfo::SMapEntry>>(2u);
-		for (uint32_t i=0; i<2; i++)
-			info.m_entries->operator[](i) = {i,(uint32_t)(i*sizeof(uint32_t)),sizeof(uint32_t)};
-
-
-		cpuComputeSpecializedShader->setSpecializationInfo(std::move(info));
-
-		auto gpuComputeSpecializedShader = CPU2GPU.getGPUObjectsFromAssets(&cpuComputeSpecializedShader, &cpuComputeSpecializedShader + 1, cpu2gpuParams)->front();
-
-		auto gpuPipelineLayout = device->createPipelineLayout(nullptr, nullptr, core::smart_refctd_ptr(gpuDescriptorSetLayout0), core::smart_refctd_ptr(gpuDescriptorSetLayout1), core::smart_refctd_ptr(gpuDescriptorSetLayout2), nullptr);
-
-		auto gpuPipeline = device->createComputePipeline(nullptr, std::move(gpuPipelineLayout), std::move(gpuComputeSpecializedShader));
-
-		return gpuPipeline;
-	};
-
-	E_LIGHT_GEOMETRY lightGeom = ELG_SPHERE;
-	constexpr const char* shaderPaths[] = {"../litBySphere.comp","../litByTriangle.comp","../litByRectangle.comp"};
-	auto gpuComputePipeline = createGpuResources(shaderPaths[lightGeom]);
-
-	DispatchInfo_t dispatchInfo = getDispatchInfo(WIN_W, WIN_H);
-
-	auto createImageView = [&](std::string pathToOpenEXRHDRIImage)
-	{
-#ifndef _NBL_COMPILE_WITH_OPENEXR_LOADER_
-		assert(false);
-#endif
-
-		auto pathToTexture = pathToOpenEXRHDRIImage;
-		IAssetLoader::SAssetLoadParams lp(0ull, nullptr, IAssetLoader::ECF_DONT_CACHE_REFERENCES);
-		auto cpuTexture = assetManager->getAsset(pathToTexture, lp);
-		auto cpuTextureContents = cpuTexture.getContents();
-		assert(!cpuTextureContents.empty());
-		auto cpuImage = core::smart_refctd_ptr_static_cast<asset::ICPUImage>(*cpuTextureContents.begin());
-		cpuImage->setImageUsageFlags(IImage::E_USAGE_FLAGS::EUF_SAMPLED_BIT);
-
-		ICPUImageView::SCreationParams viewParams;
-		viewParams.flags = static_cast<ICPUImageView::E_CREATE_FLAGS>(0u);
-		viewParams.image = cpuImage;
-		viewParams.format = viewParams.image->getCreationParameters().format;
-		viewParams.viewType = IImageView<ICPUImage>::ET_2D;
-		viewParams.subresourceRange.aspectMask = IImage::E_ASPECT_FLAGS::EAF_COLOR_BIT;
-		viewParams.subresourceRange.baseArrayLayer = 0u;
-		viewParams.subresourceRange.layerCount = 1u;
-		viewParams.subresourceRange.baseMipLevel = 0u;
-		viewParams.subresourceRange.levelCount = 1u;
-
-		auto cpuImageView = ICPUImageView::create(std::move(viewParams));
-
-		cpu2gpuParams.beginCommandBuffers();
-		auto gpuImageView = CPU2GPU.getGPUObjectsFromAssets(&cpuImageView, &cpuImageView + 1u, cpu2gpuParams)->front();
-		cpu2gpuParams.waitForCreationToComplete(false);
-
-		return gpuImageView;
-	};
-	
-	auto gpuEnvmapImageView = createImageView("../../media/envmap/envmap_0.exr");
-
-	smart_refctd_ptr<IGPUBufferView> gpuSequenceBufferView;
-	{
-		const uint32_t MaxDimensions = 3u<<kShaderParameters.MaxDepthLog2;
-		const uint32_t MaxSamples = 1u<<kShaderParameters.MaxSamplesLog2;
-
-		auto sampleSequence = core::make_smart_refctd_ptr<asset::({ sizeof(uint32_t)*MaxDimensions*MaxSamples });
-		
-		core::OwenSampler sampler(MaxDimensions, 0xdeadbeefu);
-		//core::SobolSampler sampler(MaxDimensions);
-
-		auto out = reinterpret_cast<uint32_t*>(sampleSequence->getPointer());
-		for (auto dim=0u; dim<MaxDimensions; dim++)
-		for (uint32_t i=0; i<MaxSamples; i++)
-		{
-			out[i*MaxDimensions+dim] = sampler.sample(dim,i);
-		}
-		
-		// TODO: Temp Fix because createFilledDeviceLocalBufferOnDedMem doesn't take in params
-		// auto gpuSequenceBuffer = utilities->createFilledDeviceLocalBufferOnDedMem(graphicsQueue, sampleSequence->getSize(), sampleSequence->getPointer());
-		core::smart_refctd_ptr<IGPUBuffer> gpuSequenceBuffer;
-		{
-			IGPUBuffer::SCreationParams params = {};
-			const size_t size = sampleSequence->getSize();
-			params.usage = core::bitflag(asset::IBuffer::EUF_TRANSFER_DST_BIT) | asset::IBuffer::EUF_UNIFORM_TEXEL_BUFFER_BIT; 
-			params.size = size;
-			gpuSequenceBuffer = device->createBuffer(std::move(params));
-			auto gpuSequenceBufferMemReqs = gpuSequenceBuffer->getMemoryReqs();
-			gpuSequenceBufferMemReqs.memoryTypeBits &= device->getPhysicalDevice()->getDeviceLocalMemoryTypeBits();
-			device->allocate(gpuSequenceBufferMemReqs, gpuSequenceBuffer.get());
-			utilities->updateBufferRangeViaStagingBufferAutoSubmit(asset::SBufferRange<IGPUBuffer>{0u,size,gpuSequenceBuffer},sampleSequence->getPointer(), graphicsQueue);
-		}
-		gpuSequenceBufferView = device->createBufferView(gpuSequenceBuffer.get(), asset::EF_R32G32B32_UINT);
-	}
-
-	smart_refctd_ptr<IGPUImageView> gpuScrambleImageView;
-	{
-		IGPUImage::SCreationParams imgParams;
-		imgParams.flags = static_cast<IImage::E_CREATE_FLAGS>(0u);
-		imgParams.type = IImage::ET_2D;
-		imgParams.format = EF_R32G32_UINT;
-		imgParams.extent = {WIN_W, WIN_H,1u};
-		imgParams.mipLevels = 1u;
-		imgParams.arrayLayers = 1u;
-		imgParams.samples = IImage::ESCF_1_BIT;
-		imgParams.usage = core::bitflag(IImage::EUF_SAMPLED_BIT) | IImage::EUF_TRANSFER_DST_BIT;
-		imgParams.initialLayout = asset::IImage::EL_UNDEFINED;
-
-		IGPUImage::SBufferCopy region = {};
-		region.bufferOffset = 0u;
-		region.bufferRowLength = 0u;
-		region.bufferImageHeight = 0u;
-		region.imageExtent = imgParams.extent;
-		region.imageOffset = {0u,0u,0u};
-		region.imageSubresource.layerCount = 1u;
-		region.imageSubresource.aspectMask = IImage::E_ASPECT_FLAGS::EAF_COLOR_BIT;
-
-		constexpr auto ScrambleStateChannels = 2u;
-		const auto renderPixelCount = imgParams.extent.width*imgParams.extent.height;
-		core::vector<uint32_t> random(renderPixelCount*ScrambleStateChannels);
-		{
-			core::RandomSampler rng(0xbadc0ffeu);
-			for (auto& pixel : random)
-				pixel = rng.nextSample();
-		}
-
-		// TODO: Temp Fix because createFilledDeviceLocalBufferOnDedMem doesn't take in params
-		// auto buffer = utilities->createFilledDeviceLocalBufferOnDedMem(graphicsQueue, random.size()*sizeof(uint32_t), random.data());
-		core::smart_refctd_ptr<IGPUBuffer> buffer;
-		{
-			IGPUBuffer::SCreationParams params = {};
-			const size_t size = random.size() * sizeof(uint32_t);
-			params.usage = core::bitflag(asset::IBuffer::EUF_TRANSFER_DST_BIT) | asset::IBuffer::EUF_TRANSFER_SRC_BIT; 
-			params.size = size;
-			buffer = device->createBuffer(std::move(params));
-			auto bufferMemReqs = buffer->getMemoryReqs();
-			bufferMemReqs.memoryTypeBits &= device->getPhysicalDevice()->getDeviceLocalMemoryTypeBits();
-			device->allocate(bufferMemReqs, buffer.get());
-			utilities->updateBufferRangeViaStagingBufferAutoSubmit(asset::SBufferRange<IGPUBuffer>{0u,size,buffer},random.data(),graphicsQueue);
-		}
-
-		IGPUImageView::SCreationParams viewParams;
-		viewParams.flags = static_cast<IGPUImageView::E_CREATE_FLAGS>(0u);
-		// TODO: Replace this IGPUBuffer -> IGPUImage to using image upload utility
-		viewParams.image = utilities->createFilledDeviceLocalImageOnDedMem(std::move(imgParams), buffer.get(), 1u, &region, graphicsQueue);
-		viewParams.viewType = IGPUImageView::ET_2D;
-		viewParams.format = EF_R32G32_UINT;
-		viewParams.subresourceRange.aspectMask = IImage::E_ASPECT_FLAGS::EAF_COLOR_BIT;
-		viewParams.subresourceRange.levelCount = 1u;
-		viewParams.subresourceRange.layerCount = 1u;
-		gpuScrambleImageView = device->createImageView(std::move(viewParams));
-	}
-	
-	// Create Out Image TODO
-	constexpr uint32_t MAX_FBO_COUNT = 4u;
-	smart_refctd_ptr<IGPUImageView> outHDRImageViews[MAX_FBO_COUNT] = {};
-	assert(MAX_FBO_COUNT >= swapchain->getImageCount());
-	for(uint32_t i = 0; i < swapchain->getImageCount(); ++i) {
-		outHDRImageViews[i] = createHDRImageView(device, asset::EF_R16G16B16A16_SFLOAT, WIN_W, WIN_H);
-	}
-
-	core::smart_refctd_ptr<IGPUDescriptorSet> descriptorSets0[FBO_COUNT] = {};
-	for(uint32_t i = 0; i < FBO_COUNT; ++i)
-	{
-		auto & descSet = descriptorSets0[i];
-		descSet = descriptorPool->createDescriptorSet(core::smart_refctd_ptr(gpuDescriptorSetLayout0));
-		video::IGPUDescriptorSet::SWriteDescriptorSet writeDescriptorSet;
-		writeDescriptorSet.dstSet = descSet.get();
-		writeDescriptorSet.binding = 0;
-		writeDescriptorSet.count = 1u;
-		writeDescriptorSet.arrayElement = 0u;
-		writeDescriptorSet.descriptorType = asset::IDescriptor::E_TYPE::ET_STORAGE_IMAGE;
-		video::IGPUDescriptorSet::SDescriptorInfo info;
-		{
-			info.desc = outHDRImageViews[i];
-			info.info.image.sampler = nullptr;
-			info.info.image.imageLayout = asset::IImage::EL_GENERAL;
-		}
-		writeDescriptorSet.info = &info;
-		device->updateDescriptorSets(1u, &writeDescriptorSet, 0u, nullptr);
-	}
-	
-	struct SBasicViewParametersAligned
-	{
-		SBasicViewParameters uboData;
-	};
-
-	IGPUBuffer::SCreationParams gpuuboParams = {};
-	gpuuboParams.usage = core::bitflag(IGPUBuffer::EUF_UNIFORM_BUFFER_BIT) | IGPUBuffer::EUF_TRANSFER_DST_BIT;
-	gpuuboParams.size = sizeof(SBasicViewParametersAligned);
-	auto gpuubo = device->createBuffer(std::move(gpuuboParams));
-	auto gpuuboMemReqs = gpuubo->getMemoryReqs();
-	gpuuboMemReqs.memoryTypeBits &= device->getPhysicalDevice()->getDeviceLocalMemoryTypeBits();
-	device->allocate(gpuuboMemReqs, gpuubo.get());
-
-	auto uboDescriptorSet1 = descriptorPool->createDescriptorSet(core::smart_refctd_ptr(gpuDescriptorSetLayout1));
-	{
-		video::IGPUDescriptorSet::SWriteDescriptorSet uboWriteDescriptorSet;
-		uboWriteDescriptorSet.dstSet = uboDescriptorSet1.get();
-		uboWriteDescriptorSet.binding = 0;
-		uboWriteDescriptorSet.count = 1u;
-		uboWriteDescriptorSet.arrayElement = 0u;
-		uboWriteDescriptorSet.descriptorType = asset::IDescriptor::E_TYPE::ET_UNIFORM_BUFFER;
-		video::IGPUDescriptorSet::SDescriptorInfo info;
-		{
-			info.desc = gpuubo;
-			info.info.buffer.offset = 0ull;
-			info.info.buffer.size = sizeof(SBasicViewParametersAligned);
-		}
-		uboWriteDescriptorSet.info = &info;
-		device->updateDescriptorSets(1u, &uboWriteDescriptorSet, 0u, nullptr);
-	}
-
-	ISampler::SParams samplerParams0 = { ISampler::ETC_CLAMP_TO_EDGE, ISampler::ETC_CLAMP_TO_EDGE, ISampler::ETC_CLAMP_TO_EDGE, ISampler::ETBC_FLOAT_OPAQUE_BLACK, ISampler::ETF_LINEAR, ISampler::ETF_LINEAR, ISampler::ESMM_LINEAR, 0u, false, ECO_ALWAYS };
-	auto sampler0 = device->createSampler(samplerParams0);
-	ISampler::SParams samplerParams1 = { ISampler::ETC_CLAMP_TO_EDGE, ISampler::ETC_CLAMP_TO_EDGE, ISampler::ETC_CLAMP_TO_EDGE, ISampler::ETBC_INT_OPAQUE_BLACK, ISampler::ETF_NEAREST, ISampler::ETF_NEAREST, ISampler::ESMM_NEAREST, 0u, false, ECO_ALWAYS };
-	auto sampler1 = device->createSampler(samplerParams1);
-	
-	auto descriptorSet2 = descriptorPool->createDescriptorSet(core::smart_refctd_ptr(gpuDescriptorSetLayout2));
-	{
-		constexpr auto kDescriptorCount = 3;
-		IGPUDescriptorSet::SWriteDescriptorSet samplerWriteDescriptorSet[kDescriptorCount];
-		IGPUDescriptorSet::SDescriptorInfo samplerDescriptorInfo[kDescriptorCount];
-		for (auto i=0; i<kDescriptorCount; i++)
-		{
-			samplerWriteDescriptorSet[i].dstSet = descriptorSet2.get();
-			samplerWriteDescriptorSet[i].binding = i;
-			samplerWriteDescriptorSet[i].arrayElement = 0u;
-			samplerWriteDescriptorSet[i].count = 1u;
-			samplerWriteDescriptorSet[i].info = samplerDescriptorInfo+i;
-		}
-		samplerWriteDescriptorSet[0].descriptorType = nbl::asset::IDescriptor::E_TYPE::ET_COMBINED_IMAGE_SAMPLER;
-		samplerWriteDescriptorSet[1].descriptorType = nbl::asset::IDescriptor::E_TYPE::ET_UNIFORM_TEXEL_BUFFER;
-		samplerWriteDescriptorSet[2].descriptorType = nbl::asset::IDescriptor::E_TYPE::ET_COMBINED_IMAGE_SAMPLER;
-
-		samplerDescriptorInfo[0].desc = gpuEnvmapImageView;
-		{
-			// ISampler::SParams samplerParams = { ISampler::ETC_CLAMP_TO_EDGE, ISampler::ETC_CLAMP_TO_EDGE, ISampler::ETC_CLAMP_TO_EDGE, ISampler::ETBC_FLOAT_OPAQUE_BLACK, ISampler::ETF_LINEAR, ISampler::ETF_LINEAR, ISampler::ESMM_LINEAR, 0u, false, ECO_ALWAYS };
-			samplerDescriptorInfo[0].info.image.sampler = sampler0;
-			samplerDescriptorInfo[0].info.image.imageLayout = asset::IImage::EL_SHADER_READ_ONLY_OPTIMAL;
-		}
-		samplerDescriptorInfo[1].desc = gpuSequenceBufferView;
-		samplerDescriptorInfo[2].desc = gpuScrambleImageView;
-		{
-			// ISampler::SParams samplerParams = { ISampler::ETC_CLAMP_TO_EDGE, ISampler::ETC_CLAMP_TO_EDGE, ISampler::ETC_CLAMP_TO_EDGE, ISampler::ETBC_INT_OPAQUE_BLACK, ISampler::ETF_NEAREST, ISampler::ETF_NEAREST, ISampler::ESMM_NEAREST, 0u, false, ECO_ALWAYS };
-			samplerDescriptorInfo[2].info.image.sampler = sampler1;
-			samplerDescriptorInfo[2].info.image.imageLayout = asset::IImage::EL_SHADER_READ_ONLY_OPTIMAL;
-		}
-
-		device->updateDescriptorSets(kDescriptorCount, samplerWriteDescriptorSet, 0u, nullptr);
-	}
-
-	constexpr uint32_t FRAME_COUNT = 500000u;
-
-	core::smart_refctd_ptr<video::IGPUFence> frameComplete[FRAMES_IN_FLIGHT] = { nullptr };
-	core::smart_refctd_ptr<video::IGPUSemaphore> imageAcquire[FRAMES_IN_FLIGHT] = { nullptr };
-	core::smart_refctd_ptr<video::IGPUSemaphore> renderFinished[FRAMES_IN_FLIGHT] = { nullptr };
-	for (uint32_t i=0u; i<FRAMES_IN_FLIGHT; i++)
-	{
-		imageAcquire[i] = device->createSemaphore();
-		renderFinished[i] = device->createSemaphore();
-	}
-	
-	CDumbPresentationOracle oracle;
-	oracle.reportBeginFrameRecord();
-	constexpr uint64_t MAX_TIMEOUT = 99999999999999ull;
-	
-	// polling for events!
-	CommonAPI::InputSystem::ChannelReader<IMouseEventChannel> mouse;
-	CommonAPI::InputSystem::ChannelReader<IKeyboardEventChannel> keyboard;
-	
-	uint32_t resourceIx = 0;
-	while(windowCb->isWindowOpen())
-	{
-		resourceIx++;
-		if(resourceIx >= FRAMES_IN_FLIGHT) {
-			resourceIx = 0;
-		}
-		
-		oracle.reportEndFrameRecord();
-		double dt = oracle.getDeltaTimeInMicroSeconds() / 1000.0;
-		auto nextPresentationTimeStamp = oracle.getNextPresentationTimeStamp();
-		oracle.reportBeginFrameRecord();
-
-		// Input 
-		inputSystem->getDefaultMouse(&mouse);
-		inputSystem->getDefaultKeyboard(&keyboard);
-
-		cam.beginInputProcessing(nextPresentationTimeStamp);
-		mouse.consumeEvents([&](const IMouseEventChannel::range_t& events) -> void { cam.mouseProcess(events); }, logger.get());
-		keyboard.consumeEvents([&](const IKeyboardEventChannel::range_t& events) -> void { cam.keyboardProcess(events); }, logger.get());
-		cam.endInputProcessing(nextPresentationTimeStamp);
-		
-		auto& cb = cmdbuf[resourceIx];
-		auto& fence = frameComplete[resourceIx];
-		if (fence)
-		while (device->waitForFences(1u,&fence.get(),false,MAX_TIMEOUT)==video::IGPUFence::ES_TIMEOUT)
-		{
-		}
-		else
-			fence = device->createFence(static_cast<video::IGPUFence::E_CREATE_FLAGS>(0));
-		
-		const auto viewMatrix = cam.getViewMatrix();
-		const auto viewProjectionMatrix = matrix4SIMD::concatenateBFollowedByAPrecisely(
-			video::ISurface::getSurfaceTransformationMatrix(swapchain->getPreTransform()),
-			cam.getConcatenatedMatrix()
-		);
-				
-		// safe to proceed
-		cb->begin(IGPUCommandBuffer::EU_NONE);
-		cb->resetQueryPool(timestampQueryPool.get(), 0u, 2u);
-
-		// renderpass 
-		uint32_t imgnum = 0u;
-		swapchain->acquireNextImage(MAX_TIMEOUT,imageAcquire[resourceIx].get(),nullptr,&imgnum);
-		{
-			auto mv = viewMatrix;
-			auto mvp = viewProjectionMatrix;
-			core::matrix3x4SIMD normalMat;
-			mv.getSub3x3InverseTranspose(normalMat);
-
-			SBasicViewParametersAligned viewParams;
-			memcpy(viewParams.uboData.MV, mv.pointer(), sizeof(mv));
-			memcpy(viewParams.uboData.MVP, mvp.pointer(), sizeof(mvp));
-			memcpy(viewParams.uboData.NormalMat, normalMat.pointer(), sizeof(normalMat));
-			
-			asset::SBufferRange<video::IGPUBuffer> range;
-			range.buffer = gpuubo;
-			range.offset = 0ull;
-			range.size = sizeof(viewParams);
-			utilities->updateBufferRangeViaStagingBufferAutoSubmit(range, &viewParams, graphicsQueue);
-		}
-				
-		// TRANSITION outHDRImageViews[imgnum] to EIL_GENERAL (because of descriptorSets0 -> ComputeShader Writes into the image)
-		{
-			IGPUCommandBuffer::SImageMemoryBarrier imageBarriers[3u] = {};
-			imageBarriers[0].barrier.srcAccessMask = asset::EAF_NONE;
-			imageBarriers[0].barrier.dstAccessMask = static_cast<asset::E_ACCESS_FLAGS>(asset::EAF_SHADER_WRITE_BIT);
-			imageBarriers[0].oldLayout = asset::IImage::EL_UNDEFINED;
-			imageBarriers[0].newLayout = asset::IImage::EL_GENERAL;
-			imageBarriers[0].srcQueueFamilyIndex = graphicsCmdPoolQueueFamIdx;
-			imageBarriers[0].dstQueueFamilyIndex = graphicsCmdPoolQueueFamIdx;
-			imageBarriers[0].image = outHDRImageViews[imgnum]->getCreationParameters().image;
-			imageBarriers[0].subresourceRange.aspectMask = asset::IImage::EAF_COLOR_BIT;
-			imageBarriers[0].subresourceRange.baseMipLevel = 0u;
-			imageBarriers[0].subresourceRange.levelCount = 1;
-			imageBarriers[0].subresourceRange.baseArrayLayer = 0u;
-			imageBarriers[0].subresourceRange.layerCount = 1;
-
-			imageBarriers[1].barrier.srcAccessMask = asset::EAF_NONE;
-			imageBarriers[1].barrier.dstAccessMask = static_cast<asset::E_ACCESS_FLAGS>(asset::EAF_SHADER_READ_BIT);
-			imageBarriers[1].oldLayout = asset::IImage::EL_UNDEFINED;
-			imageBarriers[1].newLayout = asset::IImage::EL_SHADER_READ_ONLY_OPTIMAL;
-			imageBarriers[1].srcQueueFamilyIndex = graphicsCmdPoolQueueFamIdx;
-			imageBarriers[1].dstQueueFamilyIndex = graphicsCmdPoolQueueFamIdx;
-			imageBarriers[1].image = gpuScrambleImageView->getCreationParameters().image;
-			imageBarriers[1].subresourceRange.aspectMask = asset::IImage::EAF_COLOR_BIT;
-			imageBarriers[1].subresourceRange.baseMipLevel = 0u;
-			imageBarriers[1].subresourceRange.levelCount = 1;
-			imageBarriers[1].subresourceRange.baseArrayLayer = 0u;
-			imageBarriers[1].subresourceRange.layerCount = 1;
-
-			 imageBarriers[2].barrier.srcAccessMask = asset::EAF_NONE;
-			 imageBarriers[2].barrier.dstAccessMask = static_cast<asset::E_ACCESS_FLAGS>(asset::EAF_SHADER_READ_BIT);
-			 imageBarriers[2].oldLayout = asset::IImage::EL_UNDEFINED;
-			 imageBarriers[2].newLayout = asset::IImage::EL_SHADER_READ_ONLY_OPTIMAL;
-			 imageBarriers[2].srcQueueFamilyIndex = graphicsCmdPoolQueueFamIdx;
-			 imageBarriers[2].dstQueueFamilyIndex = graphicsCmdPoolQueueFamIdx;
-			 imageBarriers[2].image = gpuEnvmapImageView->getCreationParameters().image;
-			 imageBarriers[2].subresourceRange.aspectMask = asset::IImage::EAF_COLOR_BIT;
-			 imageBarriers[2].subresourceRange.baseMipLevel = 0u;
-			 imageBarriers[2].subresourceRange.levelCount = gpuEnvmapImageView->getCreationParameters().subresourceRange.levelCount;
-			 imageBarriers[2].subresourceRange.baseArrayLayer = 0u;
-			 imageBarriers[2].subresourceRange.layerCount = gpuEnvmapImageView->getCreationParameters().subresourceRange.layerCount;
-
-			cb->pipelineBarrier(asset::EPSF_TOP_OF_PIPE_BIT, asset::EPSF_COMPUTE_SHADER_BIT, asset::EDF_NONE, 0u, nullptr, 0u, nullptr, 3u, imageBarriers);
-		}
-
-		// cube envmap handle
-		{
-			cb->writeTimestamp(asset::E_PIPELINE_STAGE_FLAGS::EPSF_TOP_OF_PIPE_BIT, timestampQueryPool.get(), 0u);
-			cb->bindComputePipeline(gpuComputePipeline.get());
-			cb->bindDescriptorSets(EPBP_COMPUTE, gpuComputePipeline->getLayout(), 0u, 1u, &descriptorSets0[imgnum].get());
-			cb->bindDescriptorSets(EPBP_COMPUTE, gpuComputePipeline->getLayout(), 1u, 1u, &uboDescriptorSet1.get());
-			cb->bindDescriptorSets(EPBP_COMPUTE, gpuComputePipeline->getLayout(), 2u, 1u, &descriptorSet2.get());
-			cb->dispatch(dispatchInfo.workGroupCount[0], dispatchInfo.workGroupCount[1], dispatchInfo.workGroupCount[2]);
-			cb->writeTimestamp(asset::E_PIPELINE_STAGE_FLAGS::EPSF_BOTTOM_OF_PIPE_BIT, timestampQueryPool.get(), 1u);
-		}
-		// TODO: tone mapping and stuff
-
-		// Copy HDR Image to SwapChain
-		auto srcImgViewCreationParams = outHDRImageViews[imgnum]->getCreationParameters();
-		auto dstImgViewCreationParams = fbo->begin()[imgnum]->getCreationParameters().attachments[0]->getCreationParameters();
-		
-		// Getting Ready for Blit
-		// TRANSITION outHDRImageViews[imgnum] to EIL_TRANSFER_SRC_OPTIMAL
-		// TRANSITION `fbo[imgnum]->getCreationParameters().attachments[0]` to EIL_TRANSFER_DST_OPTIMAL
-		{
-			IGPUCommandBuffer::SImageMemoryBarrier imageBarriers[2u] = {};
-			imageBarriers[0].barrier.srcAccessMask = asset::EAF_NONE;
-			imageBarriers[0].barrier.dstAccessMask = asset::EAF_TRANSFER_WRITE_BIT;
-			imageBarriers[0].oldLayout = asset::IImage::EL_UNDEFINED;
-			imageBarriers[0].newLayout = asset::IImage::EL_TRANSFER_SRC_OPTIMAL;
-			imageBarriers[0].srcQueueFamilyIndex = graphicsCmdPoolQueueFamIdx;
-			imageBarriers[0].dstQueueFamilyIndex = graphicsCmdPoolQueueFamIdx;
-			imageBarriers[0].image = srcImgViewCreationParams.image;
-			imageBarriers[0].subresourceRange.aspectMask = asset::IImage::EAF_COLOR_BIT;
-			imageBarriers[0].subresourceRange.baseMipLevel = 0u;
-			imageBarriers[0].subresourceRange.levelCount = 1;
-			imageBarriers[0].subresourceRange.baseArrayLayer = 0u;
-			imageBarriers[0].subresourceRange.layerCount = 1;
-
-			imageBarriers[1].barrier.srcAccessMask = asset::EAF_NONE;
-			imageBarriers[1].barrier.dstAccessMask = asset::EAF_TRANSFER_WRITE_BIT;
-			imageBarriers[1].oldLayout = asset::IImage::EL_UNDEFINED;
-			imageBarriers[1].newLayout = asset::IImage::EL_TRANSFER_DST_OPTIMAL;
-			imageBarriers[1].srcQueueFamilyIndex = graphicsCmdPoolQueueFamIdx;
-			imageBarriers[1].dstQueueFamilyIndex = graphicsCmdPoolQueueFamIdx;
-			imageBarriers[1].image = dstImgViewCreationParams.image;
-			imageBarriers[1].subresourceRange.aspectMask = asset::IImage::EAF_COLOR_BIT;
-			imageBarriers[1].subresourceRange.baseMipLevel = 0u;
-			imageBarriers[1].subresourceRange.levelCount = 1;
-			imageBarriers[1].subresourceRange.baseArrayLayer = 0u;
-			imageBarriers[1].subresourceRange.layerCount = 1;
-			cb->pipelineBarrier(asset::EPSF_TRANSFER_BIT, asset::EPSF_TRANSFER_BIT, asset::EDF_NONE, 0u, nullptr, 0u, nullptr, 2u, imageBarriers);
-		}
-
-		// Blit Image
-		{
-			SImageBlit blit = {};
-			blit.srcOffsets[0] = {0, 0, 0};
-			blit.srcOffsets[1] = {WIN_W, WIN_H, 1};
-		
-			blit.srcSubresource.aspectMask = srcImgViewCreationParams.subresourceRange.aspectMask;
-			blit.srcSubresource.mipLevel = srcImgViewCreationParams.subresourceRange.baseMipLevel;
-			blit.srcSubresource.baseArrayLayer = srcImgViewCreationParams.subresourceRange.baseArrayLayer;
-			blit.srcSubresource.layerCount = srcImgViewCreationParams.subresourceRange.layerCount;
-			blit.dstOffsets[0] = {0, 0, 0};
-			blit.dstOffsets[1] = {WIN_W, WIN_H, 1};
-			blit.dstSubresource.aspectMask = dstImgViewCreationParams.subresourceRange.aspectMask;
-			blit.dstSubresource.mipLevel = dstImgViewCreationParams.subresourceRange.baseMipLevel;
-			blit.dstSubresource.baseArrayLayer = dstImgViewCreationParams.subresourceRange.baseArrayLayer;
-			blit.dstSubresource.layerCount = dstImgViewCreationParams.subresourceRange.layerCount;
-
-			auto srcImg = srcImgViewCreationParams.image;
-			auto dstImg = dstImgViewCreationParams.image;
-
-			cb->blitImage(srcImg.get(), asset::IImage::EL_TRANSFER_SRC_OPTIMAL, dstImg.get(), asset::IImage::EL_TRANSFER_DST_OPTIMAL, 1u, &blit , ISampler::ETF_NEAREST);
-		}
-		
-		// TRANSITION `fbo[imgnum]->getCreationParameters().attachments[0]` to EIL_PRESENT
-		{
-			IGPUCommandBuffer::SImageMemoryBarrier imageBarriers[1u] = {};
-			imageBarriers[0].barrier.srcAccessMask = asset::EAF_TRANSFER_WRITE_BIT;
-			imageBarriers[0].barrier.dstAccessMask = asset::EAF_NONE;
-			imageBarriers[0].oldLayout = asset::IImage::EL_TRANSFER_DST_OPTIMAL;
-			imageBarriers[0].newLayout = asset::IImage::EL_PRESENT_SRC;
-			imageBarriers[0].srcQueueFamilyIndex = graphicsCmdPoolQueueFamIdx;
-			imageBarriers[0].dstQueueFamilyIndex = graphicsCmdPoolQueueFamIdx;
-			imageBarriers[0].image = dstImgViewCreationParams.image;
-			imageBarriers[0].subresourceRange.aspectMask = asset::IImage::EAF_COLOR_BIT;
-			imageBarriers[0].subresourceRange.baseMipLevel = 0u;
-			imageBarriers[0].subresourceRange.levelCount = 1;
-			imageBarriers[0].subresourceRange.baseArrayLayer = 0u;
-			imageBarriers[0].subresourceRange.layerCount = 1;
-			cb->pipelineBarrier(asset::EPSF_TRANSFER_BIT, asset::EPSF_TOP_OF_PIPE_BIT, asset::EDF_NONE, 0u, nullptr, 0u, nullptr, 1u, imageBarriers);
-		}
-
-		cb->end();
-		device->resetFences(1, &fence.get());
-		CommonAPI::Submit(device.get(), cb.get(), graphicsQueue, imageAcquire[resourceIx].get(), renderFinished[resourceIx].get(), fence.get());
-		CommonAPI::Present(device.get(), swapchain.get(), graphicsQueue, renderFinished[resourceIx].get(), imgnum);
-		
-		if (LOG_TIMESTAMP)
-		{
-			std::array<uint64_t, 4> timestamps{};
-			auto queryResultFlags = core::bitflag<video::IQueryPool::E_QUERY_RESULTS_FLAGS>(video::IQueryPool::EQRF_WAIT_BIT) | video::IQueryPool::EQRF_WITH_AVAILABILITY_BIT | video::IQueryPool::EQRF_64_BIT;
-			device->getQueryPoolResults(timestampQueryPool.get(), 0u, 2u, sizeof(timestamps), timestamps.data(), sizeof(uint64_t) * 2ull, queryResultFlags);
-			const float timePassed = (timestamps[2] - timestamps[0]) * device->getPhysicalDevice()->getLimits().timestampPeriodInNanoSeconds;
-			logger->log("Time Passed (Seconds) = %f", system::ILogger::ELL_INFO, (timePassed * 1e-9));
-			logger->log("Timestamps availablity: %d, %d", system::ILogger::ELL_INFO, timestamps[1], timestamps[3]);
-		}
-	}
-	
-	const auto& fboCreationParams = fbo->begin()[0]->getCreationParameters();
-	auto gpuSourceImageView = fboCreationParams.attachments[0];
-
-	device->waitIdle();
-
-	// bool status = ext::ScreenShot::createScreenShot(device.get(), queues[decltype(initOutput)::EQT_TRANSFER_UP], renderFinished[0].get(), gpuSourceImageView.get(), assetManager.get(), "ScreenShot.png");
-	// assert(status);
-
-	return 0;
-}
diff --git a/42_FragmentShaderPathTracer/pipeline.groovy b/42_FragmentShaderPathTracer/pipeline.groovy
deleted file mode 100644
index 9e3a71cf3..000000000
--- a/42_FragmentShaderPathTracer/pipeline.groovy
+++ /dev/null
@@ -1,50 +0,0 @@
-import org.DevshGraphicsProgramming.Agent
-import org.DevshGraphicsProgramming.BuilderInfo
-import org.DevshGraphicsProgramming.IBuilder
-
-class CFragmentShaderPathTracerBuilder extends IBuilder
-{
-	public CFragmentShaderPathTracerBuilder(Agent _agent, _info)
-	{
-		super(_agent, _info)
-	}
-	
-	@Override
-	public boolean prepare(Map axisMapping)
-	{
-		return true
-	}
-	
-	@Override
-  	public boolean build(Map axisMapping)
-	{
-		IBuilder.CONFIGURATION config = axisMapping.get("CONFIGURATION")
-		IBuilder.BUILD_TYPE buildType = axisMapping.get("BUILD_TYPE")
-		
-		def nameOfBuildDirectory = getNameOfBuildDirectory(buildType)
-		def nameOfConfig = getNameOfConfig(config)
-		
-		agent.execute("cmake --build ${info.rootProjectPath}/${nameOfBuildDirectory}/${info.targetProjectPathRelativeToRoot} --target ${info.targetBaseName} --config ${nameOfConfig} -j12 -v")
-		
-		return true
-	}
-	
-	@Override
-  	public boolean test(Map axisMapping)
-	{
-		return true
-	}
-	
-	@Override
-	public boolean install(Map axisMapping)
-	{
-		return true
-	}
-}
-
-def create(Agent _agent, _info)
-{
-	return new CFragmentShaderPathTracerBuilder(_agent, _info)
-}
-
-return this
\ No newline at end of file
diff --git a/53_ComputeShaders/CMakeLists.txt b/53_ComputeShaders/CMakeLists.txt
deleted file mode 100644
index 2f9218f93..000000000
--- a/53_ComputeShaders/CMakeLists.txt
+++ /dev/null
@@ -1,6 +0,0 @@
-include(common RESULT_VARIABLE RES)
-if(NOT RES)
-	message(FATAL_ERROR "common.cmake not found. Should be in {repo_root}/cmake directory")
-endif()
-
-nbl_create_executable_project("" "" "" "" "${NBL_EXECUTABLE_PROJECT_CREATION_PCH_TARGET}")
\ No newline at end of file
diff --git a/53_ComputeShaders/computeShader.comp b/53_ComputeShaders/computeShader.comp
deleted file mode 100644
index 033a6aabb..000000000
--- a/53_ComputeShaders/computeShader.comp
+++ /dev/null
@@ -1,95 +0,0 @@
-#version 450 core
-#extension GL_EXT_shader_16bit_storage : require
-
-#include "shaderCommon.glsl"
-
-layout(set = 0, binding = 0, std430) buffer Position
-{
-	vec4 positions[];
-};
-
-layout(set = 0, binding = 1, std430) buffer Velocity
-{
-	vec4 velocities[];
-};
-
-layout(set = 0, binding = 2, std430) buffer Color
-{
-	vec4 colors[];
-};
-
-layout(set = 0, binding = 3, std430) buffer ColorRisingFlag
-{
-	bvec4 colorsRisingFlag[];
-};
-
-layout(local_size_x = 128, local_size_y = 1, local_size_z = 1) in;
-
-void manageColorAxieState(float colorAxie, inout bool colorIntensityRisingAxieFlag)
-{
-	if(colorAxie <= 0)
-		colorIntensityRisingAxieFlag = true;
-	else if(colorAxie >= 1)
-		colorIntensityRisingAxieFlag = false;
-}
-
-void manageColorState(vec3 color)
-{
-	uint globalInvocationID = gl_GlobalInvocationID.x; // the .y and .z are both 1 in this case
-	bvec4 isColorIntensityRising = colorsRisingFlag[globalInvocationID];
-
-	manageColorAxieState(color.x, isColorIntensityRising.x);
-	manageColorAxieState(color.y, isColorIntensityRising.y);
-	manageColorAxieState(color.z, isColorIntensityRising.z);
-
-	colorsRisingFlag[globalInvocationID] = isColorIntensityRising;
-}
-
-float getNewAxieColor(float colorAxie, bool colorIntensityRisingAxieFlag)
-{
-	const float colorDelta = 0.04; 
-
-	if(colorIntensityRisingAxieFlag)
-		colorAxie += colorDelta;
-	else
-		colorAxie -= colorDelta;
-
-	return colorAxie;
-}
-
-vec3 getNewColor(vec3 color)
-{
-	uint globalInvocationID = gl_GlobalInvocationID.x; // the .y and .z are both 1 in this case
-	bvec4 isColorIntensityRising = colorsRisingFlag[globalInvocationID];
-
-	return vec3(getNewAxieColor(color.x, isColorIntensityRising.x), getNewAxieColor(color.y, isColorIntensityRising.y), getNewAxieColor(color.z, isColorIntensityRising.z));
-}
-
-void main()
-{
-	const float deltaTime = 0.004;
-	
-	uint globalInvocationID = gl_GlobalInvocationID.x; // the .y and .z are both 1 in this case
-
-	vec3 position = positions[globalInvocationID].xyz;
-	vec3 velocity = velocities[globalInvocationID].xyz;
-	vec3 color = colors[globalInvocationID].xyz;
-
-	if(!pushConstants.isXPressed)
-	{
-		/*
-		if(pushConstants.isZPressed)
-		{
-			// TODO gravity to force a particle's velocity towards the user
-		}
-		*/
-		position += velocity * deltaTime;
-	}
-		
-	vec3 newComputedColor = getNewColor(color);
-	manageColorState(newComputedColor);
-
-	positions[globalInvocationID].xyz = position;
-	velocities[globalInvocationID].xyz = velocity;
-	colors[globalInvocationID].xyz = newComputedColor;
-}
\ No newline at end of file
diff --git a/53_ComputeShaders/config.json.template b/53_ComputeShaders/config.json.template
deleted file mode 100644
index f961745c1..000000000
--- a/53_ComputeShaders/config.json.template
+++ /dev/null
@@ -1,28 +0,0 @@
-{
-  "enableParallelBuild": true,
-  "threadsPerBuildProcess" : 2,
-  "isExecuted": false,
-  "scriptPath": "",
-  "cmake": {
-    "configurations": [ "Release", "Debug", "RelWithDebInfo" ],
-    "buildModes": [],
-    "requiredOptions": []
-  }, 
-  "profiles": [
-    {
-      "backend": "vulkan",
-      "platform": "windows",
-      "buildModes": [],
-      "runConfiguration": "Release",
-      "gpuArchitectures": []
-    }
-  ],
-  "dependencies": [],
-  "data": [
-    {
-      "dependencies": [],
-      "command": [""],
-      "outputs": []
-    }
-  ]
-}
\ No newline at end of file
diff --git a/53_ComputeShaders/fragmentShader.frag b/53_ComputeShaders/fragmentShader.frag
deleted file mode 100644
index 9fe445b2b..000000000
--- a/53_ComputeShaders/fragmentShader.frag
+++ /dev/null
@@ -1,12 +0,0 @@
-#version 430 core
-
-layout(location = 0) in vec4 inFFullyProjectedVelocity;
-layout(location = 1) in vec4 inFColor;
-
-layout(location = 0) out vec4 outColor;
-
-void main()
-{
-    outColor = inFColor;
-}
-		
\ No newline at end of file
diff --git a/53_ComputeShaders/geometryShader.geom b/53_ComputeShaders/geometryShader.geom
deleted file mode 100644
index 4a8bf36f0..000000000
--- a/53_ComputeShaders/geometryShader.geom
+++ /dev/null
@@ -1,27 +0,0 @@
-#version 450 core
-
-#include "shaderCommon.glsl"
-
-layout(location = 0) in vec4 gFullyProjectedVelocity[];
-layout(location = 1) in vec4 gColor[];
-
-layout(location = 0) out vec4 outFVelocity;
-layout(location = 1) out vec4 outFColor;
-
-layout (points) in;
-layout (line_strip, max_vertices = 2) out;
-
-void main()
-{
-	if(pushConstants.isCPressed)
-	{
-		outFColor = vec4(0.0, 1.0, 0.0, 0.0);
-		gl_Position = gl_in[0].gl_Position;
-		EmitVertex();
-		gl_Position = gl_in[0].gl_Position + gFullyProjectedVelocity[0];
-		EmitVertex();
-
-		EndPrimitive();
-	}
-}
-		
\ No newline at end of file
diff --git a/53_ComputeShaders/main.cpp b/53_ComputeShaders/main.cpp
deleted file mode 100644
index b8fb14017..000000000
--- a/53_ComputeShaders/main.cpp
+++ /dev/null
@@ -1,694 +0,0 @@
-// Copyright (C) 2018-2020 - DevSH Graphics Programming Sp. z O.O.
-// This file is part of the "Nabla Engine".
-// For conditions of distribution and use, see copyright notice in nabla.h
-
-#define _NBL_STATIC_LIB_
-#include <iostream>
-#include <cstdio>
-#include <nabla.h>
-
-#include "CCamera.hpp"
-#include "../common/CommonAPI.h"
-#include "nbl/ext/ScreenShot/ScreenShot.h"
-
-using namespace nbl;
-using namespace asset;
-using namespace core;
-
-/*
-	Uncomment for more detailed logging
-*/
-
-// #define NBL_MORE_LOGS
-
-class CEventReceiver
-{
-public:
-	CEventReceiver() : particlesVectorChangeFlag(false), forceChangeVelocityFlag(false), visualizeVelocityVectorsFlag(false) {}
-
-	void process(const ui::IKeyboardEventChannel::range_t& events)
-	{
-		particlesVectorChangeFlag = false;
-		forceChangeVelocityFlag = false;
-		visualizeVelocityVectorsFlag = false;
-
-		for (auto eventIterator = events.begin(); eventIterator != events.end(); eventIterator++)
-		{
-			auto event = *eventIterator;
-
-			if (event.keyCode == nbl::ui::EKC_X)
-				particlesVectorChangeFlag = true;
-
-			if (event.keyCode == nbl::ui::EKC_Z)
-				forceChangeVelocityFlag = true;
-
-			if (event.keyCode == nbl::ui::EKC_C)
-				visualizeVelocityVectorsFlag = true;
-
-			if (event.keyCode == nbl::ui::EKC_V)
-				visualizeVelocityVectorsFlag = false;
-		}
-	}
-
-	inline bool isXPressed() const { return particlesVectorChangeFlag; }
-	inline bool isZPressed() const { return forceChangeVelocityFlag; }
-	inline bool isCPressed() const { return visualizeVelocityVectorsFlag; }
-
-private:
-	bool particlesVectorChangeFlag;
-	bool forceChangeVelocityFlag;
-	bool visualizeVelocityVectorsFlag;
-};
-
-_NBL_STATIC_INLINE_CONSTEXPR size_t NUMBER_OF_PARTICLES = 1024 * 1024;		// total number of particles to move
-_NBL_STATIC_INLINE_CONSTEXPR size_t WORK_GROUP_SIZE = 128;					// work-items per work-group
-
-enum E_ENTRIES
-{
-	EE_POSITIONS,
-	EE_VELOCITIES,
-	EE_COLORS,
-	EE_COLORS_RISING_FLAG,
-	EE_COUNT
-};
-
-#include "nbl/nblpack.h"
-struct alignas(16) SShaderStorageBufferObject
-{
-	core::vector4df_SIMD positions[NUMBER_OF_PARTICLES];
-	core::vector4df_SIMD velocities[NUMBER_OF_PARTICLES];
-	core::vector4df_SIMD colors[NUMBER_OF_PARTICLES];
-	bool isColorIntensityRising[NUMBER_OF_PARTICLES][4];
-} PACK_STRUCT;
-#include "nbl/nblunpack.h"
-
-static_assert(sizeof(SShaderStorageBufferObject) == sizeof(SShaderStorageBufferObject::positions) + sizeof(SShaderStorageBufferObject::velocities) + sizeof(SShaderStorageBufferObject::colors) + sizeof(SShaderStorageBufferObject::isColorIntensityRising), "There will be inproper alignment!");
-
-#include "nbl/nblpack.h"
-struct alignas(32) SPushConstants
-{
-	uint32_t isXPressed = false;
-	uint32_t isZPressed = false;
-	uint32_t isCPressed = false;
-	core::vector3df currentUserAbsolutePosition;
-} PACK_STRUCT;
-#include "nbl/nblunpack.h"
-
-void triggerRandomSetup(SShaderStorageBufferObject* ssbo)
-{
-	_NBL_STATIC_INLINE_CONSTEXPR float POSITION_EACH_AXIE_MIN = -10.f;
-	_NBL_STATIC_INLINE_CONSTEXPR float POSITION_EACH_AXIE_MAX = 10.f;
-
-	_NBL_STATIC_INLINE_CONSTEXPR float VELOCITY_EACH_AXIE_MIN = 0.f;
-	_NBL_STATIC_INLINE_CONSTEXPR float VELOCITY_EACH_AXIE_MAX = 0.001f;
-
-	_NBL_STATIC_INLINE_CONSTEXPR float COLOR_EACH_AXIE_MIN = 0.f;
-	_NBL_STATIC_INLINE_CONSTEXPR float COLOR_EACH_AXIE_MAX = 1.f;
-
-	auto get_random = [&](const float& min, const float& max)
-	{
-		static std::default_random_engine engine;
-		static std::uniform_real_distribution<> distribution(min, max);
-		return distribution(engine);
-	};
-
-	for (size_t i = 0; i < NUMBER_OF_PARTICLES; ++i)
-	{
-		ssbo->positions[i] = core::vector4df_SIMD(get_random(POSITION_EACH_AXIE_MIN, POSITION_EACH_AXIE_MAX), get_random(POSITION_EACH_AXIE_MIN, POSITION_EACH_AXIE_MAX), get_random(POSITION_EACH_AXIE_MIN, POSITION_EACH_AXIE_MAX), get_random(POSITION_EACH_AXIE_MIN, POSITION_EACH_AXIE_MAX));
-		ssbo->velocities[i] = core::vector4df_SIMD(get_random(VELOCITY_EACH_AXIE_MIN, VELOCITY_EACH_AXIE_MAX), get_random(VELOCITY_EACH_AXIE_MIN, VELOCITY_EACH_AXIE_MAX), get_random(VELOCITY_EACH_AXIE_MIN, VELOCITY_EACH_AXIE_MAX), get_random(VELOCITY_EACH_AXIE_MIN, VELOCITY_EACH_AXIE_MAX));
-		ssbo->colors[i] = core::vector4df_SIMD(get_random(COLOR_EACH_AXIE_MIN, COLOR_EACH_AXIE_MAX), get_random(COLOR_EACH_AXIE_MIN, COLOR_EACH_AXIE_MAX), get_random(COLOR_EACH_AXIE_MIN, COLOR_EACH_AXIE_MAX), get_random(COLOR_EACH_AXIE_MIN, COLOR_EACH_AXIE_MAX));
-
-		for (uint8_t b = 0; b < 4; ++b)
-			ssbo->isColorIntensityRising[i][b] = true;
-	}
-}
-
-class MeshLoadersApp : public ApplicationBase
-{
-	static constexpr uint32_t WIN_W = 1280;
-	static constexpr uint32_t WIN_H = 720;
-	static constexpr uint32_t FBO_COUNT = 2u;
-	static constexpr uint32_t FRAMES_IN_FLIGHT = 1u;
-	static constexpr size_t NBL_FRAMES_TO_AVERAGE = 100ull;
-
-public:
-	nbl::core::smart_refctd_ptr<nbl::ui::IWindowManager> windowManager;
-	nbl::core::smart_refctd_ptr<nbl::ui::IWindow> window;
-	nbl::core::smart_refctd_ptr<CommonAPI::CommonAPIEventCallback> windowCallback;
-	nbl::core::smart_refctd_ptr<nbl::video::IAPIConnection> gl;
-	nbl::core::smart_refctd_ptr<nbl::video::ISurface> surface;
-	nbl::core::smart_refctd_ptr<nbl::video::IUtilities> utilities;
-	nbl::core::smart_refctd_ptr<nbl::video::ILogicalDevice> logicalDevice;
-	nbl::video::IPhysicalDevice* gpuPhysicalDevice;
-	std::array<nbl::video::IGPUQueue*, CommonAPI::InitOutput::MaxQueuesCount> queues = { nullptr, nullptr, nullptr, nullptr };
-	nbl::core::smart_refctd_ptr<nbl::video::ISwapchain> swapchain;
-	nbl::core::smart_refctd_ptr<nbl::video::IGPURenderpass> renderpass;
-	nbl::core::smart_refctd_dynamic_array<nbl::core::smart_refctd_ptr<nbl::video::IGPUFramebuffer>> fbo;
-	std::array<std::array<nbl::core::smart_refctd_ptr<nbl::video::IGPUCommandPool>, CommonAPI::InitOutput::MaxFramesInFlight>, CommonAPI::InitOutput::MaxQueuesCount> commandPools;
-	nbl::core::smart_refctd_ptr<nbl::system::ISystem> system;
-	nbl::core::smart_refctd_ptr<nbl::asset::IAssetManager> assetManager;
-	nbl::video::IGPUObjectFromAssetConverter::SParams cpu2gpuParams;
-	nbl::core::smart_refctd_ptr<nbl::system::ILogger> logger;
-	nbl::core::smart_refctd_ptr<CommonAPI::InputSystem> inputSystem;
-
-	nbl::core::smart_refctd_ptr<video::IGPUFence> gpuTransferFence;
-	nbl::core::smart_refctd_ptr<video::IGPUFence> gpuComputeFence;
-	nbl::video::IGPUObjectFromAssetConverter cpu2gpu;
-
-	core::smart_refctd_ptr<nbl::video::IGPUCommandBuffer> commandBuffers[1];
-
-	CEventReceiver eventReceiver;
-	CommonAPI::InputSystem::ChannelReader<ui::IMouseEventChannel> mouse;
-	CommonAPI::InputSystem::ChannelReader<ui::IKeyboardEventChannel> keyboard;
-
-	Camera camera = Camera(core::vectorSIMDf(0, 0, 0), core::vectorSIMDf(0, 0, 0), core::matrix4SIMD());
-	std::chrono::system_clock::time_point lastTime;
-	size_t frame_count = 0ull;
-	double time_sum = 0;
-	double dtList[NBL_FRAMES_TO_AVERAGE] = {};
-
-	SPushConstants pushConstants;
-	nbl::core::smart_refctd_ptr<video::IGPUComputePipeline> gpuComputePipeline;
-	nbl::core::smart_refctd_ptr<video::IGPUDescriptorSet> gpuCDescriptorSet;
-	nbl::core::smart_refctd_ptr<video::IGPUBuffer> gpuUBO;
-	nbl::core::smart_refctd_ptr<video::IGPUGraphicsPipeline> gpuGraphicsPipeline;
-	nbl::core::smart_refctd_ptr<video::IGPUGraphicsPipeline> gpuGraphicsPipeline2;
-	nbl::core::smart_refctd_ptr<video::IGPUMeshBuffer> gpuMeshBuffer;
-	nbl::core::smart_refctd_ptr<video::IGPUMeshBuffer> gpuMeshBuffer2;
-	core::smart_refctd_ptr<video::IGPUDescriptorSet> gpuGDescriptorSet1;
-	nbl::core::smart_refctd_ptr<nbl::video::IGPUSemaphore> render_finished_sem;
-	nbl::video::ISwapchain::SCreationParams m_swapchainCreationParams;
-
-	void setWindow(core::smart_refctd_ptr<nbl::ui::IWindow>&& wnd) override
-	{
-		window = std::move(wnd);
-	}
-	void setSystem(core::smart_refctd_ptr<nbl::system::ISystem>&& s) override
-	{
-		system = std::move(s);
-	}
-	nbl::ui::IWindow* getWindow() override
-	{
-		return window.get();
-	}
-	video::IAPIConnection* getAPIConnection() override
-	{
-		return gl.get();
-	}
-	video::ILogicalDevice* getLogicalDevice()  override
-	{
-		return logicalDevice.get();
-	}
-	video::IGPURenderpass* getRenderpass() override
-	{
-		return renderpass.get();
-	}
-	void setSurface(core::smart_refctd_ptr<video::ISurface>&& s) override
-	{
-		surface = std::move(s);
-	}
-	void setFBOs(std::vector<core::smart_refctd_ptr<video::IGPUFramebuffer>>& f) override
-	{
-		for (int i = 0; i < f.size(); i++)
-		{
-			fbo->begin()[i] = core::smart_refctd_ptr(f[i]);
-		}
-	}
-	void setSwapchain(core::smart_refctd_ptr<video::ISwapchain>&& s) override
-	{
-		swapchain = std::move(s);
-	}
-	uint32_t getSwapchainImageCount() override
-	{
-		return swapchain->getImageCount();
-	}
-	virtual nbl::asset::E_FORMAT getDepthFormat() override
-	{
-		return nbl::asset::EF_D32_SFLOAT;
-	}
-
-APP_CONSTRUCTOR(MeshLoadersApp)
-
-	void onAppInitialized_impl() override
-	{
-	const auto swapchainImageUsage = static_cast<asset::IImage::E_USAGE_FLAGS>(asset::IImage::EUF_COLOR_ATTACHMENT_BIT);
-		CommonAPI::InitParams initParams;
-		initParams.window = core::smart_refctd_ptr(window);
-		initParams.apiType = video::EAT_VULKAN;
-		initParams.appName = { _NBL_APP_NAME_ };
-		initParams.framesInFlight = FRAMES_IN_FLIGHT;
-		initParams.windowWidth = WIN_W;
-		initParams.windowHeight = WIN_H;
-		initParams.swapchainImageCount = FBO_COUNT;
-		initParams.swapchainImageUsage = swapchainImageUsage;
-		initParams.depthFormat = nbl::asset::EF_D32_SFLOAT;
-		auto initOutput = CommonAPI::InitWithDefaultExt(std::move(initParams));
-
-		window = std::move(initParams.window);
-		gl = std::move(initOutput.apiConnection);
-		surface = std::move(initOutput.surface);
-		gpuPhysicalDevice = std::move(initOutput.physicalDevice);
-		logicalDevice = std::move(initOutput.logicalDevice);
-		queues = std::move(initOutput.queues);
-		renderpass = std::move(initOutput.renderToSwapchainRenderpass);
-		commandPools = std::move(initOutput.commandPools);
-		assetManager = std::move(initOutput.assetManager);
-		logger = std::move(initOutput.logger);
-		inputSystem = std::move(initOutput.inputSystem);
-		windowCallback = std::move(initParams.windowCb);
-		cpu2gpuParams = std::move(initOutput.cpu2gpuParams);
-		m_swapchainCreationParams = std::move(initOutput.swapchainCreationParams);
-		auto defaultGraphicsCommandPool = commandPools[CommonAPI::InitOutput::EQT_GRAPHICS][0];
-
-		CommonAPI::createSwapchain(std::move(logicalDevice), m_swapchainCreationParams, WIN_W, WIN_H, swapchain);
-		assert(swapchain);
-		fbo = CommonAPI::createFBOWithSwapchainImages(
-			swapchain->getImageCount(), WIN_W, WIN_H,
-			logicalDevice, swapchain, renderpass,
-			nbl::asset::EF_D32_SFLOAT
-		);
-
-		logicalDevice->createCommandBuffers(defaultGraphicsCommandPool.get(), nbl::video::IGPUCommandBuffer::EL_PRIMARY, 1, commandBuffers);
-		auto commandBuffer = commandBuffers[0];
-
-		auto createDescriptorPool = [&](const uint32_t itemCount, E_DESCRIPTOR_TYPE descriptorType)
-		{
-			constexpr uint32_t maxItemCount = 256u;
-			{
-				nbl::video::IDescriptorPool::SDescriptorPoolSize poolSize;
-				poolSize.count = itemCount;
-				poolSize.type = descriptorType;
-				return logicalDevice->createDescriptorPool(static_cast<nbl::video::IDescriptorPool::E_CREATE_FLAGS>(0), maxItemCount, 1u, &poolSize);
-			}
-		};
-
-		/*
-			Compute pipeline
-		*/
-
-		auto computeShaderBundle = assetManager->getAsset("../computeShader.comp", {});
-		{
-			bool status = !computeShaderBundle.getContents().empty();
-			assert(status);
-		}
-
-		auto cpuComputeShader = core::smart_refctd_ptr_static_cast<asset::ICPUSpecializedShader>(computeShaderBundle.getContents().begin()[0]);
-		smart_refctd_ptr<video::IGPUSpecializedShader> gpuComputeShader;
-		{
-			auto gpu_array = cpu2gpu.getGPUObjectsFromAssets(&cpuComputeShader, &cpuComputeShader + 1, cpu2gpuParams);
-			if (!gpu_array || gpu_array->size() < 1u || !(*gpu_array)[0])
-				assert(false);
-
-			gpuComputeShader = (*gpu_array)[0];
-		}
-
-		auto cpuSSBOBuffer = ICPUBuffer::create({ sizeof(SShaderStorageBufferObject) });
-		cpuSSBOBuffer->addUsageFlags(asset::IBuffer::EUF_STORAGE_BUFFER_BIT);
-		triggerRandomSetup(reinterpret_cast<SShaderStorageBufferObject*>(cpuSSBOBuffer->getPointer()));
-		core::smart_refctd_ptr<video::IGPUBuffer> gpuSSBOBuffer;
-		{
-			cpu2gpuParams.beginCommandBuffers();
-
-			auto gpu_array = cpu2gpu.getGPUObjectsFromAssets(&cpuSSBOBuffer, &cpuSSBOBuffer + 1, cpu2gpuParams);
-			if (!gpu_array || gpu_array->size() < 1u || !(*gpu_array)[0])
-				assert(false);
-			
-			cpu2gpuParams.waitForCreationToComplete(false);
-
-			auto gpuSSBOOffsetBufferPair = (*gpu_array)[0];
-			gpuSSBOBuffer = core::smart_refctd_ptr<video::IGPUBuffer>(gpuSSBOOffsetBufferPair->getBuffer());
-		}
-
-		video::IGPUDescriptorSetLayout::SBinding gpuBindingsLayout[EE_COUNT] =
-		{
-			{EE_POSITIONS, EDT_STORAGE_BUFFER, 1u, video::IGPUShader::ESS_COMPUTE, nullptr},
-			{EE_VELOCITIES, EDT_STORAGE_BUFFER, 1u, video::IGPUShader::ESS_COMPUTE, nullptr},
-			{EE_COLORS, EDT_STORAGE_BUFFER, 1u, video::IGPUShader::ESS_COMPUTE, nullptr},
-			{EE_COLORS_RISING_FLAG, EDT_STORAGE_BUFFER, 1u, video::IGPUShader::ESS_COMPUTE, nullptr}
-		};
-
-		auto gpuCDescriptorPool = createDescriptorPool(EE_COUNT, EDT_STORAGE_BUFFER);
-		auto gpuCDescriptorSetLayout = logicalDevice->createDescriptorSetLayout(gpuBindingsLayout, gpuBindingsLayout + EE_COUNT);
-		gpuCDescriptorSet = logicalDevice->createDescriptorSet(gpuCDescriptorPool.get(), core::smart_refctd_ptr(gpuCDescriptorSetLayout));
-		{
-			video::IGPUDescriptorSet::SDescriptorInfo gpuDescriptorSetInfos[EE_COUNT];
-
-			gpuDescriptorSetInfos[EE_POSITIONS].desc = gpuSSBOBuffer;
-			gpuDescriptorSetInfos[EE_POSITIONS].buffer.size = sizeof(SShaderStorageBufferObject::positions);
-			gpuDescriptorSetInfos[EE_POSITIONS].buffer.offset = 0;
-
-			gpuDescriptorSetInfos[EE_VELOCITIES].desc = gpuSSBOBuffer;
-			gpuDescriptorSetInfos[EE_VELOCITIES].buffer.size = sizeof(SShaderStorageBufferObject::velocities);
-			gpuDescriptorSetInfos[EE_VELOCITIES].buffer.offset = sizeof(SShaderStorageBufferObject::positions);
-
-			gpuDescriptorSetInfos[EE_COLORS].desc = gpuSSBOBuffer;
-			gpuDescriptorSetInfos[EE_COLORS].buffer.size = sizeof(SShaderStorageBufferObject::colors);
-			gpuDescriptorSetInfos[EE_COLORS].buffer.offset = gpuDescriptorSetInfos[EE_VELOCITIES].buffer.offset + sizeof(SShaderStorageBufferObject::velocities);
-
-			gpuDescriptorSetInfos[EE_COLORS_RISING_FLAG].desc = gpuSSBOBuffer;
-			gpuDescriptorSetInfos[EE_COLORS_RISING_FLAG].buffer.size = sizeof(SShaderStorageBufferObject::isColorIntensityRising);
-			gpuDescriptorSetInfos[EE_COLORS_RISING_FLAG].buffer.offset = gpuDescriptorSetInfos[EE_COLORS].buffer.offset + sizeof(SShaderStorageBufferObject::colors);
-
-			video::IGPUDescriptorSet::SWriteDescriptorSet gpuWrites[EE_COUNT];
-			{
-				for (uint32_t binding = 0u; binding < EE_COUNT; binding++)
-					gpuWrites[binding] = { gpuCDescriptorSet.get(), binding, 0u, 1u, EDT_STORAGE_BUFFER, gpuDescriptorSetInfos + binding };
-				logicalDevice->updateDescriptorSets(EE_COUNT, gpuWrites, 0u, nullptr);
-			}
-		}
-
-		asset::SPushConstantRange pushConstantRange;
-		{
-			pushConstantRange.stageFlags = (asset::IShader::E_SHADER_STAGE)(asset::IShader::ESS_COMPUTE | asset::IShader::ESS_GEOMETRY);
-			pushConstantRange.offset = 0;
-			pushConstantRange.size = sizeof(SPushConstants);
-		}
-
-		auto gpuCPipelineLayout = logicalDevice->createPipelineLayout(&pushConstantRange, &pushConstantRange + 1, std::move(gpuCDescriptorSetLayout), nullptr, nullptr, nullptr);
-		gpuComputePipeline = logicalDevice->createComputePipeline(nullptr, std::move(gpuCPipelineLayout), std::move(gpuComputeShader));
-
-		/*
-			Graphics Pipeline
-		*/
-
-		asset::SVertexInputParams inputVertexParams;
-		inputVertexParams.enabledAttribFlags = core::createBitmask({ EE_POSITIONS, EE_VELOCITIES, EE_COLORS, EE_COLORS_RISING_FLAG });
-		inputVertexParams.enabledBindingFlags = core::createBitmask({ EE_POSITIONS, EE_VELOCITIES, EE_COLORS, EE_COLORS_RISING_FLAG });
-
-		for (uint8_t i = 0; i < EE_COUNT; ++i)
-		{
-			inputVertexParams.bindings[i].stride = (i == EE_COLORS_RISING_FLAG ? getTexelOrBlockBytesize(EF_R8G8B8A8_UINT) : getTexelOrBlockBytesize(EF_R32G32B32A32_SFLOAT));
-			inputVertexParams.bindings[i].inputRate = asset::EVIR_PER_VERTEX;
-
-			inputVertexParams.attributes[i].binding = i;
-			inputVertexParams.attributes[i].format = (i == EE_COLORS_RISING_FLAG ? EF_R8G8B8A8_UINT : asset::EF_R32G32B32A32_SFLOAT);
-			inputVertexParams.attributes[i].relativeOffset = 0;
-		}
-
-		asset::SBlendParams blendParams;
-		asset::SPrimitiveAssemblyParams primitiveAssemblyParams;
-		primitiveAssemblyParams.primitiveType = EPT_POINT_LIST;
-		asset::SRasterizationParams rasterizationParams;
-
-		video::IGPUDescriptorSetLayout::SBinding gpuUboBinding = {};
-		gpuUboBinding.count = 1u;
-		gpuUboBinding.binding = 0;
-		gpuUboBinding.stageFlags = static_cast<asset::ICPUShader::E_SHADER_STAGE>(asset::ICPUShader::ESS_VERTEX | asset::ICPUShader::ESS_FRAGMENT);
-		gpuUboBinding.type = asset::EDT_UNIFORM_BUFFER;
-
-		auto gpuGDescriptorPool = createDescriptorPool(1, EDT_UNIFORM_BUFFER);
-		auto gpuGDs1Layout = logicalDevice->createDescriptorSetLayout(&gpuUboBinding, &gpuUboBinding + 1);
-
-		video::IGPUBuffer::SCreationParams gpuUBOCreationParams;
-		//gpuUBOCreationParams.size = sizeof(SBasicViewParameters);
-		gpuUBOCreationParams.usage = asset::IBuffer::E_USAGE_FLAGS(asset::IBuffer::EUF_UNIFORM_BUFFER_BIT | asset::IBuffer::EUF_INLINE_UPDATE_VIA_CMDBUF);
-		gpuUBOCreationParams.queueFamilyIndexCount = 0u;
-		gpuUBOCreationParams.queueFamilyIndices = nullptr;
-		gpuUBOCreationParams.size = sizeof(SBasicViewParameters);
-
-		gpuUBO = logicalDevice->createBuffer(std::move(gpuUBOCreationParams));
-		auto gpuUBOmemreqs = gpuUBO->getMemoryReqs();
-		gpuUBOmemreqs.memoryTypeBits &= gpuPhysicalDevice->getDeviceLocalMemoryTypeBits();
-		logicalDevice->allocate(gpuUBOmemreqs, gpuUBO.get());
-
-		gpuGDescriptorSet1 = logicalDevice->createDescriptorSet(gpuGDescriptorPool.get(), gpuGDs1Layout);
-		{
-			video::IGPUDescriptorSet::SWriteDescriptorSet write;
-			write.dstSet = gpuGDescriptorSet1.get();
-			write.binding = 0;
-			write.count = 1u;
-			write.arrayElement = 0u;
-			write.descriptorType = asset::EDT_UNIFORM_BUFFER;
-			video::IGPUDescriptorSet::SDescriptorInfo info;
-			{
-				info.desc = gpuUBO;
-				info.buffer.offset = 0ull;
-				info.buffer.size = sizeof(SBasicViewParameters);
-			}
-			write.info = &info;
-			logicalDevice->updateDescriptorSets(1u, &write, 0u, nullptr);
-		}
-
-		auto vertexShaderBundle = assetManager->getAsset("../vertexShader.vert", {});
-		{
-			bool status = !vertexShaderBundle.getContents().empty();
-			assert(status);
-		}
-
-		auto cpuVertexShader = core::smart_refctd_ptr_static_cast<ICPUSpecializedShader>(vertexShaderBundle.getContents().begin()[0]);
-		smart_refctd_ptr<video::IGPUSpecializedShader> gpuVertexShader;
-		{
-			auto gpu_array = cpu2gpu.getGPUObjectsFromAssets(&cpuVertexShader, &cpuVertexShader + 1, cpu2gpuParams);
-			if (!gpu_array || gpu_array->size() < 1u || !(*gpu_array)[0])
-				assert(false);
-
-			gpuVertexShader = (*gpu_array)[0];
-		}
-
-		auto fragmentShaderBundle = assetManager->getAsset("../fragmentShader.frag", {});
-		{
-			bool status = !fragmentShaderBundle.getContents().empty();
-			assert(status);
-		}
-
-		auto cpuFragmentShader = core::smart_refctd_ptr_static_cast<ICPUSpecializedShader>(fragmentShaderBundle.getContents().begin()[0]);
-		smart_refctd_ptr<video::IGPUSpecializedShader> gpuFragmentShader;
-		{
-			auto gpu_array = cpu2gpu.getGPUObjectsFromAssets(&cpuFragmentShader, &cpuFragmentShader + 1, cpu2gpuParams);
-			if (!gpu_array || gpu_array->size() < 1u || !(*gpu_array)[0])
-				assert(false);
-
-			gpuFragmentShader = (*gpu_array)[0];
-		}
-
-		auto geometryShaderBundle = assetManager->getAsset("../geometryShader.geom", {});
-		{
-			bool status = !geometryShaderBundle.getContents().empty();
-			assert(status);
-		}
-
-		auto cpuGeometryShader = core::smart_refctd_ptr_static_cast<ICPUSpecializedShader>(geometryShaderBundle.getContents().begin()[0]);
-		smart_refctd_ptr<video::IGPUSpecializedShader> gpuGeometryShader;
-		{
-			auto gpu_array = cpu2gpu.getGPUObjectsFromAssets(&cpuGeometryShader, &cpuGeometryShader + 1, cpu2gpuParams);
-			if (!gpu_array || gpu_array->size() < 1u || !(*gpu_array)[0])
-				assert(false);
-
-			gpuGeometryShader = (*gpu_array)[0];
-		}
-
-		core::smart_refctd_ptr<video::IGPUSpecializedShader> gpuGShaders[] = { gpuVertexShader, gpuFragmentShader, gpuGeometryShader };
-		auto gpuGShadersPointer = reinterpret_cast<video::IGPUSpecializedShader**>(gpuGShaders);
-
-		auto gpuGPipelineLayout = logicalDevice->createPipelineLayout(&pushConstantRange, &pushConstantRange + 1, nullptr, std::move(gpuGDs1Layout), nullptr, nullptr);
-		auto gpuRenderpassIndependentPipeline = logicalDevice->createRenderpassIndependentPipeline(nullptr, core::smart_refctd_ptr(gpuGPipelineLayout), gpuGShadersPointer, gpuGShadersPointer + 2 /* discard geometry shader*/, inputVertexParams, blendParams, primitiveAssemblyParams, rasterizationParams);
-		auto gpuRenderpassIndependentPipeline2 = logicalDevice->createRenderpassIndependentPipeline(nullptr, core::smart_refctd_ptr(gpuGPipelineLayout), gpuGShadersPointer, gpuGShadersPointer + 3, inputVertexParams, blendParams, primitiveAssemblyParams, rasterizationParams);
-
-		asset::SBufferBinding<video::IGPUBuffer> gpuGbindings[video::IGPUMeshBuffer::MAX_ATTR_BUF_BINDING_COUNT];
-
-		gpuGbindings[EE_POSITIONS].buffer = gpuSSBOBuffer;
-		gpuGbindings[EE_POSITIONS].offset = 0;
-
-		gpuGbindings[EE_VELOCITIES].buffer = gpuSSBOBuffer;
-		gpuGbindings[EE_VELOCITIES].offset = sizeof(SShaderStorageBufferObject::positions);
-
-		gpuGbindings[EE_COLORS].buffer = gpuSSBOBuffer;
-		gpuGbindings[EE_COLORS].offset = gpuGbindings[EE_VELOCITIES].offset + sizeof(SShaderStorageBufferObject::velocities);
-
-		gpuGbindings[EE_COLORS_RISING_FLAG].buffer = gpuSSBOBuffer;
-		gpuGbindings[EE_COLORS_RISING_FLAG].offset = gpuGbindings[EE_COLORS].offset + sizeof(SShaderStorageBufferObject::colors);
-
-		gpuMeshBuffer = core::make_smart_refctd_ptr<video::IGPUMeshBuffer>(std::move(gpuRenderpassIndependentPipeline), nullptr, gpuGbindings, asset::SBufferBinding<video::IGPUBuffer>());
-		{
-			gpuMeshBuffer->setIndexType(asset::EIT_UNKNOWN);
-			gpuMeshBuffer->setIndexCount(NUMBER_OF_PARTICLES);
-		}
-
-		{
-			nbl::video::IGPUGraphicsPipeline::SCreationParams graphicsPipelineParams;
-			graphicsPipelineParams.renderpassIndependent = core::smart_refctd_ptr<nbl::video::IGPURenderpassIndependentPipeline>(const_cast<video::IGPURenderpassIndependentPipeline*>(gpuMeshBuffer->getPipeline()));
-			graphicsPipelineParams.renderpass = core::smart_refctd_ptr(renderpass);
-			gpuGraphicsPipeline = logicalDevice->createGraphicsPipeline(nullptr, std::move(graphicsPipelineParams));
-		}
-
-		gpuMeshBuffer2 = core::make_smart_refctd_ptr<video::IGPUMeshBuffer>(std::move(gpuRenderpassIndependentPipeline2), nullptr, gpuGbindings, asset::SBufferBinding<video::IGPUBuffer>());
-		{
-			gpuMeshBuffer2->setIndexType(asset::EIT_UNKNOWN);
-			gpuMeshBuffer2->setIndexCount(NUMBER_OF_PARTICLES);
-		}
-
-		{
-			nbl::video::IGPUGraphicsPipeline::SCreationParams graphicsPipelineParams;
-			graphicsPipelineParams.renderpassIndependent = core::smart_refctd_ptr<nbl::video::IGPURenderpassIndependentPipeline>(const_cast<video::IGPURenderpassIndependentPipeline*>(gpuMeshBuffer2->getPipeline()));
-			graphicsPipelineParams.renderpass = core::smart_refctd_ptr(renderpass);
-			gpuGraphicsPipeline2 = logicalDevice->createGraphicsPipeline(nullptr, std::move(graphicsPipelineParams));
-		}
-
-		const std::string captionData = "[Nabla Engine] Compute Shaders";
-		window->setCaption(captionData);
-
-		core::vectorSIMDf cameraPosition(0, 0, 0);
-		matrix4SIMD projectionMatrix = matrix4SIMD::buildProjectionMatrixPerspectiveFovLH(core::radians(60.0f), video::ISurface::getTransformedAspectRatio(swapchain->getPreTransform(), WIN_W, WIN_H), 0.001, 1000);
-		camera = Camera(cameraPosition, core::vectorSIMDf(0, 0, -1), projectionMatrix, 10.f, 1.f);
-		lastTime = std::chrono::system_clock::now();
-		for (size_t i = 0ull; i < NBL_FRAMES_TO_AVERAGE; ++i)
-			dtList[i] = 0.0;
-	}
-
-	void onAppTerminated_impl() override
-	{
-		const auto& fboCreationParams = fbo->begin()[0]->getCreationParameters();
-		auto gpuSourceImageView = fboCreationParams.attachments[0];
-
-		bool status = ext::ScreenShot::createScreenShot(logicalDevice.get(),
-			queues[CommonAPI::InitOutput::EQT_TRANSFER_UP],
-			render_finished_sem.get(),
-			gpuSourceImageView.get(),
-			assetManager.get(),
-			"ScreenShot.png",
-			asset::IImage::EL_PRESENT_SRC,
-			asset::EAF_NONE);
-
-		assert(status);
-	}
-
-	void workLoopBody() override
-	{
-		auto renderStart = std::chrono::system_clock::now();
-		const auto renderDt = std::chrono::duration_cast<std::chrono::milliseconds>(renderStart - lastTime).count();
-		lastTime = renderStart;
-		{ // Calculate Simple Moving Average for FrameTime
-			time_sum -= dtList[frame_count];
-			time_sum += renderDt;
-			dtList[frame_count] = renderDt;
-			frame_count++;
-			if (frame_count >= NBL_FRAMES_TO_AVERAGE)
-				frame_count = 0;
-		}
-		const double averageFrameTime = time_sum / (double)NBL_FRAMES_TO_AVERAGE;
-
-#ifdef NBL_MORE_LOGS
-		logger->log("renderDt = %f ------ averageFrameTime = %f", system::ILogger::ELL_INFO, renderDt, averageFrameTime);
-#endif // NBL_MORE_LOGS
-
-		auto averageFrameTimeDuration = std::chrono::duration<double, std::milli>(averageFrameTime);
-		auto nextPresentationTime = renderStart + averageFrameTimeDuration;
-		auto nextPresentationTimeStamp = std::chrono::duration_cast<std::chrono::microseconds>(nextPresentationTime.time_since_epoch());
-
-		inputSystem->getDefaultMouse(&mouse);
-		inputSystem->getDefaultKeyboard(&keyboard);
-
-		camera.beginInputProcessing(nextPresentationTimeStamp);
-		mouse.consumeEvents([&](const ui::IMouseEventChannel::range_t& events) -> void { camera.mouseProcess(events); }, logger.get());
-		keyboard.consumeEvents([&](const ui::IKeyboardEventChannel::range_t& events) -> void { camera.keyboardProcess(events); eventReceiver.process(events); }, logger.get());
-		camera.endInputProcessing(nextPresentationTimeStamp);
-
-		const auto& viewMatrix = camera.getViewMatrix();
-		const auto& viewProjectionMatrix = matrix4SIMD::concatenateBFollowedByAPrecisely(
-			video::ISurface::getSurfaceTransformationMatrix(swapchain->getPreTransform()),
-			camera.getConcatenatedMatrix()
-		);
-
-		auto& commandBuffer = commandBuffers[0];
-		commandBuffer->reset(nbl::video::IGPUCommandBuffer::ERF_RELEASE_RESOURCES_BIT);
-		commandBuffer->begin(video::IGPUCommandBuffer::EU_ONE_TIME_SUBMIT_BIT);  // TODO: Reset Frame's CommandPool
-
-		asset::SViewport viewport;
-		viewport.minDepth = 1.f;
-		viewport.maxDepth = 0.f;
-		viewport.x = 0u;
-		viewport.y = 0u;
-		viewport.width = WIN_W;
-		viewport.height = WIN_H;
-		commandBuffer->setViewport(0u, 1u, &viewport);
-
-		nbl::video::IGPUCommandBuffer::SRenderpassBeginInfo beginInfo;
-		VkRect2D area;
-		area.offset = { 0,0 };
-		area.extent = { WIN_W, WIN_H };
-		nbl::asset::SClearValue clear[2];
-		clear[0].color.float32[0] = 0.f;
-		clear[0].color.float32[1] = 0.f;
-		clear[0].color.float32[2] = 0.f;
-		clear[0].color.float32[3] = 0.f;
-		clear[1].depthStencil.depth = 0.f;
-
-		beginInfo.clearValueCount = 2u;
-		beginInfo.framebuffer = fbo->begin()[0];
-		beginInfo.renderpass = renderpass;
-		beginInfo.renderArea = area;
-		beginInfo.clearValues = clear;
-
-		commandBuffer->beginRenderPass(&beginInfo, nbl::asset::ESC_INLINE);
-
-		pushConstants.isXPressed = eventReceiver.isXPressed();
-		pushConstants.isZPressed = eventReceiver.isZPressed();
-		pushConstants.isCPressed = eventReceiver.isCPressed();
-		pushConstants.currentUserAbsolutePosition = camera.getPosition().getAsVector3df();
-
-		/*
-			Calculation of particle postitions takes place here
-		*/
-
-		commandBuffer->bindComputePipeline(gpuComputePipeline.get());
-		commandBuffer->pushConstants(gpuComputePipeline->getLayout(), asset::IShader::ESS_COMPUTE, 0, sizeof(SPushConstants), &pushConstants);
-		commandBuffer->bindDescriptorSets(EPBP_COMPUTE, gpuComputePipeline->getLayout(), 0, 1, &gpuCDescriptorSet.get(), 0u);
-
-		static_assert(NUMBER_OF_PARTICLES % WORK_GROUP_SIZE == 0, "Inccorect amount!");
-		_NBL_STATIC_INLINE_CONSTEXPR size_t groupCountX = NUMBER_OF_PARTICLES / WORK_GROUP_SIZE;
-
-		commandBuffer->dispatch(groupCountX, 1, 1);
-
-		/*
-			After calculation of positions each particle gets displayed
-		*/
-
-		core::matrix3x4SIMD modelMatrix;
-		modelMatrix.setTranslation(nbl::core::vectorSIMDf(0, 0, 0, 0));
-
-		core::matrix4SIMD mvp = core::concatenateBFollowedByA(viewProjectionMatrix, modelMatrix);
-
-		SBasicViewParameters uboData;
-		memcpy(uboData.MV, viewMatrix.pointer(), sizeof(uboData.MV));
-		memcpy(uboData.MVP, mvp.pointer(), sizeof(uboData.MVP));
-		memcpy(uboData.NormalMat, viewMatrix.pointer(), sizeof(uboData.NormalMat));
-		commandBuffer->updateBuffer(gpuUBO.get(), 0ull, sizeof(uboData), &uboData);
-
-		/*
-			Draw particles
-		*/
-
-		commandBuffer->bindGraphicsPipeline(gpuGraphicsPipeline.get());
-		commandBuffer->bindDescriptorSets(asset::EPBP_GRAPHICS, gpuMeshBuffer->getPipeline()->getLayout(), 1u, 1u, &gpuGDescriptorSet1.get(), 0u);
-		commandBuffer->drawMeshBuffer(gpuMeshBuffer.get());
-
-		/*
-			Draw extras with geometry usage under key c and v conditions
-		*/
-
-		commandBuffer->bindGraphicsPipeline(gpuGraphicsPipeline2.get());
-		commandBuffer->pushConstants(gpuMeshBuffer2->getPipeline()->getLayout(), asset::IShader::ESS_GEOMETRY, 0, sizeof(SPushConstants), &pushConstants);
-		commandBuffer->bindDescriptorSets(asset::EPBP_GRAPHICS, gpuMeshBuffer2->getPipeline()->getLayout(), 1u, 1u, &gpuGDescriptorSet1.get(), 0u);
-		commandBuffer->drawMeshBuffer(gpuMeshBuffer2.get());
-
-		commandBuffer->endRenderPass();
-		commandBuffer->end();
-
-		auto img_acq_sem = logicalDevice->createSemaphore();
-		render_finished_sem = logicalDevice->createSemaphore();
-
-		uint32_t imgnum = 0u;
-		constexpr uint64_t MAX_TIMEOUT = 99999999999999ull; // ns
-		swapchain->acquireNextImage(MAX_TIMEOUT, img_acq_sem.get(), nullptr, &imgnum);
-
-		CommonAPI::Submit(logicalDevice.get(), commandBuffer.get(), queues[CommonAPI::InitOutput::EQT_GRAPHICS], img_acq_sem.get(), render_finished_sem.get());
-		CommonAPI::Present(logicalDevice.get(), swapchain.get(), queues[CommonAPI::InitOutput::EQT_GRAPHICS], render_finished_sem.get(), imgnum);
-	}
-
-	bool keepRunning() override
-	{
-		return windowCallback->isWindowOpen();
-	}
-};
-
-NBL_COMMON_API_MAIN(MeshLoadersApp, MeshLoadersApp::Nabla)
diff --git a/53_ComputeShaders/pipeline.groovy b/53_ComputeShaders/pipeline.groovy
deleted file mode 100644
index e8eb74b5b..000000000
--- a/53_ComputeShaders/pipeline.groovy
+++ /dev/null
@@ -1,50 +0,0 @@
-import org.DevshGraphicsProgramming.Agent
-import org.DevshGraphicsProgramming.BuilderInfo
-import org.DevshGraphicsProgramming.IBuilder
-
-class CComputeShadersBuilder extends IBuilder
-{
-	public CComputeShadersBuilder(Agent _agent, _info)
-	{
-		super(_agent, _info)
-	}
-	
-	@Override
-	public boolean prepare(Map axisMapping)
-	{
-		return true
-	}
-	
-	@Override
-  	public boolean build(Map axisMapping)
-	{
-		IBuilder.CONFIGURATION config = axisMapping.get("CONFIGURATION")
-		IBuilder.BUILD_TYPE buildType = axisMapping.get("BUILD_TYPE")
-		
-		def nameOfBuildDirectory = getNameOfBuildDirectory(buildType)
-		def nameOfConfig = getNameOfConfig(config)
-		
-		agent.execute("cmake --build ${info.rootProjectPath}/${nameOfBuildDirectory}/${info.targetProjectPathRelativeToRoot} --target ${info.targetBaseName} --config ${nameOfConfig} -j12 -v")
-		
-		return true
-	}
-	
-	@Override
-  	public boolean test(Map axisMapping)
-	{
-		return true
-	}
-	
-	@Override
-	public boolean install(Map axisMapping)
-	{
-		return true
-	}
-}
-
-def create(Agent _agent, _info)
-{
-	return new CComputeShadersBuilder(_agent, _info)
-}
-
-return this
\ No newline at end of file
diff --git a/53_ComputeShaders/shaderCommon.glsl b/53_ComputeShaders/shaderCommon.glsl
deleted file mode 100644
index 972a8789a..000000000
--- a/53_ComputeShaders/shaderCommon.glsl
+++ /dev/null
@@ -1,6 +0,0 @@
-layout(push_constant, row_major) uniform Block{
-	bool isXPressed;
-	bool isZPressed;
-	bool isCPressed;
-	vec3 currentUserAbsolutePostion;
-} pushConstants;
\ No newline at end of file
diff --git a/53_ComputeShaders/vertexShader.vert b/53_ComputeShaders/vertexShader.vert
deleted file mode 100644
index 6b14d97c8..000000000
--- a/53_ComputeShaders/vertexShader.vert
+++ /dev/null
@@ -1,23 +0,0 @@
-#version 430 core
-
-layout(location = 0) in vec4 vPosition; 
-layout(location = 1) in vec4 vVelocity;
-layout(location = 2) in vec4 vColor;
-
-#include <nbl/builtin/glsl/utils/common.glsl>
-#include <nbl/builtin/glsl/utils/transform.glsl>
-
-layout (set = 1, binding = 0, row_major, std140) uniform UBO 
-{
-    nbl_glsl_SBasicViewParameters params;
-} cameraData;
-
-layout(location = 0) flat out vec4 outGOrFFullyProjectedVelocity;
-layout(location = 1) flat out vec4 outGorFColor;
-
-void main()
-{
-    gl_Position = (cameraData.params.MVP) * vPosition;
-    outGOrFFullyProjectedVelocity = (cameraData.params.MVP) * vVelocity * 0.0001;
-    outGorFColor = vColor;
-}
\ No newline at end of file
diff --git a/56_RayQuery/CMakeLists.txt b/56_RayQuery/CMakeLists.txt
deleted file mode 100644
index a476b6203..000000000
--- a/56_RayQuery/CMakeLists.txt
+++ /dev/null
@@ -1,7 +0,0 @@
-
-include(common RESULT_VARIABLE RES)
-if(NOT RES)
-	message(FATAL_ERROR "common.cmake not found. Should be in {repo_root}/cmake directory")
-endif()
-
-nbl_create_executable_project("" "" "" "" "${NBL_EXECUTABLE_PROJECT_CREATION_PCH_TARGET}")
\ No newline at end of file
diff --git a/56_RayQuery/common.glsl b/56_RayQuery/common.glsl
deleted file mode 100644
index ad88789f8..000000000
--- a/56_RayQuery/common.glsl
+++ /dev/null
@@ -1,793 +0,0 @@
-// Copyright (C) 2018-2020 - DevSH Graphics Programming Sp. z O.O.
-// This file is part of the "Nabla Engine".
-// For conditions of distribution and use, see copyright notice in nabla.h
-
-// basic settings
-#define MAX_DEPTH 15
-#define SAMPLES 32
-
-// firefly and variance reduction techniques
-//#define KILL_DIFFUSE_SPECULAR_PATHS
-//#define VISUALIZE_HIGH_VARIANCE
-
-#define INVALID_ID_16BIT 0xffffu
-struct Sphere
-{
-    vec3 position;
-    float radius2;
-    uint bsdfLightIDs;
-}; 
-
-layout(set=0, binding=0, rgba16f) uniform image2D outImage;
-
-layout(set = 2, binding = 0) uniform sampler2D envMap; 
-layout(set = 2, binding = 1) uniform usamplerBuffer sampleSequence;
-layout(set = 2, binding = 2) uniform usampler2D scramblebuf;
-layout(set = 2, binding = 3) uniform accelerationStructureEXT topLevelAS;
-layout(set = 2, binding = 4) readonly restrict buffer InputBuffer
-{
-	Sphere spheres[];
-};
-
-#ifndef _NBL_GLSL_WORKGROUP_SIZE_
-#define _NBL_GLSL_WORKGROUP_SIZE_ 16
-layout(local_size_x=_NBL_GLSL_WORKGROUP_SIZE_, local_size_y=_NBL_GLSL_WORKGROUP_SIZE_, local_size_z=1) in;
-#endif
-
-ivec2 getCoordinates() {
-    return ivec2(gl_GlobalInvocationID.xy);
-}
-
-vec2 getTexCoords() {
-    ivec2 imageSize = imageSize(outImage);
-    ivec2 iCoords = getCoordinates();
-    return vec2(float(iCoords.x) / imageSize.x, 1.0 - float(iCoords.y) / imageSize.y);
-}
-
-
-#include <nbl/builtin/glsl/limits/numeric.glsl>
-#include <nbl/builtin/glsl/math/constants.glsl>
-#include <nbl/builtin/glsl/utils/common.glsl>
-
-#include <nbl/builtin/glsl/sampling/box_muller_transform.glsl>
-
-layout(set = 1, binding = 0, row_major, std140) uniform UBO
-{
-	nbl_glsl_SBasicViewParameters params;
-} cameraData;
-
-Sphere Sphere_Sphere(in vec3 position, in float radius, in uint bsdfID, in uint lightID)
-{
-    Sphere sphere;
-    sphere.position = position;
-    sphere.radius2 = radius*radius;
-    sphere.bsdfLightIDs = bitfieldInsert(bsdfID,lightID,16,16);
-    return sphere;
-}
-
-// return intersection distance if found, FLT_NAN otherwise
-float Sphere_intersect(in Sphere sphere, in vec3 origin, in vec3 direction)
-{
-    vec3 relOrigin = origin-sphere.position;
-    float relOriginLen2 = dot(relOrigin,relOrigin);
-    const float radius2 = sphere.radius2;
-
-    float dirDotRelOrigin = dot(direction,relOrigin);
-    float det = radius2-relOriginLen2+dirDotRelOrigin*dirDotRelOrigin;
-
-    // do some speculative math here
-    float detsqrt = sqrt(det);
-    return -dirDotRelOrigin+(relOriginLen2>radius2 ? (-detsqrt):detsqrt);
-}
-
-vec3 Sphere_getNormal(in Sphere sphere, in vec3 position)
-{
-    const float radiusRcp = inversesqrt(sphere.radius2);
-    return (position-sphere.position)*radiusRcp;
-}
-
-float Sphere_getSolidAngle_impl(in float cosThetaMax)
-{
-    return 2.0*nbl_glsl_PI*(1.0-cosThetaMax);
-}
-float Sphere_getSolidAngle(in Sphere sphere, in vec3 origin)
-{
-    float cosThetaMax = sqrt(1.0-sphere.radius2/nbl_glsl_lengthSq(sphere.position-origin));
-    return Sphere_getSolidAngle_impl(cosThetaMax);
-}
-
-struct Triangle
-{
-    vec3 vertex0;
-    uint bsdfLightIDs;
-    vec3 vertex1;
-    uint padding0;
-    vec3 vertex2;
-    uint padding1;
-};
-
-Triangle Triangle_Triangle(in mat3 vertices, in uint bsdfID, in uint lightID)
-{
-    Triangle tri;
-    tri.vertex0 = vertices[0];
-    tri.vertex1 = vertices[1];
-    tri.vertex2 = vertices[2];
-    //
-    tri.bsdfLightIDs = bitfieldInsert(bsdfID, lightID, 16, 16);
-    return tri;
-}
-
-// return intersection distance if found, FLT_NAN otherwise
-float Triangle_intersect(in Triangle tri, in vec3 origin, in vec3 direction)
-{
-    const vec3 edges[2] = vec3[2](tri.vertex1-tri.vertex0,tri.vertex2-tri.vertex0);
-
-    const vec3 h = cross(direction,edges[1]);
-    const float a = dot(edges[0],h);
-
-    const vec3 relOrigin = origin-tri.vertex0;
-
-    const float u = dot(relOrigin,h)/a;
-
-    const vec3 q = cross(relOrigin,edges[0]);
-    const float v = dot(direction,q)/a;
-
-    const float t = dot(edges[1],q)/a;
-
-    return t>0.f&&u>=0.f&&v>=0.f&&(u+v)<=1.f ? t:nbl_glsl_FLT_NAN;
-}
-
-vec3 Triangle_getNormalTimesArea_impl(in mat2x3 edges)
-{
-    return cross(edges[0],edges[1])*0.5;
-}
-vec3 Triangle_getNormalTimesArea(in Triangle tri)
-{
-    return Triangle_getNormalTimesArea_impl(mat2x3(tri.vertex1-tri.vertex0,tri.vertex2-tri.vertex0));
-}
-
-
-
-struct Rectangle
-{
-    vec3 offset;
-    uint bsdfLightIDs;
-    vec3 edge0;
-    uint padding0;
-    vec3 edge1;
-    uint padding1;
-};
-
-Rectangle Rectangle_Rectangle(in vec3 offset, in vec3 edge0, in vec3 edge1, in uint bsdfID, in uint lightID)
-{
-    Rectangle rect;
-    rect.offset = offset;
-    rect.edge0 = edge0;
-    rect.edge1 = edge1;
-    //
-    rect.bsdfLightIDs = bitfieldInsert(bsdfID, lightID, 16, 16);
-    return rect;
-}
-
-// return intersection distance if found, FLT_NAN otherwise
-float Rectangle_intersect(in Rectangle rect, in vec3 origin, in vec3 direction)
-{
-    const vec3 h = cross(direction,rect.edge1);
-    const float a = dot(rect.edge0,h);
-
-    const vec3 relOrigin = origin-rect.offset;
-
-    const float u = dot(relOrigin,h)/a;
-
-    const vec3 q = cross(relOrigin,rect.edge0);
-    const float v = dot(direction,q)/a;
-
-    const float t = dot(rect.edge1,q)/a;
-
-    const bool intersection = t>0.f&&u>=0.f&&v>=0.f&&u<=1.f&&v<=1.f;
-    return intersection ? t:nbl_glsl_FLT_NAN;
-}
-
-vec3 Rectangle_getNormalTimesArea(in Rectangle rect)
-{
-    return cross(rect.edge0,rect.edge1);
-}
-
-
-
-#define DIFFUSE_OP 0u
-#define CONDUCTOR_OP 1u
-#define DIELECTRIC_OP 2u
-#define OP_BITS_OFFSET 0
-#define OP_BITS_SIZE 2
-struct BSDFNode
-{ 
-    uvec4 data[2];
-};
-
-uint BSDFNode_getType(in BSDFNode node)
-{
-    return bitfieldExtract(node.data[0].w,OP_BITS_OFFSET,OP_BITS_SIZE);
-}
-bool BSDFNode_isBSDF(in BSDFNode node)
-{
-    return BSDFNode_getType(node)==DIELECTRIC_OP;
-}
-bool BSDFNode_isNotDiffuse(in BSDFNode node)
-{
-    return BSDFNode_getType(node)!=DIFFUSE_OP;
-}
-float BSDFNode_getRoughness(in BSDFNode node)
-{
-    return uintBitsToFloat(node.data[1].w);
-}
-vec3 BSDFNode_getRealEta(in BSDFNode node)
-{
-    return uintBitsToFloat(node.data[0].rgb);
-}
-vec3 BSDFNode_getImaginaryEta(in BSDFNode node)
-{
-    return uintBitsToFloat(node.data[1].rgb);
-}
-mat2x3 BSDFNode_getEta(in BSDFNode node)
-{
-    return mat2x3(BSDFNode_getRealEta(node),BSDFNode_getImaginaryEta(node));
-}
-#include <nbl/builtin/glsl/bxdf/fresnel.glsl>
-vec3 BSDFNode_getReflectance(in BSDFNode node, in float VdotH)
-{
-    const vec3 albedoOrRealIoR = uintBitsToFloat(node.data[0].rgb);
-    if (BSDFNode_isNotDiffuse(node))
-        return nbl_glsl_fresnel_conductor(albedoOrRealIoR, BSDFNode_getImaginaryEta(node), VdotH);
-    else
-        return albedoOrRealIoR;
-}
-
-float BSDFNode_getNEEProb(in BSDFNode bsdf)
-{
-    const float alpha = BSDFNode_isNotDiffuse(bsdf) ? BSDFNode_getRoughness(bsdf):1.0;
-    return min(8.0*alpha,1.0);
-}
-
-#include <nbl/builtin/glsl/colorspace/EOTF.glsl>
-#include <nbl/builtin/glsl/colorspace/encodeCIEXYZ.glsl>
-float getLuma(in vec3 col)
-{
-    return dot(transpose(nbl_glsl_scRGBtoXYZ)[1],col);
-}
-
-#define BSDF_COUNT 7
-BSDFNode bsdfs[BSDF_COUNT] = {
-    {{uvec4(floatBitsToUint(vec3(0.8,0.8,0.8)),DIFFUSE_OP),floatBitsToUint(vec4(0.0,0.0,0.0,0.0))}},
-    {{uvec4(floatBitsToUint(vec3(0.8,0.4,0.4)),DIFFUSE_OP),floatBitsToUint(vec4(0.0,0.0,0.0,0.0))}},
-    {{uvec4(floatBitsToUint(vec3(0.4,0.8,0.4)),DIFFUSE_OP),floatBitsToUint(vec4(0.0,0.0,0.0,0.0))}},
-    {{uvec4(floatBitsToUint(vec3(1.02,1.02,1.3)),CONDUCTOR_OP),floatBitsToUint(vec4(1.0,1.0,2.0,0.0))}},
-    {{uvec4(floatBitsToUint(vec3(1.02,1.3,1.02)),CONDUCTOR_OP),floatBitsToUint(vec4(1.0,2.0,1.0,0.0))}},
-    {{uvec4(floatBitsToUint(vec3(1.02,1.3,1.02)),CONDUCTOR_OP),floatBitsToUint(vec4(1.0,2.0,1.0,0.15))}},
-    {{uvec4(floatBitsToUint(vec3(1.4,1.45,1.5)),DIELECTRIC_OP),floatBitsToUint(vec4(0.0,0.0,0.0,0.0625))}}
-};
-
-
-struct Light
-{
-    vec3 radiance;
-    uint objectID;
-};
-
-vec3 Light_getRadiance(in Light light)
-{
-    return light.radiance;
-}
-uint Light_getObjectID(in Light light)
-{
-    return light.objectID;
-}
-
-
-#define LIGHT_COUNT 1
-float scene_getLightChoicePdf(in Light light)
-{
-    return 1.0/float(LIGHT_COUNT);
-}
-
-
-#define LIGHT_COUNT 1
-Light lights[LIGHT_COUNT] =
-{
-    {
-        vec3(30.0,25.0,15.0),
-#ifdef POLYGON_METHOD
-        0u
-#else
-        8u
-#endif
-    }
-};
-
-
-
-#define ANY_HIT_FLAG (-2147483648)
-#define DEPTH_BITS_COUNT 8
-#define DEPTH_BITS_OFFSET (31-DEPTH_BITS_COUNT)
-struct ImmutableRay_t
-{
-    vec3 origin;
-    vec3 direction;
-#if POLYGON_METHOD==2
-    vec3 normalAtOrigin;
-    bool wasBSDFAtOrigin;
-#endif
-};
-struct MutableRay_t
-{
-    float intersectionT;
-    uint objectID;
-    /* irrelevant here
-    uint triangleID;
-    vec2 barycentrics;
-    */
-};
-struct Payload_t
-{
-    vec3 accumulation;
-    float otherTechniqueHeuristic;
-    vec3 throughput;
-    #ifdef KILL_DIFFUSE_SPECULAR_PATHS
-    bool hasDiffuse;
-    #endif
-};
-
-struct Ray_t
-{
-    ImmutableRay_t _immutable;
-    MutableRay_t _mutable;
-    Payload_t _payload;
-};
-
-
-#define INTERSECTION_ERROR_BOUND_LOG2 (-8.0)
-float getTolerance_common(in uint depth)
-{
-    float depthRcp = 1.0/float(depth);
-    return INTERSECTION_ERROR_BOUND_LOG2;// *depthRcp*depthRcp;
-}
-float getStartTolerance(in uint depth)
-{
-    return exp2(getTolerance_common(depth));
-}
-float getEndTolerance(in uint depth)
-{
-    return 1.0-exp2(getTolerance_common(depth)+1.0);
-}
-
-
-vec2 SampleSphericalMap(vec3 v)
-{
-    vec2 uv = vec2(atan(v.z, v.x), asin(v.y));
-    uv *= nbl_glsl_RECIPROCAL_PI*0.5;
-    uv += 0.5; 
-    return uv;
-}
-
-void missProgram(in ImmutableRay_t _immutable, inout Payload_t _payload)
-{
-    vec3 finalContribution = _payload.throughput; 
-    // #define USE_ENVMAP
-#ifdef USE_ENVMAP
-	vec2 uv = SampleSphericalMap(_immutable.direction);
-    finalContribution *= textureLod(envMap, uv, 0.0).rgb;
-#else
-    const vec3 kConstantEnvLightRadiance = vec3(0.15, 0.21, 0.3);
-    finalContribution *= kConstantEnvLightRadiance;
-#endif
-    _payload.accumulation += finalContribution;
-}
-
-#include <nbl/builtin/glsl/bxdf/brdf/diffuse/oren_nayar.glsl>
-#include <nbl/builtin/glsl/bxdf/brdf/specular/beckmann.glsl>
-#include <nbl/builtin/glsl/bxdf/brdf/specular/ggx.glsl>
-#include <nbl/builtin/glsl/bxdf/bsdf/diffuse/lambert.glsl>
-#include <nbl/builtin/glsl/bxdf/bsdf/specular/dielectric.glsl>
-#include <nbl/builtin/glsl/bxdf/bsdf/specular/beckmann.glsl>
-#include <nbl/builtin/glsl/bxdf/bsdf/specular/ggx.glsl>
-nbl_glsl_LightSample nbl_glsl_bsdf_cos_generate(in nbl_glsl_AnisotropicViewSurfaceInteraction interaction, in vec3 u, in BSDFNode bsdf, in float monochromeEta, out nbl_glsl_AnisotropicMicrofacetCache _cache)
-{
-    const float a = BSDFNode_getRoughness(bsdf);
-    const mat2x3 ior = BSDFNode_getEta(bsdf);
-    
-    // fresnel stuff for dielectrics
-    float orientedEta, rcpOrientedEta;
-    const bool viewerInsideMedium = nbl_glsl_getOrientedEtas(orientedEta,rcpOrientedEta,interaction.isotropic.NdotV,monochromeEta);
-
-    nbl_glsl_LightSample smpl;
-    nbl_glsl_AnisotropicMicrofacetCache dummy;
-    switch (BSDFNode_getType(bsdf))
-    {
-        case DIFFUSE_OP:
-            smpl = nbl_glsl_oren_nayar_cos_generate(interaction,u.xy,a*a);
-            break;
-        case CONDUCTOR_OP:
-            smpl = nbl_glsl_ggx_cos_generate(interaction,u.xy,a,a,_cache);
-            break;
-        default:
-            smpl = nbl_glsl_ggx_dielectric_cos_generate(interaction,u,a,a,monochromeEta,_cache);
-            break;
-    }
-    return smpl;
-}
-
-vec3 nbl_glsl_bsdf_cos_remainder_and_pdf(out float pdf, in nbl_glsl_LightSample _sample, in nbl_glsl_AnisotropicViewSurfaceInteraction interaction, in BSDFNode bsdf, in float monochromeEta, in nbl_glsl_AnisotropicMicrofacetCache _cache)
-{
-    // are V and L on opposite sides of the surface?
-    const bool transmitted = nbl_glsl_isTransmissionPath(interaction.isotropic.NdotV,_sample.NdotL);
-
-    // is the BSDF or BRDF, if it is then we make the dot products `abs` before `max(,0.0)`
-    const bool transmissive = BSDFNode_isBSDF(bsdf);
-    const float clampedNdotL = nbl_glsl_conditionalAbsOrMax(transmissive,_sample.NdotL,0.0);
-    const float clampedNdotV = nbl_glsl_conditionalAbsOrMax(transmissive,interaction.isotropic.NdotV,0.0);
-
-    vec3 remainder;
-
-    const float minimumProjVectorLen = 0.00000001;
-    if (clampedNdotV>minimumProjVectorLen && clampedNdotL>minimumProjVectorLen)
-    {
-        // fresnel stuff for conductors (but reflectance also doubles as albedo)
-        const mat2x3 ior = BSDFNode_getEta(bsdf);
-        const vec3 reflectance = BSDFNode_getReflectance(bsdf,_cache.isotropic.VdotH);
-
-        // fresnel stuff for dielectrics
-        float orientedEta, rcpOrientedEta;
-        const bool viewerInsideMedium = nbl_glsl_getOrientedEtas(orientedEta,rcpOrientedEta,interaction.isotropic.NdotV,monochromeEta);
-
-        //
-        const float VdotL = dot(interaction.isotropic.V.dir,_sample.L);
-
-        //
-        const float a = max(BSDFNode_getRoughness(bsdf),0.0001); // TODO: @Crisspl 0-roughness still doesn't work! Also Beckmann has a weird dark rim instead as fresnel!?
-        const float a2 = a*a;
-
-        // TODO: refactor into Material Compiler-esque thing
-        switch (BSDFNode_getType(bsdf))
-        {
-            case DIFFUSE_OP:
-                remainder = reflectance*nbl_glsl_oren_nayar_cos_remainder_and_pdf_wo_clamps(pdf,a*a,VdotL,clampedNdotL,clampedNdotV);
-                break;
-            case CONDUCTOR_OP:
-                remainder = nbl_glsl_ggx_cos_remainder_and_pdf_wo_clamps(pdf,nbl_glsl_ggx_trowbridge_reitz(a2,_cache.isotropic.NdotH2),clampedNdotL,_sample.NdotL2,clampedNdotV,interaction.isotropic.NdotV_squared,reflectance,a2);
-                break;
-            default:
-                remainder = vec3(nbl_glsl_ggx_dielectric_cos_remainder_and_pdf(pdf, _sample, interaction.isotropic, _cache.isotropic, monochromeEta, a*a));
-                break;
-        }
-    }
-    else
-        remainder = vec3(0.0);
-    return remainder;
-}
-
-layout (constant_id = 0) const int MAX_DEPTH_LOG2 = 4;
-layout (constant_id = 1) const int MAX_SAMPLES_LOG2 = 10;
-
-
-#include <nbl/builtin/glsl/random/xoroshiro.glsl>
-
-mat2x3 rand3d(in uint protoDimension, in uint _sample, inout nbl_glsl_xoroshiro64star_state_t scramble_state)
-{
-    mat2x3 retval;
-    uint address = bitfieldInsert(protoDimension,_sample,MAX_DEPTH_LOG2,MAX_SAMPLES_LOG2);
-    for (int i=0; i<2u; i++)
-    {
-	    uvec3 seqVal = texelFetch(sampleSequence,int(address)+i).xyz;
-	    seqVal ^= uvec3(nbl_glsl_xoroshiro64star(scramble_state),nbl_glsl_xoroshiro64star(scramble_state),nbl_glsl_xoroshiro64star(scramble_state));
-        retval[i] = vec3(seqVal)*uintBitsToFloat(0x2f800004u);
-    }
-    return retval;
-}
-
-
-void traceRay_extraShape(inout int objectID, inout float intersectionT, in vec3 origin, in vec3 direction);
-int traceRay(inout float intersectionT, in vec3 origin, in vec3 direction)
-{
-	int objectID = -1;
-    
-#define USE_RAY_QUERY
-#ifdef USE_RAY_QUERY
-    rayQueryEXT rayQuery;
-    rayQueryInitializeEXT(rayQuery, topLevelAS, gl_RayFlagsNoneEXT, 0xFF, origin, 0.0, direction, 1000.0);
-    
-    // Start traversal: return false if traversal is complete
-    while(rayQueryProceedEXT(rayQuery))
-    {
-        if(rayQueryGetIntersectionTypeEXT(rayQuery, false) == gl_RayQueryCandidateIntersectionAABBEXT)
-        {
-            int id = rayQueryGetIntersectionPrimitiveIndexEXT(rayQuery, false);
-            float t = Sphere_intersect(spheres[id],origin,direction);
-            bool reportIntersection = (t != nbl_glsl_FLT_NAN && t > 0 && t < intersectionT);
-            if(reportIntersection)
-            {
-                intersectionT = t;
-                objectID = id;
-                rayQueryGenerateIntersectionEXT(rayQuery, t);
-            }
-        }
-    }
-#else
-	for (int i=0; i<SPHERE_COUNT; i++)
-    {
-        float t = Sphere_intersect(spheres[i],origin,direction);
-        bool closerIntersection = t>0.0 && t<intersectionT;
-
-        intersectionT = closerIntersection ? t : intersectionT;
-		objectID = closerIntersection ? i:objectID;
-    }
-#endif
-
-    traceRay_extraShape(objectID,intersectionT,origin,direction);
-    return objectID;
-}
-
-//
-float nbl_glsl_light_deferred_pdf(in Light light, in Ray_t ray);
-vec3 nbl_glsl_light_deferred_eval_and_prob(out float pdf, in Light light, in Ray_t ray)
-{
-    // we don't have to worry about solid angle of the light w.r.t. surface of the light because this function only ever gets called from closestHit routine, so such ray cannot be produced (because lights have no BSDFs here)
-    pdf = scene_getLightChoicePdf(light);
-    pdf *= nbl_glsl_light_deferred_pdf(light,ray);
-    return Light_getRadiance(light);
-}
-
-vec3 nbl_glsl_light_generate_and_pdf(out float pdf, out float newRayMaxT, in vec3 origin, in nbl_glsl_AnisotropicViewSurfaceInteraction interaction, in bool isBSDF, in vec3 xi, in uint objectID);
-nbl_glsl_LightSample nbl_glsl_light_generate_and_remainder_and_pdf(out vec3 remainder, out float pdf, out float newRayMaxT, in vec3 origin, in nbl_glsl_AnisotropicViewSurfaceInteraction interaction, in bool isBSDF, in vec3 xi, in uint depth)
-{
-    // normally we'd pick from set of lights, using `xi.z`
-    const Light light = lights[0];
-    
-    vec3 L = nbl_glsl_light_generate_and_pdf(pdf,newRayMaxT,origin,interaction,isBSDF,xi,Light_getObjectID(light));
-
-    newRayMaxT *= getEndTolerance(depth);
-    pdf *= scene_getLightChoicePdf(light);
-    remainder = Light_getRadiance(light)/pdf;
-    return nbl_glsl_createLightSample(L,interaction);
-}
-
-uint getBSDFLightIDAndDetermineNormal(out vec3 normal, in uint objectID, in vec3 intersection);
-bool closestHitProgram(in uint depth, in uint _sample, inout Ray_t ray, inout nbl_glsl_xoroshiro64star_state_t scramble_state)
-{
-    const MutableRay_t _mutable = ray._mutable;
-    const uint objectID = _mutable.objectID;
-
-    // interaction stuffs
-    const ImmutableRay_t _immutable = ray._immutable;
-    const vec3 intersection = _immutable.origin+_immutable.direction*_mutable.intersectionT;
-
-    uint bsdfLightIDs;
-    nbl_glsl_AnisotropicViewSurfaceInteraction interaction;
-    {
-        nbl_glsl_IsotropicViewSurfaceInteraction isotropic;
-        bsdfLightIDs = getBSDFLightIDAndDetermineNormal(isotropic.N,objectID,intersection);
-
-        isotropic.V.dir = -_immutable.direction;
-        isotropic.NdotV = dot(isotropic.V.dir,isotropic.N);
-        isotropic.NdotV_squared = isotropic.NdotV*isotropic.NdotV;
-
-        interaction = nbl_glsl_calcAnisotropicInteraction(isotropic);
-    }
-
-    //
-    vec3 throughput = ray._payload.throughput;
-
-    // add emissive and finish MIS
-    const uint lightID = bitfieldExtract(bsdfLightIDs,16,16);
-    if (lightID != INVALID_ID_16BIT) // has emissive
-    {
-        float lightPdf;
-        ray._payload.accumulation += nbl_glsl_light_deferred_eval_and_prob(lightPdf,lights[lightID],ray)*throughput/(1.0+lightPdf*lightPdf*ray._payload.otherTechniqueHeuristic);
-    }
-
-    // check if we even have a BSDF at all
-    uint bsdfID = bitfieldExtract(bsdfLightIDs, 0, 16);
-    if (bsdfID != INVALID_ID_16BIT)
-    {
-        BSDFNode bsdf = bsdfs[bsdfID];
-#ifdef KILL_DIFFUSE_SPECULAR_PATHS
-        if (BSDFNode_isNotDiffuse(bsdf))
-        {
-            if (ray._payload.hasDiffuse)
-                return true;
-        }
-        else
-            ray._payload.hasDiffuse = true;
-#endif
-
-        const bool isBSDF = BSDFNode_isBSDF(bsdf);
-        //rand
-        mat2x3 epsilon = rand3d(depth,_sample,scramble_state);
-
-        // thresholds
-        const float bsdfPdfThreshold = 0.0001;
-        const float lumaContributionThreshold = getLuma(nbl_glsl_eotf_sRGB(vec3(1.0)/255.0)); // OETF smallest perceptible value
-        const vec3 throughputCIE_Y = transpose(nbl_glsl_sRGBtoXYZ)[1]*throughput;
-        const float monochromeEta = dot(throughputCIE_Y,BSDFNode_getEta(bsdf)[0])/(throughputCIE_Y.r+throughputCIE_Y.g+throughputCIE_Y.b);
-
-        // do NEE
-        const float neeProbability = BSDFNode_getNEEProb(bsdf);
-        float rcpChoiceProb;
-        if (!nbl_glsl_partitionRandVariable(neeProbability,epsilon[0].z,rcpChoiceProb))
-        {
-            vec3 neeContrib; float lightPdf, t;
-            nbl_glsl_LightSample nee_sample = nbl_glsl_light_generate_and_remainder_and_pdf(
-                neeContrib, lightPdf, t,
-                intersection, interaction,
-                isBSDF, epsilon[0], depth
-            );
-            // We don't allow non watertight transmitters in this renderer
-            bool validPath = nee_sample.NdotL>0.0;
-            // but if we allowed non-watertight transmitters (single water surface), it would make sense just to apply this line by itself
-            nbl_glsl_AnisotropicMicrofacetCache _cache;
-            validPath = validPath && nbl_glsl_calcAnisotropicMicrofacetCache(_cache, interaction, nee_sample, monochromeEta);
-            if (validPath)
-            {
-                float bsdfPdf;
-                neeContrib *= nbl_glsl_bsdf_cos_remainder_and_pdf(bsdfPdf,nee_sample,interaction,bsdf,monochromeEta,_cache)*throughput;
-                const float oc = bsdfPdf*rcpChoiceProb;
-                neeContrib /= 1.0/oc+oc/(lightPdf*lightPdf); // MIS weight
-                if (bsdfPdf<nbl_glsl_FLT_MAX && getLuma(neeContrib)>lumaContributionThreshold && traceRay(t,intersection+nee_sample.L*t*getStartTolerance(depth),nee_sample.L)==-1)
-                    ray._payload.accumulation += neeContrib;
-            }
-        }
-
-        // sample BSDF
-        float bsdfPdf; vec3 bsdfSampleL;
-        {
-            nbl_glsl_AnisotropicMicrofacetCache _cache;
-            nbl_glsl_LightSample bsdf_sample = nbl_glsl_bsdf_cos_generate(interaction,epsilon[1],bsdf,monochromeEta,_cache);
-            // the value of the bsdf divided by the probability of the sample being generated
-            throughput *= nbl_glsl_bsdf_cos_remainder_and_pdf(bsdfPdf,bsdf_sample,interaction,bsdf,monochromeEta,_cache);
-            //
-            bsdfSampleL = bsdf_sample.L;
-        }
-        
-        // additional threshold
-        const float lumaThroughputThreshold = lumaContributionThreshold;
-        if (bsdfPdf>bsdfPdfThreshold && getLuma(throughput)>lumaThroughputThreshold)
-        {
-            ray._payload.throughput = throughput;
-            ray._payload.otherTechniqueHeuristic = neeProbability/bsdfPdf; // numerically stable, don't touch
-            ray._payload.otherTechniqueHeuristic *= ray._payload.otherTechniqueHeuristic;
-                    
-            // trace new ray
-            ray._immutable.origin = intersection+bsdfSampleL*(1.0/*kSceneSize*/)*getStartTolerance(depth);
-            ray._immutable.direction = bsdfSampleL;
-            #if POLYGON_METHOD==2
-            ray._immutable.normalAtOrigin = interaction.isotropic.N;
-            ray._immutable.wasBSDFAtOrigin = isBSDF;
-            #endif
-            return true;
-        }
-    }
-    return false;
-}
-
-void main()
-{
-    const ivec2 coords = getCoordinates();
-    const vec2 texCoord = getTexCoords();
-
-    if (false == (all(lessThanEqual(ivec2(0),coords)) && all(greaterThan(imageSize(outImage),coords)))) {
-        return;
-    }
-
-    if (((MAX_DEPTH-1)>>MAX_DEPTH_LOG2)>0 || ((SAMPLES-1)>>MAX_SAMPLES_LOG2)>0)
-    {
-        vec4 pixelCol = vec4(1.0,0.0,0.0,1.0);
-        imageStore(outImage, coords, pixelCol);
-        return;
-    }
-
-	nbl_glsl_xoroshiro64star_state_t scramble_start_state = texelFetch(scramblebuf,coords,0).rg;
-    const vec2 pixOffsetParam = vec2(1.0)/vec2(textureSize(scramblebuf,0));
-
-
-    const mat4 invMVP = inverse(cameraData.params.MVP);
-    
-    vec4 NDC = vec4(texCoord*vec2(2.0,-2.0)+vec2(-1.0,1.0),0.0,1.0);
-    vec3 camPos;
-    {
-        vec4 tmp = invMVP*NDC;
-        camPos = tmp.xyz/tmp.w;
-        NDC.z = 1.0;
-    }
-
-    vec3 color = vec3(0.0);
-    float meanLumaSquared = 0.0;
-    // TODO: if we collapse the nested for loop, then all GPUs will get `MAX_DEPTH` factor speedup, not just NV with separate PC
-    for (int i=0; i<SAMPLES; i++)
-    {
-        nbl_glsl_xoroshiro64star_state_t scramble_state = scramble_start_state;
-
-        Ray_t ray;
-        // raygen
-        {
-            ray._immutable.origin = camPos;
-
-            vec4 tmp = NDC;
-            // apply stochastic reconstruction filter
-            const float gaussianFilterCutoff = 2.5;
-            const float truncation = exp(-0.5*gaussianFilterCutoff*gaussianFilterCutoff);
-            vec2 remappedRand = rand3d(0u,i,scramble_state)[0].xy;
-            remappedRand.x *= 1.0-truncation;
-            remappedRand.x += truncation;
-            tmp.xy += pixOffsetParam*nbl_glsl_BoxMullerTransform(remappedRand,1.5);
-            // for depth of field we could do another stochastic point-pick
-            tmp = invMVP*tmp;
-            ray._immutable.direction = normalize(tmp.xyz/tmp.w-camPos);
-
-            #if POLYGON_METHOD==2
-                ray._immutable.normalAtOrigin = vec3(0.0,0.0,0.0);
-                ray._immutable.wasBSDFAtOrigin = false;
-            #endif
-
-            ray._payload.accumulation = vec3(0.0);
-            ray._payload.otherTechniqueHeuristic = 0.0; // needed for direct eye-light paths
-            ray._payload.throughput = vec3(1.0);
-            #ifdef KILL_DIFFUSE_SPECULAR_PATHS
-            ray._payload.hasDiffuse = false;
-            #endif
-        }
-
-        // bounces
-        {
-            bool hit = true; bool rayAlive = true;
-            for (int d=1; d<=MAX_DEPTH && hit && rayAlive; d+=2)
-            {
-                ray._mutable.intersectionT = nbl_glsl_FLT_MAX;
-                ray._mutable.objectID = traceRay(ray._mutable.intersectionT,ray._immutable.origin,ray._immutable.direction);
-                hit = ray._mutable.objectID!=-1;
-                if (hit)
-                    rayAlive = closestHitProgram(3u, i, ray, scramble_state);
-            }
-            // was last trace a miss?
-            if (!hit)
-                missProgram(ray._immutable,ray._payload);
-        }
-
-        vec3 accumulation = ray._payload.accumulation;
-
-        float rcpSampleSize = 1.0/float(i+1);
-        color += (accumulation-color)*rcpSampleSize;
-        
-        #ifdef VISUALIZE_HIGH_VARIANCE
-            float luma = getLuma(accumulation);
-            meanLumaSquared += (luma*luma-meanLumaSquared)*rcpSampleSize;
-        #endif
-    }
-
-    #ifdef VISUALIZE_HIGH_VARIANCE
-        float variance = getLuma(color);
-        variance *= variance;
-        variance = meanLumaSquared-variance;
-        if (variance>5.0)
-            color = vec3(1.0,0.0,0.0);
-    #endif
-
-    vec4 pixelCol = vec4(color, 1.0);
-    imageStore(outImage, coords, pixelCol);
-}
-/** TODO: Improving Rendering
-
-Now:
-- Always MIS (path correlated reuse)
-- Test MIS alpha (roughness) scheme
-
-Many Lights:
-- Path Guiding
-- Light Importance Lists/Classification
-- Spatio-Temporal Reservoir Sampling
-
-Indirect Light:
-- Bidirectional Path Tracing
-- Uniform Path Sampling / Vertex Connection and Merging / Path Space Regularization
-
-Animations:
-- A-SVGF / BMFR
-**/
\ No newline at end of file
diff --git a/56_RayQuery/config.json.template b/56_RayQuery/config.json.template
deleted file mode 100644
index f961745c1..000000000
--- a/56_RayQuery/config.json.template
+++ /dev/null
@@ -1,28 +0,0 @@
-{
-  "enableParallelBuild": true,
-  "threadsPerBuildProcess" : 2,
-  "isExecuted": false,
-  "scriptPath": "",
-  "cmake": {
-    "configurations": [ "Release", "Debug", "RelWithDebInfo" ],
-    "buildModes": [],
-    "requiredOptions": []
-  }, 
-  "profiles": [
-    {
-      "backend": "vulkan",
-      "platform": "windows",
-      "buildModes": [],
-      "runConfiguration": "Release",
-      "gpuArchitectures": []
-    }
-  ],
-  "dependencies": [],
-  "data": [
-    {
-      "dependencies": [],
-      "command": [""],
-      "outputs": []
-    }
-  ]
-}
\ No newline at end of file
diff --git a/56_RayQuery/litByRectangle.comp b/56_RayQuery/litByRectangle.comp
deleted file mode 100644
index 829d03398..000000000
--- a/56_RayQuery/litByRectangle.comp
+++ /dev/null
@@ -1,106 +0,0 @@
-// Copyright (C) 2018-2020 - DevSH Graphics Programming Sp. z O.O.
-// This file is part of the "Nabla Engine".
-// For conditions of distribution and use, see copyright notice in nabla.h
-
-#version 460 core
-#extension GL_GOOGLE_include_directive : require
-#extension GL_EXT_ray_query : enable
-
-#define SPHERE_COUNT 8
-#define POLYGON_METHOD 0 // 0 area sampling, 1 solid angle sampling, 2 approximate projected solid angle sampling
-#include "common.glsl"
-
-
-#define RECTANGLE_COUNT 1
-const vec3 edge0 = normalize(vec3(2,0,-1));
-const vec3 edge1 = normalize(vec3(2,-5,4));
-Rectangle rectangles[RECTANGLE_COUNT] = {
-    Rectangle_Rectangle(vec3(-3.8,0.35,1.3),edge0*7.0,edge1*0.1,INVALID_ID_16BIT,0u)
-};
-
-
-void traceRay_extraShape(inout int objectID, inout float intersectionT, in vec3 origin, in vec3 direction)
-{
-	for (int i=0; i<RECTANGLE_COUNT; i++)
-    {
-        float t = Rectangle_intersect(rectangles[i],origin,direction);
-        bool closerIntersection = t>0.0 && t<intersectionT;
-
-		objectID = closerIntersection ? (i+SPHERE_COUNT):objectID;
-        intersectionT = closerIntersection ? t:intersectionT;
-    }
-}
-
-/// #include <nbl/builtin/glsl/sampling/projected_spherical_rectangle.glsl>
-float nbl_glsl_light_deferred_pdf(in Light light, in Ray_t ray)
-{
-    const Rectangle rect = rectangles[Light_getObjectID(light)];
-
-    const vec3 L = ray._immutable.direction;
-#if POLYGON_METHOD==0
-    const float dist = ray._mutable.intersectionT;
-    return dist*dist/abs(dot(Rectangle_getNormalTimesArea(rect),L));
-#else
-    const ImmutableRay_t _immutable = ray._immutable;
-    const mat3 sphericalVertices = nbl_glsl_shapes_getSphericalTriangle(mat3(tri.vertex0,tri.vertex1,tri.vertex2),_immutable.origin);
-    #if POLYGON_METHOD==1
-        const float rcpProb = nbl_glsl_shapes_SolidAngleOfTriangle(sphericalVertices);
-        // if `rcpProb` is NAN then the triangle's solid angle was close to 0.0 
-        return rcpProb>FLT_MIN ? (1.0/rcpProb):nbl_glsl_FLT_MAX;
-    #elif POLYGON_METHOD==2
-        const float pdf = nbl_glsl_sampling_probProjectedSphericalTriangleSample(sphericalVertices,_immutable.normalAtOrigin,_immutable.wasBSDFAtOrigin,L);
-        // if `pdf` is NAN then the triangle's projected solid angle was close to 0.0, if its close to INF then the triangle was very small
-        return pdf<nbl_glsl_FLT_MAX ? pdf:0.0;
-    #endif
-#endif
-}
-
-vec3 nbl_glsl_light_generate_and_pdf(out float pdf, out float newRayMaxT, in vec3 origin, in nbl_glsl_AnisotropicViewSurfaceInteraction interaction, in bool isBSDF, in vec3 xi, in uint objectID)
-{
-    const Rectangle rect = rectangles[objectID];
-    
-#if POLYGON_METHOD==0
-    const vec3 point = rect.offset+rect.edge0*xi.x+rect.edge1*xi.y; // TODO: refactor
-    const vec3 L = point-origin;
-    
-    const float distanceSq = dot(L,L);
-    const float rcpDistance = inversesqrt(distanceSq);
-    
-    pdf = distanceSq/abs(dot(Rectangle_getNormalTimesArea(rect),L));
-    newRayMaxT = 1.0/rcpDistance;
-    return L*rcpDistance;
-#else 
-    float rcpPdf;
-
-    const mat3 sphericalVertices = nbl_glsl_shapes_getSphericalTriangle(mat3(tri.vertex0,tri.vertex1,tri.vertex2),origin);
-#if POLYGON_METHOD==1
-    const vec3 L = nbl_glsl_sampling_generateSphericalTriangleSample(rcpPdf,sphericalVertices,xi.xy);
-#elif POLYGON_METHOD==2
-    const vec3 L = nbl_glsl_sampling_generateProjectedSphericalTriangleSample(rcpPdf,sphericalVertices,interaction.isotropic.N,isBSDF,xi.xy);
-#endif
-
-    // if `rcpProb` is NAN or negative then the triangle's solidAngle or projectedSolidAngle was close to 0.0 
-    pdf = rcpPdf>FLT_MIN ? (1.0/rcpPdf):0.0;
-
-    const vec3 N = Triangle_getNormalTimesArea(tri);
-    newRayMaxT = dot(N,tri.vertex0-origin)/dot(N,L);
-    return L;
-#endif
-}
-
-
-uint getBSDFLightIDAndDetermineNormal(out vec3 normal, in uint objectID, in vec3 intersection)
-{
-    if (objectID<SPHERE_COUNT)
-    {
-        Sphere sphere = spheres[objectID];
-        normal = Sphere_getNormal(sphere,intersection);
-        return sphere.bsdfLightIDs;
-    }
-    else
-    {
-        Rectangle rect = rectangles[objectID-SPHERE_COUNT];
-        normal = normalize(Rectangle_getNormalTimesArea(rect));
-        return rect.bsdfLightIDs;
-    }
-}
\ No newline at end of file
diff --git a/56_RayQuery/litBySphere.comp b/56_RayQuery/litBySphere.comp
deleted file mode 100644
index 21920d331..000000000
--- a/56_RayQuery/litBySphere.comp
+++ /dev/null
@@ -1,61 +0,0 @@
-// Copyright (C) 2018-2020 - DevSH Graphics Programming Sp. z O.O.
-// This file is part of the "Nabla Engine".
-// For conditions of distribution and use, see copyright notice in nabla.h
-
-#version 460 core
-#extension GL_GOOGLE_include_directive : require
-#extension GL_EXT_ray_query : enable
-
-#define SPHERE_COUNT 9
-#include "common.glsl"
-
-
-void traceRay_extraShape(inout int objectID, inout float intersectionT, in vec3 origin, in vec3 direction)
-{
-}
-
-float nbl_glsl_light_deferred_pdf(in Light light, in Ray_t ray)
-{
-    const Sphere sphere = spheres[ray._mutable.objectID];
-    return 1.0/Sphere_getSolidAngle(sphere,ray._immutable.origin);
-}
-
-vec3 nbl_glsl_light_generate_and_pdf(out float pdf, out float newRayMaxT, in vec3 origin, in nbl_glsl_AnisotropicViewSurfaceInteraction interaction, in bool isBSDF, in vec3 xi, in uint objectID)
-{
-    const Sphere sphere = spheres[objectID];
-
-    vec3 Z = sphere.position-origin;
-    const float distanceSQ = dot(Z,Z);
-    const float cosThetaMax2 = 1.0-sphere.radius2/distanceSQ;
-    if (cosThetaMax2>0.0)
-    {
-        const float rcpDistance = inversesqrt(distanceSQ);
-        Z *= rcpDistance;
-    
-        const float cosThetaMax = sqrt(cosThetaMax2);
-        const float cosTheta = mix(1.0,cosThetaMax,xi.x);
-
-        vec3 L = Z*cosTheta;
-
-        const float cosTheta2 = cosTheta*cosTheta;
-        const float sinTheta = sqrt(1.0-cosTheta2);
-        float sinPhi,cosPhi;
-        nbl_glsl_sincos(2.0*nbl_glsl_PI*xi.y-nbl_glsl_PI,sinPhi,cosPhi);
-        mat2x3 XY = nbl_glsl_frisvad(Z);
-    
-        L += (XY[0]*cosPhi+XY[1]*sinPhi)*sinTheta;
-    
-        newRayMaxT = (cosTheta-sqrt(cosTheta2-cosThetaMax2))/rcpDistance;
-        pdf = 1.0/Sphere_getSolidAngle_impl(cosThetaMax);
-        return L;
-    }
-    pdf = 0.0;
-    return vec3(0.0,0.0,0.0);
-}
-
-uint getBSDFLightIDAndDetermineNormal(out vec3 normal, in uint objectID, in vec3 intersection)
-{
-    Sphere sphere = spheres[objectID];
-    normal = Sphere_getNormal(sphere,intersection);
-    return sphere.bsdfLightIDs;
-}
\ No newline at end of file
diff --git a/56_RayQuery/litByTriangle.comp b/56_RayQuery/litByTriangle.comp
deleted file mode 100644
index 1cd1d3ee3..000000000
--- a/56_RayQuery/litByTriangle.comp
+++ /dev/null
@@ -1,105 +0,0 @@
-// Copyright (C) 2018-2020 - DevSH Graphics Programming Sp. z O.O.
-// This file is part of the "Nabla Engine".
-// For conditions of distribution and use, see copyright notice in nabla.h
-
-#version 460 core
-#extension GL_GOOGLE_include_directive : require
-#extension GL_EXT_ray_query : enable
-
-#define SPHERE_COUNT 8
-#define POLYGON_METHOD 0 // 0 area sampling, 1 solid angle sampling, 2 approximate projected solid angle sampling
-#include "common.glsl"
-
-#define TRIANGLE_COUNT 1
-Triangle triangles[TRIANGLE_COUNT] = {
-    Triangle_Triangle(mat3(vec3(-1.8,0.35,0.3),vec3(-1.2,0.35,0.0),vec3(-1.5,0.8,-0.3)),INVALID_ID_16BIT,0u)
-};
-
-void traceRay_extraShape(inout int objectID, inout float intersectionT, in vec3 origin, in vec3 direction)
-{
-	for (int i=0; i<TRIANGLE_COUNT; i++)
-    {
-        float t = Triangle_intersect(triangles[i],origin,direction);
-        bool closerIntersection = t>0.0 && t<intersectionT;
-
-		objectID = closerIntersection ? (i+SPHERE_COUNT):objectID;
-        intersectionT = closerIntersection ? t:intersectionT;
-    }
-}
-
-
-#include <nbl/builtin/glsl/sampling/projected_spherical_triangle.glsl>
-float nbl_glsl_light_deferred_pdf(in Light light, in Ray_t ray)
-{
-    const Triangle tri = triangles[Light_getObjectID(light)];
-
-    const vec3 L = ray._immutable.direction;
-#if POLYGON_METHOD==0
-    const float dist = ray._mutable.intersectionT;
-    return dist*dist/abs(dot(Triangle_getNormalTimesArea(tri),L));
-#else
-    const ImmutableRay_t _immutable = ray._immutable;
-    const mat3 sphericalVertices = nbl_glsl_shapes_getSphericalTriangle(mat3(tri.vertex0,tri.vertex1,tri.vertex2),_immutable.origin);
-    #if POLYGON_METHOD==1
-        const float rcpProb = nbl_glsl_shapes_SolidAngleOfTriangle(sphericalVertices);
-        // if `rcpProb` is NAN then the triangle's solid angle was close to 0.0 
-        return rcpProb>FLT_MIN ? (1.0/rcpProb):nbl_glsl_FLT_MAX;
-    #elif POLYGON_METHOD==2
-        const float pdf = nbl_glsl_sampling_probProjectedSphericalTriangleSample(sphericalVertices,_immutable.normalAtOrigin,_immutable.wasBSDFAtOrigin,L);
-        // if `pdf` is NAN then the triangle's projected solid angle was close to 0.0, if its close to INF then the triangle was very small
-        return pdf<nbl_glsl_FLT_MAX ? pdf:0.0;
-    #endif
-#endif
-}
-
-vec3 nbl_glsl_light_generate_and_pdf(out float pdf, out float newRayMaxT, in vec3 origin, in nbl_glsl_AnisotropicViewSurfaceInteraction interaction, in bool isBSDF, in vec3 xi, in uint objectID)
-{
-    const Triangle tri = triangles[objectID];
-    
-#if POLYGON_METHOD==0
-    const mat2x3 edges = mat2x3(tri.vertex1-tri.vertex0,tri.vertex2-tri.vertex0);
-    const float sqrtU = sqrt(xi.x);
-    vec3 point = tri.vertex0+edges[0]*(1.0-sqrtU)+edges[1]*sqrtU*xi.y;
-    const vec3 L = point-origin;
-    
-    const float distanceSq = dot(L,L);
-    const float rcpDistance = inversesqrt(distanceSq);
-    
-    pdf = distanceSq/abs(dot(Triangle_getNormalTimesArea_impl(edges),L));
-    newRayMaxT = 1.0/rcpDistance;
-    return L*rcpDistance;
-#else 
-    float rcpPdf;
-
-    const mat3 sphericalVertices = nbl_glsl_shapes_getSphericalTriangle(mat3(tri.vertex0,tri.vertex1,tri.vertex2),origin);
-#if POLYGON_METHOD==1
-    const vec3 L = nbl_glsl_sampling_generateSphericalTriangleSample(rcpPdf,sphericalVertices,xi.xy);
-#elif POLYGON_METHOD==2
-    const vec3 L = nbl_glsl_sampling_generateProjectedSphericalTriangleSample(rcpPdf,sphericalVertices,interaction.isotropic.N,isBSDF,xi.xy);
-#endif
-
-    // if `rcpProb` is NAN or negative then the triangle's solidAngle or projectedSolidAngle was close to 0.0 
-    pdf = rcpPdf>FLT_MIN ? (1.0/rcpPdf):0.0;
-
-    const vec3 N = Triangle_getNormalTimesArea(tri);
-    newRayMaxT = dot(N,tri.vertex0-origin)/dot(N,L);
-    return L;
-#endif
-}
-
-
-uint getBSDFLightIDAndDetermineNormal(out vec3 normal, in uint objectID, in vec3 intersection)
-{
-    if (objectID<SPHERE_COUNT)
-    {
-        Sphere sphere = spheres[objectID];
-        normal = Sphere_getNormal(sphere,intersection);
-        return sphere.bsdfLightIDs;
-    }
-    else
-    {
-        Triangle tri = triangles[objectID-SPHERE_COUNT];
-        normal = normalize(Triangle_getNormalTimesArea(tri));
-        return tri.bsdfLightIDs;
-    }
-}
\ No newline at end of file
diff --git a/56_RayQuery/main.cpp b/56_RayQuery/main.cpp
deleted file mode 100644
index fa5c4608d..000000000
--- a/56_RayQuery/main.cpp
+++ /dev/null
@@ -1,1156 +0,0 @@
-// Copyright (C) 2018-2020 - DevSH Graphics Programming Sp. z O.O.
-// Copyright (C) 2018-2020 - DevSH Graphics Programming Sp. z O.O.
-// This file is part of the "Nabla Engine".
-// For conditions of distribution and use, see copyright notice in nabla.h
-
-#define _NBL_STATIC_LIB_
-#include <nabla.h>
-
-#include "../common/CommonAPI.h"
-#include "CCamera.hpp"
-#include "nbl/ext/ScreenShot/ScreenShot.h"
-#include "nbl/video/utilities/CDumbPresentationOracle.h"
-
-using namespace nbl;
-using namespace core;
-using namespace ui;
-
-
-using namespace nbl;
-using namespace core;
-using namespace asset;
-using namespace video;
-
-smart_refctd_ptr<IGPUImageView> createHDRImageView(nbl::core::smart_refctd_ptr<nbl::video::ILogicalDevice> device, asset::E_FORMAT colorFormat, uint32_t width, uint32_t height)
-{
-	smart_refctd_ptr<IGPUImageView> gpuImageViewColorBuffer;
-	{
-		IGPUImage::SCreationParams imgInfo;
-		imgInfo.format = colorFormat;
-		imgInfo.type = IGPUImage::ET_2D;
-		imgInfo.extent.width = width;
-		imgInfo.extent.height = height;
-		imgInfo.extent.depth = 1u;
-		imgInfo.mipLevels = 1u;
-		imgInfo.arrayLayers = 1u;
-		imgInfo.samples = asset::ICPUImage::ESCF_1_BIT;
-		imgInfo.flags = static_cast<asset::IImage::E_CREATE_FLAGS>(0u);
-		imgInfo.usage = core::bitflag(asset::IImage::EUF_STORAGE_BIT) | asset::IImage::EUF_TRANSFER_SRC_BIT;
-
-		// (Erfan -> Cyprian)
-		// auto image = device->createGPUImageOnDedMem(std::move(imgInfo),device->getDeviceLocalGPUMemoryReqs());
-		auto image = device->createImage(std::move(imgInfo));
-		auto imageMemoryReqs = image->getMemoryReqs();
-		imageMemoryReqs.memoryTypeBits &= device->getPhysicalDevice()->getDeviceLocalMemoryTypeBits(); // getDeviceLocalMemoryTypeBits because of previous code getDeviceLocalGPUMemoryReqs
-		auto imageMem = device->allocate(imageMemoryReqs, image.get());
-
-		IGPUImageView::SCreationParams imgViewInfo;
-		imgViewInfo.image = std::move(image);
-		imgViewInfo.format = colorFormat;
-		imgViewInfo.viewType = IGPUImageView::ET_2D;
-		imgViewInfo.flags = static_cast<IGPUImageView::E_CREATE_FLAGS>(0u);
-		imgViewInfo.subresourceRange.aspectMask = IImage::E_ASPECT_FLAGS::EAF_COLOR_BIT;
-		imgViewInfo.subresourceRange.baseArrayLayer = 0u;
-		imgViewInfo.subresourceRange.baseMipLevel = 0u;
-		imgViewInfo.subresourceRange.layerCount = 1u;
-		imgViewInfo.subresourceRange.levelCount = 1u;
-
-		gpuImageViewColorBuffer = device->createImageView(std::move(imgViewInfo));
-	}
-
-	return gpuImageViewColorBuffer;
-}
-
-struct ShaderParameters
-{
-	const uint32_t MaxDepthLog2 = 4; //5
-	const uint32_t MaxSamplesLog2 = 10; //18
-} kShaderParameters;
-
-enum E_LIGHT_GEOMETRY
-{
-	ELG_SPHERE,
-	ELG_TRIANGLE,
-	ELG_RECTANGLE
-};
-
-struct DispatchInfo_t
-{
-	uint32_t workGroupCount[3];
-};
-
-_NBL_STATIC_INLINE_CONSTEXPR uint32_t DEFAULT_WORK_GROUP_SIZE = 16u;
-
-DispatchInfo_t getDispatchInfo(uint32_t imgWidth, uint32_t imgHeight) {
-	DispatchInfo_t ret = {};
-	ret.workGroupCount[0] = (uint32_t)core::ceil<float>((float)imgWidth / (float)DEFAULT_WORK_GROUP_SIZE);
-	ret.workGroupCount[1] = (uint32_t)core::ceil<float>((float)imgHeight / (float)DEFAULT_WORK_GROUP_SIZE);
-	ret.workGroupCount[2] = 1;
-	return ret;
-}
-
-class RayQuerySampleApp : public ApplicationBase
-{
-	constexpr static uint32_t WIN_W = 1280u;
-	constexpr static uint32_t WIN_H = 720u;
-	constexpr static uint32_t FRAMES_IN_FLIGHT = 5u;
-	static constexpr uint64_t MAX_TIMEOUT = 99999999999999ull;
-
-	core::smart_refctd_ptr<nbl::ui::IWindowManager> windowManager;
-	core::smart_refctd_ptr<nbl::ui::IWindow> window;
-	core::smart_refctd_ptr<CommonAPI::CommonAPIEventCallback> windowCb;
-	core::smart_refctd_ptr<nbl::video::IAPIConnection> apiConnection;
-	core::smart_refctd_ptr<nbl::video::ISurface> surface;
-	core::smart_refctd_ptr<nbl::video::IUtilities> utilities;
-	core::smart_refctd_ptr<nbl::video::ILogicalDevice> logicalDevice;
-	video::IPhysicalDevice* physicalDevice;
-	std::array<video::IGPUQueue*, CommonAPI::InitOutput::MaxQueuesCount> queues;
-	core::smart_refctd_ptr<nbl::video::ISwapchain> swapchain;
-	core::smart_refctd_ptr<nbl::video::IGPURenderpass> renderpass;
-	core::smart_refctd_dynamic_array<core::smart_refctd_ptr<video::IGPUFramebuffer>> fbos;
-	std::array<std::array<nbl::core::smart_refctd_ptr<nbl::video::IGPUCommandPool>, CommonAPI::InitOutput::MaxFramesInFlight>, CommonAPI::InitOutput::MaxQueuesCount> commandPools;
-	core::smart_refctd_ptr<nbl::system::ISystem> system;
-	core::smart_refctd_ptr<nbl::asset::IAssetManager> assetManager;
-	video::IGPUObjectFromAssetConverter::SParams cpu2gpuParams;
-	core::smart_refctd_ptr<nbl::system::ILogger> logger;
-	core::smart_refctd_ptr<CommonAPI::InputSystem> inputSystem;
-	video::IGPUObjectFromAssetConverter cpu2gpu;
-	
-	int32_t m_resourceIx = -1;
-	uint32_t m_acquiredNextFBO = {};
-
-	CDumbPresentationOracle oracle;
-	
-	// polling for events!
-	CommonAPI::InputSystem::ChannelReader<IMouseEventChannel> mouse;
-	CommonAPI::InputSystem::ChannelReader<IKeyboardEventChannel> keyboard;
-	
-	core::smart_refctd_ptr<video::IGPUFence> frameUploadDataCompleteFence[FRAMES_IN_FLIGHT] = { nullptr };
-	core::smart_refctd_ptr<video::IGPUFence> frameComplete[FRAMES_IN_FLIGHT] = { nullptr };
-	core::smart_refctd_ptr<video::IGPUSemaphore> imageAcquire[FRAMES_IN_FLIGHT] = { nullptr };
-	core::smart_refctd_ptr<video::IGPUSemaphore> renderFinished[FRAMES_IN_FLIGHT] = { nullptr };
-	core::smart_refctd_ptr<video::IGPUSemaphore> frameUploadDataCompleteSemaphore[FRAMES_IN_FLIGHT] = { nullptr };
-
-	core::smart_refctd_ptr<video::IGPUCommandBuffer> cmdbuf[FRAMES_IN_FLIGHT]; // from graphics
-
-	Camera cam;
-	
-	core::smart_refctd_ptr<video::IGPUBuffer> gpuubo = nullptr;
-	core::smart_refctd_ptr<video::IGPUImageView> gpuEnvmapImageView = nullptr;
-	core::smart_refctd_ptr<IGPUImageView> gpuScrambleImageView;
-
-	core::smart_refctd_ptr<video::IGPUComputePipeline> gpuComputePipeline = nullptr;
-	DispatchInfo_t dispatchInfo = {};
-	
-	core::smart_refctd_ptr<video::IGPUImageView> outHDRImageViews[CommonAPI::InitOutput::MaxSwapChainImageCount] = {};
-
-	core::smart_refctd_ptr<video::IGPUDescriptorSet> descriptorSets0[CommonAPI::InitOutput::MaxSwapChainImageCount] = {};
-	core::smart_refctd_ptr<video::IGPUDescriptorSet> descriptorSet2 = nullptr;
-	core::smart_refctd_ptr<video::IGPUDescriptorSet> uboDescriptorSet1 = nullptr;
-	
-	core::smart_refctd_ptr<IGPUBuffer> aabbsBuffer = nullptr;
-	core::smart_refctd_ptr<IGPUAccelerationStructure> gpuBlas = nullptr;
-	core::smart_refctd_ptr<IGPUAccelerationStructure> gpuBlas2 = nullptr; // Built via CPUObject To GPUObject operations and utility
-	core::smart_refctd_ptr<IGPUAccelerationStructure> gpuTlas = nullptr;
-	core::smart_refctd_ptr<IGPUBuffer> instancesBuffer = nullptr;
-	
-	core::smart_refctd_ptr<IGPUBufferView> gpuSequenceBufferView = nullptr;
-	
-	core::smart_refctd_ptr<video::IGPUSampler> sampler0 = nullptr;
-	core::smart_refctd_ptr<video::IGPUSampler> sampler1 = nullptr;
-	
-	core::smart_refctd_ptr<IGPUBuffer> gpuSequenceBuffer = nullptr;
-	
-	core::smart_refctd_ptr<IGPUBuffer> spheresBuffer = nullptr;
-
-	struct SBasicViewParametersAligned
-	{
-		SBasicViewParameters uboData;
-	};
-
-public:
-	void setWindow(core::smart_refctd_ptr<nbl::ui::IWindow>&& wnd) override
-	{
-		window = std::move(wnd);
-	}
-	nbl::ui::IWindow* getWindow() override
-	{
-		return window.get();
-	}
-	void setSystem(core::smart_refctd_ptr<nbl::system::ISystem>&& system) override
-	{
-		system = std::move(system);
-	}
-
-	APP_CONSTRUCTOR(RayQuerySampleApp);
-	
-	void onAppInitialized_impl() override
-	{
-		const auto swapchainImageUsage = static_cast<asset::IImage::E_USAGE_FLAGS>(asset::IImage::EUF_COLOR_ATTACHMENT_BIT | asset::IImage::EUF_TRANSFER_DST_BIT | asset::IImage::EUF_TRANSFER_SRC_BIT);
-
-		CommonAPI::InitParams initParams;
-		initParams.window = core::smart_refctd_ptr(window);
-		initParams.apiType = video::EAT_VULKAN;
-		initParams.appName = { _NBL_APP_NAME_ };
-		initParams.framesInFlight = FRAMES_IN_FLIGHT;
-		initParams.windowWidth = WIN_W;
-		initParams.windowHeight = WIN_H;
-		initParams.swapchainImageCount = 2u;
-		initParams.swapchainImageUsage = swapchainImageUsage;
-		initParams.depthFormat = asset::EF_D32_SFLOAT;
-		auto initOutput = CommonAPI::InitWithRaytracingExt(std::move(initParams));
-
-		system = std::move(initOutput.system);
-		window = std::move(initParams.window);
-		windowCb = std::move(initParams.windowCb);
-		apiConnection = std::move(initOutput.apiConnection);
-		surface = std::move(initOutput.surface);
-		physicalDevice = std::move(initOutput.physicalDevice);
-		logicalDevice = std::move(initOutput.logicalDevice);
-		utilities = std::move(initOutput.utilities);
-		queues = std::move(initOutput.queues);
-		renderpass = std::move(initOutput.renderToSwapchainRenderpass);
-		commandPools = std::move(initOutput.commandPools);
-		assetManager = std::move(initOutput.assetManager);
-		cpu2gpuParams = std::move(initOutput.cpu2gpuParams);
-		logger = std::move(initOutput.logger);
-		inputSystem = std::move(initOutput.inputSystem);
-		
-		CommonAPI::createSwapchain(std::move(logicalDevice), initOutput.swapchainCreationParams, WIN_W, WIN_H, swapchain);
-		assert(swapchain);
-		fbos = CommonAPI::createFBOWithSwapchainImages(
-			swapchain->getImageCount(), WIN_W, WIN_H,
-			logicalDevice, swapchain, renderpass,
-			asset::EF_D32_SFLOAT
-		);
-		auto graphicsQueue = queues[CommonAPI::InitOutput::EQT_GRAPHICS];
-		auto computeQueue = queues[CommonAPI::InitOutput::EQT_GRAPHICS];
-		auto graphicsCommandPools = commandPools[CommonAPI::InitOutput::EQT_GRAPHICS];
-		auto computeCommandPools =  commandPools[CommonAPI::InitOutput::EQT_COMPUTE];
-
-		video::IGPUObjectFromAssetConverter cpu2gpu;	
-		for (uint32_t i = 0u; i < FRAMES_IN_FLIGHT; i++)
-			logicalDevice->createCommandBuffers(graphicsCommandPools[i].get(), video::IGPUCommandBuffer::EL_PRIMARY, 1, cmdbuf+i);
-
-		core::smart_refctd_ptr<video::IDescriptorPool> descriptorPool = nullptr;
-		{
-			video::IDescriptorPool::SCreateInfo createInfo = {};
-			createInfo.maxSets = CommonAPI::InitOutput::MaxSwapChainImageCount+2;
-			createInfo.maxDescriptorCount[static_cast<uint32_t>(asset::IDescriptor::E_TYPE::ET_STORAGE_BUFFER)] = 1;
-			createInfo.maxDescriptorCount[static_cast<uint32_t>(asset::IDescriptor::E_TYPE::ET_STORAGE_IMAGE)] = CommonAPI::InitOutput::MaxSwapChainImageCount;
-			createInfo.maxDescriptorCount[static_cast<uint32_t>(asset::IDescriptor::E_TYPE::ET_COMBINED_IMAGE_SAMPLER)] = 2;
-			createInfo.maxDescriptorCount[static_cast<uint32_t>(asset::IDescriptor::E_TYPE::ET_UNIFORM_TEXEL_BUFFER)] = 1;
-			createInfo.maxDescriptorCount[static_cast<uint32_t>(asset::IDescriptor::E_TYPE::ET_UNIFORM_BUFFER)] = 1;
-			createInfo.maxDescriptorCount[static_cast<uint32_t>(asset::IDescriptor::E_TYPE::ET_ACCELERATION_STRUCTURE)] = 1;
-			
-			descriptorPool = logicalDevice->createDescriptorPool(std::move(createInfo));
-		}
-
-		// Initialize Spheres
-		constexpr uint32_t SphereCount = 9u;
-		constexpr uint32_t INVALID_ID_16BIT = 0xffffu;
-
-		struct alignas(16) Sphere
-		{
-			Sphere()
-				: position(0.0f, 0.0f, 0.0f)
-				, radius2(0.0f)
-			{
-				bsdfLightIDs = core::bitfieldInsert<uint32_t>(0u,INVALID_ID_16BIT,16,16);
-			}
-
-			Sphere(core::vector3df _position, float _radius, uint32_t _bsdfID, uint32_t _lightID)
-			{
-				position = _position;
-				radius2 = _radius*_radius;
-				bsdfLightIDs = core::bitfieldInsert(_bsdfID,_lightID,16,16);
-			}
-
-			IGPUAccelerationStructure::AABB_Position getAABB() const
-			{
-				float radius = core::sqrt(radius2);
-				return IGPUAccelerationStructure::AABB_Position(position-core::vector3df(radius, radius, radius), position+core::vector3df(radius, radius, radius));
-			}
-
-			core::vector3df position;
-			float radius2;
-			uint32_t bsdfLightIDs;
-		};
-	
-		Sphere spheres[SphereCount] = {};
-		spheres[0] = Sphere(core::vector3df(0.0,-100.5,-1.0), 100.0, 0u, INVALID_ID_16BIT);
-		spheres[1] = Sphere(core::vector3df(3.0,0.0,-1.0), 0.5,	 1u, INVALID_ID_16BIT);
-		spheres[2] = Sphere(core::vector3df(0.0,0.0,-1.0), 0.5,	 2u, INVALID_ID_16BIT);
-		spheres[3] = Sphere(core::vector3df(-3.0,0.0,-1.0), 0.5, 3u, INVALID_ID_16BIT);
-		spheres[4] = Sphere(core::vector3df(3.0,0.0,1.0), 0.5,	 4u, INVALID_ID_16BIT);
-		spheres[5] = Sphere(core::vector3df(0.0,0.0,1.0), 0.5,	 4u, INVALID_ID_16BIT);
-		spheres[6] = Sphere(core::vector3df(-3.0,0.0,1.0), 0.5,	 5u, INVALID_ID_16BIT);
-		spheres[7] = Sphere(core::vector3df(0.5,1.0,0.5), 0.5,	 6u, INVALID_ID_16BIT);
-		spheres[8] = Sphere(core::vector3df(-1.5,1.5,0.0), 0.3,  INVALID_ID_16BIT, 0u);
-
-		// Create Spheres Buffer
-		uint32_t spheresBufferSize = sizeof(Sphere) * SphereCount;
-
-		{
-			IGPUBuffer::SCreationParams params = {};
-			params.size = spheresBufferSize; // (Erfan->Cyprian) See How I moved "createDeviceLocalGPUBufferOnDedMem" second parameter to params.size? IGPUBuffer::SCreationParams::size is very important to be filled unlike before
-			params.usage = core::bitflag(asset::IBuffer::EUF_STORAGE_BUFFER_BIT) | asset::IBuffer::EUF_TRANSFER_DST_BIT; 
-			spheresBuffer = logicalDevice->createBuffer(std::move(params));
-			auto bufferReqs = spheresBuffer->getMemoryReqs();
-			bufferReqs.memoryTypeBits &= logicalDevice->getPhysicalDevice()->getDeviceLocalMemoryTypeBits(); // (Erfan->Cyprian) I used `getDeviceLocalMemoryTypeBits` because of previous createDeviceLocalGPUBufferOnDedMem (Focus on DeviceLocal Part)
-			auto spheresBufferMem = logicalDevice->allocate(bufferReqs, spheresBuffer.get());
-			utilities->updateBufferRangeViaStagingBufferAutoSubmit(asset::SBufferRange<IGPUBuffer>{0u,spheresBufferSize,spheresBuffer}, spheres, graphicsQueue);
-		}
-
-#define TEST_CPU_2_GPU_BLAS
-#ifdef TEST_CPU_2_GPU_BLAS
-		// Acceleration Structure Test
-		// Create + Build BLAS (CPU2GPU Version)
-		{
-			struct AABB {
-				IGPUAccelerationStructure::AABB_Position aabb;
-			};
-			const uint32_t aabbsCount = SphereCount / 2u;
-			uint32_t aabbsBufferSize = sizeof(AABB) * aabbsCount;
-		
-			AABB aabbs[aabbsCount] = {};
-			for(uint32_t i = 0; i < aabbsCount; ++i)
-			{
-				aabbs[i].aabb = spheres[i].getAABB();
-			}
-		
-			// auto raytracingFlags = core::bitflag(asset::IBuffer::EUF_ACCELERATION_STRUCTURE_BUILD_INPUT_READ_ONLY_BIT) | asset::IBuffer::EUF_STORAGE_BUFFER_BIT;
-			// | asset::IBuffer::EUF_TRANSFER_DST_BIT | asset::IBuffer::EUF_SHADER_DEVICE_ADDRESS_BIT
-			core::smart_refctd_ptr<ICPUBuffer> aabbsBuffer = ICPUBuffer::create({ aabbsBufferSize });
-			memcpy(aabbsBuffer->getPointer(), aabbs, aabbsBufferSize);
-
-			ICPUAccelerationStructure::SCreationParams asCreateParams;
-			asCreateParams.type = ICPUAccelerationStructure::ET_BOTTOM_LEVEL;
-			asCreateParams.flags = ICPUAccelerationStructure::ECF_NONE;
-			core::smart_refctd_ptr<ICPUAccelerationStructure> cpuBlas = ICPUAccelerationStructure::create(std::move(asCreateParams));
-		
-			using HostGeom = ICPUAccelerationStructure::HostBuildGeometryInfo::Geom;
-			core::smart_refctd_dynamic_array<HostGeom> geometries = core::make_refctd_dynamic_array<core::smart_refctd_dynamic_array<HostGeom>>(1u);
-
-			HostGeom & simpleGeom = geometries->operator[](0u);
-			simpleGeom.type = IAccelerationStructure::EGT_AABBS;
-			simpleGeom.flags = IAccelerationStructure::EGF_OPAQUE_BIT;
-			simpleGeom.data.aabbs.data.offset = 0u;
-			simpleGeom.data.aabbs.data.buffer = aabbsBuffer;
-			simpleGeom.data.aabbs.stride = sizeof(AABB);
-
-			ICPUAccelerationStructure::HostBuildGeometryInfo buildInfo;
-			buildInfo.type = asCreateParams.type;
-			buildInfo.buildFlags = ICPUAccelerationStructure::EBF_PREFER_FAST_TRACE_BIT;
-			buildInfo.buildMode = ICPUAccelerationStructure::EBM_BUILD;
-			buildInfo.geometries = geometries;
-
-			core::smart_refctd_dynamic_array<ICPUAccelerationStructure::BuildRangeInfo> buildRangeInfos = core::make_refctd_dynamic_array<core::smart_refctd_dynamic_array<ICPUAccelerationStructure::BuildRangeInfo>>(1u);
-			ICPUAccelerationStructure::BuildRangeInfo & firstBuildRangeInfo = buildRangeInfos->operator[](0u);
-			firstBuildRangeInfo.primitiveCount = aabbsCount;
-			firstBuildRangeInfo.primitiveOffset = 0u;
-			firstBuildRangeInfo.firstVertex = 0u;
-			firstBuildRangeInfo.transformOffset = 0u;
-
-			cpuBlas->setBuildInfoAndRanges(std::move(buildInfo), buildRangeInfos);
-
-			// Build BLAS 
-			{
-				cpu2gpuParams.beginCommandBuffers();
-				gpuBlas2 = cpu2gpu.getGPUObjectsFromAssets(&cpuBlas, &cpuBlas + 1u, cpu2gpuParams)->front();
-				cpu2gpuParams.waitForCreationToComplete();
-			}
-		}
-#endif
-
-		// Create + Build BLAS
-		{
-			// Build BLAS with AABBS
-			const uint32_t aabbsCount = SphereCount;
-
-			struct AABB {
-				IGPUAccelerationStructure::AABB_Position aabb;
-			};
-	
-			AABB aabbs[aabbsCount] = {};
-			for(uint32_t i = 0; i < aabbsCount; ++i)
-			{
-				aabbs[i].aabb = spheres[i].getAABB();
-			}
-			auto raytracingFlags = core::bitflag(asset::IBuffer::EUF_ACCELERATION_STRUCTURE_BUILD_INPUT_READ_ONLY_BIT) | asset::IBuffer::EUF_STORAGE_BUFFER_BIT;
-			uint32_t aabbsBufferSize = sizeof(AABB) * aabbsCount;
-
-			{
-				IGPUBuffer::SCreationParams params = {};
-				params.size = aabbsBufferSize;
-				params.usage = raytracingFlags | asset::IBuffer::EUF_TRANSFER_DST_BIT | asset::IBuffer::EUF_SHADER_DEVICE_ADDRESS_BIT; 
-				aabbsBuffer = logicalDevice->createBuffer(std::move(params));
-				auto bufferReqs = aabbsBuffer->getMemoryReqs();
-				bufferReqs.memoryTypeBits &= logicalDevice->getPhysicalDevice()->getDeviceLocalMemoryTypeBits();
-				auto aabbBufferMem = logicalDevice->allocate(bufferReqs, aabbsBuffer.get(), IDeviceMemoryAllocation::EMAF_DEVICE_ADDRESS_BIT);
-				// (Erfan->Cyprian) -> I passed `IDeviceMemoryAllocation::EMAF_DEVICE_ADDRESS_BIT` as a third parameter to the allocate function because the buffer needs the usage `EUF_SHADER_DEVICE_ADDRESS_BIT`
-				//		You don't have to worry about it, it's only used in this example
-				utilities->updateBufferRangeViaStagingBufferAutoSubmit(asset::SBufferRange<IGPUBuffer>{0u,aabbsBufferSize,aabbsBuffer}, aabbs, graphicsQueue);
-			}
-
-			using DeviceGeom = IGPUAccelerationStructure::DeviceBuildGeometryInfo::Geometry;
-
-			DeviceGeom simpleGeom = {};
-			simpleGeom.type = IAccelerationStructure::EGT_AABBS;
-			simpleGeom.flags = IAccelerationStructure::EGF_OPAQUE_BIT;
-			simpleGeom.data.aabbs.data.offset = 0u;
-			simpleGeom.data.aabbs.data.buffer = aabbsBuffer;
-			simpleGeom.data.aabbs.stride = sizeof(AABB);
-
-			IGPUAccelerationStructure::DeviceBuildGeometryInfo blasBuildInfo = {};
-			blasBuildInfo.type = IGPUAccelerationStructure::ET_BOTTOM_LEVEL;
-			blasBuildInfo.buildFlags = IGPUAccelerationStructure::EBF_PREFER_FAST_TRACE_BIT;
-			blasBuildInfo.buildMode = IGPUAccelerationStructure::EBM_BUILD;
-			blasBuildInfo.srcAS = nullptr;
-			blasBuildInfo.dstAS = nullptr;
-			blasBuildInfo.geometries = core::SRange<DeviceGeom>(&simpleGeom, &simpleGeom + 1u);
-			blasBuildInfo.scratchAddr = {};
-	
-			// Get BuildSizes
-			IGPUAccelerationStructure::BuildSizes buildSizes = {};
-			{
-				std::vector<uint32_t> maxPrimCount(1u);
-				maxPrimCount[0] = aabbsCount;
-				buildSizes = logicalDevice->getAccelerationStructureBuildSizes(blasBuildInfo, maxPrimCount.data());
-			}
-	
-			{
-				core::smart_refctd_ptr<IGPUBuffer> asBuffer;
-				IGPUBuffer::SCreationParams params = {};
-				params.size = buildSizes.accelerationStructureSize;
-				params.usage = core::bitflag(asset::IBuffer::EUF_SHADER_DEVICE_ADDRESS_BIT) | asset::IBuffer::EUF_ACCELERATION_STRUCTURE_STORAGE_BIT;
-				asBuffer = logicalDevice->createBuffer(std::move(params));
-				auto bufferReqs = asBuffer->getMemoryReqs();
-				bufferReqs.memoryTypeBits &= logicalDevice->getPhysicalDevice()->getDeviceLocalMemoryTypeBits();
-				auto asBufferMem = logicalDevice->allocate(bufferReqs, asBuffer.get(), IDeviceMemoryAllocation::EMAF_DEVICE_ADDRESS_BIT);
-
-				IGPUAccelerationStructure::SCreationParams blasParams = {};
-				blasParams.type = IGPUAccelerationStructure::ET_BOTTOM_LEVEL;
-				blasParams.flags = IGPUAccelerationStructure::ECF_NONE;
-				blasParams.bufferRange.buffer = asBuffer;
-				blasParams.bufferRange.offset = 0u;
-				blasParams.bufferRange.size = buildSizes.accelerationStructureSize;
-				gpuBlas = logicalDevice->createAccelerationStructure(std::move(blasParams));
-			}
-
-			// Allocate ScratchBuffer
-			core::smart_refctd_ptr<IGPUBuffer> scratchBuffer;
-			{
-				IGPUBuffer::SCreationParams params = {};
-				params.size = buildSizes.buildScratchSize;
-				params.usage = core::bitflag(asset::IBuffer::EUF_SHADER_DEVICE_ADDRESS_BIT) | asset::IBuffer::EUF_STORAGE_BUFFER_BIT; 
-				scratchBuffer = logicalDevice->createBuffer(std::move(params));
-				auto bufferReqs = scratchBuffer->getMemoryReqs();
-				bufferReqs.memoryTypeBits &= logicalDevice->getPhysicalDevice()->getDeviceLocalMemoryTypeBits();
-				auto scratchBufferMem = logicalDevice->allocate(bufferReqs, scratchBuffer.get(), IDeviceMemoryAllocation::EMAF_DEVICE_ADDRESS_BIT);
-			}
-
-			// Complete BLAS Build Info
-			{
-				blasBuildInfo.dstAS = gpuBlas.get();
-				blasBuildInfo.scratchAddr.buffer = scratchBuffer;
-				blasBuildInfo.scratchAddr.offset = 0u;
-			}
-	
-			IGPUAccelerationStructure::BuildRangeInfo firstBuildRangeInfos[1u];
-			firstBuildRangeInfos[0].primitiveCount = aabbsCount;
-			firstBuildRangeInfos[0].primitiveOffset = 0u;
-			firstBuildRangeInfos[0].firstVertex = 0u;
-			firstBuildRangeInfos[0].transformOffset = 0u;
-			IGPUAccelerationStructure::BuildRangeInfo* pRangeInfos[1u];
-			pRangeInfos[0] = firstBuildRangeInfos;
-			// pRangeInfos[1] = &secondBuildRangeInfos;
-
-			// Build BLAS 
-			{
-				utilities->buildAccelerationStructures(computeQueue, core::SRange<IGPUAccelerationStructure::DeviceBuildGeometryInfo>(&blasBuildInfo, &blasBuildInfo + 1u), pRangeInfos);
-			}
-		}
-	
-		// Create + Build TLAS
-		{
-			struct Instance {
-				IGPUAccelerationStructure::Instance instance;
-			};
-
-			const uint32_t instancesCount = 1u;
-			Instance instances[instancesCount] = {};
-			core::matrix3x4SIMD identity;
-			instances[0].instance.mat = identity;
-			instances[0].instance.instanceCustomIndex = 0u;
-			instances[0].instance.mask = 0xFF;
-			instances[0].instance.instanceShaderBindingTableRecordOffset = 0u;
-			instances[0].instance.flags = IAccelerationStructure::EIF_TRIANGLE_FACING_CULL_DISABLE_BIT;
-#ifdef TEST_CPU_2_GPU_BLAS
-			instances[0].instance.accelerationStructureReference = gpuBlas2->getReferenceForDeviceOperations();
-#else
-			instances[0].instance.accelerationStructureReference = gpuBlas->getReferenceForDeviceOperations();
-#endif
-			auto raytracingFlags = core::bitflag(asset::IBuffer::EUF_ACCELERATION_STRUCTURE_BUILD_INPUT_READ_ONLY_BIT) | asset::IBuffer::EUF_STORAGE_BUFFER_BIT;
-	
-			uint32_t instancesBufferSize = sizeof(Instance);
-			{
-				IGPUBuffer::SCreationParams params = {};
-				params.size = instancesBufferSize;
-				params.usage = raytracingFlags | asset::IBuffer::EUF_TRANSFER_DST_BIT | asset::IBuffer::EUF_SHADER_DEVICE_ADDRESS_BIT; 
-				instancesBuffer = logicalDevice->createBuffer(std::move(params));
-				auto bufferReqs = instancesBuffer->getMemoryReqs();
-				bufferReqs.memoryTypeBits &= logicalDevice->getPhysicalDevice()->getDeviceLocalMemoryTypeBits();
-				auto instancesBufferMem = logicalDevice->allocate(bufferReqs, instancesBuffer.get(), IDeviceMemoryAllocation::EMAF_DEVICE_ADDRESS_BIT);
-				utilities->updateBufferRangeViaStagingBufferAutoSubmit(asset::SBufferRange<IGPUBuffer>{0u,instancesBufferSize,instancesBuffer}, instances, graphicsQueue);
-			}
-		
-			using DeviceGeom = IGPUAccelerationStructure::DeviceBuildGeometryInfo::Geometry;
-
-			DeviceGeom blasInstancesGeom = {};
-			blasInstancesGeom.type = IAccelerationStructure::EGT_INSTANCES;
-			blasInstancesGeom.flags = IAccelerationStructure::EGF_NONE;
-			blasInstancesGeom.data.instances.data.offset = 0u;
-			blasInstancesGeom.data.instances.data.buffer = instancesBuffer;
-
-			IGPUAccelerationStructure::DeviceBuildGeometryInfo tlasBuildInfo = {};
-			tlasBuildInfo.type = IGPUAccelerationStructure::ET_TOP_LEVEL;
-			tlasBuildInfo.buildFlags = IGPUAccelerationStructure::EBF_PREFER_FAST_TRACE_BIT;
-			tlasBuildInfo.buildMode = IGPUAccelerationStructure::EBM_BUILD;
-			tlasBuildInfo.srcAS = nullptr;
-			tlasBuildInfo.dstAS = nullptr;
-			tlasBuildInfo.geometries = core::SRange<DeviceGeom>(&blasInstancesGeom, &blasInstancesGeom + 1u);
-			tlasBuildInfo.scratchAddr = {};
-			
-			// Get BuildSizes
-			IGPUAccelerationStructure::BuildSizes buildSizes = {};
-			{
-				std::vector<uint32_t> maxPrimCount(1u); 
-				maxPrimCount[0] = instancesCount;
-				buildSizes = logicalDevice->getAccelerationStructureBuildSizes(tlasBuildInfo, maxPrimCount.data());
-			}
-	
-			{
-				core::smart_refctd_ptr<IGPUBuffer> asBuffer;
-				IGPUBuffer::SCreationParams params = {};
-				params.size = buildSizes.accelerationStructureSize;
-				params.usage = core::bitflag(asset::IBuffer::EUF_SHADER_DEVICE_ADDRESS_BIT) | asset::IBuffer::EUF_ACCELERATION_STRUCTURE_STORAGE_BIT; 
-				asBuffer = logicalDevice->createBuffer(std::move(params));
-				auto bufferReqs = asBuffer->getMemoryReqs();
-				bufferReqs.memoryTypeBits &= logicalDevice->getPhysicalDevice()->getDeviceLocalMemoryTypeBits();
-				auto asBufferMem = logicalDevice->allocate(bufferReqs, asBuffer.get(), IDeviceMemoryAllocation::EMAF_DEVICE_ADDRESS_BIT);
-
-				IGPUAccelerationStructure::SCreationParams tlasParams = {};
-				tlasParams.type = IGPUAccelerationStructure::ET_TOP_LEVEL;
-				tlasParams.flags = IGPUAccelerationStructure::ECF_NONE;
-				tlasParams.bufferRange.buffer = asBuffer;
-				tlasParams.bufferRange.offset = 0u;
-				tlasParams.bufferRange.size = buildSizes.accelerationStructureSize;
-				gpuTlas = logicalDevice->createAccelerationStructure(std::move(tlasParams));
-			}
-
-			// Allocate ScratchBuffer
-			core::smart_refctd_ptr<IGPUBuffer> scratchBuffer;
-			{
-				IGPUBuffer::SCreationParams params = {};
-				params.size = buildSizes.buildScratchSize;
-				params.usage = core::bitflag(asset::IBuffer::EUF_SHADER_DEVICE_ADDRESS_BIT) | asset::IBuffer::EUF_STORAGE_BUFFER_BIT; 
-				scratchBuffer = logicalDevice->createBuffer(std::move(params));
-				auto bufferReqs = scratchBuffer->getMemoryReqs();
-				bufferReqs.memoryTypeBits &= logicalDevice->getPhysicalDevice()->getDeviceLocalMemoryTypeBits();
-				auto scratchBufferMem = logicalDevice->allocate(bufferReqs, scratchBuffer.get(), IDeviceMemoryAllocation::EMAF_DEVICE_ADDRESS_BIT);
-			}
-
-			// Complete BLAS Build Info
-			{
-				tlasBuildInfo.dstAS = gpuTlas.get();
-				tlasBuildInfo.scratchAddr.buffer = scratchBuffer;
-				tlasBuildInfo.scratchAddr.offset = 0u;
-			}
-		
-			IGPUAccelerationStructure::BuildRangeInfo firstBuildRangeInfos[1u];
-			firstBuildRangeInfos[0].primitiveCount = instancesCount;
-			firstBuildRangeInfos[0].primitiveOffset = 0u;
-			firstBuildRangeInfos[0].firstVertex = 0u;
-			firstBuildRangeInfos[0].transformOffset = 0u;
-			IGPUAccelerationStructure::BuildRangeInfo* pRangeInfos[1u];
-			pRangeInfos[0] = firstBuildRangeInfos;
-
-			// Build TLAS 
-			{
-				utilities->buildAccelerationStructures(computeQueue, core::SRange<IGPUAccelerationStructure::DeviceBuildGeometryInfo>(&tlasBuildInfo, &tlasBuildInfo + 1u), pRangeInfos);
-			}
-		}
-	
-
-		// Camera 
-		core::vectorSIMDf cameraPosition(0, 5, -10);
-		matrix4SIMD proj = matrix4SIMD::buildProjectionMatrixPerspectiveFovRH(core::radians(60.0f), video::ISurface::getTransformedAspectRatio(swapchain->getPreTransform(), WIN_W, WIN_H), 0.01f, 500.0f);
-		cam = Camera(cameraPosition, core::vectorSIMDf(0, 0, 0), proj);
-
-		IGPUDescriptorSetLayout::SBinding descriptorSet0Bindings[] =
-		{
-			{ 0u, asset::IDescriptor::E_TYPE::ET_STORAGE_IMAGE, video::IGPUDescriptorSetLayout::SBinding::E_CREATE_FLAGS::ECF_NONE, IShader::ESS_COMPUTE, 1u, nullptr },
-		};
-		IGPUDescriptorSetLayout::SBinding uboBinding {0, asset::IDescriptor::E_TYPE::ET_UNIFORM_BUFFER, video::IGPUDescriptorSetLayout::SBinding::E_CREATE_FLAGS::ECF_NONE, IShader::ESS_COMPUTE, 1u, nullptr};
-		IGPUDescriptorSetLayout::SBinding descriptorSet3Bindings[] = {
-			{ 0u, asset::IDescriptor::E_TYPE::ET_COMBINED_IMAGE_SAMPLER,	video::IGPUDescriptorSetLayout::SBinding::E_CREATE_FLAGS::ECF_NONE, IShader::ESS_COMPUTE, 1u, nullptr },
-			{ 1u, asset::IDescriptor::E_TYPE::ET_UNIFORM_TEXEL_BUFFER,		video::IGPUDescriptorSetLayout::SBinding::E_CREATE_FLAGS::ECF_NONE, IShader::ESS_COMPUTE, 1u, nullptr },
-			{ 2u, asset::IDescriptor::E_TYPE::ET_COMBINED_IMAGE_SAMPLER,	video::IGPUDescriptorSetLayout::SBinding::E_CREATE_FLAGS::ECF_NONE, IShader::ESS_COMPUTE, 1u, nullptr },
-			{ 3u, asset::IDescriptor::E_TYPE::ET_ACCELERATION_STRUCTURE,	video::IGPUDescriptorSetLayout::SBinding::E_CREATE_FLAGS::ECF_NONE, IShader::ESS_COMPUTE, 1u, nullptr },
-			{ 4u, asset::IDescriptor::E_TYPE::ET_STORAGE_BUFFER,			video::IGPUDescriptorSetLayout::SBinding::E_CREATE_FLAGS::ECF_NONE, IShader::ESS_COMPUTE, 1u, nullptr }
-		};
-	
-		auto gpuDescriptorSetLayout0 = logicalDevice->createDescriptorSetLayout(descriptorSet0Bindings, descriptorSet0Bindings + 1u);
-		auto gpuDescriptorSetLayout1 = logicalDevice->createDescriptorSetLayout(&uboBinding, &uboBinding + 1u);
-		auto gpuDescriptorSetLayout2 = logicalDevice->createDescriptorSetLayout(descriptorSet3Bindings, descriptorSet3Bindings+5u);
-
-		auto createGpuResources = [&](std::string pathToShader) -> core::smart_refctd_ptr<video::IGPUComputePipeline>
-		{
-			asset::IAssetLoader::SAssetLoadParams params{};
-			params.logger = logger.get();
-			//params.relativeDir = tmp.c_str();
-			auto spec = assetManager->getAsset(pathToShader,params).getContents();
-		
-			if (spec.empty())
-				assert(false);
-
-			auto cpuComputeSpecializedShader = core::smart_refctd_ptr_static_cast<asset::ICPUSpecializedShader>(*spec.begin());
-
-			ISpecializedShader::SInfo info = cpuComputeSpecializedShader->getSpecializationInfo();
-			info.m_backingBuffer = ICPUBuffer::create({ sizeof(ShaderParameters) });
-			memcpy(info.m_backingBuffer->getPointer(),&kShaderParameters,sizeof(ShaderParameters));
-			info.m_entries = core::make_refctd_dynamic_array<core::smart_refctd_dynamic_array<ISpecializedShader::SInfo::SMapEntry>>(2u);
-			for (uint32_t i=0; i<2; i++)
-				info.m_entries->operator[](i) = {i,i*sizeof(uint32_t),sizeof(uint32_t)};
-
-
-			cpuComputeSpecializedShader->setSpecializationInfo(std::move(info));
-
-			auto gpuComputeSpecializedShader = cpu2gpu.getGPUObjectsFromAssets(&cpuComputeSpecializedShader, &cpuComputeSpecializedShader + 1, cpu2gpuParams)->front();
-
-			auto gpuPipelineLayout = logicalDevice->createPipelineLayout(nullptr, nullptr, core::smart_refctd_ptr(gpuDescriptorSetLayout0), core::smart_refctd_ptr(gpuDescriptorSetLayout1), core::smart_refctd_ptr(gpuDescriptorSetLayout2), nullptr);
-
-			auto gpuPipeline = logicalDevice->createComputePipeline(nullptr, std::move(gpuPipelineLayout), std::move(gpuComputeSpecializedShader));
-
-			return gpuPipeline;
-		};
-
-		E_LIGHT_GEOMETRY lightGeom = ELG_SPHERE;
-		constexpr const char* shaderPaths[] = {"../litBySphere.comp","../litByTriangle.comp","../litByRectangle.comp"};
-		gpuComputePipeline = createGpuResources(shaderPaths[lightGeom]);
-
-		dispatchInfo = getDispatchInfo(WIN_W, WIN_H);
-
-		auto createImageView = [&](std::string pathToOpenEXRHDRIImage)
-		{
-			auto pathToTexture = pathToOpenEXRHDRIImage;
-			IAssetLoader::SAssetLoadParams lp(0ull, nullptr, IAssetLoader::ECF_DONT_CACHE_REFERENCES);
-			auto cpuTexture = assetManager->getAsset(pathToTexture, lp);
-			auto cpuTextureContents = cpuTexture.getContents();
-			assert(!cpuTextureContents.empty());
-			auto cpuImage = core::smart_refctd_ptr_static_cast<asset::ICPUImage>(*cpuTextureContents.begin());
-			cpuImage->setImageUsageFlags(IImage::E_USAGE_FLAGS::EUF_SAMPLED_BIT);
-
-			ICPUImageView::SCreationParams viewParams;
-			viewParams.flags = static_cast<ICPUImageView::E_CREATE_FLAGS>(0u);
-			viewParams.image = cpuImage;
-			viewParams.format = viewParams.image->getCreationParameters().format;
-			viewParams.viewType = IImageView<ICPUImage>::ET_2D;
-			viewParams.subresourceRange.aspectMask = IImage::E_ASPECT_FLAGS::EAF_COLOR_BIT;
-			viewParams.subresourceRange.baseArrayLayer = 0u;
-			viewParams.subresourceRange.layerCount = 1u;
-			viewParams.subresourceRange.baseMipLevel = 0u;
-			viewParams.subresourceRange.levelCount = 1u;
-
-			auto cpuImageView = ICPUImageView::create(std::move(viewParams));
-		
-			cpu2gpuParams.beginCommandBuffers();
-			auto gpuImageView = cpu2gpu.getGPUObjectsFromAssets(&cpuImageView, &cpuImageView + 1u, cpu2gpuParams)->front();
-			cpu2gpuParams.waitForCreationToComplete();
-
-			return gpuImageView;
-		};
-	
-		gpuEnvmapImageView = createImageView("../../media/envmap/envmap_0.exr");
-
-		{
-			const uint32_t MaxDimensions = 3u<<kShaderParameters.MaxDepthLog2;
-			const uint32_t MaxSamples = 1u<<kShaderParameters.MaxSamplesLog2;
-
-			auto sampleSequence = core::make_smart_refctd_ptr<asset::({ sizeof(uint32_t)*MaxDimensions*MaxSamples });
-		
-			core::OwenSampler sampler(MaxDimensions, 0xdeadbeefu);
-			//core::SobolSampler sampler(MaxDimensions);
-
-			auto out = reinterpret_cast<uint32_t*>(sampleSequence->getPointer());
-			for (auto dim=0u; dim<MaxDimensions; dim++)
-			for (uint32_t i=0; i<MaxSamples; i++)
-			{
-				out[i*MaxDimensions+dim] = sampler.sample(dim,i);
-			}
-		
-			{
-				const auto bufferSize = sampleSequence->getSize();
-				IGPUBuffer::SCreationParams params = {};
-				params.size = bufferSize;
-				params.usage = core::bitflag(asset::IBuffer::EUF_TRANSFER_DST_BIT) | asset::IBuffer::EUF_UNIFORM_TEXEL_BUFFER_BIT; 
-				gpuSequenceBuffer = logicalDevice->createBuffer(std::move(params));
-				auto bufferReqs = gpuSequenceBuffer->getMemoryReqs();
-				bufferReqs.memoryTypeBits &= logicalDevice->getPhysicalDevice()->getDeviceLocalMemoryTypeBits();
-				auto gpuSequenceBufferMem = logicalDevice->allocate(bufferReqs, gpuSequenceBuffer.get());
-				utilities->updateBufferRangeViaStagingBufferAutoSubmit(asset::SBufferRange<IGPUBuffer>{0u,bufferSize,gpuSequenceBuffer},sampleSequence->getPointer(), graphicsQueue);
-			}
-			gpuSequenceBufferView = logicalDevice->createBufferView(gpuSequenceBuffer.get(), asset::EF_R32G32B32_UINT);
-		}
-
-		{
-			IGPUImage::SCreationParams imgParams;
-			imgParams.flags = static_cast<IImage::E_CREATE_FLAGS>(0u);
-			imgParams.type = IImage::ET_2D;
-			imgParams.format = EF_R32G32_UINT;
-			imgParams.extent = {WIN_W, WIN_H,1u};
-			imgParams.mipLevels = 1u;
-			imgParams.arrayLayers = 1u;
-			imgParams.samples = IImage::ESCF_1_BIT;
-			imgParams.usage = core::bitflag(IImage::EUF_SAMPLED_BIT) | IImage::EUF_TRANSFER_DST_BIT;
-			imgParams.initialLayout = asset::IImage::EL_UNDEFINED;
-
-			IGPUImage::SBufferCopy region = {};
-			region.bufferOffset = 0u;
-			region.bufferRowLength = 0u;
-			region.bufferImageHeight = 0u;
-			region.imageExtent = imgParams.extent;
-			region.imageOffset = {0u,0u,0u};
-			region.imageSubresource.layerCount = 1u;
-			region.imageSubresource.aspectMask = IImage::E_ASPECT_FLAGS::EAF_COLOR_BIT;
-
-			constexpr auto ScrambleStateChannels = 2u;
-			const auto renderPixelCount = imgParams.extent.width*imgParams.extent.height;
-			core::vector<uint32_t> random(renderPixelCount*ScrambleStateChannels);
-			{
-				core::RandomSampler rng(0xbadc0ffeu);
-				for (auto& pixel : random)
-					pixel = rng.nextSample();
-			}
-
-			core::smart_refctd_ptr<IGPUBuffer> scrambleImageBuffer;
-			{
-				const auto bufferSize = random.size() * sizeof(uint32_t);
-				IGPUBuffer::SCreationParams params = {};
-				params.size = bufferSize;
-				params.usage = core::bitflag(asset::IBuffer::EUF_TRANSFER_DST_BIT) | asset::IBuffer::EUF_TRANSFER_SRC_BIT; 
-				scrambleImageBuffer = logicalDevice->createBuffer(std::move(params));
-				auto bufferReqs = scrambleImageBuffer->getMemoryReqs();
-				bufferReqs.memoryTypeBits &= logicalDevice->getPhysicalDevice()->getDeviceLocalMemoryTypeBits();
-				auto bufferMem = logicalDevice->allocate(bufferReqs, scrambleImageBuffer.get());
-				utilities->updateBufferRangeViaStagingBufferAutoSubmit(asset::SBufferRange<IGPUBuffer>{0u,bufferSize,scrambleImageBuffer},random.data(),graphicsQueue);
-			}
-
-			IGPUImageView::SCreationParams viewParams;
-			viewParams.flags = static_cast<IGPUImageView::E_CREATE_FLAGS>(0u);
-			// TODO: Replace this IGPUBuffer -> IGPUImage to using image upload utility
-			viewParams.image = utilities->createFilledDeviceLocalImageOnDedMem(std::move(imgParams), scrambleImageBuffer.get(), 1u, &region, graphicsQueue);
-			viewParams.viewType = IGPUImageView::ET_2D;
-			viewParams.format = EF_R32G32_UINT;
-			viewParams.subresourceRange.aspectMask = IImage::E_ASPECT_FLAGS::EAF_COLOR_BIT;
-			viewParams.subresourceRange.levelCount = 1u;
-			viewParams.subresourceRange.layerCount = 1u;
-			gpuScrambleImageView = logicalDevice->createImageView(std::move(viewParams));
-		}
-	
-		// Create Out Image
-		for(uint32_t i = 0; i < swapchain->getImageCount(); ++i) {
-			outHDRImageViews[i] = createHDRImageView(logicalDevice, asset::EF_R16G16B16A16_SFLOAT, WIN_W, WIN_H);
-		}
-
-		for(uint32_t i = 0; i < swapchain->getImageCount(); ++i)
-		{
-			auto & descSet = descriptorSets0[i];
-			descSet = descriptorPool->createDescriptorSet(core::smart_refctd_ptr(gpuDescriptorSetLayout0));
-			video::IGPUDescriptorSet::SWriteDescriptorSet writeDescriptorSet;
-			writeDescriptorSet.dstSet = descSet.get();
-			writeDescriptorSet.binding = 0;
-			writeDescriptorSet.count = 1u;
-			writeDescriptorSet.arrayElement = 0u;
-			writeDescriptorSet.descriptorType = asset::IDescriptor::E_TYPE::ET_STORAGE_IMAGE;
-			video::IGPUDescriptorSet::SDescriptorInfo info;
-			{
-				info.desc = outHDRImageViews[i];
-				info.info.image.sampler = nullptr;
-				info.info.image.imageLayout = asset::IImage::EL_GENERAL;
-			}
-			writeDescriptorSet.info = &info;
-			logicalDevice->updateDescriptorSets(1u, &writeDescriptorSet, 0u, nullptr);
-		}
-	
-		IGPUBuffer::SCreationParams gpuuboParams = {};
-		gpuuboParams.size = sizeof(SBasicViewParametersAligned);
-		gpuuboParams.usage = core::bitflag(IGPUBuffer::EUF_UNIFORM_BUFFER_BIT) | IGPUBuffer::EUF_TRANSFER_DST_BIT;
-		gpuubo = logicalDevice->createBuffer(std::move(gpuuboParams));
-		auto gpuUboMemReqs = gpuubo->getMemoryReqs();
-		gpuUboMemReqs.memoryTypeBits &= logicalDevice->getPhysicalDevice()->getDeviceLocalMemoryTypeBits();
-		auto gpuUboMem = logicalDevice->allocate(gpuUboMemReqs, gpuubo.get());
-
-		uboDescriptorSet1 = descriptorPool->createDescriptorSet(core::smart_refctd_ptr(gpuDescriptorSetLayout1));
-		{
-			video::IGPUDescriptorSet::SWriteDescriptorSet uboWriteDescriptorSet;
-			uboWriteDescriptorSet.dstSet = uboDescriptorSet1.get();
-			uboWriteDescriptorSet.binding = 0;
-			uboWriteDescriptorSet.count = 1u;
-			uboWriteDescriptorSet.arrayElement = 0u;
-			uboWriteDescriptorSet.descriptorType = asset::IDescriptor::E_TYPE::ET_UNIFORM_BUFFER;
-			video::IGPUDescriptorSet::SDescriptorInfo info;
-			{
-				info.desc = gpuubo;
-				info.info.buffer.offset = 0ull;
-				info.info.buffer.size = sizeof(SBasicViewParametersAligned);
-			}
-			uboWriteDescriptorSet.info = &info;
-			logicalDevice->updateDescriptorSets(1u, &uboWriteDescriptorSet, 0u, nullptr);
-		}
-
-		ISampler::SParams samplerParams0 = { ISampler::ETC_CLAMP_TO_EDGE, ISampler::ETC_CLAMP_TO_EDGE, ISampler::ETC_CLAMP_TO_EDGE, ISampler::ETBC_FLOAT_OPAQUE_BLACK, ISampler::ETF_LINEAR, ISampler::ETF_LINEAR, ISampler::ESMM_LINEAR, 0u, false, ECO_ALWAYS };
-		sampler0 = logicalDevice->createSampler(samplerParams0);
-		ISampler::SParams samplerParams1 = { ISampler::ETC_CLAMP_TO_EDGE, ISampler::ETC_CLAMP_TO_EDGE, ISampler::ETC_CLAMP_TO_EDGE, ISampler::ETBC_INT_OPAQUE_BLACK, ISampler::ETF_NEAREST, ISampler::ETF_NEAREST, ISampler::ESMM_NEAREST, 0u, false, ECO_ALWAYS };
-		sampler1 = logicalDevice->createSampler(samplerParams1);
-		
-		descriptorSet2 = descriptorPool->createDescriptorSet(core::smart_refctd_ptr(gpuDescriptorSetLayout2));
-		{
-			constexpr auto kDescriptorCount = 5;
-			IGPUDescriptorSet::SWriteDescriptorSet writeDescriptorSet2[kDescriptorCount];
-			IGPUDescriptorSet::SDescriptorInfo writeDescriptorInfo[kDescriptorCount];
-			for (auto i=0; i<kDescriptorCount; i++)
-			{
-				writeDescriptorSet2[i].dstSet = descriptorSet2.get();
-				writeDescriptorSet2[i].binding = i;
-				writeDescriptorSet2[i].arrayElement = 0u;
-				writeDescriptorSet2[i].count = 1u;
-				writeDescriptorSet2[i].info = writeDescriptorInfo+i;
-			}
-			writeDescriptorSet2[0].descriptorType = asset::IDescriptor::E_TYPE::ET_COMBINED_IMAGE_SAMPLER;
-			writeDescriptorSet2[1].descriptorType = asset::IDescriptor::E_TYPE::ET_UNIFORM_TEXEL_BUFFER;
-			writeDescriptorSet2[2].descriptorType = asset::IDescriptor::E_TYPE::ET_COMBINED_IMAGE_SAMPLER;
-			writeDescriptorSet2[3].descriptorType = asset::IDescriptor::E_TYPE::ET_ACCELERATION_STRUCTURE;
-			writeDescriptorSet2[4].descriptorType = asset::IDescriptor::E_TYPE::ET_STORAGE_BUFFER;
-
-			writeDescriptorInfo[0].desc = gpuEnvmapImageView;
-			{
-				// ISampler::SParams samplerParams = { ISampler::ETC_CLAMP_TO_EDGE, ISampler::ETC_CLAMP_TO_EDGE, ISampler::ETC_CLAMP_TO_EDGE, ISampler::ETBC_FLOAT_OPAQUE_BLACK, ISampler::ETF_LINEAR, ISampler::ETF_LINEAR, ISampler::ESMM_LINEAR, 0u, false, ECO_ALWAYS };
-				writeDescriptorInfo[0].info.image.sampler = sampler0;
-				writeDescriptorInfo[0].info.image.imageLayout = asset::IImage::EL_SHADER_READ_ONLY_OPTIMAL;
-			}
-			writeDescriptorInfo[1].desc = gpuSequenceBufferView;
-			writeDescriptorInfo[2].desc = gpuScrambleImageView;
-			{
-				// ISampler::SParams samplerParams = { ISampler::ETC_CLAMP_TO_EDGE, ISampler::ETC_CLAMP_TO_EDGE, ISampler::ETC_CLAMP_TO_EDGE, ISampler::ETBC_INT_OPAQUE_BLACK, ISampler::ETF_NEAREST, ISampler::ETF_NEAREST, ISampler::ESMM_NEAREST, 0u, false, ECO_ALWAYS };
-				writeDescriptorInfo[2].info.image.sampler = sampler1;
-				writeDescriptorInfo[2].info.image.imageLayout = asset::IImage::EL_SHADER_READ_ONLY_OPTIMAL;
-			}
-
-			writeDescriptorInfo[3].desc = gpuTlas;
-
-			writeDescriptorInfo[4].desc = spheresBuffer;
-			writeDescriptorInfo[4].info.buffer.offset = 0;
-			writeDescriptorInfo[4].info.buffer.size = spheresBufferSize;
-
-			logicalDevice->updateDescriptorSets(kDescriptorCount, writeDescriptorSet2, 0u, nullptr);
-		}
-
-		constexpr uint32_t FRAME_COUNT = 500000u;
-
-		for (uint32_t i=0u; i<FRAMES_IN_FLIGHT; i++)
-		{
-			imageAcquire[i] = logicalDevice->createSemaphore();
-			renderFinished[i] = logicalDevice->createSemaphore();
-			frameComplete[i] = logicalDevice->createFence(video::IGPUFence::ECF_SIGNALED_BIT);
-			frameUploadDataCompleteSemaphore[i] = logicalDevice->createSemaphore();
-			frameUploadDataCompleteFence[i] = logicalDevice->createFence(video::IGPUFence::ECF_UNSIGNALED);
-		}
-		
-		oracle.reportBeginFrameRecord();
-	}
-
-	void onAppTerminated_impl() override
-	{
-		const auto& fboCreationParams = fbos->begin()[m_acquiredNextFBO]->getCreationParameters();
-		auto gpuSourceImageView = fboCreationParams.attachments[0];
-		logicalDevice->waitIdle();
-
-		bool status = ext::ScreenShot::createScreenShot(
-			logicalDevice.get(),
-			queues[CommonAPI::InitOutput::EQT_TRANSFER_UP],
-			renderFinished[m_resourceIx].get(),
-			gpuSourceImageView.get(),
-			assetManager.get(),
-			"ScreenShot.png",
-			asset::IImage::EL_PRESENT_SRC,
-			asset::EAF_NONE);
-
-		assert(status);
-	}
-
-	void workLoopBody() override
-	{
-		auto& graphicsQueue = queues[CommonAPI::InitOutput::EQT_GRAPHICS];
-
-		m_resourceIx++;
-		if(m_resourceIx >= FRAMES_IN_FLIGHT) {
-			m_resourceIx = 0;
-		}
-		
-		oracle.reportEndFrameRecord();
-		double dt = oracle.getDeltaTimeInMicroSeconds() / 1000.0;
-		auto nextPresentationTimeStamp = oracle.getNextPresentationTimeStamp();
-		oracle.reportBeginFrameRecord();
-
-		// Input 
-		inputSystem->getDefaultMouse(&mouse);
-		inputSystem->getDefaultKeyboard(&keyboard);
-
-		cam.beginInputProcessing(nextPresentationTimeStamp);
-		mouse.consumeEvents([&](const IMouseEventChannel::range_t& events) -> void { cam.mouseProcess(events); }, logger.get());
-		keyboard.consumeEvents([&](const IKeyboardEventChannel::range_t& events) -> void { cam.keyboardProcess(events); }, logger.get());
-		cam.endInputProcessing(nextPresentationTimeStamp);
-		
-		auto& cb = cmdbuf[m_resourceIx];
-		auto& fence = frameComplete[m_resourceIx];
-		while (logicalDevice->waitForFences(1u,&fence.get(),false,MAX_TIMEOUT)==video::IGPUFence::ES_TIMEOUT)
-		{
-		}
-		
-		const auto viewMatrix = cam.getViewMatrix();
-		const auto viewProjectionMatrix = matrix4SIMD::concatenateBFollowedByAPrecisely(
-			video::ISurface::getSurfaceTransformationMatrix(swapchain->getPreTransform()),
-			cam.getConcatenatedMatrix()
-		);
-				
-		// safe to proceed
-		cb->begin(IGPUCommandBuffer::EU_NONE);
-
-		// renderpass 
-		swapchain->acquireNextImage(MAX_TIMEOUT,imageAcquire[m_resourceIx].get(),nullptr,&m_acquiredNextFBO);
-		{
-			auto mv = viewMatrix;
-			auto mvp = viewProjectionMatrix;
-			core::matrix3x4SIMD normalMat;
-			mv.getSub3x3InverseTranspose(normalMat);
-
-			SBasicViewParametersAligned viewParams;
-			memcpy(viewParams.uboData.MV, mv.pointer(), sizeof(mv));
-			memcpy(viewParams.uboData.MVP, mvp.pointer(), sizeof(mvp));
-			memcpy(viewParams.uboData.NormalMat, normalMat.pointer(), sizeof(normalMat));
-			
-			asset::SBufferRange<video::IGPUBuffer> range;
-			range.buffer = gpuubo;
-			range.offset = 0ull;
-			range.size = sizeof(viewParams);
-
-			video::IGPUQueue::SSubmitInfo uploadImageSubmit;
-			uploadImageSubmit.pSignalSemaphores = &frameUploadDataCompleteSemaphore[m_resourceIx].get();
-			uploadImageSubmit.signalSemaphoreCount = 1u;
-			
-			// We know the fence is already signal because of how we structured our execution -> frameUploadDataCompleteSemaphore -> signals to Render Frame -> wait for frameComplete fence to finish -> then we know frameUploadCompleteFence is signalled
-			utilities->getDefaultUpStreamingBuffer()->cull_frees(); // need to cull_frees after fence signalled and before fence is reset again
-			logicalDevice->resetFences(1, &frameUploadDataCompleteFence[m_resourceIx].get());
-
-			utilities->updateBufferRangeViaStagingBufferAutoSubmit(range, &viewParams, graphicsQueue, frameUploadDataCompleteFence[m_resourceIx].get(), uploadImageSubmit);
-			// No need to wait for frameUploadDataCompleteFence in CPU, we'll use semaphores to singal the next stage the upload is complete.
-		}
-				
-		auto graphicsCmdQueueFamIdx = queues[CommonAPI::InitOutput::EQT_GRAPHICS]->getFamilyIndex();
-		// TRANSITION outHDRImageViews[m_acquiredNextFBO] to EIL_GENERAL (because of descriptorSets0 -> ComputeShader Writes into the image)
-		{
-			IGPUCommandBuffer::SImageMemoryBarrier imageBarriers[3u] = {};
-			imageBarriers[0].barrier.srcAccessMask = asset::EAF_NONE;
-			imageBarriers[0].barrier.dstAccessMask = static_cast<asset::E_ACCESS_FLAGS>(asset::EAF_SHADER_WRITE_BIT);
-			imageBarriers[0].oldLayout = asset::IImage::EL_UNDEFINED;
-			imageBarriers[0].newLayout = asset::IImage::EL_GENERAL;
-			imageBarriers[0].srcQueueFamilyIndex = graphicsCmdQueueFamIdx;
-			imageBarriers[0].dstQueueFamilyIndex = graphicsCmdQueueFamIdx;
-			imageBarriers[0].image = outHDRImageViews[m_acquiredNextFBO]->getCreationParameters().image;
-			imageBarriers[0].subresourceRange.aspectMask = asset::IImage::EAF_COLOR_BIT;
-			imageBarriers[0].subresourceRange.baseMipLevel = 0u;
-			imageBarriers[0].subresourceRange.levelCount = 1;
-			imageBarriers[0].subresourceRange.baseArrayLayer = 0u;
-			imageBarriers[0].subresourceRange.layerCount = 1;
-
-			imageBarriers[1].barrier.srcAccessMask = asset::EAF_NONE;
-			imageBarriers[1].barrier.dstAccessMask = static_cast<asset::E_ACCESS_FLAGS>(asset::EAF_SHADER_READ_BIT);
-			imageBarriers[1].oldLayout = asset::IImage::EL_UNDEFINED;
-			imageBarriers[1].newLayout = asset::IImage::EL_SHADER_READ_ONLY_OPTIMAL;
-			imageBarriers[1].srcQueueFamilyIndex = graphicsCmdQueueFamIdx;
-			imageBarriers[1].dstQueueFamilyIndex = graphicsCmdQueueFamIdx;
-			imageBarriers[1].image = gpuScrambleImageView->getCreationParameters().image;
-			imageBarriers[1].subresourceRange.aspectMask = asset::IImage::EAF_COLOR_BIT;
-			imageBarriers[1].subresourceRange.baseMipLevel = 0u;
-			imageBarriers[1].subresourceRange.levelCount = 1;
-			imageBarriers[1].subresourceRange.baseArrayLayer = 0u;
-			imageBarriers[1].subresourceRange.layerCount = 1;
-
-			 imageBarriers[2].barrier.srcAccessMask = asset::EAF_NONE;
-			 imageBarriers[2].barrier.dstAccessMask = static_cast<asset::E_ACCESS_FLAGS>(asset::EAF_SHADER_READ_BIT);
-			 imageBarriers[2].oldLayout = asset::IImage::EL_UNDEFINED;
-			 imageBarriers[2].newLayout = asset::IImage::EL_SHADER_READ_ONLY_OPTIMAL;
-			 imageBarriers[2].srcQueueFamilyIndex = graphicsCmdQueueFamIdx;
-			 imageBarriers[2].dstQueueFamilyIndex = graphicsCmdQueueFamIdx;
-			 imageBarriers[2].image = gpuEnvmapImageView->getCreationParameters().image;
-			 imageBarriers[2].subresourceRange.aspectMask = asset::IImage::EAF_COLOR_BIT;
-			 imageBarriers[2].subresourceRange.baseMipLevel = 0u;
-			 imageBarriers[2].subresourceRange.levelCount = gpuEnvmapImageView->getCreationParameters().subresourceRange.levelCount;
-			 imageBarriers[2].subresourceRange.baseArrayLayer = 0u;
-			 imageBarriers[2].subresourceRange.layerCount = gpuEnvmapImageView->getCreationParameters().subresourceRange.layerCount;
-
-			cb->pipelineBarrier(asset::EPSF_TOP_OF_PIPE_BIT, asset::EPSF_COMPUTE_SHADER_BIT, asset::EDF_NONE, 0u, nullptr, 0u, nullptr, 3u, imageBarriers);
-		}
-
-		// cube envmap handle
-		{
-			cb->bindComputePipeline(gpuComputePipeline.get());
-			cb->bindDescriptorSets(EPBP_COMPUTE, gpuComputePipeline->getLayout(), 0u, 1u, &descriptorSets0[m_acquiredNextFBO].get());
-			cb->bindDescriptorSets(EPBP_COMPUTE, gpuComputePipeline->getLayout(), 1u, 1u, &uboDescriptorSet1.get());
-			cb->bindDescriptorSets(EPBP_COMPUTE, gpuComputePipeline->getLayout(), 2u, 1u, &descriptorSet2.get());
-			cb->dispatch(dispatchInfo.workGroupCount[0], dispatchInfo.workGroupCount[1], dispatchInfo.workGroupCount[2]);
-		}
-		// TODO: tone mapping and stuff
-
-		// Copy HDR Image to SwapChain
-		auto srcImgViewCreationParams = outHDRImageViews[m_acquiredNextFBO]->getCreationParameters();
-		auto dstImgViewCreationParams = fbos->begin()[m_acquiredNextFBO]->getCreationParameters().attachments[0]->getCreationParameters();
-		
-		// Getting Ready for Blit
-		// TRANSITION outHDRImageViews[m_acquiredNextFBO] to EIL_TRANSFER_SRC_OPTIMAL
-		// TRANSITION `fbos[m_acquiredNextFBO]->getCreationParameters().attachments[0]` to EIL_TRANSFER_DST_OPTIMAL
-		{
-			IGPUCommandBuffer::SImageMemoryBarrier imageBarriers[2u] = {};
-			imageBarriers[0].barrier.srcAccessMask = asset::EAF_NONE;
-			imageBarriers[0].barrier.dstAccessMask = asset::EAF_TRANSFER_WRITE_BIT;
-			imageBarriers[0].oldLayout = asset::IImage::EL_UNDEFINED;
-			imageBarriers[0].newLayout = asset::IImage::EL_TRANSFER_SRC_OPTIMAL;
-			imageBarriers[0].srcQueueFamilyIndex = graphicsCmdQueueFamIdx;
-			imageBarriers[0].dstQueueFamilyIndex = graphicsCmdQueueFamIdx;
-			imageBarriers[0].image = srcImgViewCreationParams.image;
-			imageBarriers[0].subresourceRange.aspectMask = asset::IImage::EAF_COLOR_BIT;
-			imageBarriers[0].subresourceRange.baseMipLevel = 0u;
-			imageBarriers[0].subresourceRange.levelCount = 1;
-			imageBarriers[0].subresourceRange.baseArrayLayer = 0u;
-			imageBarriers[0].subresourceRange.layerCount = 1;
-
-			imageBarriers[1].barrier.srcAccessMask = asset::EAF_NONE;
-			imageBarriers[1].barrier.dstAccessMask = asset::EAF_TRANSFER_WRITE_BIT;
-			imageBarriers[1].oldLayout = asset::IImage::EL_UNDEFINED;
-			imageBarriers[1].newLayout = asset::IImage::EL_TRANSFER_DST_OPTIMAL;
-			imageBarriers[1].srcQueueFamilyIndex = graphicsCmdQueueFamIdx;
-			imageBarriers[1].dstQueueFamilyIndex = graphicsCmdQueueFamIdx;
-			imageBarriers[1].image = dstImgViewCreationParams.image;
-			imageBarriers[1].subresourceRange.aspectMask = asset::IImage::EAF_COLOR_BIT;
-			imageBarriers[1].subresourceRange.baseMipLevel = 0u;
-			imageBarriers[1].subresourceRange.levelCount = 1;
-			imageBarriers[1].subresourceRange.baseArrayLayer = 0u;
-			imageBarriers[1].subresourceRange.layerCount = 1;
-			cb->pipelineBarrier(asset::EPSF_TRANSFER_BIT, asset::EPSF_TRANSFER_BIT, asset::EDF_NONE, 0u, nullptr, 0u, nullptr, 2u, imageBarriers);
-		}
-
-		// Blit Image
-		{
-			SImageBlit blit = {};
-			blit.srcOffsets[0] = {0, 0, 0};
-			blit.srcOffsets[1] = {WIN_W, WIN_H, 1};
-		
-			blit.srcSubresource.aspectMask = srcImgViewCreationParams.subresourceRange.aspectMask;
-			blit.srcSubresource.mipLevel = srcImgViewCreationParams.subresourceRange.baseMipLevel;
-			blit.srcSubresource.baseArrayLayer = srcImgViewCreationParams.subresourceRange.baseArrayLayer;
-			blit.srcSubresource.layerCount = srcImgViewCreationParams.subresourceRange.layerCount;
-			blit.dstOffsets[0] = {0, 0, 0};
-			blit.dstOffsets[1] = {WIN_W, WIN_H, 1};
-			blit.dstSubresource.aspectMask = dstImgViewCreationParams.subresourceRange.aspectMask;
-			blit.dstSubresource.mipLevel = dstImgViewCreationParams.subresourceRange.baseMipLevel;
-			blit.dstSubresource.baseArrayLayer = dstImgViewCreationParams.subresourceRange.baseArrayLayer;
-			blit.dstSubresource.layerCount = dstImgViewCreationParams.subresourceRange.layerCount;
-
-			auto srcImg = srcImgViewCreationParams.image;
-			auto dstImg = dstImgViewCreationParams.image;
-
-			cb->blitImage(srcImg.get(), asset::IImage::EL_TRANSFER_SRC_OPTIMAL, dstImg.get(), asset::IImage::EL_TRANSFER_DST_OPTIMAL, 1u, &blit , ISampler::ETF_NEAREST);
-		}
-		
-		// TRANSITION `fbos[m_acquiredNextFBO]->getCreationParameters().attachments[0]` to EIL_PRESENT
-		{
-			IGPUCommandBuffer::SImageMemoryBarrier imageBarriers[1u] = {};
-			imageBarriers[0].barrier.srcAccessMask = asset::EAF_TRANSFER_WRITE_BIT;
-			imageBarriers[0].barrier.dstAccessMask = asset::EAF_NONE;
-			imageBarriers[0].oldLayout = asset::IImage::EL_TRANSFER_DST_OPTIMAL;
-			imageBarriers[0].newLayout = asset::IImage::EL_PRESENT_SRC;
-			imageBarriers[0].srcQueueFamilyIndex = graphicsCmdQueueFamIdx;
-			imageBarriers[0].dstQueueFamilyIndex = graphicsCmdQueueFamIdx;
-			imageBarriers[0].image = dstImgViewCreationParams.image;
-			imageBarriers[0].subresourceRange.aspectMask = asset::IImage::EAF_COLOR_BIT;
-			imageBarriers[0].subresourceRange.baseMipLevel = 0u;
-			imageBarriers[0].subresourceRange.levelCount = 1;
-			imageBarriers[0].subresourceRange.baseArrayLayer = 0u;
-			imageBarriers[0].subresourceRange.layerCount = 1;
-			cb->pipelineBarrier(asset::EPSF_TRANSFER_BIT, asset::EPSF_TOP_OF_PIPE_BIT, asset::EDF_NONE, 0u, nullptr, 0u, nullptr, 1u, imageBarriers);
-		}
-
-		cb->end();
-		logicalDevice->resetFences(1, &fence.get());
-
-		nbl::video::IGPUQueue::SSubmitInfo submit;
-		submit.commandBufferCount = 1u;
-		submit.commandBuffers = &cb.get();
-		submit.signalSemaphoreCount = 1u;
-		submit.pSignalSemaphores = &renderFinished[m_resourceIx].get();
-		nbl::video::IGPUSemaphore* waitSemaphores[2u] = { imageAcquire[m_resourceIx].get(), frameUploadDataCompleteSemaphore[m_resourceIx].get() };
-		asset::E_PIPELINE_STAGE_FLAGS waitStages[2u] = { nbl::asset::EPSF_COLOR_ATTACHMENT_OUTPUT_BIT, nbl::asset::EPSF_RAY_TRACING_SHADER_BIT_KHR} ;
-		submit.waitSemaphoreCount = 2u;
-		submit.pWaitSemaphores = waitSemaphores;
-		submit.pWaitDstStageMask = waitStages;
-
-		graphicsQueue->submit(1u,&submit,fence.get());
-
-		CommonAPI::Present(logicalDevice.get(), swapchain.get(), queues[CommonAPI::InitOutput::EQT_GRAPHICS], renderFinished[m_resourceIx].get(), m_acquiredNextFBO);
-	}
-
-	bool keepRunning() override
-	{
-		return windowCb->isWindowOpen();
-	}
-
-	video::IAPIConnection* getAPIConnection() override
-	{
-		return apiConnection.get();
-	}
-	video::ILogicalDevice* getLogicalDevice()  override
-	{
-		return logicalDevice.get();
-	}
-	video::IGPURenderpass* getRenderpass() override
-	{
-		return renderpass.get();
-	}
-	void setSurface(core::smart_refctd_ptr<video::ISurface>&& s) override
-	{
-		surface = std::move(s);
-	}
-	void setFBOs(std::vector<core::smart_refctd_ptr<video::IGPUFramebuffer>>& f) override
-	{
-		for (int i = 0; i < f.size(); i++)
-		{
-			fbos->begin()[i] = core::smart_refctd_ptr(f[i]);
-		}
-	}
-	void setSwapchain(core::smart_refctd_ptr<video::ISwapchain>&& s) override
-	{
-		swapchain = std::move(s);
-	}
-	uint32_t getSwapchainImageCount() override
-	{
-		return swapchain->getImageCount();
-	}
-	virtual nbl::asset::E_FORMAT getDepthFormat() override
-	{
-		return nbl::asset::EF_D32_SFLOAT;
-	}
-};
-
-NBL_COMMON_API_MAIN(RayQuerySampleApp)
diff --git a/56_RayQuery/pipeline.groovy b/56_RayQuery/pipeline.groovy
deleted file mode 100644
index beba797c3..000000000
--- a/56_RayQuery/pipeline.groovy
+++ /dev/null
@@ -1,50 +0,0 @@
-import org.DevshGraphicsProgramming.Agent
-import org.DevshGraphicsProgramming.BuilderInfo
-import org.DevshGraphicsProgramming.IBuilder
-
-class CRayQueryBuilder extends IBuilder
-{
-	public CRayQueryBuilder(Agent _agent, _info)
-	{
-		super(_agent, _info)
-	}
-	
-	@Override
-	public boolean prepare(Map axisMapping)
-	{
-		return true
-	}
-	
-	@Override
-  	public boolean build(Map axisMapping)
-	{
-		IBuilder.CONFIGURATION config = axisMapping.get("CONFIGURATION")
-		IBuilder.BUILD_TYPE buildType = axisMapping.get("BUILD_TYPE")
-		
-		def nameOfBuildDirectory = getNameOfBuildDirectory(buildType)
-		def nameOfConfig = getNameOfConfig(config)
-		
-		agent.execute("cmake --build ${info.rootProjectPath}/${nameOfBuildDirectory}/${info.targetProjectPathRelativeToRoot} --target ${info.targetBaseName} --config ${nameOfConfig} -j12 -v")
-		
-		return true
-	}
-	
-	@Override
-  	public boolean test(Map axisMapping)
-	{
-		return true
-	}
-	
-	@Override
-	public boolean install(Map axisMapping)
-	{
-		return true
-	}
-}
-
-def create(Agent _agent, _info)
-{
-	return new CRayQueryBuilder(_agent, _info)
-}
-
-return this
\ No newline at end of file
diff --git a/CMakeLists.txt b/CMakeLists.txt
index 24fb7fad8..0b3279a48 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -76,7 +76,6 @@ if(NBL_BUILD_EXAMPLES)
 
 	#add_subdirectory(43_SumAndCDFFilters EXCLUDE_FROM_ALL)
 	add_subdirectory(47_DerivMapTest EXCLUDE_FROM_ALL)
-	add_subdirectory(53_ComputeShaders EXCLUDE_FROM_ALL)
 	add_subdirectory(54_Transformations EXCLUDE_FROM_ALL)
 	add_subdirectory(55_RGB18E7S3 EXCLUDE_FROM_ALL)
 	add_subdirectory(61_UI EXCLUDE_FROM_ALL)

From f4cc4cd22ee4bd5506d794e63caafddf974ed7a4 Mon Sep 17 00:00:00 2001
From: devsh <devsh@devsh.eu>
Date: Sat, 19 Apr 2025 16:04:49 +0200
Subject: [PATCH 172/529] const correctness of BLAS geometry spans

---
 67_RayQueryGeometry/main.cpp   | 3 ++-
 71_RayTracingPipeline/main.cpp | 9 ++++++---
 2 files changed, 8 insertions(+), 4 deletions(-)

diff --git a/67_RayQueryGeometry/main.cpp b/67_RayQueryGeometry/main.cpp
index dab137cbd..b34c474a0 100644
--- a/67_RayQueryGeometry/main.cpp
+++ b/67_RayQueryGeometry/main.cpp
@@ -817,8 +817,9 @@ class RayQueryGeometryApp final : public examples::SimpleWindowedApplication, pu
 
 					ILogicalDevice::AccelerationStructureBuildSizes buildSizes;
 					{
+						const auto* trianglesData = triangles;
 						const uint32_t maxPrimCount[1] = { primitiveCounts[i] };
-						buildSizes = m_device->getAccelerationStructureBuildSizes(blasFlags, false, std::span{&triangles[i], 1}, maxPrimCount);
+						buildSizes = m_device->getAccelerationStructureBuildSizes(blasFlags, false, std::span{trianglesData,1}, maxPrimCount);
 						if (!buildSizes)
 							return logFail("Failed to get BLAS build sizes");
 					}
diff --git a/71_RayTracingPipeline/main.cpp b/71_RayTracingPipeline/main.cpp
index 35c750373..219a7aacb 100644
--- a/71_RayTracingPipeline/main.cpp
+++ b/71_RayTracingPipeline/main.cpp
@@ -1537,10 +1537,13 @@ class RaytracingPipelineApp final : public examples::SimpleWindowedApplication,
           const uint32_t maxPrimCount[1] = { primitiveCounts[i] };
           if (isProcedural)
           {
-            buildSizes = m_device->getAccelerationStructureBuildSizes(blasBuildInfos[i].buildFlags, false, std::span{&aabbs, 1}, maxPrimCount);
-          } else
+            const auto* aabbData = &aabbs;
+            buildSizes = m_device->getAccelerationStructureBuildSizes(blasBuildInfos[i].buildFlags, false, std::span{ aabbData, 1}, maxPrimCount);
+          }
+          else
           {
-            buildSizes = m_device->getAccelerationStructureBuildSizes(blasBuildInfos[i].buildFlags, false, std::span{&triangles[i], 1}, maxPrimCount);
+            const auto* trianglesData = triangles.data();
+            buildSizes = m_device->getAccelerationStructureBuildSizes(blasBuildInfos[i].buildFlags, false, std::span{trianglesData,1}, maxPrimCount);
           }
           if (!buildSizes)
             return logFail("Failed to get BLAS build sizes");

From c8653573a6b93a3962de2f0b5662cb630e9ee51b Mon Sep 17 00:00:00 2001
From: Przemek <minikers21@gmail.com>
Date: Sat, 19 Apr 2025 16:38:55 +0200
Subject: [PATCH 173/529] Saving work

---
 62_CAD/DrawResourcesFiller.cpp                |  4 +-
 62_CAD/main.cpp                               | 10 ++--
 62_CAD/shaders/globals.hlsl                   | 46 +++++++++++++++++--
 .../main_pipeline/fragment_shader.hlsl        | 16 +++----
 .../shaders/main_pipeline/vertex_shader.hlsl  | 29 ++++++------
 5 files changed, 69 insertions(+), 36 deletions(-)

diff --git a/62_CAD/DrawResourcesFiller.cpp b/62_CAD/DrawResourcesFiller.cpp
index a255bc700..c6d898a7c 100644
--- a/62_CAD/DrawResourcesFiller.cpp
+++ b/62_CAD/DrawResourcesFiller.cpp
@@ -141,7 +141,7 @@ void DrawResourcesFiller::drawTriangleMesh(
 {
 	flushDrawObjects(); // flushes draw call construction of any possible draw objects before dtm, because currently we're sepaerating dtm draw calls from drawObj draw calls
 
-	setActiveDTMSettings(dtmSettingsInfo); // TODO !!!!
+	setActiveDTMSettings(dtmSettingsInfo);
 	beginMainObject(MainObjectType::DTM);
 
 	DrawCallData drawCallData = {}; 
@@ -675,7 +675,7 @@ uint32_t DrawResourcesFiller::addDTMSettings_Internal(const DTMSettingsInfo& dtm
 		if (itr == dtmSettings)
 			return i;
 	}
-	
+
 	return resourcesCollection.dtmSettings.addAndGetOffset(dtmSettings); // this will implicitly increase total resource consumption and reduce remaining size --> no need for mem size trackers
 }
 
diff --git a/62_CAD/main.cpp b/62_CAD/main.cpp
index 6d3a2b431..f988458b9 100644
--- a/62_CAD/main.cpp
+++ b/62_CAD/main.cpp
@@ -3246,10 +3246,10 @@ class ComputerAidedDesign final : public examples::SimpleWindowedApplication, pu
 			mesh.setIndices(std::move(indices));
 
 			DTMSettingsInfo dtmInfo;
-			dtmInfo.mode = E_DTM_MODE::HEIGHT_SHADING | E_DTM_MODE::CONTOUR | E_DTM_MODE::OUTLINE;
-
-			dtmInfo.outlineStyleInfo.screenSpaceLineWidth = 3.0f;
-			dtmInfo.outlineStyleInfo.worldSpaceLineWidth = 0.0f;
+			//dtmInfo.mode = E_DTM_MODE::HEIGHT_SHADING | E_DTM_MODE::CONTOUR | E_DTM_MODE::OUTLINE;
+			dtmInfo.mode = E_DTM_MODE::HEIGHT_SHADING;
+			dtmInfo.outlineStyleInfo.screenSpaceLineWidth = 0.0f;
+			dtmInfo.outlineStyleInfo.worldSpaceLineWidth = 3.0f;
 			dtmInfo.outlineStyleInfo.color = float32_t4(0.0f, 0.39f, 0.0f, 1.0f);
 			std::array<double, 4> outlineStipplePattern = { 0.0f, -5.0f, 20.0f, -5.0f };
 			dtmInfo.outlineStyleInfo.setStipplePatternData(outlineStipplePattern);
@@ -3274,7 +3274,7 @@ class ComputerAidedDesign final : public examples::SimpleWindowedApplication, pu
 				{
 					dtmInfo.heightShadingInfo.heightShadingMode = E_HEIGHT_SHADING_MODE::DISCRETE_VARIABLE_LENGTH_INTERVALS;
 
-					dtmInfo.heightShadingInfo.addHeightColorMapEntry(-10.0f, float32_t4(0.5f, 1.0f, 1.0f, animatedAlpha));
+					dtmInfo.heightShadingInfo.addHeightColorMapEntry(-10.0f, float32_t4(0.5f, 1.0f, 1.0f, 1.0f));
 					dtmInfo.heightShadingInfo.addHeightColorMapEntry(20.0f, float32_t4(0.0f, 1.0f, 0.0f, 1.0f));
 					dtmInfo.heightShadingInfo.addHeightColorMapEntry(25.0f, float32_t4(1.0f, 1.0f, 0.0f, animatedAlpha));
 					dtmInfo.heightShadingInfo.addHeightColorMapEntry(70.0f, float32_t4(1.0f, 0.0f, 0.0f, 1.0f));
diff --git a/62_CAD/shaders/globals.hlsl b/62_CAD/shaders/globals.hlsl
index a83acb094..2c86b9732 100644
--- a/62_CAD/shaders/globals.hlsl
+++ b/62_CAD/shaders/globals.hlsl
@@ -400,9 +400,9 @@ struct DTMSettings
         return E_HEIGHT_SHADING_MODE::DISCRETE_FIXED_LENGTH_INTERVALS;
     }
     
-    bool drawOutlineEnabled() { return  (mode & E_DTM_MODE::OUTLINE) != 0u; } 
-    bool drawContourEnabled() { return (mode & E_DTM_MODE::CONTOUR) != 0u; } 
-    bool drawHeightShadingEnabled() { return (mode & E_DTM_MODE::HEIGHT_SHADING) != 0u; } 
+    bool drawOutlineEnabled() NBL_CONST_MEMBER_FUNC { return  (mode & E_DTM_MODE::OUTLINE) != 0u; } 
+    bool drawContourEnabled() NBL_CONST_MEMBER_FUNC { return (mode & E_DTM_MODE::CONTOUR) != 0u; }
+    bool drawHeightShadingEnabled() NBL_CONST_MEMBER_FUNC { return (mode & E_DTM_MODE::HEIGHT_SHADING) != 0u; }
 };
 
 #ifndef __HLSL_VERSION
@@ -430,8 +430,44 @@ inline bool operator==(const LineStyle& lhs, const LineStyle& rhs)
 
 inline bool operator==(const DTMSettings& lhs, const DTMSettings& rhs)
 {
-    return lhs.outlineLineStyleIdx == rhs.outlineLineStyleIdx &&
-        lhs.contourLineStyleIdx == rhs.contourLineStyleIdx;
+    if (lhs.mode != rhs.mode)
+        return false;
+
+    bool equal = true;
+    if (lhs.drawOutlineEnabled())
+    {
+        equal = lhs.outlineLineStyleIdx == rhs.outlineLineStyleIdx;
+    }
+
+    if (!equal)
+        return false;
+
+    if (lhs.drawContourEnabled())
+    {
+        float contourLinesStartHeight;
+        float contourLinesEndHeight;
+        float contourLinesHeightInterval;
+
+        equal = lhs.contourLinesStartHeight == rhs.contourLinesStartHeight &&
+            lhs.contourLinesStartHeight == rhs.contourLinesStartHeight &&
+            lhs.contourLinesStartHeight == rhs.contourLinesStartHeight;
+    }
+
+    if (!equal)
+        return false;
+
+    if (lhs.drawHeightShadingEnabled())
+    {
+        equal = lhs.intervalLength == rhs.intervalLength &&
+            lhs.intervalIndexToHeightMultiplier == rhs.intervalIndexToHeightMultiplier &&
+            lhs.isCenteredShading == rhs.isCenteredShading &&
+            lhs.heightColorEntryCount == rhs.heightColorEntryCount;
+
+        equal == equal && (memcmp(lhs.heightColorMapHeights, rhs.heightColorMapHeights, lhs.heightColorEntryCount * sizeof(float)));
+        equal == equal && (memcmp(lhs.heightColorMapColors, rhs.heightColorMapColors, lhs.heightColorEntryCount * sizeof(float32_t4)));
+    }
+
+    return equal;
 }
 #endif
 
diff --git a/62_CAD/shaders/main_pipeline/fragment_shader.hlsl b/62_CAD/shaders/main_pipeline/fragment_shader.hlsl
index f9cd52ec3..2b44f24b6 100644
--- a/62_CAD/shaders/main_pipeline/fragment_shader.hlsl
+++ b/62_CAD/shaders/main_pipeline/fragment_shader.hlsl
@@ -508,7 +508,7 @@ float3 calculateDTMTriangleBarycentrics(in float2 v1, in float2 v2, in float2 v3
 
 float4 calculateDTMHeightColor(in DTMSettings dtmSettings, in float3 v[3], in float heightDeriv, in float2 fragPos, in float height)
 {
-    float4 outputColor = float4(0.0f, 0.0f, 0.0f, 1.0f);
+    float4 outputColor = float4(0.0f, 0.0f, 0.0f, 0.0f);
 
     // HEIGHT SHADING
     const uint32_t heightMapSize = dtmSettings.heightColorEntryCount;
@@ -542,7 +542,6 @@ float4 calculateDTMHeightColor(in DTMSettings dtmSettings, in float3 v[3], in fl
         convexPolygonSdf = max(convexPolygonSdf, line3Sdf);
         convexPolygonSdf = max(convexPolygonSdf, line4Sdf);
 
-        // TODO: separate
         outputColor.a = 1.0f - smoothstep(0.0f, globals.antiAliasingFactor * 2.0f, convexPolygonSdf);
 
         // calculate height color
@@ -683,6 +682,7 @@ float4 calculateDTMContourColor(in DTMSettings dtmSettings, in float3 v[3], in u
         }
     }
 
+    // TODO: comment next line to fix, figure if it was needed
     if(contourLinePointsIdx == 2)
     {
         nbl::hlsl::shapes::Line<float> lineSegment = nbl::hlsl::shapes::Line<float>::construct(contourLinePoints[0], contourLinePoints[1]);
@@ -704,9 +704,11 @@ float4 calculateDTMContourColor(in DTMSettings dtmSettings, in float3 v[3], in u
         
         outputColor.a = smoothstep(+globals.antiAliasingFactor, -globals.antiAliasingFactor, distance) * contourStyle.color.a;
         outputColor.rgb = contourStyle.color.rgb;
-    }
 
-    return outputColor;
+        return outputColor;
+    }
+    
+    return float4(0.0f, 0.0f, 0.0f, 0.0f);
 }
 
 float4 calculateDTMOutlineColor(in DTMSettings dtmSettings, in float3 v[3], in uint2 edgePoints[3], in PSInput psInput, in float3 baryCoord, in float height)
@@ -756,9 +758,6 @@ float4 calculateDTMOutlineColor(in DTMSettings dtmSettings, in float3 v[3], in u
     {
         for (int i = 0; i < 3; ++i)
         {
-            if (distances[i] > outlineThickness)
-                continue;
-
             const uint2 currentEdgePoints = edgePoints[i];
             float3 p0 = v[currentEdgePoints[0]];
             float3 p1 = v[currentEdgePoints[1]];
@@ -774,9 +773,6 @@ float4 calculateDTMOutlineColor(in DTMSettings dtmSettings, in float3 v[3], in u
     {
         for (int i = 0; i < 3; ++i)
         {
-            if (distances[i] > outlineThickness)
-                continue;
-
             const uint2 currentEdgePoints = edgePoints[i];
             float3 p0 = v[currentEdgePoints[0]];
             float3 p1 = v[currentEdgePoints[1]];
diff --git a/62_CAD/shaders/main_pipeline/vertex_shader.hlsl b/62_CAD/shaders/main_pipeline/vertex_shader.hlsl
index f726104b5..5d9189d34 100644
--- a/62_CAD/shaders/main_pipeline/vertex_shader.hlsl
+++ b/62_CAD/shaders/main_pipeline/vertex_shader.hlsl
@@ -78,12 +78,6 @@ void dilateHatch<false>(out float2 outOffsetVec, out float2 outUV, const float2
 
 PSInput main(uint vertexID : SV_VertexID)
 {
-    // TODO[Przemek]: Disable Everything here and do your own thing as we already discussed, but let's have the same PSInput data passed to fragment.
-    // your programmable pulling will use the baseVertexBufferAddress BDA address and `vertexID` to RawBufferLoad it's vertex. 
-    // ~~Later, most likely We will require pulling all 3 vertices of the triangle, that's where you need to know which triangle you're currently on, and instead of objectID = vertexID/4 which we currently do, you will do vertexID/3 and pull all 3 of it's vertices.~~
-    // Ok, brainfart, a vertex can belong to multiple triangles, I was thinking of AA but triangles share vertices, nevermind my comment above.
-    
-
     ClipProjectionData clipProjectionData;
     
     PSInput outV;
@@ -147,15 +141,22 @@ PSInput main(uint vertexID : SV_VertexID)
         );
 
         DTMSettings dtm = loadDTMSettings(mainObj.dtmSettingsIdx);
-        LineStyle outlineStyle = loadLineStyle(dtm.outlineLineStyleIdx);
-        LineStyle contourStyle = loadLineStyle(dtm.contourLineStyleIdx);
+
         // TODO: maybe move to fragment shader since we may have multiple contour styles later
-        const float screenSpaceOutlineWidth = outlineStyle.screenSpaceLineWidth + _static_cast<float>(_static_cast<pfloat64_t>(outlineStyle.worldSpaceLineWidth) * globals.screenToWorldRatio);
-        const float sdfOutlineThickness = screenSpaceOutlineWidth * 0.5f;
-        const float screenSpaceContourLineWidth = contourStyle.screenSpaceLineWidth + _static_cast<float>(_static_cast<pfloat64_t>(contourStyle.worldSpaceLineWidth) * globals.screenToWorldRatio);
-        const float sdfContourLineThickness = screenSpaceContourLineWidth * 0.5f;
-        outV.setOutlineThickness(sdfOutlineThickness);
-        outV.setContourLineThickness(sdfContourLineThickness);
+        if (dtm.drawOutlineEnabled())
+        {
+            LineStyle outlineStyle = loadLineStyle(dtm.outlineLineStyleIdx);
+            const float screenSpaceOutlineWidth = outlineStyle.screenSpaceLineWidth + _static_cast<float>(_static_cast<pfloat64_t>(outlineStyle.worldSpaceLineWidth) * globals.screenToWorldRatio);
+            const float sdfOutlineThickness = screenSpaceOutlineWidth * 0.5f;
+            outV.setOutlineThickness(sdfOutlineThickness);
+        }
+        if (dtm.drawContourEnabled())
+        {
+            LineStyle contourStyle = loadLineStyle(dtm.contourLineStyleIdx);
+            const float screenSpaceContourLineWidth = contourStyle.screenSpaceLineWidth + _static_cast<float>(_static_cast<pfloat64_t>(contourStyle.worldSpaceLineWidth) * globals.screenToWorldRatio);
+            const float sdfContourLineThickness = screenSpaceContourLineWidth * 0.5f;
+            outV.setContourLineThickness(sdfContourLineThickness);
+        }
 
         // full screen triangle (this will destroy outline, contour line and height drawing)
 #if 0

From ff1d0928f4bd85f7d5b259809bc6b20a6f4a3eba Mon Sep 17 00:00:00 2001
From: devsh <devsh@devsh.eu>
Date: Sun, 20 Apr 2025 03:38:22 +0200
Subject: [PATCH 174/529] make example use new enum without KHR suffix

---
 67_RayQueryGeometry/main.cpp | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/67_RayQueryGeometry/main.cpp b/67_RayQueryGeometry/main.cpp
index b34c474a0..aff687742 100644
--- a/67_RayQueryGeometry/main.cpp
+++ b/67_RayQueryGeometry/main.cpp
@@ -806,7 +806,7 @@ class RayQueryGeometryApp final : public examples::SimpleWindowedApplication, pu
 
 					auto blasFlags = bitflag(IGPUBottomLevelAccelerationStructure::BUILD_FLAGS::PREFER_FAST_TRACE_BIT) | IGPUBottomLevelAccelerationStructure::BUILD_FLAGS::ALLOW_COMPACTION_BIT;
 					if (m_physicalDevice->getProperties().limits.rayTracingPositionFetch)
-						blasFlags |= IGPUBottomLevelAccelerationStructure::BUILD_FLAGS::ALLOW_DATA_ACCESS_KHR;
+						blasFlags |= IGPUBottomLevelAccelerationStructure::BUILD_FLAGS::ALLOW_DATA_ACCESS;
 
 					blasBuildInfos[i].buildFlags = blasFlags;
 					blasBuildInfos[i].geometryCount = 1;	// only 1 geometry object per blas

From 99cf5d862560a752f6491192a136154c5868fd84 Mon Sep 17 00:00:00 2001
From: keptsecret <sorchon@gmail.com>
Date: Mon, 21 Apr 2025 14:50:17 +0700
Subject: [PATCH 175/529] coalesced load/store data

---
 .../app_resources/benchmarkSubgroup.comp.hlsl | 32 ++++---------------
 .../app_resources/shaderCommon.hlsl           | 15 +++++----
 .../app_resources/testSubgroup.comp.hlsl      |  4 +--
 73_ArithmeticBench/main.cpp                   |  2 +-
 4 files changed, 18 insertions(+), 35 deletions(-)

diff --git a/73_ArithmeticBench/app_resources/benchmarkSubgroup.comp.hlsl b/73_ArithmeticBench/app_resources/benchmarkSubgroup.comp.hlsl
index 4715f0abf..3dd24e432 100644
--- a/73_ArithmeticBench/app_resources/benchmarkSubgroup.comp.hlsl
+++ b/73_ArithmeticBench/app_resources/benchmarkSubgroup.comp.hlsl
@@ -7,9 +7,9 @@
 // NOTE added dummy output image to be able to profile with Nsight, which still doesn't support profiling headless compute shaders
 [[vk::binding(2, 0)]] RWTexture2D<float32_t4> outImage; // dummy
 
-uint32_t globalIndex()
+uint32_t globalFirstItemIndex(uint32_t itemIdx)
 {
-    return nbl::hlsl::glsl::gl_WorkGroupID().x*WORKGROUP_SIZE+nbl::hlsl::workgroup::SubgroupContiguousIndex();
+    return nbl::hlsl::glsl::gl_WorkGroupID().x*WORKGROUP_SIZE*ITEMS_PER_INVOCATION+((nbl::hlsl::glsl::gl_SubgroupID()*ITEMS_PER_INVOCATION+itemIdx)<<SUBGROUP_SIZE_LOG2);
 }
 
 bool canStore() {return true;}
@@ -18,22 +18,6 @@ bool canStore() {return true;}
 #error "Define NUM_LOOPS!"
 #endif
 
-// template<template<class> class binop, typename T, uint32_t N>
-// static void subbench(NBL_CONST_REF_ARG(type_t) sourceVal)
-// {
-//     using config_t = nbl::hlsl::subgroup::Configuration<SUBGROUP_SIZE_LOG2>;
-//     using params_t = nbl::hlsl::subgroup2::ArithmeticParams<config_t, typename binop<T>::base_t, N, nbl::hlsl::jit::device_capabilities>;
-
-//     const uint32_t storeAddr = sizeof(uint32_t) + sizeof(type_t) * globalIndex();
-
-//     operation_t<params_t> func;
-//     [unroll]
-//     for (uint32_t i = 0; i < NUM_LOOPS; i++)
-//     {
-//         const uint32_t arrIndex = i & 7u;   // i % 8
-//         output[arrIndex].template Store<type_t>(storeAddr, func(sourceVal));
-//     }
-// }
 
 template<template<class> class binop, typename T, uint32_t N>
 static void subbench(NBL_CONST_REF_ARG(type_t) sourceVal)
@@ -47,22 +31,20 @@ static void subbench(NBL_CONST_REF_ARG(type_t) sourceVal)
     for (uint32_t i = 0; i < NUM_LOOPS; i++)
         value = func(value);
 
-    output[binop<T>::BindingIndex].template Store<type_t>(sizeof(uint32_t) + sizeof(type_t) * globalIndex(), value);
+    [unroll]
+    for (uint32_t i = 0; i < ITEMS_PER_INVOCATION; i++)
+        output[binop<T>::BindingIndex].template Store<uint32_t>(sizeof(uint32_t) + sizeof(uint32_t) * (globalFirstItemIndex(i) + nbl::hlsl::glsl::gl_SubgroupInvocationID()), value[i]);
 }
 
 void benchmark()
 {
-    const uint32_t idx = globalIndex() * ITEMS_PER_INVOCATION;
+    const uint32_t idx = nbl::hlsl::glsl::gl_SubgroupInvocationID();
     type_t sourceVal;
-// #if ITEMS_PER_INVOCATION > 1
     [unroll]
     for (uint32_t i = 0; i < ITEMS_PER_INVOCATION; i++)
     {
-        sourceVal[i] = inputValue[idx + i];
+        sourceVal[i] = inputValue[globalFirstItemIndex(i) + idx];
     }
-// #else
-//     sourceVal = inputValue[idx];
-// #endif
 
     subbench<bit_and, uint32_t, ITEMS_PER_INVOCATION>(sourceVal);
     subbench<bit_xor, uint32_t, ITEMS_PER_INVOCATION>(sourceVal);
diff --git a/73_ArithmeticBench/app_resources/shaderCommon.hlsl b/73_ArithmeticBench/app_resources/shaderCommon.hlsl
index 3fdd3c986..f7ee1892c 100644
--- a/73_ArithmeticBench/app_resources/shaderCommon.hlsl
+++ b/73_ArithmeticBench/app_resources/shaderCommon.hlsl
@@ -15,8 +15,8 @@ uint32_t3 nbl::hlsl::glsl::gl_WorkGroupSize() {return uint32_t3(WORKGROUP_SIZE,1
 [[vk::binding(0, 0)]] StructuredBuffer<uint32_t> inputValue;
 [[vk::binding(1, 0)]] RWByteAddressBuffer output[8];
 
-// because subgroups don't match `gl_LocalInvocationIndex` snake curve addressing, we also can't load inputs that way
-uint32_t globalIndex();
+// to get next item, move by subgroupSize
+uint32_t globalFirstItemIndex(uint32_t itemIdx);
 // since we test ITEMS_PER_WG<WorkgroupSize we need this so workgroups don't overwrite each other's outputs
 bool canStore();
 
@@ -53,8 +53,13 @@ static void subtest(NBL_CONST_REF_ARG(type_t) sourceVal)
         output[binop<T>::BindingIndex].template Store<uint32_t>(0,nbl::hlsl::glsl::gl_SubgroupSize());
         
     operation_t<params_t> func;
+    type_t value = func(sourceVal);
     if (canStore())
-        output[binop<T>::BindingIndex].template Store<type_t>(sizeof(uint32_t)+sizeof(type_t)*globalIndex(),func(sourceVal));
+    {
+        [unroll]
+        for (uint32_t i = 0; i < ITEMS_PER_INVOCATION; i++)
+            output[binop<T>::BindingIndex].template Store<uint32_t>(sizeof(uint32_t) + sizeof(uint32_t) * (globalFirstItemIndex(i) + nbl::hlsl::glsl::gl_SubgroupInvocationID()), value[i]);
+    }
 }
 
 
@@ -62,15 +67,11 @@ type_t test()
 {
     const uint32_t idx = globalIndex() * ITEMS_PER_INVOCATION;
     type_t sourceVal;
-// #if ITEMS_PER_INVOCATION > 1
     [unroll]
     for (uint32_t i = 0; i < ITEMS_PER_INVOCATION; i++)
     {
         sourceVal[i] = inputValue[idx + i];
     }
-// #else
-//     sourceVal = inputValue[idx];
-// #endif
 
     subtest<bit_and, uint32_t, ITEMS_PER_INVOCATION>(sourceVal);
     subtest<bit_xor, uint32_t, ITEMS_PER_INVOCATION>(sourceVal);
diff --git a/73_ArithmeticBench/app_resources/testSubgroup.comp.hlsl b/73_ArithmeticBench/app_resources/testSubgroup.comp.hlsl
index 2cc1ccb60..0001d39e0 100644
--- a/73_ArithmeticBench/app_resources/testSubgroup.comp.hlsl
+++ b/73_ArithmeticBench/app_resources/testSubgroup.comp.hlsl
@@ -4,9 +4,9 @@
 
 #include "shaderCommon.hlsl"
 
-uint32_t globalIndex()
+uint32_t globalFirstItemIndex(uint32_t itemIdx)
 {
-    return nbl::hlsl::glsl::gl_WorkGroupID().x*WORKGROUP_SIZE+nbl::hlsl::workgroup::SubgroupContiguousIndex();
+    return nbl::hlsl::glsl::gl_WorkGroupID().x*WORKGROUP_SIZE*ITEMS_PER_INVOCATION+((nbl::hlsl::glsl::gl_SubgroupID()*ITEMS_PER_INVOCATION+itemIdx)<<SUBGROUP_SIZE_LOG2);
 }
 
 bool canStore() {return true;}
diff --git a/73_ArithmeticBench/main.cpp b/73_ArithmeticBench/main.cpp
index d129cfaf9..2d57c131c 100644
--- a/73_ArithmeticBench/main.cpp
+++ b/73_ArithmeticBench/main.cpp
@@ -887,7 +887,7 @@ class ArithmeticBenchApp final : public examples::SimpleWindowedApplication, pub
 	template<class BinOp>
 	using ArithmeticOp = emulatedReduction<BinOp>;	// change this to test other arithmetic ops
 
-	bool b_runTests = false;
+	bool b_runTests = true;
 	uint32_t* inputData = nullptr;
 	uint32_t ItemsPerInvocation = 4u;
 	constexpr static inline uint32_t OutputBufferCount = 8u;

From a3bb526405ce95bafadd34e9307ec526ad6854b4 Mon Sep 17 00:00:00 2001
From: keptsecret <sorchon@gmail.com>
Date: Mon, 21 Apr 2025 15:49:58 +0700
Subject: [PATCH 176/529] fixed some bugs

---
 73_ArithmeticBench/app_resources/shaderCommon.hlsl | 7 ++++---
 1 file changed, 4 insertions(+), 3 deletions(-)

diff --git a/73_ArithmeticBench/app_resources/shaderCommon.hlsl b/73_ArithmeticBench/app_resources/shaderCommon.hlsl
index f7ee1892c..f4fc9d23a 100644
--- a/73_ArithmeticBench/app_resources/shaderCommon.hlsl
+++ b/73_ArithmeticBench/app_resources/shaderCommon.hlsl
@@ -1,6 +1,7 @@
 #include "common.hlsl"
 
 #include "nbl/builtin/hlsl/glsl_compat/core.hlsl"
+#include "nbl/builtin/hlsl/workgroup/basic.hlsl"
 #include "nbl/builtin/hlsl/subgroup/basic.hlsl"
 #include "nbl/builtin/hlsl/subgroup/arithmetic_portability.hlsl"
 #include "nbl/builtin/hlsl/subgroup2/arithmetic_portability.hlsl"
@@ -49,7 +50,7 @@ static void subtest(NBL_CONST_REF_ARG(type_t) sourceVal)
     using config_t = nbl::hlsl::subgroup2::Configuration<SUBGROUP_SIZE_LOG2>;
     using params_t = nbl::hlsl::subgroup2::ArithmeticParams<config_t, typename binop<T>::base_t, N, nbl::hlsl::jit::device_capabilities>;
 
-    if (globalIndex()==0u)
+    if (nbl::hlsl::glsl::gl_WorkGroupID().x*WORKGROUP_SIZE+nbl::hlsl::workgroup::SubgroupContiguousIndex()==0u)
         output[binop<T>::BindingIndex].template Store<uint32_t>(0,nbl::hlsl::glsl::gl_SubgroupSize());
         
     operation_t<params_t> func;
@@ -65,12 +66,12 @@ static void subtest(NBL_CONST_REF_ARG(type_t) sourceVal)
 
 type_t test()
 {
-    const uint32_t idx = globalIndex() * ITEMS_PER_INVOCATION;
+    const uint32_t idx = nbl::hlsl::glsl::gl_SubgroupInvocationID();
     type_t sourceVal;
     [unroll]
     for (uint32_t i = 0; i < ITEMS_PER_INVOCATION; i++)
     {
-        sourceVal[i] = inputValue[idx + i];
+        sourceVal[i] = inputValue[globalFirstItemIndex(i) + idx];
     }
 
     subtest<bit_and, uint32_t, ITEMS_PER_INVOCATION>(sourceVal);

From 355c605d211400626b947a4d38f439d8c944e539 Mon Sep 17 00:00:00 2001
From: keptsecret <sorchon@gmail.com>
Date: Mon, 21 Apr 2025 16:58:37 +0700
Subject: [PATCH 177/529] disable test by default

---
 73_ArithmeticBench/imgui.ini | 5 +++++
 73_ArithmeticBench/main.cpp  | 2 +-
 2 files changed, 6 insertions(+), 1 deletion(-)
 create mode 100644 73_ArithmeticBench/imgui.ini

diff --git a/73_ArithmeticBench/imgui.ini b/73_ArithmeticBench/imgui.ini
new file mode 100644
index 000000000..4a5c20148
--- /dev/null
+++ b/73_ArithmeticBench/imgui.ini
@@ -0,0 +1,5 @@
+[Window][Debug##Default]
+Pos=60,60
+Size=400,400
+Collapsed=0
+
diff --git a/73_ArithmeticBench/main.cpp b/73_ArithmeticBench/main.cpp
index 2d57c131c..d129cfaf9 100644
--- a/73_ArithmeticBench/main.cpp
+++ b/73_ArithmeticBench/main.cpp
@@ -887,7 +887,7 @@ class ArithmeticBenchApp final : public examples::SimpleWindowedApplication, pub
 	template<class BinOp>
 	using ArithmeticOp = emulatedReduction<BinOp>;	// change this to test other arithmetic ops
 
-	bool b_runTests = true;
+	bool b_runTests = false;
 	uint32_t* inputData = nullptr;
 	uint32_t ItemsPerInvocation = 4u;
 	constexpr static inline uint32_t OutputBufferCount = 8u;

From 95ed1adb218dfc3159ace39ff69a852e81913cda Mon Sep 17 00:00:00 2001
From: Erfan Ahmadi <ahmadierfan99@gmail.com>
Date: Mon, 21 Apr 2025 14:41:23 +0330
Subject: [PATCH 178/529] apply inverse gamma on colors

---
 62_CAD/shaders/main_pipeline/common.hlsl          | 9 +++++++++
 62_CAD/shaders/main_pipeline/fragment_shader.hlsl | 7 ++++++-
 62_CAD/shaders/main_pipeline/resolve_alphas.hlsl  | 5 +++++
 62_CAD/shaders/main_pipeline/vertex_shader.hlsl   | 5 +++++
 4 files changed, 25 insertions(+), 1 deletion(-)

diff --git a/62_CAD/shaders/main_pipeline/common.hlsl b/62_CAD/shaders/main_pipeline/common.hlsl
index 4327cf7fe..3c12a3dcf 100644
--- a/62_CAD/shaders/main_pipeline/common.hlsl
+++ b/62_CAD/shaders/main_pipeline/common.hlsl
@@ -3,6 +3,15 @@
 
 #include "../globals.hlsl"
 
+// This function soley exists to match n4ce's behaviour, colors and color operations for DTMs, Curves, Lines, Hatches are done in linear space and then outputted to linear surface (as if surface had UNORM format, but ours is SRGB)
+// We should do gamma "uncorrection" to account for the fact that our surface format is SRGB and will do gamma correction
+void gammaUncorrect(inout float3 col)
+{
+    bool outputToSRGB = true; // TODO
+    float gamma = (outputToSRGB) ? 2.2f : 1.0f;
+    col.rgb = pow(col.rgb, gamma);
+}
+
 // TODO: Use these in C++ as well once numeric_limits<uint32_t> compiles on C++
 float32_t2 unpackCurveBoxUnorm(uint32_t2 value)
 {
diff --git a/62_CAD/shaders/main_pipeline/fragment_shader.hlsl b/62_CAD/shaders/main_pipeline/fragment_shader.hlsl
index 2b44f24b6..240bf967f 100644
--- a/62_CAD/shaders/main_pipeline/fragment_shader.hlsl
+++ b/62_CAD/shaders/main_pipeline/fragment_shader.hlsl
@@ -409,7 +409,11 @@ float32_t4 calculateFinalColor<true>(const uint2 fragCoord, const float localAlp
     // draw with previous geometry's style's color or stored in texture buffer :kek:
     // we don't need to load the style's color in critical section because we've already retrieved the style index from the stored main obj
     if (toResolveStyleIdx != InvalidStyleIdx) // if toResolveStyleIdx is valid then that means our resolved color should come from line style
+    {
         color = loadLineStyle(toResolveStyleIdx).color;
+        gammaUncorrect(color.rgb); // want to output to SRGB without gamma correction
+    }
+    
     color.a *= float(storedQuantizedAlpha) / 255.f;
     
     return color;
@@ -852,7 +856,8 @@ float4 fragMain(PSInput input) : SV_TARGET
 
         textureColor = dtmColor.rgb;
         localAlpha = dtmColor.a;
-
+        
+        gammaUncorrect(textureColor); // want to output to SRGB without gamma correction
         return calculateFinalColor<nbl::hlsl::jit::device_capabilities::fragmentShaderPixelInterlock>(uint2(input.position.xy), localAlpha, currentMainObjectIdx, textureColor, true);
     }
     else
diff --git a/62_CAD/shaders/main_pipeline/resolve_alphas.hlsl b/62_CAD/shaders/main_pipeline/resolve_alphas.hlsl
index 987dd7c29..86257428f 100644
--- a/62_CAD/shaders/main_pipeline/resolve_alphas.hlsl
+++ b/62_CAD/shaders/main_pipeline/resolve_alphas.hlsl
@@ -60,10 +60,15 @@ float32_t4 calculateFinalColor<true>(const uint2 fragCoord)
     if (!resolve)
         discard;
 
+
     // draw with previous geometry's style's color or stored in texture buffer :kek:
     // we don't need to load the style's color in critical section because we've already retrieved the style index from the stored main obj
     if (toResolveStyleIdx != InvalidStyleIdx) // if toResolveStyleIdx is valid then that means our resolved color should come from line style
+    {
         color = loadLineStyle(toResolveStyleIdx).color;
+        gammaUncorrect(color.rgb); // want to output to SRGB without gamma correction
+    }
+
     color.a *= float(storedQuantizedAlpha) / 255.f;
     
     return color;
diff --git a/62_CAD/shaders/main_pipeline/vertex_shader.hlsl b/62_CAD/shaders/main_pipeline/vertex_shader.hlsl
index 5d9189d34..4c1c74a93 100644
--- a/62_CAD/shaders/main_pipeline/vertex_shader.hlsl
+++ b/62_CAD/shaders/main_pipeline/vertex_shader.hlsl
@@ -26,7 +26,12 @@ float2 QuadraticBezier(float2 p0, float2 p1, float2 p2, float t)
 ClipProjectionData getClipProjectionData(in MainObject mainObj)
 {
     if (mainObj.clipProjectionIndex != InvalidClipProjectionIndex)
+    {
+#ifdef NBL_2D_SHOWCASE_MODE
+        return nbl::hlsl::mul(globals.defaultClipProjection.projectionToNDC, loadCustomClipProjection(mainObj.clipProjectionIndex));
+#endif
         return loadCustomClipProjection(mainObj.clipProjectionIndex);
+    }
     else
         return globals.defaultClipProjection;
 }

From 244b0d0a25fefbd9e1274007a59f390c76ddfbeb Mon Sep 17 00:00:00 2001
From: Erfan Ahmadi <ahmadierfan99@gmail.com>
Date: Mon, 21 Apr 2025 14:57:46 +0330
Subject: [PATCH 179/529] screenToWorldRatio doesn't need to be double, only
 used to transform worldspace lineWidth

---
 62_CAD/main.cpp                                 | 4 ++--
 62_CAD/shaders/globals.hlsl                     | 6 +++---
 62_CAD/shaders/main_pipeline/vertex_shader.hlsl | 6 +++---
 3 files changed, 8 insertions(+), 8 deletions(-)

diff --git a/62_CAD/main.cpp b/62_CAD/main.cpp
index f988458b9..6b4217202 100644
--- a/62_CAD/main.cpp
+++ b/62_CAD/main.cpp
@@ -1216,9 +1216,9 @@ class ComputerAidedDesign final : public examples::SimpleWindowedApplication, pu
 		globalData.defaultClipProjection.projectionToNDC = projectionToNDC;
 		globalData.defaultClipProjection.minClipNDC = float32_t2(-1.0, -1.0);
 		globalData.defaultClipProjection.maxClipNDC = float32_t2(+1.0, +1.0);
-		auto screenToWorld = getScreenToWorldRatio(globalData.defaultClipProjection.projectionToNDC, globalData.resolution);
+		float screenToWorld = getScreenToWorldRatio(globalData.defaultClipProjection.projectionToNDC, globalData.resolution);
 		globalData.screenToWorldRatio = screenToWorld;
-		globalData.worldToScreenRatio = (1.0/screenToWorld);
+		globalData.worldToScreenRatio = (1.0f/screenToWorld);
 		globalData.miterLimit = 10.0f;
 		globalData.currentlyActiveMainObjectIndex = drawResourcesFiller.getActiveMainObjectIndex();
 		SBufferRange<IGPUBuffer> globalBufferUpdateRange = { .offset = 0ull, .size = sizeof(Globals), .buffer = m_globalsBuffer.get() };
diff --git a/62_CAD/shaders/globals.hlsl b/62_CAD/shaders/globals.hlsl
index 2c86b9732..09e809a59 100644
--- a/62_CAD/shaders/globals.hlsl
+++ b/62_CAD/shaders/globals.hlsl
@@ -71,8 +71,8 @@ struct Globals
 {
     Pointers pointers;
     ClipProjectionData defaultClipProjection;
-    pfloat64_t screenToWorldRatio;
-    pfloat64_t worldToScreenRatio;
+    float screenToWorldRatio;
+    float worldToScreenRatio;
     uint32_t2 resolution;
     float antiAliasingFactor;
     uint32_t miterLimit;
@@ -80,7 +80,7 @@ struct Globals
     float32_t _padding;
 };
 #ifndef __HLSL_VERSION
-static_assert(sizeof(Globals) == 176u);
+static_assert(sizeof(Globals) == 168u);
 #endif
 
 #ifdef __HLSL_VERSION
diff --git a/62_CAD/shaders/main_pipeline/vertex_shader.hlsl b/62_CAD/shaders/main_pipeline/vertex_shader.hlsl
index 4c1c74a93..479d05888 100644
--- a/62_CAD/shaders/main_pipeline/vertex_shader.hlsl
+++ b/62_CAD/shaders/main_pipeline/vertex_shader.hlsl
@@ -151,14 +151,14 @@ PSInput main(uint vertexID : SV_VertexID)
         if (dtm.drawOutlineEnabled())
         {
             LineStyle outlineStyle = loadLineStyle(dtm.outlineLineStyleIdx);
-            const float screenSpaceOutlineWidth = outlineStyle.screenSpaceLineWidth + _static_cast<float>(_static_cast<pfloat64_t>(outlineStyle.worldSpaceLineWidth) * globals.screenToWorldRatio);
+            const float screenSpaceOutlineWidth = outlineStyle.screenSpaceLineWidth + outlineStyle.worldSpaceLineWidth * globals.screenToWorldRatio;
             const float sdfOutlineThickness = screenSpaceOutlineWidth * 0.5f;
             outV.setOutlineThickness(sdfOutlineThickness);
         }
         if (dtm.drawContourEnabled())
         {
             LineStyle contourStyle = loadLineStyle(dtm.contourLineStyleIdx);
-            const float screenSpaceContourLineWidth = contourStyle.screenSpaceLineWidth + _static_cast<float>(_static_cast<pfloat64_t>(contourStyle.worldSpaceLineWidth) * globals.screenToWorldRatio);
+            const float screenSpaceContourLineWidth = contourStyle.screenSpaceLineWidth + contourStyle.worldSpaceLineWidth * globals.screenToWorldRatio;
             const float sdfContourLineThickness = screenSpaceContourLineWidth * 0.5f;
             outV.setContourLineThickness(sdfContourLineThickness);
         }
@@ -196,7 +196,7 @@ PSInput main(uint vertexID : SV_VertexID)
             LineStyle lineStyle = loadLineStyle(mainObj.styleIdx);
 
             // Width is on both sides, thickness is one one side of the curve (div by 2.0f)
-            const float screenSpaceLineWidth = lineStyle.screenSpaceLineWidth + _static_cast<float>(_static_cast<pfloat64_t>(lineStyle.worldSpaceLineWidth) * globals.screenToWorldRatio);
+            const float screenSpaceLineWidth = lineStyle.screenSpaceLineWidth + lineStyle.worldSpaceLineWidth * globals.screenToWorldRatio;
             const float antiAliasedLineThickness = screenSpaceLineWidth * 0.5f + globals.antiAliasingFactor;
             const float sdfLineThickness = screenSpaceLineWidth / 2.0f;
             outV.setLineThickness(sdfLineThickness);

From b36e702ed6f9e57e966e450b5d3bbb7e8681165d Mon Sep 17 00:00:00 2001
From: Erfan Ahmadi <ahmadierfan99@gmail.com>
Date: Mon, 21 Apr 2025 16:07:22 +0330
Subject: [PATCH 180/529] cleanups and setup work for multiple contours

---
 62_CAD/CTriangleMesh.h                        |  22 ++--
 62_CAD/DrawResourcesFiller.cpp                |  25 ++--
 62_CAD/DrawResourcesFiller.h                  |   2 +-
 62_CAD/main.cpp                               |  21 ++--
 62_CAD/shaders/globals.hlsl                   |  76 +++++++-----
 62_CAD/shaders/main_pipeline/common.hlsl      |  11 +-
 .../main_pipeline/fragment_shader.hlsl        | 110 +++++++++---------
 .../shaders/main_pipeline/vertex_shader.hlsl  |  29 +----
 8 files changed, 142 insertions(+), 154 deletions(-)

diff --git a/62_CAD/CTriangleMesh.h b/62_CAD/CTriangleMesh.h
index 16995c28a..67daf5221 100644
--- a/62_CAD/CTriangleMesh.h
+++ b/62_CAD/CTriangleMesh.h
@@ -6,7 +6,7 @@
 
 using namespace nbl;
 
-struct DTMHeightShadingInfo
+struct DTMHeightShadingSettingsInfo
 {
 	// Height Shading Mode
 	E_HEIGHT_SHADING_MODE heightShadingMode;
@@ -35,15 +35,15 @@ struct DTMHeightShadingInfo
 	bool fillShaderDTMSettingsHeightColorMap(DTMSettings& dtmSettings) const
 	{
 		const uint32_t mapSize = heightColorSet.size();
-		if (mapSize > DTMSettings::HeightColorMapMaxEntries)
+		if (mapSize > DTMHeightShadingSettings::HeightColorMapMaxEntries)
 			return false;
-		dtmSettings.heightColorEntryCount = mapSize;
+		dtmSettings.heightShadingSettings.heightColorEntryCount = mapSize;
 
 		int index = 0;
 		for (auto it = heightColorSet.begin(); it != heightColorSet.end(); ++it)
 		{
-			dtmSettings.heightColorMapHeights[index] = it->height;
-			dtmSettings.heightColorMapColors[index] = it->color;
+			dtmSettings.heightShadingSettings.heightColorMapHeights[index] = it->height;
+			dtmSettings.heightShadingSettings.heightColorMapColors[index] = it->color;
 			++index;
 		}
 
@@ -65,7 +65,7 @@ struct DTMHeightShadingInfo
 	std::set<HeightColor> heightColorSet;
 };
 
-struct DTMContourInfo
+struct DTMContourSettingsInfo
 {
 	LineStyleInfo lineStyleInfo;
 
@@ -76,11 +76,17 @@ struct DTMContourInfo
 
 struct DTMSettingsInfo
 {
+	static constexpr uint32_t MaxContourSettings = DTMSettings::MaxContourSettings;
+
 	uint32_t mode = 0u;
 
-	DTMHeightShadingInfo heightShadingInfo;
-	DTMContourInfo contourInfo;
+	// outline
 	LineStyleInfo outlineStyleInfo;
+	// contours
+	uint32_t contourSettingsCount = 0u;
+	DTMContourSettingsInfo contourSettings[MaxContourSettings];
+	// height shading
+	DTMHeightShadingSettingsInfo heightShadingInfo;
 };
 
 class CTriangleMesh final
diff --git a/62_CAD/DrawResourcesFiller.cpp b/62_CAD/DrawResourcesFiller.cpp
index c6d898a7c..30fb6d748 100644
--- a/62_CAD/DrawResourcesFiller.cpp
+++ b/62_CAD/DrawResourcesFiller.cpp
@@ -627,7 +627,8 @@ uint32_t DrawResourcesFiller::addLineStyle_Internal(const LineStyleInfo& lineSty
 uint32_t DrawResourcesFiller::addDTMSettings_Internal(const DTMSettingsInfo& dtmSettingsInfo, SIntendedSubmitInfo& intendedNextSubmit)
 {
 	const size_t remainingResourcesSize = calculateRemainingResourcesSize();
-	const size_t maxMemRequired = sizeof(DTMSettings) + 2 * sizeof(LineStyle);
+	const size_t noOfLineStylesRequired = ((dtmSettingsInfo.mode & E_DTM_MODE::OUTLINE) ? 1u : 0u) + dtmSettingsInfo.contourSettingsCount;
+	const size_t maxMemRequired = sizeof(DTMSettings) + noOfLineStylesRequired * sizeof(LineStyle);
 	const bool enoughMem = remainingResourcesSize >= maxMemRequired; // enough remaining memory for 1 more dtm settings with 2 referenced line styles?
 
 	if (!enoughMem)
@@ -644,25 +645,29 @@ uint32_t DrawResourcesFiller::addDTMSettings_Internal(const DTMSettingsInfo& dtm
 		switch (dtmSettingsInfo.heightShadingInfo.heightShadingMode)
 		{
 		case E_HEIGHT_SHADING_MODE::DISCRETE_VARIABLE_LENGTH_INTERVALS:
-			dtmSettings.intervalLength = std::numeric_limits<float>::infinity();
+			dtmSettings.heightShadingSettings.intervalLength = std::numeric_limits<float>::infinity();
 			break;
 		case E_HEIGHT_SHADING_MODE::DISCRETE_FIXED_LENGTH_INTERVALS:
-			dtmSettings.intervalLength = dtmSettingsInfo.heightShadingInfo.intervalLength;
+			dtmSettings.heightShadingSettings.intervalLength = dtmSettingsInfo.heightShadingInfo.intervalLength;
 			break;
 		case E_HEIGHT_SHADING_MODE::CONTINOUS_INTERVALS:
-			dtmSettings.intervalLength = 0.0f;
+			dtmSettings.heightShadingSettings.intervalLength = 0.0f;
 			break;
 		}
-		dtmSettings.intervalIndexToHeightMultiplier = dtmSettingsInfo.heightShadingInfo.intervalIndexToHeightMultiplier;
-		dtmSettings.isCenteredShading = static_cast<int>(dtmSettingsInfo.heightShadingInfo.isCenteredShading);
+		dtmSettings.heightShadingSettings.intervalIndexToHeightMultiplier = dtmSettingsInfo.heightShadingInfo.intervalIndexToHeightMultiplier;
+		dtmSettings.heightShadingSettings.isCenteredShading = static_cast<int>(dtmSettingsInfo.heightShadingInfo.isCenteredShading);
 		_NBL_DEBUG_BREAK_IF(!dtmSettingsInfo.heightShadingInfo.fillShaderDTMSettingsHeightColorMap(dtmSettings));
 	}
 	if (dtmSettings.mode & E_DTM_MODE::CONTOUR)
 	{
-		dtmSettings.contourLinesStartHeight = dtmSettingsInfo.contourInfo.startHeight;
-		dtmSettings.contourLinesEndHeight = dtmSettingsInfo.contourInfo.endHeight;
-		dtmSettings.contourLinesHeightInterval = dtmSettingsInfo.contourInfo.heightInterval;
-		dtmSettings.contourLineStyleIdx = addLineStyle_Internal(dtmSettingsInfo.contourInfo.lineStyleInfo);
+		dtmSettings.contourSettingsCount = dtmSettingsInfo.contourSettingsCount;
+		for (uint32_t i = 0u; i < dtmSettings.contourSettingsCount; ++i)
+		{
+			dtmSettings.contourSettings[i].contourLinesStartHeight = dtmSettingsInfo.contourSettings[i].startHeight;
+			dtmSettings.contourSettings[i].contourLinesEndHeight = dtmSettingsInfo.contourSettings[i].endHeight;
+			dtmSettings.contourSettings[i].contourLinesHeightInterval = dtmSettingsInfo.contourSettings[i].heightInterval;
+			dtmSettings.contourSettings[i].contourLineStyleIdx = addLineStyle_Internal(dtmSettingsInfo.contourSettings[i].lineStyleInfo);
+		}
 	}
 	if (dtmSettings.mode & E_DTM_MODE::OUTLINE)
 	{
diff --git a/62_CAD/DrawResourcesFiller.h b/62_CAD/DrawResourcesFiller.h
index 196ba6885..1e244ae01 100644
--- a/62_CAD/DrawResourcesFiller.h
+++ b/62_CAD/DrawResourcesFiller.h
@@ -129,7 +129,7 @@ struct DrawResourcesFiller
 	{
 		// for auto-submission to work correctly, memory needs to serve at least 2 linestyle, 1 dtm settings, 1 clip proj, 1 main obj, 1 draw obj and 512 bytes of additional mem for geometries and index buffer
 		// this is the ABSOLUTE MINIMUM (if this value is used rendering will probably be as slow as CPU drawing :D)
-		return core::alignUp(sizeof(LineStyle) * 2u + sizeof(DTMSettings) + sizeof(ClipProjectionData) + sizeof(MainObject) + sizeof(DrawObject) + 512ull, ResourcesMaxNaturalAlignment);
+		return core::alignUp(sizeof(LineStyle) + sizeof(LineStyle) * DTMSettings::MaxContourSettings + sizeof(DTMSettings) + sizeof(ClipProjectionData) + sizeof(MainObject) + sizeof(DrawObject) + 512ull, ResourcesMaxNaturalAlignment);
 	}
 
 	void allocateResourcesBuffer(ILogicalDevice* logicalDevice, size_t size);
diff --git a/62_CAD/main.cpp b/62_CAD/main.cpp
index 6b4217202..0f1653591 100644
--- a/62_CAD/main.cpp
+++ b/62_CAD/main.cpp
@@ -3246,22 +3246,23 @@ class ComputerAidedDesign final : public examples::SimpleWindowedApplication, pu
 			mesh.setIndices(std::move(indices));
 
 			DTMSettingsInfo dtmInfo;
-			//dtmInfo.mode = E_DTM_MODE::HEIGHT_SHADING | E_DTM_MODE::CONTOUR | E_DTM_MODE::OUTLINE;
-			dtmInfo.mode = E_DTM_MODE::HEIGHT_SHADING;
+			dtmInfo.mode = E_DTM_MODE::HEIGHT_SHADING | E_DTM_MODE::CONTOUR | E_DTM_MODE::OUTLINE;
+			//dtmInfo.mode = E_DTM_MODE::HEIGHT_SHADING;
 			dtmInfo.outlineStyleInfo.screenSpaceLineWidth = 0.0f;
 			dtmInfo.outlineStyleInfo.worldSpaceLineWidth = 3.0f;
 			dtmInfo.outlineStyleInfo.color = float32_t4(0.0f, 0.39f, 0.0f, 1.0f);
 			std::array<double, 4> outlineStipplePattern = { 0.0f, -5.0f, 20.0f, -5.0f };
 			dtmInfo.outlineStyleInfo.setStipplePatternData(outlineStipplePattern);
 
-			dtmInfo.contourInfo.startHeight = 20;
-			dtmInfo.contourInfo.endHeight = 90;
-			dtmInfo.contourInfo.heightInterval = 10;
-			dtmInfo.contourInfo.lineStyleInfo.screenSpaceLineWidth = 0.0f;
-			dtmInfo.contourInfo.lineStyleInfo.worldSpaceLineWidth = 1.0f;
-			dtmInfo.contourInfo.lineStyleInfo.color = float32_t4(0.0f, 0.0f, 1.0f, 0.7f);
+			dtmInfo.contourSettingsCount = 1u;
+			dtmInfo.contourSettings[0u].startHeight = 20;
+			dtmInfo.contourSettings[0u].endHeight = 90;
+			dtmInfo.contourSettings[0u].heightInterval = 10;
+			dtmInfo.contourSettings[0u].lineStyleInfo.screenSpaceLineWidth = 0.0f;
+			dtmInfo.contourSettings[0u].lineStyleInfo.worldSpaceLineWidth = 1.0f;
+			dtmInfo.contourSettings[0u].lineStyleInfo.color = float32_t4(0.0f, 0.0f, 1.0f, 0.7f);
 			std::array<double, 4> contourStipplePattern = { 0.0f, -5.0f, 10.0f, -5.0f };
-			dtmInfo.contourInfo.lineStyleInfo.setStipplePatternData(contourStipplePattern);
+			dtmInfo.contourSettings[0u].lineStyleInfo.setStipplePatternData(contourStipplePattern);
 
 			// PRESS 1, 2, 3 TO SWITCH HEIGHT SHADING MODE
 			// 1 - DISCRETE_VARIABLE_LENGTH_INTERVALS
@@ -3311,7 +3312,7 @@ class ComputerAidedDesign final : public examples::SimpleWindowedApplication, pu
 
 			drawResourcesFiller.drawTriangleMesh(mesh, dtmInfo, intendedNextSubmit);
 
-			dtmInfo.contourInfo.lineStyleInfo.color = float32_t4(1.0f, 0.39f, 0.0f, 1.0f);
+			dtmInfo.contourSettings[0u].lineStyleInfo.color = float32_t4(1.0f, 0.39f, 0.0f, 1.0f);
 			dtmInfo.outlineStyleInfo.color = float32_t4(0.0f, 0.39f, 1.0f, 1.0f);
 			for (auto& v : mesh.m_vertices)
 			{
diff --git a/62_CAD/shaders/globals.hlsl b/62_CAD/shaders/globals.hlsl
index 09e809a59..bd700785d 100644
--- a/62_CAD/shaders/globals.hlsl
+++ b/62_CAD/shaders/globals.hlsl
@@ -368,20 +368,18 @@ enum class E_HEIGHT_SHADING_MODE : uint32_t
     CONTINOUS_INTERVALS
 };
     
-// Documentation and explanation of variables in DTMSettingsInfo
-struct DTMSettings
+struct DTMContourSettings
 {
-    const static uint32_t HeightColorMapMaxEntries = 16u;
-    uint32_t outlineLineStyleIdx; // index into line styles
     uint32_t contourLineStyleIdx; // index into line styles
-
-    uint32_t mode; // E_DTM_MODE
-
-    // contour lines
     float contourLinesStartHeight;
     float contourLinesEndHeight;
     float contourLinesHeightInterval;
+};
 
+struct DTMHeightShadingSettings
+{
+    const static uint32_t HeightColorMapMaxEntries = 16u;
+    
     // height-color map
     float intervalLength;
 	float intervalIndexToHeightMultiplier;
@@ -399,6 +397,24 @@ struct DTMSettings
             return E_HEIGHT_SHADING_MODE::CONTINOUS_INTERVALS;
         return E_HEIGHT_SHADING_MODE::DISCRETE_FIXED_LENGTH_INTERVALS;
     }
+};
+
+// Documentation and explanation of variables in DTMSettingsInfo
+struct DTMSettings
+{
+    const static uint32_t MaxContourSettings = 8u;
+
+    uint32_t mode; // E_DTM_MODE
+    
+    // outline
+    uint32_t outlineLineStyleIdx;
+
+    // contour lines
+    uint32_t contourSettingsCount;
+    DTMContourSettings contourSettings[MaxContourSettings];
+
+    // height shading
+    DTMHeightShadingSettings heightShadingSettings;
     
     bool drawOutlineEnabled() NBL_CONST_MEMBER_FUNC { return  (mode & E_DTM_MODE::OUTLINE) != 0u; } 
     bool drawContourEnabled() NBL_CONST_MEMBER_FUNC { return (mode & E_DTM_MODE::CONTOUR) != 0u; }
@@ -433,41 +449,39 @@ inline bool operator==(const DTMSettings& lhs, const DTMSettings& rhs)
     if (lhs.mode != rhs.mode)
         return false;
 
-    bool equal = true;
     if (lhs.drawOutlineEnabled())
     {
-        equal = lhs.outlineLineStyleIdx == rhs.outlineLineStyleIdx;
+        if (lhs.outlineLineStyleIdx != rhs.outlineLineStyleIdx)
+            return false;
     }
 
-    if (!equal)
-        return false;
-
     if (lhs.drawContourEnabled())
     {
-        float contourLinesStartHeight;
-        float contourLinesEndHeight;
-        float contourLinesHeightInterval;
-
-        equal = lhs.contourLinesStartHeight == rhs.contourLinesStartHeight &&
-            lhs.contourLinesStartHeight == rhs.contourLinesStartHeight &&
-            lhs.contourLinesStartHeight == rhs.contourLinesStartHeight;
+        if (lhs.contourSettingsCount != rhs.contourSettingsCount)
+            return false;
+        if (!memcmp(lhs.contourSettings, rhs.contourSettings, lhs.contourSettingsCount * sizeof(DTMContourSettings)))
+            return false;
     }
 
-    if (!equal)
-        return false;
-
     if (lhs.drawHeightShadingEnabled())
     {
-        equal = lhs.intervalLength == rhs.intervalLength &&
-            lhs.intervalIndexToHeightMultiplier == rhs.intervalIndexToHeightMultiplier &&
-            lhs.isCenteredShading == rhs.isCenteredShading &&
-            lhs.heightColorEntryCount == rhs.heightColorEntryCount;
-
-        equal == equal && (memcmp(lhs.heightColorMapHeights, rhs.heightColorMapHeights, lhs.heightColorEntryCount * sizeof(float)));
-        equal == equal && (memcmp(lhs.heightColorMapColors, rhs.heightColorMapColors, lhs.heightColorEntryCount * sizeof(float32_t4)));
+        if (lhs.heightShadingSettings.intervalLength != rhs.heightShadingSettings.intervalLength)
+            return false;
+        if (lhs.heightShadingSettings.intervalIndexToHeightMultiplier != rhs.heightShadingSettings.intervalIndexToHeightMultiplier)
+            return false;
+        if (lhs.heightShadingSettings.isCenteredShading != rhs.heightShadingSettings.isCenteredShading)
+            return false;
+        if (lhs.heightShadingSettings.heightColorEntryCount != rhs.heightShadingSettings.heightColorEntryCount)
+            return false;
+        
+                
+        if(!memcmp(lhs.heightShadingSettings.heightColorMapHeights, rhs.heightShadingSettings.heightColorMapHeights, lhs.heightShadingSettings.heightColorEntryCount * sizeof(float)))
+            return false;
+        if(!memcmp(lhs.heightShadingSettings.heightColorMapColors, rhs.heightShadingSettings.heightColorMapColors, lhs.heightShadingSettings.heightColorEntryCount * sizeof(float32_t4)))
+            return false;
     }
 
-    return equal;
+    return true;
 }
 #endif
 
diff --git a/62_CAD/shaders/main_pipeline/common.hlsl b/62_CAD/shaders/main_pipeline/common.hlsl
index 3c12a3dcf..631e421b9 100644
--- a/62_CAD/shaders/main_pipeline/common.hlsl
+++ b/62_CAD/shaders/main_pipeline/common.hlsl
@@ -224,16 +224,7 @@ struct PSInput
     void setImageTextureId(uint32_t textureId) { data2.x = asfloat(textureId); }
 
     /* TRIANGLE MESH */
-
-    float getOutlineThickness() { return asfloat(data1.z); }
-    float getContourLineThickness() { return asfloat(data1.w); }
-
-    void setOutlineThickness(float lineThickness) { data1.z = asuint(lineThickness); }
-    void setContourLineThickness(float stretch) { data1.w = asuint(stretch); }
-
-    void setHeight(float height) { interp_data5.x = height; }
-    float getHeight() { return interp_data5.x; }
-
+    
 #ifndef FRAGMENT_SHADER_INPUT // vertex shader
     void setScreenSpaceVertexAttribs(float3 pos) { vertexScreenSpacePos = pos; }
 #else // fragment shader
diff --git a/62_CAD/shaders/main_pipeline/fragment_shader.hlsl b/62_CAD/shaders/main_pipeline/fragment_shader.hlsl
index 240bf967f..fb6b6e8e8 100644
--- a/62_CAD/shaders/main_pipeline/fragment_shader.hlsl
+++ b/62_CAD/shaders/main_pipeline/fragment_shader.hlsl
@@ -337,12 +337,12 @@ typedef StyleClipper< nbl::hlsl::shapes::Line<float> > LineStyleClipper;
 // for usage in upper_bound function
 struct DTMSettingsHeightsAccessor
 {
-    DTMSettings dtmSettings;
+    DTMHeightShadingSettings settings;
     using value_type = float;
 
     float operator[](const uint32_t ix)
     {
-        return dtmSettings.heightColorMapHeights[ix];
+        return settings.heightColorMapHeights[ix];
     }
 };
 
@@ -470,25 +470,25 @@ float getIntervalPosition(in float height, in float minHeight, in float interval
         return ( (height - minHeight) / intervalLength );
 }
 
-void getIntervalHeightAndColor(in int intervalIndex, in DTMSettings dtmSettings, out float4 outIntervalColor, out float outIntervalHeight)
+void getIntervalHeightAndColor(in int intervalIndex, in DTMHeightShadingSettings settings, out float4 outIntervalColor, out float outIntervalHeight)
 {
-    float minShadingHeight = dtmSettings.heightColorMapHeights[0];
-    float heightForColor = minShadingHeight + float(intervalIndex) * dtmSettings.intervalIndexToHeightMultiplier;
+    float minShadingHeight = settings.heightColorMapHeights[0];
+    float heightForColor = minShadingHeight + float(intervalIndex) * settings.intervalIndexToHeightMultiplier;
     
-    if (dtmSettings.isCenteredShading)
-        outIntervalHeight = minShadingHeight + (float(intervalIndex) - 0.5) * dtmSettings.intervalLength;
+    if (settings.isCenteredShading)
+        outIntervalHeight = minShadingHeight + (float(intervalIndex) - 0.5) * settings.intervalLength;
     else
-        outIntervalHeight = minShadingHeight + (float(intervalIndex)) * dtmSettings.intervalLength;
+        outIntervalHeight = minShadingHeight + (float(intervalIndex)) * settings.intervalLength;
 
-    DTMSettingsHeightsAccessor dtmHeightsAccessor = { dtmSettings };
-    uint32_t upperBoundHeightIndex = min(nbl::hlsl::upper_bound(dtmHeightsAccessor, 0, dtmSettings.heightColorEntryCount, heightForColor), dtmSettings.heightColorEntryCount-1u);
+    DTMSettingsHeightsAccessor dtmHeightsAccessor = { settings };
+    uint32_t upperBoundHeightIndex = min(nbl::hlsl::upper_bound(dtmHeightsAccessor, 0, settings.heightColorEntryCount, heightForColor), settings.heightColorEntryCount-1u);
     uint32_t lowerBoundHeightIndex = max(upperBoundHeightIndex - 1, 0);
 
-    float upperBoundHeight = dtmSettings.heightColorMapHeights[upperBoundHeightIndex];
-    float lowerBoundHeight = dtmSettings.heightColorMapHeights[lowerBoundHeightIndex];
+    float upperBoundHeight = settings.heightColorMapHeights[upperBoundHeightIndex];
+    float lowerBoundHeight = settings.heightColorMapHeights[lowerBoundHeightIndex];
 
-    float4 upperBoundColor = dtmSettings.heightColorMapColors[upperBoundHeightIndex];
-    float4 lowerBoundColor = dtmSettings.heightColorMapColors[lowerBoundHeightIndex];
+    float4 upperBoundColor = settings.heightColorMapColors[upperBoundHeightIndex];
+    float4 lowerBoundColor = settings.heightColorMapColors[lowerBoundHeightIndex];
     
     if (upperBoundHeight == lowerBoundHeight)
     {
@@ -510,14 +510,14 @@ float3 calculateDTMTriangleBarycentrics(in float2 v1, in float2 v2, in float2 v3
     return float3(u, v, w);
 }
 
-float4 calculateDTMHeightColor(in DTMSettings dtmSettings, in float3 v[3], in float heightDeriv, in float2 fragPos, in float height)
+float4 calculateDTMHeightColor(in DTMHeightShadingSettings settings, in float3 v[3], in float heightDeriv, in float2 fragPos, in float height)
 {
     float4 outputColor = float4(0.0f, 0.0f, 0.0f, 0.0f);
 
     // HEIGHT SHADING
-    const uint32_t heightMapSize = dtmSettings.heightColorEntryCount;
-    float minShadingHeight = dtmSettings.heightColorMapHeights[0];
-    float maxShadingHeight = dtmSettings.heightColorMapHeights[heightMapSize - 1];
+    const uint32_t heightMapSize = settings.heightColorEntryCount;
+    float minShadingHeight = settings.heightColorMapHeights[0];
+    float maxShadingHeight = settings.heightColorMapHeights[heightMapSize - 1];
 
     if (heightMapSize > 0)
     {
@@ -549,11 +549,11 @@ float4 calculateDTMHeightColor(in DTMSettings dtmSettings, in float3 v[3], in fl
         outputColor.a = 1.0f - smoothstep(0.0f, globals.antiAliasingFactor * 2.0f, convexPolygonSdf);
 
         // calculate height color
-        E_HEIGHT_SHADING_MODE mode = dtmSettings.determineHeightShadingMode();
+        E_HEIGHT_SHADING_MODE mode = settings.determineHeightShadingMode();
 
         if (mode == E_HEIGHT_SHADING_MODE::DISCRETE_VARIABLE_LENGTH_INTERVALS)
         {
-            DTMSettingsHeightsAccessor dtmHeightsAccessor = { dtmSettings };
+            DTMSettingsHeightsAccessor dtmHeightsAccessor = { settings };
             int upperBoundIndex = nbl::hlsl::upper_bound(dtmHeightsAccessor, 0, heightMapSize, height);
             int mapIndex = max(upperBoundIndex - 1, 0);
             int mapIndexPrev = max(mapIndex - 1, 0);
@@ -563,13 +563,13 @@ float4 calculateDTMHeightColor(in DTMSettings dtmSettings, in float3 v[3], in fl
             // if color idx is >= length of the colours array then it means it's also > 0.0 and this blend with prev is true
             // if color idx is > 0 and < len - 1, then it depends on the current pixel's height value and two closest height values
             bool blendWithPrev = (mapIndex > 0)
-                && (mapIndex >= heightMapSize - 1 || (height * 2.0 < dtmSettings.heightColorMapHeights[upperBoundIndex] + dtmSettings.heightColorMapHeights[mapIndex]));
+                && (mapIndex >= heightMapSize - 1 || (height * 2.0 < settings.heightColorMapHeights[upperBoundIndex] + settings.heightColorMapHeights[mapIndex]));
 
             HeightSegmentTransitionData transitionInfo;
             transitionInfo.currentHeight = height;
-            transitionInfo.currentSegmentColor = dtmSettings.heightColorMapColors[mapIndex];
-            transitionInfo.boundaryHeight = blendWithPrev ? dtmSettings.heightColorMapHeights[mapIndex] : dtmSettings.heightColorMapHeights[mapIndexNext];
-            transitionInfo.otherSegmentColor = blendWithPrev ? dtmSettings.heightColorMapColors[mapIndexPrev] : dtmSettings.heightColorMapColors[mapIndexNext];
+            transitionInfo.currentSegmentColor = settings.heightColorMapColors[mapIndex];
+            transitionInfo.boundaryHeight = blendWithPrev ? settings.heightColorMapHeights[mapIndex] : settings.heightColorMapHeights[mapIndexNext];
+            transitionInfo.otherSegmentColor = blendWithPrev ? settings.heightColorMapColors[mapIndexPrev] : settings.heightColorMapColors[mapIndexNext];
 
             float4 localHeightColor = smoothHeightSegmentTransition(transitionInfo, heightDeriv);
             outputColor.rgb = localHeightColor.rgb;
@@ -577,13 +577,13 @@ float4 calculateDTMHeightColor(in DTMSettings dtmSettings, in float3 v[3], in fl
         }
         else if (mode == E_HEIGHT_SHADING_MODE::DISCRETE_FIXED_LENGTH_INTERVALS)
         {
-            float intervalPosition = getIntervalPosition(height, minShadingHeight, dtmSettings.intervalLength, dtmSettings.isCenteredShading);
+            float intervalPosition = getIntervalPosition(height, minShadingHeight, settings.intervalLength, settings.isCenteredShading);
             float positionWithinInterval = frac(intervalPosition);
             int intervalIndex = nbl::hlsl::_static_cast<int>(intervalPosition);
 
             float4 currentIntervalColor;
             float currentIntervalHeight;
-            getIntervalHeightAndColor(intervalIndex, dtmSettings, currentIntervalColor, currentIntervalHeight);
+            getIntervalHeightAndColor(intervalIndex, settings, currentIntervalColor, currentIntervalHeight);
             
             bool blendWithPrev = (positionWithinInterval < 0.5f);
             
@@ -594,13 +594,13 @@ float4 calculateDTMHeightColor(in DTMSettings dtmSettings, in float3 v[3], in fl
             {
                 int prevIntervalIdx = max(intervalIndex - 1, 0);
                 float prevIntervalHeight; // unused, the currentIntervalHeight is the boundary height between current and prev
-                getIntervalHeightAndColor(prevIntervalIdx, dtmSettings, transitionInfo.otherSegmentColor, prevIntervalHeight);
+                getIntervalHeightAndColor(prevIntervalIdx, settings, transitionInfo.otherSegmentColor, prevIntervalHeight);
                 transitionInfo.boundaryHeight = currentIntervalHeight;
             }
             else
             {
                 int nextIntervalIdx = intervalIndex + 1;
-                getIntervalHeightAndColor(nextIntervalIdx, dtmSettings, transitionInfo.otherSegmentColor, transitionInfo.boundaryHeight);
+                getIntervalHeightAndColor(nextIntervalIdx, settings, transitionInfo.otherSegmentColor, transitionInfo.boundaryHeight);
             }
             
             float4 localHeightColor = smoothHeightSegmentTransition(transitionInfo, heightDeriv);
@@ -609,15 +609,15 @@ float4 calculateDTMHeightColor(in DTMSettings dtmSettings, in float3 v[3], in fl
         }
         else if (mode == E_HEIGHT_SHADING_MODE::CONTINOUS_INTERVALS)
         {
-            DTMSettingsHeightsAccessor dtmHeightsAccessor = { dtmSettings };
+            DTMSettingsHeightsAccessor dtmHeightsAccessor = { settings };
             uint32_t upperBoundHeightIndex = nbl::hlsl::upper_bound(dtmHeightsAccessor, 0, heightMapSize, height);
             uint32_t lowerBoundHeightIndex = upperBoundHeightIndex == 0 ? upperBoundHeightIndex : upperBoundHeightIndex - 1;
 
-            float upperBoundHeight = dtmSettings.heightColorMapHeights[upperBoundHeightIndex];
-            float lowerBoundHeight = dtmSettings.heightColorMapHeights[lowerBoundHeightIndex];
+            float upperBoundHeight = settings.heightColorMapHeights[upperBoundHeightIndex];
+            float lowerBoundHeight = settings.heightColorMapHeights[lowerBoundHeightIndex];
 
-            float4 upperBoundColor = dtmSettings.heightColorMapColors[upperBoundHeightIndex];
-            float4 lowerBoundColor = dtmSettings.heightColorMapColors[lowerBoundHeightIndex];
+            float4 upperBoundColor = settings.heightColorMapColors[upperBoundHeightIndex];
+            float4 lowerBoundColor = settings.heightColorMapColors[lowerBoundHeightIndex];
 
             float interpolationVal;
             if (upperBoundHeightIndex == 0)
@@ -635,20 +635,19 @@ float4 calculateDTMHeightColor(in DTMSettings dtmSettings, in float3 v[3], in fl
     return outputColor; 
 }
 
-float4 calculateDTMContourColor(in DTMSettings dtmSettings, in float3 v[3], in uint2 edgePoints[3], in PSInput psInput, in float height)
+float4 calculateDTMContourColor(in DTMContourSettings contourSettings, in float3 v[3], in uint2 edgePoints[3], in float2 fragPos, in float height)
 {
     float4 outputColor;
 
-    LineStyle contourStyle = loadLineStyle(dtmSettings.contourLineStyleIdx);
-    const float contourThickness = psInput.getContourLineThickness();
+    LineStyle contourStyle = loadLineStyle(contourSettings.contourLineStyleIdx);
+    const float contourThickness = (contourStyle.screenSpaceLineWidth + contourStyle.worldSpaceLineWidth * globals.screenToWorldRatio) * 0.5f;
     float stretch = 1.0f;
     float phaseShift = 0.0f;
-    const float worldToScreenRatio = psInput.getCurrentWorldToScreenRatio();
 
     // TODO: move to ubo or push constants
-    const float startHeight = dtmSettings.contourLinesStartHeight;
-    const float endHeight = dtmSettings.contourLinesEndHeight;
-    const float interval = dtmSettings.contourLinesHeightInterval;
+    const float startHeight = contourSettings.contourLinesStartHeight;
+    const float endHeight = contourSettings.contourLinesEndHeight;
+    const float interval = contourSettings.contourLinesHeightInterval;
 
     // TODO: can be precomputed
     const int maxContourLineIdx = (endHeight - startHeight + 1) / interval;
@@ -694,7 +693,7 @@ float4 calculateDTMContourColor(in DTMSettings dtmSettings, in float3 v[3], in u
         float distance = nbl::hlsl::numeric_limits<float>::max;
         if (!contourStyle.hasStipples() || stretch == InvalidStyleStretchValue)
         {
-            distance = ClippedSignedDistance< nbl::hlsl::shapes::Line<float> >::sdf(lineSegment, psInput.position.xy, contourThickness, contourStyle.isRoadStyleFlag);
+            distance = ClippedSignedDistance< nbl::hlsl::shapes::Line<float> >::sdf(lineSegment, fragPos, contourThickness, contourStyle.isRoadStyleFlag);
         }
         else
         {
@@ -702,8 +701,8 @@ float4 calculateDTMContourColor(in DTMSettings dtmSettings, in float3 v[3], in u
             // It might be beneficial to calculate distance between pixel and contour line to early out some pixels and save yourself from stipple sdf computations!
             // where you only compute the complex sdf if abs((height - contourVal) / heightDeriv) <= aaFactor
             nbl::hlsl::shapes::Line<float>::ArcLengthCalculator arcLenCalc = nbl::hlsl::shapes::Line<float>::ArcLengthCalculator::construct(lineSegment);
-            LineStyleClipper clipper = LineStyleClipper::construct(contourStyle, lineSegment, arcLenCalc, phaseShift, stretch, worldToScreenRatio);
-            distance = ClippedSignedDistance<nbl::hlsl::shapes::Line<float>, LineStyleClipper>::sdf(lineSegment, psInput.position.xy, contourThickness, contourStyle.isRoadStyleFlag, clipper);
+            LineStyleClipper clipper = LineStyleClipper::construct(contourStyle, lineSegment, arcLenCalc, phaseShift, stretch, globals.worldToScreenRatio);
+            distance = ClippedSignedDistance<nbl::hlsl::shapes::Line<float>, LineStyleClipper>::sdf(lineSegment, fragPos, contourThickness, contourStyle.isRoadStyleFlag, clipper);
         }
         
         outputColor.a = smoothstep(+globals.antiAliasingFactor, -globals.antiAliasingFactor, distance) * contourStyle.color.a;
@@ -715,14 +714,13 @@ float4 calculateDTMContourColor(in DTMSettings dtmSettings, in float3 v[3], in u
     return float4(0.0f, 0.0f, 0.0f, 0.0f);
 }
 
-float4 calculateDTMOutlineColor(in DTMSettings dtmSettings, in float3 v[3], in uint2 edgePoints[3], in PSInput psInput, in float3 baryCoord, in float height)
+float4 calculateDTMOutlineColor(in uint outlineLineStyleIdx, in float3 v[3], in uint2 edgePoints[3], in float2 fragPos, in float3 baryCoord, in float height)
 {
     float4 outputColor;
 
-    LineStyle outlineStyle = loadLineStyle(dtmSettings.outlineLineStyleIdx);
-    const float outlineThickness = psInput.getOutlineThickness();
+    LineStyle outlineStyle = loadLineStyle(outlineLineStyleIdx);
+    const float outlineThickness = (outlineStyle.screenSpaceLineWidth + outlineStyle.worldSpaceLineWidth * globals.screenToWorldRatio) * 0.5f;
     const float phaseShift = 0.0f; // input.getCurrentPhaseShift();
-    const float worldToScreenRatio = psInput.getCurrentWorldToScreenRatio();
     const float stretch = 1.0f;
 
     // index of vertex opposing an edge, needed for calculation of triangle heights
@@ -768,7 +766,7 @@ float4 calculateDTMOutlineColor(in DTMSettings dtmSettings, in float3 v[3], in u
 
             float distance = nbl::hlsl::numeric_limits<float>::max;
             nbl::hlsl::shapes::Line<float> lineSegment = nbl::hlsl::shapes::Line<float>::construct(float2(p0.x, p0.y), float2(p1.x, p1.y));
-            distance = ClippedSignedDistance<nbl::hlsl::shapes::Line<float> >::sdf(lineSegment, psInput.position.xy, outlineThickness, outlineStyle.isRoadStyleFlag);
+            distance = ClippedSignedDistance<nbl::hlsl::shapes::Line<float> >::sdf(lineSegment, fragPos, outlineThickness, outlineStyle.isRoadStyleFlag);
 
             minDistance = min(minDistance, distance);
         }
@@ -793,8 +791,8 @@ float4 calculateDTMOutlineColor(in DTMSettings dtmSettings, in float3 v[3], in u
 
             float distance = nbl::hlsl::numeric_limits<float>::max;
             nbl::hlsl::shapes::Line<float>::ArcLengthCalculator arcLenCalc = nbl::hlsl::shapes::Line<float>::ArcLengthCalculator::construct(lineSegment);
-            LineStyleClipper clipper = LineStyleClipper::construct(outlineStyle, lineSegment, arcLenCalc, phaseShift, stretch, worldToScreenRatio);
-            distance = ClippedSignedDistance<nbl::hlsl::shapes::Line<float>, LineStyleClipper>::sdf(lineSegment, psInput.position.xy, outlineThickness, outlineStyle.isRoadStyleFlag, clipper);
+            LineStyleClipper clipper = LineStyleClipper::construct(outlineStyle, lineSegment, arcLenCalc, phaseShift, stretch, globals.worldToScreenRatio);
+            distance = ClippedSignedDistance<nbl::hlsl::shapes::Line<float>, LineStyleClipper>::sdf(lineSegment, fragPos, outlineThickness, outlineStyle.isRoadStyleFlag, clipper);
 
             minDistance = min(minDistance, distance);
         }
@@ -848,11 +846,11 @@ float4 fragMain(PSInput input) : SV_TARGET
 
         float4 dtmColor = float4(0.0f, 0.0f, 0.0f, 0.0f);
         if (dtmSettings.drawHeightShadingEnabled())
-            dtmColor = blendColorOnTop(dtmColor, calculateDTMHeightColor(dtmSettings, v, heightDeriv, input.position.xy, height));
+            dtmColor = blendColorOnTop(dtmColor, calculateDTMHeightColor(dtmSettings.heightShadingSettings, v, heightDeriv, input.position.xy, height));
         if (dtmSettings.drawContourEnabled())
-            dtmColor = blendColorOnTop(dtmColor, calculateDTMContourColor(dtmSettings, v, edgePoints, input, height));
+            dtmColor = blendColorOnTop(dtmColor, calculateDTMContourColor(dtmSettings.contourSettings[0u], v, edgePoints, input.position.xy, height));
         if (dtmSettings.drawOutlineEnabled())
-            dtmColor = blendColorOnTop(dtmColor, calculateDTMOutlineColor(dtmSettings, v, edgePoints, input, baryCoord, height));
+            dtmColor = blendColorOnTop(dtmColor, calculateDTMOutlineColor(dtmSettings.outlineLineStyleIdx, v, edgePoints, input.position.xy, baryCoord, height));
 
         textureColor = dtmColor.rgb;
         localAlpha = dtmColor.a;
@@ -874,7 +872,6 @@ float4 fragMain(PSInput input) : SV_TARGET
                 const float thickness = input.getLineThickness();
                 const float phaseShift = input.getCurrentPhaseShift();
                 const float stretch = input.getPatternStretch();
-                const float worldToScreenRatio = input.getCurrentWorldToScreenRatio();
 
                 nbl::hlsl::shapes::Line<float> lineSegment = nbl::hlsl::shapes::Line<float>::construct(start, end);
 
@@ -887,7 +884,7 @@ float4 fragMain(PSInput input) : SV_TARGET
                 else
                 {
                     nbl::hlsl::shapes::Line<float>::ArcLengthCalculator arcLenCalc = nbl::hlsl::shapes::Line<float>::ArcLengthCalculator::construct(lineSegment);
-                    LineStyleClipper clipper = LineStyleClipper::construct(loadLineStyle(styleIdx), lineSegment, arcLenCalc, phaseShift, stretch, worldToScreenRatio);
+                    LineStyleClipper clipper = LineStyleClipper::construct(loadLineStyle(styleIdx), lineSegment, arcLenCalc, phaseShift, stretch, globals.worldToScreenRatio);
                     distance = ClippedSignedDistance<nbl::hlsl::shapes::Line<float>, LineStyleClipper>::sdf(lineSegment, input.position.xy, thickness, style.isRoadStyleFlag, clipper);
                 }
             }
@@ -900,7 +897,6 @@ float4 fragMain(PSInput input) : SV_TARGET
                 const float thickness = input.getLineThickness();
                 const float phaseShift = input.getCurrentPhaseShift();
                 const float stretch = input.getPatternStretch();
-                const float worldToScreenRatio = input.getCurrentWorldToScreenRatio();
 
                 LineStyle style = loadLineStyle(styleIdx);
                 if (!style.hasStipples() || stretch == InvalidStyleStretchValue)
@@ -909,7 +905,7 @@ float4 fragMain(PSInput input) : SV_TARGET
                 }
                 else
                 {
-                    BezierStyleClipper clipper = BezierStyleClipper::construct(loadLineStyle(styleIdx), quadratic, arcLenCalc, phaseShift, stretch, worldToScreenRatio);
+                    BezierStyleClipper clipper = BezierStyleClipper::construct(loadLineStyle(styleIdx), quadratic, arcLenCalc, phaseShift, stretch, globals.worldToScreenRatio );
                     distance = ClippedSignedDistance<nbl::hlsl::shapes::Quadratic<float>, BezierStyleClipper>::sdf(quadratic, input.position.xy, thickness, style.isRoadStyleFlag, clipper);
                 }
             }
diff --git a/62_CAD/shaders/main_pipeline/vertex_shader.hlsl b/62_CAD/shaders/main_pipeline/vertex_shader.hlsl
index 479d05888..7ce0f43e7 100644
--- a/62_CAD/shaders/main_pipeline/vertex_shader.hlsl
+++ b/62_CAD/shaders/main_pipeline/vertex_shader.hlsl
@@ -128,6 +128,8 @@ PSInput main(uint vertexID : SV_VertexID)
             triangleVertices[2].pos = triangleVertices[2].pos - triangleCentroid;
 
             // TODO: calculate dialation factor
+            // const float dilateByPixels = 0.5 * (dtmSettings.maxScreenSpaceLineWidth + dtmSettings.maxWorldSpaceLineWidth * globals.screenToWorldRatio) + aaFactor;
+        
             pfloat64_t dialationFactor = _static_cast<pfloat64_t>(2.0f);
             pfloat64_t2 dialatedVertex = triangleVertices[currentVertexWithinTriangleIndex].pos * dialationFactor;
 
@@ -138,30 +140,7 @@ PSInput main(uint vertexID : SV_VertexID)
 
         outV.position = transformFromSreenSpaceToNdc(transformedDilatedPos, globals.resolution);
         const float heightAsFloat = nbl::hlsl::_static_cast<float>(vtx.height);
-        outV.setHeight(heightAsFloat);
         outV.setScreenSpaceVertexAttribs(float3(transformedOriginalPos, heightAsFloat));
-        outV.setCurrentWorldToScreenRatio(
-            _static_cast<float>((_static_cast<pfloat64_t>(2.0f) /
-                (clipProjectionData.projectionToNDC[0].x * _static_cast<pfloat64_t>(globals.resolution.x))))
-        );
-
-        DTMSettings dtm = loadDTMSettings(mainObj.dtmSettingsIdx);
-
-        // TODO: maybe move to fragment shader since we may have multiple contour styles later
-        if (dtm.drawOutlineEnabled())
-        {
-            LineStyle outlineStyle = loadLineStyle(dtm.outlineLineStyleIdx);
-            const float screenSpaceOutlineWidth = outlineStyle.screenSpaceLineWidth + outlineStyle.worldSpaceLineWidth * globals.screenToWorldRatio;
-            const float sdfOutlineThickness = screenSpaceOutlineWidth * 0.5f;
-            outV.setOutlineThickness(sdfOutlineThickness);
-        }
-        if (dtm.drawContourEnabled())
-        {
-            LineStyle contourStyle = loadLineStyle(dtm.contourLineStyleIdx);
-            const float screenSpaceContourLineWidth = contourStyle.screenSpaceLineWidth + contourStyle.worldSpaceLineWidth * globals.screenToWorldRatio;
-            const float sdfContourLineThickness = screenSpaceContourLineWidth * 0.5f;
-            outV.setContourLineThickness(sdfContourLineThickness);
-        }
 
         // full screen triangle (this will destroy outline, contour line and height drawing)
 #if 0
@@ -200,10 +179,6 @@ PSInput main(uint vertexID : SV_VertexID)
             const float antiAliasedLineThickness = screenSpaceLineWidth * 0.5f + globals.antiAliasingFactor;
             const float sdfLineThickness = screenSpaceLineWidth / 2.0f;
             outV.setLineThickness(sdfLineThickness);
-            outV.setCurrentWorldToScreenRatio(
-                _static_cast<float>((_static_cast<pfloat64_t>(2.0f) /
-                (clipProjectionData.projectionToNDC[0].x * _static_cast<pfloat64_t>(globals.resolution.x))))
-            );
 
             if (objType == ObjectType::LINE)
             {

From 21f424fbaf2234a6892a684ebc18333ef88b36be Mon Sep 17 00:00:00 2001
From: kevyuu <kevin.kayu@gmail.com>
Date: Tue, 22 Apr 2025 08:13:40 +0700
Subject: [PATCH 181/529] Fix merge conflict

---
 .../app_resources/render.comp.hlsl            | 149 ++++++++++++++++--
 67_RayQueryGeometry/main.cpp                  |   9 +-
 2 files changed, 140 insertions(+), 18 deletions(-)

diff --git a/67_RayQueryGeometry/app_resources/render.comp.hlsl b/67_RayQueryGeometry/app_resources/render.comp.hlsl
index b9323ac74..0279978ad 100644
--- a/67_RayQueryGeometry/app_resources/render.comp.hlsl
+++ b/67_RayQueryGeometry/app_resources/render.comp.hlsl
@@ -6,6 +6,7 @@
 #include "nbl/builtin/hlsl/spirv_intrinsics/raytracing.hlsl"
 #include "nbl/builtin/hlsl/bda/__ptr.hlsl"
 
+
 using namespace nbl::hlsl;
 
 [[vk::push_constant]] SPushConstants pc;
@@ -13,6 +14,17 @@ using namespace nbl::hlsl;
 [[vk::binding(0, 0)]] RaytracingAccelerationStructure topLevelAS;
 
 [[vk::binding(1, 0)]] RWTexture2D<float4> outImage;
+[[vk::constant_id(0)]] const float shader_variant = 1.0;
+
+struct SGeomInfo2
+{
+    uint64_t vertexBufferAddress;
+    uint64_t indexBufferAddress;
+
+    uint32_t vertexStride : 29;
+    uint32_t indexType : 2; // 16 bit, 32 bit or none
+    uint32_t smoothNormals : 1;	// flat for cube, rectangle, disk
+};
 
 float3 unpackNormals3x10(uint32_t v)
 {
@@ -23,6 +35,77 @@ float3 unpackNormals3x10(uint32_t v)
     return clamp(float3(pn) / 511.0, -1.0, 1.0);
 }
 
+float3 calculateSmoothNormals2(int instID, int primID, SGeomInfo2 geom, float2 bary)
+{
+    const uint indexType = geom.indexType;
+    const uint vertexStride = geom.vertexStride;
+
+    const uint64_t vertexBufferAddress = geom.vertexBufferAddress;
+    const uint64_t indexBufferAddress = geom.indexBufferAddress;
+
+    uint32_t3 indices;
+    switch (indexType)
+    {
+        case 0: // EIT_16BIT
+            indices = uint32_t3((nbl::hlsl::bda::__ptr<uint16_t3>::create(indexBufferAddress)+primID).deref().load());
+            break;
+        case 1: // EIT_32BIT
+            indices = uint32_t3((nbl::hlsl::bda::__ptr<uint32_t3>::create(indexBufferAddress)+primID).deref().load());
+            break;
+        default:    // EIT_NONE
+        {
+            indices[0] = primID * 3;
+            indices[1] = indices[0] + 1;
+            indices[2] = indices[0] + 2;
+        }
+    }
+
+    float3 n0, n1, n2;
+    switch (instID)
+    {
+        case OT_CUBE:
+        {
+            // TODO: document why the alignment is 2 here and nowhere else? isnt the `vertexStride` aligned to more than 2 anyway?
+            uint32_t v0 = vk::RawBufferLoad<uint32_t>(vertexBufferAddress + indices[0] * vertexStride, 2u);
+            uint32_t v1 = vk::RawBufferLoad<uint32_t>(vertexBufferAddress + indices[1] * vertexStride, 2u);
+            uint32_t v2 = vk::RawBufferLoad<uint32_t>(vertexBufferAddress + indices[2] * vertexStride, 2u);
+
+            n0 = normalize(nbl::hlsl::spirv::unpackSnorm4x8(v0).xyz);
+            n1 = normalize(nbl::hlsl::spirv::unpackSnorm4x8(v1).xyz);
+            n2 = normalize(nbl::hlsl::spirv::unpackSnorm4x8(v2).xyz);
+        }
+        break;
+        case OT_SPHERE:
+        case OT_CYLINDER:
+        case OT_ARROW:
+        case OT_CONE:
+        {
+            uint32_t v0 = vk::RawBufferLoad<uint32_t>(vertexBufferAddress + indices[0] * vertexStride);
+            uint32_t v1 = vk::RawBufferLoad<uint32_t>(vertexBufferAddress + indices[1] * vertexStride);
+            uint32_t v2 = vk::RawBufferLoad<uint32_t>(vertexBufferAddress + indices[2] * vertexStride);
+
+            n0 = normalize(unpackNormals3x10(v0));
+            n1 = normalize(unpackNormals3x10(v1));
+            n2 = normalize(unpackNormals3x10(v2));
+        }
+        break;
+        case OT_RECTANGLE:
+        case OT_DISK:
+        case OT_ICOSPHERE:
+        default:
+        {
+            n0 = normalize(vk::RawBufferLoad<float3>(vertexBufferAddress + indices[0] * vertexStride));
+            n1 = normalize(vk::RawBufferLoad<float3>(vertexBufferAddress + indices[1] * vertexStride));
+            n2 = normalize(vk::RawBufferLoad<float3>(vertexBufferAddress + indices[2] * vertexStride));
+        }
+    }
+
+    float3 barycentrics = float3(0.0, bary);
+    barycentrics.x = 1.0 - barycentrics.y - barycentrics.z;        
+
+    return barycentrics.x * n0 + barycentrics.y * n1 + barycentrics.z * n2;
+}
+
 float3 calculateSmoothNormals(int instID, int primID, SGeomInfo geom, float2 bary)
 {
     const uint indexType = geom.indexType;
@@ -129,27 +212,65 @@ void main(uint32_t3 threadID : SV_DispatchThreadID)
         const SGeomInfo geom = vk::RawBufferLoad<SGeomInfo>(pc.geometryInfoBuffer + instID * sizeof(SGeomInfo));
         
         float3 normals;
-        if (jit::device_capabilities::rayTracingPositionFetch)
+        float2 barycentrics = spirv::rayQueryGetIntersectionBarycentricsKHR(query, true);
+        normals = calculateSmoothNormals(instID, primID, geom, barycentrics);
+
+        normals = normalize(normals) * 0.5 + 0.5;
+        color = float4(normals, 1.0);
+    }
+
+    outImage[threadID.xy] = color;
+}
+    
+[numthreads(WorkgroupSize, WorkgroupSize, 1)]
+[shader("compute")]
+void main2(uint32_t3 threadID : SV_DispatchThreadID)
+{
+    uint2 coords = threadID.xy;
+    coords.y = nbl::hlsl::glsl::gl_NumWorkGroups().y * WorkgroupSize - coords.y;    // need to invert it
+    
+
+    float4 NDC;
+    NDC.xy = float2(coords) * pc.scaleNDC;
+    NDC.xy += pc.offsetNDC;
+    NDC.zw = float2(0, 1.0);
+    float3 targetPos;
+    {
+        float4 tmp = mul(pc.invMVP, NDC);
+        targetPos = tmp.xyz / tmp.w;
+    }
+
+    float3 direction = normalize(targetPos - pc.camPos);
+
+    spirv::RayQueryKHR query;
+    spirv::rayQueryInitializeKHR(query, topLevelAS, spv::RayFlagsOpaqueKHRMask, 0xFF, pc.camPos, 0.01, direction, 1000.0);
+
+    while (spirv::rayQueryProceedKHR(query)) {}
+
+    float4 color = float4(0, 0, 0, 1);
+
+    if (spirv::rayQueryGetIntersectionTypeKHR(query, true) == spv::RayQueryCommittedIntersectionTypeRayQueryCommittedIntersectionTriangleKHR)
+    {
+        const int instID = spirv::rayQueryGetIntersectionInstanceIdKHR(query, true);
+        const int primID = spirv::rayQueryGetIntersectionPrimitiveIndexKHR(query, true);
+
+        // TODO: candidate for `bda::__ptr<SGeomInfo>`
+        const SGeomInfo2 geom = vk::RawBufferLoad<SGeomInfo2>(pc.geometryInfoBuffer + instID * sizeof(SGeomInfo2));
+        
+        float3 normals;
+        if (geom.smoothNormals)
         {
-            if (geom.smoothNormals)
-            {
-                float2 barycentrics = spirv::rayQueryGetIntersectionBarycentricsKHR(query, true);
-                normals = calculateSmoothNormals(instID, primID, geom, barycentrics);
-            }
-            else
-            {
-                float3 pos[3] = spirv::rayQueryGetIntersectionTriangleVertexPositionsKHR(query, true);
-                normals = cross(pos[1] - pos[0], pos[2] - pos[0]);
-            }
+            float2 barycentrics = spirv::rayQueryGetIntersectionBarycentricsKHR(query, true);
+            normals = calculateSmoothNormals2(instID, primID, geom, barycentrics);
         }
         else
         {
-            float2 barycentrics = spirv::rayQueryGetIntersectionBarycentricsKHR(query, true);
-            normals = calculateSmoothNormals(instID, primID, geom, barycentrics);
+            float3 pos[3] = spirv::rayQueryGetIntersectionTriangleVertexPositionsKHR(query, true);
+            normals = cross(pos[1] - pos[0], pos[2] - pos[0]);
         }
 
         normals = normalize(normals) * 0.5 + 0.5;
-        color = float4(normals, 1.0);
+        color = float4(normals, shader_variant);
     }
 
     outImage[threadID.xy] = color;
diff --git a/67_RayQueryGeometry/main.cpp b/67_RayQueryGeometry/main.cpp
index c4c483263..4c09da5da 100644
--- a/67_RayQueryGeometry/main.cpp
+++ b/67_RayQueryGeometry/main.cpp
@@ -164,8 +164,8 @@ class RayQueryGeometryApp final : public examples::SimpleWindowedApplication, pu
 
 				const auto assets = bundle.getContents();
 				assert(assets.size() == 1);
-				const auto sourceRaw = smart_refctd_ptr_static_cast<IShader>(assets[0]);
-				smart_refctd_ptr<IShader> shader = m_device->compileShader({sourceRaw.get(), nullptr, nullptr, nullptr});
+				smart_refctd_ptr<IShader> shaderSrc = IAsset::castDown<IShader>(assets[0]);
+				auto shader = m_device->compileShader({ shaderSrc.get() });
 				if (!shader)
 					return logFail("Failed to create shader!");
 
@@ -783,7 +783,8 @@ class RayQueryGeometryApp final : public examples::SimpleWindowedApplication, pu
 			{
 				IGPUBottomLevelAccelerationStructure::DeviceBuildInfo blasBuildInfos[OT_COUNT];
 				uint32_t primitiveCounts[OT_COUNT];
-				IGPUBottomLevelAccelerationStructure::Triangles<const IGPUBuffer> triangles[OT_COUNT];
+				using Geometry = IGPUBottomLevelAccelerationStructure::Triangles<const IGPUBuffer>;
+				Geometry triangles[OT_COUNT];
 				uint32_t scratchSizes[OT_COUNT];
 
 				for (uint32_t i = 0; i < objectsGpu.size(); i++)
@@ -819,7 +820,7 @@ class RayQueryGeometryApp final : public examples::SimpleWindowedApplication, pu
 					ILogicalDevice::AccelerationStructureBuildSizes buildSizes;
 					{
 						const uint32_t maxPrimCount[1] = { primitiveCounts[i] };
-						buildSizes = m_device->getAccelerationStructureBuildSizes(blasFlags, false, std::span{&triangles[i], 1}, maxPrimCount);
+						buildSizes = m_device->getAccelerationStructureBuildSizes(blasFlags, false, std::span<const Geometry>{&triangles[i], 1}, maxPrimCount);
 						if (!buildSizes)
 							return logFail("Failed to get BLAS build sizes");
 					}

From 2878d038539324760a2a9f450744b0e7086ca0e8 Mon Sep 17 00:00:00 2001
From: kevyuu <kevin.kayu@gmail.com>
Date: Tue, 22 Apr 2025 08:13:58 +0700
Subject: [PATCH 182/529] Fix Hello Compute to use IShader

---
 02_HelloCompute/main.cpp | 7 ++++---
 1 file changed, 4 insertions(+), 3 deletions(-)

diff --git a/02_HelloCompute/main.cpp b/02_HelloCompute/main.cpp
index 124cd7dc5..63a9f8832 100644
--- a/02_HelloCompute/main.cpp
+++ b/02_HelloCompute/main.cpp
@@ -94,9 +94,9 @@ class HelloComputeApp final : public nbl::application_templates::MonoSystemMonoL
 				// The convention is that an `ICPU` object represents a potentially Mutable (and in the past, Serializable) recipe for creating an `IGPU` object, and later examples will show automated systems for doing that.
 				// The Assets always form a Directed Acyclic Graph and our type system enforces that property at compile time (i.e. an `IBuffer` cannot reference an `IImageView` even indirectly).
 				// Another reason for the 1:1 pairing of types is that one can use a CPU-to-GPU associative cache (asset manager has a default one) and use the pointers to the CPU objects as UUIDs.
-				// The ICPUShader is just a mutable container for source code (can be high level like HLSL needing compilation to SPIR-V or SPIR-V itself) held in an `nbl::asset::ICPUBuffer`.
+				// The IShader is just a mutable container for source code (can be high level like HLSL needing compilation to SPIR-V or SPIR-V itself) held in an `nbl::asset::ICPUBuffer`.
 				// They can be created: from buffers of code, by compilation from some other source code, or loaded from files (next example will do that).
-				smart_refctd_ptr<nbl::asset::ICPUShader> cpuShader;
+				smart_refctd_ptr<nbl::asset::IShader> cpuShader;
 				{
 					// Normally we'd use the ISystem and the IAssetManager to load shaders flexibly from (virtual) files for ease of development (syntax highlighting and Intellisense),
 					// but I want to show the full process of assembling a shader from raw source code at least once.
@@ -138,7 +138,7 @@ class HelloComputeApp final : public nbl::application_templates::MonoSystemMonoL
 				}
 
 				// Note how each ILogicalDevice method takes a smart-pointer r-value, so that the GPU objects refcount their dependencies
-				smart_refctd_ptr<nbl::video::IGPUShader> shader = device->createShader(cpuShader.get());
+				smart_refctd_ptr<IShader> shader = device->compileShader({.source = cpuShader.get()});
 				if (!shader)
 					return logFail("Failed to create a GPU Shader, seems the Driver doesn't like the SPIR-V we're feeding it!\n");
 
@@ -169,6 +169,7 @@ class HelloComputeApp final : public nbl::application_templates::MonoSystemMonoL
 					// Theoretically a blob of SPIR-V can contain multiple named entry points and one has to be chosen, in practice most compilers only support outputting one (and glslang used to require it be called "main")
 					params.shader.entryPoint = "main";
 					params.shader.shader = shader.get();
+					params.shader.stage = hlsl::ESS_COMPUTE;
 					// we'll cover the specialization constant API in another example
 					if (!device->createComputePipelines(nullptr,{&params,1},&pipeline))
 						return logFail("Failed to create pipelines (compile & link shaders)!\n");

From 2ea63044df93b213ba428c80c3948e80cec57c95 Mon Sep 17 00:00:00 2001
From: kevyuu <kevin.kayu@gmail.com>
Date: Tue, 22 Apr 2025 08:14:13 +0700
Subject: [PATCH 183/529] Fix example 03 to use IShader

---
 03_DeviceSelectionAndSharedSources/Testers.h | 22 ++++++++++++--------
 03_DeviceSelectionAndSharedSources/main.cpp  | 18 ++++++++++------
 2 files changed, 25 insertions(+), 15 deletions(-)

diff --git a/03_DeviceSelectionAndSharedSources/Testers.h b/03_DeviceSelectionAndSharedSources/Testers.h
index a76d4b668..b21da71c4 100644
--- a/03_DeviceSelectionAndSharedSources/Testers.h
+++ b/03_DeviceSelectionAndSharedSources/Testers.h
@@ -24,7 +24,7 @@ class IntrospectionTesterBase
 	const std::string m_functionToTestName = "";
 
 protected:
-	static std::pair<smart_refctd_ptr<ICPUShader>, smart_refctd_ptr<const CSPIRVIntrospector::CStageIntrospectionData>> compileHLSLShaderAndTestIntrospection(
+	static std::pair<smart_refctd_ptr<IShader>, smart_refctd_ptr<const CSPIRVIntrospector::CStageIntrospectionData>> compileHLSLShaderAndTestIntrospection(
 		video::IPhysicalDevice* physicalDevice, video::ILogicalDevice* device, system::ILogger* logger, asset::IAssetManager* assetMgr, const std::string& shaderPath, CSPIRVIntrospector& introspector)
 	{
 		IAssetLoader::SAssetLoadParams lp = {};
@@ -33,15 +33,18 @@ class IntrospectionTesterBase
 		// this time we load a shader directly from a file
 		auto assetBundle = assetMgr->getAsset(shaderPath, lp);
 		const auto assets = assetBundle.getContents();
-		if (assets.empty())
+		const auto* metadata = assetBundle.getMetadata();
+		if (assets.empty() || assetBundle.getAssetType() != IAsset::ET_SHADER)
 		{
 			logFail(logger, "Could not load shader!");
 			assert(0);
 		}
+		const auto hlslMetadata = static_cast<const CHLSLMetadata*>(metadata);
+		const auto shaderStage = hlslMetadata->shaderStages->front();
 
 		// It would be super weird if loading a shader from a file produced more than 1 asset
 		assert(assets.size() == 1);
-		smart_refctd_ptr<ICPUShader> source = IAsset::castDown<ICPUShader>(assets[0]);
+		smart_refctd_ptr<IShader> source = IAsset::castDown<IShader>(assets[0]);
 
 		smart_refctd_ptr<const CSPIRVIntrospector::CStageIntrospectionData> introspection;
 		{
@@ -53,7 +56,7 @@ class IntrospectionTesterBase
 			// The Shader Asset Loaders deduce the stage from the file extension,
 			// if the extension is generic (.glsl or .hlsl) the stage is unknown.
 			// But it can still be overriden from within the source with a `#pragma shader_stage` 
-			options.stage = source->getStage() == IShader::E_SHADER_STAGE::ESS_COMPUTE ? source->getStage() : IShader::E_SHADER_STAGE::ESS_VERTEX; // TODO: do smth with it
+			options.stage = shaderStage == IShader::E_SHADER_STAGE::ESS_COMPUTE ? shaderStage : IShader::E_SHADER_STAGE::ESS_VERTEX; // TODO: do smth with it
 			options.targetSpirvVersion = device->getPhysicalDevice()->getLimits().spirvVersion;
 			// we need to perform an unoptimized compilation with source debug info or we'll lose names of variable sin the introspection
 			options.spirvOptimizer = nullptr;
@@ -186,7 +189,7 @@ class PredefinedLayoutTester final : public IntrospectionTesterBase
 		constexpr uint32_t MERGE_TEST_SHADERS_CNT = mergeTestShadersPaths.size();
 
 		CSPIRVIntrospector introspector[MERGE_TEST_SHADERS_CNT];
-		smart_refctd_ptr<ICPUShader> sources[MERGE_TEST_SHADERS_CNT];
+		smart_refctd_ptr<IShader> sources[MERGE_TEST_SHADERS_CNT];
 
 		for (uint32_t i = 0u; i < MERGE_TEST_SHADERS_CNT; ++i)
 		{
@@ -201,7 +204,7 @@ class PredefinedLayoutTester final : public IntrospectionTesterBase
 				.binding = 0,
 				.type = nbl::asset::IDescriptor::E_TYPE::ET_STORAGE_BUFFER,
 				.createFlags = ICPUDescriptorSetLayout::SBinding::E_CREATE_FLAGS::ECF_NONE,
-				.stageFlags = ICPUShader::E_SHADER_STAGE::ESS_COMPUTE,
+				.stageFlags = IShader::E_SHADER_STAGE::ESS_COMPUTE,
 				.count = 1,
 				.immutableSamplers = nullptr
 			}
@@ -213,7 +216,7 @@ class PredefinedLayoutTester final : public IntrospectionTesterBase
 					.binding = 0,
 					.type = nbl::asset::IDescriptor::E_TYPE::ET_STORAGE_BUFFER,
 					.createFlags = ICPUDescriptorSetLayout::SBinding::E_CREATE_FLAGS::ECF_NONE,
-					.stageFlags = ICPUShader::E_SHADER_STAGE::ESS_COMPUTE,
+					.stageFlags = IShader::E_SHADER_STAGE::ESS_COMPUTE,
 					.count = 1,
 					.immutableSamplers = nullptr
 				},
@@ -221,7 +224,7 @@ class PredefinedLayoutTester final : public IntrospectionTesterBase
 					.binding = 1,
 					.type = nbl::asset::IDescriptor::E_TYPE::ET_STORAGE_BUFFER,
 					.createFlags = ICPUDescriptorSetLayout::SBinding::E_CREATE_FLAGS::ECF_NONE,
-					.stageFlags = ICPUShader::E_SHADER_STAGE::ESS_COMPUTE,
+					.stageFlags = IShader::E_SHADER_STAGE::ESS_COMPUTE,
 					.count = 2,
 					.immutableSamplers = nullptr
 				}
@@ -251,9 +254,10 @@ class PredefinedLayoutTester final : public IntrospectionTesterBase
 		bool pplnCreationSuccess[MERGE_TEST_SHADERS_CNT];
 		for (uint32_t i = 0u; i < MERGE_TEST_SHADERS_CNT; ++i)
 		{
-			ICPUShader::SSpecInfo specInfo;
+			IPipelineBase::SShaderSpecInfo specInfo;
 			specInfo.entryPoint = "main";
 			specInfo.shader = sources[i].get();
+			specInfo.stage = hlsl::ShaderStage::ESS_COMPUTE;
 			pplnCreationSuccess[i] = static_cast<bool>(introspector[i].createApproximateComputePipelineFromIntrospection(specInfo, core::smart_refctd_ptr<ICPUPipelineLayout>(predefinedPplnLayout)));
 		}
 
diff --git a/03_DeviceSelectionAndSharedSources/main.cpp b/03_DeviceSelectionAndSharedSources/main.cpp
index be56791a1..3712b5719 100644
--- a/03_DeviceSelectionAndSharedSources/main.cpp
+++ b/03_DeviceSelectionAndSharedSources/main.cpp
@@ -4,6 +4,7 @@
 
 #include "nbl/application_templates/MonoDeviceApplication.hpp"
 #include "nbl/application_templates/MonoAssetManagerAndBuiltinResourceApplication.hpp"
+#include "nbl/asset/metadata/CHLSLMetadata.h"
 #include "CommonPCH/PCH.hpp"
 
 using namespace nbl;
@@ -60,9 +61,10 @@ class DeviceSelectionAndSharedSourcesApp final : public application_templates::M
 		//shaderIntrospection->debugPrint(m_logger.get());
 
 		// We've now skipped the manual creation of a descriptor set layout, pipeline layout
-		ICPUShader::SSpecInfo specInfo;
+		IPipelineBase::SShaderSpecInfo specInfo;
 		specInfo.entryPoint = "main";
 		specInfo.shader = source.get();
+		specInfo.stage = hlsl::ShaderStage::ESS_COMPUTE;
 
 		smart_refctd_ptr<nbl::asset::ICPUComputePipeline> cpuPipeline = introspector.createApproximateComputePipelineFromIntrospection(specInfo);
 
@@ -236,7 +238,7 @@ class DeviceSelectionAndSharedSourcesApp final : public application_templates::M
 	// Whether to keep invoking the above. In this example because its headless GPU compute, we do all the work in the app initialization.
 	bool keepRunning() override { return false; }
 
-	std::pair<smart_refctd_ptr<ICPUShader>, smart_refctd_ptr<const CSPIRVIntrospector::CStageIntrospectionData>> compileShaderAndTestIntrospection(
+	std::pair<smart_refctd_ptr<IShader>, smart_refctd_ptr<const CSPIRVIntrospector::CStageIntrospectionData>> compileShaderAndTestIntrospection(
 		const std::string& shaderPath, CSPIRVIntrospector& introspector)
 	{
 		IAssetLoader::SAssetLoadParams lp = {};
@@ -245,15 +247,19 @@ class DeviceSelectionAndSharedSourcesApp final : public application_templates::M
 		// this time we load a shader directly from a file
 		auto assetBundle = m_assetMgr->getAsset(shaderPath, lp);
 		const auto assets = assetBundle.getContents();
-		if (assets.empty())
+		if (assets.empty() || assetBundle.getAssetType() != IAsset::ET_SHADER)
 		{
 			logFail("Could not load shader!");
 			assert(0);
 		}
 
+		const auto* metadata = assetBundle.getMetadata();
+    const auto hlslMetadata = static_cast<const CHLSLMetadata*>(metadata);
+		const auto shaderStage = hlslMetadata->shaderStages->front();
+
 		// It would be super weird if loading a shader from a file produced more than 1 asset
 		assert(assets.size() == 1);
-		smart_refctd_ptr<ICPUShader> source = IAsset::castDown<ICPUShader>(assets[0]);
+		smart_refctd_ptr<IShader> source = IAsset::castDown<IShader>(assets[0]);
 		
 		smart_refctd_ptr<const CSPIRVIntrospector::CStageIntrospectionData> introspection;
 		{
@@ -265,7 +271,7 @@ class DeviceSelectionAndSharedSourcesApp final : public application_templates::M
 			// The Shader Asset Loaders deduce the stage from the file extension,
 			// if the extension is generic (.glsl or .hlsl) the stage is unknown.
 			// But it can still be overriden from within the source with a `#pragma shader_stage` 
-			options.stage = source->getStage() == IShader::E_SHADER_STAGE::ESS_COMPUTE ? source->getStage() : IShader::E_SHADER_STAGE::ESS_VERTEX; // TODO: do smth with it
+			options.stage = shaderStage == IShader::E_SHADER_STAGE::ESS_COMPUTE ? shaderStage : IShader::E_SHADER_STAGE::ESS_VERTEX; // TODO: do smth with it
 			options.targetSpirvVersion = m_device->getPhysicalDevice()->getLimits().spirvVersion;
 			// we need to perform an unoptimized compilation with source debug info or we'll lose names of variable sin the introspection
 			options.spirvOptimizer = nullptr;
@@ -277,7 +283,7 @@ class DeviceSelectionAndSharedSourcesApp final : public application_templates::M
 			options.preprocessorOptions.includeFinder = compilerSet->getShaderCompiler(source->getContentType())->getDefaultIncludeFinder();
 
 			auto spirvUnspecialized = compilerSet->compileToSPIRV(source.get(), options);
-			const CSPIRVIntrospector::CStageIntrospectionData::SParams inspctParams = { .entryPoint = "main", .shader = spirvUnspecialized };
+			const CSPIRVIntrospector::CStageIntrospectionData::SParams inspctParams = { .entryPoint = "main", .shader = spirvUnspecialized, .stage = shaderStage };
 
 			introspection = introspector.introspect(inspctParams);
 			introspection->debugPrint(m_logger.get());

From 78990f8cf9d4d2b8c7f66b1adf3a29c4a05823a5 Mon Sep 17 00:00:00 2001
From: kevyuu <kevin.kayu@gmail.com>
Date: Tue, 22 Apr 2025 08:14:27 +0700
Subject: [PATCH 184/529] Fix example 05 to use IShader

---
 05_StreamingAndBufferDeviceAddressApp/main.cpp | 13 +++++--------
 1 file changed, 5 insertions(+), 8 deletions(-)

diff --git a/05_StreamingAndBufferDeviceAddressApp/main.cpp b/05_StreamingAndBufferDeviceAddressApp/main.cpp
index e8f7dbd33..96ccce9f5 100644
--- a/05_StreamingAndBufferDeviceAddressApp/main.cpp
+++ b/05_StreamingAndBufferDeviceAddressApp/main.cpp
@@ -91,7 +91,7 @@ class StreamingAndBufferDeviceAddressApp final : public application_templates::M
 				return false;
 
 			// this time we load a shader directly from a file
-			smart_refctd_ptr<IGPUShader> shader;
+			smart_refctd_ptr<IShader> shader;
 			{
 				IAssetLoader::SAssetLoadParams lp = {};
 				lp.logger = m_logger.get();
@@ -102,14 +102,9 @@ class StreamingAndBufferDeviceAddressApp final : public application_templates::M
 					return logFail("Could not load shader!");
 
 				// lets go straight from ICPUSpecializedShader to IGPUSpecializedShader
-				auto source = IAsset::castDown<ICPUShader>(assets[0]);
+				shader = IAsset::castDown<IShader>(assets[0]);
 				// The down-cast should not fail!
-				assert(source);
-
-				// this time we skip the use of the asset converter since the ICPUShader->IGPUShader path is quick and simple
-				shader = m_device->createShader(source.get());
-				if (!shader)
-					return logFail("Creation of a GPU Shader to from CPU Shader source failed!");
+				assert(shader);
 			}
 
 			// The StreamingTransientDataBuffers are actually composed on top of another useful utility called `CAsyncSingleBufferSubAllocator`
@@ -139,6 +134,8 @@ class StreamingAndBufferDeviceAddressApp final : public application_templates::M
 				IGPUComputePipeline::SCreationParams params = {};
 				params.layout = layout.get();
 				params.shader.shader = shader.get();
+				params.shader.entryPoint = "main";
+				params.shader.stage = hlsl::ShaderStage::ESS_COMPUTE;
 				if (!m_device->createComputePipelines(nullptr,{&params,1},&m_pipeline))
 					return logFail("Failed to create compute pipeline!\n");
 			}

From 01df790d87005ac4e4ecb8bebdd7534ad7d8f7d7 Mon Sep 17 00:00:00 2001
From: kevyuu <kevin.kayu@gmail.com>
Date: Tue, 22 Apr 2025 08:14:41 +0700
Subject: [PATCH 185/529] Fix example 07 to use IShader

---
 07_StagingAndMultipleQueues/main.cpp | 20 ++++++++++----------
 1 file changed, 10 insertions(+), 10 deletions(-)

diff --git a/07_StagingAndMultipleQueues/main.cpp b/07_StagingAndMultipleQueues/main.cpp
index 658a28a35..23f2246bc 100644
--- a/07_StagingAndMultipleQueues/main.cpp
+++ b/07_StagingAndMultipleQueues/main.cpp
@@ -246,7 +246,7 @@ class StagingAndMultipleQueuesApp final : public application_templates::BasicMul
 					.binding = 0,
 					.type = nbl::asset::IDescriptor::E_TYPE::ET_SAMPLED_IMAGE,
 					.createFlags = IGPUDescriptorSetLayout::SBinding::E_CREATE_FLAGS::ECF_NONE,
-					.stageFlags = IGPUShader::E_SHADER_STAGE::ESS_COMPUTE,
+					.stageFlags = IShader::E_SHADER_STAGE::ESS_COMPUTE,
 					.count = 1,
 					.immutableSamplers = nullptr
 				},
@@ -254,7 +254,7 @@ class StagingAndMultipleQueuesApp final : public application_templates::BasicMul
 					.binding = 1,
 					.type = nbl::asset::IDescriptor::E_TYPE::ET_STORAGE_BUFFER,
 					.createFlags = IGPUDescriptorSetLayout::SBinding::E_CREATE_FLAGS::ECF_NONE,
-					.stageFlags = IGPUShader::E_SHADER_STAGE::ESS_COMPUTE,
+					.stageFlags = IShader::E_SHADER_STAGE::ESS_COMPUTE,
 					.count = 1,
 					.immutableSamplers = nullptr
 				}
@@ -281,18 +281,17 @@ class StagingAndMultipleQueuesApp final : public application_templates::BasicMul
 		}
 
 		// LOAD SHADER FROM FILE
-		smart_refctd_ptr<ICPUShader> source;
+		smart_refctd_ptr<IShader> source;
 		{
-			source = loadFistAssetInBundle<ICPUShader>("../app_resources/comp_shader.hlsl");
-			source->setShaderStage(IShader::E_SHADER_STAGE::ESS_COMPUTE); // can also be done via a #pragma in the shader
+			source = loadFistAssetInBundle<IShader>("../app_resources/comp_shader.hlsl");
 		}
 
 		if (!source)
 			logFailAndTerminate("Could not create a CPU shader!");
 
-		core::smart_refctd_ptr<IGPUShader> shader = m_device->createShader(source.get());
+		core::smart_refctd_ptr<IShader> shader = m_device->compileShader({ source.get() });
 		if(!shader)
-			logFailAndTerminate("Could not create a GPU shader!");
+			logFailAndTerminate("Could not compile shader to spirv!");
 
 		// CREATE COMPUTE PIPELINE
 		SPushConstantRange pc[1];
@@ -312,6 +311,7 @@ class StagingAndMultipleQueuesApp final : public application_templates::BasicMul
 			// Theoretically a blob of SPIR-V can contain multiple named entry points and one has to be chosen, in practice most compilers only support outputting one (and glslang used to require it be called "main")
 			params.shader.entryPoint = "main";
 			params.shader.shader = shader.get();
+			params.shader.stage = hlsl::ShaderStage::ESS_COMPUTE;
 			// we'll cover the specialization constant API in another example
 			if (!m_device->createComputePipelines(nullptr,{&params,1},&pipeline))
 				logFailAndTerminate("Failed to create pipelines (compile & link shaders)!\n");
@@ -432,15 +432,15 @@ class StagingAndMultipleQueuesApp final : public application_templates::BasicMul
 			submitInfo[0].waitSemaphores = waitSemaphoreSubmitInfo;
 			// there's no save to wait on, or need to prevent signal-after-submit because Renderdoc freezes because it
 			// starts capturing immediately upon a submit and can't defer a capture till semaphores signal.
-			if (imageToProcessId<SUBMITS_IN_FLIGHT || m_api->isRunningInRenderdoc())
+			if (imageToProcessId<SUBMITS_IN_FLIGHT || m_api->isRunningInGraphicsDebugger())
 				submitInfo[0].waitSemaphores = {waitSemaphoreSubmitInfo,1};
-			if (m_api->isRunningInRenderdoc() && imageToProcessId>=SUBMITS_IN_FLIGHT)
+			if (m_api->isRunningInGraphicsDebugger() && imageToProcessId>=SUBMITS_IN_FLIGHT)
 			for (auto old = histogramsSaved.load(); old < histogramSaveWaitSemaphoreValue; old = histogramsSaved.load())
 				histogramsSaved.wait(old);
 			// Some Devices like all of the Intel GPUs do not have enough queues for us to allocate different queues to compute and transfers,
 			// so our `BasicMultiQueueApplication` will "alias" a single queue to both usages. Normally you don't need to care, but here we're
 			// attempting to do "out-of-order" "submit-before-signal" so we need to "hold back" submissions if the queues are aliased!
-			if (getTransferUpQueue()==computeQueue || m_api->isRunningInRenderdoc())
+			if (getTransferUpQueue()==computeQueue || m_api->isRunningInGraphicsDebugger())
 			for (auto old = transfersSubmitted.load(); old <= imageToProcessId; old = transfersSubmitted.load())
 				transfersSubmitted.wait(old);
 			computeQueue->submit(submitInfo);

From f46522c3043b132b4aa4a7765ac41f1c9173ae66 Mon Sep 17 00:00:00 2001
From: kevyuu <kevin.kayu@gmail.com>
Date: Tue, 22 Apr 2025 08:14:53 +0700
Subject: [PATCH 186/529] Fix example 70 to use IShader

---
 .../compute/advectParticles.comp.hlsl         |  1 +
 .../compute/applyBodyForces.comp.hlsl         |  1 +
 .../app_resources/compute/diffusion.comp.hlsl |  1 +
 .../compute/genParticleVertices.comp.hlsl     |  1 +
 .../compute/particlesInit.comp.hlsl           |  1 +
 .../compute/prepareCellUpdate.comp.hlsl       |  1 +
 .../compute/pressureSolver.comp.hlsl          |  1 +
 .../compute/updateFluidCells.comp.hlsl        |  1 +
 .../fluidParticles.fragment.hlsl              |  1 +
 .../app_resources/fluidParticles.vertex.hlsl  |  1 +
 70_FLIPFluids/main.cpp                        | 34 +++++++++++--------
 11 files changed, 30 insertions(+), 14 deletions(-)

diff --git a/70_FLIPFluids/app_resources/compute/advectParticles.comp.hlsl b/70_FLIPFluids/app_resources/compute/advectParticles.comp.hlsl
index 2d329ac85..64e94f262 100644
--- a/70_FLIPFluids/app_resources/compute/advectParticles.comp.hlsl
+++ b/70_FLIPFluids/app_resources/compute/advectParticles.comp.hlsl
@@ -26,6 +26,7 @@ using namespace nbl::hlsl;
 
 // TODO: delta time push constant? (but then for CI need a commandline `-fixed-timestep=MS` and `-frames=N` option too)
 [numthreads(WorkgroupSize, 1, 1)]
+[shader("compute")]
 void main(uint32_t3 ID : SV_DispatchThreadID)
 {
     uint32_t pid = ID.x;
diff --git a/70_FLIPFluids/app_resources/compute/applyBodyForces.comp.hlsl b/70_FLIPFluids/app_resources/compute/applyBodyForces.comp.hlsl
index 8ffc5e821..b2c1e0b3f 100644
--- a/70_FLIPFluids/app_resources/compute/applyBodyForces.comp.hlsl
+++ b/70_FLIPFluids/app_resources/compute/applyBodyForces.comp.hlsl
@@ -14,6 +14,7 @@ cbuffer GridData
 
 // TODO: can this kernel be fused with any preceeding/succeeding it?
 [numthreads(WorkgroupGridDim, WorkgroupGridDim, WorkgroupGridDim)]
+[shader("compute")]
 void main(uint32_t3 ID : SV_DispatchThreadID)
 {
     // only gravity for now
diff --git a/70_FLIPFluids/app_resources/compute/diffusion.comp.hlsl b/70_FLIPFluids/app_resources/compute/diffusion.comp.hlsl
index 43a57ed38..e53c91d2d 100644
--- a/70_FLIPFluids/app_resources/compute/diffusion.comp.hlsl
+++ b/70_FLIPFluids/app_resources/compute/diffusion.comp.hlsl
@@ -34,6 +34,7 @@ groupshared uint16_t3 sAxisCellMat[14][14][14]; // TODO: `uint16_t` per axis is
 groupshared float16_t3 sDiffusion[14][14][14];
 
 [numthreads(WorkgroupGridDim, WorkgroupGridDim, WorkgroupGridDim)]
+[shader("compute")]
 void setAxisCellMaterial(uint32_t3 ID : SV_DispatchThreadID)
 {
     int3 cellIdx = ID;
diff --git a/70_FLIPFluids/app_resources/compute/genParticleVertices.comp.hlsl b/70_FLIPFluids/app_resources/compute/genParticleVertices.comp.hlsl
index b66db1ca2..4c4a76690 100644
--- a/70_FLIPFluids/app_resources/compute/genParticleVertices.comp.hlsl
+++ b/70_FLIPFluids/app_resources/compute/genParticleVertices.comp.hlsl
@@ -57,6 +57,7 @@ static const float2 quadUVs[4] = {
 using namespace nbl::hlsl;
 
 [numthreads(WorkgroupSize, 1, 1)]
+[shader("compute")]
 void main(uint32_t3 ID : SV_DispatchThreadID)
 {
     uint32_t pid = ID.x;
diff --git a/70_FLIPFluids/app_resources/compute/particlesInit.comp.hlsl b/70_FLIPFluids/app_resources/compute/particlesInit.comp.hlsl
index 173929b10..27bf4366f 100644
--- a/70_FLIPFluids/app_resources/compute/particlesInit.comp.hlsl
+++ b/70_FLIPFluids/app_resources/compute/particlesInit.comp.hlsl
@@ -17,6 +17,7 @@ cbuffer GridData
 };
 
 [numthreads(WorkgroupSize, 1, 1)]
+[shader("compute")]
 void main(uint32_t3 ID : SV_DispatchThreadID)
 {
     uint32_t pid = ID.x;
diff --git a/70_FLIPFluids/app_resources/compute/prepareCellUpdate.comp.hlsl b/70_FLIPFluids/app_resources/compute/prepareCellUpdate.comp.hlsl
index fe82fe946..157da5bb8 100644
--- a/70_FLIPFluids/app_resources/compute/prepareCellUpdate.comp.hlsl
+++ b/70_FLIPFluids/app_resources/compute/prepareCellUpdate.comp.hlsl
@@ -42,6 +42,7 @@ float getWeight(float3 pPos, float3 cPos, float invSpacing)
 }
 
 [numthreads(WorkgroupSize, 1, 1)]
+[shader("compute")]
 void main(uint32_t3 ID : SV_DispatchThreadID)
 {
     uint pid = ID.x;
diff --git a/70_FLIPFluids/app_resources/compute/pressureSolver.comp.hlsl b/70_FLIPFluids/app_resources/compute/pressureSolver.comp.hlsl
index 668b15c31..b5db995c5 100644
--- a/70_FLIPFluids/app_resources/compute/pressureSolver.comp.hlsl
+++ b/70_FLIPFluids/app_resources/compute/pressureSolver.comp.hlsl
@@ -36,6 +36,7 @@ groupshared float sDivergence[14][14][14];
 groupshared float sPressure[14][14][14];
 
 [numthreads(WorkgroupGridDim, WorkgroupGridDim, WorkgroupGridDim)]
+[shader("compute")]
 void calculateNegativeDivergence(uint32_t3 ID : SV_DispatchThreadID)
 {
     int3 cellIdx = ID;
diff --git a/70_FLIPFluids/app_resources/compute/updateFluidCells.comp.hlsl b/70_FLIPFluids/app_resources/compute/updateFluidCells.comp.hlsl
index 9d7fabd52..62ddfd822 100644
--- a/70_FLIPFluids/app_resources/compute/updateFluidCells.comp.hlsl
+++ b/70_FLIPFluids/app_resources/compute/updateFluidCells.comp.hlsl
@@ -40,6 +40,7 @@ void updateFluidCells(uint32_t3 ID : SV_DispatchThreadID)
 }
 
 [numthreads(WorkgroupGridDim, WorkgroupGridDim, WorkgroupGridDim)]
+[shader("compute")]
 void updateNeighborFluidCells(uint32_t3 ID : SV_DispatchThreadID)
 {
     int3 cIdx = ID;
diff --git a/70_FLIPFluids/app_resources/fluidParticles.fragment.hlsl b/70_FLIPFluids/app_resources/fluidParticles.fragment.hlsl
index e556ce8ed..cac1bfa4a 100644
--- a/70_FLIPFluids/app_resources/fluidParticles.fragment.hlsl
+++ b/70_FLIPFluids/app_resources/fluidParticles.fragment.hlsl
@@ -9,6 +9,7 @@ cbuffer CameraData // TODO: BDA instead of UBO, one less thing in DSLayout
     SMVPParams camParams;
 };
 
+[shader("pixel")]
 float4 main(PSInput input, out float depthTest : SV_DEPTHGREATEREQUAL) : SV_TARGET
 {
     float3 N;
diff --git a/70_FLIPFluids/app_resources/fluidParticles.vertex.hlsl b/70_FLIPFluids/app_resources/fluidParticles.vertex.hlsl
index 4708083c6..89d37eb6f 100644
--- a/70_FLIPFluids/app_resources/fluidParticles.vertex.hlsl
+++ b/70_FLIPFluids/app_resources/fluidParticles.vertex.hlsl
@@ -14,6 +14,7 @@ struct SPushConstants
 #include "nbl/builtin/hlsl/bda/__ptr.hlsl"
 using namespace nbl::hlsl;
 
+[shader("vertex")]
 PSInput main(uint vertexID : SV_VertexID)
 {
     PSInput output;
diff --git a/70_FLIPFluids/main.cpp b/70_FLIPFluids/main.cpp
index 93e753b68..a0d2ad95d 100644
--- a/70_FLIPFluids/main.cpp
+++ b/70_FLIPFluids/main.cpp
@@ -9,6 +9,8 @@
 #include <nbl/builtin/hlsl/cpp_compat.hlsl>
 #include <nbl/builtin/hlsl/cpp_compat/matrix.hlsl>
 
+#include "nbl/asset/metadata/CHLSLMetadata.h"
+
 using namespace nbl::hlsl;
 using namespace nbl;
 using namespace core;
@@ -372,6 +374,7 @@ class FLIPFluidsApp final : public examples::SimpleWindowedApplication, public a
                 params.layout = pipelineLayout.get();
                 params.shader.entryPoint = entryPoint;
                 params.shader.shader = shader.get();
+                params.shader.stage = ESS_COMPUTE;
                 
                 m_device->createComputePipelines(nullptr, { &params,1 }, &pipeline);
             };
@@ -628,6 +631,7 @@ class FLIPFluidsApp final : public examples::SimpleWindowedApplication, public a
                 params.layout = pipelineLayout.get();
                 params.shader.entryPoint = iterateKernel;
                 params.shader.shader = iterateShader.get();
+                params.shader.stage = ESS_COMPUTE;
 
                 m_device->createComputePipelines(nullptr, { &params,1 }, &m_iterateDiffusionPipeline);
             }
@@ -636,6 +640,7 @@ class FLIPFluidsApp final : public examples::SimpleWindowedApplication, public a
                 params.layout = pipelineLayout.get();
                 params.shader.entryPoint = applyKernel;
                 params.shader.shader = applyShader.get();
+                params.shader.stage = ESS_COMPUTE;
 
                 m_device->createComputePipelines(nullptr, { &params,1 }, &m_diffusionPipeline);
             }
@@ -1401,7 +1406,7 @@ class FLIPFluidsApp final : public examples::SimpleWindowedApplication, public a
         numParticles = m_gridData.particleInitSize.x * m_gridData.particleInitSize.y * m_gridData.particleInitSize.z * particlesPerCell;
     }
 
-    smart_refctd_ptr<IGPUShader> compileShader(const std::string& filePath, const std::string& entryPoint = "main")
+    smart_refctd_ptr<IShader> compileShader(const std::string& filePath, const std::string& entryPoint = "main")
     {
         IAssetLoader::SAssetLoadParams lparams = {};
         lparams.logger = m_logger.get();
@@ -1415,14 +1420,16 @@ class FLIPFluidsApp final : public examples::SimpleWindowedApplication, public a
         
         const auto assets = bundle.getContents();
         assert(assets.size() == 1);
-        smart_refctd_ptr<ICPUShader> shaderSrc = IAsset::castDown<ICPUShader>(assets[0]);
+        smart_refctd_ptr<IShader> shaderSrc = IAsset::castDown<IShader>(assets[0]);
+        const auto hlslMetadata = static_cast<const CHLSLMetadata*>(bundle.getMetadata());
+        const auto shaderStage = hlslMetadata->shaderStages->front();
 
-        smart_refctd_ptr<ICPUShader> shader = shaderSrc;
+        smart_refctd_ptr<IShader> shader = shaderSrc;
         if (entryPoint != "main")
         {
             auto compiler = make_smart_refctd_ptr<asset::CHLSLCompiler>(smart_refctd_ptr(m_system));
             CHLSLCompiler::SOptions options = {};
-            options.stage = shaderSrc->getStage();
+            options.stage = shaderStage;
             if (!(options.stage == IShader::E_SHADER_STAGE::ESS_COMPUTE || options.stage == IShader::E_SHADER_STAGE::ESS_FRAGMENT))
                 options.stage = IShader::E_SHADER_STAGE::ESS_VERTEX;
             options.targetSpirvVersion = m_device->getPhysicalDevice()->getLimits().spirvVersion;
@@ -1443,7 +1450,7 @@ class FLIPFluidsApp final : public examples::SimpleWindowedApplication, public a
             shader = compiler->compileToSPIRV((const char*)shaderSrc->getContent()->getPointer(), options);
         }
 
-        return m_device->createShader(shader.get());
+        return m_device->compileShader({ shader.get() });
     }
 
     // TODO: there's a method in IUtilities for this
@@ -1562,7 +1569,7 @@ class FLIPFluidsApp final : public examples::SimpleWindowedApplication, public a
 
         // init shaders and pipeline
 
-        auto compileShader = [&](const std::string& filePath, IShader::E_SHADER_STAGE stage) -> smart_refctd_ptr<IGPUShader>
+        auto compileShader = [&](const std::string& filePath) -> smart_refctd_ptr<IShader>
             {
                 IAssetLoader::SAssetLoadParams lparams = {};
                 lparams.logger = m_logger.get();
@@ -1576,15 +1583,14 @@ class FLIPFluidsApp final : public examples::SimpleWindowedApplication, public a
         
                 const auto assets = bundle.getContents();
                 assert(assets.size() == 1);
-                smart_refctd_ptr<ICPUShader> shaderSrc = IAsset::castDown<ICPUShader>(assets[0]);
-                shaderSrc->setShaderStage(stage);
+                smart_refctd_ptr<IShader> shaderSrc = IAsset::castDown<IShader>(assets[0]);
                 if (!shaderSrc)
                     return nullptr;
 
-                return m_device->createShader(shaderSrc.get());
+                return m_device->compileShader({ shaderSrc.get() });
             };
-        auto vs = compileShader("app_resources/fluidParticles.vertex.hlsl", IShader::E_SHADER_STAGE::ESS_VERTEX);
-        auto fs = compileShader("app_resources/fluidParticles.fragment.hlsl", IShader::E_SHADER_STAGE::ESS_FRAGMENT);
+        auto vs = compileShader("app_resources/fluidParticles.vertex.hlsl");
+        auto fs = compileShader("app_resources/fluidParticles.fragment.hlsl");
 
         smart_refctd_ptr<video::IGPUDescriptorSetLayout> descriptorSetLayout1;
         {
@@ -1629,9 +1635,9 @@ class FLIPFluidsApp final : public examples::SimpleWindowedApplication, public a
         blendParams.blendParams[0u].colorWriteMask = (1u << 0u) | (1u << 1u) | (1u << 2u) | (1u << 3u);
 
         {
-            IGPUShader::SSpecInfo specInfo[3] = {
-                {.shader = vs.get()},
-                {.shader = fs.get()},
+            IPipelineBase::SShaderSpecInfo specInfo[] = {
+                {.shader = vs.get(), .entryPoint = "main", .stage = ESS_VERTEX, },
+                {.shader = fs.get(), .entryPoint = "main", .stage = ESS_FRAGMENT, },
             };
 
             const asset::SPushConstantRange pcRange = { .stageFlags = IShader::E_SHADER_STAGE::ESS_VERTEX, .offset = 0, .size = sizeof(uint64_t) };

From b0b6f648d62ee274bd3b6b34c30f992183402126 Mon Sep 17 00:00:00 2001
From: kevyuu <kevin.kayu@gmail.com>
Date: Tue, 22 Apr 2025 08:15:02 +0700
Subject: [PATCH 187/529] Fix example 30 to use IShader

---
 30_ComputeShaderPathTracer/main.cpp | 13 +++++++------
 1 file changed, 7 insertions(+), 6 deletions(-)

diff --git a/30_ComputeShaderPathTracer/main.cpp b/30_ComputeShaderPathTracer/main.cpp
index 26d673002..44a4dd6ef 100644
--- a/30_ComputeShaderPathTracer/main.cpp
+++ b/30_ComputeShaderPathTracer/main.cpp
@@ -313,12 +313,11 @@ class ComputeShaderPathtracer final : public examples::SimpleWindowedApplication
 						std::exit(-1);
 					}
 
-					auto source = IAsset::castDown<ICPUShader>(assets[0]);
+					auto source = IAsset::castDown<IShader>(assets[0]);
 					// The down-cast should not fail!
 					assert(source);
 
-					// this time we skip the use of the asset converter since the ICPUShader->IGPUShader path is quick and simple
-					auto shader = m_device->createShader(source.get());
+					auto shader = m_device->compileShader({ .source = source.get(), .stage = ESS_COMPUTE });
 					if (!shader)
 					{
 						m_logger->log("Shader creationed failed: %s!", ILogger::ELL_ERROR, pathToShader);
@@ -352,9 +351,10 @@ class ComputeShaderPathtracer final : public examples::SimpleWindowedApplication
 						params.layout = ptPipelineLayout.get();
 						params.shader.shader = ptShader.get();
 						params.shader.entryPoint = "main";
+						params.shader.stage = ESS_COMPUTE;
 						params.shader.entries = nullptr;
 						params.shader.requireFullSubgroups = true;
-						params.shader.requiredSubgroupSize = static_cast<IGPUShader::SSpecInfo::SUBGROUP_SIZE>(5);
+						params.shader.requiredSubgroupSize = static_cast<IPipelineBase::SShaderSpecInfo::SUBGROUP_SIZE>(5);
 						if (!m_device->createComputePipelines(nullptr, { &params, 1 }, m_PTPipelines.data() + index)) {
 							return logFail("Failed to create compute pipeline!\n");
 						}
@@ -373,9 +373,10 @@ class ComputeShaderPathtracer final : public examples::SimpleWindowedApplication
 					if (!fragmentShader)
 						return logFail("Failed to Load and Compile Fragment Shader: lumaMeterShader!");
 
-					const IGPUShader::SSpecInfo fragSpec = {
+					const IPipelineBase::SShaderSpecInfo fragSpec = {
+						.shader = fragmentShader.get(),
 						.entryPoint = "main",
-						.shader = fragmentShader.get()
+						.stage = ESS_FRAGMENT,
 					};
 
 					auto presentLayout = m_device->createPipelineLayout(

From a675cdb16ea67d88f5c730758f4c2cdbfa22d8a6 Mon Sep 17 00:00:00 2001
From: kevyuu <kevin.kayu@gmail.com>
Date: Tue, 22 Apr 2025 08:15:22 +0700
Subject: [PATCH 188/529] Fix example 10 to use IShader

---
 .../app_resources/prefix_sum_shader.comp.hlsl         |  1 +
 .../app_resources/scatter_shader.comp.hlsl            |  1 +
 10_CountingSort/main.cpp                              | 11 ++++++-----
 3 files changed, 8 insertions(+), 5 deletions(-)

diff --git a/10_CountingSort/app_resources/prefix_sum_shader.comp.hlsl b/10_CountingSort/app_resources/prefix_sum_shader.comp.hlsl
index 1e5d2510e..b0301fc3f 100644
--- a/10_CountingSort/app_resources/prefix_sum_shader.comp.hlsl
+++ b/10_CountingSort/app_resources/prefix_sum_shader.comp.hlsl
@@ -4,6 +4,7 @@
 [[vk::push_constant]] CountingPushData pushData;
 
 [numthreads(WorkgroupSize,1,1)]
+[shader("compute")]
 void main(uint32_t3 ID : SV_GroupThreadID, uint32_t3 GroupID : SV_GroupID)
 {
     sort::CountingParameters < uint32_t > params;
diff --git a/10_CountingSort/app_resources/scatter_shader.comp.hlsl b/10_CountingSort/app_resources/scatter_shader.comp.hlsl
index fa502726f..ddecfca2b 100644
--- a/10_CountingSort/app_resources/scatter_shader.comp.hlsl
+++ b/10_CountingSort/app_resources/scatter_shader.comp.hlsl
@@ -6,6 +6,7 @@
 using DoublePtrAccessor = DoubleBdaAccessor<uint32_t>;
 
 [numthreads(WorkgroupSize, 1, 1)]
+[shader("compute")]
 void main(uint32_t3 ID : SV_GroupThreadID, uint32_t3 GroupID : SV_GroupID)
 {
     sort::CountingParameters<uint32_t> params;
diff --git a/10_CountingSort/main.cpp b/10_CountingSort/main.cpp
index 4d0c93516..1fd789ad1 100644
--- a/10_CountingSort/main.cpp
+++ b/10_CountingSort/main.cpp
@@ -37,7 +37,7 @@ class CountingSortApp final : public application_templates::MonoDeviceApplicatio
 			const uint32_t bucket_count = std::min((uint32_t)3000, MaxBucketCount);
 			const uint32_t elements_per_thread = ceil((float)ceil((float)element_count / limits.computeUnits) / WorkgroupSize);
 
-			auto prepShader = [&](const core::string& path) -> smart_refctd_ptr<IGPUShader>
+			auto prepShader = [&](const core::string& path) -> smart_refctd_ptr<IShader>
 			{
 				// this time we load a shader directly from a file
 				IAssetLoader::SAssetLoadParams lp = {};
@@ -51,7 +51,7 @@ class CountingSortApp final : public application_templates::MonoDeviceApplicatio
 					return nullptr;
 				}
 
-				auto source = IAsset::castDown<ICPUShader>(assets[0]);
+				auto source = IAsset::castDown<IShader>(assets[0]);
 				// The down-cast should not fail!
 				assert(source);
 			
@@ -63,8 +63,8 @@ class CountingSortApp final : public application_templates::MonoDeviceApplicatio
 					WorkgroupSize, bucket_count
 				);
 
-				// this time we skip the use of the asset converter since the ICPUShader->IGPUShader path is quick and simple
-				auto shader = m_device->createShader(overrideSource.get());
+				// this time we skip the use of the asset converter since the IShader->IGPUShader path is quick and simple
+				auto shader = m_device->compileShader({ overrideSource.get() });
 				if (!shader)
 				{
 					logFail("Creation of Prefix Sum Shader from CPU Shader source failed!");
@@ -92,9 +92,10 @@ class CountingSortApp final : public application_templates::MonoDeviceApplicatio
 				params.layout = layout.get();
 				params.shader.shader = prefixSumShader.get();
 				params.shader.entryPoint = "main";
+				params.shader.stage = hlsl::ShaderStage::ESS_COMPUTE;
 				params.shader.entries = nullptr;
 				params.shader.requireFullSubgroups = true;
-				params.shader.requiredSubgroupSize = static_cast<IGPUShader::SSpecInfo::SUBGROUP_SIZE>(5);
+				params.shader.requiredSubgroupSize = static_cast<IPipelineBase::SShaderSpecInfo::SUBGROUP_SIZE>(5);
 				if (!m_device->createComputePipelines(nullptr, { &params,1 }, &prefixSumPipeline))
 					return logFail("Failed to create compute pipeline!\n");
 				params.shader.shader = scatterShader.get();

From 4104de5f21d801c25a652649af271f7fe69560c1 Mon Sep 17 00:00:00 2001
From: kevyuu <kevin.kayu@gmail.com>
Date: Tue, 22 Apr 2025 08:15:32 +0700
Subject: [PATCH 189/529] Fix example 11 to use IShader

---
 11_FFT/app_resources/shader.comp.hlsl |  1 +
 11_FFT/main.cpp                       | 22 ++++++++++++----------
 2 files changed, 13 insertions(+), 10 deletions(-)

diff --git a/11_FFT/app_resources/shader.comp.hlsl b/11_FFT/app_resources/shader.comp.hlsl
index ecbf4f092..da3de00cd 100644
--- a/11_FFT/app_resources/shader.comp.hlsl
+++ b/11_FFT/app_resources/shader.comp.hlsl
@@ -60,6 +60,7 @@ struct Accessor
 };
 
 [numthreads(ConstevalParameters::WorkgroupSize,1,1)]
+[shader("compute")]
 void main(uint32_t3 ID : SV_DispatchThreadID)
 {
 	Accessor accessor = Accessor::create(pushConstants.deviceBufferAddress);
diff --git a/11_FFT/main.cpp b/11_FFT/main.cpp
index 80f5f856c..1cac98b1f 100644
--- a/11_FFT/main.cpp
+++ b/11_FFT/main.cpp
@@ -46,13 +46,13 @@ class FFT_Test final : public application_templates::MonoDeviceApplication, publ
 	smart_refctd_ptr<ISemaphore> m_timeline;
 	uint64_t semaphorValue = 0;
 
-	inline core::smart_refctd_ptr<video::IGPUShader> createShader(
+	inline core::smart_refctd_ptr<asset::IShader> createShader(
 		const char* includeMainName)
 	{
 		std::string prelude = "#include \"";
-		auto CPUShader = core::make_smart_refctd_ptr<ICPUShader>((prelude + includeMainName + "\"\n").c_str(), IShader::E_SHADER_STAGE::ESS_COMPUTE, IShader::E_CONTENT_TYPE::ECT_HLSL, includeMainName);
-		assert(CPUShader);
-		return m_device->createShader(CPUShader.get());
+		auto hlslShader = core::make_smart_refctd_ptr<IShader>((prelude + includeMainName + "\"\n").c_str(), IShader::E_CONTENT_TYPE::ECT_HLSL, includeMainName);
+		assert(hlslShader);
+		return m_device->compileShader({ hlslShader.get() });
 	}
 
 public:
@@ -70,7 +70,7 @@ class FFT_Test final : public application_templates::MonoDeviceApplication, publ
 			return false;
 
 		// this time we load a shader directly from a file
-		smart_refctd_ptr<IGPUShader> shader;
+		smart_refctd_ptr<IShader> shader;
 		/* {
 			IAssetLoader::SAssetLoadParams lp = {};
 			lp.logger = m_logger.get();
@@ -81,14 +81,14 @@ class FFT_Test final : public application_templates::MonoDeviceApplication, publ
 				return logFail("Could not load shader!");
 
 			// Cast down the asset to its proper type
-			auto source = IAsset::castDown<ICPUShader>(assets[0]);
+			auto source = IAsset::castDown<IShader>(assets[0]);
 			// The down-cast should not fail!
 			assert(source);
 
-			// Compile directly to IGPUShader
-			shader = m_device->createShader(source.get());
+			// Compile directly to SPIR-V Shader
+			shader = m_device->compileShader({ source.get() });
 			if (!shader)
-				return logFail("Creation of a GPU Shader to from CPU Shader source failed!");
+				return logFail("Creation of a SPIR-V Shader from HLSL Shader source failed!");
 		}*/
 		shader = createShader("app_resources/shader.comp.hlsl");
 
@@ -132,7 +132,9 @@ class FFT_Test final : public application_templates::MonoDeviceApplication, publ
 			IGPUComputePipeline::SCreationParams params = {};
 			params.layout = layout.get();
 			params.shader.shader = shader.get();
-			params.shader.requiredSubgroupSize = static_cast<IGPUShader::SSpecInfo::SUBGROUP_SIZE>(hlsl::findMSB(m_physicalDevice->getLimits().maxSubgroupSize));
+			params.shader.entryPoint = "main";
+			params.shader.stage = hlsl::ESS_COMPUTE;
+			params.shader.requiredSubgroupSize = static_cast<IPipelineBase::SShaderSpecInfo::SUBGROUP_SIZE>(hlsl::findMSB(m_physicalDevice->getLimits().maxSubgroupSize));
 			params.shader.requireFullSubgroups = true;
 			if (!m_device->createComputePipelines(nullptr, { &params,1 }, &m_pipeline))
 				return logFail("Failed to create compute pipeline!\n");

From 1d6fde6c0ff7496be33a1ff66b50a6adbc3a678f Mon Sep 17 00:00:00 2001
From: kevyuu <kevin.kayu@gmail.com>
Date: Tue, 22 Apr 2025 08:15:41 +0700
Subject: [PATCH 190/529] Fix example 22 to use IShader

---
 22_CppCompat/ITester.h | 19 ++++++++++---------
 22_CppCompat/main.cpp  | 10 +++++-----
 2 files changed, 15 insertions(+), 14 deletions(-)

diff --git a/22_CppCompat/ITester.h b/22_CppCompat/ITester.h
index a216fbf40..273f51663 100644
--- a/22_CppCompat/ITester.h
+++ b/22_CppCompat/ITester.h
@@ -5,6 +5,7 @@
 #include "app_resources/common.hlsl"
 #include "nbl/application_templates/MonoDeviceApplication.hpp"
 #include "nbl/application_templates/MonoAssetManagerAndBuiltinResourceApplication.hpp"
+#include "nbl/asset/metadata/CHLSLMetadata.h"
 
 using namespace nbl;
 
@@ -45,14 +46,15 @@ class ITester
             logFail("Failed to create Command Buffers!\n");
 
         // Load shaders, set up pipeline
-        core::smart_refctd_ptr<video::IGPUShader> shader;
+        core::smart_refctd_ptr<asset::IShader> shader;
+        auto shaderStage = ESS_UNKNOWN;
         {
             asset::IAssetLoader::SAssetLoadParams lp = {};
             lp.logger = m_logger.get();
             lp.workingDirectory = ""; // virtual root
             auto assetBundle = m_assetMgr->getAsset(pipleineSetupData.testShaderPath, lp);
             const auto assets = assetBundle.getContents();
-            if (assets.empty())
+            if (assets.empty() || assetBundle.getAssetType() != asset::IAsset::ET_SHADER)
             {
                 logFail("Could not load shader!");
                 assert(0);
@@ -60,12 +62,14 @@ class ITester
 
             // It would be super weird if loading a shader from a file produced more than 1 asset
             assert(assets.size() == 1);
-            core::smart_refctd_ptr<asset::ICPUShader> source = asset::IAsset::castDown<asset::ICPUShader>(assets[0]);
+            core::smart_refctd_ptr<asset::IShader> source = asset::IAsset::castDown<asset::IShader>(assets[0]);
+            const auto hlslMetadata = static_cast<const asset::CHLSLMetadata*>(assetBundle.getMetadata());
+            shaderStage = hlslMetadata->shaderStages->front();
 
             auto* compilerSet = m_assetMgr->getCompilerSet();
 
             asset::IShaderCompiler::SCompilerOptions options = {};
-            options.stage = source->getStage();
+            options.stage = shaderStage;
             options.targetSpirvVersion = m_device->getPhysicalDevice()->getLimits().spirvVersion;
             options.spirvOptimizer = nullptr;
             options.debugInfoFlags |= asset::IShaderCompiler::E_DEBUG_INFO_FLAGS::EDIF_SOURCE_BIT;
@@ -73,11 +77,7 @@ class ITester
             options.preprocessorOptions.logger = m_logger.get();
             options.preprocessorOptions.includeFinder = compilerSet->getShaderCompiler(source->getContentType())->getDefaultIncludeFinder();
 
-            auto spirv = compilerSet->compileToSPIRV(source.get(), options);
-
-            video::ILogicalDevice::SShaderCreationParameters params{};
-            params.cpushader = spirv.get();
-            shader = m_device->createShader(params);
+            shader = compilerSet->compileToSPIRV(source.get(), options);
         }
 
         if (!shader)
@@ -113,6 +113,7 @@ class ITester
             params.layout = m_pplnLayout.get();
             params.shader.entryPoint = "main";
             params.shader.shader = shader.get();
+            params.shader.stage = shaderStage;
             if (!m_device->createComputePipelines(nullptr, { &params,1 }, &m_pipeline))
                 logFail("Failed to create pipelines (compile & link shaders)!\n");
         }
diff --git a/22_CppCompat/main.cpp b/22_CppCompat/main.cpp
index 7fa2556c4..877831c55 100644
--- a/22_CppCompat/main.cpp
+++ b/22_CppCompat/main.cpp
@@ -84,7 +84,7 @@ class CompatibilityTest final : public MonoDeviceApplication, public MonoAssetMa
         m_commandPool = m_device->createCommandPool(m_queue->getFamilyIndex(), IGPUCommandPool::CREATE_FLAGS::RESET_COMMAND_BUFFER_BIT);
         m_commandPool->createCommandBuffers(IGPUCommandPool::BUFFER_LEVEL::PRIMARY, { &m_cmdbuf,1 }, smart_refctd_ptr(m_logger));
 
-        smart_refctd_ptr<IGPUShader> shader;
+        smart_refctd_ptr<IShader> shader;
         {
             IAssetLoader::SAssetLoadParams lp = {};
             lp.logger = m_logger.get();
@@ -94,14 +94,12 @@ class CompatibilityTest final : public MonoDeviceApplication, public MonoAssetMa
             if (assets.empty())
                 return logFail("Could not load shader!");
 
-            // lets go straight from ICPUSpecializedShader to IGPUSpecializedShader
-            auto source = IAsset::castDown<ICPUShader>(assets[0]);
+            auto source = IAsset::castDown<IShader>(assets[0]);
             // The down-cast should not fail!
             assert(source);
-            assert(source->getStage() == IShader::E_SHADER_STAGE::ESS_COMPUTE);
 
             // this time we skip the use of the asset converter since the ICPUShader->IGPUShader path is quick and simple
-            shader = m_device->createShader(source.get());
+            shader = m_device->compileShader({ source.get() });
             if (!shader)
                 return logFail("Creation of a GPU Shader to from CPU Shader source failed!");
         }
@@ -129,6 +127,8 @@ class CompatibilityTest final : public MonoDeviceApplication, public MonoAssetMa
             IGPUComputePipeline::SCreationParams params = {};
             params.layout = layout.get();
             params.shader.shader = shader.get();
+            params.shader.entryPoint = "main";
+            params.shader.stage = hlsl::ShaderStage::ESS_COMPUTE;
             if (!m_device->createComputePipelines(nullptr, { &params,1 }, &m_pipeline))
                 return logFail("Failed to create compute pipeline!\n");
         }

From 32c49741134c5498fdc47b5b29ff00313fbcec96 Mon Sep 17 00:00:00 2001
From: kevyuu <kevin.kayu@gmail.com>
Date: Tue, 22 Apr 2025 08:15:55 +0700
Subject: [PATCH 191/529] Fix example 23 to use IShader

---
 .../app_resources/testSubgroup.comp.hlsl      |  1 +
 .../app_resources/testWorkgroup.comp.hlsl     |  1 +
 23_ArithmeticUnitTest/main.cpp                | 19 ++++++++++---------
 3 files changed, 12 insertions(+), 9 deletions(-)

diff --git a/23_ArithmeticUnitTest/app_resources/testSubgroup.comp.hlsl b/23_ArithmeticUnitTest/app_resources/testSubgroup.comp.hlsl
index 479265d73..29114756d 100644
--- a/23_ArithmeticUnitTest/app_resources/testSubgroup.comp.hlsl
+++ b/23_ArithmeticUnitTest/app_resources/testSubgroup.comp.hlsl
@@ -12,6 +12,7 @@ uint32_t globalIndex()
 bool canStore() {return true;}
 
 [numthreads(WORKGROUP_SIZE,1,1)]
+[shader("compute")]
 void main()
 {
 	test();
diff --git a/23_ArithmeticUnitTest/app_resources/testWorkgroup.comp.hlsl b/23_ArithmeticUnitTest/app_resources/testWorkgroup.comp.hlsl
index 9bafae47f..d47dea29e 100644
--- a/23_ArithmeticUnitTest/app_resources/testWorkgroup.comp.hlsl
+++ b/23_ArithmeticUnitTest/app_resources/testWorkgroup.comp.hlsl
@@ -75,6 +75,7 @@ bool canStore()
 }
 
 [numthreads(WORKGROUP_SIZE,1,1)]
+[shader("compute")]
 void main()
 {
 	const type_t sourceVal = test();
diff --git a/23_ArithmeticUnitTest/main.cpp b/23_ArithmeticUnitTest/main.cpp
index 147d231e2..e2d7d3cfe 100644
--- a/23_ArithmeticUnitTest/main.cpp
+++ b/23_ArithmeticUnitTest/main.cpp
@@ -184,7 +184,7 @@ class ArithmeticUnitTestApp final : public application_templates::BasicMultiQueu
 				exit(-1);
 			}
 			auto firstAssetInBundle = bundle.getContents()[0];
-			return smart_refctd_ptr_static_cast<ICPUShader>(firstAssetInBundle);
+			return smart_refctd_ptr_static_cast<IShader>(firstAssetInBundle);
 		};
 
 		auto subgroupTestSource = getShaderSource("app_resources/testSubgroup.comp.hlsl");
@@ -276,17 +276,18 @@ class ArithmeticUnitTestApp final : public application_templates::BasicMultiQueu
 	}
 
 	// create pipeline (specialized every test) [TODO: turn into a future/async]
-	smart_refctd_ptr<IGPUComputePipeline> createPipeline(const ICPUShader* overridenUnspecialized, const uint8_t subgroupSizeLog2)
+	smart_refctd_ptr<IGPUComputePipeline> createPipeline(const IShader* overridenUnspecialized, const uint8_t subgroupSizeLog2)
 	{
-		auto shader = m_device->createShader(overridenUnspecialized);
+		auto shader = m_device->compileShader({ overridenUnspecialized });
 		IGPUComputePipeline::SCreationParams params = {};
 		params.layout = pipelineLayout.get();
 		params.shader = {
-			.entryPoint = "main",
 			.shader = shader.get(),
+			.entryPoint = "main",
+			.stage = hlsl::ESS_COMPUTE,
+			.requiredSubgroupSize = static_cast<IPipelineBase::SShaderSpecInfo::SUBGROUP_SIZE>(subgroupSizeLog2),
+			.requireFullSubgroups = true,
 			.entries = nullptr,
-			.requiredSubgroupSize = static_cast<IGPUShader::SSpecInfo::SUBGROUP_SIZE>(subgroupSizeLog2),
-			.requireFullSubgroups = true
 		};
 		core::smart_refctd_ptr<IGPUComputePipeline> pipeline;
 		if (!m_device->createComputePipelines(m_spirv_isa_cache.get(),{&params,1},&pipeline))
@@ -295,17 +296,17 @@ class ArithmeticUnitTestApp final : public application_templates::BasicMultiQueu
 	}
 
 	/*template<template<class> class Arithmetic, bool WorkgroupTest>
-	bool runTest(const smart_refctd_ptr<const ICPUShader>& source, const uint32_t elementCount, const uint32_t workgroupSize, uint32_t itemsPerWG = ~0u)
+	bool runTest(const smart_refctd_ptr<const IShader>& source, const uint32_t elementCount, const uint32_t workgroupSize, uint32_t itemsPerWG = ~0u)
 	{
 		return true;
 	}*/
 
 	template<template<class> class Arithmetic, bool WorkgroupTest>
-	bool runTest(const smart_refctd_ptr<const ICPUShader>& source, const uint32_t elementCount, const uint8_t subgroupSizeLog2, const uint32_t workgroupSize, uint32_t itemsPerWG = ~0u)
+	bool runTest(const smart_refctd_ptr<const IShader>& source, const uint32_t elementCount, const uint8_t subgroupSizeLog2, const uint32_t workgroupSize, uint32_t itemsPerWG = ~0u)
 	{
 		std::string arith_name = Arithmetic<bit_xor<float>>::name;
 
-		smart_refctd_ptr<ICPUShader> overridenUnspecialized;
+		smart_refctd_ptr<IShader> overridenUnspecialized;
 		if constexpr (WorkgroupTest)
 		{
 			overridenUnspecialized = CHLSLCompiler::createOverridenCopy(

From 8255a3e97851dd7dd500ed26c12b64a58c1e4f63 Mon Sep 17 00:00:00 2001
From: kevyuu <kevin.kayu@gmail.com>
Date: Tue, 22 Apr 2025 08:16:04 +0700
Subject: [PATCH 192/529] Fix example 24 to use IShader

---
 24_ColorSpaceTest/main.cpp | 15 ++++++++-------
 1 file changed, 8 insertions(+), 7 deletions(-)

diff --git a/24_ColorSpaceTest/main.cpp b/24_ColorSpaceTest/main.cpp
index 844f058fe..1c23a3f2f 100644
--- a/24_ColorSpaceTest/main.cpp
+++ b/24_ColorSpaceTest/main.cpp
@@ -161,7 +161,7 @@ class ColorSpaceTestSampleApp final : public examples::SimpleWindowedApplication
 					return logFail("Failed to create Full Screen Triangle protopipeline or load its vertex shader!");
 
 				// Load Custom Shader
-				auto loadCompileAndCreateShader = [&](const std::string& relPath) -> smart_refctd_ptr<IGPUShader>
+				auto loadCompileAndCreateShader = [&](const std::string& relPath) -> smart_refctd_ptr<IShader>
 					{
 						IAssetLoader::SAssetLoadParams lp = {};
 						lp.logger = m_logger.get();
@@ -172,11 +172,11 @@ class ColorSpaceTestSampleApp final : public examples::SimpleWindowedApplication
 							return nullptr;
 
 						// lets go straight from ICPUSpecializedShader to IGPUSpecializedShader
-						auto source = IAsset::castDown<ICPUShader>(assets[0]);
+						auto source = IAsset::castDown<IShader>(assets[0]);
 						if (!source)
 							return nullptr;
 
-						return m_device->createShader(source.get());
+						return m_device->compileShader({ source.get() });
 					};
 				auto fragmentShader = loadCompileAndCreateShader("app_resources/present.frag.hlsl");
 				if (!fragmentShader)
@@ -255,14 +255,15 @@ class ColorSpaceTestSampleApp final : public examples::SimpleWindowedApplication
 				// Now create the pipeline
 				{
 					const asset::SPushConstantRange range = {
-						.stageFlags = IShader::E_SHADER_STAGE::ESS_FRAGMENT,
+						.stageFlags = ESS_FRAGMENT,
 						.offset = 0,
 						.size = sizeof(push_constants_t)
 					};
 					auto layout = m_device->createPipelineLayout({ &range,1 }, nullptr, nullptr, nullptr, core::smart_refctd_ptr(dsLayout));
-					const IGPUShader::SSpecInfo fragSpec = {
+					const IPipelineBase::SShaderSpecInfo fragSpec = {
+						.shader = fragmentShader.get(),
 						.entryPoint = "main",
-						.shader = fragmentShader.get()
+						.stage = ESS_FRAGMENT,
 					};
 					m_pipeline = fsTriProtoPPln.createPipeline(fragSpec, layout.get(), scResources->getRenderpass()/*,default is subpass 0*/);
 					if (!m_pipeline)
@@ -796,7 +797,7 @@ class ColorSpaceTestSampleApp final : public examples::SimpleWindowedApplication
 							cmdbuf->beginRenderPass(info,IGPUCommandBuffer::SUBPASS_CONTENTS::INLINE);
 						}
 						cmdbuf->bindGraphicsPipeline(m_pipeline.get());
-						cmdbuf->pushConstants(m_pipeline->getLayout(),IGPUShader::E_SHADER_STAGE::ESS_FRAGMENT,0,sizeof(push_constants_t),&pc);
+						cmdbuf->pushConstants(m_pipeline->getLayout(),hlsl::ShaderStage::ESS_FRAGMENT,0,sizeof(push_constants_t),&pc);
 						cmdbuf->bindDescriptorSets(nbl::asset::EPBP_GRAPHICS,m_pipeline->getLayout(),3,1,&ds);
 						ext::FullScreenTriangle::recordDrawCall(cmdbuf);
 						cmdbuf->endRenderPass();

From 847927c291742bc2d95edd2312c75e2a9b835794 Mon Sep 17 00:00:00 2001
From: kevyuu <kevin.kayu@gmail.com>
Date: Tue, 22 Apr 2025 08:16:13 +0700
Subject: [PATCH 193/529] Fix example 25 to use IShader

---
 25_FilterTest/main.cpp | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/25_FilterTest/main.cpp b/25_FilterTest/main.cpp
index a66227225..4ce68d66c 100644
--- a/25_FilterTest/main.cpp
+++ b/25_FilterTest/main.cpp
@@ -868,7 +868,7 @@ class BlitFilterTestApp final : public virtual application_templates::BasicMulti
 									logger->log("Failed to fit the preload region in shared memory even for 1x1x1 workgroup!",ILogger::ELL_ERROR);
 									return false;
 								}
-								cmdbuf->pushConstants(layout,IGPUShader::E_SHADER_STAGE::ESS_COMPUTE,0,sizeof(params),&params);
+								cmdbuf->pushConstants(layout,hlsl::ShaderStage::ESS_COMPUTE,0,sizeof(params),&params);
 								cmdbuf->dispatch(params.perWG.getWorkgroupCount(outExtent16));
 								if (m_alphaSemantic==IBlitUtilities::EAS_REFERENCE_OR_COVERAGE)
 								{

From 20cd09a25ec3236188fce0c2933d999fcc1b8f99 Mon Sep 17 00:00:00 2001
From: kevyuu <kevin.kayu@gmail.com>
Date: Tue, 22 Apr 2025 08:16:22 +0700
Subject: [PATCH 194/529] Fix example 26 to use IShader

---
 26_Blur/app_resources/shader.comp.hlsl |  1 +
 26_Blur/main.cpp                       | 37 +++++++++++---------------
 2 files changed, 16 insertions(+), 22 deletions(-)

diff --git a/26_Blur/app_resources/shader.comp.hlsl b/26_Blur/app_resources/shader.comp.hlsl
index 94baa8d2a..99e876ccc 100644
--- a/26_Blur/app_resources/shader.comp.hlsl
+++ b/26_Blur/app_resources/shader.comp.hlsl
@@ -131,6 +131,7 @@ struct ScanSharedMemoryProxy
 };
 
 [numthreads(WORKGROUP_SIZE, 1, 1)]
+[shader("compute")]
 void main()
 {
     ScanSharedMemoryProxy scanSmemAccessor;
diff --git a/26_Blur/main.cpp b/26_Blur/main.cpp
index 8217c4e51..4910ba5f0 100644
--- a/26_Blur/main.cpp
+++ b/26_Blur/main.cpp
@@ -225,7 +225,7 @@ class BlurApp final : public examples::SimpleWindowedApplication, public applica
 			if (!m_vertImg || !m_device->allocate(reqs, m_vertImg.get()).isValid())
 				return logFail("Could not create HDR Image");
 
-			smart_refctd_ptr<IGPUShader> shader;
+			smart_refctd_ptr<IShader> shader;
 			{
 				IAssetLoader::SAssetLoadParams lp = {};
 				lp.logger = m_logger.get();
@@ -236,10 +236,10 @@ class BlurApp final : public examples::SimpleWindowedApplication, public applica
 					return logFail("Failed to load shader from disk");
 
 				// lets go straight from ICPUSpecializedShader to IGPUSpecializedShader
-				auto sourceRaw = IAsset::castDown<ICPUShader>(assets[0]);
+				auto sourceRaw = IAsset::castDown<IShader>(assets[0]);
 				if (!sourceRaw)
 					return logFail("Failed to load shader from disk");
-				smart_refctd_ptr<ICPUShader> source = CHLSLCompiler::createOverridenCopy(
+				smart_refctd_ptr<IShader> source = CHLSLCompiler::createOverridenCopy(
 					sourceRaw.get(),
 					"static const uint16_t WORKGROUP_SIZE = %d;\n"
 					"static const uint16_t MAX_SCANLINE_SIZE = %d;\n"
@@ -264,7 +264,7 @@ class BlurApp final : public examples::SimpleWindowedApplication, public applica
 				auto opt = make_smart_refctd_ptr<ISPIRVOptimizer>(optPasses);
 				shader = m_device->createShader(source.get(), opt.get());
 #else
-				shader = m_device->createShader(source.get());
+				shader = m_device->compileShader({ source.get() });
 #endif
 				if (!shader)
 					return false;
@@ -272,26 +272,19 @@ class BlurApp final : public examples::SimpleWindowedApplication, public applica
 
 			{
 				const asset::SPushConstantRange ranges[] = { {
-					.stageFlags = IGPUShader::E_SHADER_STAGE::ESS_COMPUTE,
+					.stageFlags = hlsl::ShaderStage::ESS_COMPUTE,
 					.offset = 0,
 					.size = sizeof(PushConstants)
 				} };
 				auto layout = m_device->createPipelineLayout(ranges, smart_refctd_ptr(dsLayout));
-				const IGPUComputePipeline::SCreationParams params[] = { {
-					{
-						.layout = layout.get()
-					},
-					{},
-					IGPUComputePipeline::SCreationParams::FLAGS::NONE,
-					{
-						.entryPoint = "main",
-						.shader = shader.get(),
-						.entries = nullptr,
-						.requiredSubgroupSize = static_cast<IGPUShader::SSpecInfo::SUBGROUP_SIZE>(hlsl::findMSB(m_physicalDevice->getLimits().maxSubgroupSize)),
-						.requireFullSubgroups = true
-					}
-				}};
-				if (!m_device->createComputePipelines(nullptr, params, &m_ppln))
+
+				IGPUComputePipeline::SCreationParams params = {};
+				params.layout = layout.get();
+				params.shader.shader = shader.get();
+				params.shader.entryPoint = "main";
+				params.shader.stage = hlsl::ShaderStage::ESS_COMPUTE;
+				params.shader.requireFullSubgroups = true;
+				if (!m_device->createComputePipelines(nullptr, { &params, 1 }, &m_ppln))
 					return logFail("Failed to create Pipeline");
 			}
 
@@ -626,7 +619,7 @@ class BlurApp final : public examples::SimpleWindowedApplication, public applica
 				cb->pipelineBarrier(E_DEPENDENCY_FLAGS::EDF_NONE, { .memBarriers = {}, .bufBarriers = {},.imgBarriers = {&vertImgBarrier,1} });
 				cb->bindDescriptorSets(E_PIPELINE_BIND_POINT::EPBP_COMPUTE, layout, 0, 1, &m_ds0.get());
 				PushConstants pc = { .radius = blurRadius, .activeAxis = 0, .edgeWrapMode = blurEdgeWrapMode };
-				cb->pushConstants(layout, IGPUShader::E_SHADER_STAGE::ESS_COMPUTE, 0, sizeof(pc), &pc);
+				cb->pushConstants(layout,  hlsl::ShaderStage::ESS_COMPUTE, 0, sizeof(pc), &pc);
 				cb->dispatch(image_params.extent.height, 1, 1);
 
 				image_memory_barrier_t horzImgBarrier = {
@@ -646,7 +639,7 @@ class BlurApp final : public examples::SimpleWindowedApplication, public applica
 				cb->pipelineBarrier(E_DEPENDENCY_FLAGS::EDF_NONE, { .memBarriers = {}, .bufBarriers = {},.imgBarriers = {&horzImgBarrier,1} });
 				cb->bindDescriptorSets(E_PIPELINE_BIND_POINT::EPBP_COMPUTE, layout, 0, 1, &m_ds1.get());
 				pc.activeAxis = 1;
-				cb->pushConstants(layout, IGPUShader::E_SHADER_STAGE::ESS_COMPUTE, 0, sizeof(pc), &pc);
+				cb->pushConstants(layout, hlsl::ShaderStage::ESS_COMPUTE, 0, sizeof(pc), &pc);
 				cb->dispatch(image_params.extent.width, 1, 1);
 			}
 

From fabe1dbdaec23694b2dc5d0b2ad5d6bea75eed9f Mon Sep 17 00:00:00 2001
From: kevyuu <kevin.kayu@gmail.com>
Date: Tue, 22 Apr 2025 08:16:31 +0700
Subject: [PATCH 195/529] Fix example 27 to use IShader

---
 .../app_resources/shader.comp.hlsl            |  1 +
 27_MPMCScheduler/main.cpp                     | 32 +++++++------------
 2 files changed, 13 insertions(+), 20 deletions(-)

diff --git a/27_MPMCScheduler/app_resources/shader.comp.hlsl b/27_MPMCScheduler/app_resources/shader.comp.hlsl
index c49ad018c..966963761 100644
--- a/27_MPMCScheduler/app_resources/shader.comp.hlsl
+++ b/27_MPMCScheduler/app_resources/shader.comp.hlsl
@@ -305,6 +305,7 @@ uint32_t3 gl_WorkGroupSize() {return uint32_t3(WorkgroupSizeX*WorkgroupSizeY,1,1
 }
 }
 [numthreads(WorkgroupSizeX*WorkgroupSizeY,1,1)]
+[shader("compute")]
 void main()
 {
     // manually push an explicit workload
diff --git a/27_MPMCScheduler/main.cpp b/27_MPMCScheduler/main.cpp
index c380bf3c6..03275d114 100644
--- a/27_MPMCScheduler/main.cpp
+++ b/27_MPMCScheduler/main.cpp
@@ -69,7 +69,7 @@ class MPMCSchedulerApp final : public examples::SimpleWindowedApplication, publi
 			if (!asset_base_t::onAppInitialized(std::move(system)))
 				return false;
 
-			smart_refctd_ptr<IGPUShader> shader;
+			smart_refctd_ptr<IShader> shader;
 			{
 				IAssetLoader::SAssetLoadParams lp = {};
 				lp.logger = m_logger.get();
@@ -80,11 +80,11 @@ class MPMCSchedulerApp final : public examples::SimpleWindowedApplication, publi
 					return logFail("Failed to load shader from disk");
 
 				// lets go straight from ICPUSpecializedShader to IGPUSpecializedShader
-				auto source = IAsset::castDown<ICPUShader>(assets[0]);
+				auto source = IAsset::castDown<IShader>(assets[0]);
 				if (!source)
 					return logFail("Failed to load shader from disk");
 
-				shader = m_device->createShader(source.get());
+				shader = m_device->compileShader({ source.get() });
 				if (!shader)
 					return false;
 			}
@@ -106,26 +106,18 @@ class MPMCSchedulerApp final : public examples::SimpleWindowedApplication, publi
 
 			{
 				const asset::SPushConstantRange ranges[] = {{
-					.stageFlags = IGPUShader::E_SHADER_STAGE::ESS_COMPUTE,
+					.stageFlags = IShader::E_SHADER_STAGE::ESS_COMPUTE,
 					.offset = 0,
 					.size = sizeof(PushConstants)
 				}};
 				auto layout = m_device->createPipelineLayout(ranges,smart_refctd_ptr(dsLayout));
-				const IGPUComputePipeline::SCreationParams params[] = { {
-					{
-						.layout = layout.get()
-					},
-					{},
-					IGPUComputePipeline::SCreationParams::FLAGS::NONE,
-					{
-						.entryPoint = "main",
-						.shader = shader.get(),
-						.entries = nullptr,
-						.requiredSubgroupSize = IGPUShader::SSpecInfo::SUBGROUP_SIZE::UNKNOWN,
-						.requireFullSubgroups = true
-					}
-				}};
-				if (!m_device->createComputePipelines(nullptr,params,&m_ppln))
+				IGPUComputePipeline::SCreationParams params;
+				params.layout = layout.get();
+				params.shader.shader = shader.get();
+				params.shader.entryPoint = "main";
+				params.shader.stage = hlsl::ShaderStage::ESS_COMPUTE;
+				params.shader.requireFullSubgroups = true;
+				if (!m_device->createComputePipelines(nullptr, { &params, 1 }, &m_ppln))
 					return logFail("Failed to create Pipeline");
 			}
 
@@ -306,7 +298,7 @@ class MPMCSchedulerApp final : public examples::SimpleWindowedApplication, publi
 					.sharedAcceptableIdleCount = 0,
 					.globalAcceptableIdleCount = 0
 				};
-				cb->pushConstants(layout,IGPUShader::E_SHADER_STAGE::ESS_COMPUTE,0,sizeof(pc),&pc);
+				cb->pushConstants(layout,hlsl::ShaderStage::ESS_COMPUTE,0,sizeof(pc),&pc);
 				cb->dispatch(WIN_W/WorkgroupSizeX,WIN_H/WorkgroupSizeY,1);
 			}
 

From d7f9f18171a0b02424386799652af4ef459e73c6 Mon Sep 17 00:00:00 2001
From: kevyuu <kevin.kayu@gmail.com>
Date: Tue, 22 Apr 2025 08:16:41 +0700
Subject: [PATCH 196/529] Fix example 28 to use IShader

---
 .../app_resources/fft_convolve_ifft.hlsl      |  1 +
 .../app_resources/image_fft_first_axis.hlsl   |  1 +
 .../app_resources/image_ifft_first_axis.hlsl  |  1 +
 .../app_resources/kernel_fft_first_axis.hlsl  |  1 +
 .../app_resources/kernel_fft_second_axis.hlsl |  1 +
 .../kernel_spectrum_normalize.hlsl            |  1 +
 28_FFTBloom/main.cpp                          | 25 ++++++++++---------
 7 files changed, 19 insertions(+), 12 deletions(-)

diff --git a/28_FFTBloom/app_resources/fft_convolve_ifft.hlsl b/28_FFTBloom/app_resources/fft_convolve_ifft.hlsl
index 73d9d7850..07c2ec8cf 100644
--- a/28_FFTBloom/app_resources/fft_convolve_ifft.hlsl
+++ b/28_FFTBloom/app_resources/fft_convolve_ifft.hlsl
@@ -223,6 +223,7 @@ NBL_CONSTEXPR_STATIC_INLINE float32_t2 PreloadedSecondAxisAccessor::KernelHalfPi
 NBL_CONSTEXPR_STATIC_INLINE vector<scalar_t, 2> PreloadedSecondAxisAccessor::One = {1.0f, 0.f};
 
 [numthreads(FFTParameters::WorkgroupSize, 1, 1)]
+[shader("compute")]
 void main(uint32_t3 ID : SV_DispatchThreadID)
 {
 	SharedMemoryAccessor sharedmemAccessor;
diff --git a/28_FFTBloom/app_resources/image_fft_first_axis.hlsl b/28_FFTBloom/app_resources/image_fft_first_axis.hlsl
index 864c64b1e..f1478a8d6 100644
--- a/28_FFTBloom/app_resources/image_fft_first_axis.hlsl
+++ b/28_FFTBloom/app_resources/image_fft_first_axis.hlsl
@@ -76,6 +76,7 @@ struct PreloadedFirstAxisAccessor : MultiChannelPreloadedAccessorBase
 };
 
 [numthreads(FFTParameters::WorkgroupSize, 1, 1)]
+[shader("compute")]
 void main(uint32_t3 ID : SV_DispatchThreadID)
 {
 	SharedMemoryAccessor sharedmemAccessor;
diff --git a/28_FFTBloom/app_resources/image_ifft_first_axis.hlsl b/28_FFTBloom/app_resources/image_ifft_first_axis.hlsl
index 9146073dd..b3bef3510 100644
--- a/28_FFTBloom/app_resources/image_ifft_first_axis.hlsl
+++ b/28_FFTBloom/app_resources/image_ifft_first_axis.hlsl
@@ -136,6 +136,7 @@ struct PreloadedFirstAxisAccessor : MultiChannelPreloadedAccessorMirrorTradeBase
 };
 
 [numthreads(FFTParameters::WorkgroupSize, 1, 1)]
+[shader("compute")]
 void main(uint32_t3 ID : SV_DispatchThreadID)
 {
 	SharedMemoryAccessor sharedmemAccessor;
diff --git a/28_FFTBloom/app_resources/kernel_fft_first_axis.hlsl b/28_FFTBloom/app_resources/kernel_fft_first_axis.hlsl
index 51f514c4a..741bac7db 100644
--- a/28_FFTBloom/app_resources/kernel_fft_first_axis.hlsl
+++ b/28_FFTBloom/app_resources/kernel_fft_first_axis.hlsl
@@ -68,6 +68,7 @@ struct PreloadedFirstAxisAccessor : MultiChannelPreloadedAccessorBase
 };
 
 [numthreads(FFTParameters::WorkgroupSize, 1, 1)]
+[shader("compute")]
 void main(uint32_t3 ID : SV_DispatchThreadID)
 {
 	SharedMemoryAccessor sharedmemAccessor;
diff --git a/28_FFTBloom/app_resources/kernel_fft_second_axis.hlsl b/28_FFTBloom/app_resources/kernel_fft_second_axis.hlsl
index ab7216da2..eaecb5d0f 100644
--- a/28_FFTBloom/app_resources/kernel_fft_second_axis.hlsl
+++ b/28_FFTBloom/app_resources/kernel_fft_second_axis.hlsl
@@ -200,6 +200,7 @@ struct PreloadedSecondAxisAccessor : MultiChannelPreloadedAccessorMirrorTradeBas
 };
 
 [numthreads(FFTParameters::WorkgroupSize, 1, 1)]
+[shader("compute")]
 void main(uint32_t3 ID : SV_DispatchThreadID)
 {
 	SharedMemoryAccessor sharedmemAccessor;
diff --git a/28_FFTBloom/app_resources/kernel_spectrum_normalize.hlsl b/28_FFTBloom/app_resources/kernel_spectrum_normalize.hlsl
index f2ef207d3..efe406301 100644
--- a/28_FFTBloom/app_resources/kernel_spectrum_normalize.hlsl
+++ b/28_FFTBloom/app_resources/kernel_spectrum_normalize.hlsl
@@ -2,6 +2,7 @@
 [[vk::binding(2, 0)]] RWTexture2DArray<float32_t2> kernelChannels;
 
 [numthreads(8, 8, 1)]
+[shader("compute")]
 void main(uint32_t3 ID : SV_DispatchThreadID)
 {
 	const scalar_t powerReciprocal = vk::RawBufferLoad<scalar_t>(pushConstants.rowMajorBufferAddress);
diff --git a/28_FFTBloom/main.cpp b/28_FFTBloom/main.cpp
index cc312c3be..4718a4090 100644
--- a/28_FFTBloom/main.cpp
+++ b/28_FFTBloom/main.cpp
@@ -169,7 +169,7 @@ class FFTBloomApp final : public examples::SimpleWindowedApplication, public app
 		float32_t totalSizeReciprocal;
 	};
 
-	inline core::smart_refctd_ptr<video::IGPUShader> createShader(const char* includeMainName, const SShaderConstevalParameters& shaderConstants)
+	inline core::smart_refctd_ptr<IShader> createShader(const char* includeMainName, const SShaderConstevalParameters& shaderConstants)
 	{
 		// The annoying "const static member field must be initialized outside of struct" bug strikes again
 		std::ostringstream kernelHalfPixelSizeStream;
@@ -204,18 +204,17 @@ class FFTBloomApp final : public examples::SimpleWindowedApplication, public app
 
 
 
-		auto CPUShader = core::make_smart_refctd_ptr<ICPUShader>((prelude+"\n#include \"" + includeMainName + "\"\n").c_str(),
-																IShader::E_SHADER_STAGE::ESS_COMPUTE, 
+		auto HLSLShader = core::make_smart_refctd_ptr<IShader>((prelude+"\n#include \"" + includeMainName + "\"\n").c_str(),
 																IShader::E_CONTENT_TYPE::ECT_HLSL, 
 																includeMainName);
-		assert(CPUShader);
+		assert(HLSLShader);
 
 		#ifndef _NBL_DEBUG
 		ISPIRVOptimizer::E_OPTIMIZER_PASS optPasses = ISPIRVOptimizer::EOP_STRIP_DEBUG_INFO;
 		auto opt = make_smart_refctd_ptr<ISPIRVOptimizer>(std::span<ISPIRVOptimizer::E_OPTIMIZER_PASS>(&optPasses, 1));
-		return m_device->createShader({ CPUShader.get(), opt.get(), m_readCache.get(), m_writeCache.get()});
+		return m_device->createShader({ HLSLShader.get(), opt.get(), m_readCache.get(), m_writeCache.get()});
 		#else 
-		return m_device->createShader({ CPUShader.get(), nullptr, m_readCache.get(), m_writeCache.get() });
+		return m_device->compileShader({ HLSLShader.get(), nullptr, m_readCache.get(), m_writeCache.get() });
 		#endif
 	}
 
@@ -709,7 +708,7 @@ class FFTBloomApp final : public examples::SimpleWindowedApplication, public app
 			// Normalization shader needs this info
 			uint16_t secondAxisFFTHalfLengthLog2 = elementsPerInvocationLog2 + workgroupSizeLog2 - 1;
 			// Create shaders
-			smart_refctd_ptr<IGPUShader> shaders[3];
+			smart_refctd_ptr<IShader> shaders[3];
 			uint16_t2 kernelDimensions = { kerDim.width, kerDim.height };
 			SShaderConstevalParameters::SShaderConstevalParametersCreateInfo shaderConstevalInfo = { .useHalfFloats = m_useHalfFloats, .elementsPerInvocationLog2 = elementsPerInvocationLog2, .workgroupSizeLog2 = workgroupSizeLog2, .numWorkgroupsLog2 = secondAxisFFTHalfLengthLog2, .previousWorkgroupSizeLog2 = workgroupSizeLog2 };
 			SShaderConstevalParameters shaderConstevalParameters(shaderConstevalInfo);
@@ -722,11 +721,12 @@ class FFTBloomApp final : public examples::SimpleWindowedApplication, public app
 			for (auto i = 0u; i < 3; i++)
 			{
 				params[i].layout = pipelineLayout.get();
-				params[i].shader.entryPoint = "main";
 				params[i].shader.shader = shaders[i].get();
+				params[i].shader.entryPoint = "main";
+				params[i].shader.stage = hlsl::ShaderStage::ESS_COMPUTE;
 				// Normalization doesn't require full subgroups
 				params[i].shader.requireFullSubgroups = bool(2-i);
-				params[i].shader.requiredSubgroupSize = static_cast<IGPUShader::SSpecInfo::SUBGROUP_SIZE>(hlsl::findMSB(deviceLimits.maxSubgroupSize));
+				params[i].shader.requiredSubgroupSize = static_cast<IPipelineBase::SShaderSpecInfo::SUBGROUP_SIZE>(hlsl::findMSB(deviceLimits.maxSubgroupSize));
 			}
 			
 			smart_refctd_ptr<IGPUComputePipeline> pipelines[3];
@@ -884,7 +884,7 @@ class FFTBloomApp final : public examples::SimpleWindowedApplication, public app
 		uint16_t firstAxisFFTHalfLengthLog2;
 		uint16_t firstAxisFFTElementsPerInvocationLog2;
 		uint16_t firstAxisFFTWorkgroupSizeLog2;
-		smart_refctd_ptr<IGPUShader> shaders[3];
+		smart_refctd_ptr<IShader> shaders[3];
 		{
 			auto [elementsPerInvocationLog2, workgroupSizeLog2] = workgroup::fft::optimalFFTParameters(deviceLimits.maxOptimallyResidentWorkgroupInvocations, m_marginSrcDim.height, deviceLimits.maxSubgroupSize);
 			SShaderConstevalParameters::SShaderConstevalParametersCreateInfo shaderConstevalInfo = { .useHalfFloats = m_useHalfFloats, .elementsPerInvocationLog2 = elementsPerInvocationLog2, .workgroupSizeLog2 = workgroupSizeLog2 };
@@ -926,9 +926,10 @@ class FFTBloomApp final : public examples::SimpleWindowedApplication, public app
 		IGPUComputePipeline::SCreationParams params[3] = {};
 		for (auto i = 0u; i < 3; i++) {
 			params[i].layout = pipelineLayout.get();
-			params[i].shader.entryPoint = "main";
 			params[i].shader.shader = shaders[i].get();
-			params[i].shader.requiredSubgroupSize = static_cast<IGPUShader::SSpecInfo::SUBGROUP_SIZE>(hlsl::findMSB(deviceLimits.maxSubgroupSize));
+			params[i].shader.entryPoint = "main";
+			params[i].shader.stage = hlsl::ShaderStage::ESS_COMPUTE;
+			params[i].shader.requiredSubgroupSize = static_cast<IPipelineBase::SShaderSpecInfo::SUBGROUP_SIZE>(hlsl::findMSB(deviceLimits.maxSubgroupSize));
 			params[i].shader.requireFullSubgroups = true;
 		}
 

From ad5054db7c46c50805bd697e3f8b28937c1310ba Mon Sep 17 00:00:00 2001
From: kevyuu <kevin.kayu@gmail.com>
Date: Tue, 22 Apr 2025 08:16:55 +0700
Subject: [PATCH 197/529] Fix example 71 to use IShader

---
 71_RayTracingPipeline/main.cpp | 41 ++++++++++++++++++++--------------
 1 file changed, 24 insertions(+), 17 deletions(-)

diff --git a/71_RayTracingPipeline/main.cpp b/71_RayTracingPipeline/main.cpp
index 35c750373..c9ee0eafb 100644
--- a/71_RayTracingPipeline/main.cpp
+++ b/71_RayTracingPipeline/main.cpp
@@ -137,7 +137,7 @@ class RaytracingPipelineApp final : public examples::SimpleWindowedApplication,
     }
 
     // Load Custom Shader
-    auto loadCompileAndCreateShader = [&](const std::string& relPath) -> smart_refctd_ptr<IGPUShader>
+    auto loadCompileAndCreateShader = [&](const std::string& relPath) -> smart_refctd_ptr<IShader>
         {
             IAssetLoader::SAssetLoadParams lp = {};
             lp.logger = m_logger.get();
@@ -145,14 +145,20 @@ class RaytracingPipelineApp final : public examples::SimpleWindowedApplication,
             auto assetBundle = m_assetMgr->getAsset(relPath, lp);
             const auto assets = assetBundle.getContents();
             if (assets.empty())
+            {
+                assert(false);
                 return nullptr;
+            }
 
             // lets go straight from ICPUSpecializedShader to IGPUSpecializedShader
-            auto sourceRaw = IAsset::castDown<ICPUShader>(assets[0]);
+            auto sourceRaw = IAsset::castDown<IShader>(assets[0]);
             if (!sourceRaw)
+            {
+                assert(false);
                 return nullptr;
+            }
 
-            return m_device->createShader({ sourceRaw.get(), nullptr, shaderReadCache.get(), shaderWriteCache.get() });
+            return m_device->compileShader({ sourceRaw.get(), nullptr, shaderReadCache.get(), shaderWriteCache.get() });
         };
 
     // load shaders
@@ -335,18 +341,18 @@ class RaytracingPipelineApp final : public examples::SimpleWindowedApplication,
         RTDS_COUNT
       };
 
-      IGPUShader::SSpecInfo shaders[RTDS_COUNT];
-      shaders[RTDS_RAYGEN] = {.shader = raygenShader.get()};
-      shaders[RTDS_MISS] = {.shader = missShader.get()};
-      shaders[RTDS_MISS_SHADOW] = { .shader = missShadowShader.get() };
-      shaders[RTDS_CLOSEST_HIT] = {.shader = closestHitShader.get()};
-      shaders[RTDS_SPHERE_CLOSEST_HIT] = {.shader = proceduralClosestHitShader.get()};
-      shaders[RTDS_ANYHIT_PRIMARY] = {.shader = anyHitShaderColorPayload.get()};
-      shaders[RTDS_ANYHIT_SHADOW] = {.shader = anyHitShaderShadowPayload.get()};
-      shaders[RTDS_INTERSECTION] = {.shader = intersectionHitShader.get() };
-      shaders[RTDS_DIRECTIONAL_CALL] = {.shader = directionalLightCallShader.get()};
-      shaders[RTDS_POINT_CALL] = {.shader = pointLightCallShader.get()};
-      shaders[RTDS_SPOT_CALL] = {.shader = spotLightCallShader.get()};
+      IPipelineBase::SShaderSpecInfo shaders[RTDS_COUNT];
+      shaders[RTDS_RAYGEN] = {.shader = raygenShader.get(), .entryPoint = "main", .stage = ESS_RAYGEN};
+      shaders[RTDS_MISS] = {.shader = missShader.get(), .entryPoint = "main", .stage = ESS_MISS};
+      shaders[RTDS_MISS_SHADOW] = { .shader = missShadowShader.get(), .entryPoint = "main", .stage = ESS_MISS};
+      shaders[RTDS_CLOSEST_HIT] = {.shader = closestHitShader.get(), .entryPoint = "main", .stage = ESS_CLOSEST_HIT};
+      shaders[RTDS_SPHERE_CLOSEST_HIT] = {.shader = proceduralClosestHitShader.get(), .entryPoint = "main", .stage = ESS_CLOSEST_HIT};
+      shaders[RTDS_ANYHIT_PRIMARY] = {.shader = anyHitShaderColorPayload.get(), .entryPoint = "main", .stage = ESS_ANY_HIT};
+      shaders[RTDS_ANYHIT_SHADOW] = {.shader = anyHitShaderShadowPayload.get(), .entryPoint = "main", .stage = ESS_ANY_HIT};
+      shaders[RTDS_INTERSECTION] = {.shader = intersectionHitShader.get(), .entryPoint = "main", .stage = ESS_INTERSECTION };
+      shaders[RTDS_DIRECTIONAL_CALL] = {.shader = directionalLightCallShader.get(), .entryPoint = "main", .stage = ESS_CALLABLE};
+      shaders[RTDS_POINT_CALL] = {.shader = pointLightCallShader.get(), .entryPoint = "main", .stage = ESS_CALLABLE};
+      shaders[RTDS_SPOT_CALL] = {.shader = spotLightCallShader.get(), .entryPoint = "main", .stage = ESS_CALLABLE};
 
       params.layout = pipelineLayout.get();
       params.shaders = std::span(shaders);
@@ -448,9 +454,10 @@ class RaytracingPipelineApp final : public examples::SimpleWindowedApplication,
       if (!fsTriProtoPPln)
         return logFail("Failed to create Full Screen Triangle protopipeline or load its vertex shader!");
 
-      const IGPUShader::SSpecInfo fragSpec = {
+      const IPipelineBase::SShaderSpecInfo fragSpec = {
+        .shader = fragmentShader.get(),
         .entryPoint = "main",
-        .shader = fragmentShader.get()
+        .stage = ESS_FRAGMENT,
       };
 
       auto presentLayout = m_device->createPipelineLayout(

From 3698fb4f2acbaad19ead55ef68a98f76bf5f5f4d Mon Sep 17 00:00:00 2001
From: kevyuu <kevin.kayu@gmail.com>
Date: Tue, 22 Apr 2025 08:17:11 +0700
Subject: [PATCH 198/529] Fix geometry scene creator to use IShader

---
 common/include/CGeomtryCreatorScene.hpp | 29 ++++++++++++-------------
 1 file changed, 14 insertions(+), 15 deletions(-)

diff --git a/common/include/CGeomtryCreatorScene.hpp b/common/include/CGeomtryCreatorScene.hpp
index 0d9bc6edd..6ffad2c73 100644
--- a/common/include/CGeomtryCreatorScene.hpp
+++ b/common/include/CGeomtryCreatorScene.hpp
@@ -46,7 +46,7 @@ constexpr static inline struct ClearValues
 	using image_view_t = std::conditional_t<WithConverter, nbl::asset::ICPUImageView, nbl::video::IGPUImageView>; \
 	using image_t = std::conditional_t<WithConverter, nbl::asset::ICPUImage, nbl::video::IGPUImage>; \
 	using buffer_t = std::conditional_t<WithConverter, nbl::asset::ICPUBuffer, nbl::video::IGPUBuffer>; \
-	using shader_t = std::conditional_t<WithConverter, nbl::asset::ICPUShader, nbl::video::IGPUShader>; \
+	using shader_t = nbl::asset::IShader; \
 	using graphics_pipeline_t = std::conditional_t<WithConverter, nbl::asset::ICPUGraphicsPipeline, nbl::video::IGPUGraphicsPipeline>; \
 	using descriptor_set = std::conditional_t<WithConverter, nbl::asset::ICPUDescriptorSet, nbl::video::IGPUDescriptorSet>; \
 }
@@ -764,36 +764,35 @@ class ResourceBuilder
 	{
 		EXPOSE_NABLA_NAMESPACES();
 
-		auto createShader = [&]<StringLiteral virtualPath>(IShader::E_SHADER_STAGE stage, smart_refctd_ptr<typename Types::shader_t>& outShader) -> smart_refctd_ptr<typename Types::shader_t>
+		auto createShader = [&]<StringLiteral virtualPath>(smart_refctd_ptr<typename Types::shader_t>& outShader) -> smart_refctd_ptr<typename Types::shader_t>
 		{
 			// TODO: use SPIRV loader & our ::system ns to get those cpu shaders, do not create myself (shit I forgot it exists)
 
 			const SBuiltinFile& in = ::geometry::creator::spirv::builtin::get_resource<virtualPath>();
 			const auto buffer = ICPUBuffer::create({ { in.size }, (void*)in.contents, core::getNullMemoryResource() }, adopt_memory);
-			auto shader = make_smart_refctd_ptr<ICPUShader>(smart_refctd_ptr(buffer), stage, IShader::E_CONTENT_TYPE::ECT_SPIRV, ""); // must create cpu instance regardless underlying type
+			auto shader = make_smart_refctd_ptr<IShader>(smart_refctd_ptr(buffer), IShader::E_CONTENT_TYPE::ECT_SPIRV, ""); // must create cpu instance regardless underlying type
 
 			if constexpr (withAssetConverter)
 			{
 				buffer->setContentHash(buffer->computeContentHash());
-				outShader = std::move(shader);
 			}
-			else
-				outShader = utilities->getLogicalDevice()->createShader(shader.get());
+
+      outShader = std::move(shader);
 
 			return outShader;
 		};
 
 		typename ResourcesBundleScratch::Shaders& basic = scratch.shaders[GeometriesCpu::GP_BASIC];
-		createShader.template operator() < NBL_CORE_UNIQUE_STRING_LITERAL_TYPE("geometryCreator/spirv/gc.basic.vertex.spv") > (IShader::E_SHADER_STAGE::ESS_VERTEX, basic.vertex);
-		createShader.template operator() < NBL_CORE_UNIQUE_STRING_LITERAL_TYPE("geometryCreator/spirv/gc.basic.fragment.spv") > (IShader::E_SHADER_STAGE::ESS_FRAGMENT, basic.fragment);
+		createShader.template operator() < NBL_CORE_UNIQUE_STRING_LITERAL_TYPE("geometryCreator/spirv/gc.basic.vertex.spv") > (basic.vertex);
+		createShader.template operator() < NBL_CORE_UNIQUE_STRING_LITERAL_TYPE("geometryCreator/spirv/gc.basic.fragment.spv") > (basic.fragment);
 
 		typename ResourcesBundleScratch::Shaders& cone = scratch.shaders[GeometriesCpu::GP_CONE];
-		createShader.template operator() < NBL_CORE_UNIQUE_STRING_LITERAL_TYPE("geometryCreator/spirv/gc.cone.vertex.spv") > (IShader::E_SHADER_STAGE::ESS_VERTEX, cone.vertex);
-		createShader.template operator() < NBL_CORE_UNIQUE_STRING_LITERAL_TYPE("geometryCreator/spirv/gc.basic.fragment.spv") > (IShader::E_SHADER_STAGE::ESS_FRAGMENT, cone.fragment); // note we reuse fragment from basic!
+		createShader.template operator() < NBL_CORE_UNIQUE_STRING_LITERAL_TYPE("geometryCreator/spirv/gc.cone.vertex.spv") > (cone.vertex);
+		createShader.template operator() < NBL_CORE_UNIQUE_STRING_LITERAL_TYPE("geometryCreator/spirv/gc.basic.fragment.spv") > (cone.fragment); // note we reuse fragment from basic!
 
 		typename ResourcesBundleScratch::Shaders& ico = scratch.shaders[GeometriesCpu::GP_ICO];
-		createShader.template operator() < NBL_CORE_UNIQUE_STRING_LITERAL_TYPE("geometryCreator/spirv/gc.ico.vertex.spv") > (IShader::E_SHADER_STAGE::ESS_VERTEX, ico.vertex);
-		createShader.template operator() < NBL_CORE_UNIQUE_STRING_LITERAL_TYPE("geometryCreator/spirv/gc.basic.fragment.spv") > (IShader::E_SHADER_STAGE::ESS_FRAGMENT, ico.fragment); // note we reuse fragment from basic!
+		createShader.template operator() < NBL_CORE_UNIQUE_STRING_LITERAL_TYPE("geometryCreator/spirv/gc.ico.vertex.spv") > (ico.vertex);
+		createShader.template operator() < NBL_CORE_UNIQUE_STRING_LITERAL_TYPE("geometryCreator/spirv/gc.basic.fragment.spv") > (ico.fragment); // note we reuse fragment from basic!
 			
 		for (const auto& it : scratch.shaders)
 		{
@@ -843,10 +842,10 @@ class ResourceBuilder
 
 			params.rasterization.faceCullingMode = EFCM_NONE;
 			{
-				const typename Types::shader_t::SSpecInfo info [] =
+				const IPipelineBase::SShaderSpecInfo info [] =
 				{
-					{.entryPoint = "VSMain", .shader = scratch.shaders[inGeometry.shadersType].vertex.get() },
-					{.entryPoint = "PSMain", .shader = scratch.shaders[inGeometry.shadersType].fragment.get() }
+					{.shader = scratch.shaders[inGeometry.shadersType].vertex.get(), .entryPoint = "VSMain", .stage = hlsl::ShaderStage::ESS_VERTEX},
+					{.shader = scratch.shaders[inGeometry.shadersType].fragment.get(), .entryPoint = "PSMain", .stage = hlsl::ShaderStage::ESS_FRAGMENT},
 				};
 
 				params.pipeline.layout = scratch.pipelineLayout.get();

From 2462d0adfa9a49e30dacbb0d25c3bd1acc47a62f Mon Sep 17 00:00:00 2001
From: Erfan Ahmadi <ahmadierfan99@gmail.com>
Date: Tue, 22 Apr 2025 08:58:22 +0330
Subject: [PATCH 199/529] multiple contours in fragment shader

---
 62_CAD/main.cpp                                   | 11 +++++++++--
 62_CAD/shaders/main_pipeline/fragment_shader.hlsl |  5 ++++-
 2 files changed, 13 insertions(+), 3 deletions(-)

diff --git a/62_CAD/main.cpp b/62_CAD/main.cpp
index 0f1653591..5f48af58a 100644
--- a/62_CAD/main.cpp
+++ b/62_CAD/main.cpp
@@ -3246,15 +3246,17 @@ class ComputerAidedDesign final : public examples::SimpleWindowedApplication, pu
 			mesh.setIndices(std::move(indices));
 
 			DTMSettingsInfo dtmInfo;
-			dtmInfo.mode = E_DTM_MODE::HEIGHT_SHADING | E_DTM_MODE::CONTOUR | E_DTM_MODE::OUTLINE;
+			//dtmInfo.mode = E_DTM_MODE::HEIGHT_SHADING | E_DTM_MODE::CONTOUR | E_DTM_MODE::OUTLINE;
 			//dtmInfo.mode = E_DTM_MODE::HEIGHT_SHADING;
+			dtmInfo.mode = E_DTM_MODE::CONTOUR;
+
 			dtmInfo.outlineStyleInfo.screenSpaceLineWidth = 0.0f;
 			dtmInfo.outlineStyleInfo.worldSpaceLineWidth = 3.0f;
 			dtmInfo.outlineStyleInfo.color = float32_t4(0.0f, 0.39f, 0.0f, 1.0f);
 			std::array<double, 4> outlineStipplePattern = { 0.0f, -5.0f, 20.0f, -5.0f };
 			dtmInfo.outlineStyleInfo.setStipplePatternData(outlineStipplePattern);
 
-			dtmInfo.contourSettingsCount = 1u;
+			dtmInfo.contourSettingsCount = 2u;
 			dtmInfo.contourSettings[0u].startHeight = 20;
 			dtmInfo.contourSettings[0u].endHeight = 90;
 			dtmInfo.contourSettings[0u].heightInterval = 10;
@@ -3264,6 +3266,11 @@ class ComputerAidedDesign final : public examples::SimpleWindowedApplication, pu
 			std::array<double, 4> contourStipplePattern = { 0.0f, -5.0f, 10.0f, -5.0f };
 			dtmInfo.contourSettings[0u].lineStyleInfo.setStipplePatternData(contourStipplePattern);
 
+			dtmInfo.contourSettings[1u] = dtmInfo.contourSettings[0u];
+			dtmInfo.contourSettings[1u].startHeight += 5.0f;
+			dtmInfo.contourSettings[1u].heightInterval = 13.0f;
+			dtmInfo.contourSettings[1u].lineStyleInfo.color = float32_t4(0.8f, 0.4f, 0.3f, 1.0f);
+
 			// PRESS 1, 2, 3 TO SWITCH HEIGHT SHADING MODE
 			// 1 - DISCRETE_VARIABLE_LENGTH_INTERVALS
 			// 2 - DISCRETE_FIXED_LENGTH_INTERVALS
diff --git a/62_CAD/shaders/main_pipeline/fragment_shader.hlsl b/62_CAD/shaders/main_pipeline/fragment_shader.hlsl
index fb6b6e8e8..31c25a6e5 100644
--- a/62_CAD/shaders/main_pipeline/fragment_shader.hlsl
+++ b/62_CAD/shaders/main_pipeline/fragment_shader.hlsl
@@ -848,7 +848,10 @@ float4 fragMain(PSInput input) : SV_TARGET
         if (dtmSettings.drawHeightShadingEnabled())
             dtmColor = blendColorOnTop(dtmColor, calculateDTMHeightColor(dtmSettings.heightShadingSettings, v, heightDeriv, input.position.xy, height));
         if (dtmSettings.drawContourEnabled())
-            dtmColor = blendColorOnTop(dtmColor, calculateDTMContourColor(dtmSettings.contourSettings[0u], v, edgePoints, input.position.xy, height));
+        {
+            for(uint32_t i = 0; i < dtmSettings.contourSettingsCount; ++i) // TODO: should reverse the order with blendUnder
+                dtmColor = blendColorOnTop(dtmColor, calculateDTMContourColor(dtmSettings.contourSettings[i], v, edgePoints, input.position.xy, height));
+        }
         if (dtmSettings.drawOutlineEnabled())
             dtmColor = blendColorOnTop(dtmColor, calculateDTMOutlineColor(dtmSettings.outlineLineStyleIdx, v, edgePoints, input.position.xy, baryCoord, height));
 

From 287688e18f75aa1a24c054691e5e0e197c9f751f Mon Sep 17 00:00:00 2001
From: Przemek <minikers21@gmail.com>
Date: Tue, 22 Apr 2025 11:05:58 +0200
Subject: [PATCH 200/529] Small example 62 update

---
 62_CAD/main.cpp | 8 ++++----
 1 file changed, 4 insertions(+), 4 deletions(-)

diff --git a/62_CAD/main.cpp b/62_CAD/main.cpp
index 5f48af58a..3a32b8fa8 100644
--- a/62_CAD/main.cpp
+++ b/62_CAD/main.cpp
@@ -3245,10 +3245,10 @@ class ComputerAidedDesign final : public examples::SimpleWindowedApplication, pu
 			mesh.setVertices(std::move(vertices));
 			mesh.setIndices(std::move(indices));
 
-			DTMSettingsInfo dtmInfo;
-			//dtmInfo.mode = E_DTM_MODE::HEIGHT_SHADING | E_DTM_MODE::CONTOUR | E_DTM_MODE::OUTLINE;
-			//dtmInfo.mode = E_DTM_MODE::HEIGHT_SHADING;
-			dtmInfo.mode = E_DTM_MODE::CONTOUR;
+			DTMSettingsInfo dtmInfo{};
+			dtmInfo.mode |= E_DTM_MODE::OUTLINE;
+			dtmInfo.mode |= E_DTM_MODE::HEIGHT_SHADING;
+			dtmInfo.mode |= E_DTM_MODE::CONTOUR;
 
 			dtmInfo.outlineStyleInfo.screenSpaceLineWidth = 0.0f;
 			dtmInfo.outlineStyleInfo.worldSpaceLineWidth = 3.0f;

From daa51f68fc040b8927a9c5d1e0bd8e6dc4e3cac1 Mon Sep 17 00:00:00 2001
From: Przemek <minikers21@gmail.com>
Date: Tue, 22 Apr 2025 21:05:25 +0200
Subject: [PATCH 201/529] Refactor

---
 62_CAD/CTriangleMesh.h                        |   2 +-
 62_CAD/main.cpp                               |  10 +-
 62_CAD/shaders/globals.hlsl                   |   2 +-
 62_CAD/shaders/main_pipeline/dtm.hlsl         | 678 ++++++++++++++++
 .../main_pipeline/fragment_shader.hlsl        | 723 +-----------------
 5 files changed, 695 insertions(+), 720 deletions(-)
 create mode 100644 62_CAD/shaders/main_pipeline/dtm.hlsl

diff --git a/62_CAD/CTriangleMesh.h b/62_CAD/CTriangleMesh.h
index 67daf5221..78f7dd99f 100644
--- a/62_CAD/CTriangleMesh.h
+++ b/62_CAD/CTriangleMesh.h
@@ -78,7 +78,7 @@ struct DTMSettingsInfo
 {
 	static constexpr uint32_t MaxContourSettings = DTMSettings::MaxContourSettings;
 
-	uint32_t mode = 0u;
+	uint32_t mode = 0u; // related to E_DTM_MODE
 
 	// outline
 	LineStyleInfo outlineStyleInfo;
diff --git a/62_CAD/main.cpp b/62_CAD/main.cpp
index 3a32b8fa8..9f5392d4b 100644
--- a/62_CAD/main.cpp
+++ b/62_CAD/main.cpp
@@ -1444,8 +1444,6 @@ class ComputerAidedDesign final : public examples::SimpleWindowedApplication, pu
 	
 	void addObjects(SIntendedSubmitInfo& intendedNextSubmit)
 	{
-		// TODO[Przemek]: add your own case, you won't call any other drawResourcesFiller function, only drawMesh with your custom made Mesh (for start it can be a single triangle)
-
 		// we record upload of our objects and if we failed to allocate we submit everything
 		if (!intendedNextSubmit.valid())
 		{
@@ -3232,9 +3230,9 @@ class ComputerAidedDesign final : public examples::SimpleWindowedApplication, pu
 
 			// SINGLE TRIANGLE
 			/*core::vector<TriangleMeshVertex> vertices = {
-				{ float32_t2(0.0, 0.0), -20.0 },
-				{ float32_t2(200.0, 200.0), 100.0 },
-				{ float32_t2(200.0, -200.0), 80.0 }
+				{ float64_t2(0.0, 0.0), -20.0 },
+				{ float64_t2(-200.0, -200.0), 100.0 },
+				{ float64_t2(200.0, -100.0), 80.0 },
 			};
 
 			core::vector<uint32_t> indices = {
@@ -3251,7 +3249,7 @@ class ComputerAidedDesign final : public examples::SimpleWindowedApplication, pu
 			dtmInfo.mode |= E_DTM_MODE::CONTOUR;
 
 			dtmInfo.outlineStyleInfo.screenSpaceLineWidth = 0.0f;
-			dtmInfo.outlineStyleInfo.worldSpaceLineWidth = 3.0f;
+			dtmInfo.outlineStyleInfo.worldSpaceLineWidth = 1.0f;
 			dtmInfo.outlineStyleInfo.color = float32_t4(0.0f, 0.39f, 0.0f, 1.0f);
 			std::array<double, 4> outlineStipplePattern = { 0.0f, -5.0f, 20.0f, -5.0f };
 			dtmInfo.outlineStyleInfo.setStipplePatternData(outlineStipplePattern);
diff --git a/62_CAD/shaders/globals.hlsl b/62_CAD/shaders/globals.hlsl
index bd700785d..045e11f1e 100644
--- a/62_CAD/shaders/globals.hlsl
+++ b/62_CAD/shaders/globals.hlsl
@@ -300,7 +300,7 @@ NBL_CONSTEXPR float InvalidStyleStretchValue = nbl::hlsl::numeric_limits<float>:
 struct TriangleMeshVertex
 {
     pfloat64_t2 pos;
-    pfloat64_t height;
+    pfloat64_t height; // TODO: can be of type float32_t instead
 };
 
 // The color parameter is also used for styling non-curve objects such as text glyphs and hatches with solid color
diff --git a/62_CAD/shaders/main_pipeline/dtm.hlsl b/62_CAD/shaders/main_pipeline/dtm.hlsl
new file mode 100644
index 000000000..5b41eabb3
--- /dev/null
+++ b/62_CAD/shaders/main_pipeline/dtm.hlsl
@@ -0,0 +1,678 @@
+#ifndef _CAD_EXAMPLE_DTM_HLSL_INCLUDED_
+#define _CAD_EXAMPLE_DTM_HLSL_INCLUDED_
+
+#include <nbl/builtin/hlsl/shapes/line.hlsl>
+#include <nbl/builtin/hlsl/algorithm.hlsl>
+
+// TODO: functions outside of the "dtm" namespace need to be moved to another file
+
+// for usage in upper_bound function
+struct StyleAccessor
+{
+    LineStyle style;
+    using value_type = float;
+
+    float operator[](const uint32_t ix)
+    {
+        return style.getStippleValue(ix);
+    }
+};
+
+template<typename CurveType>
+struct StyleClipper
+{
+    using float_t = typename CurveType::scalar_t;
+    using float_t2 = typename CurveType::float_t2;
+    using float_t3 = typename CurveType::float_t3;
+    NBL_CONSTEXPR_STATIC_INLINE float_t AccuracyThresholdT = 0.000001;
+
+    static StyleClipper<CurveType> construct(
+        LineStyle style,
+        CurveType curve,
+        typename CurveType::ArcLengthCalculator arcLenCalc,
+        float phaseShift,
+        float stretch,
+        float worldToScreenRatio)
+    {
+        StyleClipper<CurveType> ret = { style, curve, arcLenCalc, phaseShift, stretch, worldToScreenRatio, 0.0f, 0.0f, 0.0f, 0.0f };
+
+        // values for non-uniform stretching with a rigid segment
+        if (style.rigidSegmentIdx != InvalidRigidSegmentIndex && stretch != 1.0f)
+        {
+            // rigidSegment info in old non stretched pattern
+            ret.rigidSegmentStart = (style.rigidSegmentIdx >= 1u) ? style.getStippleValue(style.rigidSegmentIdx - 1u) : 0.0f;
+            ret.rigidSegmentEnd = (style.rigidSegmentIdx < style.stipplePatternSize) ? style.getStippleValue(style.rigidSegmentIdx) : 1.0f;
+            ret.rigidSegmentLen = ret.rigidSegmentEnd - ret.rigidSegmentStart;
+            // stretch value for non rigid segments
+            ret.nonRigidSegmentStretchValue = (stretch - ret.rigidSegmentLen) / (1.0f - ret.rigidSegmentLen);
+            // rigidSegment info to new stretched pattern
+            ret.rigidSegmentStart *= ret.nonRigidSegmentStretchValue / stretch; // get the new normalized rigid segment start
+            ret.rigidSegmentLen /= stretch; // get the new rigid segment normalized len
+            ret.rigidSegmentEnd = ret.rigidSegmentStart + ret.rigidSegmentLen; // get the new normalized rigid segment end 
+        }
+        else
+        {
+            ret.nonRigidSegmentStretchValue = stretch;
+        }
+
+        return ret;
+    }
+
+    // For non-uniform stretching with a rigid segment (the one segement that shouldn't stretch) the whole pattern changes
+    // instead of transforming each of the style.stipplePattern values (max 14 of them), we transform the normalized place in pattern
+    float getRealNormalizedPlaceInPattern(float normalizedPlaceInPattern)
+    {
+        if (style.rigidSegmentIdx != InvalidRigidSegmentIndex && stretch != 1.0f)
+        {
+            float ret = min(normalizedPlaceInPattern, rigidSegmentStart) / nonRigidSegmentStretchValue; // unstretch parts before rigid segment
+            ret += max(normalizedPlaceInPattern - rigidSegmentEnd, 0.0f) / nonRigidSegmentStretchValue; // unstretch parts after rigid segment
+            ret += max(min(rigidSegmentLen, normalizedPlaceInPattern - rigidSegmentStart), 0.0f); // unstretch parts inside rigid segment
+            ret *= stretch;
+            return ret;
+        }
+        else
+        {
+            return normalizedPlaceInPattern;
+        }
+    }
+
+    float_t2 operator()(float_t t)
+    {
+        // basicaly 0.0 and 1.0 but with a guardband to discard outside the range
+        const float_t minT = 0.0 - 1.0;
+        const float_t maxT = 1.0 + 1.0;
+
+        StyleAccessor styleAccessor = { style };
+        const float_t reciprocalStretchedStipplePatternLen = style.reciprocalStipplePatternLen / stretch;
+        const float_t patternLenInScreenSpace = 1.0 / (worldToScreenRatio * style.reciprocalStipplePatternLen);
+
+        const float_t arcLen = arcLenCalc.calcArcLen(t);
+        const float_t worldSpaceArcLen = arcLen * float_t(worldToScreenRatio);
+        float_t normalizedPlaceInPattern = frac(worldSpaceArcLen * reciprocalStretchedStipplePatternLen + phaseShift);
+        normalizedPlaceInPattern = getRealNormalizedPlaceInPattern(normalizedPlaceInPattern);
+        uint32_t patternIdx = nbl::hlsl::upper_bound(styleAccessor, 0, style.stipplePatternSize, normalizedPlaceInPattern);
+
+        const float_t InvalidT = nbl::hlsl::numeric_limits<float32_t>::infinity;
+        float_t2 ret = float_t2(InvalidT, InvalidT);
+
+        // odd patternIdx means a "no draw section" and current candidate should split into two nearest draw sections
+        const bool notInDrawSection = patternIdx & 0x1;
+
+        // TODO[Erfan]: Disable this piece of code after clipping, and comment the reason, that the bezier start and end at 0.0 and 1.0 should be in drawable sections
+        float_t minDrawT = 0.0;
+        float_t maxDrawT = 1.0;
+        {
+            float_t normalizedPlaceInPatternBegin = frac(phaseShift);
+            normalizedPlaceInPatternBegin = getRealNormalizedPlaceInPattern(normalizedPlaceInPatternBegin);
+            uint32_t patternIdxBegin = nbl::hlsl::upper_bound(styleAccessor, 0, style.stipplePatternSize, normalizedPlaceInPatternBegin);
+            const bool BeginInNonDrawSection = patternIdxBegin & 0x1;
+
+            if (BeginInNonDrawSection)
+            {
+                float_t diffToRightDrawableSection = (patternIdxBegin == style.stipplePatternSize) ? 1.0 : styleAccessor[patternIdxBegin];
+                diffToRightDrawableSection -= normalizedPlaceInPatternBegin;
+                float_t scrSpcOffsetToArcLen1 = diffToRightDrawableSection * patternLenInScreenSpace * ((patternIdxBegin != style.rigidSegmentIdx) ? nonRigidSegmentStretchValue : 1.0);
+                const float_t arcLenForT1 = 0.0 + scrSpcOffsetToArcLen1;
+                minDrawT = arcLenCalc.calcArcLenInverse(curve, minT, maxT, arcLenForT1, AccuracyThresholdT, 0.0);
+            }
+
+            // Completely in non-draw section -> clip away:
+            if (minDrawT >= 1.0)
+                return ret;
+
+            const float_t arcLenEnd = arcLenCalc.calcArcLen(1.0);
+            const float_t worldSpaceArcLenEnd = arcLenEnd * float_t(worldToScreenRatio);
+            float_t normalizedPlaceInPatternEnd = frac(worldSpaceArcLenEnd * reciprocalStretchedStipplePatternLen + phaseShift);
+            normalizedPlaceInPatternEnd = getRealNormalizedPlaceInPattern(normalizedPlaceInPatternEnd);
+            uint32_t patternIdxEnd = nbl::hlsl::upper_bound(styleAccessor, 0, style.stipplePatternSize, normalizedPlaceInPatternEnd);
+            const bool EndInNonDrawSection = patternIdxEnd & 0x1;
+
+            if (EndInNonDrawSection)
+            {
+                float_t diffToLeftDrawableSection = (patternIdxEnd == 0) ? 0.0 : styleAccessor[patternIdxEnd - 1];
+                diffToLeftDrawableSection -= normalizedPlaceInPatternEnd;
+                float_t scrSpcOffsetToArcLen0 = diffToLeftDrawableSection * patternLenInScreenSpace * ((patternIdxEnd != style.rigidSegmentIdx) ? nonRigidSegmentStretchValue : 1.0);
+                const float_t arcLenForT0 = arcLenEnd + scrSpcOffsetToArcLen0;
+                maxDrawT = arcLenCalc.calcArcLenInverse(curve, minT, maxT, arcLenForT0, AccuracyThresholdT, 1.0);
+            }
+        }
+
+        if (notInDrawSection)
+        {
+            float toScreenSpaceLen = patternLenInScreenSpace * ((patternIdx != style.rigidSegmentIdx) ? nonRigidSegmentStretchValue : 1.0);
+
+            float_t diffToLeftDrawableSection = (patternIdx == 0) ? 0.0 : styleAccessor[patternIdx - 1];
+            diffToLeftDrawableSection -= normalizedPlaceInPattern;
+            float_t scrSpcOffsetToArcLen0 = diffToLeftDrawableSection * toScreenSpaceLen;
+            const float_t arcLenForT0 = arcLen + scrSpcOffsetToArcLen0;
+            float_t t0 = arcLenCalc.calcArcLenInverse(curve, minT, maxT, arcLenForT0, AccuracyThresholdT, t);
+            t0 = clamp(t0, minDrawT, maxDrawT);
+
+            float_t diffToRightDrawableSection = (patternIdx == style.stipplePatternSize) ? 1.0 : styleAccessor[patternIdx];
+            diffToRightDrawableSection -= normalizedPlaceInPattern;
+            float_t scrSpcOffsetToArcLen1 = diffToRightDrawableSection * toScreenSpaceLen;
+            const float_t arcLenForT1 = arcLen + scrSpcOffsetToArcLen1;
+            float_t t1 = arcLenCalc.calcArcLenInverse(curve, minT, maxT, arcLenForT1, AccuracyThresholdT, t);
+            t1 = clamp(t1, minDrawT, maxDrawT);
+
+            ret = float_t2(t0, t1);
+        }
+        else
+        {
+            t = clamp(t, minDrawT, maxDrawT);
+            ret = float_t2(t, t);
+        }
+
+        return ret;
+    }
+
+    LineStyle style;
+    CurveType curve;
+    typename CurveType::ArcLengthCalculator arcLenCalc;
+    float phaseShift;
+    float stretch;
+    float worldToScreenRatio;
+    // precomp value for non uniform stretching
+    float rigidSegmentStart;
+    float rigidSegmentEnd;
+    float rigidSegmentLen;
+    float nonRigidSegmentStretchValue;
+};
+
+typedef StyleClipper< nbl::hlsl::shapes::Quadratic<float> > BezierStyleClipper;
+typedef StyleClipper< nbl::hlsl::shapes::Line<float> > LineStyleClipper;
+
+template<typename float_t>
+struct DefaultClipper
+{
+    using float_t2 = vector<float_t, 2>;
+    NBL_CONSTEXPR_STATIC_INLINE float_t AccuracyThresholdT = 0.0;
+
+    static DefaultClipper construct()
+    {
+        DefaultClipper ret;
+        return ret;
+    }
+
+    inline float_t2 operator()(const float_t t)
+    {
+        const float_t ret = clamp(t, 0.0, 1.0);
+        return float_t2(ret, ret);
+    }
+};
+
+template<typename CurveType, typename Clipper = DefaultClipper<typename CurveType::scalar_t> >
+struct ClippedSignedDistance
+{
+    using float_t = typename CurveType::scalar_t;
+    using float_t2 = typename CurveType::float_t2;
+    using float_t3 = typename CurveType::float_t3;
+
+    const static float_t sdf(CurveType curve, float_t2 pos, float_t thickness, bool isRoadStyle, Clipper clipper = DefaultClipper<typename CurveType::scalar_t>::construct())
+    {
+        typename CurveType::Candidates candidates = curve.getClosestCandidates(pos);
+
+        const float_t InvalidT = nbl::hlsl::numeric_limits<float32_t>::max;
+        // TODO: Fix and test, we're not working with squared distance anymore
+        const float_t MAX_DISTANCE_SQUARED = (thickness + 1.0f) * (thickness + 1.0f); // TODO: ' + 1' is too much?
+
+        bool clipped = false;
+        float_t closestDistanceSquared = MAX_DISTANCE_SQUARED;
+        float_t closestT = InvalidT;
+        [[unroll(CurveType::MaxCandidates)]]
+        for (uint32_t i = 0; i < CurveType::MaxCandidates; i++)
+        {
+            const float_t candidateDistanceSquared = length(curve.evaluate(candidates[i]) - pos);
+            if (candidateDistanceSquared < closestDistanceSquared)
+            {
+                float_t2 snappedTs = clipper(candidates[i]);
+
+                if (snappedTs[0] == InvalidT)
+                {
+                    continue;
+                }
+
+                if (snappedTs[0] != candidates[i])
+                {
+                    // left snapped or clamped
+                    const float_t leftSnappedCandidateDistanceSquared = length(curve.evaluate(snappedTs[0]) - pos);
+                    if (leftSnappedCandidateDistanceSquared < closestDistanceSquared)
+                    {
+                        clipped = true;
+                        closestT = snappedTs[0];
+                        closestDistanceSquared = leftSnappedCandidateDistanceSquared;
+                    }
+
+                    if (snappedTs[0] != snappedTs[1])
+                    {
+                        // right snapped or clamped
+                        const float_t rightSnappedCandidateDistanceSquared = length(curve.evaluate(snappedTs[1]) - pos);
+                        if (rightSnappedCandidateDistanceSquared < closestDistanceSquared)
+                        {
+                            clipped = true;
+                            closestT = snappedTs[1];
+                            closestDistanceSquared = rightSnappedCandidateDistanceSquared;
+                        }
+                    }
+                }
+                else
+                {
+                    // no snapping
+                    if (candidateDistanceSquared < closestDistanceSquared)
+                    {
+                        clipped = false;
+                        closestT = candidates[i];
+                        closestDistanceSquared = candidateDistanceSquared;
+                    }
+                }
+            }
+        }
+
+
+        float_t roundedDistance = closestDistanceSquared - thickness;
+        if (!isRoadStyle)
+        {
+            return roundedDistance;
+        }
+        else
+        {
+            const float_t aaWidth = globals.antiAliasingFactor;
+            float_t rectCappedDistance = roundedDistance;
+
+            if (clipped)
+            {
+                float_t2 q = mul(curve.getLocalCoordinateSpace(closestT), pos - curve.evaluate(closestT));
+                rectCappedDistance = capSquare(q, thickness, aaWidth);
+            }
+
+            return rectCappedDistance;
+        }
+    }
+
+    static float capSquare(float_t2 q, float_t th, float_t aaWidth)
+    {
+        float_t2 d = abs(q) - float_t2(aaWidth, th);
+        return length(max(d, 0.0)) + min(max(d.x, d.y), 0.0);
+    }
+};
+
+namespace dtm
+{
+
+// for usage in upper_bound function
+struct DTMSettingsHeightsAccessor
+{
+    DTMHeightShadingSettings settings;
+    using value_type = float;
+
+    float operator[](const uint32_t ix)
+    {
+        return settings.heightColorMapHeights[ix];
+    }
+};
+
+float dot2(in float2 vec)
+{
+    return dot(vec, vec);
+}
+
+// TODO: Later move these functions and structs to dtmSettings.hlsl and a namespace like dtmSettings::height_shading or dtmSettings::contours, etc..
+struct HeightSegmentTransitionData
+{
+    float currentHeight;
+    float4 currentSegmentColor;
+    float boundaryHeight;
+    float4 otherSegmentColor;
+};
+
+// This function interpolates between the current and nearest segment colors based on the
+// screen-space distance to the segment boundary. The result is a smoothly blended color
+// useful for visualizing discrete height levels without harsh edges.
+float4 smoothHeightSegmentTransition(in HeightSegmentTransitionData transitionInfo, in float heightDeriv)
+{
+    float pxDistanceToNearestSegment = abs((transitionInfo.currentHeight - transitionInfo.boundaryHeight) / heightDeriv);
+    float nearestSegmentColorCoverage = smoothstep(-globals.antiAliasingFactor, globals.antiAliasingFactor, pxDistanceToNearestSegment);
+    float4 localHeightColor = lerp(transitionInfo.otherSegmentColor, transitionInfo.currentSegmentColor, nearestSegmentColorCoverage);
+    return localHeightColor;
+}
+
+// Computes the continuous position of a height value within uniform intervals.
+// flooring this value will give the interval index
+//
+// If `isCenteredShading` is true, the intervals are centered around `minHeight`, meaning the
+// first interval spans [minHeight - intervalLength / 2.0, minHeight + intervalLength / 2.0].
+// Otherwise, intervals are aligned from `minHeight` upward, so the first interval spans
+// [minHeight, minHeight + intervalLength].
+//
+// Parameters:
+// - height: The height value to classify.
+// - minHeight: The reference starting height for interval calculation.
+// - intervalLength: The length of each interval segment.
+// - isCenteredShading: Whether to center the shading intervals around minHeight.
+//
+// Returns:
+// - A float representing the continuous position within the interval grid.
+float getIntervalPosition(in float height, in float minHeight, in float intervalLength, in bool isCenteredShading)
+{
+    if (isCenteredShading)
+        return ((height - minHeight) / intervalLength + 0.5f);
+    else
+        return ((height - minHeight) / intervalLength);
+}
+
+void getIntervalHeightAndColor(in int intervalIndex, in DTMHeightShadingSettings settings, out float4 outIntervalColor, out float outIntervalHeight)
+{
+    float minShadingHeight = settings.heightColorMapHeights[0];
+    float heightForColor = minShadingHeight + float(intervalIndex) * settings.intervalIndexToHeightMultiplier;
+
+    if (settings.isCenteredShading)
+        outIntervalHeight = minShadingHeight + (float(intervalIndex) - 0.5) * settings.intervalLength;
+    else
+        outIntervalHeight = minShadingHeight + (float(intervalIndex)) * settings.intervalLength;
+
+    DTMSettingsHeightsAccessor dtmHeightsAccessor = { settings };
+    uint32_t upperBoundHeightIndex = min(nbl::hlsl::upper_bound(dtmHeightsAccessor, 0, settings.heightColorEntryCount, heightForColor), settings.heightColorEntryCount - 1u);
+    uint32_t lowerBoundHeightIndex = max(upperBoundHeightIndex - 1, 0);
+
+    float upperBoundHeight = settings.heightColorMapHeights[upperBoundHeightIndex];
+    float lowerBoundHeight = settings.heightColorMapHeights[lowerBoundHeightIndex];
+
+    float4 upperBoundColor = settings.heightColorMapColors[upperBoundHeightIndex];
+    float4 lowerBoundColor = settings.heightColorMapColors[lowerBoundHeightIndex];
+
+    if (upperBoundHeight == lowerBoundHeight)
+    {
+        outIntervalColor = upperBoundColor;
+    }
+    else
+    {
+        float interpolationVal = (heightForColor - lowerBoundHeight) / (upperBoundHeight - lowerBoundHeight);
+        outIntervalColor = lerp(lowerBoundColor, upperBoundColor, interpolationVal);
+    }
+}
+
+float3 calculateDTMTriangleBarycentrics(in float2 v1, in float2 v2, in float2 v3, in float2 p)
+{
+    float denom = (v2.x - v1.x) * (v3.y - v1.y) - (v3.x - v1.x) * (v2.y - v1.y);
+    float u = ((v2.y - v3.y) * (p.x - v3.x) + (v3.x - v2.x) * (p.y - v3.y)) / denom;
+    float v = ((v3.y - v1.y) * (p.x - v3.x) + (v1.x - v3.x) * (p.y - v3.y)) / denom;
+    float w = 1.0 - u - v;
+    return float3(u, v, w);
+}
+
+float4 calculateDTMHeightColor(in DTMHeightShadingSettings settings, in float3 v[3], in float heightDeriv, in float2 fragPos, in float height)
+{
+    float4 outputColor = float4(0.0f, 0.0f, 0.0f, 0.0f);
+
+    // HEIGHT SHADING
+    const uint32_t heightMapSize = settings.heightColorEntryCount;
+    float minShadingHeight = settings.heightColorMapHeights[0];
+    float maxShadingHeight = settings.heightColorMapHeights[heightMapSize - 1];
+
+    if (heightMapSize > 0)
+    {
+        // partially based on https://www.shadertoy.com/view/XsXSz4 by Inigo Quilez
+        float2 e0 = v[1] - v[0];
+        float2 e1 = v[2] - v[1];
+        float2 e2 = v[0] - v[2];
+
+        float triangleAreaSign = -sign(e0.x * e2.y - e0.y * e2.x);
+        float2 v0 = fragPos - v[0];
+        float2 v1 = fragPos - v[1];
+        float2 v2 = fragPos - v[2];
+
+        float distanceToLine0 = sqrt(dot2(v0 - e0 * dot(v0, e0) / dot(e0, e0)));
+        float distanceToLine1 = sqrt(dot2(v1 - e1 * dot(v1, e1) / dot(e1, e1)));
+        float distanceToLine2 = sqrt(dot2(v2 - e2 * dot(v2, e2) / dot(e2, e2)));
+
+        float line0Sdf = distanceToLine0 * triangleAreaSign * sign(v0.x * e0.y - v0.y * e0.x);
+        float line1Sdf = distanceToLine1 * triangleAreaSign * sign(v1.x * e1.y - v1.y * e1.x);
+        float line2Sdf = distanceToLine2 * triangleAreaSign * sign(v2.x * e2.y - v2.y * e2.x);
+        float line3Sdf = (minShadingHeight - height) / heightDeriv;
+        float line4Sdf = (height - maxShadingHeight) / heightDeriv;
+
+        float convexPolygonSdf = max(line0Sdf, line1Sdf);
+        convexPolygonSdf = max(convexPolygonSdf, line2Sdf);
+        convexPolygonSdf = max(convexPolygonSdf, line3Sdf);
+        convexPolygonSdf = max(convexPolygonSdf, line4Sdf);
+
+        outputColor.a = 1.0f - smoothstep(0.0f, globals.antiAliasingFactor + globals.antiAliasingFactor, convexPolygonSdf);
+
+        // calculate height color
+        E_HEIGHT_SHADING_MODE mode = settings.determineHeightShadingMode();
+        if (mode == E_HEIGHT_SHADING_MODE::DISCRETE_VARIABLE_LENGTH_INTERVALS)
+        {
+            DTMSettingsHeightsAccessor dtmHeightsAccessor = { settings };
+            int upperBoundIndex = nbl::hlsl::upper_bound(dtmHeightsAccessor, 0, heightMapSize, height);
+            int mapIndex = max(upperBoundIndex - 1, 0);
+            int mapIndexPrev = max(mapIndex - 1, 0);
+            int mapIndexNext = min(mapIndex + 1, heightMapSize - 1);
+
+            // logic explainer: if colorIdx is 0.0 then it means blend with next
+            // if color idx is >= length of the colours array then it means it's also > 0.0 and this blend with prev is true
+            // if color idx is > 0 and < len - 1, then it depends on the current pixel's height value and two closest height values
+            bool blendWithPrev = (mapIndex > 0)
+                && (mapIndex >= heightMapSize - 1 || (height * 2.0 < settings.heightColorMapHeights[upperBoundIndex] + settings.heightColorMapHeights[mapIndex]));
+
+            HeightSegmentTransitionData transitionInfo;
+            transitionInfo.currentHeight = height;
+            transitionInfo.currentSegmentColor = settings.heightColorMapColors[mapIndex];
+            transitionInfo.boundaryHeight = blendWithPrev ? settings.heightColorMapHeights[mapIndex] : settings.heightColorMapHeights[mapIndexNext];
+            transitionInfo.otherSegmentColor = blendWithPrev ? settings.heightColorMapColors[mapIndexPrev] : settings.heightColorMapColors[mapIndexNext];
+
+            float4 localHeightColor = smoothHeightSegmentTransition(transitionInfo, heightDeriv);
+            outputColor.rgb = localHeightColor.rgb;
+            outputColor.a *= localHeightColor.a;
+        }
+        else if (mode == E_HEIGHT_SHADING_MODE::DISCRETE_FIXED_LENGTH_INTERVALS)
+        {
+            float intervalPosition = getIntervalPosition(height, minShadingHeight, settings.intervalLength, settings.isCenteredShading);
+            float positionWithinInterval = frac(intervalPosition);
+            int intervalIndex = nbl::hlsl::_static_cast<int>(intervalPosition);
+
+            float4 currentIntervalColor;
+            float currentIntervalHeight;
+            getIntervalHeightAndColor(intervalIndex, settings, currentIntervalColor, currentIntervalHeight);
+
+            bool blendWithPrev = (positionWithinInterval < 0.5f);
+
+            HeightSegmentTransitionData transitionInfo;
+            transitionInfo.currentHeight = height;
+            transitionInfo.currentSegmentColor = currentIntervalColor;
+            if (blendWithPrev)
+            {
+                int prevIntervalIdx = max(intervalIndex - 1, 0);
+                float prevIntervalHeight; // unused, the currentIntervalHeight is the boundary height between current and prev
+                getIntervalHeightAndColor(prevIntervalIdx, settings, transitionInfo.otherSegmentColor, prevIntervalHeight);
+                transitionInfo.boundaryHeight = currentIntervalHeight;
+            }
+            else
+            {
+                int nextIntervalIdx = intervalIndex + 1;
+                getIntervalHeightAndColor(nextIntervalIdx, settings, transitionInfo.otherSegmentColor, transitionInfo.boundaryHeight);
+            }
+
+            float4 localHeightColor = smoothHeightSegmentTransition(transitionInfo, heightDeriv);
+            outputColor.rgb = localHeightColor.rgb;
+            outputColor.a *= localHeightColor.a;
+        }
+        else if (mode == E_HEIGHT_SHADING_MODE::CONTINOUS_INTERVALS)
+        {
+            DTMSettingsHeightsAccessor dtmHeightsAccessor = { settings };
+            uint32_t upperBoundHeightIndex = nbl::hlsl::upper_bound(dtmHeightsAccessor, 0, heightMapSize, height);
+            uint32_t lowerBoundHeightIndex = upperBoundHeightIndex == 0 ? upperBoundHeightIndex : upperBoundHeightIndex - 1;
+
+            float upperBoundHeight = settings.heightColorMapHeights[upperBoundHeightIndex];
+            float lowerBoundHeight = settings.heightColorMapHeights[lowerBoundHeightIndex];
+
+            float4 upperBoundColor = settings.heightColorMapColors[upperBoundHeightIndex];
+            float4 lowerBoundColor = settings.heightColorMapColors[lowerBoundHeightIndex];
+
+            float interpolationVal;
+            if (upperBoundHeightIndex == 0)
+                interpolationVal = 1.0f;
+            else
+                interpolationVal = (height - lowerBoundHeight) / (upperBoundHeight - lowerBoundHeight);
+
+            float4 localHeightColor = lerp(lowerBoundColor, upperBoundColor, interpolationVal);
+
+            outputColor.a *= localHeightColor.a;
+            outputColor.rgb = localHeightColor.rgb * outputColor.a + outputColor.rgb * (1.0f - outputColor.a);
+        }
+    }
+
+    return outputColor;
+}
+
+float4 calculateDTMContourColor(in DTMContourSettings contourSettings, in float3 v[3], in float2 fragPos, in float height)
+{
+    float4 outputColor = float4(0.0f, 0.0f, 0.0f, 0.0f);
+
+    LineStyle contourStyle = loadLineStyle(contourSettings.contourLineStyleIdx);
+    const float contourThickness = (contourStyle.screenSpaceLineWidth + contourStyle.worldSpaceLineWidth * globals.screenToWorldRatio) * 0.5f;
+    float stretch = 1.0f;
+    float phaseShift = 0.0f;
+
+    // TODO: move to ubo or push constants
+    const float startHeight = contourSettings.contourLinesStartHeight;
+    const float endHeight = contourSettings.contourLinesEndHeight;
+    const float interval = contourSettings.contourLinesHeightInterval;
+
+    // TODO: can be precomputed
+    const int maxContourLineIdx = (endHeight - startHeight) / interval;
+
+    // TODO: it actually can output a negative number, fix
+    int contourLineIdx = nbl::hlsl::_static_cast<int>((height - startHeight) / interval + 0.5f);
+    contourLineIdx = clamp(contourLineIdx, 0, maxContourLineIdx);
+    float contourLineHeight = startHeight + interval * contourLineIdx;
+
+    int contourLinePointsIdx = 0;
+    float2 contourLinePoints[2];
+    // TODO: case where heights we are looking for are on all three vertices
+    for (int i = 0; i < 3; ++i)
+    {
+        if (contourLinePointsIdx == 2)
+            break;
+
+        float3 p0 = v[i];
+        float3 p1 = v[(i + 1) % 3];
+
+        if (p1.z < p0.z)
+            nbl::hlsl::swap(p0, p1);
+
+        float minHeight = p0.z;
+        float maxHeight = p1.z;
+
+        if (height >= minHeight && height <= maxHeight)
+        {
+            float2 edge = float2(p1.x, p1.y) - float2(p0.x, p0.y);
+            float scale = (contourLineHeight - minHeight) / (maxHeight - minHeight);
+
+            contourLinePoints[contourLinePointsIdx] = scale * edge + float2(p0.x, p0.y);
+            ++contourLinePointsIdx;
+        }
+    }
+
+    if (contourLinePointsIdx == 2)
+    {
+        nbl::hlsl::shapes::Line<float> lineSegment = nbl::hlsl::shapes::Line<float>::construct(contourLinePoints[0], contourLinePoints[1]);
+
+        float distance = nbl::hlsl::numeric_limits<float>::max;
+        if (!contourStyle.hasStipples() || stretch == InvalidStyleStretchValue)
+        {
+            distance = ClippedSignedDistance< nbl::hlsl::shapes::Line<float> >::sdf(lineSegment, fragPos, contourThickness, contourStyle.isRoadStyleFlag);
+        }
+        else
+        {
+            // TODO:
+            // It might be beneficial to calculate distance between pixel and contour line to early out some pixels and save yourself from stipple sdf computations!
+            // where you only compute the complex sdf if abs((height - contourVal) / heightDeriv) <= aaFactor
+            nbl::hlsl::shapes::Line<float>::ArcLengthCalculator arcLenCalc = nbl::hlsl::shapes::Line<float>::ArcLengthCalculator::construct(lineSegment);
+            LineStyleClipper clipper = LineStyleClipper::construct(contourStyle, lineSegment, arcLenCalc, phaseShift, stretch, globals.worldToScreenRatio);
+            distance = ClippedSignedDistance<nbl::hlsl::shapes::Line<float>, LineStyleClipper>::sdf(lineSegment, fragPos, contourThickness, contourStyle.isRoadStyleFlag, clipper);
+        }
+
+        outputColor.a = 1.0f - smoothstep(-globals.antiAliasingFactor, globals.antiAliasingFactor, distance);
+        outputColor.a *= contourStyle.color.a;
+        outputColor.rgb = contourStyle.color.rgb;
+
+        return outputColor;
+    }
+
+    return float4(0.0f, 0.0f, 0.0f, 0.0f);
+}
+
+float4 calculateDTMOutlineColor(in uint outlineLineStyleIdx, in float3 v[3], in float2 fragPos, in float3 baryCoord, in float height)
+{
+    float4 outputColor;
+
+    LineStyle outlineStyle = loadLineStyle(outlineLineStyleIdx);
+    const float outlineThickness = (outlineStyle.screenSpaceLineWidth + outlineStyle.worldSpaceLineWidth * globals.screenToWorldRatio) * 0.5f;
+    const float phaseShift = 0.0f; // input.getCurrentPhaseShift();
+    const float stretch = 1.0f;
+
+    // index of vertex opposing an edge, needed for calculation of triangle heights
+    uint opposingVertexIdx[3];
+    opposingVertexIdx[0] = 2;
+    opposingVertexIdx[1] = 0;
+    opposingVertexIdx[2] = 1;
+
+    float minDistance = nbl::hlsl::numeric_limits<float>::max;
+    if (!outlineStyle.hasStipples() || stretch == InvalidStyleStretchValue)
+    {
+        for (int i = 0; i < 3; ++i)
+        {
+            float3 p0 = v[i];
+            float3 p1 = v[(i + 1) % 3];
+
+            float distance = nbl::hlsl::numeric_limits<float>::max;
+            nbl::hlsl::shapes::Line<float> lineSegment = nbl::hlsl::shapes::Line<float>::construct(float2(p0.x, p0.y), float2(p1.x, p1.y));
+            distance = ClippedSignedDistance<nbl::hlsl::shapes::Line<float> >::sdf(lineSegment, fragPos, outlineThickness, outlineStyle.isRoadStyleFlag);
+
+            minDistance = min(minDistance, distance);
+        }
+    }
+    else
+    {
+        for (int i = 0; i < 3; ++i)
+        {
+            float3 p0 = v[i];
+            float3 p1 = v[(i + 1) % 3];
+
+            // long story short, in order for stipple patterns to be consistent:
+            // - point with lesser x coord should be starting point
+            // - if x coord of both points are equal then point with lesser y value should be starting point
+            if (p1.x < p0.x)
+                nbl::hlsl::swap(p0, p1);
+            else if (p1.x == p0.x && p1.y < p0.y)
+                nbl::hlsl::swap(p0, p1);
+
+            nbl::hlsl::shapes::Line<float> lineSegment = nbl::hlsl::shapes::Line<float>::construct(float2(p0.x, p0.y), float2(p1.x, p1.y));
+
+            float distance = nbl::hlsl::numeric_limits<float>::max;
+            nbl::hlsl::shapes::Line<float>::ArcLengthCalculator arcLenCalc = nbl::hlsl::shapes::Line<float>::ArcLengthCalculator::construct(lineSegment);
+            LineStyleClipper clipper = LineStyleClipper::construct(outlineStyle, lineSegment, arcLenCalc, phaseShift, stretch, globals.worldToScreenRatio);
+            distance = ClippedSignedDistance<nbl::hlsl::shapes::Line<float>, LineStyleClipper>::sdf(lineSegment, fragPos, outlineThickness, outlineStyle.isRoadStyleFlag, clipper);
+
+            minDistance = min(minDistance, distance);
+        }
+
+    }
+
+    outputColor.a = 1.0f - smoothstep(-globals.antiAliasingFactor, globals.antiAliasingFactor, minDistance);
+    outputColor.a *= outlineStyle.color.a;
+    outputColor.rgb = outlineStyle.color.rgb;
+
+    return outputColor;
+}
+
+float4 blendUnder(in float4 srcColor, in float4 dstColor)
+{
+    dstColor.rgb = dstColor.rgb * dstColor.a + (1 - dstColor.a) * srcColor.a * srcColor.rgb;
+    dstColor.a = (1.0f - srcColor.a) * dstColor.a + srcColor.a;
+
+    return dstColor;
+}
+}
+
+#endif
\ No newline at end of file
diff --git a/62_CAD/shaders/main_pipeline/fragment_shader.hlsl b/62_CAD/shaders/main_pipeline/fragment_shader.hlsl
index 31c25a6e5..6c579dff6 100644
--- a/62_CAD/shaders/main_pipeline/fragment_shader.hlsl
+++ b/62_CAD/shaders/main_pipeline/fragment_shader.hlsl
@@ -1,5 +1,6 @@
 #define FRAGMENT_SHADER_INPUT
 #include "common.hlsl"
+#include "dtm.hlsl"
 #include <nbl/builtin/hlsl/shapes/beziers.hlsl>
 #include <nbl/builtin/hlsl/shapes/line.hlsl>
 #include <nbl/builtin/hlsl/algorithm.hlsl>
@@ -8,294 +9,7 @@
 #include <nbl/builtin/hlsl/spirv_intrinsics/fragment_shader_pixel_interlock.hlsl>
 #include <nbl/builtin/hlsl/jit/device_capabilities.hlsl>
 #include <nbl/builtin/hlsl/text_rendering/msdf.hlsl>
-#include <nbl/builtin/hlsl/spirv_intrinsics/fragment_shader_barycentric.hlsl>
-
-template<typename float_t>
-struct DefaultClipper
-{
-    using float_t2 = vector<float_t, 2>;
-    NBL_CONSTEXPR_STATIC_INLINE float_t AccuracyThresholdT = 0.0;
-
-    static DefaultClipper construct()
-    {
-        DefaultClipper ret;
-        return ret;
-    }
-
-    inline float_t2 operator()(const float_t t)
-    {
-        const float_t ret = clamp(t, 0.0, 1.0);
-        return float_t2(ret, ret);
-    }
-};
-
-// for usage in upper_bound function
-struct StyleAccessor
-{
-    LineStyle style;
-    using value_type = float;
-
-    float operator[](const uint32_t ix)
-    {
-        return style.getStippleValue(ix);
-    }
-};
-
-template<typename CurveType>
-struct StyleClipper
-{
-    using float_t = typename CurveType::scalar_t;
-    using float_t2 = typename CurveType::float_t2;
-    using float_t3 = typename CurveType::float_t3;
-    NBL_CONSTEXPR_STATIC_INLINE float_t AccuracyThresholdT = 0.000001;
-
-    static StyleClipper<CurveType> construct(
-        LineStyle style,
-        CurveType curve,
-        typename CurveType::ArcLengthCalculator arcLenCalc,
-        float phaseShift,
-        float stretch,
-        float worldToScreenRatio)
-    {
-        StyleClipper<CurveType> ret = { style, curve, arcLenCalc, phaseShift, stretch, worldToScreenRatio, 0.0f, 0.0f, 0.0f, 0.0f };
-
-        // values for non-uniform stretching with a rigid segment
-        if (style.rigidSegmentIdx != InvalidRigidSegmentIndex && stretch != 1.0f)
-        {
-            // rigidSegment info in old non stretched pattern
-            ret.rigidSegmentStart = (style.rigidSegmentIdx >= 1u) ? style.getStippleValue(style.rigidSegmentIdx - 1u) : 0.0f;
-            ret.rigidSegmentEnd = (style.rigidSegmentIdx < style.stipplePatternSize) ? style.getStippleValue(style.rigidSegmentIdx) : 1.0f;
-            ret.rigidSegmentLen = ret.rigidSegmentEnd - ret.rigidSegmentStart;
-            // stretch value for non rigid segments
-            ret.nonRigidSegmentStretchValue = (stretch - ret.rigidSegmentLen) / (1.0f - ret.rigidSegmentLen);
-            // rigidSegment info to new stretched pattern
-            ret.rigidSegmentStart *= ret.nonRigidSegmentStretchValue / stretch; // get the new normalized rigid segment start
-            ret.rigidSegmentLen /= stretch; // get the new rigid segment normalized len
-            ret.rigidSegmentEnd = ret.rigidSegmentStart + ret.rigidSegmentLen; // get the new normalized rigid segment end 
-        }
-        else
-        {
-            ret.nonRigidSegmentStretchValue = stretch;
-        }
-        
-        return ret;
-    }
-
-    // For non-uniform stretching with a rigid segment (the one segement that shouldn't stretch) the whole pattern changes
-    // instead of transforming each of the style.stipplePattern values (max 14 of them), we transform the normalized place in pattern
-    float getRealNormalizedPlaceInPattern(float normalizedPlaceInPattern)
-    {
-        if (style.rigidSegmentIdx != InvalidRigidSegmentIndex && stretch != 1.0f)
-        {
-            float ret = min(normalizedPlaceInPattern, rigidSegmentStart) / nonRigidSegmentStretchValue; // unstretch parts before rigid segment
-            ret += max(normalizedPlaceInPattern - rigidSegmentEnd, 0.0f) / nonRigidSegmentStretchValue; // unstretch parts after rigid segment
-            ret += max(min(rigidSegmentLen, normalizedPlaceInPattern - rigidSegmentStart), 0.0f); // unstretch parts inside rigid segment
-            ret *= stretch;
-            return ret;
-        }
-        else
-        {
-            return normalizedPlaceInPattern;
-        }
-    }
-
-    float_t2 operator()(float_t t)
-    {
-        // basicaly 0.0 and 1.0 but with a guardband to discard outside the range
-        const float_t minT = 0.0 - 1.0;
-        const float_t maxT = 1.0 + 1.0;
-
-        StyleAccessor styleAccessor = { style };
-        const float_t reciprocalStretchedStipplePatternLen = style.reciprocalStipplePatternLen / stretch;
-        const float_t patternLenInScreenSpace = 1.0 / (worldToScreenRatio * style.reciprocalStipplePatternLen);
-
-        const float_t arcLen = arcLenCalc.calcArcLen(t);
-        const float_t worldSpaceArcLen = arcLen * float_t(worldToScreenRatio);
-        float_t normalizedPlaceInPattern = frac(worldSpaceArcLen * reciprocalStretchedStipplePatternLen + phaseShift);
-        normalizedPlaceInPattern = getRealNormalizedPlaceInPattern(normalizedPlaceInPattern);
-        uint32_t patternIdx = nbl::hlsl::upper_bound(styleAccessor, 0, style.stipplePatternSize, normalizedPlaceInPattern);
-
-        const float_t InvalidT = nbl::hlsl::numeric_limits<float32_t>::infinity; 
-        float_t2 ret = float_t2(InvalidT, InvalidT);
-
-        // odd patternIdx means a "no draw section" and current candidate should split into two nearest draw sections
-        const bool notInDrawSection = patternIdx & 0x1;
-        
-        // TODO[Erfan]: Disable this piece of code after clipping, and comment the reason, that the bezier start and end at 0.0 and 1.0 should be in drawable sections
-        float_t minDrawT = 0.0;
-        float_t maxDrawT = 1.0;
-        {
-            float_t normalizedPlaceInPatternBegin = frac(phaseShift);
-            normalizedPlaceInPatternBegin = getRealNormalizedPlaceInPattern(normalizedPlaceInPatternBegin);
-            uint32_t patternIdxBegin = nbl::hlsl::upper_bound(styleAccessor, 0, style.stipplePatternSize, normalizedPlaceInPatternBegin);
-            const bool BeginInNonDrawSection = patternIdxBegin & 0x1;
-
-            if (BeginInNonDrawSection)
-            {
-                float_t diffToRightDrawableSection = (patternIdxBegin == style.stipplePatternSize) ? 1.0 : styleAccessor[patternIdxBegin];
-                diffToRightDrawableSection -= normalizedPlaceInPatternBegin;
-                float_t scrSpcOffsetToArcLen1 = diffToRightDrawableSection * patternLenInScreenSpace * ((patternIdxBegin != style.rigidSegmentIdx) ? nonRigidSegmentStretchValue : 1.0);
-                const float_t arcLenForT1 = 0.0 + scrSpcOffsetToArcLen1;
-                minDrawT = arcLenCalc.calcArcLenInverse(curve, minT, maxT, arcLenForT1, AccuracyThresholdT, 0.0);
-            }
-            
-            // Completely in non-draw section -> clip away:
-            if (minDrawT >= 1.0)
-                return ret;
-
-            const float_t arcLenEnd = arcLenCalc.calcArcLen(1.0);
-            const float_t worldSpaceArcLenEnd = arcLenEnd * float_t(worldToScreenRatio);
-            float_t normalizedPlaceInPatternEnd = frac(worldSpaceArcLenEnd * reciprocalStretchedStipplePatternLen + phaseShift);
-            normalizedPlaceInPatternEnd = getRealNormalizedPlaceInPattern(normalizedPlaceInPatternEnd);
-            uint32_t patternIdxEnd = nbl::hlsl::upper_bound(styleAccessor, 0, style.stipplePatternSize, normalizedPlaceInPatternEnd);
-            const bool EndInNonDrawSection = patternIdxEnd & 0x1;
-
-            if (EndInNonDrawSection)
-            {
-                float_t diffToLeftDrawableSection = (patternIdxEnd == 0) ? 0.0 : styleAccessor[patternIdxEnd - 1];
-                diffToLeftDrawableSection -= normalizedPlaceInPatternEnd;
-                float_t scrSpcOffsetToArcLen0 = diffToLeftDrawableSection * patternLenInScreenSpace * ((patternIdxEnd != style.rigidSegmentIdx) ? nonRigidSegmentStretchValue : 1.0);
-                const float_t arcLenForT0 = arcLenEnd + scrSpcOffsetToArcLen0;
-                maxDrawT = arcLenCalc.calcArcLenInverse(curve, minT, maxT, arcLenForT0, AccuracyThresholdT, 1.0);
-            }
-        }
-
-        if (notInDrawSection)
-        {
-            float toScreenSpaceLen = patternLenInScreenSpace * ((patternIdx != style.rigidSegmentIdx) ? nonRigidSegmentStretchValue : 1.0);
-
-            float_t diffToLeftDrawableSection = (patternIdx == 0) ? 0.0 : styleAccessor[patternIdx - 1];
-            diffToLeftDrawableSection -= normalizedPlaceInPattern;
-            float_t scrSpcOffsetToArcLen0 = diffToLeftDrawableSection * toScreenSpaceLen;
-            const float_t arcLenForT0 = arcLen + scrSpcOffsetToArcLen0;
-            float_t t0 = arcLenCalc.calcArcLenInverse(curve, minT, maxT, arcLenForT0, AccuracyThresholdT, t);
-            t0 = clamp(t0, minDrawT, maxDrawT);
-
-            float_t diffToRightDrawableSection = (patternIdx == style.stipplePatternSize) ? 1.0 : styleAccessor[patternIdx];
-            diffToRightDrawableSection -= normalizedPlaceInPattern;
-            float_t scrSpcOffsetToArcLen1 = diffToRightDrawableSection * toScreenSpaceLen;
-            const float_t arcLenForT1 = arcLen + scrSpcOffsetToArcLen1;
-            float_t t1 = arcLenCalc.calcArcLenInverse(curve, minT, maxT, arcLenForT1, AccuracyThresholdT, t);
-            t1 = clamp(t1, minDrawT, maxDrawT);
-
-            ret = float_t2(t0, t1);
-        }
-        else
-        {
-            t = clamp(t, minDrawT, maxDrawT);
-            ret = float_t2(t, t);
-        }
-
-        return ret;
-    }
-
-    LineStyle style;
-    CurveType curve;
-    typename CurveType::ArcLengthCalculator arcLenCalc;
-    float phaseShift;
-    float stretch;
-    float worldToScreenRatio;
-    // precomp value for non uniform stretching
-    float rigidSegmentStart;
-    float rigidSegmentEnd;
-    float rigidSegmentLen;
-    float nonRigidSegmentStretchValue;
-};
-
-template<typename CurveType, typename Clipper = DefaultClipper<typename CurveType::scalar_t> >
-struct ClippedSignedDistance
-{
-    using float_t = typename CurveType::scalar_t;
-    using float_t2 = typename CurveType::float_t2;
-    using float_t3 = typename CurveType::float_t3;
-
-    const static float_t sdf(CurveType curve, float_t2 pos, float_t thickness, bool isRoadStyle, Clipper clipper = DefaultClipper<typename CurveType::scalar_t>::construct())
-    {
-        typename CurveType::Candidates candidates = curve.getClosestCandidates(pos);
-
-        const float_t InvalidT = nbl::hlsl::numeric_limits<float32_t>::max;
-        // TODO: Fix and test, we're not working with squared distance anymore
-        const float_t MAX_DISTANCE_SQUARED = (thickness + 1.0f) * (thickness + 1.0f); // TODO: ' + 1' is too much?
-
-        bool clipped = false;
-        float_t closestDistanceSquared = MAX_DISTANCE_SQUARED;
-        float_t closestT = InvalidT;
-        [[unroll(CurveType::MaxCandidates)]]
-        for (uint32_t i = 0; i < CurveType::MaxCandidates; i++)
-        {
-            const float_t candidateDistanceSquared = length(curve.evaluate(candidates[i]) - pos);
-            if (candidateDistanceSquared < closestDistanceSquared)
-            {
-                float_t2 snappedTs = clipper(candidates[i]);
-
-                if (snappedTs[0] == InvalidT)
-                {
-                    continue;
-                }
-
-                if (snappedTs[0] != candidates[i])
-                {
-                    // left snapped or clamped
-                    const float_t leftSnappedCandidateDistanceSquared = length(curve.evaluate(snappedTs[0]) - pos);
-                    if (leftSnappedCandidateDistanceSquared < closestDistanceSquared)
-                    {
-                        clipped = true;
-                        closestT = snappedTs[0];
-                        closestDistanceSquared = leftSnappedCandidateDistanceSquared;
-                    }
-
-                    if (snappedTs[0] != snappedTs[1])
-                    {
-                        // right snapped or clamped
-                        const float_t rightSnappedCandidateDistanceSquared = length(curve.evaluate(snappedTs[1]) - pos);
-                        if (rightSnappedCandidateDistanceSquared < closestDistanceSquared)
-                        {
-                            clipped = true;
-                            closestT = snappedTs[1];
-                            closestDistanceSquared = rightSnappedCandidateDistanceSquared;
-                        }
-                    }
-                }
-                else
-                {
-                    // no snapping
-                    if (candidateDistanceSquared < closestDistanceSquared)
-                    {
-                        clipped = false;
-                        closestT = candidates[i];
-                        closestDistanceSquared = candidateDistanceSquared;
-                    }
-                }
-            }
-        }
-
-
-        float_t roundedDistance = closestDistanceSquared - thickness;
-        if(!isRoadStyle)
-        {
-            return roundedDistance;
-        }
-        else
-        {
-            const float_t aaWidth = globals.antiAliasingFactor;
-            float_t rectCappedDistance = roundedDistance;
-
-            if (clipped)
-            {
-                float_t2 q = mul(curve.getLocalCoordinateSpace(closestT), pos - curve.evaluate(closestT));
-                rectCappedDistance = capSquare(q, thickness, aaWidth);
-            }
-
-            return rectCappedDistance;
-        }
-    }
-
-    static float capSquare(float_t2 q, float_t th, float_t aaWidth)
-    {
-        float_t2 d = abs(q) - float_t2(aaWidth, th);
-        return length(max(d, 0.0)) + min(max(d.x, d.y), 0.0);
-    }
-};
+//#include <nbl/builtin/hlsl/spirv_intrinsics/fragment_shader_barycentric.hlsl>
 
 // sdf of Isosceles Trapezoid y-aligned by https://iquilezles.org/articles/distfunctions2d/
 float sdTrapezoid(float2 p, float r1, float r2, float he)
@@ -331,21 +45,6 @@ float miterSDF(float2 p, float thickness, float2 a, float2 b, float ra, float rb
     return sdTrapezoid(p, ra, rb, h);
 }
 
-typedef StyleClipper< nbl::hlsl::shapes::Quadratic<float> > BezierStyleClipper;
-typedef StyleClipper< nbl::hlsl::shapes::Line<float> > LineStyleClipper;
-
-// for usage in upper_bound function
-struct DTMSettingsHeightsAccessor
-{
-    DTMHeightShadingSettings settings;
-    using value_type = float;
-
-    float operator[](const uint32_t ix)
-    {
-        return settings.heightColorMapHeights[ix];
-    }
-};
-
 // We need to specialize color calculation based on FragmentShaderInterlock feature availability for our transparency algorithm
 // because there is no `if constexpr` in hlsl
 // @params
@@ -419,401 +118,6 @@ float32_t4 calculateFinalColor<true>(const uint2 fragCoord, const float localAlp
     return color;
 }
 
-float dot2(in float2 vec)
-{
-    return dot(vec, vec);
-}
-
-
-// TODO: Later move these functions and structs to dtmSettings.hlsl and a namespace like dtmSettings::height_shading or dtmSettings::contours, etc..
-
-struct HeightSegmentTransitionData
-{
-    float currentHeight;
-    float4 currentSegmentColor;
-    float boundaryHeight;
-    float4 otherSegmentColor;
-};
-
-// This function interpolates between the current and nearest segment colors based on the
-// screen-space distance to the segment boundary. The result is a smoothly blended color
-// useful for visualizing discrete height levels without harsh edges.
-float4 smoothHeightSegmentTransition(in HeightSegmentTransitionData transitionInfo, in float heightDeriv)
-{
-    float pxDistanceToNearestSegment = abs((transitionInfo.currentHeight - transitionInfo.boundaryHeight) / heightDeriv);
-    float nearestSegmentColorCoverage = smoothstep(-globals.antiAliasingFactor, globals.antiAliasingFactor, pxDistanceToNearestSegment);
-    float4 localHeightColor = lerp(transitionInfo.otherSegmentColor, transitionInfo.currentSegmentColor, nearestSegmentColorCoverage);
-    return localHeightColor;
-}
-
-// Computes the continuous position of a height value within uniform intervals.
-// flooring this value will give the interval index
-//
-// If `isCenteredShading` is true, the intervals are centered around `minHeight`, meaning the
-// first interval spans [minHeight - intervalLength / 2.0, minHeight + intervalLength / 2.0].
-// Otherwise, intervals are aligned from `minHeight` upward, so the first interval spans
-// [minHeight, minHeight + intervalLength].
-//
-// Parameters:
-// - height: The height value to classify.
-// - minHeight: The reference starting height for interval calculation.
-// - intervalLength: The length of each interval segment.
-// - isCenteredShading: Whether to center the shading intervals around minHeight.
-//
-// Returns:
-// - A float representing the continuous position within the interval grid.
-float getIntervalPosition(in float height, in float minHeight, in float intervalLength, in bool isCenteredShading)
-{
-    if (isCenteredShading)
-        return ( (height - minHeight) / intervalLength + 0.5f);
-    else
-        return ( (height - minHeight) / intervalLength );
-}
-
-void getIntervalHeightAndColor(in int intervalIndex, in DTMHeightShadingSettings settings, out float4 outIntervalColor, out float outIntervalHeight)
-{
-    float minShadingHeight = settings.heightColorMapHeights[0];
-    float heightForColor = minShadingHeight + float(intervalIndex) * settings.intervalIndexToHeightMultiplier;
-    
-    if (settings.isCenteredShading)
-        outIntervalHeight = minShadingHeight + (float(intervalIndex) - 0.5) * settings.intervalLength;
-    else
-        outIntervalHeight = minShadingHeight + (float(intervalIndex)) * settings.intervalLength;
-
-    DTMSettingsHeightsAccessor dtmHeightsAccessor = { settings };
-    uint32_t upperBoundHeightIndex = min(nbl::hlsl::upper_bound(dtmHeightsAccessor, 0, settings.heightColorEntryCount, heightForColor), settings.heightColorEntryCount-1u);
-    uint32_t lowerBoundHeightIndex = max(upperBoundHeightIndex - 1, 0);
-
-    float upperBoundHeight = settings.heightColorMapHeights[upperBoundHeightIndex];
-    float lowerBoundHeight = settings.heightColorMapHeights[lowerBoundHeightIndex];
-
-    float4 upperBoundColor = settings.heightColorMapColors[upperBoundHeightIndex];
-    float4 lowerBoundColor = settings.heightColorMapColors[lowerBoundHeightIndex];
-    
-    if (upperBoundHeight == lowerBoundHeight)
-    {
-        outIntervalColor = upperBoundColor;
-    }
-    else
-    {
-        float interpolationVal = (heightForColor - lowerBoundHeight) / (upperBoundHeight - lowerBoundHeight);
-        outIntervalColor = lerp(lowerBoundColor, upperBoundColor, interpolationVal);
-    }
-}
-
-float3 calculateDTMTriangleBarycentrics(in float2 v1, in float2 v2, in float2 v3, in float2 p)
-{
-    float denom = (v2.x - v1.x) * (v3.y - v1.y) - (v3.x - v1.x) * (v2.y - v1.y);
-    float u = ((v2.y - v3.y) * (p.x - v3.x) + (v3.x - v2.x) * (p.y - v3.y)) / denom;
-    float v = ((v3.y - v1.y) * (p.x - v3.x) + (v1.x - v3.x) * (p.y - v3.y)) / denom;
-    float w = 1.0 - u - v;
-    return float3(u, v, w);
-}
-
-float4 calculateDTMHeightColor(in DTMHeightShadingSettings settings, in float3 v[3], in float heightDeriv, in float2 fragPos, in float height)
-{
-    float4 outputColor = float4(0.0f, 0.0f, 0.0f, 0.0f);
-
-    // HEIGHT SHADING
-    const uint32_t heightMapSize = settings.heightColorEntryCount;
-    float minShadingHeight = settings.heightColorMapHeights[0];
-    float maxShadingHeight = settings.heightColorMapHeights[heightMapSize - 1];
-
-    if (heightMapSize > 0)
-    {
-        // partially based on https://www.shadertoy.com/view/XsXSz4 by Inigo Quilez
-        float2 e0 = v[1] - v[0];
-        float2 e1 = v[2] - v[1];
-        float2 e2 = v[0] - v[2];
-
-        float triangleAreaSign = -sign(e0.x * e2.y - e0.y * e2.x);
-        float2 v0 = fragPos - v[0];
-        float2 v1 = fragPos - v[1];
-        float2 v2 = fragPos - v[2];
-
-        float distanceToLine0 = sqrt(dot2(v0 - e0 * dot(v0, e0) / dot(e0, e0)));
-        float distanceToLine1 = sqrt(dot2(v1 - e1 * dot(v1, e1) / dot(e1, e1)));
-        float distanceToLine2 = sqrt(dot2(v2 - e2 * dot(v2, e2) / dot(e2, e2)));
-
-        float line0Sdf = distanceToLine0 * triangleAreaSign * (v0.x * e0.y - v0.y * e0.x);
-        float line1Sdf = distanceToLine1 * triangleAreaSign * (v1.x * e1.y - v1.y * e1.x);
-        float line2Sdf = distanceToLine2 * triangleAreaSign * (v2.x * e2.y - v2.y * e2.x);
-        float line3Sdf = (minShadingHeight - height) / heightDeriv;
-        float line4Sdf = (height - maxShadingHeight) / heightDeriv;
-
-        float convexPolygonSdf = max(line0Sdf, line1Sdf);
-        convexPolygonSdf = max(convexPolygonSdf, line2Sdf);
-        convexPolygonSdf = max(convexPolygonSdf, line3Sdf);
-        convexPolygonSdf = max(convexPolygonSdf, line4Sdf);
-
-        outputColor.a = 1.0f - smoothstep(0.0f, globals.antiAliasingFactor * 2.0f, convexPolygonSdf);
-
-        // calculate height color
-        E_HEIGHT_SHADING_MODE mode = settings.determineHeightShadingMode();
-
-        if (mode == E_HEIGHT_SHADING_MODE::DISCRETE_VARIABLE_LENGTH_INTERVALS)
-        {
-            DTMSettingsHeightsAccessor dtmHeightsAccessor = { settings };
-            int upperBoundIndex = nbl::hlsl::upper_bound(dtmHeightsAccessor, 0, heightMapSize, height);
-            int mapIndex = max(upperBoundIndex - 1, 0);
-            int mapIndexPrev = max(mapIndex - 1, 0);
-            int mapIndexNext = min(mapIndex + 1, heightMapSize - 1);
-
-            // logic explainer: if colorIdx is 0.0 then it means blend with next
-            // if color idx is >= length of the colours array then it means it's also > 0.0 and this blend with prev is true
-            // if color idx is > 0 and < len - 1, then it depends on the current pixel's height value and two closest height values
-            bool blendWithPrev = (mapIndex > 0)
-                && (mapIndex >= heightMapSize - 1 || (height * 2.0 < settings.heightColorMapHeights[upperBoundIndex] + settings.heightColorMapHeights[mapIndex]));
-
-            HeightSegmentTransitionData transitionInfo;
-            transitionInfo.currentHeight = height;
-            transitionInfo.currentSegmentColor = settings.heightColorMapColors[mapIndex];
-            transitionInfo.boundaryHeight = blendWithPrev ? settings.heightColorMapHeights[mapIndex] : settings.heightColorMapHeights[mapIndexNext];
-            transitionInfo.otherSegmentColor = blendWithPrev ? settings.heightColorMapColors[mapIndexPrev] : settings.heightColorMapColors[mapIndexNext];
-
-            float4 localHeightColor = smoothHeightSegmentTransition(transitionInfo, heightDeriv);
-            outputColor.rgb = localHeightColor.rgb;
-            outputColor.a *= localHeightColor.a;
-        }
-        else if (mode == E_HEIGHT_SHADING_MODE::DISCRETE_FIXED_LENGTH_INTERVALS)
-        {
-            float intervalPosition = getIntervalPosition(height, minShadingHeight, settings.intervalLength, settings.isCenteredShading);
-            float positionWithinInterval = frac(intervalPosition);
-            int intervalIndex = nbl::hlsl::_static_cast<int>(intervalPosition);
-
-            float4 currentIntervalColor;
-            float currentIntervalHeight;
-            getIntervalHeightAndColor(intervalIndex, settings, currentIntervalColor, currentIntervalHeight);
-            
-            bool blendWithPrev = (positionWithinInterval < 0.5f);
-            
-            HeightSegmentTransitionData transitionInfo;
-            transitionInfo.currentHeight = height;
-            transitionInfo.currentSegmentColor = currentIntervalColor;
-            if (blendWithPrev)
-            {
-                int prevIntervalIdx = max(intervalIndex - 1, 0);
-                float prevIntervalHeight; // unused, the currentIntervalHeight is the boundary height between current and prev
-                getIntervalHeightAndColor(prevIntervalIdx, settings, transitionInfo.otherSegmentColor, prevIntervalHeight);
-                transitionInfo.boundaryHeight = currentIntervalHeight;
-            }
-            else
-            {
-                int nextIntervalIdx = intervalIndex + 1;
-                getIntervalHeightAndColor(nextIntervalIdx, settings, transitionInfo.otherSegmentColor, transitionInfo.boundaryHeight);
-            }
-            
-            float4 localHeightColor = smoothHeightSegmentTransition(transitionInfo, heightDeriv);
-            outputColor.rgb = localHeightColor.rgb;
-            outputColor.a *= localHeightColor.a;
-        }
-        else if (mode == E_HEIGHT_SHADING_MODE::CONTINOUS_INTERVALS)
-        {
-            DTMSettingsHeightsAccessor dtmHeightsAccessor = { settings };
-            uint32_t upperBoundHeightIndex = nbl::hlsl::upper_bound(dtmHeightsAccessor, 0, heightMapSize, height);
-            uint32_t lowerBoundHeightIndex = upperBoundHeightIndex == 0 ? upperBoundHeightIndex : upperBoundHeightIndex - 1;
-
-            float upperBoundHeight = settings.heightColorMapHeights[upperBoundHeightIndex];
-            float lowerBoundHeight = settings.heightColorMapHeights[lowerBoundHeightIndex];
-
-            float4 upperBoundColor = settings.heightColorMapColors[upperBoundHeightIndex];
-            float4 lowerBoundColor = settings.heightColorMapColors[lowerBoundHeightIndex];
-
-            float interpolationVal;
-            if (upperBoundHeightIndex == 0)
-                interpolationVal = 1.0f;
-            else
-                interpolationVal = (height - lowerBoundHeight) / (upperBoundHeight - lowerBoundHeight);
-
-            float4 localHeightColor = lerp(lowerBoundColor, upperBoundColor, interpolationVal);
-
-            outputColor.a *= localHeightColor.a;
-            outputColor.rgb = localHeightColor.rgb * outputColor.a + outputColor.rgb * (1.0f - outputColor.a);
-        }
-    }
-
-    return outputColor; 
-}
-
-float4 calculateDTMContourColor(in DTMContourSettings contourSettings, in float3 v[3], in uint2 edgePoints[3], in float2 fragPos, in float height)
-{
-    float4 outputColor;
-
-    LineStyle contourStyle = loadLineStyle(contourSettings.contourLineStyleIdx);
-    const float contourThickness = (contourStyle.screenSpaceLineWidth + contourStyle.worldSpaceLineWidth * globals.screenToWorldRatio) * 0.5f;
-    float stretch = 1.0f;
-    float phaseShift = 0.0f;
-
-    // TODO: move to ubo or push constants
-    const float startHeight = contourSettings.contourLinesStartHeight;
-    const float endHeight = contourSettings.contourLinesEndHeight;
-    const float interval = contourSettings.contourLinesHeightInterval;
-
-    // TODO: can be precomputed
-    const int maxContourLineIdx = (endHeight - startHeight + 1) / interval;
-
-    // TODO: it actually can output a negative number, fix
-    int contourLineIdx = nbl::hlsl::_static_cast<int>((height - startHeight + (interval * 0.5f)) / interval);
-    contourLineIdx = clamp(contourLineIdx, 0, maxContourLineIdx);
-    float contourLineHeight = startHeight + interval * contourLineIdx;
-
-    int contourLinePointsIdx = 0;
-    float2 contourLinePoints[2];
-    // TODO: case where heights we are looking for are on all three vertices
-    for (int i = 0; i < 3; ++i)
-    {
-        if (contourLinePointsIdx == 2)
-            break;
-
-        const uint2 currentEdgePoints = edgePoints[i];
-        float3 p0 = v[currentEdgePoints[0]];
-        float3 p1 = v[currentEdgePoints[1]];
-
-        if (p1.z < p0.z)
-            nbl::hlsl::swap(p0, p1);
-
-        float minHeight = p0.z;
-        float maxHeight = p1.z;
-
-        if (height >= minHeight && height <= maxHeight)
-        {
-            float2 edge = float2(p1.x, p1.y) - float2(p0.x, p0.y);
-            float scale = (contourLineHeight - minHeight) / (maxHeight - minHeight);
-
-            contourLinePoints[contourLinePointsIdx] = scale * edge + float2(p0.x, p0.y);
-            ++contourLinePointsIdx;
-        }
-    }
-
-    // TODO: comment next line to fix, figure if it was needed
-    if(contourLinePointsIdx == 2)
-    {
-        nbl::hlsl::shapes::Line<float> lineSegment = nbl::hlsl::shapes::Line<float>::construct(contourLinePoints[0], contourLinePoints[1]);
-
-        float distance = nbl::hlsl::numeric_limits<float>::max;
-        if (!contourStyle.hasStipples() || stretch == InvalidStyleStretchValue)
-        {
-            distance = ClippedSignedDistance< nbl::hlsl::shapes::Line<float> >::sdf(lineSegment, fragPos, contourThickness, contourStyle.isRoadStyleFlag);
-        }
-        else
-        {
-            // TODO:
-            // It might be beneficial to calculate distance between pixel and contour line to early out some pixels and save yourself from stipple sdf computations!
-            // where you only compute the complex sdf if abs((height - contourVal) / heightDeriv) <= aaFactor
-            nbl::hlsl::shapes::Line<float>::ArcLengthCalculator arcLenCalc = nbl::hlsl::shapes::Line<float>::ArcLengthCalculator::construct(lineSegment);
-            LineStyleClipper clipper = LineStyleClipper::construct(contourStyle, lineSegment, arcLenCalc, phaseShift, stretch, globals.worldToScreenRatio);
-            distance = ClippedSignedDistance<nbl::hlsl::shapes::Line<float>, LineStyleClipper>::sdf(lineSegment, fragPos, contourThickness, contourStyle.isRoadStyleFlag, clipper);
-        }
-        
-        outputColor.a = smoothstep(+globals.antiAliasingFactor, -globals.antiAliasingFactor, distance) * contourStyle.color.a;
-        outputColor.rgb = contourStyle.color.rgb;
-
-        return outputColor;
-    }
-    
-    return float4(0.0f, 0.0f, 0.0f, 0.0f);
-}
-
-float4 calculateDTMOutlineColor(in uint outlineLineStyleIdx, in float3 v[3], in uint2 edgePoints[3], in float2 fragPos, in float3 baryCoord, in float height)
-{
-    float4 outputColor;
-
-    LineStyle outlineStyle = loadLineStyle(outlineLineStyleIdx);
-    const float outlineThickness = (outlineStyle.screenSpaceLineWidth + outlineStyle.worldSpaceLineWidth * globals.screenToWorldRatio) * 0.5f;
-    const float phaseShift = 0.0f; // input.getCurrentPhaseShift();
-    const float stretch = 1.0f;
-
-    // index of vertex opposing an edge, needed for calculation of triangle heights
-    uint opposingVertexIdx[3];
-    opposingVertexIdx[0] = 2;
-    opposingVertexIdx[1] = 0;
-    opposingVertexIdx[2] = 1;
-
-    // find sdf of every edge
-    float triangleAreaTimesTwo;
-    {
-        float3 AB = v[0] - v[1];
-        float3 AC = v[0] - v[2];
-        AB.z = 0.0f;
-        AC.z = 0.0f;
-
-        // TODO: figure out if there is a faster solution
-        triangleAreaTimesTwo = length(cross(AB, AC));
-    }
-
-    // calculate sdf of every edge as it wasn't stippled
-    float distances[3];
-    for (int i = 0; i < 3; ++i)
-    {
-        const uint2 currentEdgePoints = edgePoints[i];
-        float3 A = v[currentEdgePoints[0]];
-        float3 B = v[currentEdgePoints[1]];
-        float3 AB = B - A;
-        float ABLen = length(AB);
-        float triangleHeightToOpositeVertex = triangleAreaTimesTwo / ABLen;
-
-        distances[i] = abs(triangleHeightToOpositeVertex * baryCoord[opposingVertexIdx[i]]);
-    }
-
-    float minDistance = nbl::hlsl::numeric_limits<float>::max;
-    if (!outlineStyle.hasStipples() || stretch == InvalidStyleStretchValue)
-    {
-        for (int i = 0; i < 3; ++i)
-        {
-            const uint2 currentEdgePoints = edgePoints[i];
-            float3 p0 = v[currentEdgePoints[0]];
-            float3 p1 = v[currentEdgePoints[1]];
-
-            float distance = nbl::hlsl::numeric_limits<float>::max;
-            nbl::hlsl::shapes::Line<float> lineSegment = nbl::hlsl::shapes::Line<float>::construct(float2(p0.x, p0.y), float2(p1.x, p1.y));
-            distance = ClippedSignedDistance<nbl::hlsl::shapes::Line<float> >::sdf(lineSegment, fragPos, outlineThickness, outlineStyle.isRoadStyleFlag);
-
-            minDistance = min(minDistance, distance);
-        }
-    }
-    else
-    {
-        for (int i = 0; i < 3; ++i)
-        {
-            const uint2 currentEdgePoints = edgePoints[i];
-            float3 p0 = v[currentEdgePoints[0]];
-            float3 p1 = v[currentEdgePoints[1]];
-
-            // long story short, in order for stipple patterns to be consistent:
-            // - point with lesser x coord should be starting point
-            // - if x coord of both points are equal then point with lesser y value should be starting point
-            if (p1.x < p0.x)
-                nbl::hlsl::swap(p0, p1);
-            else if (p1.x == p0.x && p1.y < p0.y)
-                nbl::hlsl::swap(p0, p1);
-
-            nbl::hlsl::shapes::Line<float> lineSegment = nbl::hlsl::shapes::Line<float>::construct(float2(p0.x, p0.y), float2(p1.x, p1.y));
-
-            float distance = nbl::hlsl::numeric_limits<float>::max;
-            nbl::hlsl::shapes::Line<float>::ArcLengthCalculator arcLenCalc = nbl::hlsl::shapes::Line<float>::ArcLengthCalculator::construct(lineSegment);
-            LineStyleClipper clipper = LineStyleClipper::construct(outlineStyle, lineSegment, arcLenCalc, phaseShift, stretch, globals.worldToScreenRatio);
-            distance = ClippedSignedDistance<nbl::hlsl::shapes::Line<float>, LineStyleClipper>::sdf(lineSegment, fragPos, outlineThickness, outlineStyle.isRoadStyleFlag, clipper);
-
-            minDistance = min(minDistance, distance);
-        }
-
-    }
-
-    outputColor.a = smoothstep(+globals.antiAliasingFactor, -globals.antiAliasingFactor, minDistance) * outlineStyle.color.a;
-    outputColor.rgb = outlineStyle.color.rgb;
-
-    return outputColor;
-}
-
-float4 blendColorOnTop(in float4 colorBelow, in float4 colorAbove)
-{
-    float4 outputColor = colorBelow;
-    outputColor.rgb = colorAbove.rgb * colorAbove.a + outputColor.rgb * outputColor.a * (1.0f - colorAbove.a);
-    outputColor.a = colorAbove.a + outputColor.a * (1.0f - colorAbove.a);
-
-    return outputColor;
-}
-
 [[vk::spvexecutionmode(spv::ExecutionModePixelInterlockOrderedEXT)]]
 [shader("pixel")]
 float4 fragMain(PSInput input) : SV_TARGET
@@ -834,30 +138,25 @@ float4 fragMain(PSInput input) : SV_TARGET
         v[1] = input.getScreenSpaceVertexAttribs(1);
         v[2] = input.getScreenSpaceVertexAttribs(2);
 
-        // indices of points constructing every edge
-        uint2 edgePoints[3];
-        edgePoints[0] = uint2(0, 1);
-        edgePoints[1] = uint2(1, 2);
-        edgePoints[2] = uint2(2, 0);
-
-        const float3 baryCoord = calculateDTMTriangleBarycentrics(v[0], v[1], v[2], input.position.xy);
+        const float3 baryCoord = dtm::calculateDTMTriangleBarycentrics(v[0], v[1], v[2], input.position.xy);
         float height = baryCoord.x * v[0].z + baryCoord.y * v[1].z + baryCoord.z * v[2].z;
         float heightDeriv = fwidth(height);
 
         float4 dtmColor = float4(0.0f, 0.0f, 0.0f, 0.0f);
+        
         if (dtmSettings.drawHeightShadingEnabled())
-            dtmColor = blendColorOnTop(dtmColor, calculateDTMHeightColor(dtmSettings.heightShadingSettings, v, heightDeriv, input.position.xy, height));
+            dtmColor = dtm::blendUnder(dtmColor, dtm::calculateDTMHeightColor(dtmSettings.heightShadingSettings, v, heightDeriv, input.position.xy, height));
         if (dtmSettings.drawContourEnabled())
         {
             for(uint32_t i = 0; i < dtmSettings.contourSettingsCount; ++i) // TODO: should reverse the order with blendUnder
-                dtmColor = blendColorOnTop(dtmColor, calculateDTMContourColor(dtmSettings.contourSettings[i], v, edgePoints, input.position.xy, height));
+                dtmColor = dtm::blendUnder(dtmColor, dtm::calculateDTMContourColor(dtmSettings.contourSettings[i], v, input.position.xy, height));
         }
         if (dtmSettings.drawOutlineEnabled())
-            dtmColor = blendColorOnTop(dtmColor, calculateDTMOutlineColor(dtmSettings.outlineLineStyleIdx, v, edgePoints, input.position.xy, baryCoord, height));
+            dtmColor = dtm::blendUnder(dtmColor, dtm::calculateDTMOutlineColor(dtmSettings.outlineLineStyleIdx, v, input.position.xy, baryCoord, height));
 
         textureColor = dtmColor.rgb;
         localAlpha = dtmColor.a;
-        
+
         gammaUncorrect(textureColor); // want to output to SRGB without gamma correction
         return calculateFinalColor<nbl::hlsl::jit::device_capabilities::fragmentShaderPixelInterlock>(uint2(input.position.xy), localAlpha, currentMainObjectIdx, textureColor, true);
     }
@@ -924,7 +223,7 @@ float4 fragMain(PSInput input) : SV_TARGET
                     input.getPolylineConnectorTrapezoidShortBase());
 
             }
-            localAlpha = smoothstep(+globals.antiAliasingFactor, -globals.antiAliasingFactor, distance);
+            localAlpha = 1.0f - smoothstep(-globals.antiAliasingFactor, globals.antiAliasingFactor, distance);
         }
         else if (objType == ObjectType::CURVE_BOX) 
         {
@@ -1042,7 +341,7 @@ float4 fragMain(PSInput input) : SV_TARGET
                 // If later on we decided that we can have different sizes here, we should do computations similar to FONT_GLYPH
                 float3 msdfSample = msdfTextures.SampleLevel(msdfSampler, float3(frac(input.position.xy / HatchFillMSDFSceenSpaceSize), float(textureId)), 0.0).xyz;
                 float msdf = nbl::hlsl::text::msdfDistance(msdfSample, MSDFPixelRange * HatchFillMSDFSceenSpaceSize / MSDFSize);
-                localAlpha *= smoothstep(+globals.antiAliasingFactor / 2.0, -globals.antiAliasingFactor / 2.0f, msdf);
+                localAlpha *= 1.0f - smoothstep(-globals.antiAliasingFactor / 2.0f, globals.antiAliasingFactor / 2.0f, msdf);
             }
         }
         else if (objType == ObjectType::FONT_GLYPH) 
@@ -1073,7 +372,7 @@ float4 fragMain(PSInput input) : SV_TARGET
                 LineStyle style = loadLineStyle(mainObj.styleIdx);
                 const float screenPxRange = input.getFontGlyphPxRange() / MSDFPixelRangeHalf;
                 const float bolden = style.worldSpaceLineWidth * screenPxRange; // worldSpaceLineWidth is actually boldenInPixels, aliased TextStyle with LineStyle
-                localAlpha = smoothstep(+globals.antiAliasingFactor / 2.0f + bolden, -globals.antiAliasingFactor / 2.0f + bolden, msdf);
+                localAlpha = 1.0f - smoothstep(-globals.antiAliasingFactor / 2.0f + bolden, globals.antiAliasingFactor / 2.0f + bolden, msdf);
             }
         }
         else if (objType == ObjectType::IMAGE) 

From 8a771a64002c6f2e2b3af035227ed18fe3947477 Mon Sep 17 00:00:00 2001
From: Przemek <minikers21@gmail.com>
Date: Wed, 23 Apr 2025 11:04:00 +0200
Subject: [PATCH 202/529] Fixed blending

---
 62_CAD/main.cpp                                   | 2 +-
 62_CAD/shaders/main_pipeline/dtm.hlsl             | 4 ++--
 62_CAD/shaders/main_pipeline/fragment_shader.hlsl | 9 +++++----
 3 files changed, 8 insertions(+), 7 deletions(-)

diff --git a/62_CAD/main.cpp b/62_CAD/main.cpp
index 9f5392d4b..822f1b448 100644
--- a/62_CAD/main.cpp
+++ b/62_CAD/main.cpp
@@ -3244,7 +3244,7 @@ class ComputerAidedDesign final : public examples::SimpleWindowedApplication, pu
 			mesh.setIndices(std::move(indices));
 
 			DTMSettingsInfo dtmInfo{};
-			dtmInfo.mode |= E_DTM_MODE::OUTLINE;
+			//dtmInfo.mode |= E_DTM_MODE::OUTLINE;
 			dtmInfo.mode |= E_DTM_MODE::HEIGHT_SHADING;
 			dtmInfo.mode |= E_DTM_MODE::CONTOUR;
 
diff --git a/62_CAD/shaders/main_pipeline/dtm.hlsl b/62_CAD/shaders/main_pipeline/dtm.hlsl
index 5b41eabb3..466b74ee9 100644
--- a/62_CAD/shaders/main_pipeline/dtm.hlsl
+++ b/62_CAD/shaders/main_pipeline/dtm.hlsl
@@ -666,9 +666,9 @@ float4 calculateDTMOutlineColor(in uint outlineLineStyleIdx, in float3 v[3], in
     return outputColor;
 }
 
-float4 blendUnder(in float4 srcColor, in float4 dstColor)
+float4 blendUnder(in float4 dstColor, in float4 srcColor)
 {
-    dstColor.rgb = dstColor.rgb * dstColor.a + (1 - dstColor.a) * srcColor.a * srcColor.rgb;
+    dstColor.rgb = dstColor.rgb + (1 - dstColor.a) * srcColor.a * srcColor.rgb;
     dstColor.a = (1.0f - srcColor.a) * dstColor.a + srcColor.a;
 
     return dstColor;
diff --git a/62_CAD/shaders/main_pipeline/fragment_shader.hlsl b/62_CAD/shaders/main_pipeline/fragment_shader.hlsl
index 6c579dff6..3ac219a66 100644
--- a/62_CAD/shaders/main_pipeline/fragment_shader.hlsl
+++ b/62_CAD/shaders/main_pipeline/fragment_shader.hlsl
@@ -144,15 +144,16 @@ float4 fragMain(PSInput input) : SV_TARGET
 
         float4 dtmColor = float4(0.0f, 0.0f, 0.0f, 0.0f);
         
-        if (dtmSettings.drawHeightShadingEnabled())
-            dtmColor = dtm::blendUnder(dtmColor, dtm::calculateDTMHeightColor(dtmSettings.heightShadingSettings, v, heightDeriv, input.position.xy, height));
+        if (dtmSettings.drawOutlineEnabled())
+            dtmColor = dtm::blendUnder(dtmColor, dtm::calculateDTMOutlineColor(dtmSettings.outlineLineStyleIdx, v, input.position.xy, baryCoord, height));
         if (dtmSettings.drawContourEnabled())
         {
             for(uint32_t i = 0; i < dtmSettings.contourSettingsCount; ++i) // TODO: should reverse the order with blendUnder
                 dtmColor = dtm::blendUnder(dtmColor, dtm::calculateDTMContourColor(dtmSettings.contourSettings[i], v, input.position.xy, height));
         }
-        if (dtmSettings.drawOutlineEnabled())
-            dtmColor = dtm::blendUnder(dtmColor, dtm::calculateDTMOutlineColor(dtmSettings.outlineLineStyleIdx, v, input.position.xy, baryCoord, height));
+        if (dtmSettings.drawHeightShadingEnabled())
+            dtmColor = dtm::blendUnder(dtmColor, dtm::calculateDTMHeightColor(dtmSettings.heightShadingSettings, v, heightDeriv, input.position.xy, height));
+        
 
         textureColor = dtmColor.rgb;
         localAlpha = dtmColor.a;

From fa5d7f1e2ea73767e36319d4e40d704855ceb16d Mon Sep 17 00:00:00 2001
From: Przemek <minikers21@gmail.com>
Date: Thu, 24 Apr 2025 14:05:45 +0200
Subject: [PATCH 203/529] Added `line_style.hlsl` file

---
 62_CAD/main.cpp                              |  42 ++-
 62_CAD/shaders/main_pipeline/dtm.hlsl        | 296 +-----------------
 62_CAD/shaders/main_pipeline/line_style.hlsl | 297 +++++++++++++++++++
 3 files changed, 327 insertions(+), 308 deletions(-)
 create mode 100644 62_CAD/shaders/main_pipeline/line_style.hlsl

diff --git a/62_CAD/main.cpp b/62_CAD/main.cpp
index 822f1b448..6ca03d9d6 100644
--- a/62_CAD/main.cpp
+++ b/62_CAD/main.cpp
@@ -3239,12 +3239,28 @@ class ComputerAidedDesign final : public examples::SimpleWindowedApplication, pu
 				0, 1, 2
 			};*/
 
+			// HOURGLASS
+			/*core::vector<TriangleMeshVertex> vertices = {
+				{ float64_t2(0.0, 0.0), 10.0 },
+				{ float64_t2(-200.0, -200.0), 90.0 },
+				{ float64_t2(200.0, -200.0), 80.0 },
+
+				{ float64_t2(0.0, 0.0), 10.0 },
+				{ float64_t2(200.0, 200.0), 90.0 },
+				{ float64_t2(-200.0, 200.0), 80.0 },
+			};
+
+			core::vector<uint32_t> indices = {
+				0, 1, 2,
+				3, 4, 5
+			};*/
+
 			CTriangleMesh mesh;
 			mesh.setVertices(std::move(vertices));
 			mesh.setIndices(std::move(indices));
 
 			DTMSettingsInfo dtmInfo{};
-			//dtmInfo.mode |= E_DTM_MODE::OUTLINE;
+			dtmInfo.mode |= E_DTM_MODE::OUTLINE;
 			dtmInfo.mode |= E_DTM_MODE::HEIGHT_SHADING;
 			dtmInfo.mode |= E_DTM_MODE::CONTOUR;
 
@@ -3260,7 +3276,7 @@ class ComputerAidedDesign final : public examples::SimpleWindowedApplication, pu
 			dtmInfo.contourSettings[0u].heightInterval = 10;
 			dtmInfo.contourSettings[0u].lineStyleInfo.screenSpaceLineWidth = 0.0f;
 			dtmInfo.contourSettings[0u].lineStyleInfo.worldSpaceLineWidth = 1.0f;
-			dtmInfo.contourSettings[0u].lineStyleInfo.color = float32_t4(0.0f, 0.0f, 1.0f, 0.7f);
+			dtmInfo.contourSettings[0u].lineStyleInfo.color = float32_t4(0.0f, 0.0f, 1.0f, 1.0f);
 			std::array<double, 4> contourStipplePattern = { 0.0f, -5.0f, 10.0f, -5.0f };
 			dtmInfo.contourSettings[0u].lineStyleInfo.setStipplePatternData(contourStipplePattern);
 
@@ -3282,7 +3298,7 @@ class ComputerAidedDesign final : public examples::SimpleWindowedApplication, pu
 
 					dtmInfo.heightShadingInfo.addHeightColorMapEntry(-10.0f, float32_t4(0.5f, 1.0f, 1.0f, 1.0f));
 					dtmInfo.heightShadingInfo.addHeightColorMapEntry(20.0f, float32_t4(0.0f, 1.0f, 0.0f, 1.0f));
-					dtmInfo.heightShadingInfo.addHeightColorMapEntry(25.0f, float32_t4(1.0f, 1.0f, 0.0f, animatedAlpha));
+					dtmInfo.heightShadingInfo.addHeightColorMapEntry(25.0f, float32_t4(1.0f, 1.0f, 0.0f, 1.0f));
 					dtmInfo.heightShadingInfo.addHeightColorMapEntry(70.0f, float32_t4(1.0f, 0.0f, 0.0f, 1.0f));
 					dtmInfo.heightShadingInfo.addHeightColorMapEntry(90.0f, float32_t4(1.0f, 0.0f, 0.0f, 1.0f));
 
@@ -3294,22 +3310,22 @@ class ComputerAidedDesign final : public examples::SimpleWindowedApplication, pu
 					dtmInfo.heightShadingInfo.intervalIndexToHeightMultiplier = dtmInfo.heightShadingInfo.intervalLength;
 					dtmInfo.heightShadingInfo.isCenteredShading = false;
 					dtmInfo.heightShadingInfo.heightShadingMode = E_HEIGHT_SHADING_MODE::DISCRETE_FIXED_LENGTH_INTERVALS;
-					dtmInfo.heightShadingInfo.addHeightColorMapEntry(0.0f, float32_t4(0.0f, 0.0f, 1.0f, animatedAlpha));
-					dtmInfo.heightShadingInfo.addHeightColorMapEntry(25.0f, float32_t4(0.0f, 1.0f, 1.0f, animatedAlpha));
-					dtmInfo.heightShadingInfo.addHeightColorMapEntry(50.0f, float32_t4(0.0f, 1.0f, 0.0f, animatedAlpha));
-					dtmInfo.heightShadingInfo.addHeightColorMapEntry(75.0f, float32_t4(1.0f, 1.0f, 0.0f, animatedAlpha));
-					dtmInfo.heightShadingInfo.addHeightColorMapEntry(100.0f, float32_t4(1.0f, 0.0f, 0.0f, animatedAlpha));
+					dtmInfo.heightShadingInfo.addHeightColorMapEntry(0.0f, float32_t4(0.0f, 0.0f, 1.0f, 1.0f));
+					dtmInfo.heightShadingInfo.addHeightColorMapEntry(25.0f, float32_t4(0.0f, 1.0f, 1.0f, 1.0f));
+					dtmInfo.heightShadingInfo.addHeightColorMapEntry(50.0f, float32_t4(0.0f, 1.0f, 0.0f, 1.0f));
+					dtmInfo.heightShadingInfo.addHeightColorMapEntry(75.0f, float32_t4(1.0f, 1.0f, 0.0f, 1.0f));
+					dtmInfo.heightShadingInfo.addHeightColorMapEntry(100.0f, float32_t4(1.0f, 0.0f, 0.0f, 1.0f));
 
 					break;
 				}
 				case E_HEIGHT_SHADING_MODE::CONTINOUS_INTERVALS:
 				{
 					dtmInfo.heightShadingInfo.heightShadingMode = E_HEIGHT_SHADING_MODE::CONTINOUS_INTERVALS;
-					dtmInfo.heightShadingInfo.addHeightColorMapEntry(0.0f, float32_t4(0.0f, 0.0f, 1.0f, animatedAlpha));
-					dtmInfo.heightShadingInfo.addHeightColorMapEntry(25.0f, float32_t4(0.0f, 1.0f, 1.0f, animatedAlpha));
-					dtmInfo.heightShadingInfo.addHeightColorMapEntry(50.0f, float32_t4(0.0f, 1.0f, 0.0f, animatedAlpha));
-					dtmInfo.heightShadingInfo.addHeightColorMapEntry(75.0f, float32_t4(1.0f, 1.0f, 0.0f, animatedAlpha));
-					dtmInfo.heightShadingInfo.addHeightColorMapEntry(90.0f, float32_t4(1.0f, 0.0f, 0.0f, animatedAlpha));
+					dtmInfo.heightShadingInfo.addHeightColorMapEntry(0.0f, float32_t4(0.0f, 0.0f, 1.0f, 1.0f));
+					dtmInfo.heightShadingInfo.addHeightColorMapEntry(25.0f, float32_t4(0.0f, 1.0f, 1.0f, 1.0f));
+					dtmInfo.heightShadingInfo.addHeightColorMapEntry(50.0f, float32_t4(0.0f, 1.0f, 0.0f, 1.0f));
+					dtmInfo.heightShadingInfo.addHeightColorMapEntry(75.0f, float32_t4(1.0f, 1.0f, 0.0f, 1.0f));
+					dtmInfo.heightShadingInfo.addHeightColorMapEntry(90.0f, float32_t4(1.0f, 0.0f, 0.0f, 1.0f));
 
 					break;
 				}
diff --git a/62_CAD/shaders/main_pipeline/dtm.hlsl b/62_CAD/shaders/main_pipeline/dtm.hlsl
index 466b74ee9..63e1194e4 100644
--- a/62_CAD/shaders/main_pipeline/dtm.hlsl
+++ b/62_CAD/shaders/main_pipeline/dtm.hlsl
@@ -1,300 +1,7 @@
 #ifndef _CAD_EXAMPLE_DTM_HLSL_INCLUDED_
 #define _CAD_EXAMPLE_DTM_HLSL_INCLUDED_
 
-#include <nbl/builtin/hlsl/shapes/line.hlsl>
-#include <nbl/builtin/hlsl/algorithm.hlsl>
-
-// TODO: functions outside of the "dtm" namespace need to be moved to another file
-
-// for usage in upper_bound function
-struct StyleAccessor
-{
-    LineStyle style;
-    using value_type = float;
-
-    float operator[](const uint32_t ix)
-    {
-        return style.getStippleValue(ix);
-    }
-};
-
-template<typename CurveType>
-struct StyleClipper
-{
-    using float_t = typename CurveType::scalar_t;
-    using float_t2 = typename CurveType::float_t2;
-    using float_t3 = typename CurveType::float_t3;
-    NBL_CONSTEXPR_STATIC_INLINE float_t AccuracyThresholdT = 0.000001;
-
-    static StyleClipper<CurveType> construct(
-        LineStyle style,
-        CurveType curve,
-        typename CurveType::ArcLengthCalculator arcLenCalc,
-        float phaseShift,
-        float stretch,
-        float worldToScreenRatio)
-    {
-        StyleClipper<CurveType> ret = { style, curve, arcLenCalc, phaseShift, stretch, worldToScreenRatio, 0.0f, 0.0f, 0.0f, 0.0f };
-
-        // values for non-uniform stretching with a rigid segment
-        if (style.rigidSegmentIdx != InvalidRigidSegmentIndex && stretch != 1.0f)
-        {
-            // rigidSegment info in old non stretched pattern
-            ret.rigidSegmentStart = (style.rigidSegmentIdx >= 1u) ? style.getStippleValue(style.rigidSegmentIdx - 1u) : 0.0f;
-            ret.rigidSegmentEnd = (style.rigidSegmentIdx < style.stipplePatternSize) ? style.getStippleValue(style.rigidSegmentIdx) : 1.0f;
-            ret.rigidSegmentLen = ret.rigidSegmentEnd - ret.rigidSegmentStart;
-            // stretch value for non rigid segments
-            ret.nonRigidSegmentStretchValue = (stretch - ret.rigidSegmentLen) / (1.0f - ret.rigidSegmentLen);
-            // rigidSegment info to new stretched pattern
-            ret.rigidSegmentStart *= ret.nonRigidSegmentStretchValue / stretch; // get the new normalized rigid segment start
-            ret.rigidSegmentLen /= stretch; // get the new rigid segment normalized len
-            ret.rigidSegmentEnd = ret.rigidSegmentStart + ret.rigidSegmentLen; // get the new normalized rigid segment end 
-        }
-        else
-        {
-            ret.nonRigidSegmentStretchValue = stretch;
-        }
-
-        return ret;
-    }
-
-    // For non-uniform stretching with a rigid segment (the one segement that shouldn't stretch) the whole pattern changes
-    // instead of transforming each of the style.stipplePattern values (max 14 of them), we transform the normalized place in pattern
-    float getRealNormalizedPlaceInPattern(float normalizedPlaceInPattern)
-    {
-        if (style.rigidSegmentIdx != InvalidRigidSegmentIndex && stretch != 1.0f)
-        {
-            float ret = min(normalizedPlaceInPattern, rigidSegmentStart) / nonRigidSegmentStretchValue; // unstretch parts before rigid segment
-            ret += max(normalizedPlaceInPattern - rigidSegmentEnd, 0.0f) / nonRigidSegmentStretchValue; // unstretch parts after rigid segment
-            ret += max(min(rigidSegmentLen, normalizedPlaceInPattern - rigidSegmentStart), 0.0f); // unstretch parts inside rigid segment
-            ret *= stretch;
-            return ret;
-        }
-        else
-        {
-            return normalizedPlaceInPattern;
-        }
-    }
-
-    float_t2 operator()(float_t t)
-    {
-        // basicaly 0.0 and 1.0 but with a guardband to discard outside the range
-        const float_t minT = 0.0 - 1.0;
-        const float_t maxT = 1.0 + 1.0;
-
-        StyleAccessor styleAccessor = { style };
-        const float_t reciprocalStretchedStipplePatternLen = style.reciprocalStipplePatternLen / stretch;
-        const float_t patternLenInScreenSpace = 1.0 / (worldToScreenRatio * style.reciprocalStipplePatternLen);
-
-        const float_t arcLen = arcLenCalc.calcArcLen(t);
-        const float_t worldSpaceArcLen = arcLen * float_t(worldToScreenRatio);
-        float_t normalizedPlaceInPattern = frac(worldSpaceArcLen * reciprocalStretchedStipplePatternLen + phaseShift);
-        normalizedPlaceInPattern = getRealNormalizedPlaceInPattern(normalizedPlaceInPattern);
-        uint32_t patternIdx = nbl::hlsl::upper_bound(styleAccessor, 0, style.stipplePatternSize, normalizedPlaceInPattern);
-
-        const float_t InvalidT = nbl::hlsl::numeric_limits<float32_t>::infinity;
-        float_t2 ret = float_t2(InvalidT, InvalidT);
-
-        // odd patternIdx means a "no draw section" and current candidate should split into two nearest draw sections
-        const bool notInDrawSection = patternIdx & 0x1;
-
-        // TODO[Erfan]: Disable this piece of code after clipping, and comment the reason, that the bezier start and end at 0.0 and 1.0 should be in drawable sections
-        float_t minDrawT = 0.0;
-        float_t maxDrawT = 1.0;
-        {
-            float_t normalizedPlaceInPatternBegin = frac(phaseShift);
-            normalizedPlaceInPatternBegin = getRealNormalizedPlaceInPattern(normalizedPlaceInPatternBegin);
-            uint32_t patternIdxBegin = nbl::hlsl::upper_bound(styleAccessor, 0, style.stipplePatternSize, normalizedPlaceInPatternBegin);
-            const bool BeginInNonDrawSection = patternIdxBegin & 0x1;
-
-            if (BeginInNonDrawSection)
-            {
-                float_t diffToRightDrawableSection = (patternIdxBegin == style.stipplePatternSize) ? 1.0 : styleAccessor[patternIdxBegin];
-                diffToRightDrawableSection -= normalizedPlaceInPatternBegin;
-                float_t scrSpcOffsetToArcLen1 = diffToRightDrawableSection * patternLenInScreenSpace * ((patternIdxBegin != style.rigidSegmentIdx) ? nonRigidSegmentStretchValue : 1.0);
-                const float_t arcLenForT1 = 0.0 + scrSpcOffsetToArcLen1;
-                minDrawT = arcLenCalc.calcArcLenInverse(curve, minT, maxT, arcLenForT1, AccuracyThresholdT, 0.0);
-            }
-
-            // Completely in non-draw section -> clip away:
-            if (minDrawT >= 1.0)
-                return ret;
-
-            const float_t arcLenEnd = arcLenCalc.calcArcLen(1.0);
-            const float_t worldSpaceArcLenEnd = arcLenEnd * float_t(worldToScreenRatio);
-            float_t normalizedPlaceInPatternEnd = frac(worldSpaceArcLenEnd * reciprocalStretchedStipplePatternLen + phaseShift);
-            normalizedPlaceInPatternEnd = getRealNormalizedPlaceInPattern(normalizedPlaceInPatternEnd);
-            uint32_t patternIdxEnd = nbl::hlsl::upper_bound(styleAccessor, 0, style.stipplePatternSize, normalizedPlaceInPatternEnd);
-            const bool EndInNonDrawSection = patternIdxEnd & 0x1;
-
-            if (EndInNonDrawSection)
-            {
-                float_t diffToLeftDrawableSection = (patternIdxEnd == 0) ? 0.0 : styleAccessor[patternIdxEnd - 1];
-                diffToLeftDrawableSection -= normalizedPlaceInPatternEnd;
-                float_t scrSpcOffsetToArcLen0 = diffToLeftDrawableSection * patternLenInScreenSpace * ((patternIdxEnd != style.rigidSegmentIdx) ? nonRigidSegmentStretchValue : 1.0);
-                const float_t arcLenForT0 = arcLenEnd + scrSpcOffsetToArcLen0;
-                maxDrawT = arcLenCalc.calcArcLenInverse(curve, minT, maxT, arcLenForT0, AccuracyThresholdT, 1.0);
-            }
-        }
-
-        if (notInDrawSection)
-        {
-            float toScreenSpaceLen = patternLenInScreenSpace * ((patternIdx != style.rigidSegmentIdx) ? nonRigidSegmentStretchValue : 1.0);
-
-            float_t diffToLeftDrawableSection = (patternIdx == 0) ? 0.0 : styleAccessor[patternIdx - 1];
-            diffToLeftDrawableSection -= normalizedPlaceInPattern;
-            float_t scrSpcOffsetToArcLen0 = diffToLeftDrawableSection * toScreenSpaceLen;
-            const float_t arcLenForT0 = arcLen + scrSpcOffsetToArcLen0;
-            float_t t0 = arcLenCalc.calcArcLenInverse(curve, minT, maxT, arcLenForT0, AccuracyThresholdT, t);
-            t0 = clamp(t0, minDrawT, maxDrawT);
-
-            float_t diffToRightDrawableSection = (patternIdx == style.stipplePatternSize) ? 1.0 : styleAccessor[patternIdx];
-            diffToRightDrawableSection -= normalizedPlaceInPattern;
-            float_t scrSpcOffsetToArcLen1 = diffToRightDrawableSection * toScreenSpaceLen;
-            const float_t arcLenForT1 = arcLen + scrSpcOffsetToArcLen1;
-            float_t t1 = arcLenCalc.calcArcLenInverse(curve, minT, maxT, arcLenForT1, AccuracyThresholdT, t);
-            t1 = clamp(t1, minDrawT, maxDrawT);
-
-            ret = float_t2(t0, t1);
-        }
-        else
-        {
-            t = clamp(t, minDrawT, maxDrawT);
-            ret = float_t2(t, t);
-        }
-
-        return ret;
-    }
-
-    LineStyle style;
-    CurveType curve;
-    typename CurveType::ArcLengthCalculator arcLenCalc;
-    float phaseShift;
-    float stretch;
-    float worldToScreenRatio;
-    // precomp value for non uniform stretching
-    float rigidSegmentStart;
-    float rigidSegmentEnd;
-    float rigidSegmentLen;
-    float nonRigidSegmentStretchValue;
-};
-
-typedef StyleClipper< nbl::hlsl::shapes::Quadratic<float> > BezierStyleClipper;
-typedef StyleClipper< nbl::hlsl::shapes::Line<float> > LineStyleClipper;
-
-template<typename float_t>
-struct DefaultClipper
-{
-    using float_t2 = vector<float_t, 2>;
-    NBL_CONSTEXPR_STATIC_INLINE float_t AccuracyThresholdT = 0.0;
-
-    static DefaultClipper construct()
-    {
-        DefaultClipper ret;
-        return ret;
-    }
-
-    inline float_t2 operator()(const float_t t)
-    {
-        const float_t ret = clamp(t, 0.0, 1.0);
-        return float_t2(ret, ret);
-    }
-};
-
-template<typename CurveType, typename Clipper = DefaultClipper<typename CurveType::scalar_t> >
-struct ClippedSignedDistance
-{
-    using float_t = typename CurveType::scalar_t;
-    using float_t2 = typename CurveType::float_t2;
-    using float_t3 = typename CurveType::float_t3;
-
-    const static float_t sdf(CurveType curve, float_t2 pos, float_t thickness, bool isRoadStyle, Clipper clipper = DefaultClipper<typename CurveType::scalar_t>::construct())
-    {
-        typename CurveType::Candidates candidates = curve.getClosestCandidates(pos);
-
-        const float_t InvalidT = nbl::hlsl::numeric_limits<float32_t>::max;
-        // TODO: Fix and test, we're not working with squared distance anymore
-        const float_t MAX_DISTANCE_SQUARED = (thickness + 1.0f) * (thickness + 1.0f); // TODO: ' + 1' is too much?
-
-        bool clipped = false;
-        float_t closestDistanceSquared = MAX_DISTANCE_SQUARED;
-        float_t closestT = InvalidT;
-        [[unroll(CurveType::MaxCandidates)]]
-        for (uint32_t i = 0; i < CurveType::MaxCandidates; i++)
-        {
-            const float_t candidateDistanceSquared = length(curve.evaluate(candidates[i]) - pos);
-            if (candidateDistanceSquared < closestDistanceSquared)
-            {
-                float_t2 snappedTs = clipper(candidates[i]);
-
-                if (snappedTs[0] == InvalidT)
-                {
-                    continue;
-                }
-
-                if (snappedTs[0] != candidates[i])
-                {
-                    // left snapped or clamped
-                    const float_t leftSnappedCandidateDistanceSquared = length(curve.evaluate(snappedTs[0]) - pos);
-                    if (leftSnappedCandidateDistanceSquared < closestDistanceSquared)
-                    {
-                        clipped = true;
-                        closestT = snappedTs[0];
-                        closestDistanceSquared = leftSnappedCandidateDistanceSquared;
-                    }
-
-                    if (snappedTs[0] != snappedTs[1])
-                    {
-                        // right snapped or clamped
-                        const float_t rightSnappedCandidateDistanceSquared = length(curve.evaluate(snappedTs[1]) - pos);
-                        if (rightSnappedCandidateDistanceSquared < closestDistanceSquared)
-                        {
-                            clipped = true;
-                            closestT = snappedTs[1];
-                            closestDistanceSquared = rightSnappedCandidateDistanceSquared;
-                        }
-                    }
-                }
-                else
-                {
-                    // no snapping
-                    if (candidateDistanceSquared < closestDistanceSquared)
-                    {
-                        clipped = false;
-                        closestT = candidates[i];
-                        closestDistanceSquared = candidateDistanceSquared;
-                    }
-                }
-            }
-        }
-
-
-        float_t roundedDistance = closestDistanceSquared - thickness;
-        if (!isRoadStyle)
-        {
-            return roundedDistance;
-        }
-        else
-        {
-            const float_t aaWidth = globals.antiAliasingFactor;
-            float_t rectCappedDistance = roundedDistance;
-
-            if (clipped)
-            {
-                float_t2 q = mul(curve.getLocalCoordinateSpace(closestT), pos - curve.evaluate(closestT));
-                rectCappedDistance = capSquare(q, thickness, aaWidth);
-            }
-
-            return rectCappedDistance;
-        }
-    }
-
-    static float capSquare(float_t2 q, float_t th, float_t aaWidth)
-    {
-        float_t2 d = abs(q) - float_t2(aaWidth, th);
-        return length(max(d, 0.0)) + min(max(d.x, d.y), 0.0);
-    }
-};
+#include "line_style.hlsl"
 
 namespace dtm
 {
@@ -316,7 +23,6 @@ float dot2(in float2 vec)
     return dot(vec, vec);
 }
 
-// TODO: Later move these functions and structs to dtmSettings.hlsl and a namespace like dtmSettings::height_shading or dtmSettings::contours, etc..
 struct HeightSegmentTransitionData
 {
     float currentHeight;
diff --git a/62_CAD/shaders/main_pipeline/line_style.hlsl b/62_CAD/shaders/main_pipeline/line_style.hlsl
new file mode 100644
index 000000000..f50127667
--- /dev/null
+++ b/62_CAD/shaders/main_pipeline/line_style.hlsl
@@ -0,0 +1,297 @@
+#ifndef _CAD_EXAMPLE_LINE_STYLE_HLSL_INCLUDED_
+#define _CAD_EXAMPLE_LINE_STYLE_HLSL_INCLUDED_
+
+#include <nbl/builtin/hlsl/shapes/line.hlsl>
+#include <nbl/builtin/hlsl/algorithm.hlsl>
+
+// for usage in upper_bound function
+struct StyleAccessor
+{
+    LineStyle style;
+    using value_type = float;
+
+    float operator[](const uint32_t ix)
+    {
+        return style.getStippleValue(ix);
+    }
+};
+
+template<typename CurveType>
+struct StyleClipper
+{
+    using float_t = typename CurveType::scalar_t;
+    using float_t2 = typename CurveType::float_t2;
+    using float_t3 = typename CurveType::float_t3;
+    NBL_CONSTEXPR_STATIC_INLINE float_t AccuracyThresholdT = 0.000001;
+
+    static StyleClipper<CurveType> construct(
+        LineStyle style,
+        CurveType curve,
+        typename CurveType::ArcLengthCalculator arcLenCalc,
+        float phaseShift,
+        float stretch,
+        float worldToScreenRatio)
+    {
+        StyleClipper<CurveType> ret = { style, curve, arcLenCalc, phaseShift, stretch, worldToScreenRatio, 0.0f, 0.0f, 0.0f, 0.0f };
+
+        // values for non-uniform stretching with a rigid segment
+        if (style.rigidSegmentIdx != InvalidRigidSegmentIndex && stretch != 1.0f)
+        {
+            // rigidSegment info in old non stretched pattern
+            ret.rigidSegmentStart = (style.rigidSegmentIdx >= 1u) ? style.getStippleValue(style.rigidSegmentIdx - 1u) : 0.0f;
+            ret.rigidSegmentEnd = (style.rigidSegmentIdx < style.stipplePatternSize) ? style.getStippleValue(style.rigidSegmentIdx) : 1.0f;
+            ret.rigidSegmentLen = ret.rigidSegmentEnd - ret.rigidSegmentStart;
+            // stretch value for non rigid segments
+            ret.nonRigidSegmentStretchValue = (stretch - ret.rigidSegmentLen) / (1.0f - ret.rigidSegmentLen);
+            // rigidSegment info to new stretched pattern
+            ret.rigidSegmentStart *= ret.nonRigidSegmentStretchValue / stretch; // get the new normalized rigid segment start
+            ret.rigidSegmentLen /= stretch; // get the new rigid segment normalized len
+            ret.rigidSegmentEnd = ret.rigidSegmentStart + ret.rigidSegmentLen; // get the new normalized rigid segment end 
+        }
+        else
+        {
+            ret.nonRigidSegmentStretchValue = stretch;
+        }
+
+        return ret;
+    }
+
+    // For non-uniform stretching with a rigid segment (the one segement that shouldn't stretch) the whole pattern changes
+    // instead of transforming each of the style.stipplePattern values (max 14 of them), we transform the normalized place in pattern
+    float getRealNormalizedPlaceInPattern(float normalizedPlaceInPattern)
+    {
+        if (style.rigidSegmentIdx != InvalidRigidSegmentIndex && stretch != 1.0f)
+        {
+            float ret = min(normalizedPlaceInPattern, rigidSegmentStart) / nonRigidSegmentStretchValue; // unstretch parts before rigid segment
+            ret += max(normalizedPlaceInPattern - rigidSegmentEnd, 0.0f) / nonRigidSegmentStretchValue; // unstretch parts after rigid segment
+            ret += max(min(rigidSegmentLen, normalizedPlaceInPattern - rigidSegmentStart), 0.0f); // unstretch parts inside rigid segment
+            ret *= stretch;
+            return ret;
+        }
+        else
+        {
+            return normalizedPlaceInPattern;
+        }
+    }
+
+    float_t2 operator()(float_t t)
+    {
+        // basicaly 0.0 and 1.0 but with a guardband to discard outside the range
+        const float_t minT = 0.0 - 1.0;
+        const float_t maxT = 1.0 + 1.0;
+
+        StyleAccessor styleAccessor = { style };
+        const float_t reciprocalStretchedStipplePatternLen = style.reciprocalStipplePatternLen / stretch;
+        const float_t patternLenInScreenSpace = 1.0 / (worldToScreenRatio * style.reciprocalStipplePatternLen);
+
+        const float_t arcLen = arcLenCalc.calcArcLen(t);
+        const float_t worldSpaceArcLen = arcLen * float_t(worldToScreenRatio);
+        float_t normalizedPlaceInPattern = frac(worldSpaceArcLen * reciprocalStretchedStipplePatternLen + phaseShift);
+        normalizedPlaceInPattern = getRealNormalizedPlaceInPattern(normalizedPlaceInPattern);
+        uint32_t patternIdx = nbl::hlsl::upper_bound(styleAccessor, 0, style.stipplePatternSize, normalizedPlaceInPattern);
+
+        const float_t InvalidT = nbl::hlsl::numeric_limits<float32_t>::infinity;
+        float_t2 ret = float_t2(InvalidT, InvalidT);
+
+        // odd patternIdx means a "no draw section" and current candidate should split into two nearest draw sections
+        const bool notInDrawSection = patternIdx & 0x1;
+
+        // TODO[Erfan]: Disable this piece of code after clipping, and comment the reason, that the bezier start and end at 0.0 and 1.0 should be in drawable sections
+        float_t minDrawT = 0.0;
+        float_t maxDrawT = 1.0;
+        {
+            float_t normalizedPlaceInPatternBegin = frac(phaseShift);
+            normalizedPlaceInPatternBegin = getRealNormalizedPlaceInPattern(normalizedPlaceInPatternBegin);
+            uint32_t patternIdxBegin = nbl::hlsl::upper_bound(styleAccessor, 0, style.stipplePatternSize, normalizedPlaceInPatternBegin);
+            const bool BeginInNonDrawSection = patternIdxBegin & 0x1;
+
+            if (BeginInNonDrawSection)
+            {
+                float_t diffToRightDrawableSection = (patternIdxBegin == style.stipplePatternSize) ? 1.0 : styleAccessor[patternIdxBegin];
+                diffToRightDrawableSection -= normalizedPlaceInPatternBegin;
+                float_t scrSpcOffsetToArcLen1 = diffToRightDrawableSection * patternLenInScreenSpace * ((patternIdxBegin != style.rigidSegmentIdx) ? nonRigidSegmentStretchValue : 1.0);
+                const float_t arcLenForT1 = 0.0 + scrSpcOffsetToArcLen1;
+                minDrawT = arcLenCalc.calcArcLenInverse(curve, minT, maxT, arcLenForT1, AccuracyThresholdT, 0.0);
+            }
+
+            // Completely in non-draw section -> clip away:
+            if (minDrawT >= 1.0)
+                return ret;
+
+            const float_t arcLenEnd = arcLenCalc.calcArcLen(1.0);
+            const float_t worldSpaceArcLenEnd = arcLenEnd * float_t(worldToScreenRatio);
+            float_t normalizedPlaceInPatternEnd = frac(worldSpaceArcLenEnd * reciprocalStretchedStipplePatternLen + phaseShift);
+            normalizedPlaceInPatternEnd = getRealNormalizedPlaceInPattern(normalizedPlaceInPatternEnd);
+            uint32_t patternIdxEnd = nbl::hlsl::upper_bound(styleAccessor, 0, style.stipplePatternSize, normalizedPlaceInPatternEnd);
+            const bool EndInNonDrawSection = patternIdxEnd & 0x1;
+
+            if (EndInNonDrawSection)
+            {
+                float_t diffToLeftDrawableSection = (patternIdxEnd == 0) ? 0.0 : styleAccessor[patternIdxEnd - 1];
+                diffToLeftDrawableSection -= normalizedPlaceInPatternEnd;
+                float_t scrSpcOffsetToArcLen0 = diffToLeftDrawableSection * patternLenInScreenSpace * ((patternIdxEnd != style.rigidSegmentIdx) ? nonRigidSegmentStretchValue : 1.0);
+                const float_t arcLenForT0 = arcLenEnd + scrSpcOffsetToArcLen0;
+                maxDrawT = arcLenCalc.calcArcLenInverse(curve, minT, maxT, arcLenForT0, AccuracyThresholdT, 1.0);
+            }
+        }
+
+        if (notInDrawSection)
+        {
+            float toScreenSpaceLen = patternLenInScreenSpace * ((patternIdx != style.rigidSegmentIdx) ? nonRigidSegmentStretchValue : 1.0);
+
+            float_t diffToLeftDrawableSection = (patternIdx == 0) ? 0.0 : styleAccessor[patternIdx - 1];
+            diffToLeftDrawableSection -= normalizedPlaceInPattern;
+            float_t scrSpcOffsetToArcLen0 = diffToLeftDrawableSection * toScreenSpaceLen;
+            const float_t arcLenForT0 = arcLen + scrSpcOffsetToArcLen0;
+            float_t t0 = arcLenCalc.calcArcLenInverse(curve, minT, maxT, arcLenForT0, AccuracyThresholdT, t);
+            t0 = clamp(t0, minDrawT, maxDrawT);
+
+            float_t diffToRightDrawableSection = (patternIdx == style.stipplePatternSize) ? 1.0 : styleAccessor[patternIdx];
+            diffToRightDrawableSection -= normalizedPlaceInPattern;
+            float_t scrSpcOffsetToArcLen1 = diffToRightDrawableSection * toScreenSpaceLen;
+            const float_t arcLenForT1 = arcLen + scrSpcOffsetToArcLen1;
+            float_t t1 = arcLenCalc.calcArcLenInverse(curve, minT, maxT, arcLenForT1, AccuracyThresholdT, t);
+            t1 = clamp(t1, minDrawT, maxDrawT);
+
+            ret = float_t2(t0, t1);
+        }
+        else
+        {
+            t = clamp(t, minDrawT, maxDrawT);
+            ret = float_t2(t, t);
+        }
+
+        return ret;
+    }
+
+    LineStyle style;
+    CurveType curve;
+    typename CurveType::ArcLengthCalculator arcLenCalc;
+    float phaseShift;
+    float stretch;
+    float worldToScreenRatio;
+    // precomp value for non uniform stretching
+    float rigidSegmentStart;
+    float rigidSegmentEnd;
+    float rigidSegmentLen;
+    float nonRigidSegmentStretchValue;
+};
+
+typedef StyleClipper< nbl::hlsl::shapes::Quadratic<float> > BezierStyleClipper;
+typedef StyleClipper< nbl::hlsl::shapes::Line<float> > LineStyleClipper;
+
+template<typename float_t>
+struct DefaultClipper
+{
+    using float_t2 = vector<float_t, 2>;
+    NBL_CONSTEXPR_STATIC_INLINE float_t AccuracyThresholdT = 0.0;
+
+    static DefaultClipper construct()
+    {
+        DefaultClipper ret;
+        return ret;
+    }
+
+    inline float_t2 operator()(const float_t t)
+    {
+        const float_t ret = clamp(t, 0.0, 1.0);
+        return float_t2(ret, ret);
+    }
+};
+
+template<typename CurveType, typename Clipper = DefaultClipper<typename CurveType::scalar_t> >
+struct ClippedSignedDistance
+{
+    using float_t = typename CurveType::scalar_t;
+    using float_t2 = typename CurveType::float_t2;
+    using float_t3 = typename CurveType::float_t3;
+
+    const static float_t sdf(CurveType curve, float_t2 pos, float_t thickness, bool isRoadStyle, Clipper clipper = DefaultClipper<typename CurveType::scalar_t>::construct())
+    {
+        typename CurveType::Candidates candidates = curve.getClosestCandidates(pos);
+
+        const float_t InvalidT = nbl::hlsl::numeric_limits<float32_t>::max;
+        // TODO: Fix and test, we're not working with squared distance anymore
+        const float_t MAX_DISTANCE_SQUARED = (thickness + 1.0f) * (thickness + 1.0f); // TODO: ' + 1' is too much?
+
+        bool clipped = false;
+        float_t closestDistanceSquared = MAX_DISTANCE_SQUARED;
+        float_t closestT = InvalidT;
+        [[unroll(CurveType::MaxCandidates)]]
+        for (uint32_t i = 0; i < CurveType::MaxCandidates; i++)
+        {
+            const float_t candidateDistanceSquared = length(curve.evaluate(candidates[i]) - pos);
+            if (candidateDistanceSquared < closestDistanceSquared)
+            {
+                float_t2 snappedTs = clipper(candidates[i]);
+
+                if (snappedTs[0] == InvalidT)
+                {
+                    continue;
+                }
+
+                if (snappedTs[0] != candidates[i])
+                {
+                    // left snapped or clamped
+                    const float_t leftSnappedCandidateDistanceSquared = length(curve.evaluate(snappedTs[0]) - pos);
+                    if (leftSnappedCandidateDistanceSquared < closestDistanceSquared)
+                    {
+                        clipped = true;
+                        closestT = snappedTs[0];
+                        closestDistanceSquared = leftSnappedCandidateDistanceSquared;
+                    }
+
+                    if (snappedTs[0] != snappedTs[1])
+                    {
+                        // right snapped or clamped
+                        const float_t rightSnappedCandidateDistanceSquared = length(curve.evaluate(snappedTs[1]) - pos);
+                        if (rightSnappedCandidateDistanceSquared < closestDistanceSquared)
+                        {
+                            clipped = true;
+                            closestT = snappedTs[1];
+                            closestDistanceSquared = rightSnappedCandidateDistanceSquared;
+                        }
+                    }
+                }
+                else
+                {
+                    // no snapping
+                    if (candidateDistanceSquared < closestDistanceSquared)
+                    {
+                        clipped = false;
+                        closestT = candidates[i];
+                        closestDistanceSquared = candidateDistanceSquared;
+                    }
+                }
+            }
+        }
+
+
+        float_t roundedDistance = closestDistanceSquared - thickness;
+        if (!isRoadStyle)
+        {
+            return roundedDistance;
+        }
+        else
+        {
+            const float_t aaWidth = globals.antiAliasingFactor;
+            float_t rectCappedDistance = roundedDistance;
+
+            if (clipped)
+            {
+                float_t2 q = mul(curve.getLocalCoordinateSpace(closestT), pos - curve.evaluate(closestT));
+                rectCappedDistance = capSquare(q, thickness, aaWidth);
+            }
+
+            return rectCappedDistance;
+        }
+    }
+
+    static float capSquare(float_t2 q, float_t th, float_t aaWidth)
+    {
+        float_t2 d = abs(q) - float_t2(aaWidth, th);
+        return length(max(d, 0.0)) + min(max(d.x, d.y), 0.0);
+    }
+};
+
+#endif
\ No newline at end of file

From 9ddaa810cf0bfc19130f66de25f43b1be3ad8092 Mon Sep 17 00:00:00 2001
From: Erfan Ahmadi <ahmadierfan99@gmail.com>
Date: Fri, 25 Apr 2025 12:05:23 +0330
Subject: [PATCH 204/529] emulated float64 bug repro

---
 62_CAD/main.cpp | 82 ++++++++++++++++++++++++++++++-------------------
 1 file changed, 51 insertions(+), 31 deletions(-)

diff --git a/62_CAD/main.cpp b/62_CAD/main.cpp
index 6ca03d9d6..e901d07c3 100644
--- a/62_CAD/main.cpp
+++ b/62_CAD/main.cpp
@@ -58,6 +58,7 @@ enum class ExampleMode
 	CASE_7, // Images
 	CASE_8, // MSDF and Text
 	CASE_9, // DTM
+	CASE_BUG, // Bug Repro 
 	CASE_COUNT
 };
 
@@ -72,10 +73,11 @@ constexpr std::array<float, (uint32_t)ExampleMode::CASE_COUNT> cameraExtents =
 	10.0,	// CASE_6
 	10.0,	// CASE_7
 	600.0,	// CASE_8
-	600.0	// CASE_9
+	600.0,	// CASE_9
+	10.0	// CASE_BUG
 };
 
-constexpr ExampleMode mode = ExampleMode::CASE_9;
+constexpr ExampleMode mode = ExampleMode::CASE_BUG;
 
 class Camera2D
 {
@@ -3239,28 +3241,12 @@ class ComputerAidedDesign final : public examples::SimpleWindowedApplication, pu
 				0, 1, 2
 			};*/
 
-			// HOURGLASS
-			/*core::vector<TriangleMeshVertex> vertices = {
-				{ float64_t2(0.0, 0.0), 10.0 },
-				{ float64_t2(-200.0, -200.0), 90.0 },
-				{ float64_t2(200.0, -200.0), 80.0 },
-
-				{ float64_t2(0.0, 0.0), 10.0 },
-				{ float64_t2(200.0, 200.0), 90.0 },
-				{ float64_t2(-200.0, 200.0), 80.0 },
-			};
-
-			core::vector<uint32_t> indices = {
-				0, 1, 2,
-				3, 4, 5
-			};*/
-
 			CTriangleMesh mesh;
 			mesh.setVertices(std::move(vertices));
 			mesh.setIndices(std::move(indices));
 
 			DTMSettingsInfo dtmInfo{};
-			dtmInfo.mode |= E_DTM_MODE::OUTLINE;
+			//dtmInfo.mode |= E_DTM_MODE::OUTLINE;
 			dtmInfo.mode |= E_DTM_MODE::HEIGHT_SHADING;
 			dtmInfo.mode |= E_DTM_MODE::CONTOUR;
 
@@ -3276,7 +3262,7 @@ class ComputerAidedDesign final : public examples::SimpleWindowedApplication, pu
 			dtmInfo.contourSettings[0u].heightInterval = 10;
 			dtmInfo.contourSettings[0u].lineStyleInfo.screenSpaceLineWidth = 0.0f;
 			dtmInfo.contourSettings[0u].lineStyleInfo.worldSpaceLineWidth = 1.0f;
-			dtmInfo.contourSettings[0u].lineStyleInfo.color = float32_t4(0.0f, 0.0f, 1.0f, 1.0f);
+			dtmInfo.contourSettings[0u].lineStyleInfo.color = float32_t4(0.0f, 0.0f, 1.0f, 0.7f);
 			std::array<double, 4> contourStipplePattern = { 0.0f, -5.0f, 10.0f, -5.0f };
 			dtmInfo.contourSettings[0u].lineStyleInfo.setStipplePatternData(contourStipplePattern);
 
@@ -3298,7 +3284,7 @@ class ComputerAidedDesign final : public examples::SimpleWindowedApplication, pu
 
 					dtmInfo.heightShadingInfo.addHeightColorMapEntry(-10.0f, float32_t4(0.5f, 1.0f, 1.0f, 1.0f));
 					dtmInfo.heightShadingInfo.addHeightColorMapEntry(20.0f, float32_t4(0.0f, 1.0f, 0.0f, 1.0f));
-					dtmInfo.heightShadingInfo.addHeightColorMapEntry(25.0f, float32_t4(1.0f, 1.0f, 0.0f, 1.0f));
+					dtmInfo.heightShadingInfo.addHeightColorMapEntry(25.0f, float32_t4(1.0f, 1.0f, 0.0f, animatedAlpha));
 					dtmInfo.heightShadingInfo.addHeightColorMapEntry(70.0f, float32_t4(1.0f, 0.0f, 0.0f, 1.0f));
 					dtmInfo.heightShadingInfo.addHeightColorMapEntry(90.0f, float32_t4(1.0f, 0.0f, 0.0f, 1.0f));
 
@@ -3310,22 +3296,22 @@ class ComputerAidedDesign final : public examples::SimpleWindowedApplication, pu
 					dtmInfo.heightShadingInfo.intervalIndexToHeightMultiplier = dtmInfo.heightShadingInfo.intervalLength;
 					dtmInfo.heightShadingInfo.isCenteredShading = false;
 					dtmInfo.heightShadingInfo.heightShadingMode = E_HEIGHT_SHADING_MODE::DISCRETE_FIXED_LENGTH_INTERVALS;
-					dtmInfo.heightShadingInfo.addHeightColorMapEntry(0.0f, float32_t4(0.0f, 0.0f, 1.0f, 1.0f));
-					dtmInfo.heightShadingInfo.addHeightColorMapEntry(25.0f, float32_t4(0.0f, 1.0f, 1.0f, 1.0f));
-					dtmInfo.heightShadingInfo.addHeightColorMapEntry(50.0f, float32_t4(0.0f, 1.0f, 0.0f, 1.0f));
-					dtmInfo.heightShadingInfo.addHeightColorMapEntry(75.0f, float32_t4(1.0f, 1.0f, 0.0f, 1.0f));
-					dtmInfo.heightShadingInfo.addHeightColorMapEntry(100.0f, float32_t4(1.0f, 0.0f, 0.0f, 1.0f));
+					dtmInfo.heightShadingInfo.addHeightColorMapEntry(0.0f, float32_t4(0.0f, 0.0f, 1.0f, animatedAlpha));
+					dtmInfo.heightShadingInfo.addHeightColorMapEntry(25.0f, float32_t4(0.0f, 1.0f, 1.0f, animatedAlpha));
+					dtmInfo.heightShadingInfo.addHeightColorMapEntry(50.0f, float32_t4(0.0f, 1.0f, 0.0f, animatedAlpha));
+					dtmInfo.heightShadingInfo.addHeightColorMapEntry(75.0f, float32_t4(1.0f, 1.0f, 0.0f, animatedAlpha));
+					dtmInfo.heightShadingInfo.addHeightColorMapEntry(100.0f, float32_t4(1.0f, 0.0f, 0.0f, animatedAlpha));
 
 					break;
 				}
 				case E_HEIGHT_SHADING_MODE::CONTINOUS_INTERVALS:
 				{
 					dtmInfo.heightShadingInfo.heightShadingMode = E_HEIGHT_SHADING_MODE::CONTINOUS_INTERVALS;
-					dtmInfo.heightShadingInfo.addHeightColorMapEntry(0.0f, float32_t4(0.0f, 0.0f, 1.0f, 1.0f));
-					dtmInfo.heightShadingInfo.addHeightColorMapEntry(25.0f, float32_t4(0.0f, 1.0f, 1.0f, 1.0f));
-					dtmInfo.heightShadingInfo.addHeightColorMapEntry(50.0f, float32_t4(0.0f, 1.0f, 0.0f, 1.0f));
-					dtmInfo.heightShadingInfo.addHeightColorMapEntry(75.0f, float32_t4(1.0f, 1.0f, 0.0f, 1.0f));
-					dtmInfo.heightShadingInfo.addHeightColorMapEntry(90.0f, float32_t4(1.0f, 0.0f, 0.0f, 1.0f));
+					dtmInfo.heightShadingInfo.addHeightColorMapEntry(0.0f, float32_t4(0.0f, 0.0f, 1.0f, animatedAlpha));
+					dtmInfo.heightShadingInfo.addHeightColorMapEntry(25.0f, float32_t4(0.0f, 1.0f, 1.0f, animatedAlpha));
+					dtmInfo.heightShadingInfo.addHeightColorMapEntry(50.0f, float32_t4(0.0f, 1.0f, 0.0f, animatedAlpha));
+					dtmInfo.heightShadingInfo.addHeightColorMapEntry(75.0f, float32_t4(1.0f, 1.0f, 0.0f, animatedAlpha));
+					dtmInfo.heightShadingInfo.addHeightColorMapEntry(90.0f, float32_t4(1.0f, 0.0f, 0.0f, animatedAlpha));
 
 					break;
 				}
@@ -3343,6 +3329,40 @@ class ComputerAidedDesign final : public examples::SimpleWindowedApplication, pu
 
 			drawResourcesFiller.drawTriangleMesh(mesh, dtmInfo, intendedNextSubmit);
 		}
+		else if (mode == ExampleMode::CASE_BUG)
+		{
+			CPolyline polyline;
+			
+			LineStyleInfo style = {};
+			style.screenSpaceLineWidth = 1.0f;
+			style.worldSpaceLineWidth = 0.0f;
+			style.color = float32_t4(0.619f, 0.325f, 0.709f, 0.5f);
+
+			for (uint32_t i = 0; i < 128u; ++i)
+			{
+				std::vector<shapes::QuadraticBezier<double>> quadBeziers;
+				curves::EllipticalArcInfo myCircle;
+				{
+					myCircle.majorAxis = { 0.05 , 0.0};
+					myCircle.center = { 0.0 + i * 0.1, i * 0.1 };
+					myCircle.angleBounds = {
+						nbl::core::PI<double>() * 0.0,
+						nbl::core::PI<double>() * 2.0
+					};
+					myCircle.eccentricity = 1.0;
+				}
+
+				curves::Subdivision::AddBezierFunc addToBezier = [&](shapes::QuadraticBezier<double>&& info) -> void
+					{
+						quadBeziers.push_back(info);
+					};
+
+				curves::Subdivision::adaptive(myCircle, 1e-5, addToBezier, 10u);
+				polyline.addQuadBeziers(quadBeziers);
+				drawResourcesFiller.drawPolyline(polyline, style, intendedNextSubmit);
+				polyline.clearEverything();
+			}
+		}
 
 		drawResourcesFiller.finalizeAllCopiesToGPU(intendedNextSubmit);
 	}

From 6b57674651f5eb057d1c632d45122d455a7a48c1 Mon Sep 17 00:00:00 2001
From: keptsecret <sorchon@gmail.com>
Date: Fri, 25 Apr 2025 16:53:14 +0700
Subject: [PATCH 205/529] refactor to load data as vectors, consecutive uints

---
 .../app_resources/benchmarkSubgroup.comp.hlsl | 18 +++-----
 .../app_resources/shaderCommon.hlsl           | 45 ++++++-------------
 .../app_resources/testSubgroup.comp.hlsl      |  4 +-
 3 files changed, 20 insertions(+), 47 deletions(-)

diff --git a/73_ArithmeticBench/app_resources/benchmarkSubgroup.comp.hlsl b/73_ArithmeticBench/app_resources/benchmarkSubgroup.comp.hlsl
index 3dd24e432..2f575d39a 100644
--- a/73_ArithmeticBench/app_resources/benchmarkSubgroup.comp.hlsl
+++ b/73_ArithmeticBench/app_resources/benchmarkSubgroup.comp.hlsl
@@ -7,9 +7,9 @@
 // NOTE added dummy output image to be able to profile with Nsight, which still doesn't support profiling headless compute shaders
 [[vk::binding(2, 0)]] RWTexture2D<float32_t4> outImage; // dummy
 
-uint32_t globalFirstItemIndex(uint32_t itemIdx)
+uint32_t globalIndex()
 {
-    return nbl::hlsl::glsl::gl_WorkGroupID().x*WORKGROUP_SIZE*ITEMS_PER_INVOCATION+((nbl::hlsl::glsl::gl_SubgroupID()*ITEMS_PER_INVOCATION+itemIdx)<<SUBGROUP_SIZE_LOG2);
+    return nbl::hlsl::glsl::gl_WorkGroupID().x*WORKGROUP_SIZE+nbl::hlsl::workgroup::SubgroupContiguousIndex();
 }
 
 bool canStore() {return true;}
@@ -18,7 +18,6 @@ bool canStore() {return true;}
 #error "Define NUM_LOOPS!"
 #endif
 
-
 template<template<class> class binop, typename T, uint32_t N>
 static void subbench(NBL_CONST_REF_ARG(type_t) sourceVal)
 {
@@ -31,20 +30,13 @@ static void subbench(NBL_CONST_REF_ARG(type_t) sourceVal)
     for (uint32_t i = 0; i < NUM_LOOPS; i++)
         value = func(value);
 
-    [unroll]
-    for (uint32_t i = 0; i < ITEMS_PER_INVOCATION; i++)
-        output[binop<T>::BindingIndex].template Store<uint32_t>(sizeof(uint32_t) + sizeof(uint32_t) * (globalFirstItemIndex(i) + nbl::hlsl::glsl::gl_SubgroupInvocationID()), value[i]);
+    output[binop<T>::BindingIndex].template Store<type_t>(sizeof(uint32_t) + sizeof(type_t) * globalIndex(), value);
 }
 
 void benchmark()
 {
-    const uint32_t idx = nbl::hlsl::glsl::gl_SubgroupInvocationID();
-    type_t sourceVal;
-    [unroll]
-    for (uint32_t i = 0; i < ITEMS_PER_INVOCATION; i++)
-    {
-        sourceVal[i] = inputValue[globalFirstItemIndex(i) + idx];
-    }
+    const uint32_t idx = globalIndex();
+    type_t sourceVal = inputValue[idx];
 
     subbench<bit_and, uint32_t, ITEMS_PER_INVOCATION>(sourceVal);
     subbench<bit_xor, uint32_t, ITEMS_PER_INVOCATION>(sourceVal);
diff --git a/73_ArithmeticBench/app_resources/shaderCommon.hlsl b/73_ArithmeticBench/app_resources/shaderCommon.hlsl
index f4fc9d23a..376f69579 100644
--- a/73_ArithmeticBench/app_resources/shaderCommon.hlsl
+++ b/73_ArithmeticBench/app_resources/shaderCommon.hlsl
@@ -1,7 +1,6 @@
 #include "common.hlsl"
 
 #include "nbl/builtin/hlsl/glsl_compat/core.hlsl"
-#include "nbl/builtin/hlsl/workgroup/basic.hlsl"
 #include "nbl/builtin/hlsl/subgroup/basic.hlsl"
 #include "nbl/builtin/hlsl/subgroup/arithmetic_portability.hlsl"
 #include "nbl/builtin/hlsl/subgroup2/arithmetic_portability.hlsl"
@@ -11,29 +10,21 @@
 // https://github.com/microsoft/DirectXShaderCompiler/issues/6144
 uint32_t3 nbl::hlsl::glsl::gl_WorkGroupSize() {return uint32_t3(WORKGROUP_SIZE,1,1);}
 
-// unfortunately DXC chokes on descriptors as static members
-// https://github.com/microsoft/DirectXShaderCompiler/issues/5940
-[[vk::binding(0, 0)]] StructuredBuffer<uint32_t> inputValue;
-[[vk::binding(1, 0)]] RWByteAddressBuffer output[8];
-
-// to get next item, move by subgroupSize
-uint32_t globalFirstItemIndex(uint32_t itemIdx);
-// since we test ITEMS_PER_WG<WorkgroupSize we need this so workgroups don't overwrite each other's outputs
-bool canStore();
-
 #ifndef ITEMS_PER_INVOCATION
 #error "Define ITEMS_PER_INVOCATION!"
 #endif
-//typedef decltype(inputValue[0]) type_t;
-//typedef uint32_t type_t;
-//typedef uint32_t4 type_t;
 
-// #if ITEMS_PER_INVOCATION > 1
 typedef vector<uint32_t, ITEMS_PER_INVOCATION> type_t;
-// #else
-// typedef uint32_t type_t;
-// #endif
 
+// unfortunately DXC chokes on descriptors as static members
+// https://github.com/microsoft/DirectXShaderCompiler/issues/5940
+[[vk::binding(0, 0)]] StructuredBuffer<type_t> inputValue;
+[[vk::binding(1, 0)]] RWByteAddressBuffer output[8];
+
+// because subgroups don't match `gl_LocalInvocationIndex` snake curve addressing, we also can't load inputs that way
+uint32_t globalIndex();
+// since we test ITEMS_PER_WG<WorkgroupSize we need this so workgroups don't overwrite each other's outputs
+bool canStore();
 
 #ifndef OPERATION
 #error "Define OPERATION!"
@@ -50,29 +41,19 @@ static void subtest(NBL_CONST_REF_ARG(type_t) sourceVal)
     using config_t = nbl::hlsl::subgroup2::Configuration<SUBGROUP_SIZE_LOG2>;
     using params_t = nbl::hlsl::subgroup2::ArithmeticParams<config_t, typename binop<T>::base_t, N, nbl::hlsl::jit::device_capabilities>;
 
-    if (nbl::hlsl::glsl::gl_WorkGroupID().x*WORKGROUP_SIZE+nbl::hlsl::workgroup::SubgroupContiguousIndex()==0u)
+    if (globalIndex()==0u)
         output[binop<T>::BindingIndex].template Store<uint32_t>(0,nbl::hlsl::glsl::gl_SubgroupSize());
         
     operation_t<params_t> func;
-    type_t value = func(sourceVal);
     if (canStore())
-    {
-        [unroll]
-        for (uint32_t i = 0; i < ITEMS_PER_INVOCATION; i++)
-            output[binop<T>::BindingIndex].template Store<uint32_t>(sizeof(uint32_t) + sizeof(uint32_t) * (globalFirstItemIndex(i) + nbl::hlsl::glsl::gl_SubgroupInvocationID()), value[i]);
-    }
+        output[binop<T>::BindingIndex].template Store<type_t>(sizeof(uint32_t)+sizeof(type_t)*globalIndex(),func(sourceVal));
 }
 
 
 type_t test()
 {
-    const uint32_t idx = nbl::hlsl::glsl::gl_SubgroupInvocationID();
-    type_t sourceVal;
-    [unroll]
-    for (uint32_t i = 0; i < ITEMS_PER_INVOCATION; i++)
-    {
-        sourceVal[i] = inputValue[globalFirstItemIndex(i) + idx];
-    }
+    const uint32_t idx = globalIndex();
+    type_t sourceVal = inputValue[idx];
 
     subtest<bit_and, uint32_t, ITEMS_PER_INVOCATION>(sourceVal);
     subtest<bit_xor, uint32_t, ITEMS_PER_INVOCATION>(sourceVal);
diff --git a/73_ArithmeticBench/app_resources/testSubgroup.comp.hlsl b/73_ArithmeticBench/app_resources/testSubgroup.comp.hlsl
index 0001d39e0..2cc1ccb60 100644
--- a/73_ArithmeticBench/app_resources/testSubgroup.comp.hlsl
+++ b/73_ArithmeticBench/app_resources/testSubgroup.comp.hlsl
@@ -4,9 +4,9 @@
 
 #include "shaderCommon.hlsl"
 
-uint32_t globalFirstItemIndex(uint32_t itemIdx)
+uint32_t globalIndex()
 {
-    return nbl::hlsl::glsl::gl_WorkGroupID().x*WORKGROUP_SIZE*ITEMS_PER_INVOCATION+((nbl::hlsl::glsl::gl_SubgroupID()*ITEMS_PER_INVOCATION+itemIdx)<<SUBGROUP_SIZE_LOG2);
+    return nbl::hlsl::glsl::gl_WorkGroupID().x*WORKGROUP_SIZE+nbl::hlsl::workgroup::SubgroupContiguousIndex();
 }
 
 bool canStore() {return true;}

From 7da1bec45fa13f9db9b3d1fa0b1542cc5d4a1a15 Mon Sep 17 00:00:00 2001
From: keptsecret <sorchon@gmail.com>
Date: Mon, 28 Apr 2025 10:54:19 +0700
Subject: [PATCH 206/529] initial wg scan test

---
 74a_Workgroup2ScanTest/CMakeLists.txt         |  25 +
 .../app_resources/common.hlsl                 |  96 ++++
 .../app_resources/shaderCommon.hlsl           |  55 +++
 .../app_resources/testSubgroup.comp.hlsl      |  18 +
 .../app_resources/testWorkgroup.comp.hlsl     | 107 ++++
 74a_Workgroup2ScanTest/config.json.template   |  28 ++
 74a_Workgroup2ScanTest/main.cpp               | 462 ++++++++++++++++++
 74a_Workgroup2ScanTest/pipeline.groovy        |  50 ++
 CMakeLists.txt                                |   1 +
 9 files changed, 842 insertions(+)
 create mode 100644 74a_Workgroup2ScanTest/CMakeLists.txt
 create mode 100644 74a_Workgroup2ScanTest/app_resources/common.hlsl
 create mode 100644 74a_Workgroup2ScanTest/app_resources/shaderCommon.hlsl
 create mode 100644 74a_Workgroup2ScanTest/app_resources/testSubgroup.comp.hlsl
 create mode 100644 74a_Workgroup2ScanTest/app_resources/testWorkgroup.comp.hlsl
 create mode 100644 74a_Workgroup2ScanTest/config.json.template
 create mode 100644 74a_Workgroup2ScanTest/main.cpp
 create mode 100644 74a_Workgroup2ScanTest/pipeline.groovy

diff --git a/74a_Workgroup2ScanTest/CMakeLists.txt b/74a_Workgroup2ScanTest/CMakeLists.txt
new file mode 100644
index 000000000..0724366c9
--- /dev/null
+++ b/74a_Workgroup2ScanTest/CMakeLists.txt
@@ -0,0 +1,25 @@
+
+include(common RESULT_VARIABLE RES)
+if(NOT RES)
+	message(FATAL_ERROR "common.cmake not found. Should be in {repo_root}/cmake directory")
+endif()
+
+nbl_create_executable_project("" "" "" "" "${NBL_EXECUTABLE_PROJECT_CREATION_PCH_TARGET}")
+
+if(NBL_EMBED_BUILTIN_RESOURCES)
+	set(_BR_TARGET_ ${EXECUTABLE_NAME}_builtinResourceData)
+	set(RESOURCE_DIR "app_resources")
+
+	get_filename_component(_SEARCH_DIRECTORIES_ "${CMAKE_CURRENT_SOURCE_DIR}" ABSOLUTE)
+	get_filename_component(_OUTPUT_DIRECTORY_SOURCE_ "${CMAKE_CURRENT_BINARY_DIR}/src" ABSOLUTE)
+	get_filename_component(_OUTPUT_DIRECTORY_HEADER_ "${CMAKE_CURRENT_BINARY_DIR}/include" ABSOLUTE)
+
+    file(GLOB_RECURSE BUILTIN_RESOURCE_FILES RELATIVE "${CMAKE_CURRENT_SOURCE_DIR}/${RESOURCE_DIR}" CONFIGURE_DEPENDS "${CMAKE_CURRENT_SOURCE_DIR}/${RESOURCE_DIR}/*")
+    foreach(RES_FILE ${BUILTIN_RESOURCE_FILES})
+      LIST_BUILTIN_RESOURCE(RESOURCES_TO_EMBED "${RES_FILE}")
+    endforeach()
+
+	ADD_CUSTOM_BUILTIN_RESOURCES(${_BR_TARGET_} RESOURCES_TO_EMBED "${_SEARCH_DIRECTORIES_}" "${RESOURCE_DIR}" "nbl::this_example::builtin" "${_OUTPUT_DIRECTORY_HEADER_}" "${_OUTPUT_DIRECTORY_SOURCE_}")
+
+	LINK_BUILTIN_RESOURCES_TO_TARGET(${EXECUTABLE_NAME} ${_BR_TARGET_})
+endif()
\ No newline at end of file
diff --git a/74a_Workgroup2ScanTest/app_resources/common.hlsl b/74a_Workgroup2ScanTest/app_resources/common.hlsl
new file mode 100644
index 000000000..10892a2b9
--- /dev/null
+++ b/74a_Workgroup2ScanTest/app_resources/common.hlsl
@@ -0,0 +1,96 @@
+#include "nbl/builtin/hlsl/cpp_compat.hlsl"
+#include "nbl/builtin/hlsl/functional.hlsl"
+
+template<uint32_t kScanElementCount=1024*1024>
+struct Output
+{
+	NBL_CONSTEXPR_STATIC_INLINE uint32_t ScanElementCount = kScanElementCount;
+
+	uint32_t subgroupSize;
+	uint32_t data[ScanElementCount];
+};
+
+// Thanks to our unified HLSL/C++ STD lib we're able to remove a whole load of code
+template<typename T>
+struct bit_and : nbl::hlsl::bit_and<T>
+{
+	using base_t = nbl::hlsl::bit_and<T>;
+
+	NBL_CONSTEXPR_STATIC_INLINE uint16_t BindingIndex = 0;
+#ifndef __HLSL_VERSION
+	static inline constexpr const char* name = "bit_and";
+#endif
+};
+template<typename T>
+struct bit_or : nbl::hlsl::bit_or<T>
+{
+	using base_t = nbl::hlsl::bit_or<T>;
+
+	NBL_CONSTEXPR_STATIC_INLINE uint16_t BindingIndex = 1;
+#ifndef __HLSL_VERSION
+	static inline constexpr const char* name = "bit_xor";
+#endif
+};
+template<typename T>
+struct bit_xor : nbl::hlsl::bit_xor<T>
+{
+	using base_t = nbl::hlsl::bit_xor<T>;
+
+	NBL_CONSTEXPR_STATIC_INLINE uint16_t BindingIndex = 2;
+#ifndef __HLSL_VERSION
+	static inline constexpr const char* name = "bit_or";
+#endif
+};
+template<typename T>
+struct plus : nbl::hlsl::plus<T>
+{
+	using base_t = nbl::hlsl::plus<T>;
+
+	NBL_CONSTEXPR_STATIC_INLINE uint16_t BindingIndex = 3;
+#ifndef __HLSL_VERSION
+	static inline constexpr const char* name = "plus";
+#endif
+};
+template<typename T>
+struct multiplies : nbl::hlsl::multiplies<T>
+{
+	using base_t = nbl::hlsl::multiplies<T>;
+
+	NBL_CONSTEXPR_STATIC_INLINE uint16_t BindingIndex = 4;
+#ifndef __HLSL_VERSION
+	static inline constexpr const char* name = "multiplies";
+#endif
+};
+template<typename T>
+struct minimum : nbl::hlsl::minimum<T>
+{
+	using base_t = nbl::hlsl::minimum<T>;
+
+	NBL_CONSTEXPR_STATIC_INLINE uint16_t BindingIndex = 5;
+#ifndef __HLSL_VERSION
+	static inline constexpr const char* name = "minimum";
+#endif
+};
+template<typename T>
+struct maximum : nbl::hlsl::maximum<T>
+{
+	using base_t = nbl::hlsl::maximum<T>;
+
+	NBL_CONSTEXPR_STATIC_INLINE uint16_t BindingIndex = 6;
+#ifndef __HLSL_VERSION
+	static inline constexpr const char* name = "maximum";
+#endif
+};
+
+template<typename T>
+struct ballot : nbl::hlsl::plus<T>
+{
+	using base_t = nbl::hlsl::plus<T>;
+
+	NBL_CONSTEXPR_STATIC_INLINE uint16_t BindingIndex = 7;
+#ifndef __HLSL_VERSION
+	static inline constexpr const char* name = "bitcount";
+#endif
+};
+
+#include "nbl/builtin/hlsl/subgroup/basic.hlsl"
\ No newline at end of file
diff --git a/74a_Workgroup2ScanTest/app_resources/shaderCommon.hlsl b/74a_Workgroup2ScanTest/app_resources/shaderCommon.hlsl
new file mode 100644
index 000000000..13ee8d21e
--- /dev/null
+++ b/74a_Workgroup2ScanTest/app_resources/shaderCommon.hlsl
@@ -0,0 +1,55 @@
+#include "common.hlsl"
+
+#include "nbl/builtin/hlsl/glsl_compat/core.hlsl"
+#include "nbl/builtin/hlsl/subgroup/basic.hlsl"
+#include "nbl/builtin/hlsl/subgroup/arithmetic_portability.hlsl"
+
+#include "nbl/builtin/hlsl/jit/device_capabilities.hlsl"
+
+// https://github.com/microsoft/DirectXShaderCompiler/issues/6144
+uint32_t3 nbl::hlsl::glsl::gl_WorkGroupSize() {return uint32_t3(WORKGROUP_SIZE,1,1);}
+
+// unfortunately DXC chokes on descriptors as static members
+// https://github.com/microsoft/DirectXShaderCompiler/issues/5940
+[[vk::binding(0, 0)]] StructuredBuffer<uint32_t> inputValue;
+[[vk::binding(1, 0)]] RWByteAddressBuffer output[8];
+
+// because subgroups don't match `gl_LocalInvocationIndex` snake curve addressing, we also can't load inputs that way
+uint32_t globalIndex();
+// since we test ITEMS_PER_WG<WorkgroupSize we need this so workgroups don't overwrite each other's outputs
+bool canStore();
+
+//typedef decltype(inputValue[0]) type_t;
+typedef uint32_t type_t;
+
+
+#ifndef OPERATION
+#error "Define OPERATION!"
+#endif
+template<template<class> class binop>
+static void subtest(NBL_CONST_REF_ARG(type_t) sourceVal)
+{
+	if (globalIndex()==0u)
+		output[binop<type_t>::BindingIndex].template Store<uint32_t>(0,nbl::hlsl::glsl::gl_SubgroupSize());
+		
+	operation_t<typename binop<type_t>::base_t,nbl::hlsl::jit::device_capabilities> func;
+	if (canStore())
+		output[binop<type_t>::BindingIndex].template Store<type_t>(sizeof(uint32_t)+sizeof(type_t)*globalIndex(),func(sourceVal));
+}
+
+
+type_t test()
+{
+	const type_t sourceVal = inputValue[globalIndex()];
+
+	subtest<bit_and>(sourceVal);
+	subtest<bit_xor>(sourceVal);
+	subtest<bit_or>(sourceVal);
+	subtest<plus>(sourceVal);
+	subtest<multiplies>(sourceVal);
+	subtest<minimum>(sourceVal);
+	subtest<maximum>(sourceVal);
+	return sourceVal;
+}
+
+#include "nbl/builtin/hlsl/workgroup/basic.hlsl"
\ No newline at end of file
diff --git a/74a_Workgroup2ScanTest/app_resources/testSubgroup.comp.hlsl b/74a_Workgroup2ScanTest/app_resources/testSubgroup.comp.hlsl
new file mode 100644
index 000000000..479265d73
--- /dev/null
+++ b/74a_Workgroup2ScanTest/app_resources/testSubgroup.comp.hlsl
@@ -0,0 +1,18 @@
+#pragma shader_stage(compute)
+
+#define operation_t nbl::hlsl::OPERATION
+
+#include "shaderCommon.hlsl"
+
+uint32_t globalIndex()
+{
+	return nbl::hlsl::glsl::gl_WorkGroupID().x*WORKGROUP_SIZE+nbl::hlsl::workgroup::SubgroupContiguousIndex();
+}
+
+bool canStore() {return true;}
+
+[numthreads(WORKGROUP_SIZE,1,1)]
+void main()
+{
+	test();
+}
\ No newline at end of file
diff --git a/74a_Workgroup2ScanTest/app_resources/testWorkgroup.comp.hlsl b/74a_Workgroup2ScanTest/app_resources/testWorkgroup.comp.hlsl
new file mode 100644
index 000000000..9bafae47f
--- /dev/null
+++ b/74a_Workgroup2ScanTest/app_resources/testWorkgroup.comp.hlsl
@@ -0,0 +1,107 @@
+#pragma shader_stage(compute)
+
+
+#include "nbl/builtin/hlsl/workgroup/scratch_size.hlsl"
+
+static const uint32_t ArithmeticSz = nbl::hlsl::workgroup::scratch_size_arithmetic<ITEMS_PER_WG>::value;
+static const uint32_t BallotSz = nbl::hlsl::workgroup::scratch_size_ballot<ITEMS_PER_WG>::value;
+static const uint32_t ScratchSz = ArithmeticSz+BallotSz;
+
+// TODO: Can we make it a static variable in the ScratchProxy struct?
+groupshared uint32_t scratch[ScratchSz];
+
+
+#include "nbl/builtin/hlsl/workgroup/arithmetic.hlsl"
+
+
+template<uint16_t offset>
+struct ScratchProxy
+{
+	void get(const uint32_t ix, NBL_REF_ARG(uint32_t) value)
+	{
+		value = scratch[ix+offset];
+	}
+	void set(const uint32_t ix, const uint32_t value)
+	{
+		scratch[ix+offset] = value;
+	}
+
+	uint32_t atomicOr(const uint32_t ix, const uint32_t value)
+	{
+		return nbl::hlsl::glsl::atomicOr(scratch[ix],value);
+	}
+
+	void workgroupExecutionAndMemoryBarrier()
+	{
+		nbl::hlsl::glsl::barrier();
+		//nbl::hlsl::glsl::memoryBarrierShared(); implied by the above
+	}
+};
+
+static ScratchProxy<0> arithmeticAccessor;
+
+
+#include "nbl/builtin/hlsl/workgroup/broadcast.hlsl"
+
+
+template<class Binop, class device_capabilities>
+struct operation_t
+{
+	using type_t = typename Binop::type_t;
+
+	type_t operator()(type_t value)
+	{
+		type_t retval = nbl::hlsl::OPERATION<Binop,ITEMS_PER_WG,device_capabilities>::template __call<ScratchProxy<0> >(value,arithmeticAccessor);
+		// we barrier before because we alias the accessors for Binop
+		arithmeticAccessor.workgroupExecutionAndMemoryBarrier();
+		return retval;
+	}
+};
+
+
+#include "shaderCommon.hlsl"
+
+static ScratchProxy<ArithmeticSz> ballotAccessor;
+
+
+uint32_t globalIndex()
+{
+	return nbl::hlsl::glsl::gl_WorkGroupID().x*ITEMS_PER_WG+nbl::hlsl::workgroup::SubgroupContiguousIndex();
+}
+
+bool canStore()
+{
+	return nbl::hlsl::workgroup::SubgroupContiguousIndex()<ITEMS_PER_WG;
+}
+
+[numthreads(WORKGROUP_SIZE,1,1)]
+void main()
+{
+	const type_t sourceVal = test();
+	if (globalIndex()==0u)
+		output[ballot<type_t>::BindingIndex].template Store<uint32_t>(0,nbl::hlsl::glsl::gl_SubgroupSize());
+
+	// we can only ballot booleans, so low bit
+	nbl::hlsl::workgroup::ballot<ScratchProxy<ArithmeticSz> >(bool(sourceVal & 0x1u), ballotAccessor);
+	// need to barrier between ballot and usages of a ballot by myself
+	ballotAccessor.workgroupExecutionAndMemoryBarrier();
+
+	uint32_t destVal = 0xdeadbeefu;
+#define CONSTEXPR_OP_TYPE_TEST(IS_OP) nbl::hlsl::is_same<nbl::hlsl::OPERATION<nbl::hlsl::bit_xor<float>,0x45>,nbl::hlsl::workgroup::IS_OP<nbl::hlsl::bit_xor<float>,0x45> >::value
+#define BALLOT_TEMPLATE_ARGS ITEMS_PER_WG,decltype(ballotAccessor),decltype(arithmeticAccessor),nbl::hlsl::jit::device_capabilities
+	if (CONSTEXPR_OP_TYPE_TEST(reduction))
+		destVal = nbl::hlsl::workgroup::ballotBitCount<BALLOT_TEMPLATE_ARGS>(ballotAccessor,arithmeticAccessor);
+	else if (CONSTEXPR_OP_TYPE_TEST(inclusive_scan))
+		destVal = nbl::hlsl::workgroup::ballotInclusiveBitCount<BALLOT_TEMPLATE_ARGS>(ballotAccessor,arithmeticAccessor);
+	else if (CONSTEXPR_OP_TYPE_TEST(exclusive_scan))
+		destVal = nbl::hlsl::workgroup::ballotExclusiveBitCount<BALLOT_TEMPLATE_ARGS>(ballotAccessor,arithmeticAccessor);
+	else
+	{
+		assert(false);
+	}
+#undef BALLOT_TEMPLATE_ARGS
+#undef CONSTEXPR_OP_TYPE_TEST
+
+	if (canStore())
+		output[ballot<type_t>::BindingIndex].template Store<type_t>(sizeof(uint32_t)+sizeof(type_t)*globalIndex(),destVal);
+}
\ No newline at end of file
diff --git a/74a_Workgroup2ScanTest/config.json.template b/74a_Workgroup2ScanTest/config.json.template
new file mode 100644
index 000000000..f961745c1
--- /dev/null
+++ b/74a_Workgroup2ScanTest/config.json.template
@@ -0,0 +1,28 @@
+{
+  "enableParallelBuild": true,
+  "threadsPerBuildProcess" : 2,
+  "isExecuted": false,
+  "scriptPath": "",
+  "cmake": {
+    "configurations": [ "Release", "Debug", "RelWithDebInfo" ],
+    "buildModes": [],
+    "requiredOptions": []
+  }, 
+  "profiles": [
+    {
+      "backend": "vulkan",
+      "platform": "windows",
+      "buildModes": [],
+      "runConfiguration": "Release",
+      "gpuArchitectures": []
+    }
+  ],
+  "dependencies": [],
+  "data": [
+    {
+      "dependencies": [],
+      "command": [""],
+      "outputs": []
+    }
+  ]
+}
\ No newline at end of file
diff --git a/74a_Workgroup2ScanTest/main.cpp b/74a_Workgroup2ScanTest/main.cpp
new file mode 100644
index 000000000..147d231e2
--- /dev/null
+++ b/74a_Workgroup2ScanTest/main.cpp
@@ -0,0 +1,462 @@
+#include "nbl/application_templates/BasicMultiQueueApplication.hpp"
+#include "nbl/application_templates/MonoAssetManagerAndBuiltinResourceApplication.hpp"
+#include "app_resources/common.hlsl"
+
+using namespace nbl;
+using namespace core;
+using namespace asset;
+using namespace system;
+using namespace video;
+
+// method emulations on the CPU, to verify the results of the GPU methods
+template<class Binop>
+struct emulatedReduction
+{
+	using type_t = typename Binop::type_t;
+
+	static inline void impl(type_t* out, const type_t* in, const uint32_t itemCount)
+	{
+		const type_t red = std::reduce(in,in+itemCount,Binop::identity,Binop());
+		std::fill(out,out+itemCount,red);
+	}
+
+	static inline constexpr const char* name = "reduction";
+};
+template<class Binop>
+struct emulatedScanInclusive
+{
+	using type_t = typename Binop::type_t;
+
+	static inline void impl(type_t* out, const type_t* in, const uint32_t itemCount)
+	{
+		std::inclusive_scan(in,in+itemCount,out,Binop());
+	}
+	static inline constexpr const char* name = "inclusive_scan";
+};
+template<class Binop>
+struct emulatedScanExclusive
+{
+	using type_t = typename Binop::type_t;
+
+	static inline void impl(type_t* out, const type_t* in, const uint32_t itemCount)
+	{
+		std::exclusive_scan(in,in+itemCount,out,Binop::identity,Binop());
+	}
+	static inline constexpr const char* name = "exclusive_scan";
+};
+
+class ArithmeticUnitTestApp final : public application_templates::BasicMultiQueueApplication, public application_templates::MonoAssetManagerAndBuiltinResourceApplication
+{
+	using device_base_t = application_templates::BasicMultiQueueApplication;
+	using asset_base_t = application_templates::MonoAssetManagerAndBuiltinResourceApplication;
+
+public:
+	ArithmeticUnitTestApp(const path& _localInputCWD, const path& _localOutputCWD, const path& _sharedInputCWD, const path& _sharedOutputCWD) :
+		system::IApplicationFramework(_localInputCWD, _localOutputCWD, _sharedInputCWD, _sharedOutputCWD) {}
+
+	bool onAppInitialized(smart_refctd_ptr<ISystem>&& system) override
+	{
+		if (!device_base_t::onAppInitialized(std::move(system)))
+			return false;
+		if (!asset_base_t::onAppInitialized(std::move(system)))
+			return false;
+
+		transferDownQueue = getTransferDownQueue();
+		computeQueue = getComputeQueue();
+
+		// TODO: get the element count from argv
+		const uint32_t elementCount = Output<>::ScanElementCount;
+		// populate our random data buffer on the CPU and create a GPU copy
+		inputData = new uint32_t[elementCount];
+		smart_refctd_ptr<IGPUBuffer> gpuinputDataBuffer;
+		{
+			std::mt19937 randGenerator(0xdeadbeefu);
+			for (uint32_t i = 0u; i < elementCount; i++)
+				inputData[i] = randGenerator(); // TODO: change to using xoroshiro, then we can skip having the input buffer at all
+
+			IGPUBuffer::SCreationParams inputDataBufferCreationParams = {};
+			inputDataBufferCreationParams.size = sizeof(Output<>::data[0]) * elementCount;
+			inputDataBufferCreationParams.usage = IGPUBuffer::EUF_STORAGE_BUFFER_BIT | IGPUBuffer::EUF_TRANSFER_DST_BIT;
+			m_utils->createFilledDeviceLocalBufferOnDedMem(
+				SIntendedSubmitInfo{.queue=getTransferUpQueue()},
+				std::move(inputDataBufferCreationParams),
+				inputData
+			).move_into(gpuinputDataBuffer);
+		}
+
+		// create 8 buffers for 8 operations
+		for (auto i=0u; i<OutputBufferCount; i++)
+		{
+			IGPUBuffer::SCreationParams params = {};
+			params.size = sizeof(uint32_t) + gpuinputDataBuffer->getSize();
+			params.usage = bitflag(IGPUBuffer::EUF_STORAGE_BUFFER_BIT) | IGPUBuffer::EUF_TRANSFER_SRC_BIT;
+
+			outputBuffers[i] = m_device->createBuffer(std::move(params));
+			auto mreq = outputBuffers[i]->getMemoryReqs();
+			mreq.memoryTypeBits &= m_physicalDevice->getDeviceLocalMemoryTypeBits();
+			assert(mreq.memoryTypeBits);
+
+			auto bufferMem = m_device->allocate(mreq, outputBuffers[i].get());
+			assert(bufferMem.isValid());
+		}
+
+		// create Descriptor Set and Pipeline Layout
+		{
+			// create Descriptor Set Layout
+			smart_refctd_ptr<IGPUDescriptorSetLayout> dsLayout;
+			{
+				IGPUDescriptorSetLayout::SBinding binding[2];
+				for (uint32_t i = 0u; i < 2; i++)
+					binding[i] = {{},i,IDescriptor::E_TYPE::ET_STORAGE_BUFFER,IGPUDescriptorSetLayout::SBinding::E_CREATE_FLAGS::ECF_NONE,IShader::E_SHADER_STAGE::ESS_COMPUTE,1u,nullptr };
+				binding[1].count = OutputBufferCount;
+				dsLayout = m_device->createDescriptorSetLayout(binding);
+			}
+
+			// set and transient pool
+			auto descPool = m_device->createDescriptorPoolForDSLayouts(IDescriptorPool::ECF_NONE,{&dsLayout.get(),1});
+			descriptorSet = descPool->createDescriptorSet(smart_refctd_ptr(dsLayout));
+			{
+				IGPUDescriptorSet::SDescriptorInfo infos[1+OutputBufferCount];
+				infos[0].desc = gpuinputDataBuffer;
+				infos[0].info.buffer = { 0u,gpuinputDataBuffer->getSize() };
+				for (uint32_t i = 1u; i <= OutputBufferCount; i++)
+				{
+					auto buff = outputBuffers[i - 1];
+					infos[i].info.buffer = { 0u,buff->getSize() };
+					infos[i].desc = std::move(buff); // save an atomic in the refcount
+
+				}
+
+				IGPUDescriptorSet::SWriteDescriptorSet writes[2];
+				for (uint32_t i=0u; i<2; i++)
+					writes[i] = {descriptorSet.get(),i,0u,1u,infos+i};
+				writes[1].count = OutputBufferCount;
+
+				m_device->updateDescriptorSets(2, writes, 0u, nullptr);
+			}
+
+			pipelineLayout = m_device->createPipelineLayout({},std::move(dsLayout));
+		}
+
+		const auto spirv_isa_cache_path = localOutputCWD/"spirv_isa_cache.bin";
+		// enclose to make sure file goes out of scope and we can reopen it
+		{
+			smart_refctd_ptr<const IFile> spirv_isa_cache_input;
+			// try to load SPIR-V to ISA cache
+			{
+				ISystem::future_t<smart_refctd_ptr<IFile>> fileCreate;
+				m_system->createFile(fileCreate,spirv_isa_cache_path,IFile::ECF_READ|IFile::ECF_MAPPABLE|IFile::ECF_COHERENT);
+				if (auto lock=fileCreate.acquire())
+					spirv_isa_cache_input = *lock;
+			}
+			// create the cache
+			{
+				std::span<const uint8_t> spirv_isa_cache_data = {};
+				if (spirv_isa_cache_input)
+					spirv_isa_cache_data = {reinterpret_cast<const uint8_t*>(spirv_isa_cache_input->getMappedPointer()),spirv_isa_cache_input->getSize()};
+				else
+					m_logger->log("Failed to load SPIR-V 2 ISA cache!",ILogger::ELL_PERFORMANCE);
+				// Normally we'd deserialize a `ICPUPipelineCache` properly and pass that instead
+				m_spirv_isa_cache = m_device->createPipelineCache(spirv_isa_cache_data);
+			}
+		}
+		{
+			// TODO: rename `deleteDirectory` to just `delete`? and a `IFile::setSize()` ?
+			m_system->deleteDirectory(spirv_isa_cache_path);
+			ISystem::future_t<smart_refctd_ptr<IFile>> fileCreate;
+			m_system->createFile(fileCreate,spirv_isa_cache_path,IFile::ECF_WRITE);
+			// I can be relatively sure I'll succeed to acquire the future, the pointer to created file might be null though.
+			m_spirv_isa_cache_output=*fileCreate.acquire();
+			if (!m_spirv_isa_cache_output)
+				logFail("Failed to Create SPIR-V to ISA cache file.");
+		}
+
+		// load shader source from file
+		auto getShaderSource = [&](const char* filePath) -> auto
+		{
+			IAssetLoader::SAssetLoadParams lparams = {};
+			lparams.logger = m_logger.get();
+			lparams.workingDirectory = "";
+			auto bundle = m_assetMgr->getAsset(filePath, lparams);
+			if (bundle.getContents().empty() || bundle.getAssetType()!=IAsset::ET_SHADER)
+			{
+				m_logger->log("Shader %s not found!", ILogger::ELL_ERROR, filePath);
+				exit(-1);
+			}
+			auto firstAssetInBundle = bundle.getContents()[0];
+			return smart_refctd_ptr_static_cast<ICPUShader>(firstAssetInBundle);
+		};
+
+		auto subgroupTestSource = getShaderSource("app_resources/testSubgroup.comp.hlsl");
+		auto workgroupTestSource = getShaderSource("app_resources/testWorkgroup.comp.hlsl");
+		// now create or retrieve final resources to run our tests
+		sema = m_device->createSemaphore(timelineValue);
+		resultsBuffer = ICPUBuffer::create({ outputBuffers[0]->getSize() });
+		{
+			smart_refctd_ptr<nbl::video::IGPUCommandPool> cmdpool = m_device->createCommandPool(computeQueue->getFamilyIndex(),IGPUCommandPool::CREATE_FLAGS::RESET_COMMAND_BUFFER_BIT);
+			if (!cmdpool->createCommandBuffers(IGPUCommandPool::BUFFER_LEVEL::PRIMARY,{&cmdbuf,1}))
+			{
+				logFail("Failed to create Command Buffers!\n");
+				return false;
+			}
+		}
+
+		const auto MaxWorkgroupSize = m_physicalDevice->getLimits().maxComputeWorkGroupInvocations;
+		const auto MinSubgroupSize = m_physicalDevice->getLimits().minSubgroupSize;
+		const auto MaxSubgroupSize = m_physicalDevice->getLimits().maxSubgroupSize;
+		for (auto subgroupSize=MinSubgroupSize; subgroupSize <= MaxSubgroupSize; subgroupSize *= 2u)
+		{
+			const uint8_t subgroupSizeLog2 = hlsl::findMSB(subgroupSize);
+			for (uint32_t workgroupSize = subgroupSize; workgroupSize <= MaxWorkgroupSize; workgroupSize += subgroupSize)
+			{
+				// make sure renderdoc captures everything for debugging
+				m_api->startCapture();
+				m_logger->log("Testing Workgroup Size %u with Subgroup Size %u", ILogger::ELL_INFO, workgroupSize, subgroupSize);
+
+				bool passed = true;
+				// TODO async the testing
+				passed = runTest<emulatedReduction, false>(subgroupTestSource, elementCount, subgroupSizeLog2, workgroupSize) && passed;
+				logTestOutcome(passed, workgroupSize);
+				passed = runTest<emulatedScanInclusive, false>(subgroupTestSource, elementCount, subgroupSizeLog2, workgroupSize) && passed;
+				logTestOutcome(passed, workgroupSize);
+				passed = runTest<emulatedScanExclusive, false>(subgroupTestSource, elementCount, subgroupSizeLog2, workgroupSize) && passed;
+				logTestOutcome(passed, workgroupSize);
+				for (uint32_t itemsPerWG = workgroupSize; itemsPerWG > workgroupSize - subgroupSize; itemsPerWG--)
+				{
+					m_logger->log("Testing Item Count %u", ILogger::ELL_INFO, itemsPerWG);
+					passed = runTest<emulatedReduction, true>(workgroupTestSource, elementCount, subgroupSizeLog2, workgroupSize, itemsPerWG) && passed;
+					logTestOutcome(passed, itemsPerWG);
+					passed = runTest<emulatedScanInclusive, true>(workgroupTestSource, elementCount, subgroupSizeLog2, workgroupSize, itemsPerWG) && passed;
+					logTestOutcome(passed, itemsPerWG);
+					passed = runTest<emulatedScanExclusive, true>(workgroupTestSource, elementCount, subgroupSizeLog2, workgroupSize, itemsPerWG) && passed;
+					logTestOutcome(passed, itemsPerWG);
+				}
+				m_api->endCapture();
+
+				// save cache every now and then	
+				{
+					auto cpu = m_spirv_isa_cache->convertToCPUCache();
+					// Normally we'd beautifully JSON serialize the thing, allow multiple devices & drivers + metadata
+					auto bin = cpu->getEntries().begin()->second.bin;
+					IFile::success_t success;
+					m_spirv_isa_cache_output->write(success,bin->data(),0ull,bin->size());
+					if (!success)
+						logFail("Could not write Create SPIR-V to ISA cache to disk!");
+				}
+			}
+		}
+
+		return true;
+	}
+
+	virtual bool onAppTerminated() override
+	{
+		m_logger->log("==========Result==========", ILogger::ELL_INFO);
+		m_logger->log("Fail Count: %u", ILogger::ELL_INFO, totalFailCount);
+		delete[] inputData;
+		return true;
+	}
+
+	// the unit test is carried out on init
+	void workLoopBody() override {}
+
+	//
+	bool keepRunning() override { return false; }
+
+private:
+	void logTestOutcome(bool passed, uint32_t workgroupSize)
+	{
+		if (passed)
+			m_logger->log("Passed test #%u", ILogger::ELL_INFO, workgroupSize);
+		else
+		{
+			totalFailCount++;
+			m_logger->log("Failed test #%u", ILogger::ELL_ERROR, workgroupSize);
+		}
+	}
+
+	// create pipeline (specialized every test) [TODO: turn into a future/async]
+	smart_refctd_ptr<IGPUComputePipeline> createPipeline(const ICPUShader* overridenUnspecialized, const uint8_t subgroupSizeLog2)
+	{
+		auto shader = m_device->createShader(overridenUnspecialized);
+		IGPUComputePipeline::SCreationParams params = {};
+		params.layout = pipelineLayout.get();
+		params.shader = {
+			.entryPoint = "main",
+			.shader = shader.get(),
+			.entries = nullptr,
+			.requiredSubgroupSize = static_cast<IGPUShader::SSpecInfo::SUBGROUP_SIZE>(subgroupSizeLog2),
+			.requireFullSubgroups = true
+		};
+		core::smart_refctd_ptr<IGPUComputePipeline> pipeline;
+		if (!m_device->createComputePipelines(m_spirv_isa_cache.get(),{&params,1},&pipeline))
+			return nullptr;
+		return pipeline;
+	}
+
+	/*template<template<class> class Arithmetic, bool WorkgroupTest>
+	bool runTest(const smart_refctd_ptr<const ICPUShader>& source, const uint32_t elementCount, const uint32_t workgroupSize, uint32_t itemsPerWG = ~0u)
+	{
+		return true;
+	}*/
+
+	template<template<class> class Arithmetic, bool WorkgroupTest>
+	bool runTest(const smart_refctd_ptr<const ICPUShader>& source, const uint32_t elementCount, const uint8_t subgroupSizeLog2, const uint32_t workgroupSize, uint32_t itemsPerWG = ~0u)
+	{
+		std::string arith_name = Arithmetic<bit_xor<float>>::name;
+
+		smart_refctd_ptr<ICPUShader> overridenUnspecialized;
+		if constexpr (WorkgroupTest)
+		{
+			overridenUnspecialized = CHLSLCompiler::createOverridenCopy(
+				source.get(), "#define OPERATION %s\n#define WORKGROUP_SIZE %d\n#define ITEMS_PER_WG %d\n",
+				(("workgroup::") + arith_name).c_str(), workgroupSize, itemsPerWG
+			);
+		}
+		else
+		{
+			itemsPerWG = workgroupSize;
+			overridenUnspecialized = CHLSLCompiler::createOverridenCopy(
+				source.get(), "#define OPERATION %s\n#define WORKGROUP_SIZE %d\n",
+				(("subgroup::") + arith_name).c_str(), workgroupSize
+			);
+		}
+		auto pipeline = createPipeline(overridenUnspecialized.get(),subgroupSizeLog2);
+
+		// TODO: overlap dispatches with memory readbacks (requires multiple copies of `buffers`)
+		const uint32_t workgroupCount = elementCount / itemsPerWG;
+		cmdbuf->begin(IGPUCommandBuffer::USAGE::NONE);
+		cmdbuf->bindComputePipeline(pipeline.get());
+		cmdbuf->bindDescriptorSets(EPBP_COMPUTE, pipeline->getLayout(), 0u, 1u, &descriptorSet.get());
+		cmdbuf->dispatch(workgroupCount, 1, 1);
+		{
+			IGPUCommandBuffer::SPipelineBarrierDependencyInfo::buffer_barrier_t memoryBarrier[OutputBufferCount];
+			for (auto i=0u; i<OutputBufferCount; i++)
+			{
+				memoryBarrier[i] = {
+					.barrier = {
+						.dep = {
+							.srcStageMask = PIPELINE_STAGE_FLAGS::COMPUTE_SHADER_BIT,
+							.srcAccessMask = ACCESS_FLAGS::SHADER_WRITE_BITS,
+							// in theory we don't need the HOST BITS cause we block on a semaphore but might as well add them
+							.dstStageMask = PIPELINE_STAGE_FLAGS::COMPUTE_SHADER_BIT|PIPELINE_STAGE_FLAGS::HOST_BIT,
+							.dstAccessMask = ACCESS_FLAGS::SHADER_WRITE_BITS|ACCESS_FLAGS::HOST_READ_BIT
+						}
+					},
+					.range = {0ull,outputBuffers[i]->getSize(),outputBuffers[i]}
+				};
+			}
+			IGPUCommandBuffer::SPipelineBarrierDependencyInfo info = {.memBarriers={},.bufBarriers=memoryBarrier};
+			cmdbuf->pipelineBarrier(asset::E_DEPENDENCY_FLAGS::EDF_NONE,info);
+		}
+		cmdbuf->end();
+
+		const IQueue::SSubmitInfo::SSemaphoreInfo signal[1] = {{.semaphore=sema.get(),.value=++timelineValue}};
+		const IQueue::SSubmitInfo::SCommandBufferInfo cmdbufs[1] = {{.cmdbuf=cmdbuf.get()}};
+		const IQueue::SSubmitInfo submits[1] = {{.commandBuffers=cmdbufs,.signalSemaphores=signal}};
+		computeQueue->submit(submits);
+		const ISemaphore::SWaitInfo wait[1] = {{.semaphore=sema.get(),.value=timelineValue}};
+		m_device->blockForSemaphores(wait);
+
+		// check results
+		bool passed = validateResults<Arithmetic, bit_and<uint32_t>, WorkgroupTest>(itemsPerWG, workgroupCount);
+		passed = validateResults<Arithmetic, bit_xor<uint32_t>, WorkgroupTest>(itemsPerWG, workgroupCount) && passed;
+		passed = validateResults<Arithmetic, bit_or<uint32_t>, WorkgroupTest>(itemsPerWG, workgroupCount) && passed;
+		passed = validateResults<Arithmetic, plus<uint32_t>, WorkgroupTest>(itemsPerWG, workgroupCount) && passed;
+		passed = validateResults<Arithmetic, multiplies<uint32_t>, WorkgroupTest>(itemsPerWG, workgroupCount) && passed;
+		passed = validateResults<Arithmetic, minimum<uint32_t>, WorkgroupTest>(itemsPerWG, workgroupCount) && passed;
+		passed = validateResults<Arithmetic, maximum<uint32_t>, WorkgroupTest>(itemsPerWG, workgroupCount) && passed;
+		if constexpr (WorkgroupTest)
+			passed = validateResults<Arithmetic, ballot<uint32_t>, WorkgroupTest>(itemsPerWG, workgroupCount) && passed;
+
+		return passed;
+	}
+
+	//returns true if result matches
+	template<template<class> class Arithmetic, class Binop, bool WorkgroupTest>
+	bool validateResults(const uint32_t itemsPerWG, const uint32_t workgroupCount)
+	{
+		bool success = true;
+
+		// download data
+		const SBufferRange<IGPUBuffer> bufferRange = {0u, resultsBuffer->getSize(), outputBuffers[Binop::BindingIndex]};
+		m_utils->downloadBufferRangeViaStagingBufferAutoSubmit(SIntendedSubmitInfo{.queue=transferDownQueue},bufferRange,resultsBuffer->getPointer());
+
+		using type_t = typename Binop::type_t;
+		const auto dataFromBuffer = reinterpret_cast<const uint32_t*>(resultsBuffer->getPointer());
+		const auto subgroupSize = dataFromBuffer[0];
+		if (subgroupSize<nbl::hlsl::subgroup::MinSubgroupSize || subgroupSize>nbl::hlsl::subgroup::MaxSubgroupSize)
+		{
+			m_logger->log("Unexpected Subgroup Size %u", ILogger::ELL_ERROR, subgroupSize);
+			return false;
+		}
+
+		const auto testData = reinterpret_cast<const type_t*>(dataFromBuffer + 1);
+		// TODO: parallel for (the temporary values need to be threadlocal or what?)
+		// now check if the data obtained has valid values
+		type_t* tmp = new type_t[itemsPerWG];
+		type_t* ballotInput = new type_t[itemsPerWG];
+		for (uint32_t workgroupID = 0u; success && workgroupID < workgroupCount; workgroupID++)
+		{
+			const auto workgroupOffset = workgroupID * itemsPerWG;
+
+			if constexpr (WorkgroupTest)
+			{
+				if constexpr (std::is_same_v<ballot<type_t>, Binop>)
+				{
+					for (auto i = 0u; i < itemsPerWG; i++)
+						ballotInput[i] = inputData[i + workgroupOffset] & 0x1u;
+					Arithmetic<Binop>::impl(tmp, ballotInput, itemsPerWG);
+				}
+				else
+					Arithmetic<Binop>::impl(tmp, inputData + workgroupOffset, itemsPerWG);
+			}
+			else
+			{
+				for (uint32_t pseudoSubgroupID = 0u; pseudoSubgroupID < itemsPerWG; pseudoSubgroupID += subgroupSize)
+					Arithmetic<Binop>::impl(tmp + pseudoSubgroupID, inputData + workgroupOffset + pseudoSubgroupID, subgroupSize);
+			}
+
+			for (uint32_t localInvocationIndex = 0u; localInvocationIndex < itemsPerWG; localInvocationIndex++)
+			{
+				const auto globalInvocationIndex = workgroupOffset + localInvocationIndex;
+				const auto cpuVal = tmp[localInvocationIndex];
+				const auto gpuVal = testData[globalInvocationIndex];
+				if (cpuVal != gpuVal)
+				{
+					m_logger->log(
+						"Failed test #%d  (%s)  (%s) Expected %u got %u for workgroup %d and localinvoc %d",
+						ILogger::ELL_ERROR, itemsPerWG, WorkgroupTest ? "workgroup" : "subgroup", Binop::name,
+						cpuVal, gpuVal, workgroupID, localInvocationIndex
+					);
+					success = false;
+					break;
+				}
+			}
+		}
+		delete[] ballotInput;
+		delete[] tmp;
+
+		return success;
+	}
+
+	IQueue* transferDownQueue;
+	IQueue* computeQueue;
+	smart_refctd_ptr<IGPUPipelineCache> m_spirv_isa_cache;
+	smart_refctd_ptr<IFile> m_spirv_isa_cache_output;
+
+	uint32_t* inputData = nullptr;
+	constexpr static inline uint32_t OutputBufferCount = 8u;
+	smart_refctd_ptr<IGPUBuffer> outputBuffers[OutputBufferCount];
+	smart_refctd_ptr<IGPUDescriptorSet> descriptorSet;
+	smart_refctd_ptr<IGPUPipelineLayout> pipelineLayout;
+
+	smart_refctd_ptr<ISemaphore> sema;
+	uint64_t timelineValue = 0;
+	smart_refctd_ptr<IGPUCommandBuffer> cmdbuf;
+	smart_refctd_ptr<ICPUBuffer> resultsBuffer;
+
+	uint32_t totalFailCount = 0;
+};
+
+NBL_MAIN_FUNC(ArithmeticUnitTestApp)
\ No newline at end of file
diff --git a/74a_Workgroup2ScanTest/pipeline.groovy b/74a_Workgroup2ScanTest/pipeline.groovy
new file mode 100644
index 000000000..7ea9947e0
--- /dev/null
+++ b/74a_Workgroup2ScanTest/pipeline.groovy
@@ -0,0 +1,50 @@
+import org.DevshGraphicsProgramming.Agent
+import org.DevshGraphicsProgramming.BuilderInfo
+import org.DevshGraphicsProgramming.IBuilder
+
+class CArithemticUnitTestBuilder extends IBuilder
+{
+	public CArithemticUnitTestBuilder(Agent _agent, _info)
+	{
+		super(_agent, _info)
+	}
+	
+	@Override
+	public boolean prepare(Map axisMapping)
+	{
+		return true
+	}
+	
+	@Override
+  	public boolean build(Map axisMapping)
+	{
+		IBuilder.CONFIGURATION config = axisMapping.get("CONFIGURATION")
+		IBuilder.BUILD_TYPE buildType = axisMapping.get("BUILD_TYPE")
+		
+		def nameOfBuildDirectory = getNameOfBuildDirectory(buildType)
+		def nameOfConfig = getNameOfConfig(config)
+		
+		agent.execute("cmake --build ${info.rootProjectPath}/${nameOfBuildDirectory}/${info.targetProjectPathRelativeToRoot} --target ${info.targetBaseName} --config ${nameOfConfig} -j12 -v")
+		
+		return true
+	}
+	
+	@Override
+  	public boolean test(Map axisMapping)
+	{
+		return true
+	}
+	
+	@Override
+	public boolean install(Map axisMapping)
+	{
+		return true
+	}
+}
+
+def create(Agent _agent, _info)
+{
+	return new CArithemticUnitTestBuilder(_agent, _info)
+}
+
+return this
\ No newline at end of file
diff --git a/CMakeLists.txt b/CMakeLists.txt
index 59fe4ea46..5d7369560 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -92,6 +92,7 @@ if(NBL_BUILD_EXAMPLES)
 	add_subdirectory(71_RayTracingPipeline EXCLUDE_FROM_ALL)
 
 	add_subdirectory(73_ArithmeticBench EXCLUDE_FROM_ALL)
+	add_subdirectory(74a_Workgroup2ScanTest EXCLUDE_FROM_ALL)
 
 	NBL_HOOK_COMMON_API("${NBL_COMMON_API_TARGETS}")
 endif()

From 750b3d2094484bca2e0c92f03277e0861b1adb77 Mon Sep 17 00:00:00 2001
From: keptsecret <sorchon@gmail.com>
Date: Mon, 28 Apr 2025 17:03:04 +0700
Subject: [PATCH 207/529] working? test for workgroup2 reduce

---
 .../app_resources/shaderCommon.hlsl           |  49 ++--
 .../app_resources/testWorkgroup.comp.hlsl     | 255 +++++++++++++-----
 74a_Workgroup2ScanTest/main.cpp               | 192 +++++++------
 3 files changed, 306 insertions(+), 190 deletions(-)

diff --git a/74a_Workgroup2ScanTest/app_resources/shaderCommon.hlsl b/74a_Workgroup2ScanTest/app_resources/shaderCommon.hlsl
index 13ee8d21e..79bf74e71 100644
--- a/74a_Workgroup2ScanTest/app_resources/shaderCommon.hlsl
+++ b/74a_Workgroup2ScanTest/app_resources/shaderCommon.hlsl
@@ -2,16 +2,22 @@
 
 #include "nbl/builtin/hlsl/glsl_compat/core.hlsl"
 #include "nbl/builtin/hlsl/subgroup/basic.hlsl"
-#include "nbl/builtin/hlsl/subgroup/arithmetic_portability.hlsl"
+#include "nbl/builtin/hlsl/subgroup2/arithmetic_portability.hlsl"
 
 #include "nbl/builtin/hlsl/jit/device_capabilities.hlsl"
 
 // https://github.com/microsoft/DirectXShaderCompiler/issues/6144
 uint32_t3 nbl::hlsl::glsl::gl_WorkGroupSize() {return uint32_t3(WORKGROUP_SIZE,1,1);}
 
+#ifndef ITEMS_PER_INVOCATION
+#error "Define ITEMS_PER_INVOCATION!"
+#endif
+
+typedef vector<uint32_t, ITEMS_PER_INVOCATION> type_t;
+
 // unfortunately DXC chokes on descriptors as static members
 // https://github.com/microsoft/DirectXShaderCompiler/issues/5940
-[[vk::binding(0, 0)]] StructuredBuffer<uint32_t> inputValue;
+[[vk::binding(0, 0)]] StructuredBuffer<type_t> inputValue;
 [[vk::binding(1, 0)]] RWByteAddressBuffer output[8];
 
 // because subgroups don't match `gl_LocalInvocationIndex` snake curve addressing, we also can't load inputs that way
@@ -19,37 +25,36 @@ uint32_t globalIndex();
 // since we test ITEMS_PER_WG<WorkgroupSize we need this so workgroups don't overwrite each other's outputs
 bool canStore();
 
-//typedef decltype(inputValue[0]) type_t;
-typedef uint32_t type_t;
-
-
 #ifndef OPERATION
 #error "Define OPERATION!"
 #endif
+#ifndef SUBGROUP_SIZE_LOG2
+#error "Define SUBGROUP_SIZE_LOG2!"
+#endif
 template<template<class> class binop>
 static void subtest(NBL_CONST_REF_ARG(type_t) sourceVal)
 {
-	if (globalIndex()==0u)
-		output[binop<type_t>::BindingIndex].template Store<uint32_t>(0,nbl::hlsl::glsl::gl_SubgroupSize());
-		
-	operation_t<typename binop<type_t>::base_t,nbl::hlsl::jit::device_capabilities> func;
-	if (canStore())
-		output[binop<type_t>::BindingIndex].template Store<type_t>(sizeof(uint32_t)+sizeof(type_t)*globalIndex(),func(sourceVal));
+    if (globalIndex()==0u)
+        output[binop<type_t>::BindingIndex].template Store<uint32_t>(0,nbl::hlsl::glsl::gl_SubgroupSize());
+        
+    operation_t<typename binop<type_t>::base_t,nbl::hlsl::jit::device_capabilities> func;
+    if (canStore())
+        output[binop<type_t>::BindingIndex].template Store<type_t>(sizeof(uint32_t)+sizeof(type_t)*globalIndex(),func(sourceVal));
 }
 
 
 type_t test()
 {
-	const type_t sourceVal = inputValue[globalIndex()];
-
-	subtest<bit_and>(sourceVal);
-	subtest<bit_xor>(sourceVal);
-	subtest<bit_or>(sourceVal);
-	subtest<plus>(sourceVal);
-	subtest<multiplies>(sourceVal);
-	subtest<minimum>(sourceVal);
-	subtest<maximum>(sourceVal);
-	return sourceVal;
+    const type_t sourceVal = inputValue[globalIndex()];
+
+    subtest<bit_and>(sourceVal);
+    subtest<bit_xor>(sourceVal);
+    subtest<bit_or>(sourceVal);
+    subtest<plus>(sourceVal);
+    subtest<multiplies>(sourceVal);
+    subtest<minimum>(sourceVal);
+    subtest<maximum>(sourceVal);
+    return sourceVal;
 }
 
 #include "nbl/builtin/hlsl/workgroup/basic.hlsl"
\ No newline at end of file
diff --git a/74a_Workgroup2ScanTest/app_resources/testWorkgroup.comp.hlsl b/74a_Workgroup2ScanTest/app_resources/testWorkgroup.comp.hlsl
index 9bafae47f..315550da0 100644
--- a/74a_Workgroup2ScanTest/app_resources/testWorkgroup.comp.hlsl
+++ b/74a_Workgroup2ScanTest/app_resources/testWorkgroup.comp.hlsl
@@ -3,105 +3,222 @@
 
 #include "nbl/builtin/hlsl/workgroup/scratch_size.hlsl"
 
-static const uint32_t ArithmeticSz = nbl::hlsl::workgroup::scratch_size_arithmetic<ITEMS_PER_WG>::value;
-static const uint32_t BallotSz = nbl::hlsl::workgroup::scratch_size_ballot<ITEMS_PER_WG>::value;
-static const uint32_t ScratchSz = ArithmeticSz+BallotSz;
+// static const uint32_t ArithmeticSz = nbl::hlsl::workgroup::scratch_size_arithmetic<ITEMS_PER_WG>::value;
+// static const uint32_t BallotSz = nbl::hlsl::workgroup::scratch_size_ballot<ITEMS_PER_WG>::value;
+// static const uint32_t ScratchSz = ArithmeticSz+BallotSz;
 
 // TODO: Can we make it a static variable in the ScratchProxy struct?
-groupshared uint32_t scratch[ScratchSz];
+// groupshared uint32_t ballotScratch[ScratchSz];  // TODO probably remove, not balloting
 
 
-#include "nbl/builtin/hlsl/workgroup/arithmetic.hlsl"
+#include "nbl/builtin/hlsl/workgroup2/arithmetic.hlsl"
 
+#include "nbl/builtin/hlsl/workgroup/broadcast.hlsl"
+
+#include "nbl/builtin/hlsl/glsl_compat/core.hlsl"
+#include "nbl/builtin/hlsl/subgroup/basic.hlsl"
+#include "nbl/builtin/hlsl/subgroup2/arithmetic_portability.hlsl"
+
+#include "nbl/builtin/hlsl/jit/device_capabilities.hlsl"
+
+#include "common.hlsl"
+
+// https://github.com/microsoft/DirectXShaderCompiler/issues/6144
+uint32_t3 nbl::hlsl::glsl::gl_WorkGroupSize() {return uint32_t3(WORKGROUP_SIZE,1,1);}
+
+#define ITEMS_PER_INVOCATION 1
+
+#ifndef ITEMS_PER_INVOCATION
+#error "Define ITEMS_PER_INVOCATION!"
+#endif
+
+typedef vector<uint32_t, ITEMS_PER_INVOCATION> type_t;
 
-template<uint16_t offset>
+// unfortunately DXC chokes on descriptors as static members
+// https://github.com/microsoft/DirectXShaderCompiler/issues/5940
+[[vk::binding(0, 0)]] StructuredBuffer<type_t> inputValue;
+[[vk::binding(1, 0)]] RWByteAddressBuffer output[8];
+
+// because subgroups don't match `gl_LocalInvocationIndex` snake curve addressing, we also can't load inputs that way
+uint32_t globalIndex();
+// since we test ITEMS_PER_WG<WorkgroupSize we need this so workgroups don't overwrite each other's outputs
+bool canStore();
+
+#define SUBGROUP_SIZE_LOG2 5
+
+#ifndef OPERATION
+#error "Define OPERATION!"
+#endif
+#ifndef SUBGROUP_SIZE_LOG2
+#error "Define SUBGROUP_SIZE_LOG2!"
+#endif
+
+using config_t = nbl::hlsl::workgroup2::Configuration<WORKGROUP_SIZE, SUBGROUP_SIZE_LOG2, ITEMS_PER_INVOCATION>;
+
+groupshared vector<uint32_t, config_t::ItemsPerInvocation_1> scratch[config_t::SubgroupSize];  // final (level 1) scan needs to fit in one subgroup exactly
+
+template<class Config>
 struct ScratchProxy
 {
-	void get(const uint32_t ix, NBL_REF_ARG(uint32_t) value)
-	{
-		value = scratch[ix+offset];
-	}
-	void set(const uint32_t ix, const uint32_t value)
-	{
-		scratch[ix+offset] = value;
-	}
-
-	uint32_t atomicOr(const uint32_t ix, const uint32_t value)
-	{
-		return nbl::hlsl::glsl::atomicOr(scratch[ix],value);
-	}
-
-	void workgroupExecutionAndMemoryBarrier()
-	{
-		nbl::hlsl::glsl::barrier();
-		//nbl::hlsl::glsl::memoryBarrierShared(); implied by the above
-	}
+    using stype_t = vector<uint32_t, Config::ItemsPerInvocation_1>;
+
+    stype_t get(const uint32_t ix)
+    {
+        return scratch[ix];
+    }
+    void set(const uint32_t ix, const stype_t value)
+    {
+        scratch[ix] = value;
+    }
+
+    stype_t atomicOr(const uint32_t ix, const stype_t value)
+    {
+        return nbl::hlsl::glsl::atomicOr(scratch[ix],value);
+    }
+
+    void workgroupExecutionAndMemoryBarrier()
+    {
+        nbl::hlsl::glsl::barrier();
+        //nbl::hlsl::glsl::memoryBarrierShared(); implied by the above
+    }
 };
 
-static ScratchProxy<0> arithmeticAccessor;
-
-
-#include "nbl/builtin/hlsl/workgroup/broadcast.hlsl"
+template<class Config, class Binop>
+struct DataProxy
+{
+    using dtype_t = vector<uint32_t, Config::ItemsPerInvocation_0>;
+    static_assert(nbl::hlsl::is_same_v<dtype_t, type_t>);
+
+    dtype_t get(const uint32_t ix)
+    {
+        return inputValue[ix];
+    }
+    void set(const uint32_t ix, const dtype_t value)
+    {
+        // inputValue[ix] = value;
+        output[Binop::BindingIndex].template Store<type_t>(sizeof(uint32_t) + sizeof(type_t) * globalIndex(), value);
+        output[Binop::BindingIndex].template Store<type_t>(sizeof(uint32_t) + sizeof(type_t) * ix, value);
+    }
+
+    void workgroupExecutionAndMemoryBarrier()
+    {
+        nbl::hlsl::glsl::barrier();
+        //nbl::hlsl::glsl::memoryBarrierShared(); implied by the above
+    }
+};
 
+static ScratchProxy<config_t> arithmeticAccessor;
 
 template<class Binop, class device_capabilities>
 struct operation_t
 {
-	using type_t = typename Binop::type_t;
-
-	type_t operator()(type_t value)
-	{
-		type_t retval = nbl::hlsl::OPERATION<Binop,ITEMS_PER_WG,device_capabilities>::template __call<ScratchProxy<0> >(value,arithmeticAccessor);
-		// we barrier before because we alias the accessors for Binop
-		arithmeticAccessor.workgroupExecutionAndMemoryBarrier();
-		return retval;
-	}
+    using binop_base_t = typename Binop::base_t;
+    using otype_t = typename Binop::type_t;
+
+    void operator()()
+    {
+        DataProxy<config_t,Binop> dataAccessor;
+        nbl::hlsl::OPERATION<config_t,binop_base_t,device_capabilities>::template __call<DataProxy<config_t,Binop>, ScratchProxy<config_t> >(dataAccessor,arithmeticAccessor);
+        // we barrier before because we alias the accessors for Binop
+        arithmeticAccessor.workgroupExecutionAndMemoryBarrier();
+        // return retval;
+    }
 };
 
 
-#include "shaderCommon.hlsl"
+template<template<class> class binop, typename T, uint32_t N>
+static void subtest(NBL_CONST_REF_ARG(type_t) sourceVal)
+{
+    if (globalIndex()==0u)
+        output[binop<T>::BindingIndex].template Store<uint32_t>(0,nbl::hlsl::glsl::gl_SubgroupSize());
+        
+    operation_t<binop<T>,nbl::hlsl::jit::device_capabilities> func;
+    // if (canStore())
+        // output[binop<type_t>::BindingIndex].template Store<type_t>(sizeof(uint32_t)+sizeof(type_t)*globalIndex(),func());
+        func(); // store is done with data accessor now
+}
+
+
+type_t test()
+{
+    const type_t sourceVal = inputValue[globalIndex()];
+
+    subtest<bit_and, uint32_t, ITEMS_PER_INVOCATION>(sourceVal);
+    subtest<bit_xor, uint32_t, ITEMS_PER_INVOCATION>(sourceVal);
+    subtest<bit_or, uint32_t, ITEMS_PER_INVOCATION>(sourceVal);
+    subtest<plus, uint32_t, ITEMS_PER_INVOCATION>(sourceVal);
+    subtest<multiplies, uint32_t, ITEMS_PER_INVOCATION>(sourceVal);
+    subtest<minimum, uint32_t, ITEMS_PER_INVOCATION>(sourceVal);
+    subtest<maximum, uint32_t, ITEMS_PER_INVOCATION>(sourceVal);
+    return sourceVal;
+}
+
+
+// template<uint16_t offset>
+// struct BallotProxy
+// {
+//     void get(const uint32_t ix, NBL_REF_ARG(uint32_t) value)
+//     {
+//         value = ballotScratch[ix+offset];
+//     }
+//     void set(const uint32_t ix, const uint32_t value)
+//     {
+//         ballotScratch[ix+offset] = value;
+//     }
+
+//     uint32_t atomicOr(const uint32_t ix, const uint32_t value)
+//     {
+//         return nbl::hlsl::glsl::atomicOr(ballotScratch[ix],value);
+//     }
+
+//     void workgroupExecutionAndMemoryBarrier()
+//     {
+//         nbl::hlsl::glsl::barrier();
+//         //nbl::hlsl::glsl::memoryBarrierShared(); implied by the above
+//     }
+// };
 
-static ScratchProxy<ArithmeticSz> ballotAccessor;
+// static BallotProxy<ArithmeticSz> ballotAccessor;
 
 
 uint32_t globalIndex()
 {
-	return nbl::hlsl::glsl::gl_WorkGroupID().x*ITEMS_PER_WG+nbl::hlsl::workgroup::SubgroupContiguousIndex();
+    return nbl::hlsl::glsl::gl_WorkGroupID().x*ITEMS_PER_WG+nbl::hlsl::workgroup::SubgroupContiguousIndex();
 }
 
 bool canStore()
 {
-	return nbl::hlsl::workgroup::SubgroupContiguousIndex()<ITEMS_PER_WG;
+    return nbl::hlsl::workgroup::SubgroupContiguousIndex()<ITEMS_PER_WG;
 }
 
 [numthreads(WORKGROUP_SIZE,1,1)]
 void main()
 {
-	const type_t sourceVal = test();
-	if (globalIndex()==0u)
-		output[ballot<type_t>::BindingIndex].template Store<uint32_t>(0,nbl::hlsl::glsl::gl_SubgroupSize());
-
-	// we can only ballot booleans, so low bit
-	nbl::hlsl::workgroup::ballot<ScratchProxy<ArithmeticSz> >(bool(sourceVal & 0x1u), ballotAccessor);
-	// need to barrier between ballot and usages of a ballot by myself
-	ballotAccessor.workgroupExecutionAndMemoryBarrier();
-
-	uint32_t destVal = 0xdeadbeefu;
-#define CONSTEXPR_OP_TYPE_TEST(IS_OP) nbl::hlsl::is_same<nbl::hlsl::OPERATION<nbl::hlsl::bit_xor<float>,0x45>,nbl::hlsl::workgroup::IS_OP<nbl::hlsl::bit_xor<float>,0x45> >::value
-#define BALLOT_TEMPLATE_ARGS ITEMS_PER_WG,decltype(ballotAccessor),decltype(arithmeticAccessor),nbl::hlsl::jit::device_capabilities
-	if (CONSTEXPR_OP_TYPE_TEST(reduction))
-		destVal = nbl::hlsl::workgroup::ballotBitCount<BALLOT_TEMPLATE_ARGS>(ballotAccessor,arithmeticAccessor);
-	else if (CONSTEXPR_OP_TYPE_TEST(inclusive_scan))
-		destVal = nbl::hlsl::workgroup::ballotInclusiveBitCount<BALLOT_TEMPLATE_ARGS>(ballotAccessor,arithmeticAccessor);
-	else if (CONSTEXPR_OP_TYPE_TEST(exclusive_scan))
-		destVal = nbl::hlsl::workgroup::ballotExclusiveBitCount<BALLOT_TEMPLATE_ARGS>(ballotAccessor,arithmeticAccessor);
-	else
-	{
-		assert(false);
-	}
-#undef BALLOT_TEMPLATE_ARGS
-#undef CONSTEXPR_OP_TYPE_TEST
-
-	if (canStore())
-		output[ballot<type_t>::BindingIndex].template Store<type_t>(sizeof(uint32_t)+sizeof(type_t)*globalIndex(),destVal);
+    const type_t sourceVal = test();
+//     if (globalIndex()==0u)
+//         output[ballot<type_t>::BindingIndex].template Store<uint32_t>(0,nbl::hlsl::glsl::gl_SubgroupSize());
+
+//     // we can only ballot booleans, so low bit
+//     nbl::hlsl::workgroup::ballot<ScratchProxy<ArithmeticSz> >(bool(sourceVal & 0x1u), ballotAccessor);
+//     // need to barrier between ballot and usages of a ballot by myself
+//     ballotAccessor.workgroupExecutionAndMemoryBarrier();
+
+//     uint32_t destVal = 0xdeadbeefu;
+// #define CONSTEXPR_OP_TYPE_TEST(IS_OP) nbl::hlsl::is_same<nbl::hlsl::OPERATION<nbl::hlsl::bit_xor<float>,0x45>,nbl::hlsl::workgroup::IS_OP<nbl::hlsl::bit_xor<float>,0x45> >::value
+// #define BALLOT_TEMPLATE_ARGS ITEMS_PER_WG,decltype(ballotAccessor),decltype(arithmeticAccessor),nbl::hlsl::jit::device_capabilities
+//     if (CONSTEXPR_OP_TYPE_TEST(reduction))
+//         destVal = nbl::hlsl::workgroup::ballotBitCount<BALLOT_TEMPLATE_ARGS>(ballotAccessor,arithmeticAccessor);
+//     else if (CONSTEXPR_OP_TYPE_TEST(inclusive_scan))
+//         destVal = nbl::hlsl::workgroup::ballotInclusiveBitCount<BALLOT_TEMPLATE_ARGS>(ballotAccessor,arithmeticAccessor);
+//     else if (CONSTEXPR_OP_TYPE_TEST(exclusive_scan))
+//         destVal = nbl::hlsl::workgroup::ballotExclusiveBitCount<BALLOT_TEMPLATE_ARGS>(ballotAccessor,arithmeticAccessor);
+//     else
+//     {
+//         assert(false);
+//     }
+// #undef BALLOT_TEMPLATE_ARGS
+// #undef CONSTEXPR_OP_TYPE_TEST
+
+//     if (canStore())
+//         output[ballot<type_t>::BindingIndex].template Store<type_t>(sizeof(uint32_t)+sizeof(type_t)*globalIndex(),destVal);
 }
\ No newline at end of file
diff --git a/74a_Workgroup2ScanTest/main.cpp b/74a_Workgroup2ScanTest/main.cpp
index 147d231e2..7e11726d6 100644
--- a/74a_Workgroup2ScanTest/main.cpp
+++ b/74a_Workgroup2ScanTest/main.cpp
@@ -45,13 +45,13 @@ struct emulatedScanExclusive
 	static inline constexpr const char* name = "exclusive_scan";
 };
 
-class ArithmeticUnitTestApp final : public application_templates::BasicMultiQueueApplication, public application_templates::MonoAssetManagerAndBuiltinResourceApplication
+class Workgroup2ScanTestApp final : public application_templates::BasicMultiQueueApplication, public application_templates::MonoAssetManagerAndBuiltinResourceApplication
 {
 	using device_base_t = application_templates::BasicMultiQueueApplication;
 	using asset_base_t = application_templates::MonoAssetManagerAndBuiltinResourceApplication;
 
 public:
-	ArithmeticUnitTestApp(const path& _localInputCWD, const path& _localOutputCWD, const path& _sharedInputCWD, const path& _sharedOutputCWD) :
+	Workgroup2ScanTestApp(const path& _localInputCWD, const path& _localOutputCWD, const path& _sharedInputCWD, const path& _sharedOutputCWD) :
 		system::IApplicationFramework(_localInputCWD, _localOutputCWD, _sharedInputCWD, _sharedOutputCWD) {}
 
 	bool onAppInitialized(smart_refctd_ptr<ISystem>&& system) override
@@ -138,38 +138,38 @@ class ArithmeticUnitTestApp final : public application_templates::BasicMultiQueu
 			pipelineLayout = m_device->createPipelineLayout({},std::move(dsLayout));
 		}
 
-		const auto spirv_isa_cache_path = localOutputCWD/"spirv_isa_cache.bin";
-		// enclose to make sure file goes out of scope and we can reopen it
-		{
-			smart_refctd_ptr<const IFile> spirv_isa_cache_input;
-			// try to load SPIR-V to ISA cache
-			{
-				ISystem::future_t<smart_refctd_ptr<IFile>> fileCreate;
-				m_system->createFile(fileCreate,spirv_isa_cache_path,IFile::ECF_READ|IFile::ECF_MAPPABLE|IFile::ECF_COHERENT);
-				if (auto lock=fileCreate.acquire())
-					spirv_isa_cache_input = *lock;
-			}
-			// create the cache
-			{
-				std::span<const uint8_t> spirv_isa_cache_data = {};
-				if (spirv_isa_cache_input)
-					spirv_isa_cache_data = {reinterpret_cast<const uint8_t*>(spirv_isa_cache_input->getMappedPointer()),spirv_isa_cache_input->getSize()};
-				else
-					m_logger->log("Failed to load SPIR-V 2 ISA cache!",ILogger::ELL_PERFORMANCE);
-				// Normally we'd deserialize a `ICPUPipelineCache` properly and pass that instead
-				m_spirv_isa_cache = m_device->createPipelineCache(spirv_isa_cache_data);
-			}
-		}
-		{
-			// TODO: rename `deleteDirectory` to just `delete`? and a `IFile::setSize()` ?
-			m_system->deleteDirectory(spirv_isa_cache_path);
-			ISystem::future_t<smart_refctd_ptr<IFile>> fileCreate;
-			m_system->createFile(fileCreate,spirv_isa_cache_path,IFile::ECF_WRITE);
-			// I can be relatively sure I'll succeed to acquire the future, the pointer to created file might be null though.
-			m_spirv_isa_cache_output=*fileCreate.acquire();
-			if (!m_spirv_isa_cache_output)
-				logFail("Failed to Create SPIR-V to ISA cache file.");
-		}
+		//const auto spirv_isa_cache_path = localOutputCWD/"spirv_isa_cache.bin";
+		//// enclose to make sure file goes out of scope and we can reopen it
+		//{
+		//	smart_refctd_ptr<const IFile> spirv_isa_cache_input;
+		//	// try to load SPIR-V to ISA cache
+		//	{
+		//		ISystem::future_t<smart_refctd_ptr<IFile>> fileCreate;
+		//		m_system->createFile(fileCreate,spirv_isa_cache_path,IFile::ECF_READ|IFile::ECF_MAPPABLE|IFile::ECF_COHERENT);
+		//		if (auto lock=fileCreate.acquire())
+		//			spirv_isa_cache_input = *lock;
+		//	}
+		//	// create the cache
+		//	{
+		//		std::span<const uint8_t> spirv_isa_cache_data = {};
+		//		if (spirv_isa_cache_input)
+		//			spirv_isa_cache_data = {reinterpret_cast<const uint8_t*>(spirv_isa_cache_input->getMappedPointer()),spirv_isa_cache_input->getSize()};
+		//		else
+		//			m_logger->log("Failed to load SPIR-V 2 ISA cache!",ILogger::ELL_PERFORMANCE);
+		//		// Normally we'd deserialize a `ICPUPipelineCache` properly and pass that instead
+		//		m_spirv_isa_cache = m_device->createPipelineCache(spirv_isa_cache_data);
+		//	}
+		//}
+		//{
+		//	// TODO: rename `deleteDirectory` to just `delete`? and a `IFile::setSize()` ?
+		//	m_system->deleteDirectory(spirv_isa_cache_path);
+		//	ISystem::future_t<smart_refctd_ptr<IFile>> fileCreate;
+		//	m_system->createFile(fileCreate,spirv_isa_cache_path,IFile::ECF_WRITE);
+		//	// I can be relatively sure I'll succeed to acquire the future, the pointer to created file might be null though.
+		//	m_spirv_isa_cache_output=*fileCreate.acquire();
+		//	if (!m_spirv_isa_cache_output)
+		//		logFail("Failed to Create SPIR-V to ISA cache file.");
+		//}
 
 		// load shader source from file
 		auto getShaderSource = [&](const char* filePath) -> auto
@@ -187,7 +187,7 @@ class ArithmeticUnitTestApp final : public application_templates::BasicMultiQueu
 			return smart_refctd_ptr_static_cast<ICPUShader>(firstAssetInBundle);
 		};
 
-		auto subgroupTestSource = getShaderSource("app_resources/testSubgroup.comp.hlsl");
+		//auto subgroupTestSource = getShaderSource("app_resources/testSubgroup.comp.hlsl");
 		auto workgroupTestSource = getShaderSource("app_resources/testWorkgroup.comp.hlsl");
 		// now create or retrieve final resources to run our tests
 		sema = m_device->createSemaphore(timelineValue);
@@ -202,47 +202,47 @@ class ArithmeticUnitTestApp final : public application_templates::BasicMultiQueu
 		}
 
 		const auto MaxWorkgroupSize = m_physicalDevice->getLimits().maxComputeWorkGroupInvocations;
+		const std::array<uint32_t, 2> WorkgroupSizes = { 512, 1024 };
 		const auto MinSubgroupSize = m_physicalDevice->getLimits().minSubgroupSize;
 		const auto MaxSubgroupSize = m_physicalDevice->getLimits().maxSubgroupSize;
 		for (auto subgroupSize=MinSubgroupSize; subgroupSize <= MaxSubgroupSize; subgroupSize *= 2u)
 		{
 			const uint8_t subgroupSizeLog2 = hlsl::findMSB(subgroupSize);
-			for (uint32_t workgroupSize = subgroupSize; workgroupSize <= MaxWorkgroupSize; workgroupSize += subgroupSize)
+			for (uint32_t i = 0; i < WorkgroupSizes.size(); i++)
 			{
+				const uint32_t workgroupSize = WorkgroupSizes[i];
 				// make sure renderdoc captures everything for debugging
 				m_api->startCapture();
 				m_logger->log("Testing Workgroup Size %u with Subgroup Size %u", ILogger::ELL_INFO, workgroupSize, subgroupSize);
 
 				bool passed = true;
 				// TODO async the testing
-				passed = runTest<emulatedReduction, false>(subgroupTestSource, elementCount, subgroupSizeLog2, workgroupSize) && passed;
-				logTestOutcome(passed, workgroupSize);
-				passed = runTest<emulatedScanInclusive, false>(subgroupTestSource, elementCount, subgroupSizeLog2, workgroupSize) && passed;
-				logTestOutcome(passed, workgroupSize);
-				passed = runTest<emulatedScanExclusive, false>(subgroupTestSource, elementCount, subgroupSizeLog2, workgroupSize) && passed;
-				logTestOutcome(passed, workgroupSize);
-				for (uint32_t itemsPerWG = workgroupSize; itemsPerWG > workgroupSize - subgroupSize; itemsPerWG--)
-				{
-					m_logger->log("Testing Item Count %u", ILogger::ELL_INFO, itemsPerWG);
-					passed = runTest<emulatedReduction, true>(workgroupTestSource, elementCount, subgroupSizeLog2, workgroupSize, itemsPerWG) && passed;
-					logTestOutcome(passed, itemsPerWG);
-					passed = runTest<emulatedScanInclusive, true>(workgroupTestSource, elementCount, subgroupSizeLog2, workgroupSize, itemsPerWG) && passed;
-					logTestOutcome(passed, itemsPerWG);
-					passed = runTest<emulatedScanExclusive, true>(workgroupTestSource, elementCount, subgroupSizeLog2, workgroupSize, itemsPerWG) && passed;
-					logTestOutcome(passed, itemsPerWG);
-				}
+				//passed = runTest<emulatedReduction, false>(subgroupTestSource, elementCount, subgroupSizeLog2, workgroupSize) && passed;
+				//logTestOutcome(passed, workgroupSize);
+				//passed = runTest<emulatedScanInclusive, false>(subgroupTestSource, elementCount, subgroupSizeLog2, workgroupSize) && passed;
+				//logTestOutcome(passed, workgroupSize);
+				//passed = runTest<emulatedScanExclusive, false>(subgroupTestSource, elementCount, subgroupSizeLog2, workgroupSize) && passed;
+				//logTestOutcome(passed, workgroupSize);
+				const uint32_t itemsPerWG = workgroupSize;
+				m_logger->log("Testing Item Count %u", ILogger::ELL_INFO, itemsPerWG);
+				passed = runTest<emulatedReduction, true>(workgroupTestSource, elementCount, subgroupSizeLog2, workgroupSize, itemsPerWG) && passed;
+				logTestOutcome(passed, itemsPerWG);
+				//passed = runTest<emulatedScanInclusive, true>(workgroupTestSource, elementCount, subgroupSizeLog2, workgroupSize, itemsPerWG) && passed;
+				//logTestOutcome(passed, itemsPerWG);
+				//passed = runTest<emulatedScanExclusive, true>(workgroupTestSource, elementCount, subgroupSizeLog2, workgroupSize, itemsPerWG) && passed;
+				//logTestOutcome(passed, itemsPerWG);
 				m_api->endCapture();
 
 				// save cache every now and then	
-				{
-					auto cpu = m_spirv_isa_cache->convertToCPUCache();
-					// Normally we'd beautifully JSON serialize the thing, allow multiple devices & drivers + metadata
-					auto bin = cpu->getEntries().begin()->second.bin;
-					IFile::success_t success;
-					m_spirv_isa_cache_output->write(success,bin->data(),0ull,bin->size());
-					if (!success)
-						logFail("Could not write Create SPIR-V to ISA cache to disk!");
-				}
+				//{
+				//	auto cpu = m_spirv_isa_cache->convertToCPUCache();
+				//	// Normally we'd beautifully JSON serialize the thing, allow multiple devices & drivers + metadata
+				//	auto bin = cpu->getEntries().begin()->second.bin;
+				//	IFile::success_t success;
+				//	m_spirv_isa_cache_output->write(success,bin->data(),0ull,bin->size());
+				//	if (!success)
+				//		logFail("Could not write Create SPIR-V to ISA cache to disk!");
+				//}
 			}
 		}
 
@@ -294,33 +294,27 @@ class ArithmeticUnitTestApp final : public application_templates::BasicMultiQueu
 		return pipeline;
 	}
 
-	/*template<template<class> class Arithmetic, bool WorkgroupTest>
-	bool runTest(const smart_refctd_ptr<const ICPUShader>& source, const uint32_t elementCount, const uint32_t workgroupSize, uint32_t itemsPerWG = ~0u)
-	{
-		return true;
-	}*/
-
 	template<template<class> class Arithmetic, bool WorkgroupTest>
 	bool runTest(const smart_refctd_ptr<const ICPUShader>& source, const uint32_t elementCount, const uint8_t subgroupSizeLog2, const uint32_t workgroupSize, uint32_t itemsPerWG = ~0u)
 	{
 		std::string arith_name = Arithmetic<bit_xor<float>>::name;
 
 		smart_refctd_ptr<ICPUShader> overridenUnspecialized;
-		if constexpr (WorkgroupTest)
-		{
+		//if constexpr (WorkgroupTest)
+		//{
 			overridenUnspecialized = CHLSLCompiler::createOverridenCopy(
 				source.get(), "#define OPERATION %s\n#define WORKGROUP_SIZE %d\n#define ITEMS_PER_WG %d\n",
-				(("workgroup::") + arith_name).c_str(), workgroupSize, itemsPerWG
-			);
-		}
-		else
-		{
-			itemsPerWG = workgroupSize;
-			overridenUnspecialized = CHLSLCompiler::createOverridenCopy(
-				source.get(), "#define OPERATION %s\n#define WORKGROUP_SIZE %d\n",
-				(("subgroup::") + arith_name).c_str(), workgroupSize
+				(("workgroup2::") + arith_name).c_str(), workgroupSize, itemsPerWG
 			);
-		}
+		//}
+		//else
+		//{
+		//	itemsPerWG = workgroupSize;
+		//	overridenUnspecialized = CHLSLCompiler::createOverridenCopy(
+		//		source.get(), "#define OPERATION %s\n#define WORKGROUP_SIZE %d\n",
+		//		(("subgroup::") + arith_name).c_str(), workgroupSize
+		//	);
+		//}
 		auto pipeline = createPipeline(overridenUnspecialized.get(),subgroupSizeLog2);
 
 		// TODO: overlap dispatches with memory readbacks (requires multiple copies of `buffers`)
@@ -366,8 +360,8 @@ class ArithmeticUnitTestApp final : public application_templates::BasicMultiQueu
 		passed = validateResults<Arithmetic, multiplies<uint32_t>, WorkgroupTest>(itemsPerWG, workgroupCount) && passed;
 		passed = validateResults<Arithmetic, minimum<uint32_t>, WorkgroupTest>(itemsPerWG, workgroupCount) && passed;
 		passed = validateResults<Arithmetic, maximum<uint32_t>, WorkgroupTest>(itemsPerWG, workgroupCount) && passed;
-		if constexpr (WorkgroupTest)
-			passed = validateResults<Arithmetic, ballot<uint32_t>, WorkgroupTest>(itemsPerWG, workgroupCount) && passed;
+		//if constexpr (WorkgroupTest)
+		//	passed = validateResults<Arithmetic, ballot<uint32_t>, WorkgroupTest>(itemsPerWG, workgroupCount) && passed;
 
 		return passed;
 	}
@@ -395,27 +389,27 @@ class ArithmeticUnitTestApp final : public application_templates::BasicMultiQueu
 		// TODO: parallel for (the temporary values need to be threadlocal or what?)
 		// now check if the data obtained has valid values
 		type_t* tmp = new type_t[itemsPerWG];
-		type_t* ballotInput = new type_t[itemsPerWG];
+		//type_t* ballotInput = new type_t[itemsPerWG];
 		for (uint32_t workgroupID = 0u; success && workgroupID < workgroupCount; workgroupID++)
 		{
 			const auto workgroupOffset = workgroupID * itemsPerWG;
 
-			if constexpr (WorkgroupTest)
-			{
-				if constexpr (std::is_same_v<ballot<type_t>, Binop>)
-				{
-					for (auto i = 0u; i < itemsPerWG; i++)
-						ballotInput[i] = inputData[i + workgroupOffset] & 0x1u;
-					Arithmetic<Binop>::impl(tmp, ballotInput, itemsPerWG);
-				}
-				else
+			//if constexpr (WorkgroupTest)
+			//{
+			//	if constexpr (std::is_same_v<ballot<type_t>, Binop>)
+			//	{
+			//		for (auto i = 0u; i < itemsPerWG; i++)
+			//			ballotInput[i] = inputData[i + workgroupOffset] & 0x1u;
+			//		Arithmetic<Binop>::impl(tmp, ballotInput, itemsPerWG);
+			//	}
+			//	else
 					Arithmetic<Binop>::impl(tmp, inputData + workgroupOffset, itemsPerWG);
-			}
-			else
-			{
-				for (uint32_t pseudoSubgroupID = 0u; pseudoSubgroupID < itemsPerWG; pseudoSubgroupID += subgroupSize)
-					Arithmetic<Binop>::impl(tmp + pseudoSubgroupID, inputData + workgroupOffset + pseudoSubgroupID, subgroupSize);
-			}
+			//}
+			//else
+			//{
+			//	for (uint32_t pseudoSubgroupID = 0u; pseudoSubgroupID < itemsPerWG; pseudoSubgroupID += subgroupSize)
+			//		Arithmetic<Binop>::impl(tmp + pseudoSubgroupID, inputData + workgroupOffset + pseudoSubgroupID, subgroupSize);
+			//}
 
 			for (uint32_t localInvocationIndex = 0u; localInvocationIndex < itemsPerWG; localInvocationIndex++)
 			{
@@ -434,7 +428,7 @@ class ArithmeticUnitTestApp final : public application_templates::BasicMultiQueu
 				}
 			}
 		}
-		delete[] ballotInput;
+		//delete[] ballotInput;
 		delete[] tmp;
 
 		return success;
@@ -459,4 +453,4 @@ class ArithmeticUnitTestApp final : public application_templates::BasicMultiQueu
 	uint32_t totalFailCount = 0;
 };
 
-NBL_MAIN_FUNC(ArithmeticUnitTestApp)
\ No newline at end of file
+NBL_MAIN_FUNC(Workgroup2ScanTestApp)
\ No newline at end of file

From f11b3df746c4c69721daeb264a925c9e8dd86d1d Mon Sep 17 00:00:00 2001
From: keptsecret <sorchon@gmail.com>
Date: Tue, 29 Apr 2025 12:04:14 +0700
Subject: [PATCH 208/529] fixes to test

---
 74a_Workgroup2ScanTest/app_resources/testWorkgroup.comp.hlsl | 2 +-
 74a_Workgroup2ScanTest/main.cpp                              | 4 ++--
 2 files changed, 3 insertions(+), 3 deletions(-)

diff --git a/74a_Workgroup2ScanTest/app_resources/testWorkgroup.comp.hlsl b/74a_Workgroup2ScanTest/app_resources/testWorkgroup.comp.hlsl
index 315550da0..1f313e6f8 100644
--- a/74a_Workgroup2ScanTest/app_resources/testWorkgroup.comp.hlsl
+++ b/74a_Workgroup2ScanTest/app_resources/testWorkgroup.comp.hlsl
@@ -96,7 +96,7 @@ struct DataProxy
     void set(const uint32_t ix, const dtype_t value)
     {
         // inputValue[ix] = value;
-        output[Binop::BindingIndex].template Store<type_t>(sizeof(uint32_t) + sizeof(type_t) * globalIndex(), value);
+        // output[Binop::BindingIndex].template Store<type_t>(sizeof(uint32_t) + sizeof(type_t) * globalIndex(), value);
         output[Binop::BindingIndex].template Store<type_t>(sizeof(uint32_t) + sizeof(type_t) * ix, value);
     }
 
diff --git a/74a_Workgroup2ScanTest/main.cpp b/74a_Workgroup2ScanTest/main.cpp
index 7e11726d6..4dc337e20 100644
--- a/74a_Workgroup2ScanTest/main.cpp
+++ b/74a_Workgroup2ScanTest/main.cpp
@@ -223,7 +223,7 @@ class Workgroup2ScanTestApp final : public application_templates::BasicMultiQueu
 				//logTestOutcome(passed, workgroupSize);
 				//passed = runTest<emulatedScanExclusive, false>(subgroupTestSource, elementCount, subgroupSizeLog2, workgroupSize) && passed;
 				//logTestOutcome(passed, workgroupSize);
-				const uint32_t itemsPerWG = workgroupSize;
+				const uint32_t itemsPerWG = 1024;	// TODO use Config::VirtualWorkgroupSize somehow
 				m_logger->log("Testing Item Count %u", ILogger::ELL_INFO, itemsPerWG);
 				passed = runTest<emulatedReduction, true>(workgroupTestSource, elementCount, subgroupSizeLog2, workgroupSize, itemsPerWG) && passed;
 				logTestOutcome(passed, itemsPerWG);
@@ -318,7 +318,7 @@ class Workgroup2ScanTestApp final : public application_templates::BasicMultiQueu
 		auto pipeline = createPipeline(overridenUnspecialized.get(),subgroupSizeLog2);
 
 		// TODO: overlap dispatches with memory readbacks (requires multiple copies of `buffers`)
-		const uint32_t workgroupCount = elementCount / itemsPerWG;
+		const uint32_t workgroupCount = elementCount / itemsPerWG;	// TODO use Config::VirtualWorkgroupSize somehow
 		cmdbuf->begin(IGPUCommandBuffer::USAGE::NONE);
 		cmdbuf->bindComputePipeline(pipeline.get());
 		cmdbuf->bindDescriptorSets(EPBP_COMPUTE, pipeline->getLayout(), 0u, 1u, &descriptorSet.get());

From 9f690ee8077344754aa4045f7cbce99b2c16abee Mon Sep 17 00:00:00 2001
From: keptsecret <sorchon@gmail.com>
Date: Tue, 29 Apr 2025 15:53:42 +0700
Subject: [PATCH 209/529] tests with multiple items per invoc

---
 .../app_resources/testWorkgroup.comp.hlsl     |  4 +-
 74a_Workgroup2ScanTest/main.cpp               | 59 +++++++++++++++----
 2 files changed, 51 insertions(+), 12 deletions(-)

diff --git a/74a_Workgroup2ScanTest/app_resources/testWorkgroup.comp.hlsl b/74a_Workgroup2ScanTest/app_resources/testWorkgroup.comp.hlsl
index 1f313e6f8..67bb9c5f2 100644
--- a/74a_Workgroup2ScanTest/app_resources/testWorkgroup.comp.hlsl
+++ b/74a_Workgroup2ScanTest/app_resources/testWorkgroup.comp.hlsl
@@ -26,7 +26,7 @@
 // https://github.com/microsoft/DirectXShaderCompiler/issues/6144
 uint32_t3 nbl::hlsl::glsl::gl_WorkGroupSize() {return uint32_t3(WORKGROUP_SIZE,1,1);}
 
-#define ITEMS_PER_INVOCATION 1
+// #define ITEMS_PER_INVOCATION 1
 
 #ifndef ITEMS_PER_INVOCATION
 #error "Define ITEMS_PER_INVOCATION!"
@@ -44,7 +44,7 @@ uint32_t globalIndex();
 // since we test ITEMS_PER_WG<WorkgroupSize we need this so workgroups don't overwrite each other's outputs
 bool canStore();
 
-#define SUBGROUP_SIZE_LOG2 5
+// #define SUBGROUP_SIZE_LOG2 5
 
 #ifndef OPERATION
 #error "Define OPERATION!"
diff --git a/74a_Workgroup2ScanTest/main.cpp b/74a_Workgroup2ScanTest/main.cpp
index 4dc337e20..123dda5a4 100644
--- a/74a_Workgroup2ScanTest/main.cpp
+++ b/74a_Workgroup2ScanTest/main.cpp
@@ -223,7 +223,7 @@ class Workgroup2ScanTestApp final : public application_templates::BasicMultiQueu
 				//logTestOutcome(passed, workgroupSize);
 				//passed = runTest<emulatedScanExclusive, false>(subgroupTestSource, elementCount, subgroupSizeLog2, workgroupSize) && passed;
 				//logTestOutcome(passed, workgroupSize);
-				const uint32_t itemsPerWG = 1024;	// TODO use Config::VirtualWorkgroupSize somehow
+				const uint32_t itemsPerWG = ItemsPerInvocation * max(workgroupSize >> subgroupSizeLog2, subgroupSize) << subgroupSizeLog2;	// TODO use Config::VirtualWorkgroupSize somehow
 				m_logger->log("Testing Item Count %u", ILogger::ELL_INFO, itemsPerWG);
 				passed = runTest<emulatedReduction, true>(workgroupTestSource, elementCount, subgroupSizeLog2, workgroupSize, itemsPerWG) && passed;
 				logTestOutcome(passed, itemsPerWG);
@@ -289,7 +289,7 @@ class Workgroup2ScanTestApp final : public application_templates::BasicMultiQueu
 			.requireFullSubgroups = true
 		};
 		core::smart_refctd_ptr<IGPUComputePipeline> pipeline;
-		if (!m_device->createComputePipelines(m_spirv_isa_cache.get(),{&params,1},&pipeline))
+		if (!m_device->createComputePipelines(nullptr,{&params,1},&pipeline))
 			return nullptr;
 		return pipeline;
 	}
@@ -299,13 +299,13 @@ class Workgroup2ScanTestApp final : public application_templates::BasicMultiQueu
 	{
 		std::string arith_name = Arithmetic<bit_xor<float>>::name;
 
-		smart_refctd_ptr<ICPUShader> overridenUnspecialized;
+		//smart_refctd_ptr<ICPUShader> overridenUnspecialized;
 		//if constexpr (WorkgroupTest)
 		//{
-			overridenUnspecialized = CHLSLCompiler::createOverridenCopy(
-				source.get(), "#define OPERATION %s\n#define WORKGROUP_SIZE %d\n#define ITEMS_PER_WG %d\n",
-				(("workgroup2::") + arith_name).c_str(), workgroupSize, itemsPerWG
-			);
+			//overridenUnspecialized = CHLSLCompiler::createOverridenCopy(
+			//	source.get(), "#define OPERATION %s\n#define WORKGROUP_SIZE %d\n#define ITEMS_PER_WG %d\n",
+			//	(("workgroup2::") + arith_name).c_str(), workgroupSize, itemsPerWG
+			//);
 		//}
 		//else
 		//{
@@ -315,7 +315,46 @@ class Workgroup2ScanTestApp final : public application_templates::BasicMultiQueu
 		//		(("subgroup::") + arith_name).c_str(), workgroupSize
 		//	);
 		//}
-		auto pipeline = createPipeline(overridenUnspecialized.get(),subgroupSizeLog2);
+
+		auto compiler = make_smart_refctd_ptr<asset::CHLSLCompiler>(smart_refctd_ptr(m_system));
+		CHLSLCompiler::SOptions options = {};
+		options.stage = IShader::E_SHADER_STAGE::ESS_COMPUTE;
+		options.targetSpirvVersion = m_device->getPhysicalDevice()->getLimits().spirvVersion;
+		options.spirvOptimizer = nullptr;
+#ifndef _NBL_DEBUG
+		ISPIRVOptimizer::E_OPTIMIZER_PASS optPasses = ISPIRVOptimizer::EOP_STRIP_DEBUG_INFO;
+		auto opt = make_smart_refctd_ptr<ISPIRVOptimizer>(std::span<ISPIRVOptimizer::E_OPTIMIZER_PASS>(&optPasses, 1));
+		options.spirvOptimizer = opt.get();
+#else
+		options.debugInfoFlags |= IShaderCompiler::E_DEBUG_INFO_FLAGS::EDIF_LINE_BIT;
+#endif
+		options.preprocessorOptions.sourceIdentifier = source->getFilepathHint();
+		options.preprocessorOptions.logger = m_logger.get();
+
+		auto* includeFinder = compiler->getDefaultIncludeFinder();
+		includeFinder->addSearchPath("nbl/builtin/hlsl/jit", core::make_smart_refctd_ptr<CJITIncludeLoader>(m_physicalDevice->getLimits(), m_device->getEnabledFeatures()));
+		options.preprocessorOptions.includeFinder = includeFinder;
+
+		const std::string definitions[5] = {
+			"workgroup2::" + arith_name,
+			std::to_string(workgroupSize),
+			std::to_string(itemsPerWG),
+			std::to_string(ItemsPerInvocation),
+			std::to_string(subgroupSizeLog2)
+		};
+
+		const IShaderCompiler::SMacroDefinition defines[5] = {
+			{ "OPERATION", definitions[0] },
+			{ "WORKGROUP_SIZE", definitions[1] },
+			{ "ITEMS_PER_WG", definitions[2] },
+			{ "ITEMS_PER_INVOCATION", definitions[3] },
+			{ "SUBGROUP_SIZE_LOG2", definitions[4] }
+		};
+		options.preprocessorOptions.extraDefines = { defines, defines + 5 };
+
+		smart_refctd_ptr<ICPUShader> overriddenUnspecialized = compiler->compileToSPIRV((const char*)source->getContent()->getPointer(), options);
+
+		auto pipeline = createPipeline(overriddenUnspecialized.get(),subgroupSizeLog2);
 
 		// TODO: overlap dispatches with memory readbacks (requires multiple copies of `buffers`)
 		const uint32_t workgroupCount = elementCount / itemsPerWG;	// TODO use Config::VirtualWorkgroupSize somehow
@@ -436,8 +475,6 @@ class Workgroup2ScanTestApp final : public application_templates::BasicMultiQueu
 
 	IQueue* transferDownQueue;
 	IQueue* computeQueue;
-	smart_refctd_ptr<IGPUPipelineCache> m_spirv_isa_cache;
-	smart_refctd_ptr<IFile> m_spirv_isa_cache_output;
 
 	uint32_t* inputData = nullptr;
 	constexpr static inline uint32_t OutputBufferCount = 8u;
@@ -451,6 +488,8 @@ class Workgroup2ScanTestApp final : public application_templates::BasicMultiQueu
 	smart_refctd_ptr<ICPUBuffer> resultsBuffer;
 
 	uint32_t totalFailCount = 0;
+
+	uint32_t ItemsPerInvocation = 4u;
 };
 
 NBL_MAIN_FUNC(Workgroup2ScanTestApp)
\ No newline at end of file

From 755f89ac0b833fe22c2f832c171e465b1ecbd31b Mon Sep 17 00:00:00 2001
From: keptsecret <sorchon@gmail.com>
Date: Tue, 29 Apr 2025 17:02:35 +0700
Subject: [PATCH 210/529] inclusive scan test

---
 74a_Workgroup2ScanTest/main.cpp | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/74a_Workgroup2ScanTest/main.cpp b/74a_Workgroup2ScanTest/main.cpp
index 123dda5a4..9c695b280 100644
--- a/74a_Workgroup2ScanTest/main.cpp
+++ b/74a_Workgroup2ScanTest/main.cpp
@@ -227,8 +227,8 @@ class Workgroup2ScanTestApp final : public application_templates::BasicMultiQueu
 				m_logger->log("Testing Item Count %u", ILogger::ELL_INFO, itemsPerWG);
 				passed = runTest<emulatedReduction, true>(workgroupTestSource, elementCount, subgroupSizeLog2, workgroupSize, itemsPerWG) && passed;
 				logTestOutcome(passed, itemsPerWG);
-				//passed = runTest<emulatedScanInclusive, true>(workgroupTestSource, elementCount, subgroupSizeLog2, workgroupSize, itemsPerWG) && passed;
-				//logTestOutcome(passed, itemsPerWG);
+				passed = runTest<emulatedScanInclusive, true>(workgroupTestSource, elementCount, subgroupSizeLog2, workgroupSize, itemsPerWG) && passed;
+				logTestOutcome(passed, itemsPerWG);
 				//passed = runTest<emulatedScanExclusive, true>(workgroupTestSource, elementCount, subgroupSizeLog2, workgroupSize, itemsPerWG) && passed;
 				//logTestOutcome(passed, itemsPerWG);
 				m_api->endCapture();
@@ -489,7 +489,7 @@ class Workgroup2ScanTestApp final : public application_templates::BasicMultiQueu
 
 	uint32_t totalFailCount = 0;
 
-	uint32_t ItemsPerInvocation = 4u;
+	uint32_t ItemsPerInvocation = 1u;
 };
 
 NBL_MAIN_FUNC(Workgroup2ScanTestApp)
\ No newline at end of file

From 12f3d84898db815ec5f6610daba753a1cd428a03 Mon Sep 17 00:00:00 2001
From: Erfan Ahmadi <ahmadierfan99@gmail.com>
Date: Wed, 30 Apr 2025 08:46:25 +0400
Subject: [PATCH 211/529] grid dtm tasks

---
 62_CAD/DrawResourcesFiller.cpp                  |  3 +++
 62_CAD/shaders/globals.hlsl                     |  5 +++++
 .../shaders/main_pipeline/fragment_shader.hlsl  | 17 +++++++++++++++++
 62_CAD/shaders/main_pipeline/vertex_shader.hlsl |  1 +
 4 files changed, 26 insertions(+)

diff --git a/62_CAD/DrawResourcesFiller.cpp b/62_CAD/DrawResourcesFiller.cpp
index 30fb6d748..d12837691 100644
--- a/62_CAD/DrawResourcesFiller.cpp
+++ b/62_CAD/DrawResourcesFiller.cpp
@@ -281,6 +281,9 @@ void DrawResourcesFiller::drawFontGlyph(
 	}
 }
 
+// TODO[Przemek]: similar to other drawXXX and drawXXX_internal functions that create mainobjects, drawObjects and push additional info in geometry buffer, input to function would be a GridDTMInfo
+// We don't have an allocator or memory management for texture updates yet, see how `_test_addImageObject` is being temporarily used (Descriptor updates and pipeline barriers) to upload an image into gpu and update a descriptor slot (it will become more sophisticated but doesn't block you)
+
 void DrawResourcesFiller::_test_addImageObject(float64_t2 topLeftPos, float32_t2 size, float32_t rotation, SIntendedSubmitInfo& intendedNextSubmit)
 {
 	auto addImageObject_Internal = [&](const ImageObjectInfo& imageObjectInfo, uint32_t mainObjIdx) -> bool
diff --git a/62_CAD/shaders/globals.hlsl b/62_CAD/shaders/globals.hlsl
index 045e11f1e..d12c80bef 100644
--- a/62_CAD/shaders/globals.hlsl
+++ b/62_CAD/shaders/globals.hlsl
@@ -231,6 +231,11 @@ struct ImageObjectInfo
     uint32_t textureID; // 4 bytes (32)
 };
 
+/*
+GRID DTM Info similar to `ImageObjectInfo`
+other than textureID, there will be dtmSettingsIdx referencing a dtmSettings
+*/
+
 static uint32_t packR11G11B10_UNORM(float32_t3 color)
 {
     // Scale and convert to integers
diff --git a/62_CAD/shaders/main_pipeline/fragment_shader.hlsl b/62_CAD/shaders/main_pipeline/fragment_shader.hlsl
index 3ac219a66..326c4cf0d 100644
--- a/62_CAD/shaders/main_pipeline/fragment_shader.hlsl
+++ b/62_CAD/shaders/main_pipeline/fragment_shader.hlsl
@@ -388,6 +388,23 @@ float4 fragMain(PSInput input) : SV_TARGET
                 localAlpha = colorSample.a;
             }
         }
+        // objType GRID_DTM here
+        {
+            // NOTE: create and read from a texture as a last step, you can generate the height values procedurally from a function while you're working on the sdf stuff.
+            
+            // Query dtm settings
+            // use texture Gather to get 4 corners: https://learn.microsoft.com/en-us/windows/win32/direct3dhlsl/dx-graphics-hlsl-to-gather
+            // A. the outlines can be stippled, use phaseshift of the line such that they started from the grid's origin worldspace coordinate
+            // B. the contours are computed for triangles, use the same function as for dtms, choose between the two triangles based on local UV coords in current cell
+                // Make it so we can choose which diagonal to use to construct the triangle, it's either u=v or u=1-v
+            // C. Height shading same as contours (split into two triangles)
+
+            // Heights can have invalid values (let's say NaN) if a cell corner has NaN value then no triangle (for contour and shading) and no outline should include that corner. (see DTM image in discord with gaps)
+            
+            // TODO: we need to emulate dilation and do sdf of neighbouring cells as well. because contours, outlines and shading can bleed into other cells for AA.
+            // [NOTE] Do dilation as last step, when everything else works fine
+        }
+        
 
         uint2 fragCoord = uint2(input.position.xy);
         
diff --git a/62_CAD/shaders/main_pipeline/vertex_shader.hlsl b/62_CAD/shaders/main_pipeline/vertex_shader.hlsl
index 7ce0f43e7..9d4a384a1 100644
--- a/62_CAD/shaders/main_pipeline/vertex_shader.hlsl
+++ b/62_CAD/shaders/main_pipeline/vertex_shader.hlsl
@@ -620,6 +620,7 @@ PSInput main(uint vertexID : SV_VertexID)
             outV.setImageUV(uv);
             outV.setImageTextureId(textureID);
         }
+        // TODO: Przemek objType GRID_DTM, Similar transformations to IMAGE
 
     // Make the cage fullscreen for testing: 
 #if 0

From b8415ad608844d9cfaa8ffc9fe9d15e2c31db71b Mon Sep 17 00:00:00 2001
From: keptsecret <sorchon@gmail.com>
Date: Wed, 30 Apr 2025 14:08:05 +0700
Subject: [PATCH 212/529] exclusive scan test, remove comments

---
 74a_Workgroup2ScanTest/main.cpp | 99 ++-------------------------------
 1 file changed, 5 insertions(+), 94 deletions(-)

diff --git a/74a_Workgroup2ScanTest/main.cpp b/74a_Workgroup2ScanTest/main.cpp
index 9c695b280..f0064a4c0 100644
--- a/74a_Workgroup2ScanTest/main.cpp
+++ b/74a_Workgroup2ScanTest/main.cpp
@@ -138,39 +138,6 @@ class Workgroup2ScanTestApp final : public application_templates::BasicMultiQueu
 			pipelineLayout = m_device->createPipelineLayout({},std::move(dsLayout));
 		}
 
-		//const auto spirv_isa_cache_path = localOutputCWD/"spirv_isa_cache.bin";
-		//// enclose to make sure file goes out of scope and we can reopen it
-		//{
-		//	smart_refctd_ptr<const IFile> spirv_isa_cache_input;
-		//	// try to load SPIR-V to ISA cache
-		//	{
-		//		ISystem::future_t<smart_refctd_ptr<IFile>> fileCreate;
-		//		m_system->createFile(fileCreate,spirv_isa_cache_path,IFile::ECF_READ|IFile::ECF_MAPPABLE|IFile::ECF_COHERENT);
-		//		if (auto lock=fileCreate.acquire())
-		//			spirv_isa_cache_input = *lock;
-		//	}
-		//	// create the cache
-		//	{
-		//		std::span<const uint8_t> spirv_isa_cache_data = {};
-		//		if (spirv_isa_cache_input)
-		//			spirv_isa_cache_data = {reinterpret_cast<const uint8_t*>(spirv_isa_cache_input->getMappedPointer()),spirv_isa_cache_input->getSize()};
-		//		else
-		//			m_logger->log("Failed to load SPIR-V 2 ISA cache!",ILogger::ELL_PERFORMANCE);
-		//		// Normally we'd deserialize a `ICPUPipelineCache` properly and pass that instead
-		//		m_spirv_isa_cache = m_device->createPipelineCache(spirv_isa_cache_data);
-		//	}
-		//}
-		//{
-		//	// TODO: rename `deleteDirectory` to just `delete`? and a `IFile::setSize()` ?
-		//	m_system->deleteDirectory(spirv_isa_cache_path);
-		//	ISystem::future_t<smart_refctd_ptr<IFile>> fileCreate;
-		//	m_system->createFile(fileCreate,spirv_isa_cache_path,IFile::ECF_WRITE);
-		//	// I can be relatively sure I'll succeed to acquire the future, the pointer to created file might be null though.
-		//	m_spirv_isa_cache_output=*fileCreate.acquire();
-		//	if (!m_spirv_isa_cache_output)
-		//		logFail("Failed to Create SPIR-V to ISA cache file.");
-		//}
-
 		// load shader source from file
 		auto getShaderSource = [&](const char* filePath) -> auto
 		{
@@ -187,7 +154,6 @@ class Workgroup2ScanTestApp final : public application_templates::BasicMultiQueu
 			return smart_refctd_ptr_static_cast<ICPUShader>(firstAssetInBundle);
 		};
 
-		//auto subgroupTestSource = getShaderSource("app_resources/testSubgroup.comp.hlsl");
 		auto workgroupTestSource = getShaderSource("app_resources/testWorkgroup.comp.hlsl");
 		// now create or retrieve final resources to run our tests
 		sema = m_device->createSemaphore(timelineValue);
@@ -216,33 +182,15 @@ class Workgroup2ScanTestApp final : public application_templates::BasicMultiQueu
 				m_logger->log("Testing Workgroup Size %u with Subgroup Size %u", ILogger::ELL_INFO, workgroupSize, subgroupSize);
 
 				bool passed = true;
-				// TODO async the testing
-				//passed = runTest<emulatedReduction, false>(subgroupTestSource, elementCount, subgroupSizeLog2, workgroupSize) && passed;
-				//logTestOutcome(passed, workgroupSize);
-				//passed = runTest<emulatedScanInclusive, false>(subgroupTestSource, elementCount, subgroupSizeLog2, workgroupSize) && passed;
-				//logTestOutcome(passed, workgroupSize);
-				//passed = runTest<emulatedScanExclusive, false>(subgroupTestSource, elementCount, subgroupSizeLog2, workgroupSize) && passed;
-				//logTestOutcome(passed, workgroupSize);
 				const uint32_t itemsPerWG = ItemsPerInvocation * max(workgroupSize >> subgroupSizeLog2, subgroupSize) << subgroupSizeLog2;	// TODO use Config::VirtualWorkgroupSize somehow
 				m_logger->log("Testing Item Count %u", ILogger::ELL_INFO, itemsPerWG);
 				passed = runTest<emulatedReduction, true>(workgroupTestSource, elementCount, subgroupSizeLog2, workgroupSize, itemsPerWG) && passed;
 				logTestOutcome(passed, itemsPerWG);
 				passed = runTest<emulatedScanInclusive, true>(workgroupTestSource, elementCount, subgroupSizeLog2, workgroupSize, itemsPerWG) && passed;
 				logTestOutcome(passed, itemsPerWG);
-				//passed = runTest<emulatedScanExclusive, true>(workgroupTestSource, elementCount, subgroupSizeLog2, workgroupSize, itemsPerWG) && passed;
-				//logTestOutcome(passed, itemsPerWG);
+				passed = runTest<emulatedScanExclusive, true>(workgroupTestSource, elementCount, subgroupSizeLog2, workgroupSize, itemsPerWG) && passed;
+				logTestOutcome(passed, itemsPerWG);
 				m_api->endCapture();
-
-				// save cache every now and then	
-				//{
-				//	auto cpu = m_spirv_isa_cache->convertToCPUCache();
-				//	// Normally we'd beautifully JSON serialize the thing, allow multiple devices & drivers + metadata
-				//	auto bin = cpu->getEntries().begin()->second.bin;
-				//	IFile::success_t success;
-				//	m_spirv_isa_cache_output->write(success,bin->data(),0ull,bin->size());
-				//	if (!success)
-				//		logFail("Could not write Create SPIR-V to ISA cache to disk!");
-				//}
 			}
 		}
 
@@ -299,23 +247,6 @@ class Workgroup2ScanTestApp final : public application_templates::BasicMultiQueu
 	{
 		std::string arith_name = Arithmetic<bit_xor<float>>::name;
 
-		//smart_refctd_ptr<ICPUShader> overridenUnspecialized;
-		//if constexpr (WorkgroupTest)
-		//{
-			//overridenUnspecialized = CHLSLCompiler::createOverridenCopy(
-			//	source.get(), "#define OPERATION %s\n#define WORKGROUP_SIZE %d\n#define ITEMS_PER_WG %d\n",
-			//	(("workgroup2::") + arith_name).c_str(), workgroupSize, itemsPerWG
-			//);
-		//}
-		//else
-		//{
-		//	itemsPerWG = workgroupSize;
-		//	overridenUnspecialized = CHLSLCompiler::createOverridenCopy(
-		//		source.get(), "#define OPERATION %s\n#define WORKGROUP_SIZE %d\n",
-		//		(("subgroup::") + arith_name).c_str(), workgroupSize
-		//	);
-		//}
-
 		auto compiler = make_smart_refctd_ptr<asset::CHLSLCompiler>(smart_refctd_ptr(m_system));
 		CHLSLCompiler::SOptions options = {};
 		options.stage = IShader::E_SHADER_STAGE::ESS_COMPUTE;
@@ -357,7 +288,7 @@ class Workgroup2ScanTestApp final : public application_templates::BasicMultiQueu
 		auto pipeline = createPipeline(overriddenUnspecialized.get(),subgroupSizeLog2);
 
 		// TODO: overlap dispatches with memory readbacks (requires multiple copies of `buffers`)
-		const uint32_t workgroupCount = elementCount / itemsPerWG;	// TODO use Config::VirtualWorkgroupSize somehow
+		const uint32_t workgroupCount = elementCount / itemsPerWG;
 		cmdbuf->begin(IGPUCommandBuffer::USAGE::NONE);
 		cmdbuf->bindComputePipeline(pipeline.get());
 		cmdbuf->bindDescriptorSets(EPBP_COMPUTE, pipeline->getLayout(), 0u, 1u, &descriptorSet.get());
@@ -399,8 +330,6 @@ class Workgroup2ScanTestApp final : public application_templates::BasicMultiQueu
 		passed = validateResults<Arithmetic, multiplies<uint32_t>, WorkgroupTest>(itemsPerWG, workgroupCount) && passed;
 		passed = validateResults<Arithmetic, minimum<uint32_t>, WorkgroupTest>(itemsPerWG, workgroupCount) && passed;
 		passed = validateResults<Arithmetic, maximum<uint32_t>, WorkgroupTest>(itemsPerWG, workgroupCount) && passed;
-		//if constexpr (WorkgroupTest)
-		//	passed = validateResults<Arithmetic, ballot<uint32_t>, WorkgroupTest>(itemsPerWG, workgroupCount) && passed;
 
 		return passed;
 	}
@@ -428,27 +357,10 @@ class Workgroup2ScanTestApp final : public application_templates::BasicMultiQueu
 		// TODO: parallel for (the temporary values need to be threadlocal or what?)
 		// now check if the data obtained has valid values
 		type_t* tmp = new type_t[itemsPerWG];
-		//type_t* ballotInput = new type_t[itemsPerWG];
 		for (uint32_t workgroupID = 0u; success && workgroupID < workgroupCount; workgroupID++)
 		{
 			const auto workgroupOffset = workgroupID * itemsPerWG;
-
-			//if constexpr (WorkgroupTest)
-			//{
-			//	if constexpr (std::is_same_v<ballot<type_t>, Binop>)
-			//	{
-			//		for (auto i = 0u; i < itemsPerWG; i++)
-			//			ballotInput[i] = inputData[i + workgroupOffset] & 0x1u;
-			//		Arithmetic<Binop>::impl(tmp, ballotInput, itemsPerWG);
-			//	}
-			//	else
-					Arithmetic<Binop>::impl(tmp, inputData + workgroupOffset, itemsPerWG);
-			//}
-			//else
-			//{
-			//	for (uint32_t pseudoSubgroupID = 0u; pseudoSubgroupID < itemsPerWG; pseudoSubgroupID += subgroupSize)
-			//		Arithmetic<Binop>::impl(tmp + pseudoSubgroupID, inputData + workgroupOffset + pseudoSubgroupID, subgroupSize);
-			//}
+			Arithmetic<Binop>::impl(tmp, inputData + workgroupOffset, itemsPerWG);
 
 			for (uint32_t localInvocationIndex = 0u; localInvocationIndex < itemsPerWG; localInvocationIndex++)
 			{
@@ -467,7 +379,6 @@ class Workgroup2ScanTestApp final : public application_templates::BasicMultiQueu
 				}
 			}
 		}
-		//delete[] ballotInput;
 		delete[] tmp;
 
 		return success;
@@ -489,7 +400,7 @@ class Workgroup2ScanTestApp final : public application_templates::BasicMultiQueu
 
 	uint32_t totalFailCount = 0;
 
-	uint32_t ItemsPerInvocation = 1u;
+	uint32_t ItemsPerInvocation = 4u;
 };
 
 NBL_MAIN_FUNC(Workgroup2ScanTestApp)
\ No newline at end of file

From 474281d8d81cddc30aa289355889ead235014e98 Mon Sep 17 00:00:00 2001
From: keptsecret <sorchon@gmail.com>
Date: Thu, 1 May 2025 12:18:08 +0700
Subject: [PATCH 213/529] benchmark shader, new common header

---
 .../benchmarkWorkgroup.comp.hlsl              |  97 ++++++++++++
 .../app_resources/testWorkgroup.comp.hlsl     | 145 +-----------------
 .../app_resources/workgroupCommon.hlsl        |  71 +++++++++
 74a_Workgroup2ScanTest/main.cpp               |   5 +-
 4 files changed, 174 insertions(+), 144 deletions(-)
 create mode 100644 74a_Workgroup2ScanTest/app_resources/benchmarkWorkgroup.comp.hlsl
 create mode 100644 74a_Workgroup2ScanTest/app_resources/workgroupCommon.hlsl

diff --git a/74a_Workgroup2ScanTest/app_resources/benchmarkWorkgroup.comp.hlsl b/74a_Workgroup2ScanTest/app_resources/benchmarkWorkgroup.comp.hlsl
new file mode 100644
index 000000000..f758f6ac8
--- /dev/null
+++ b/74a_Workgroup2ScanTest/app_resources/benchmarkWorkgroup.comp.hlsl
@@ -0,0 +1,97 @@
+#pragma shader_stage(compute)
+
+#include "workgroupCommon.hlsl"
+
+template<class Config, class Binop>
+struct DataProxy
+{
+    using dtype_t = vector<uint32_t, Config::ItemsPerInvocation_0>;
+    static_assert(nbl::hlsl::is_same_v<dtype_t, type_t>);
+
+    dtype_t get(const uint32_t ix)
+    {
+        // return inputValue[ix];
+        return inputVal;
+    }
+    void set(const uint32_t ix, const dtype_t value)
+    {
+        // output[Binop::BindingIndex].template Store<type_t>(sizeof(uint32_t) + sizeof(type_t) * ix, value);
+        outputVal = value;
+    }
+
+    void workgroupExecutionAndMemoryBarrier()
+    {
+        nbl::hlsl::glsl::barrier();
+        //nbl::hlsl::glsl::memoryBarrierShared(); implied by the above
+    }
+
+    // to avoid multiple load/store in benchmark, also values not that important?
+    dtype_t inputVal;
+    dtype_t outputVal;
+};
+
+static ScratchProxy<config_t> arithmeticAccessor;
+
+template<class Binop, class device_capabilities>
+struct operation_t
+{
+    using binop_base_t = typename Binop::base_t;
+    using otype_t = typename Binop::type_t;
+
+    otype_t operator()()
+    {
+        DataProxy<config_t,Binop> dataAccessor;
+        dataAccessor.inputVal = inputValue[globalIndex()];
+        nbl::hlsl::OPERATION<config_t,binop_base_t,device_capabilities>::template __call<DataProxy<config_t,Binop>, ScratchProxy<config_t> >(dataAccessor,arithmeticAccessor);
+        // we barrier before because we alias the accessors for Binop
+        arithmeticAccessor.workgroupExecutionAndMemoryBarrier();
+        return dataAccessor.outputVal;
+    }
+};
+
+
+template<template<class> class binop, typename T, uint32_t N>
+static void subtest(NBL_CONST_REF_ARG(type_t) sourceVal)
+{
+    if (globalIndex()==0u)
+        output[binop<T>::BindingIndex].template Store<uint32_t>(0,nbl::hlsl::glsl::gl_SubgroupSize());
+
+    type_t value;
+    operation_t<binop<T>,nbl::hlsl::jit::device_capabilities> func;
+    for (uint32_t i = 0; i < NUM_LOOPS; i++)
+        value = func(); // store is done with data accessor now
+
+    output[binop<T>::BindingIndex].template Store<type_t>(sizeof(uint32_t) + sizeof(type_t) * globalIndex(), value);
+}
+
+
+type_t test()
+{
+    const type_t sourceVal = inputValue[globalIndex()];
+
+    subtest<bit_and, uint32_t, ITEMS_PER_INVOCATION>(sourceVal);
+    subtest<bit_xor, uint32_t, ITEMS_PER_INVOCATION>(sourceVal);
+    subtest<bit_or, uint32_t, ITEMS_PER_INVOCATION>(sourceVal);
+    subtest<plus, uint32_t, ITEMS_PER_INVOCATION>(sourceVal);
+    subtest<multiplies, uint32_t, ITEMS_PER_INVOCATION>(sourceVal);
+    subtest<minimum, uint32_t, ITEMS_PER_INVOCATION>(sourceVal);
+    subtest<maximum, uint32_t, ITEMS_PER_INVOCATION>(sourceVal);
+    return sourceVal;
+}
+
+
+uint32_t globalIndex()
+{
+    return nbl::hlsl::glsl::gl_WorkGroupID().x*ITEMS_PER_WG+nbl::hlsl::workgroup::SubgroupContiguousIndex();
+}
+
+bool canStore()
+{
+    return nbl::hlsl::workgroup::SubgroupContiguousIndex()<ITEMS_PER_WG;
+}
+
+[numthreads(WORKGROUP_SIZE,1,1)]
+void main()
+{
+    const type_t sourceVal = test();
+}
\ No newline at end of file
diff --git a/74a_Workgroup2ScanTest/app_resources/testWorkgroup.comp.hlsl b/74a_Workgroup2ScanTest/app_resources/testWorkgroup.comp.hlsl
index 67bb9c5f2..ac4104279 100644
--- a/74a_Workgroup2ScanTest/app_resources/testWorkgroup.comp.hlsl
+++ b/74a_Workgroup2ScanTest/app_resources/testWorkgroup.comp.hlsl
@@ -1,87 +1,6 @@
 #pragma shader_stage(compute)
 
-
-#include "nbl/builtin/hlsl/workgroup/scratch_size.hlsl"
-
-// static const uint32_t ArithmeticSz = nbl::hlsl::workgroup::scratch_size_arithmetic<ITEMS_PER_WG>::value;
-// static const uint32_t BallotSz = nbl::hlsl::workgroup::scratch_size_ballot<ITEMS_PER_WG>::value;
-// static const uint32_t ScratchSz = ArithmeticSz+BallotSz;
-
-// TODO: Can we make it a static variable in the ScratchProxy struct?
-// groupshared uint32_t ballotScratch[ScratchSz];  // TODO probably remove, not balloting
-
-
-#include "nbl/builtin/hlsl/workgroup2/arithmetic.hlsl"
-
-#include "nbl/builtin/hlsl/workgroup/broadcast.hlsl"
-
-#include "nbl/builtin/hlsl/glsl_compat/core.hlsl"
-#include "nbl/builtin/hlsl/subgroup/basic.hlsl"
-#include "nbl/builtin/hlsl/subgroup2/arithmetic_portability.hlsl"
-
-#include "nbl/builtin/hlsl/jit/device_capabilities.hlsl"
-
-#include "common.hlsl"
-
-// https://github.com/microsoft/DirectXShaderCompiler/issues/6144
-uint32_t3 nbl::hlsl::glsl::gl_WorkGroupSize() {return uint32_t3(WORKGROUP_SIZE,1,1);}
-
-// #define ITEMS_PER_INVOCATION 1
-
-#ifndef ITEMS_PER_INVOCATION
-#error "Define ITEMS_PER_INVOCATION!"
-#endif
-
-typedef vector<uint32_t, ITEMS_PER_INVOCATION> type_t;
-
-// unfortunately DXC chokes on descriptors as static members
-// https://github.com/microsoft/DirectXShaderCompiler/issues/5940
-[[vk::binding(0, 0)]] StructuredBuffer<type_t> inputValue;
-[[vk::binding(1, 0)]] RWByteAddressBuffer output[8];
-
-// because subgroups don't match `gl_LocalInvocationIndex` snake curve addressing, we also can't load inputs that way
-uint32_t globalIndex();
-// since we test ITEMS_PER_WG<WorkgroupSize we need this so workgroups don't overwrite each other's outputs
-bool canStore();
-
-// #define SUBGROUP_SIZE_LOG2 5
-
-#ifndef OPERATION
-#error "Define OPERATION!"
-#endif
-#ifndef SUBGROUP_SIZE_LOG2
-#error "Define SUBGROUP_SIZE_LOG2!"
-#endif
-
-using config_t = nbl::hlsl::workgroup2::Configuration<WORKGROUP_SIZE, SUBGROUP_SIZE_LOG2, ITEMS_PER_INVOCATION>;
-
-groupshared vector<uint32_t, config_t::ItemsPerInvocation_1> scratch[config_t::SubgroupSize];  // final (level 1) scan needs to fit in one subgroup exactly
-
-template<class Config>
-struct ScratchProxy
-{
-    using stype_t = vector<uint32_t, Config::ItemsPerInvocation_1>;
-
-    stype_t get(const uint32_t ix)
-    {
-        return scratch[ix];
-    }
-    void set(const uint32_t ix, const stype_t value)
-    {
-        scratch[ix] = value;
-    }
-
-    stype_t atomicOr(const uint32_t ix, const stype_t value)
-    {
-        return nbl::hlsl::glsl::atomicOr(scratch[ix],value);
-    }
-
-    void workgroupExecutionAndMemoryBarrier()
-    {
-        nbl::hlsl::glsl::barrier();
-        //nbl::hlsl::glsl::memoryBarrierShared(); implied by the above
-    }
-};
+#include "workgroupCommon.hlsl"
 
 template<class Config, class Binop>
 struct DataProxy
@@ -95,8 +14,6 @@ struct DataProxy
     }
     void set(const uint32_t ix, const dtype_t value)
     {
-        // inputValue[ix] = value;
-        // output[Binop::BindingIndex].template Store<type_t>(sizeof(uint32_t) + sizeof(type_t) * globalIndex(), value);
         output[Binop::BindingIndex].template Store<type_t>(sizeof(uint32_t) + sizeof(type_t) * ix, value);
     }
 
@@ -121,7 +38,6 @@ struct operation_t
         nbl::hlsl::OPERATION<config_t,binop_base_t,device_capabilities>::template __call<DataProxy<config_t,Binop>, ScratchProxy<config_t> >(dataAccessor,arithmeticAccessor);
         // we barrier before because we alias the accessors for Binop
         arithmeticAccessor.workgroupExecutionAndMemoryBarrier();
-        // return retval;
     }
 };
 
@@ -131,11 +47,9 @@ static void subtest(NBL_CONST_REF_ARG(type_t) sourceVal)
 {
     if (globalIndex()==0u)
         output[binop<T>::BindingIndex].template Store<uint32_t>(0,nbl::hlsl::glsl::gl_SubgroupSize());
-        
+
     operation_t<binop<T>,nbl::hlsl::jit::device_capabilities> func;
-    // if (canStore())
-        // output[binop<type_t>::BindingIndex].template Store<type_t>(sizeof(uint32_t)+sizeof(type_t)*globalIndex(),func());
-        func(); // store is done with data accessor now
+    func(); // store is done with data accessor now
 }
 
 
@@ -154,33 +68,6 @@ type_t test()
 }
 
 
-// template<uint16_t offset>
-// struct BallotProxy
-// {
-//     void get(const uint32_t ix, NBL_REF_ARG(uint32_t) value)
-//     {
-//         value = ballotScratch[ix+offset];
-//     }
-//     void set(const uint32_t ix, const uint32_t value)
-//     {
-//         ballotScratch[ix+offset] = value;
-//     }
-
-//     uint32_t atomicOr(const uint32_t ix, const uint32_t value)
-//     {
-//         return nbl::hlsl::glsl::atomicOr(ballotScratch[ix],value);
-//     }
-
-//     void workgroupExecutionAndMemoryBarrier()
-//     {
-//         nbl::hlsl::glsl::barrier();
-//         //nbl::hlsl::glsl::memoryBarrierShared(); implied by the above
-//     }
-// };
-
-// static BallotProxy<ArithmeticSz> ballotAccessor;
-
-
 uint32_t globalIndex()
 {
     return nbl::hlsl::glsl::gl_WorkGroupID().x*ITEMS_PER_WG+nbl::hlsl::workgroup::SubgroupContiguousIndex();
@@ -195,30 +82,4 @@ bool canStore()
 void main()
 {
     const type_t sourceVal = test();
-//     if (globalIndex()==0u)
-//         output[ballot<type_t>::BindingIndex].template Store<uint32_t>(0,nbl::hlsl::glsl::gl_SubgroupSize());
-
-//     // we can only ballot booleans, so low bit
-//     nbl::hlsl::workgroup::ballot<ScratchProxy<ArithmeticSz> >(bool(sourceVal & 0x1u), ballotAccessor);
-//     // need to barrier between ballot and usages of a ballot by myself
-//     ballotAccessor.workgroupExecutionAndMemoryBarrier();
-
-//     uint32_t destVal = 0xdeadbeefu;
-// #define CONSTEXPR_OP_TYPE_TEST(IS_OP) nbl::hlsl::is_same<nbl::hlsl::OPERATION<nbl::hlsl::bit_xor<float>,0x45>,nbl::hlsl::workgroup::IS_OP<nbl::hlsl::bit_xor<float>,0x45> >::value
-// #define BALLOT_TEMPLATE_ARGS ITEMS_PER_WG,decltype(ballotAccessor),decltype(arithmeticAccessor),nbl::hlsl::jit::device_capabilities
-//     if (CONSTEXPR_OP_TYPE_TEST(reduction))
-//         destVal = nbl::hlsl::workgroup::ballotBitCount<BALLOT_TEMPLATE_ARGS>(ballotAccessor,arithmeticAccessor);
-//     else if (CONSTEXPR_OP_TYPE_TEST(inclusive_scan))
-//         destVal = nbl::hlsl::workgroup::ballotInclusiveBitCount<BALLOT_TEMPLATE_ARGS>(ballotAccessor,arithmeticAccessor);
-//     else if (CONSTEXPR_OP_TYPE_TEST(exclusive_scan))
-//         destVal = nbl::hlsl::workgroup::ballotExclusiveBitCount<BALLOT_TEMPLATE_ARGS>(ballotAccessor,arithmeticAccessor);
-//     else
-//     {
-//         assert(false);
-//     }
-// #undef BALLOT_TEMPLATE_ARGS
-// #undef CONSTEXPR_OP_TYPE_TEST
-
-//     if (canStore())
-//         output[ballot<type_t>::BindingIndex].template Store<type_t>(sizeof(uint32_t)+sizeof(type_t)*globalIndex(),destVal);
 }
\ No newline at end of file
diff --git a/74a_Workgroup2ScanTest/app_resources/workgroupCommon.hlsl b/74a_Workgroup2ScanTest/app_resources/workgroupCommon.hlsl
new file mode 100644
index 000000000..362b48253
--- /dev/null
+++ b/74a_Workgroup2ScanTest/app_resources/workgroupCommon.hlsl
@@ -0,0 +1,71 @@
+#include "nbl/builtin/hlsl/workgroup/scratch_size.hlsl"
+
+#include "nbl/builtin/hlsl/workgroup2/arithmetic.hlsl"
+
+#include "nbl/builtin/hlsl/workgroup/broadcast.hlsl"
+
+#include "nbl/builtin/hlsl/glsl_compat/core.hlsl"
+#include "nbl/builtin/hlsl/subgroup/basic.hlsl"
+#include "nbl/builtin/hlsl/subgroup2/arithmetic_portability.hlsl"
+
+#include "nbl/builtin/hlsl/jit/device_capabilities.hlsl"
+
+#include "common.hlsl"
+
+static const uint32_t WORKGROUP_SIZE = 1u << WORKGROUP_SIZE_LOG2;
+
+// https://github.com/microsoft/DirectXShaderCompiler/issues/6144
+uint32_t3 nbl::hlsl::glsl::gl_WorkGroupSize() {return uint32_t3(WORKGROUP_SIZE,1,1);}
+
+#ifndef ITEMS_PER_INVOCATION
+#error "Define ITEMS_PER_INVOCATION!"
+#endif
+
+typedef vector<uint32_t, ITEMS_PER_INVOCATION> type_t;
+
+// unfortunately DXC chokes on descriptors as static members
+// https://github.com/microsoft/DirectXShaderCompiler/issues/5940
+[[vk::binding(0, 0)]] StructuredBuffer<type_t> inputValue;
+[[vk::binding(1, 0)]] RWByteAddressBuffer output[8];
+
+// because subgroups don't match `gl_LocalInvocationIndex` snake curve addressing, we also can't load inputs that way
+uint32_t globalIndex();
+// since we test ITEMS_PER_WG<WorkgroupSize we need this so workgroups don't overwrite each other's outputs
+bool canStore();
+
+#ifndef OPERATION
+#error "Define OPERATION!"
+#endif
+#ifndef SUBGROUP_SIZE_LOG2
+#error "Define SUBGROUP_SIZE_LOG2!"
+#endif
+
+using config_t = nbl::hlsl::workgroup2::Configuration<WORKGROUP_SIZE_LOG2, SUBGROUP_SIZE_LOG2, ITEMS_PER_INVOCATION>;
+
+groupshared vector<uint32_t, config_t::ItemsPerInvocation_1> scratch[config_t::SubgroupSize];  // final (level 1) scan needs to fit in one subgroup exactly
+
+template<class Config>
+struct ScratchProxy
+{
+    using stype_t = vector<uint32_t, Config::ItemsPerInvocation_1>;
+
+    stype_t get(const uint32_t ix)
+    {
+        return scratch[ix];
+    }
+    void set(const uint32_t ix, const stype_t value)
+    {
+        scratch[ix] = value;
+    }
+
+    stype_t atomicOr(const uint32_t ix, const stype_t value)
+    {
+        return nbl::hlsl::glsl::atomicOr(scratch[ix],value);
+    }
+
+    void workgroupExecutionAndMemoryBarrier()
+    {
+        nbl::hlsl::glsl::barrier();
+        //nbl::hlsl::glsl::memoryBarrierShared(); implied by the above
+    }
+};
diff --git a/74a_Workgroup2ScanTest/main.cpp b/74a_Workgroup2ScanTest/main.cpp
index f0064a4c0..c5e8370be 100644
--- a/74a_Workgroup2ScanTest/main.cpp
+++ b/74a_Workgroup2ScanTest/main.cpp
@@ -246,6 +246,7 @@ class Workgroup2ScanTestApp final : public application_templates::BasicMultiQueu
 	bool runTest(const smart_refctd_ptr<const ICPUShader>& source, const uint32_t elementCount, const uint8_t subgroupSizeLog2, const uint32_t workgroupSize, uint32_t itemsPerWG = ~0u)
 	{
 		std::string arith_name = Arithmetic<bit_xor<float>>::name;
+		const uint32_t workgroupSizeLog2 = hlsl::findMSB(workgroupSize);
 
 		auto compiler = make_smart_refctd_ptr<asset::CHLSLCompiler>(smart_refctd_ptr(m_system));
 		CHLSLCompiler::SOptions options = {};
@@ -268,7 +269,7 @@ class Workgroup2ScanTestApp final : public application_templates::BasicMultiQueu
 
 		const std::string definitions[5] = {
 			"workgroup2::" + arith_name,
-			std::to_string(workgroupSize),
+			std::to_string(workgroupSizeLog2),
 			std::to_string(itemsPerWG),
 			std::to_string(ItemsPerInvocation),
 			std::to_string(subgroupSizeLog2)
@@ -276,7 +277,7 @@ class Workgroup2ScanTestApp final : public application_templates::BasicMultiQueu
 
 		const IShaderCompiler::SMacroDefinition defines[5] = {
 			{ "OPERATION", definitions[0] },
-			{ "WORKGROUP_SIZE", definitions[1] },
+			{ "WORKGROUP_SIZE_LOG2", definitions[1] },
 			{ "ITEMS_PER_WG", definitions[2] },
 			{ "ITEMS_PER_INVOCATION", definitions[3] },
 			{ "SUBGROUP_SIZE_LOG2", definitions[4] }

From 3e1cd3e5496b3fdd56b6ef6f628a332c1e0599b0 Mon Sep 17 00:00:00 2001
From: Erfan Ahmadi <ahmadierfan99@gmail.com>
Date: Thu, 1 May 2025 14:03:16 +0400
Subject: [PATCH 214/529] Fix and separate custom clip projection

---
 62_CAD/DrawResourcesFiller.cpp                | 88 ++++++++++++++-----
 62_CAD/DrawResourcesFiller.h                  | 67 +++++++++-----
 62_CAD/main.cpp                               | 65 +++++++++-----
 62_CAD/shaders/globals.hlsl                   | 43 +++++----
 .../shaders/main_pipeline/vertex_shader.hlsl  | 38 ++++++--
 5 files changed, 206 insertions(+), 95 deletions(-)

diff --git a/62_CAD/DrawResourcesFiller.cpp b/62_CAD/DrawResourcesFiller.cpp
index d12837691..759db16f3 100644
--- a/62_CAD/DrawResourcesFiller.cpp
+++ b/62_CAD/DrawResourcesFiller.cpp
@@ -376,19 +376,33 @@ void DrawResourcesFiller::endMainObject()
 	activeMainObjectIndex = InvalidMainObjectIdx;
 }
 
-void DrawResourcesFiller::pushClipProjectionData(const ClipProjectionData& clipProjectionData)
+void DrawResourcesFiller::pushCustomProjection(const float64_t3x3& projection)
 {
-	activeClipProjections.push_back(clipProjectionData);
-	activeClipProjectionIndices.push_back(InvalidClipProjectionIndex);
+	activeProjections.push_back(projection);
+	activeProjectionIndices.push_back(InvalidCustomProjectionIndex);
 }
 
-void DrawResourcesFiller::popClipProjectionData()
+void DrawResourcesFiller::popCustomProjection()
 {
-	if (activeClipProjections.empty())
+	if (activeProjections.empty())
 		return;
 
-	activeClipProjections.pop_back();
-	activeClipProjectionIndices.pop_back();
+	activeProjections.pop_back();
+	activeProjectionIndices.pop_back();
+}
+
+void DrawResourcesFiller::pushCustomClipRect(const WorldClipRect& clipRect)
+{
+	activeClipRects.push_back(clipRect);
+	activeClipRectIndices.push_back(InvalidCustomClipRectIndex);
+}
+
+void DrawResourcesFiller::popCustomClipRect()
+{	if (activeClipRects.empty())
+		return;
+
+	activeClipRects.pop_back();
+	activeClipRectIndices.pop_back();
 }
 
 bool DrawResourcesFiller::finalizeBufferCopies(SIntendedSubmitInfo& intendedNextSubmit)
@@ -437,7 +451,8 @@ bool DrawResourcesFiller::finalizeBufferCopies(SIntendedSubmitInfo& intendedNext
 
 	copyCPUFilledDrawBuffer(resourcesCollection.lineStyles);
 	copyCPUFilledDrawBuffer(resourcesCollection.dtmSettings);
-	copyCPUFilledDrawBuffer(resourcesCollection.clipProjections);
+	copyCPUFilledDrawBuffer(resourcesCollection.customProjections);
+	copyCPUFilledDrawBuffer(resourcesCollection.customClipRects);
 	copyCPUFilledDrawBuffer(resourcesCollection.mainObjects);
 	copyCPUFilledDrawBuffer(resourcesCollection.drawObjects);
 	copyCPUFilledDrawBuffer(resourcesCollection.indexBuffer);
@@ -703,15 +718,26 @@ uint32_t DrawResourcesFiller::acquireActiveDTMSettingsIndex_SubmitIfNeeded(SInte
 	return activeDTMSettingsIndex;
 }
 
-uint32_t DrawResourcesFiller::acquireActiveClipProjectionIndex_SubmitIfNeeded(SIntendedSubmitInfo& intendedNextSubmit)
+uint32_t DrawResourcesFiller::acquireActiveCustomProjectionIndex_SubmitIfNeeded(SIntendedSubmitInfo& intendedNextSubmit)
 {
-	if (activeClipProjectionIndices.empty())
-		return InvalidClipProjectionIndex;
+	if (activeProjectionIndices.empty())
+		return InvalidCustomProjectionIndex;
 
-	if (activeClipProjectionIndices.back() == InvalidClipProjectionIndex)
-		activeClipProjectionIndices.back() = addClipProjectionData_SubmitIfNeeded(activeClipProjections.back(), intendedNextSubmit);
+	if (activeProjectionIndices.back() == InvalidCustomProjectionIndex)
+		activeProjectionIndices.back() = addCustomProjection_SubmitIfNeeded(activeProjections.back(), intendedNextSubmit);
 	
-	return activeClipProjectionIndices.back();
+	return activeProjectionIndices.back();
+}
+
+uint32_t DrawResourcesFiller::acquireActiveCustomClipRectIndex_SubmitIfNeeded(SIntendedSubmitInfo& intendedNextSubmit)
+{
+	if (activeClipRectIndices.empty())
+		return InvalidCustomClipRectIndex;
+
+	if (activeClipRectIndices.back() == InvalidCustomClipRectIndex)
+		activeClipRectIndices.back() = addCustomClipRect_SubmitIfNeeded(activeClipRects.back(), intendedNextSubmit);
+	
+	return activeClipRectIndices.back();
 }
 
 uint32_t DrawResourcesFiller::acquireActiveMainObjectIndex_SubmitIfNeeded(SIntendedSubmitInfo& intendedNextSubmit)
@@ -729,14 +755,16 @@ uint32_t DrawResourcesFiller::acquireActiveMainObjectIndex_SubmitIfNeeded(SInten
 		(activeMainObjectType == MainObjectType::HATCH) ||
 		(activeMainObjectType == MainObjectType::TEXT);
 	const bool needsDTMSettings = (activeMainObjectType == MainObjectType::DTM);
-	const bool needsCustomClipProjection = (!activeClipProjectionIndices.empty());
+	const bool needsCustomProjection = (!activeProjectionIndices.empty());
+	const bool needsCustomClipRect = (!activeClipRectIndices.empty());
 
 	const size_t remainingResourcesSize = calculateRemainingResourcesSize();
 	// making sure MainObject and everything it references fits into remaining resources mem
 	size_t memRequired = sizeof(MainObject);
 	if (needsLineStyle) memRequired += sizeof(LineStyle);
 	if (needsDTMSettings) memRequired += sizeof(DTMSettings);
-	if (needsCustomClipProjection) memRequired += sizeof(ClipProjectionData);
+	if (needsCustomProjection) memRequired += sizeof(float64_t3x3);
+	if (needsCustomClipRect) memRequired += sizeof(WorldClipRect);
 
 	const bool enoughMem = remainingResourcesSize >= memRequired; // enough remaining memory for 1 more dtm settings with 2 referenced line styles?
 	const bool needToOverflowSubmit = (!enoughMem) || (resourcesCollection.mainObjects.vector.size() >= MaxIndexableMainObjects);
@@ -754,7 +782,8 @@ uint32_t DrawResourcesFiller::acquireActiveMainObjectIndex_SubmitIfNeeded(SInten
 	// if something here triggers a auto-submit it's a possible bug with calculating `memRequired` above, TODO: assert that somehow?
 	mainObject.styleIdx = (needsLineStyle) ? acquireActiveLineStyleIndex_SubmitIfNeeded(intendedNextSubmit) : InvalidStyleIdx;
 	mainObject.dtmSettingsIdx = (needsDTMSettings) ? acquireActiveDTMSettingsIndex_SubmitIfNeeded(intendedNextSubmit) : InvalidDTMSettingsIdx;
-	mainObject.clipProjectionIndex = (needsCustomClipProjection) ? acquireActiveClipProjectionIndex_SubmitIfNeeded(intendedNextSubmit) : InvalidClipProjectionIndex;
+	mainObject.customProjectionIndex = (needsCustomProjection) ? acquireActiveCustomProjectionIndex_SubmitIfNeeded(intendedNextSubmit) : InvalidCustomProjectionIndex;
+	mainObject.customClipRectIndex = (needsCustomClipRect) ? acquireActiveCustomClipRectIndex_SubmitIfNeeded(intendedNextSubmit) : InvalidCustomClipRectIndex;
 	activeMainObjectIndex = resourcesCollection.mainObjects.addAndGetOffset(mainObject);
 	return activeMainObjectIndex;
 }
@@ -793,10 +822,27 @@ uint32_t DrawResourcesFiller::addDTMSettings_SubmitIfNeeded(const DTMSettingsInf
 	return outDTMSettingIdx;
 }
 
-uint32_t DrawResourcesFiller::addClipProjectionData_SubmitIfNeeded(const ClipProjectionData& clipProjectionData, SIntendedSubmitInfo& intendedNextSubmit)
+uint32_t DrawResourcesFiller::addCustomProjection_SubmitIfNeeded(const float64_t3x3& projection, SIntendedSubmitInfo& intendedNextSubmit)
+{
+	const size_t remainingResourcesSize = calculateRemainingResourcesSize();
+	const size_t memRequired = sizeof(float64_t3x3);
+	const bool enoughMem = remainingResourcesSize >= memRequired; // enough remaining memory for 1 more dtm settings with 2 referenced line styles?
+
+	if (!enoughMem)
+	{
+		finalizeAllCopiesToGPU(intendedNextSubmit);
+		submitDraws(intendedNextSubmit);
+		reset(); // resets everything! be careful!
+	}
+	
+	resourcesCollection.customProjections.vector.push_back(projection); // this will implicitly increase total resource consumption and reduce remaining size --> no need for mem size trackers
+	return resourcesCollection.customProjections.vector.size() - 1u;
+}
+
+uint32_t DrawResourcesFiller::addCustomClipRect_SubmitIfNeeded(const WorldClipRect& clipRect, SIntendedSubmitInfo& intendedNextSubmit)
 {
 	const size_t remainingResourcesSize = calculateRemainingResourcesSize();
-	const size_t memRequired = sizeof(ClipProjectionData);
+	const size_t memRequired = sizeof(WorldClipRect);
 	const bool enoughMem = remainingResourcesSize >= memRequired; // enough remaining memory for 1 more dtm settings with 2 referenced line styles?
 
 	if (!enoughMem)
@@ -806,8 +852,8 @@ uint32_t DrawResourcesFiller::addClipProjectionData_SubmitIfNeeded(const ClipPro
 		reset(); // resets everything! be careful!
 	}
 	
-	resourcesCollection.clipProjections.vector.push_back(clipProjectionData); // this will implicitly increase total resource consumption and reduce remaining size --> no need for mem size trackers
-	return resourcesCollection.clipProjections.vector.size() - 1u;
+	resourcesCollection.customClipRects.vector.push_back(clipRect); // this will implicitly increase total resource consumption and reduce remaining size --> no need for mem size trackers
+	return resourcesCollection.customClipRects.vector.size() - 1u;
 }
 
 void DrawResourcesFiller::addPolylineObjects_Internal(const CPolylineBase& polyline, const CPolylineBase::SectionInfo& section, uint32_t& currentObjectInSection, uint32_t mainObjIdx)
diff --git a/62_CAD/DrawResourcesFiller.h b/62_CAD/DrawResourcesFiller.h
index 1e244ae01..b92685959 100644
--- a/62_CAD/DrawResourcesFiller.h
+++ b/62_CAD/DrawResourcesFiller.h
@@ -14,9 +14,8 @@ using namespace nbl::asset;
 using namespace nbl::ext::TextRendering;
 
 static_assert(sizeof(DrawObject) == 16u);
-static_assert(sizeof(MainObject) == 12u);
+static_assert(sizeof(MainObject) == 16u);
 static_assert(sizeof(LineStyle) == 88u);
-static_assert(sizeof(ClipProjectionData) == 88u);
 
 // ! DrawResourcesFiller
 // ! This class provides important functionality to manage resources needed for a draw.
@@ -92,7 +91,8 @@ struct DrawResourcesFiller
 		// auto-submission level 0 resources (settings that mainObj references)
 		CPUGeneratedResource<LineStyle> lineStyles;
 		CPUGeneratedResource<DTMSettings> dtmSettings;
-		CPUGeneratedResource<ClipProjectionData> clipProjections;
+		CPUGeneratedResource<float64_t3x3> customProjections;
+		CPUGeneratedResource<WorldClipRect> customClipRects;
 	
 		// auto-submission level 1 buffers (mainObj that drawObjs references, if all drawObjs+idxBuffer+geometryInfo doesn't fit into mem this will be broken down into many)
 		CPUGeneratedResource<MainObject> mainObjects;
@@ -109,7 +109,8 @@ struct DrawResourcesFiller
 			return
 				lineStyles.getAlignedStorageSize() +
 				dtmSettings.getAlignedStorageSize() +
-				clipProjections.getAlignedStorageSize() +
+				customProjections.getAlignedStorageSize() +
+				customClipRects.getAlignedStorageSize() +
 				mainObjects.getAlignedStorageSize() +
 				drawObjects.getAlignedStorageSize() +
 				indexBuffer.getAlignedStorageSize() +
@@ -129,7 +130,7 @@ struct DrawResourcesFiller
 	{
 		// for auto-submission to work correctly, memory needs to serve at least 2 linestyle, 1 dtm settings, 1 clip proj, 1 main obj, 1 draw obj and 512 bytes of additional mem for geometries and index buffer
 		// this is the ABSOLUTE MINIMUM (if this value is used rendering will probably be as slow as CPU drawing :D)
-		return core::alignUp(sizeof(LineStyle) + sizeof(LineStyle) * DTMSettings::MaxContourSettings + sizeof(DTMSettings) + sizeof(ClipProjectionData) + sizeof(MainObject) + sizeof(DrawObject) + 512ull, ResourcesMaxNaturalAlignment);
+		return core::alignUp(sizeof(LineStyle) + sizeof(LineStyle) * DTMSettings::MaxContourSettings + sizeof(DTMSettings) + sizeof(WorldClipRect) + sizeof(float64_t3x3) + sizeof(MainObject) + sizeof(DrawObject) + 512ull, ResourcesMaxNaturalAlignment);
 	}
 
 	void allocateResourcesBuffer(ILogicalDevice* logicalDevice, size_t size);
@@ -207,7 +208,8 @@ struct DrawResourcesFiller
 	{
 		resetDrawObjects();
 		resetMainObjects();
-		resetCustomClipProjections();
+		resetCustomProjections();
+		resetCustomClipRects();
 		resetLineStyles();
 		resetDTMSettings();
 
@@ -231,10 +233,14 @@ struct DrawResourcesFiller
 	void beginMainObject(MainObjectType type);
 	void endMainObject();
 
-	void pushClipProjectionData(const ClipProjectionData& clipProjectionData);
-	void popClipProjectionData();
+	void pushCustomProjection(const float64_t3x3& projection);
+	void popCustomProjection();
+	
+	void pushCustomClipRect(const WorldClipRect& clipRect);
+	void popCustomClipRect();
 
-	const std::deque<ClipProjectionData>& getClipProjectionStack() const { return activeClipProjections; }
+	const std::deque<float64_t3x3>& getProjectionStack() const { return activeProjections; }
+	const std::deque<WorldClipRect>& getClipRectsStack() const { return activeClipRects; }
 
 	smart_refctd_ptr<IGPUImageView> getMSDFsTextureArray() { return msdfTextureArray; }
 
@@ -317,9 +323,13 @@ struct DrawResourcesFiller
 	// If it's been invalidated then it will request to add to resources again ( auto-submission happens If there is not enough memory to add again)
 	uint32_t acquireActiveDTMSettingsIndex_SubmitIfNeeded(SIntendedSubmitInfo& intendedNextSubmit);
 
-	// Gets resource index to the active clip projection data from the top of stack 
+	// Gets resource index to the active projection data from the top of stack 
+	// If it's been invalidated then it will request to add to resources again ( auto-submission happens If there is not enough memory to add again)
+	uint32_t acquireActiveCustomProjectionIndex_SubmitIfNeeded(SIntendedSubmitInfo& intendedNextSubmit);
+	
+	// Gets resource index to the active clip data from the top of stack 
 	// If it's been invalidated then it will request to add to resources again ( auto-submission happens If there is not enough memory to add again)
-	uint32_t acquireActiveClipProjectionIndex_SubmitIfNeeded(SIntendedSubmitInfo& intendedNextSubmit);
+	uint32_t acquireActiveCustomClipRectIndex_SubmitIfNeeded(SIntendedSubmitInfo& intendedNextSubmit);
 	
 	// Gets resource index to the active main object data
 	// If it's been invalidated then it will request to add to resources again ( auto-submission happens If there is not enough memory to add again)
@@ -331,8 +341,11 @@ struct DrawResourcesFiller
 	/// Attempts to add dtmSettings to resources. If it fails to do, due to resource limitations, auto-submits and tries again. 
 	uint32_t addDTMSettings_SubmitIfNeeded(const DTMSettingsInfo& dtmSettings, SIntendedSubmitInfo& intendedNextSubmit);
 	
-	/// Attempts to add clipProjection to resources. If it fails to do, due to resource limitations, auto-submits and tries again. 
-	uint32_t addClipProjectionData_SubmitIfNeeded(const ClipProjectionData& clipProjectionData, SIntendedSubmitInfo& intendedNextSubmit);
+	/// Attempts to add custom projection to gpu resources. If it fails to do, due to resource limitations, auto-submits and tries again. 
+	uint32_t addCustomProjection_SubmitIfNeeded(const float64_t3x3& projection, SIntendedSubmitInfo& intendedNextSubmit);
+	
+	/// Attempts to add custom clip to gpu resources. If it fails to do, due to resource limitations, auto-submits and tries again. 
+	uint32_t addCustomClipRect_SubmitIfNeeded(const WorldClipRect& clipRect, SIntendedSubmitInfo& intendedNextSubmit);
 	
 	/// returns index to added LineStyleInfo, returns Invalid index if it exceeds resource limitations
 	uint32_t addLineStyle_Internal(const LineStyleInfo& lineStyleInfo);
@@ -372,13 +385,22 @@ struct DrawResourcesFiller
 		resourcesCollection.geometryInfo.vector.clear();
 	}
 
-	void resetCustomClipProjections()
+	void resetCustomProjections()
 	{
-		resourcesCollection.clipProjections.vector.clear();
+		resourcesCollection.customProjections.vector.clear();
 		
-		// Invalidate all the clip projection addresses because activeClipProjections buffer got reset
-		for (auto& clipProjAddr : activeClipProjectionIndices)
-			clipProjAddr = InvalidClipProjectionIndex;
+		// Invalidate all the clip projection addresses because activeProjections buffer got reset
+		for (auto& addr : activeProjectionIndices)
+			addr = InvalidCustomProjectionIndex;
+	}
+
+	void resetCustomClipRects()
+	{
+		resourcesCollection.customClipRects.vector.clear();
+		
+		// Invalidate all the clip projection addresses because activeProjections buffer got reset
+		for (auto& addr : activeClipRectIndices)
+			addr = InvalidCustomClipRectIndex;
 	}
 
 	void resetLineStyles()
@@ -502,9 +524,12 @@ struct DrawResourcesFiller
 	MainObjectType activeMainObjectType;
 	uint32_t activeMainObjectIndex = InvalidMainObjectIdx;
 
-	// The ClipProjections are stack, because user can push/pop ClipProjections in any order
-	std::deque<ClipProjectionData> activeClipProjections; // stack of clip projections stored so we can resubmit them if geometry buffer got reset.
-	std::deque<uint32_t> activeClipProjectionIndices; // stack of clip projection gpu addresses in geometry buffer. to keep track of them in push/pops
+	// The ClipRects & Projections are stack, because user can push/pop ClipRects & Projections in any order
+	std::deque<float64_t3x3> activeProjections; // stack of projections stored so we can resubmit them if geometry buffer got reset.
+	std::deque<uint32_t> activeProjectionIndices; // stack of projection gpu addresses in geometry buffer. to keep track of them in push/pops
+	
+	std::deque<WorldClipRect> activeClipRects; // stack of clips stored so we can resubmit them if geometry buffer got reset.
+	std::deque<uint32_t> activeClipRectIndices; // stack of clips gpu addresses in geometry buffer. to keep track of them in push/pops
 
 	// MSDF
 	GetGlyphMSDFTextureFunc getGlyphMSDF;
diff --git a/62_CAD/main.cpp b/62_CAD/main.cpp
index e901d07c3..755a649b2 100644
--- a/62_CAD/main.cpp
+++ b/62_CAD/main.cpp
@@ -77,7 +77,7 @@ constexpr std::array<float, (uint32_t)ExampleMode::CASE_COUNT> cameraExtents =
 	10.0	// CASE_BUG
 };
 
-constexpr ExampleMode mode = ExampleMode::CASE_BUG;
+constexpr ExampleMode mode = ExampleMode::CASE_6;
 
 class Camera2D
 {
@@ -1208,17 +1208,16 @@ class ComputerAidedDesign final : public examples::SimpleWindowedApplication, pu
 		globalData.pointers = {
 			.lineStyles				= baseAddress + resources.lineStyles.bufferOffset,
 			.dtmSettings			= baseAddress + resources.dtmSettings.bufferOffset,
-			.customClipProjections	= baseAddress + resources.clipProjections.bufferOffset,
+			.customProjections		= baseAddress + resources.customProjections.bufferOffset,
+			.customClipRects		= baseAddress + resources.customClipRects.bufferOffset,
 			.mainObjects			= baseAddress + resources.mainObjects.bufferOffset,
 			.drawObjects			= baseAddress + resources.drawObjects.bufferOffset,
 			.geometryBuffer			= baseAddress + resources.geometryInfo.bufferOffset,
 		};
 		globalData.antiAliasingFactor = 1.0;// +abs(cos(m_timeElapsed * 0.0008)) * 20.0f;
 		globalData.resolution = uint32_t2{ m_window->getWidth(), m_window->getHeight() };
-		globalData.defaultClipProjection.projectionToNDC = projectionToNDC;
-		globalData.defaultClipProjection.minClipNDC = float32_t2(-1.0, -1.0);
-		globalData.defaultClipProjection.maxClipNDC = float32_t2(+1.0, +1.0);
-		float screenToWorld = getScreenToWorldRatio(globalData.defaultClipProjection.projectionToNDC, globalData.resolution);
+		globalData.defaultProjectionToNDC = projectionToNDC;
+		float screenToWorld = getScreenToWorldRatio(globalData.defaultProjectionToNDC, globalData.resolution);
 		globalData.screenToWorldRatio = screenToWorld;
 		globalData.worldToScreenRatio = (1.0f/screenToWorld);
 		globalData.miterLimit = 10.0f;
@@ -2715,16 +2714,19 @@ class ComputerAidedDesign final : public examples::SimpleWindowedApplication, pu
 		}
 		else if (mode == ExampleMode::CASE_6)
 		{
+			float64_t3x3 customProjection = float64_t3x3{
+				1.0, 0.0, cos(m_timeElapsed * 0.0005) * 100.0,
+				0.0, 1.0, 0.0,
+				0.0, 0.0, 1.0
+			};
+
 			// left half of screen should be red and right half should be green
-			const auto& cameraProj = m_Camera.constructViewProjection();
-			ClipProjectionData showLeft = {};
-			showLeft.projectionToNDC = cameraProj;
-			showLeft.minClipNDC = float32_t2(-1.0, -1.0);
-			showLeft.maxClipNDC = float32_t2(0.0, +1.0);
-			ClipProjectionData showRight = {};
-			showRight.projectionToNDC = cameraProj;
-			showRight.minClipNDC = float32_t2(0.0, -1.0);
-			showRight.maxClipNDC = float32_t2(+1.0, +1.0);
+			WorldClipRect showLeft = {};
+			showLeft.minClip  = float64_t2(-100.0, -1000.0);
+			showLeft.maxClip  = float64_t2(0.0, +1000.0);
+			WorldClipRect showRight = {};
+			showRight.minClip = float64_t2(0.0, -1000.0);
+			showRight.maxClip = float64_t2(100.0, +1000.0);
 
 			LineStyleInfo leftLineStyle = {};
 			leftLineStyle.screenSpaceLineWidth = 3.0f;
@@ -2779,35 +2781,37 @@ class ComputerAidedDesign final : public examples::SimpleWindowedApplication, pu
 			}
 
 			// we do redundant and nested push/pops to test
-			drawResourcesFiller.pushClipProjectionData(showLeft);
+			drawResourcesFiller.pushCustomClipRect(showLeft);
 			{
 				drawResourcesFiller.drawPolyline(polyline1, leftLineStyle, intendedNextSubmit);
 
-				drawResourcesFiller.pushClipProjectionData(showRight);
+				drawResourcesFiller.pushCustomClipRect(showRight);
+				drawResourcesFiller.pushCustomProjection(customProjection);
 				{
 					drawResourcesFiller.drawPolyline(polyline1, rightLineStyle, intendedNextSubmit);
 					drawResourcesFiller.drawPolyline(polyline2, rightLineStyle, intendedNextSubmit);
 				}
-				drawResourcesFiller.popClipProjectionData();
+				drawResourcesFiller.popCustomProjection();
+				drawResourcesFiller.popCustomClipRect();
 				
 				drawResourcesFiller.drawPolyline(polyline2, leftLineStyle, intendedNextSubmit);
 
-				drawResourcesFiller.pushClipProjectionData(showRight);
+				drawResourcesFiller.pushCustomClipRect(showRight);
 				{
 					drawResourcesFiller.drawPolyline(polyline3, rightLineStyle, intendedNextSubmit);
 					drawResourcesFiller.drawPolyline(polyline2, rightLineStyle, intendedNextSubmit);
 					
-					drawResourcesFiller.pushClipProjectionData(showLeft);
+					drawResourcesFiller.pushCustomClipRect(showLeft);
 					{
 					drawResourcesFiller.drawPolyline(polyline1, leftLineStyle, intendedNextSubmit);
 					}
-					drawResourcesFiller.popClipProjectionData();
+					drawResourcesFiller.popCustomClipRect();
 				}
-				drawResourcesFiller.popClipProjectionData();
+				drawResourcesFiller.popCustomClipRect();
 
 				drawResourcesFiller.drawPolyline(polyline2, leftLineStyle, intendedNextSubmit);
 			}
-			drawResourcesFiller.popClipProjectionData();
+			drawResourcesFiller.popCustomClipRect();
 			
 		}
 		else if (mode == ExampleMode::CASE_7)
@@ -3362,6 +3366,21 @@ class ComputerAidedDesign final : public examples::SimpleWindowedApplication, pu
 				drawResourcesFiller.drawPolyline(polyline, style, intendedNextSubmit);
 				polyline.clearEverything();
 			}
+			
+			float64_t2 line0[2u] = 
+			{
+				float64_t2(-1.0, 0.0),
+				float64_t2(+1.0, 0.0),
+			};
+			float64_t2 line1[2u] = 
+			{
+				float64_t2(0.0, -1.0),
+				float64_t2(0.0, +1.0),
+			};
+
+			polyline.addLinePoints(line0);
+			polyline.addLinePoints(line1);
+			drawResourcesFiller.drawPolyline(polyline, style, intendedNextSubmit);
 		}
 
 		drawResourcesFiller.finalizeAllCopiesToGPU(intendedNextSubmit);
diff --git a/62_CAD/shaders/globals.hlsl b/62_CAD/shaders/globals.hlsl
index d12c80bef..d3a4968bb 100644
--- a/62_CAD/shaders/globals.hlsl
+++ b/62_CAD/shaders/globals.hlsl
@@ -38,39 +38,30 @@ struct PushConstants
     uint32_t isDTMRendering;
 };
 
-// TODO: Compute this in a compute shader from the world counterparts
-//      because this struct includes NDC coordinates, the values will change based camera zoom and move
-//      of course we could have the clip values to be in world units and also the matrix to transform to world instead of ndc but that requires extra computations(matrix multiplications) per vertex
-struct ClipProjectionData
-{
-    pfloat64_t3x3 projectionToNDC; // 72 -> because we use scalar_layout
-    float32_t2 minClipNDC; // 80
-    float32_t2 maxClipNDC; // 88
+struct WorldClipRect
+{
+    pfloat64_t2 minClip; // min clip of a rect in worldspace coordinates of the original space (globals.defaultProjectionToNDC)
+    pfloat64_t2 maxClip; // max clip of a rect in worldspace coordinates of the original space (globals.defaultProjectionToNDC)
 };
 
-#ifndef __HLSL_VERSION
-static_assert(offsetof(ClipProjectionData, projectionToNDC) == 0u);
-static_assert(offsetof(ClipProjectionData, minClipNDC) == 72u);
-static_assert(offsetof(ClipProjectionData, maxClipNDC) == 80u);
-#endif
-
 struct Pointers
 {
     uint64_t lineStyles;
     uint64_t dtmSettings;
-    uint64_t customClipProjections;
+    uint64_t customProjections;
+    uint64_t customClipRects;
     uint64_t mainObjects;
     uint64_t drawObjects;
     uint64_t geometryBuffer;
 };
 #ifndef __HLSL_VERSION
-static_assert(sizeof(Pointers) == 48u);
+static_assert(sizeof(Pointers) == 56u);
 #endif
 
 struct Globals
 {
     Pointers pointers;
-    ClipProjectionData defaultClipProjection;
+    pfloat64_t3x3 defaultProjectionToNDC;
     float screenToWorldRatio;
     float worldToScreenRatio;
     uint32_t2 resolution;
@@ -80,7 +71,7 @@ struct Globals
     float32_t _padding;
 };
 #ifndef __HLSL_VERSION
-static_assert(sizeof(Globals) == 168u);
+static_assert(sizeof(Globals) == 160u);
 #endif
 
 #ifdef __HLSL_VERSION
@@ -143,11 +134,14 @@ enum class MajorAxis : uint32_t
 };
 
 // Consists of multiple DrawObjects
+// [IDEA]: In GPU-driven rendering, to save mem for MainObject data fetching: many of these can be shared amongst different main objects, we could find these styles, settings, etc indices with upper_bound
+// [TODO]: pack indices and members of mainObject and DrawObject + enforce max size for autosubmit --> but do it only after the mainobject definition is finalized in gpu-driven rendering work
 struct MainObject
 {
     uint32_t styleIdx;
     uint32_t dtmSettingsIdx;
-    uint32_t clipProjectionIndex;
+    uint32_t customProjectionIndex;
+    uint32_t customClipRectIndex;
 };
 
 struct DrawObject
@@ -496,7 +490,8 @@ NBL_CONSTEXPR uint32_t MaxIndexableMainObjects = (1u << MainObjectIdxBits) - 1u;
 NBL_CONSTEXPR uint32_t InvalidStyleIdx = nbl::hlsl::numeric_limits<uint32_t>::max;
 NBL_CONSTEXPR uint32_t InvalidDTMSettingsIdx = nbl::hlsl::numeric_limits<uint32_t>::max;
 NBL_CONSTEXPR uint32_t InvalidMainObjectIdx = MaxIndexableMainObjects;
-NBL_CONSTEXPR uint32_t InvalidClipProjectionIndex = nbl::hlsl::numeric_limits<uint32_t>::max;
+NBL_CONSTEXPR uint32_t InvalidCustomProjectionIndex = nbl::hlsl::numeric_limits<uint32_t>::max;
+NBL_CONSTEXPR uint32_t InvalidCustomClipRectIndex = nbl::hlsl::numeric_limits<uint32_t>::max;
 NBL_CONSTEXPR uint32_t InvalidTextureIdx = nbl::hlsl::numeric_limits<uint32_t>::max;
 
 // Hatches
@@ -521,9 +516,13 @@ DTMSettings loadDTMSettings(const uint32_t index)
 {
     return vk::RawBufferLoad<DTMSettings>(globals.pointers.dtmSettings + index * sizeof(DTMSettings), 8u);
 }
-ClipProjectionData loadCustomClipProjection(const uint32_t index)
+pfloat64_t3x3 loadCustomProjection(const uint32_t index)
+{
+    return vk::RawBufferLoad<pfloat64_t3x3>(globals.pointers.customProjections + index * sizeof(pfloat64_t3x3), 8u);
+}
+WorldClipRect loadCustomClipRect(const uint32_t index)
 {
-    return vk::RawBufferLoad<ClipProjectionData>(globals.pointers.customClipProjections + index * sizeof(ClipProjectionData), 8u);
+    return vk::RawBufferLoad<WorldClipRect>(globals.pointers.customClipRects + index * sizeof(WorldClipRect), 8u);
 }
 MainObject loadMainObject(const uint32_t index)
 {
diff --git a/62_CAD/shaders/main_pipeline/vertex_shader.hlsl b/62_CAD/shaders/main_pipeline/vertex_shader.hlsl
index 9d4a384a1..66101410e 100644
--- a/62_CAD/shaders/main_pipeline/vertex_shader.hlsl
+++ b/62_CAD/shaders/main_pipeline/vertex_shader.hlsl
@@ -23,17 +23,39 @@ float2 QuadraticBezier(float2 p0, float2 p1, float2 p2, float t)
     return shapes::QuadraticBezier<float>::construct(p0, p1, p2).evaluate(t);
 }
 
-ClipProjectionData getClipProjectionData(in MainObject mainObj)
+struct NDCClipProjectionData
 {
-    if (mainObj.clipProjectionIndex != InvalidClipProjectionIndex)
+    pfloat64_t3x3 projectionToNDC; // pre-multiplied projection in a tree
+    float32_t2 minClipNDC;
+    float32_t2 maxClipNDC;
+};
+
+NDCClipProjectionData getClipProjectionData(in MainObject mainObj)
+{
+    NDCClipProjectionData ret;
+    if (mainObj.customProjectionIndex != InvalidCustomProjectionIndex)
     {
-#ifdef NBL_2D_SHOWCASE_MODE
-        return nbl::hlsl::mul(globals.defaultClipProjection.projectionToNDC, loadCustomClipProjection(mainObj.clipProjectionIndex));
-#endif
-        return loadCustomClipProjection(mainObj.clipProjectionIndex);
+        // If projection type is worldspace projection and clip:
+        pfloat64_t3x3 customProjection = loadCustomProjection(mainObj.customProjectionIndex);
+        ret.projectionToNDC = nbl::hlsl::mul(globals.defaultProjectionToNDC, customProjection);
+    }
+    else
+        ret.projectionToNDC = globals.defaultProjectionToNDC;
+
+    if (mainObj.customClipRectIndex != InvalidCustomClipRectIndex)
+    {
+        WorldClipRect worldClipRect = loadCustomClipRect(mainObj.customClipRectIndex);
+        
+        /// [NOTE]: Optimization: we avoid looking for min/max in the shader because minClip and maxClip in default worldspace are defined in such a way that minClip.y > maxClip.y so minClipNDC.y < maxClipNDC.y
+        ret.minClipNDC = nbl::hlsl::_static_cast<float32_t2>(transformPointNdc(globals.defaultProjectionToNDC, worldClipRect.minClip));
+        ret.maxClipNDC = nbl::hlsl::_static_cast<float32_t2>(transformPointNdc(globals.defaultProjectionToNDC, worldClipRect.maxClip));
     }
     else
-        return globals.defaultClipProjection;
+    {
+        ret.minClipNDC = float2(-1.0f, -1.0f);
+        ret.maxClipNDC = float2(+1.0f, +1.0f);
+    }
+    return ret;
 }
 
 float2 transformPointScreenSpace(pfloat64_t3x3 transformation, uint32_t2 resolution, pfloat64_t2 point2d)
@@ -83,7 +105,7 @@ void dilateHatch<false>(out float2 outOffsetVec, out float2 outUV, const float2
 
 PSInput main(uint vertexID : SV_VertexID)
 {
-    ClipProjectionData clipProjectionData;
+    NDCClipProjectionData clipProjectionData;
     
     PSInput outV;
 

From 6e27153186f8c94b34ea3de1ce03179a799721e0 Mon Sep 17 00:00:00 2001
From: Erfan Ahmadi <ahmadierfan99@gmail.com>
Date: Thu, 1 May 2025 14:04:45 +0400
Subject: [PATCH 215/529] small example edit

---
 62_CAD/main.cpp | 11 ++++++-----
 1 file changed, 6 insertions(+), 5 deletions(-)

diff --git a/62_CAD/main.cpp b/62_CAD/main.cpp
index 755a649b2..fc40daccb 100644
--- a/62_CAD/main.cpp
+++ b/62_CAD/main.cpp
@@ -2720,13 +2720,14 @@ class ComputerAidedDesign final : public examples::SimpleWindowedApplication, pu
 				0.0, 0.0, 1.0
 			};
 
-			// left half of screen should be red and right half should be green
+			/// [NOTE]: We set minClip and maxClip (in default worldspace) in such a way that minClip.y > maxClip.y so that minClipNDC.y < maxClipNDC.y
+			// left half should be red and right half should be green
 			WorldClipRect showLeft = {};
-			showLeft.minClip  = float64_t2(-100.0, -1000.0);
-			showLeft.maxClip  = float64_t2(0.0, +1000.0);
+			showLeft.minClip  = float64_t2(-100.0, +1000.0);
+			showLeft.maxClip  = float64_t2(0.0, -1000.0);
 			WorldClipRect showRight = {};
-			showRight.minClip = float64_t2(0.0, -1000.0);
-			showRight.maxClip = float64_t2(100.0, +1000.0);
+			showRight.minClip = float64_t2(0.0, +1000.0);
+			showRight.maxClip = float64_t2(100.0, -1000.0);
 
 			LineStyleInfo leftLineStyle = {};
 			leftLineStyle.screenSpaceLineWidth = 3.0f;

From 7d063322b5994d66e871367bddaf9a5631f39bc3 Mon Sep 17 00:00:00 2001
From: keptsecret <sorchon@gmail.com>
Date: Fri, 2 May 2025 09:41:18 +0700
Subject: [PATCH 216/529] test smaller workgroup sizes

---
 74a_Workgroup2ScanTest/app_resources/workgroupCommon.hlsl | 6 +++---
 74a_Workgroup2ScanTest/main.cpp                           | 4 ++--
 2 files changed, 5 insertions(+), 5 deletions(-)

diff --git a/74a_Workgroup2ScanTest/app_resources/workgroupCommon.hlsl b/74a_Workgroup2ScanTest/app_resources/workgroupCommon.hlsl
index 362b48253..5de666c4b 100644
--- a/74a_Workgroup2ScanTest/app_resources/workgroupCommon.hlsl
+++ b/74a_Workgroup2ScanTest/app_resources/workgroupCommon.hlsl
@@ -21,7 +21,9 @@ uint32_t3 nbl::hlsl::glsl::gl_WorkGroupSize() {return uint32_t3(WORKGROUP_SIZE,1
 #error "Define ITEMS_PER_INVOCATION!"
 #endif
 
-typedef vector<uint32_t, ITEMS_PER_INVOCATION> type_t;
+using config_t = nbl::hlsl::workgroup2::Configuration<WORKGROUP_SIZE_LOG2, SUBGROUP_SIZE_LOG2, ITEMS_PER_INVOCATION>;
+
+typedef vector<uint32_t, config_t::ItemsPerInvocation_0> type_t;
 
 // unfortunately DXC chokes on descriptors as static members
 // https://github.com/microsoft/DirectXShaderCompiler/issues/5940
@@ -40,8 +42,6 @@ bool canStore();
 #error "Define SUBGROUP_SIZE_LOG2!"
 #endif
 
-using config_t = nbl::hlsl::workgroup2::Configuration<WORKGROUP_SIZE_LOG2, SUBGROUP_SIZE_LOG2, ITEMS_PER_INVOCATION>;
-
 groupshared vector<uint32_t, config_t::ItemsPerInvocation_1> scratch[config_t::SubgroupSize];  // final (level 1) scan needs to fit in one subgroup exactly
 
 template<class Config>
diff --git a/74a_Workgroup2ScanTest/main.cpp b/74a_Workgroup2ScanTest/main.cpp
index c5e8370be..e40b87100 100644
--- a/74a_Workgroup2ScanTest/main.cpp
+++ b/74a_Workgroup2ScanTest/main.cpp
@@ -168,7 +168,7 @@ class Workgroup2ScanTestApp final : public application_templates::BasicMultiQueu
 		}
 
 		const auto MaxWorkgroupSize = m_physicalDevice->getLimits().maxComputeWorkGroupInvocations;
-		const std::array<uint32_t, 2> WorkgroupSizes = { 512, 1024 };
+		const std::array<uint32_t, 4> WorkgroupSizes = { 32, 64, 512, 1024 };
 		const auto MinSubgroupSize = m_physicalDevice->getLimits().minSubgroupSize;
 		const auto MaxSubgroupSize = m_physicalDevice->getLimits().maxSubgroupSize;
 		for (auto subgroupSize=MinSubgroupSize; subgroupSize <= MaxSubgroupSize; subgroupSize *= 2u)
@@ -182,7 +182,7 @@ class Workgroup2ScanTestApp final : public application_templates::BasicMultiQueu
 				m_logger->log("Testing Workgroup Size %u with Subgroup Size %u", ILogger::ELL_INFO, workgroupSize, subgroupSize);
 
 				bool passed = true;
-				const uint32_t itemsPerWG = ItemsPerInvocation * max(workgroupSize >> subgroupSizeLog2, subgroupSize) << subgroupSizeLog2;	// TODO use Config::VirtualWorkgroupSize somehow
+				const uint32_t itemsPerWG = workgroupSize <= 4 * subgroupSize ? workgroupSize : ItemsPerInvocation * max(workgroupSize >> subgroupSizeLog2, subgroupSize) << subgroupSizeLog2;	// TODO use Config somehow
 				m_logger->log("Testing Item Count %u", ILogger::ELL_INFO, itemsPerWG);
 				passed = runTest<emulatedReduction, true>(workgroupTestSource, elementCount, subgroupSizeLog2, workgroupSize, itemsPerWG) && passed;
 				logTestOutcome(passed, itemsPerWG);

From 874557c1e091634c945b643e85bf215c2e304d87 Mon Sep 17 00:00:00 2001
From: keptsecret <sorchon@gmail.com>
Date: Fri, 2 May 2025 10:52:11 +0700
Subject: [PATCH 217/529] expanded scratch proxy funcs

---
 .../app_resources/workgroupCommon.hlsl                 | 10 ++++++++++
 74a_Workgroup2ScanTest/main.cpp                        |  2 +-
 2 files changed, 11 insertions(+), 1 deletion(-)

diff --git a/74a_Workgroup2ScanTest/app_resources/workgroupCommon.hlsl b/74a_Workgroup2ScanTest/app_resources/workgroupCommon.hlsl
index 5de666c4b..e60856bf8 100644
--- a/74a_Workgroup2ScanTest/app_resources/workgroupCommon.hlsl
+++ b/74a_Workgroup2ScanTest/app_resources/workgroupCommon.hlsl
@@ -47,6 +47,7 @@ groupshared vector<uint32_t, config_t::ItemsPerInvocation_1> scratch[config_t::S
 template<class Config>
 struct ScratchProxy
 {
+    using scalar_t = uint32_t;
     using stype_t = vector<uint32_t, Config::ItemsPerInvocation_1>;
 
     stype_t get(const uint32_t ix)
@@ -58,6 +59,15 @@ struct ScratchProxy
         scratch[ix] = value;
     }
 
+    scalar_t getByComponent(const uint32_t ix)
+    {
+        return scratch[ix/Config::ItemsPerInvocation_1][ix&(Config::ItemsPerInvocation_1-1)];
+    }
+    void setByComponent(const uint32_t ix, const scalar_t value)
+    {
+        scratch[ix/Config::ItemsPerInvocation_1][ix&(Config::ItemsPerInvocation_1-1)] = value;
+    }
+
     stype_t atomicOr(const uint32_t ix, const stype_t value)
     {
         return nbl::hlsl::glsl::atomicOr(scratch[ix],value);
diff --git a/74a_Workgroup2ScanTest/main.cpp b/74a_Workgroup2ScanTest/main.cpp
index e40b87100..57e70bf68 100644
--- a/74a_Workgroup2ScanTest/main.cpp
+++ b/74a_Workgroup2ScanTest/main.cpp
@@ -168,7 +168,7 @@ class Workgroup2ScanTestApp final : public application_templates::BasicMultiQueu
 		}
 
 		const auto MaxWorkgroupSize = m_physicalDevice->getLimits().maxComputeWorkGroupInvocations;
-		const std::array<uint32_t, 4> WorkgroupSizes = { 32, 64, 512, 1024 };
+		const std::array<uint32_t, 3> WorkgroupSizes = { 64, 512, 1024 };
 		const auto MinSubgroupSize = m_physicalDevice->getLimits().minSubgroupSize;
 		const auto MaxSubgroupSize = m_physicalDevice->getLimits().maxSubgroupSize;
 		for (auto subgroupSize=MinSubgroupSize; subgroupSize <= MaxSubgroupSize; subgroupSize *= 2u)

From 6706df65de1b09513930d8e40e7d2c67d1f3b42c Mon Sep 17 00:00:00 2001
From: Erfan Ahmadi <ahmadierfan99@gmail.com>
Date: Fri, 2 May 2025 09:47:59 +0400
Subject: [PATCH 218/529] plus should be drawn, emulated fp64 bug

---
 62_CAD/DrawResourcesFiller.h                  |  4 +-
 62_CAD/main.cpp                               | 50 +++++++++----------
 .../shaders/main_pipeline/vertex_shader.hlsl  |  9 ++++
 3 files changed, 36 insertions(+), 27 deletions(-)

diff --git a/62_CAD/DrawResourcesFiller.h b/62_CAD/DrawResourcesFiller.h
index b92685959..c6ae52920 100644
--- a/62_CAD/DrawResourcesFiller.h
+++ b/62_CAD/DrawResourcesFiller.h
@@ -239,8 +239,8 @@ struct DrawResourcesFiller
 	void pushCustomClipRect(const WorldClipRect& clipRect);
 	void popCustomClipRect();
 
-	const std::deque<float64_t3x3>& getProjectionStack() const { return activeProjections; }
-	const std::deque<WorldClipRect>& getClipRectsStack() const { return activeClipRects; }
+	const std::deque<float64_t3x3>& getCustomProjectionStack() const { return activeProjections; }
+	const std::deque<WorldClipRect>& getCustomClipRectsStack() const { return activeClipRects; }
 
 	smart_refctd_ptr<IGPUImageView> getMSDFsTextureArray() { return msdfTextureArray; }
 
diff --git a/62_CAD/main.cpp b/62_CAD/main.cpp
index fc40daccb..89938d2b0 100644
--- a/62_CAD/main.cpp
+++ b/62_CAD/main.cpp
@@ -77,7 +77,7 @@ constexpr std::array<float, (uint32_t)ExampleMode::CASE_COUNT> cameraExtents =
 	10.0	// CASE_BUG
 };
 
-constexpr ExampleMode mode = ExampleMode::CASE_6;
+constexpr ExampleMode mode = ExampleMode::CASE_BUG;
 
 class Camera2D
 {
@@ -3343,30 +3343,30 @@ class ComputerAidedDesign final : public examples::SimpleWindowedApplication, pu
 			style.worldSpaceLineWidth = 0.0f;
 			style.color = float32_t4(0.619f, 0.325f, 0.709f, 0.5f);
 
-			for (uint32_t i = 0; i < 128u; ++i)
-			{
-				std::vector<shapes::QuadraticBezier<double>> quadBeziers;
-				curves::EllipticalArcInfo myCircle;
-				{
-					myCircle.majorAxis = { 0.05 , 0.0};
-					myCircle.center = { 0.0 + i * 0.1, i * 0.1 };
-					myCircle.angleBounds = {
-						nbl::core::PI<double>() * 0.0,
-						nbl::core::PI<double>() * 2.0
-					};
-					myCircle.eccentricity = 1.0;
-				}
-
-				curves::Subdivision::AddBezierFunc addToBezier = [&](shapes::QuadraticBezier<double>&& info) -> void
-					{
-						quadBeziers.push_back(info);
-					};
-
-				curves::Subdivision::adaptive(myCircle, 1e-5, addToBezier, 10u);
-				polyline.addQuadBeziers(quadBeziers);
-				drawResourcesFiller.drawPolyline(polyline, style, intendedNextSubmit);
-				polyline.clearEverything();
-			}
+			//for (uint32_t i = 0; i < 128u; ++i)
+			//{
+			//	std::vector<shapes::QuadraticBezier<double>> quadBeziers;
+			//	curves::EllipticalArcInfo myCircle;
+			//	{
+			//		myCircle.majorAxis = { 0.05 , 0.0};
+			//		myCircle.center = { 0.0 + i * 0.1, i * 0.1 };
+			//		myCircle.angleBounds = {
+			//			nbl::core::PI<double>() * 0.0,
+			//			nbl::core::PI<double>() * 2.0
+			//		};
+			//		myCircle.eccentricity = 1.0;
+			//	}
+
+			//	curves::Subdivision::AddBezierFunc addToBezier = [&](shapes::QuadraticBezier<double>&& info) -> void
+			//		{
+			//			quadBeziers.push_back(info);
+			//		};
+
+			//	curves::Subdivision::adaptive(myCircle, 1e-5, addToBezier, 10u);
+			//	polyline.addQuadBeziers(quadBeziers);
+			//	drawResourcesFiller.drawPolyline(polyline, style, intendedNextSubmit);
+			//	polyline.clearEverything();
+			//}
 			
 			float64_t2 line0[2u] = 
 			{
diff --git a/62_CAD/shaders/main_pipeline/vertex_shader.hlsl b/62_CAD/shaders/main_pipeline/vertex_shader.hlsl
index 66101410e..c979f3b0b 100644
--- a/62_CAD/shaders/main_pipeline/vertex_shader.hlsl
+++ b/62_CAD/shaders/main_pipeline/vertex_shader.hlsl
@@ -32,9 +32,15 @@ struct NDCClipProjectionData
 
 NDCClipProjectionData getClipProjectionData(in MainObject mainObj)
 {
+    pfloat64_t3x3 weirdProjection =  nbl::hlsl::_static_cast<pfloat64_t3x3>(
+        float32_t3x3(1.0f, 0.0f, 0.0f,
+                     0.0f, 1.0f, 0.0f,
+                     0.0f, 0.0f, 1.0f));
+    
     NDCClipProjectionData ret;
     if (mainObj.customProjectionIndex != InvalidCustomProjectionIndex)
     {
+    
         // If projection type is worldspace projection and clip:
         pfloat64_t3x3 customProjection = loadCustomProjection(mainObj.customProjectionIndex);
         ret.projectionToNDC = nbl::hlsl::mul(globals.defaultProjectionToNDC, customProjection);
@@ -55,6 +61,9 @@ NDCClipProjectionData getClipProjectionData(in MainObject mainObj)
         ret.minClipNDC = float2(-1.0f, -1.0f);
         ret.maxClipNDC = float2(+1.0f, +1.0f);
     }
+    
+    ret.projectionToNDC = nbl::hlsl::mul(ret.projectionToNDC, weirdProjection);
+    
     return ret;
 }
 

From c9d2abf21fb9c4af6407b53a4e7025be5ed28d8e Mon Sep 17 00:00:00 2001
From: Erfan Ahmadi <ahmadierfan99@gmail.com>
Date: Fri, 2 May 2025 11:19:31 +0400
Subject: [PATCH 219/529] Fixed Geometry and Fixed Screenspace sized polylines
 handle

---
 62_CAD/DrawResourcesFiller.cpp                | 32 +++++++++++++++++--
 62_CAD/DrawResourcesFiller.h                  | 11 +++++--
 62_CAD/shaders/globals.hlsl                   | 14 ++++++--
 .../shaders/main_pipeline/vertex_shader.hlsl  |  9 ++----
 4 files changed, 52 insertions(+), 14 deletions(-)

diff --git a/62_CAD/DrawResourcesFiller.cpp b/62_CAD/DrawResourcesFiller.cpp
index 759db16f3..b2f4e4950 100644
--- a/62_CAD/DrawResourcesFiller.cpp
+++ b/62_CAD/DrawResourcesFiller.cpp
@@ -87,11 +87,36 @@ void DrawResourcesFiller::drawPolyline(const CPolylineBase& polyline, const Line
 
 	setActiveLineStyle(lineStyleInfo);
 	
-	beginMainObject(MainObjectType::POLYLINE);
+	beginMainObject(MainObjectType::POLYLINE, TransformationType::NORMAL);
 	drawPolyline(polyline, intendedNextSubmit);
 	endMainObject();
 }
 
+void DrawResourcesFiller::drawFixedGeometryPolyline(const CPolylineBase& polyline, const LineStyleInfo& lineStyleInfo, const float64_t3x3& transformation, TransformationType transformationType, SIntendedSubmitInfo& intendedNextSubmit)
+{
+	if (!lineStyleInfo.isVisible())
+		return;
+
+	setActiveLineStyle(lineStyleInfo);
+	
+	if (!activeProjections.empty())
+	{
+		// if there is already an active custom projection, it should be considered into the transformation of the fixed geometry polyline
+		float64_t3x3 newTransformation = nbl::hlsl::mul(activeProjections.back(), transformation);
+		pushCustomProjection(newTransformation);
+	}
+	else
+	{
+		// will be multiplied by the default projection matrix from the left (in shader), no need to consider it here
+		pushCustomProjection(transformation);
+	}
+
+	beginMainObject(MainObjectType::POLYLINE, TransformationType::FIXED_SCREENSPACE_SIZE);
+	drawPolyline(polyline, intendedNextSubmit);
+	endMainObject();
+	popCustomProjection();
+}
+
 void DrawResourcesFiller::drawPolyline(const CPolylineBase& polyline, SIntendedSubmitInfo& intendedNextSubmit)
 {
 	uint32_t mainObjectIdx = acquireActiveMainObjectIndex_SubmitIfNeeded(intendedNextSubmit);
@@ -364,15 +389,17 @@ void DrawResourcesFiller::setActiveDTMSettings(const DTMSettingsInfo& dtmSetting
 	activeDTMSettingsIndex = InvalidDTMSettingsIdx;
 }
 
-void DrawResourcesFiller::beginMainObject(MainObjectType type)
+void DrawResourcesFiller::beginMainObject(MainObjectType type, TransformationType transformationType)
 {
 	activeMainObjectType = type;
+	activeMainObjectTransformationType = transformationType;
 	activeMainObjectIndex = InvalidMainObjectIdx;
 }
 
 void DrawResourcesFiller::endMainObject()
 {
 	activeMainObjectType = MainObjectType::NONE;
+	activeMainObjectTransformationType = TransformationType::NORMAL;
 	activeMainObjectIndex = InvalidMainObjectIdx;
 }
 
@@ -784,6 +811,7 @@ uint32_t DrawResourcesFiller::acquireActiveMainObjectIndex_SubmitIfNeeded(SInten
 	mainObject.dtmSettingsIdx = (needsDTMSettings) ? acquireActiveDTMSettingsIndex_SubmitIfNeeded(intendedNextSubmit) : InvalidDTMSettingsIdx;
 	mainObject.customProjectionIndex = (needsCustomProjection) ? acquireActiveCustomProjectionIndex_SubmitIfNeeded(intendedNextSubmit) : InvalidCustomProjectionIndex;
 	mainObject.customClipRectIndex = (needsCustomClipRect) ? acquireActiveCustomClipRectIndex_SubmitIfNeeded(intendedNextSubmit) : InvalidCustomClipRectIndex;
+	mainObject.transformationType = (uint32_t)activeMainObjectTransformationType;
 	activeMainObjectIndex = resourcesCollection.mainObjects.addAndGetOffset(mainObject);
 	return activeMainObjectIndex;
 }
diff --git a/62_CAD/DrawResourcesFiller.h b/62_CAD/DrawResourcesFiller.h
index c6ae52920..d64e5c0af 100644
--- a/62_CAD/DrawResourcesFiller.h
+++ b/62_CAD/DrawResourcesFiller.h
@@ -14,7 +14,7 @@ using namespace nbl::asset;
 using namespace nbl::ext::TextRendering;
 
 static_assert(sizeof(DrawObject) == 16u);
-static_assert(sizeof(MainObject) == 16u);
+static_assert(sizeof(MainObject) == 20u);
 static_assert(sizeof(LineStyle) == 88u);
 
 // ! DrawResourcesFiller
@@ -149,9 +149,15 @@ struct DrawResourcesFiller
 	// take a `SIntendedSubmitInfo` like others, but don't use it as I don't want you to handle anything regarding autoSubmit
 	// somehow retrieve or calculate the geometry buffer offsets of your vertex and index buffer to be used outside for binding purposes
 
+	
 	//! this function fills buffers required for drawing a polyline and submits a draw through provided callback when there is not enough memory.
 	void drawPolyline(const CPolylineBase& polyline, const LineStyleInfo& lineStyleInfo, SIntendedSubmitInfo& intendedNextSubmit);
 
+
+	//! Draws a fixed-geometry polyline using a custom transformation.
+	//! TODO: Change `polyline` input to an ID referencing a possibly cached instance in our buffers, allowing reuse and avoiding redundant uploads.
+	void drawFixedGeometryPolyline(const CPolylineBase& polyline, const LineStyleInfo& lineStyleInfo, const float64_t3x3& transformation, TransformationType transformationType, SIntendedSubmitInfo& intendedNextSubmit);
+	
 	/// Use this in a begin/endMainObject scope when you want to draw different polylines that should essentially be a single main object (no self-blending between components of a single main object)
 	/// WARNING: make sure this function  is called within begin/endMainObject scope
 	void drawPolyline(const CPolylineBase& polyline, SIntendedSubmitInfo& intendedNextSubmit);
@@ -230,7 +236,7 @@ struct DrawResourcesFiller
 	void setActiveLineStyle(const LineStyleInfo& lineStyle);
 	void setActiveDTMSettings(const DTMSettingsInfo& dtmSettingsInfo);
 
-	void beginMainObject(MainObjectType type);
+	void beginMainObject(MainObjectType type, TransformationType transformationType = TransformationType::NORMAL);
 	void endMainObject();
 
 	void pushCustomProjection(const float64_t3x3& projection);
@@ -522,6 +528,7 @@ struct DrawResourcesFiller
 	uint32_t activeDTMSettingsIndex = InvalidDTMSettingsIdx;
 
 	MainObjectType activeMainObjectType;
+	TransformationType activeMainObjectTransformationType;
 	uint32_t activeMainObjectIndex = InvalidMainObjectIdx;
 
 	// The ClipRects & Projections are stack, because user can push/pop ClipRects & Projections in any order
diff --git a/62_CAD/shaders/globals.hlsl b/62_CAD/shaders/globals.hlsl
index d3a4968bb..e5fe21e03 100644
--- a/62_CAD/shaders/globals.hlsl
+++ b/62_CAD/shaders/globals.hlsl
@@ -1,7 +1,7 @@
 #ifndef _CAD_EXAMPLE_GLOBALS_HLSL_INCLUDED_
 #define _CAD_EXAMPLE_GLOBALS_HLSL_INCLUDED_
 
-#define NBL_FORCE_EMULATED_FLOAT_64
+// #define NBL_FORCE_EMULATED_FLOAT_64
 
 #include <nbl/builtin/hlsl/portable/float64_t.hlsl>
 #include <nbl/builtin/hlsl/portable/vector_t.hlsl>
@@ -62,6 +62,7 @@ struct Globals
 {
     Pointers pointers;
     pfloat64_t3x3 defaultProjectionToNDC;
+    pfloat64_t3x3 screenToWorldScaleTransform; // Pre-multiply your transform with this to scale in screen space (e.g., scale 100.0 means 100 screen pixels).
     float screenToWorldRatio;
     float worldToScreenRatio;
     uint32_t2 resolution;
@@ -71,7 +72,7 @@ struct Globals
     float32_t _padding;
 };
 #ifndef __HLSL_VERSION
-static_assert(sizeof(Globals) == 160u);
+static_assert(sizeof(Globals) == 232u);
 #endif
 
 #ifdef __HLSL_VERSION
@@ -133,6 +134,13 @@ enum class MajorAxis : uint32_t
     MAJOR_Y = 1u,
 };
 
+enum TransformationType 
+{
+    NORMAL = 0,
+    FIXED_SCREENSPACE_SIZE
+};
+
+
 // Consists of multiple DrawObjects
 // [IDEA]: In GPU-driven rendering, to save mem for MainObject data fetching: many of these can be shared amongst different main objects, we could find these styles, settings, etc indices with upper_bound
 // [TODO]: pack indices and members of mainObject and DrawObject + enforce max size for autosubmit --> but do it only after the mainobject definition is finalized in gpu-driven rendering work
@@ -142,6 +150,7 @@ struct MainObject
     uint32_t dtmSettingsIdx;
     uint32_t customProjectionIndex;
     uint32_t customClipRectIndex;
+    uint32_t transformationType; // todo pack later, it's just 2 possible values atm
 };
 
 struct DrawObject
@@ -151,7 +160,6 @@ struct DrawObject
     uint64_t geometryAddress;
 };
 
-
 // Goes into geometry buffer, needs to be aligned by 8
 struct LinePointInfo
 {
diff --git a/62_CAD/shaders/main_pipeline/vertex_shader.hlsl b/62_CAD/shaders/main_pipeline/vertex_shader.hlsl
index c979f3b0b..a25426b95 100644
--- a/62_CAD/shaders/main_pipeline/vertex_shader.hlsl
+++ b/62_CAD/shaders/main_pipeline/vertex_shader.hlsl
@@ -32,15 +32,9 @@ struct NDCClipProjectionData
 
 NDCClipProjectionData getClipProjectionData(in MainObject mainObj)
 {
-    pfloat64_t3x3 weirdProjection =  nbl::hlsl::_static_cast<pfloat64_t3x3>(
-        float32_t3x3(1.0f, 0.0f, 0.0f,
-                     0.0f, 1.0f, 0.0f,
-                     0.0f, 0.0f, 1.0f));
-    
     NDCClipProjectionData ret;
     if (mainObj.customProjectionIndex != InvalidCustomProjectionIndex)
     {
-    
         // If projection type is worldspace projection and clip:
         pfloat64_t3x3 customProjection = loadCustomProjection(mainObj.customProjectionIndex);
         ret.projectionToNDC = nbl::hlsl::mul(globals.defaultProjectionToNDC, customProjection);
@@ -62,7 +56,8 @@ NDCClipProjectionData getClipProjectionData(in MainObject mainObj)
         ret.maxClipNDC = float2(+1.0f, +1.0f);
     }
     
-    ret.projectionToNDC = nbl::hlsl::mul(ret.projectionToNDC, weirdProjection);
+    if (mainObj.transformationType == TransformationType::FIXED_SCREENSPACE_SIZE)
+        ret.projectionToNDC = nbl::hlsl::mul(ret.projectionToNDC, globals.screenToWorldScaleTransform);
     
     return ret;
 }

From 7381460e1c9a5b0bc1a8f319c0f81b0d577e97d8 Mon Sep 17 00:00:00 2001
From: Erfan Ahmadi <ahmadierfan99@gmail.com>
Date: Fri, 2 May 2025 11:19:59 +0400
Subject: [PATCH 220/529] drawFixedGeometryPolyline add case to example

---
 62_CAD/main.cpp | 105 +++++++++++++++++++++++++++++++-----------------
 1 file changed, 68 insertions(+), 37 deletions(-)

diff --git a/62_CAD/main.cpp b/62_CAD/main.cpp
index 89938d2b0..04d99a9cf 100644
--- a/62_CAD/main.cpp
+++ b/62_CAD/main.cpp
@@ -58,7 +58,7 @@ enum class ExampleMode
 	CASE_7, // Images
 	CASE_8, // MSDF and Text
 	CASE_9, // DTM
-	CASE_BUG, // Bug Repro 
+	CASE_BUG, // Bug Repro, after fix, rename to CASE_10 and comment should be: testing fixed geometry and emulated fp64 corner cases
 	CASE_COUNT
 };
 
@@ -1220,6 +1220,9 @@ class ComputerAidedDesign final : public examples::SimpleWindowedApplication, pu
 		float screenToWorld = getScreenToWorldRatio(globalData.defaultProjectionToNDC, globalData.resolution);
 		globalData.screenToWorldRatio = screenToWorld;
 		globalData.worldToScreenRatio = (1.0f/screenToWorld);
+		globalData.screenToWorldScaleTransform = float64_t3x3(globalData.worldToScreenRatio, 0.0f, 0.0f,
+														 0.0f, globalData.worldToScreenRatio, 0.0f,
+														 0.0f, 0.0f, 1.0f);
 		globalData.miterLimit = 10.0f;
 		globalData.currentlyActiveMainObjectIndex = drawResourcesFiller.getActiveMainObjectIndex();
 		SBufferRange<IGPUBuffer> globalBufferUpdateRange = { .offset = 0ull, .size = sizeof(Globals), .buffer = m_globalsBuffer.get() };
@@ -3343,45 +3346,73 @@ class ComputerAidedDesign final : public examples::SimpleWindowedApplication, pu
 			style.worldSpaceLineWidth = 0.0f;
 			style.color = float32_t4(0.619f, 0.325f, 0.709f, 0.5f);
 
-			//for (uint32_t i = 0; i < 128u; ++i)
-			//{
-			//	std::vector<shapes::QuadraticBezier<double>> quadBeziers;
-			//	curves::EllipticalArcInfo myCircle;
-			//	{
-			//		myCircle.majorAxis = { 0.05 , 0.0};
-			//		myCircle.center = { 0.0 + i * 0.1, i * 0.1 };
-			//		myCircle.angleBounds = {
-			//			nbl::core::PI<double>() * 0.0,
-			//			nbl::core::PI<double>() * 2.0
-			//		};
-			//		myCircle.eccentricity = 1.0;
-			//	}
-
-			//	curves::Subdivision::AddBezierFunc addToBezier = [&](shapes::QuadraticBezier<double>&& info) -> void
-			//		{
-			//			quadBeziers.push_back(info);
-			//		};
-
-			//	curves::Subdivision::adaptive(myCircle, 1e-5, addToBezier, 10u);
-			//	polyline.addQuadBeziers(quadBeziers);
-			//	drawResourcesFiller.drawPolyline(polyline, style, intendedNextSubmit);
-			//	polyline.clearEverything();
-			//}
-			
-			float64_t2 line0[2u] = 
+			for (uint32_t i = 0; i < 128u; ++i)
 			{
-				float64_t2(-1.0, 0.0),
-				float64_t2(+1.0, 0.0),
-			};
-			float64_t2 line1[2u] = 
+				std::vector<shapes::QuadraticBezier<double>> quadBeziers;
+				curves::EllipticalArcInfo myCircle;
+				{
+					myCircle.majorAxis = { 0.05 , 0.0};
+					myCircle.center = { 0.0 + i * 0.1, i * 0.1 };
+					myCircle.angleBounds = {
+						nbl::core::PI<double>() * 0.0,
+						nbl::core::PI<double>() * 2.0
+					};
+					myCircle.eccentricity = 1.0;
+				}
+
+				curves::Subdivision::AddBezierFunc addToBezier = [&](shapes::QuadraticBezier<double>&& info) -> void
+					{
+						quadBeziers.push_back(info);
+					};
+
+				curves::Subdivision::adaptive(myCircle, 1e-5, addToBezier, 10u);
+				polyline.addQuadBeziers(quadBeziers);
+				drawResourcesFiller.drawPolyline(polyline, style, intendedNextSubmit);
+				polyline.clearEverything();
+			}
+			
+			// Testing Fixed Geometry
 			{
-				float64_t2(0.0, -1.0),
-				float64_t2(0.0, +1.0),
-			};
+				float64_t2 line0[2u] =
+				{
+					float64_t2(-1.0, 0.0),
+					float64_t2(+1.0, 0.0),
+				};
+				float64_t2 line1[2u] =
+				{
+					float64_t2(0.0, -1.0),
+					float64_t2(0.0, +1.0),
+				};
 
-			polyline.addLinePoints(line0);
-			polyline.addLinePoints(line1);
-			drawResourcesFiller.drawPolyline(polyline, style, intendedNextSubmit);
+				float64_t3x3 translateMat =
+				{
+					1.0, 0.0, 0.0,
+					0.0, 1.0, 0.0,
+					0.0, 0.0, 1.0
+				};
+
+				float64_t angle = m_timeElapsed * 0.001;
+				float64_t2 dir = float64_t2{ cos(angle), sin(angle) };
+				float64_t3x3 rotateMat =
+				{
+					dir.x, -dir.y, 0.0,
+					dir.y, dir.x,  0.0,
+					0.0, 0.0, 1.0
+				};
+
+				float64_t2 scale = float64_t2{ 10.0, 10.0 };
+				float64_t3x3 scaleMat =
+				{
+					scale.x, 0.0, 0.0,
+					0.0, scale.y, 0.0,
+					0.0, 0.0, 1.0
+				};
+
+				float64_t3x3 transformation = nbl::hlsl::mul(translateMat, nbl::hlsl::mul(rotateMat, scaleMat));
+				polyline.addLinePoints(line0);
+				polyline.addLinePoints(line1);
+				drawResourcesFiller.drawFixedGeometryPolyline(polyline, style, transformation, TransformationType::FIXED_SCREENSPACE_SIZE, intendedNextSubmit);
+			}
 		}
 
 		drawResourcesFiller.finalizeAllCopiesToGPU(intendedNextSubmit);

From 1db6e0461f06c9316ae4181d25e9e42e97c34707 Mon Sep 17 00:00:00 2001
From: Erfan Ahmadi <ahmadierfan99@gmail.com>
Date: Fri, 2 May 2025 13:04:19 +0400
Subject: [PATCH 221/529] small fixes on transformation type

---
 62_CAD/DrawResourcesFiller.cpp | 6 +++---
 62_CAD/DrawResourcesFiller.h   | 2 +-
 62_CAD/shaders/globals.hlsl    | 4 ++--
 3 files changed, 6 insertions(+), 6 deletions(-)

diff --git a/62_CAD/DrawResourcesFiller.cpp b/62_CAD/DrawResourcesFiller.cpp
index b2f4e4950..55273873e 100644
--- a/62_CAD/DrawResourcesFiller.cpp
+++ b/62_CAD/DrawResourcesFiller.cpp
@@ -87,7 +87,7 @@ void DrawResourcesFiller::drawPolyline(const CPolylineBase& polyline, const Line
 
 	setActiveLineStyle(lineStyleInfo);
 	
-	beginMainObject(MainObjectType::POLYLINE, TransformationType::NORMAL);
+	beginMainObject(MainObjectType::POLYLINE, TransformationType::TT_NORMAL);
 	drawPolyline(polyline, intendedNextSubmit);
 	endMainObject();
 }
@@ -111,7 +111,7 @@ void DrawResourcesFiller::drawFixedGeometryPolyline(const CPolylineBase& polylin
 		pushCustomProjection(transformation);
 	}
 
-	beginMainObject(MainObjectType::POLYLINE, TransformationType::FIXED_SCREENSPACE_SIZE);
+	beginMainObject(MainObjectType::POLYLINE, transformationType);
 	drawPolyline(polyline, intendedNextSubmit);
 	endMainObject();
 	popCustomProjection();
@@ -399,7 +399,7 @@ void DrawResourcesFiller::beginMainObject(MainObjectType type, TransformationTyp
 void DrawResourcesFiller::endMainObject()
 {
 	activeMainObjectType = MainObjectType::NONE;
-	activeMainObjectTransformationType = TransformationType::NORMAL;
+	activeMainObjectTransformationType = TransformationType::TT_NORMAL;
 	activeMainObjectIndex = InvalidMainObjectIdx;
 }
 
diff --git a/62_CAD/DrawResourcesFiller.h b/62_CAD/DrawResourcesFiller.h
index d64e5c0af..049299974 100644
--- a/62_CAD/DrawResourcesFiller.h
+++ b/62_CAD/DrawResourcesFiller.h
@@ -236,7 +236,7 @@ struct DrawResourcesFiller
 	void setActiveLineStyle(const LineStyleInfo& lineStyle);
 	void setActiveDTMSettings(const DTMSettingsInfo& dtmSettingsInfo);
 
-	void beginMainObject(MainObjectType type, TransformationType transformationType = TransformationType::NORMAL);
+	void beginMainObject(MainObjectType type, TransformationType transformationType = TransformationType::TT_NORMAL);
 	void endMainObject();
 
 	void pushCustomProjection(const float64_t3x3& projection);
diff --git a/62_CAD/shaders/globals.hlsl b/62_CAD/shaders/globals.hlsl
index e5fe21e03..a71c920a6 100644
--- a/62_CAD/shaders/globals.hlsl
+++ b/62_CAD/shaders/globals.hlsl
@@ -136,8 +136,8 @@ enum class MajorAxis : uint32_t
 
 enum TransformationType 
 {
-    NORMAL = 0,
-    FIXED_SCREENSPACE_SIZE
+    TT_NORMAL = 0,
+    TT_FIXED_SCREENSPACE_SIZE
 };
 
 

From 7521f571d80aee25bb88f6ee30e416e709f60d74 Mon Sep 17 00:00:00 2001
From: Erfan Ahmadi <ahmadierfan99@gmail.com>
Date: Fri, 2 May 2025 13:05:07 +0400
Subject: [PATCH 222/529] fix vtx shader typo

---
 62_CAD/shaders/main_pipeline/vertex_shader.hlsl | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/62_CAD/shaders/main_pipeline/vertex_shader.hlsl b/62_CAD/shaders/main_pipeline/vertex_shader.hlsl
index a25426b95..478ad964f 100644
--- a/62_CAD/shaders/main_pipeline/vertex_shader.hlsl
+++ b/62_CAD/shaders/main_pipeline/vertex_shader.hlsl
@@ -56,7 +56,7 @@ NDCClipProjectionData getClipProjectionData(in MainObject mainObj)
         ret.maxClipNDC = float2(+1.0f, +1.0f);
     }
     
-    if (mainObj.transformationType == TransformationType::FIXED_SCREENSPACE_SIZE)
+    if (mainObj.transformationType == TransformationType::TT_FIXED_SCREENSPACE_SIZE)
         ret.projectionToNDC = nbl::hlsl::mul(ret.projectionToNDC, globals.screenToWorldScaleTransform);
     
     return ret;

From 7ab6f5d7fbe22841da07b3fcf03a235e5b04e1e3 Mon Sep 17 00:00:00 2001
From: Erfan Ahmadi <ahmadierfan99@gmail.com>
Date: Sat, 3 May 2025 10:44:30 +0400
Subject: [PATCH 223/529] small example edit

---
 62_CAD/main.cpp | 18 ++++++++++--------
 1 file changed, 10 insertions(+), 8 deletions(-)

diff --git a/62_CAD/main.cpp b/62_CAD/main.cpp
index 04d99a9cf..9ab67ffe2 100644
--- a/62_CAD/main.cpp
+++ b/62_CAD/main.cpp
@@ -77,7 +77,7 @@ constexpr std::array<float, (uint32_t)ExampleMode::CASE_COUNT> cameraExtents =
 	10.0	// CASE_BUG
 };
 
-constexpr ExampleMode mode = ExampleMode::CASE_BUG;
+constexpr ExampleMode mode = ExampleMode::CASE_2;
 
 class Camera2D
 {
@@ -3342,8 +3342,7 @@ class ComputerAidedDesign final : public examples::SimpleWindowedApplication, pu
 			CPolyline polyline;
 			
 			LineStyleInfo style = {};
-			style.screenSpaceLineWidth = 1.0f;
-			style.worldSpaceLineWidth = 0.0f;
+			style.screenSpaceLineWidth = 4.0f;
 			style.color = float32_t4(0.619f, 0.325f, 0.709f, 0.5f);
 
 			for (uint32_t i = 0; i < 128u; ++i)
@@ -3367,7 +3366,7 @@ class ComputerAidedDesign final : public examples::SimpleWindowedApplication, pu
 
 				curves::Subdivision::adaptive(myCircle, 1e-5, addToBezier, 10u);
 				polyline.addQuadBeziers(quadBeziers);
-				drawResourcesFiller.drawPolyline(polyline, style, intendedNextSubmit);
+				// drawResourcesFiller.drawPolyline(polyline, style, intendedNextSubmit);
 				polyline.clearEverything();
 			}
 			
@@ -3378,10 +3377,11 @@ class ComputerAidedDesign final : public examples::SimpleWindowedApplication, pu
 					float64_t2(-1.0, 0.0),
 					float64_t2(+1.0, 0.0),
 				};
-				float64_t2 line1[2u] =
+				float64_t2 line1[3u] =
 				{
 					float64_t2(0.0, -1.0),
 					float64_t2(0.0, +1.0),
+					float64_t2(+1.0, +1.0),
 				};
 
 				float64_t3x3 translateMat =
@@ -3390,7 +3390,7 @@ class ComputerAidedDesign final : public examples::SimpleWindowedApplication, pu
 					0.0, 1.0, 0.0,
 					0.0, 0.0, 1.0
 				};
-
+				
 				float64_t angle = m_timeElapsed * 0.001;
 				float64_t2 dir = float64_t2{ cos(angle), sin(angle) };
 				float64_t3x3 rotateMat =
@@ -3400,7 +3400,7 @@ class ComputerAidedDesign final : public examples::SimpleWindowedApplication, pu
 					0.0, 0.0, 1.0
 				};
 
-				float64_t2 scale = float64_t2{ 10.0, 10.0 };
+				float64_t2 scale = float64_t2{ 100.0, 100.0 };
 				float64_t3x3 scaleMat =
 				{
 					scale.x, 0.0, 0.0,
@@ -3411,7 +3411,9 @@ class ComputerAidedDesign final : public examples::SimpleWindowedApplication, pu
 				float64_t3x3 transformation = nbl::hlsl::mul(translateMat, nbl::hlsl::mul(rotateMat, scaleMat));
 				polyline.addLinePoints(line0);
 				polyline.addLinePoints(line1);
-				drawResourcesFiller.drawFixedGeometryPolyline(polyline, style, transformation, TransformationType::FIXED_SCREENSPACE_SIZE, intendedNextSubmit);
+				polyline.preprocessPolylineWithStyle(style);
+				// drawResourcesFiller.drawPolyline(polyline, intendedNextSubmit);
+				drawResourcesFiller.drawFixedGeometryPolyline(polyline, style, transformation, TransformationType::TT_FIXED_SCREENSPACE_SIZE, intendedNextSubmit);
 			}
 		}
 

From f88f6b6a0710c052bac6b1d5bedbd2af9842af53 Mon Sep 17 00:00:00 2001
From: Erfan Ahmadi <ahmadierfan99@gmail.com>
Date: Sun, 4 May 2025 17:40:00 +0400
Subject: [PATCH 224/529] Preparation work for backing up and restoring draw
 resources cache (for view-only GPU mode in n4ce)

---
 62_CAD/DrawResourcesFiller.cpp | 88 +++++++++++++++++-----------------
 62_CAD/DrawResourcesFiller.h   | 74 ++++++++++++++++++----------
 62_CAD/shaders/globals.hlsl    |  1 +
 3 files changed, 92 insertions(+), 71 deletions(-)

diff --git a/62_CAD/DrawResourcesFiller.cpp b/62_CAD/DrawResourcesFiller.cpp
index 55273873e..7bfb92cea 100644
--- a/62_CAD/DrawResourcesFiller.cpp
+++ b/62_CAD/DrawResourcesFiller.cpp
@@ -17,6 +17,7 @@ void DrawResourcesFiller::setSubmitDrawsFunction(const SubmitFunc& func)
 
 void DrawResourcesFiller::allocateResourcesBuffer(ILogicalDevice* logicalDevice, size_t size)
 {
+	// TODO: Make this function failable and report insufficient memory if less that getMinimumRequiredResourcesBufferSize, TODO: Have retry mechanism to allocate less mem
 	size = core::alignUp(size, ResourcesMaxNaturalAlignment);
 	size = core::max(size, getMinimumRequiredResourcesBufferSize());
 	// size = 368u; STRESS TEST
@@ -33,9 +34,7 @@ void DrawResourcesFiller::allocateResourcesBuffer(ILogicalDevice* logicalDevice,
 
 void DrawResourcesFiller::allocateMSDFTextures(ILogicalDevice* logicalDevice, uint32_t maxMSDFs, uint32_t2 msdfsExtent)
 {
-	msdfLRUCache = std::unique_ptr<MSDFsLRUCache>(new MSDFsLRUCache(maxMSDFs));
-	msdfTextureArrayIndexAllocator = core::make_smart_refctd_ptr<IndexAllocator>(core::smart_refctd_ptr<ILogicalDevice>(logicalDevice), maxMSDFs);
-
+	// TODO: Make this function failable and report insufficient memory
 	asset::E_FORMAT msdfFormat = MSDFTextureFormat;
 	asset::VkExtent3D MSDFsExtent = { msdfsExtent.x, msdfsExtent.y, 1u }; 
 	assert(maxMSDFs <= logicalDevice->getPhysicalDevice()->getLimits().maxImageArrayLayers);
@@ -78,6 +77,10 @@ void DrawResourcesFiller::allocateMSDFTextures(ILogicalDevice* logicalDevice, ui
 
 		msdfTextureArray = logicalDevice->createImageView(std::move(imgViewInfo));
 	}
+
+	msdfLRUCache = std::unique_ptr<MSDFsLRUCache>(new MSDFsLRUCache(maxMSDFs));
+	msdfTextureArrayIndexAllocator = core::make_smart_refctd_ptr<IndexAllocator>(core::smart_refctd_ptr<ILogicalDevice>(logicalDevice), maxMSDFs);
+	msdfStagedCPUImages.resize(maxMSDFs);
 }
 
 void DrawResourcesFiller::drawPolyline(const CPolylineBase& polyline, const LineStyleInfo& lineStyleInfo, SIntendedSubmitInfo& intendedNextSubmit)
@@ -373,7 +376,7 @@ bool DrawResourcesFiller::finalizeAllCopiesToGPU(SIntendedSubmitInfo& intendedNe
 	bool success = true;
 	flushDrawObjects();
 	success &= finalizeBufferCopies(intendedNextSubmit);
-	success &= finalizeTextureCopies(intendedNextSubmit);
+	success &= finalizeMSDFImagesCopies(intendedNextSubmit);
 	return success;
 }
 
@@ -488,13 +491,8 @@ bool DrawResourcesFiller::finalizeBufferCopies(SIntendedSubmitInfo& intendedNext
 	return true;
 }
 
-bool DrawResourcesFiller::finalizeTextureCopies(SIntendedSubmitInfo& intendedNextSubmit)
+bool DrawResourcesFiller::finalizeMSDFImagesCopies(SIntendedSubmitInfo& intendedNextSubmit)
 {
-	msdfTextureArrayIndicesUsed.clear(); // clear msdf textures used in the frame, because the frame finished and called this function.
-
-	if (!msdfTextureCopies.size() && m_hasInitializedMSDFTextureArrays) // even if the textureCopies are empty, we want to continue if not initialized yet so that the layout of all layers become READ_ONLY_OPTIMAL
-		return true; // yay successfully copied nothing
-
 	auto* cmdBuffInfo = intendedNextSubmit.getCommandBufferForRecording();
 	
 	if (cmdBuffInfo)
@@ -533,21 +531,20 @@ bool DrawResourcesFiller::finalizeTextureCopies(SIntendedSubmitInfo& intendedNex
 
 		// Do the copies and advance the iterator.
 		// this is the pattern we use for iterating when entries will get erased if processed successfully, but may get skipped for later.
-		auto oit = msdfTextureCopies.begin();
-		for (auto iit = msdfTextureCopies.begin(); iit != msdfTextureCopies.end(); iit++)
+		for (uint32_t i = 0u; i < msdfStagedCPUImages.size(); ++i)
 		{
-			bool copySuccess = true;
-			if (iit->image && iit->index < msdfImage->getCreationParameters().arrayLayers)
+			auto& stagedMSDF = msdfStagedCPUImages[i];
+			if (stagedMSDF.image && i < msdfImage->getCreationParameters().arrayLayers)
 			{
-				for (uint32_t mip = 0; mip < iit->image->getCreationParameters().mipLevels; mip++)
+				for (uint32_t mip = 0; mip < stagedMSDF.image->getCreationParameters().mipLevels; mip++)
 				{
-					auto mipImageRegion = iit->image->getRegion(mip, core::vectorSIMDu32(0u, 0u));
+					auto mipImageRegion = stagedMSDF.image->getRegion(mip, core::vectorSIMDu32(0u, 0u));
 					if (mipImageRegion)
 					{
 						asset::IImage::SBufferCopy region = {};
 						region.imageSubresource.aspectMask = asset::IImage::EAF_COLOR_BIT;
 						region.imageSubresource.mipLevel = mipImageRegion->imageSubresource.mipLevel;
-						region.imageSubresource.baseArrayLayer = iit->index;
+						region.imageSubresource.baseArrayLayer = i;
 						region.imageSubresource.layerCount = 1u;
 						region.bufferOffset = 0u;
 						region.bufferRowLength = mipImageRegion->getExtent().width;
@@ -555,46 +552,30 @@ bool DrawResourcesFiller::finalizeTextureCopies(SIntendedSubmitInfo& intendedNex
 						region.imageExtent = mipImageRegion->imageExtent;
 						region.imageOffset = { 0u, 0u, 0u };
 
-						auto buffer = reinterpret_cast<uint8_t*>(iit->image->getBuffer()->getPointer());
+						auto buffer = reinterpret_cast<uint8_t*>(stagedMSDF.image->getBuffer()->getPointer());
 						auto bufferOffset = mipImageRegion->bufferOffset;
 
-						if (!m_utilities->updateImageViaStagingBuffer(
+						stagedMSDF.uploadedToGPU = m_utilities->updateImageViaStagingBuffer(
 							intendedNextSubmit,
 							buffer + bufferOffset,
 							nbl::ext::TextRendering::TextRenderer::MSDFTextureFormat,
 							msdfImage.get(),
 							IImage::LAYOUT::TRANSFER_DST_OPTIMAL,
-							{ &region, &region + 1 }))
-						{
-							// TODO: Log which mip failed
-							copySuccess = false;
-						}
+							{ &region, &region + 1 });
 					}
 					else
 					{
-						// TODO: Log
-						copySuccess = false;
+						assert(false);
+						stagedMSDF.uploadedToGPU = false;
 					}
 				}
 			}
 			else
 			{
 				assert(false);
-				copySuccess = false;
-			}
-
-			if (!copySuccess)
-			{
-				// we move the failed copy to the oit and advance it
-				if (oit != iit)
-					*oit = *iit;
-				oit++;
+				stagedMSDF.uploadedToGPU = false;
 			}
 		}
-		// trim
-		const auto newSize = std::distance(msdfTextureCopies.begin(), oit);
-		_NBL_DEBUG_BREAK_IF(newSize != 0u); // we had failed copies
-		msdfTextureCopies.resize(newSize);
 
 		// preparing msdfs for use
 		image_barrier_t afterTransferImageBarrier[] =
@@ -1169,12 +1150,12 @@ uint32_t DrawResourcesFiller::addMSDFTexture(const MSDFInputInfo& msdfInput, cor
 
 	// TextureReferences hold the semaValue related to the "scratch semaphore" in IntendedSubmitInfo
 	// Every single submit increases this value by 1
-	// The reason for hiolding on to the lastUsedSema is deferred dealloc, which we call in the case of eviction, making sure we get rid of the entry inside the allocator only when the texture is done being used
+	// The reason for holding on to the lastUsedSema is deferred dealloc, which we call in the case of eviction, making sure we get rid of the entry inside the allocator only when the texture is done being used
 	const auto nextSemaSignal = intendedNextSubmit.getFutureScratchSemaphore();
 
 	auto evictionCallback = [&](const MSDFReference& evicted)
 	{
-		if (msdfTextureArrayIndicesUsed.contains(evicted.alloc_idx)) 
+		if (msdfStagedCPUImages[evicted.alloc_idx].usedThisFrame)
 		{
 			// Dealloc once submission is finished
 			msdfTextureArrayIndexAllocator->multi_deallocate(1u, &evicted.alloc_idx, nextSemaSignal);
@@ -1187,6 +1168,7 @@ uint32_t DrawResourcesFiller::addMSDFTexture(const MSDFInputInfo& msdfInput, cor
 			// We didn't use it this frame, so it's safe to dealloc now, withou needing to "overflow" submit
 			msdfTextureArrayIndexAllocator->multi_deallocate(1u, &evicted.alloc_idx);
 		}
+		msdfStagedCPUImages[evicted.alloc_idx].evict();
 	};
 	
 	// We pass nextSemaValue instead of constructing a new MSDFReference and passing it into `insert` that's because we might get a cache hit and only update the value of the nextSema
@@ -1201,8 +1183,9 @@ uint32_t DrawResourcesFiller::addMSDFTexture(const MSDFInputInfo& msdfInput, cor
 
 		if (inserted->alloc_idx != IndexAllocator::AddressAllocator::invalid_address)
 		{
-			// We queue copy and finalize all on `finalizeTextureCopies` function called before draw calls to make sure it's in mem
-			msdfTextureCopies.push_back({ .image = std::move(cpuImage), .index = inserted->alloc_idx });
+			// We stage copy, finalizeMSDFImagesCopies will push it into GPU
+			msdfStagedCPUImages[inserted->alloc_idx].image = std::move(cpuImage);
+			msdfStagedCPUImages[inserted->alloc_idx].uploadedToGPU = false;
 		}
 		else
 		{
@@ -1213,7 +1196,22 @@ uint32_t DrawResourcesFiller::addMSDFTexture(const MSDFInputInfo& msdfInput, cor
 	
 	assert(inserted->alloc_idx != InvalidTextureIdx); // shouldn't happen, because we're using LRU cache, so worst case eviction will happen + multi-deallocate and next next multi_allocate should definitely succeed
 	if (inserted->alloc_idx != InvalidTextureIdx)
-		msdfTextureArrayIndicesUsed.emplace(inserted->alloc_idx);
+	{
+		msdfStagedCPUImages[inserted->alloc_idx].usedThisFrame = true;
+	}
 
 	return inserted->alloc_idx;
-}
\ No newline at end of file
+}
+
+void DrawResourcesFiller::flushDrawObjects()
+{
+	if (resourcesCollection.drawObjects.getCount() > drawObjectsFlushedToDrawCalls)
+	{
+		DrawCallData drawCall = {};
+		drawCall.isDTMRendering = false;
+		drawCall.drawObj.drawObjectStart = drawObjectsFlushedToDrawCalls;
+		drawCall.drawObj.drawObjectCount = resourcesCollection.drawObjects.getCount() - drawObjectsFlushedToDrawCalls;
+		drawCalls.push_back(drawCall);
+		drawObjectsFlushedToDrawCalls = resourcesCollection.drawObjects.getCount();
+	}
+}
diff --git a/62_CAD/DrawResourcesFiller.h b/62_CAD/DrawResourcesFiller.h
index 049299974..fb4f2c97a 100644
--- a/62_CAD/DrawResourcesFiller.h
+++ b/62_CAD/DrawResourcesFiller.h
@@ -218,6 +218,7 @@ struct DrawResourcesFiller
 		resetCustomClipRects();
 		resetLineStyles();
 		resetDTMSettings();
+		resetMSDFsUsageState();
 
 		drawObjectsFlushedToDrawCalls = 0ull;
 		drawCalls.clear();
@@ -261,7 +262,7 @@ struct DrawResourcesFiller
 	/// For advanced use only, (passed to shaders for them to know if we overflow-submitted in the middle if a main obj
 	uint32_t getActiveMainObjectIndex() const { return activeMainObjectIndex; }
 
-	// TODO: Remove these later, these are for multiple draw calls instead of a single one.
+	// NOTE: Most probably Going to get removed soon with a single draw call in GPU-driven rendering
 	struct DrawCallData
 	{
 		union
@@ -282,37 +283,29 @@ struct DrawResourcesFiller
 		bool isDTMRendering;
 	};
 
-	uint64_t drawObjectsFlushedToDrawCalls = 0ull;
+	const std::vector<DrawCallData>& getDrawCalls() const { return drawCalls; }
 
-	void flushDrawObjects()
+	// ! This is all the textures and buffers that were staged on CPU and eventually copied to GPU in a single submit
+	// ! This data is prepped and ready to be consumed by GPU with no further transformations applied on the data.
+	// ! You can back this up,  and replay your scene without having to traverse your scene and do AddXXX, DrawXXX all over again.
+	struct DrawResourcesCache
 	{
-		if (resourcesCollection.drawObjects.getCount() > drawObjectsFlushedToDrawCalls)
-		{
-			DrawCallData drawCall = {};
-			drawCall.isDTMRendering = false;
-			drawCall.drawObj.drawObjectStart = drawObjectsFlushedToDrawCalls;
-			drawCall.drawObj.drawObjectCount = resourcesCollection.drawObjects.getCount() - drawObjectsFlushedToDrawCalls;
-			drawCalls.push_back(drawCall);
-			drawObjectsFlushedToDrawCalls = resourcesCollection.drawObjects.getCount();
-		}
-	}
-
-	std::vector<DrawCallData> drawCalls; // either dtms or objects
+		// TODO: Resources Colletion
+		// TODO: MSDFs Staging Cache
+		// TODO: Draw Calls Data
+		// TODO: Get total memory consumption
+	};
 
+	// TODO: Backup which gives DrawResourcesCache
+	// TODO: Restore which gets DrawResourcesCache
 
 protected:
-	
-	struct MSDFTextureCopy
-	{
-		core::smart_refctd_ptr<ICPUImage> image;
-		uint32_t index;
-	};
 
 	SubmitFunc submitDraws;
 	
 	bool finalizeBufferCopies(SIntendedSubmitInfo& intendedNextSubmit);
 
-	bool finalizeTextureCopies(SIntendedSubmitInfo& intendedNextSubmit);
+	bool finalizeMSDFImagesCopies(SIntendedSubmitInfo& intendedNextSubmit);
 
 	const size_t calculateRemainingResourcesSize() const;
 
@@ -420,6 +413,12 @@ struct DrawResourcesFiller
 		resourcesCollection.dtmSettings.vector.clear();
 		activeDTMSettingsIndex = InvalidDTMSettingsIdx;
 	}
+	
+	void resetMSDFsUsageState()
+	{
+		for (auto& stagedMSDF : msdfStagedCPUImages)
+			stagedMSDF.usedThisFrame = false;
+	}
 
 	// MSDF Hashing and Caching Internal Functions 
 	enum class MSDFType : uint8_t
@@ -511,6 +510,13 @@ struct DrawResourcesFiller
 	// If you haven't created a mainObject yet, then pass InvalidMainObjectIdx
 	uint32_t addMSDFTexture(const MSDFInputInfo& msdfInput, core::smart_refctd_ptr<ICPUImage>&& cpuImage, SIntendedSubmitInfo& intendedNextSubmit);
 	
+	// Flushes Current Draw Call and adds to drawCalls
+	void flushDrawObjects();
+
+	// DrawCalls Data
+	uint64_t drawObjectsFlushedToDrawCalls = 0ull;
+	std::vector<DrawCallData> drawCalls; // either dtms or objects
+
 	// ResourcesCollection and packed into GPUBuffer
 	ResourcesCollection resourcesCollection;
 	nbl::core::smart_refctd_ptr<IGPUBuffer> resourcesGPUBuffer;
@@ -529,8 +535,8 @@ struct DrawResourcesFiller
 
 	MainObjectType activeMainObjectType;
 	TransformationType activeMainObjectTransformationType;
-	uint32_t activeMainObjectIndex = InvalidMainObjectIdx;
 
+	uint32_t activeMainObjectIndex = InvalidMainObjectIdx;
 	// The ClipRects & Projections are stack, because user can push/pop ClipRects & Projections in any order
 	std::deque<float64_t3x3> activeProjections; // stack of projections stored so we can resubmit them if geometry buffer got reset.
 	std::deque<uint32_t> activeProjectionIndices; // stack of projection gpu addresses in geometry buffer. to keep track of them in push/pops
@@ -538,16 +544,32 @@ struct DrawResourcesFiller
 	std::deque<WorldClipRect> activeClipRects; // stack of clips stored so we can resubmit them if geometry buffer got reset.
 	std::deque<uint32_t> activeClipRectIndices; // stack of clips gpu addresses in geometry buffer. to keep track of them in push/pops
 
-	// MSDF
+	struct MSDFStagedCPUImage
+	{
+		core::smart_refctd_ptr<ICPUImage> image;
+		bool uploadedToGPU : 1u;
+		// TODO: Use frame counter instead, generalize struct to all textures probably, DONT try to abuse scratchSema.nextSignal as frame tracker, because there can be "cached" draws where no submits happen.
+		bool usedThisFrame : 1u;
+
+		bool isValid() const { return image.get() != nullptr; }
+		void evict()
+		{
+			image = nullptr;
+			uploadedToGPU = false;
+			usedThisFrame = false;
+		}
+	};
+
 	GetGlyphMSDFTextureFunc getGlyphMSDF;
 	GetHatchFillPatternMSDFTextureFunc getHatchFillPatternMSDF;
 
 	using MSDFsLRUCache = core::LRUCache<MSDFInputInfo, MSDFReference, MSDFInputInfoHash>;
 	smart_refctd_ptr<IGPUImageView>		msdfTextureArray; // view to the resource holding all the msdfs in it's layers
 	smart_refctd_ptr<IndexAllocator>	msdfTextureArrayIndexAllocator;
-	std::set<uint32_t>					msdfTextureArrayIndicesUsed = {}; // indices in the msdf texture array allocator that have been used in the current frame // TODO: make this a dynamic bitset
-	std::vector<MSDFTextureCopy>		msdfTextureCopies = {}; // queued up texture copies
 	std::unique_ptr<MSDFsLRUCache>		msdfLRUCache; // LRU Cache to evict Least Recently Used in case of overflow
+
+	// TODO: Maybe move this to Resources Collection?
+	std::vector<MSDFStagedCPUImage>		msdfStagedCPUImages = {}; // cached cpu imaged + their status, size equals to LRUCache size
 	static constexpr asset::E_FORMAT	MSDFTextureFormat = asset::E_FORMAT::EF_R8G8B8A8_SNORM;
 
 	bool m_hasInitializedMSDFTextureArrays = false;
diff --git a/62_CAD/shaders/globals.hlsl b/62_CAD/shaders/globals.hlsl
index a71c920a6..69346ee14 100644
--- a/62_CAD/shaders/globals.hlsl
+++ b/62_CAD/shaders/globals.hlsl
@@ -501,6 +501,7 @@ NBL_CONSTEXPR uint32_t InvalidMainObjectIdx = MaxIndexableMainObjects;
 NBL_CONSTEXPR uint32_t InvalidCustomProjectionIndex = nbl::hlsl::numeric_limits<uint32_t>::max;
 NBL_CONSTEXPR uint32_t InvalidCustomClipRectIndex = nbl::hlsl::numeric_limits<uint32_t>::max;
 NBL_CONSTEXPR uint32_t InvalidTextureIdx = nbl::hlsl::numeric_limits<uint32_t>::max;
+NBL_CONSTEXPR uint32_t InvalidMSDFImageIdx = nbl::hlsl::numeric_limits<uint32_t>::max;
 
 // Hatches
 NBL_CONSTEXPR MajorAxis SelectedMajorAxis = MajorAxis::MAJOR_Y;

From 28ea75fbc40b4687d92bbb1ac9b67c6430e90b86 Mon Sep 17 00:00:00 2001
From: keptsecret <sorchon@gmail.com>
Date: Mon, 5 May 2025 17:14:37 +0700
Subject: [PATCH 225/529] simplify scratch,proxy to just scalar types

---
 .../benchmarkWorkgroup.comp.hlsl              |  2 +-
 .../app_resources/testWorkgroup.comp.hlsl     |  4 ++--
 .../app_resources/workgroupCommon.hlsl        | 24 +++++--------------
 3 files changed, 9 insertions(+), 21 deletions(-)

diff --git a/74a_Workgroup2ScanTest/app_resources/benchmarkWorkgroup.comp.hlsl b/74a_Workgroup2ScanTest/app_resources/benchmarkWorkgroup.comp.hlsl
index f758f6ac8..e20e528d7 100644
--- a/74a_Workgroup2ScanTest/app_resources/benchmarkWorkgroup.comp.hlsl
+++ b/74a_Workgroup2ScanTest/app_resources/benchmarkWorkgroup.comp.hlsl
@@ -30,7 +30,7 @@ struct DataProxy
     dtype_t outputVal;
 };
 
-static ScratchProxy<config_t> arithmeticAccessor;
+static ScratchProxy arithmeticAccessor;
 
 template<class Binop, class device_capabilities>
 struct operation_t
diff --git a/74a_Workgroup2ScanTest/app_resources/testWorkgroup.comp.hlsl b/74a_Workgroup2ScanTest/app_resources/testWorkgroup.comp.hlsl
index ac4104279..f9453a165 100644
--- a/74a_Workgroup2ScanTest/app_resources/testWorkgroup.comp.hlsl
+++ b/74a_Workgroup2ScanTest/app_resources/testWorkgroup.comp.hlsl
@@ -24,7 +24,7 @@ struct DataProxy
     }
 };
 
-static ScratchProxy<config_t> arithmeticAccessor;
+static ScratchProxy arithmeticAccessor;
 
 template<class Binop, class device_capabilities>
 struct operation_t
@@ -35,7 +35,7 @@ struct operation_t
     void operator()()
     {
         DataProxy<config_t,Binop> dataAccessor;
-        nbl::hlsl::OPERATION<config_t,binop_base_t,device_capabilities>::template __call<DataProxy<config_t,Binop>, ScratchProxy<config_t> >(dataAccessor,arithmeticAccessor);
+        nbl::hlsl::OPERATION<config_t,binop_base_t,device_capabilities>::template __call<DataProxy<config_t,Binop>, ScratchProxy>(dataAccessor,arithmeticAccessor);
         // we barrier before because we alias the accessors for Binop
         arithmeticAccessor.workgroupExecutionAndMemoryBarrier();
     }
diff --git a/74a_Workgroup2ScanTest/app_resources/workgroupCommon.hlsl b/74a_Workgroup2ScanTest/app_resources/workgroupCommon.hlsl
index e60856bf8..7e8512e72 100644
--- a/74a_Workgroup2ScanTest/app_resources/workgroupCommon.hlsl
+++ b/74a_Workgroup2ScanTest/app_resources/workgroupCommon.hlsl
@@ -42,33 +42,21 @@ bool canStore();
 #error "Define SUBGROUP_SIZE_LOG2!"
 #endif
 
-groupshared vector<uint32_t, config_t::ItemsPerInvocation_1> scratch[config_t::SubgroupSize];  // final (level 1) scan needs to fit in one subgroup exactly
+// final (level 1/2) scan needs to fit in one subgroup exactly
+groupshared uint32_t scratch[config_t::SubgroupsPerVirtualWorkgroupLog2*config_t::ItemsPerInvocation_1];
 
-template<class Config>
 struct ScratchProxy
 {
-    using scalar_t = uint32_t;
-    using stype_t = vector<uint32_t, Config::ItemsPerInvocation_1>;
-
-    stype_t get(const uint32_t ix)
+    void get(const uint32_t ix, NBL_REF_ARG(uint32_t) value)
     {
-        return scratch[ix];
+        value = scratch[ix];
     }
-    void set(const uint32_t ix, const stype_t value)
+    void set(const uint32_t ix, const uint32_t value)
     {
         scratch[ix] = value;
     }
 
-    scalar_t getByComponent(const uint32_t ix)
-    {
-        return scratch[ix/Config::ItemsPerInvocation_1][ix&(Config::ItemsPerInvocation_1-1)];
-    }
-    void setByComponent(const uint32_t ix, const scalar_t value)
-    {
-        scratch[ix/Config::ItemsPerInvocation_1][ix&(Config::ItemsPerInvocation_1-1)] = value;
-    }
-
-    stype_t atomicOr(const uint32_t ix, const stype_t value)
+    uint32_t atomicOr(const uint32_t ix, const uint32_t value)
     {
         return nbl::hlsl::glsl::atomicOr(scratch[ix],value);
     }

From 4ec49382bee48a8cf5b1d5f5916a40c350c354e3 Mon Sep 17 00:00:00 2001
From: Erfan Ahmadi <ahmadierfan99@gmail.com>
Date: Mon, 5 May 2025 15:07:50 +0400
Subject: [PATCH 226/529] CacheAndReplay Feature

---
 62_CAD/DrawResourcesFiller.cpp | 109 ++++++++++++++++++++++-------
 62_CAD/DrawResourcesFiller.h   | 108 ++++++++++++++++++-----------
 62_CAD/main.cpp                | 121 +++++++++++++++++++++------------
 3 files changed, 230 insertions(+), 108 deletions(-)

diff --git a/62_CAD/DrawResourcesFiller.cpp b/62_CAD/DrawResourcesFiller.cpp
index 7bfb92cea..c83055f0e 100644
--- a/62_CAD/DrawResourcesFiller.cpp
+++ b/62_CAD/DrawResourcesFiller.cpp
@@ -371,15 +371,39 @@ void DrawResourcesFiller::_test_addImageObject(float64_t2 topLeftPos, float32_t2
 	endMainObject();
 }
 
-bool DrawResourcesFiller::finalizeAllCopiesToGPU(SIntendedSubmitInfo& intendedNextSubmit)
+bool DrawResourcesFiller::pushAllUploads(SIntendedSubmitInfo& intendedNextSubmit)
 {
+	if (!intendedNextSubmit.valid())
+	{
+		// It is a caching submit without command buffer, just for the purpose of accumulation of staging resources
+		// In that case we don't push any uploads (i.e. we don't record any copy commmand in active command buffer, because there is no active command buffer)
+		return false;
+	}
+
 	bool success = true;
-	flushDrawObjects();
-	success &= finalizeBufferCopies(intendedNextSubmit);
-	success &= finalizeMSDFImagesCopies(intendedNextSubmit);
+	if (currentReplayCache)
+	{
+		// This means we're in a replay cache scope, use the replay cache to push to GPU instead of internal accumulation
+		success &= pushBufferUploads(intendedNextSubmit, currentReplayCache->resourcesCollection);
+		success &= pushMSDFImagesUploads(intendedNextSubmit, currentReplayCache->msdfStagedCPUImages);
+	}
+	else
+	{
+		flushDrawObjects();
+		success &= pushBufferUploads(intendedNextSubmit, resourcesCollection);
+		success &= pushMSDFImagesUploads(intendedNextSubmit, msdfStagedCPUImages);
+	}
 	return success;
 }
 
+const DrawResourcesFiller::ResourcesCollection& DrawResourcesFiller::getResourcesCollection() const
+{
+	if (currentReplayCache)
+		return currentReplayCache->resourcesCollection;
+	else
+		return resourcesCollection;
+}
+
 void DrawResourcesFiller::setActiveLineStyle(const LineStyleInfo& lineStyle)
 {
 	activeLineStyle = lineStyle;
@@ -435,7 +459,50 @@ void DrawResourcesFiller::popCustomClipRect()
 	activeClipRectIndices.pop_back();
 }
 
-bool DrawResourcesFiller::finalizeBufferCopies(SIntendedSubmitInfo& intendedNextSubmit)
+/// For advanced use only, (passed to shaders for them to know if we overflow-submitted in the middle if a main obj
+uint32_t DrawResourcesFiller::getActiveMainObjectIndex() const
+{
+	if (currentReplayCache)
+		return currentReplayCache->activeMainObjectIndex;
+	else
+		return activeMainObjectIndex;
+}
+
+const std::vector<DrawResourcesFiller::DrawCallData>& DrawResourcesFiller::getDrawCalls() const
+{
+	if (currentReplayCache)
+		return currentReplayCache->drawCallsData;
+	else
+		return drawCalls;
+}
+
+std::unique_ptr<DrawResourcesFiller::ReplayCache> DrawResourcesFiller::createReplayCache()
+{
+	flushDrawObjects();
+	std::unique_ptr<ReplayCache> ret = std::unique_ptr<ReplayCache>(new ReplayCache);
+	ret->resourcesCollection = resourcesCollection;
+	ret->msdfStagedCPUImages = msdfStagedCPUImages;
+	for (auto& stagedMSDF : ret->msdfStagedCPUImages)
+	{
+		stagedMSDF.uploadedToGPU = false; // to trigger upload for all msdf functions again.
+		stagedMSDF.usedThisFrame = false;
+	}
+	ret->drawCallsData = drawCalls;
+	ret->activeMainObjectIndex = activeMainObjectIndex;
+	return ret;
+}
+
+void DrawResourcesFiller::setReplayCache(ReplayCache* cache)
+{
+	currentReplayCache = cache;
+}
+
+void DrawResourcesFiller::unsetReplayCache()
+{
+	currentReplayCache = nullptr;
+}
+
+bool DrawResourcesFiller::pushBufferUploads(SIntendedSubmitInfo& intendedNextSubmit, ResourcesCollection& resources)
 {
 	copiedResourcesSize = 0ull;
 
@@ -479,19 +546,19 @@ bool DrawResourcesFiller::finalizeBufferCopies(SIntendedSubmitInfo& intendedNext
 			copiedResourcesSize += drawBuffer.getAlignedStorageSize();
 		};
 
-	copyCPUFilledDrawBuffer(resourcesCollection.lineStyles);
-	copyCPUFilledDrawBuffer(resourcesCollection.dtmSettings);
-	copyCPUFilledDrawBuffer(resourcesCollection.customProjections);
-	copyCPUFilledDrawBuffer(resourcesCollection.customClipRects);
-	copyCPUFilledDrawBuffer(resourcesCollection.mainObjects);
-	copyCPUFilledDrawBuffer(resourcesCollection.drawObjects);
-	copyCPUFilledDrawBuffer(resourcesCollection.indexBuffer);
-	copyCPUFilledDrawBuffer(resourcesCollection.geometryInfo);
+	copyCPUFilledDrawBuffer(resources.lineStyles);
+	copyCPUFilledDrawBuffer(resources.dtmSettings);
+	copyCPUFilledDrawBuffer(resources.customProjections);
+	copyCPUFilledDrawBuffer(resources.customClipRects);
+	copyCPUFilledDrawBuffer(resources.mainObjects);
+	copyCPUFilledDrawBuffer(resources.drawObjects);
+	copyCPUFilledDrawBuffer(resources.indexBuffer);
+	copyCPUFilledDrawBuffer(resources.geometryInfo);
 	
 	return true;
 }
 
-bool DrawResourcesFiller::finalizeMSDFImagesCopies(SIntendedSubmitInfo& intendedNextSubmit)
+bool DrawResourcesFiller::pushMSDFImagesUploads(SIntendedSubmitInfo& intendedNextSubmit, std::vector<MSDFStagedCPUImage>& stagedMSDFCPUImages)
 {
 	auto* cmdBuffInfo = intendedNextSubmit.getCommandBufferForRecording();
 	
@@ -531,9 +598,9 @@ bool DrawResourcesFiller::finalizeMSDFImagesCopies(SIntendedSubmitInfo& intended
 
 		// Do the copies and advance the iterator.
 		// this is the pattern we use for iterating when entries will get erased if processed successfully, but may get skipped for later.
-		for (uint32_t i = 0u; i < msdfStagedCPUImages.size(); ++i)
+		for (uint32_t i = 0u; i < stagedMSDFCPUImages.size(); ++i)
 		{
-			auto& stagedMSDF = msdfStagedCPUImages[i];
+			auto& stagedMSDF = stagedMSDFCPUImages[i];
 			if (stagedMSDF.image && i < msdfImage->getCreationParameters().arrayLayers)
 			{
 				for (uint32_t mip = 0; mip < stagedMSDF.image->getCreationParameters().mipLevels; mip++)
@@ -572,7 +639,6 @@ bool DrawResourcesFiller::finalizeMSDFImagesCopies(SIntendedSubmitInfo& intended
 			}
 			else
 			{
-				assert(false);
 				stagedMSDF.uploadedToGPU = false;
 			}
 		}
@@ -624,7 +690,6 @@ const size_t DrawResourcesFiller::calculateRemainingResourcesSize() const
 
 void DrawResourcesFiller::submitCurrentDrawObjectsAndReset(SIntendedSubmitInfo& intendedNextSubmit, uint32_t& mainObjectIndex)
 {
-	finalizeAllCopiesToGPU(intendedNextSubmit);
 	submitDraws(intendedNextSubmit);
 	reset(); // resets everything, things referenced through mainObj and other shit will be pushed again through acquireXXX_SubmitIfNeeded
 	mainObjectIndex = acquireActiveMainObjectIndex_SubmitIfNeeded(intendedNextSubmit); // it will be 0 because it's first mainObjectIndex after reset and invalidation
@@ -780,7 +845,6 @@ uint32_t DrawResourcesFiller::acquireActiveMainObjectIndex_SubmitIfNeeded(SInten
 	if (needToOverflowSubmit)
 	{
 		// failed to fit into remaining resources mem or exceeded max indexable mainobj
-		finalizeAllCopiesToGPU(intendedNextSubmit);
 		submitDraws(intendedNextSubmit);
 		reset(); // resets everything! be careful!
 	}
@@ -803,7 +867,6 @@ uint32_t DrawResourcesFiller::addLineStyle_SubmitIfNeeded(const LineStyleInfo& l
 	if (outLineStyleIdx == InvalidStyleIdx)
 	{
 		// There wasn't enough resource memory remaining to fit a single LineStyle
-		finalizeAllCopiesToGPU(intendedNextSubmit);
 		submitDraws(intendedNextSubmit);
 		reset(); // resets everything! be careful!
 
@@ -821,7 +884,6 @@ uint32_t DrawResourcesFiller::addDTMSettings_SubmitIfNeeded(const DTMSettingsInf
 	if (outDTMSettingIdx == InvalidDTMSettingsIdx)
 	{
 		// There wasn't enough resource memory remaining to fit dtmsettings struct + 2 linestyles structs.
-		finalizeAllCopiesToGPU(intendedNextSubmit);
 		submitDraws(intendedNextSubmit);
 		reset(); // resets everything! be careful!
 
@@ -839,7 +901,6 @@ uint32_t DrawResourcesFiller::addCustomProjection_SubmitIfNeeded(const float64_t
 
 	if (!enoughMem)
 	{
-		finalizeAllCopiesToGPU(intendedNextSubmit);
 		submitDraws(intendedNextSubmit);
 		reset(); // resets everything! be careful!
 	}
@@ -856,7 +917,6 @@ uint32_t DrawResourcesFiller::addCustomClipRect_SubmitIfNeeded(const WorldClipRe
 
 	if (!enoughMem)
 	{
-		finalizeAllCopiesToGPU(intendedNextSubmit);
 		submitDraws(intendedNextSubmit);
 		reset(); // resets everything! be careful!
 	}
@@ -1159,7 +1219,6 @@ uint32_t DrawResourcesFiller::addMSDFTexture(const MSDFInputInfo& msdfInput, cor
 		{
 			// Dealloc once submission is finished
 			msdfTextureArrayIndexAllocator->multi_deallocate(1u, &evicted.alloc_idx, nextSemaSignal);
-			finalizeAllCopiesToGPU(intendedNextSubmit);
 			submitDraws(intendedNextSubmit);
 			reset(); // resets everything, things referenced through mainObj and other shit will be pushed again through acquireXXX_SubmitIfNeeded
 		} 
@@ -1183,7 +1242,7 @@ uint32_t DrawResourcesFiller::addMSDFTexture(const MSDFInputInfo& msdfInput, cor
 
 		if (inserted->alloc_idx != IndexAllocator::AddressAllocator::invalid_address)
 		{
-			// We stage copy, finalizeMSDFImagesCopies will push it into GPU
+			// We stage copy, pushMSDFImagesUploads will push it into GPU
 			msdfStagedCPUImages[inserted->alloc_idx].image = std::move(cpuImage);
 			msdfStagedCPUImages[inserted->alloc_idx].uploadedToGPU = false;
 		}
diff --git a/62_CAD/DrawResourcesFiller.h b/62_CAD/DrawResourcesFiller.h
index fb4f2c97a..a10379e1a 100644
--- a/62_CAD/DrawResourcesFiller.h
+++ b/62_CAD/DrawResourcesFiller.h
@@ -86,6 +86,7 @@ struct DrawResourcesFiller
 	};
 
 	/// @brief struct to hold all resources
+	// TODO: rename to staged resources buffers or something like that
 	struct ResourcesCollection
 	{
 		// auto-submission level 0 resources (settings that mainObj references)
@@ -204,12 +205,12 @@ struct DrawResourcesFiller
 		float32_t2 size,
 		float32_t rotation,
 		SIntendedSubmitInfo& intendedNextSubmit);
-
-	/// @brief call this function before submitting to ensure all resources are copied
+	
+	/// @brief call this function before submitting to ensure all buffer and textures resourcesCollection requested via drawing calls are copied to GPU
 	/// records copy command into intendedNextSubmit's active command buffer and might possibly submits if fails allocation on staging upload memory.
-	bool finalizeAllCopiesToGPU(SIntendedSubmitInfo& intendedNextSubmit);
+	bool pushAllUploads(SIntendedSubmitInfo& intendedNextSubmit);
 
-	/// @brief  resets resources buffers
+	/// @brief  resets staging buffers and images
 	void reset()
 	{
 		resetDrawObjects();
@@ -225,7 +226,7 @@ struct DrawResourcesFiller
 	}
 
 	/// @brief collection of all the resources that will eventually be reserved or copied to in the resourcesGPUBuffer, will be accessed via individual BDA pointers in shaders
-	const ResourcesCollection& getResourcesCollection() const { return resourcesCollection; }
+	const ResourcesCollection& getResourcesCollection() const;
 
 	/// @brief buffer containing all non-texture type resources
 	nbl::core::smart_refctd_ptr<IGPUBuffer> getResourcesGPUBuffer() const { return resourcesGPUBuffer; }
@@ -260,7 +261,23 @@ struct DrawResourcesFiller
 	}
 
 	/// For advanced use only, (passed to shaders for them to know if we overflow-submitted in the middle if a main obj
-	uint32_t getActiveMainObjectIndex() const { return activeMainObjectIndex; }
+	uint32_t getActiveMainObjectIndex() const;
+
+	struct MSDFStagedCPUImage
+	{
+		core::smart_refctd_ptr<ICPUImage> image;
+		bool uploadedToGPU : 1u;
+		// TODO: Use frame counter instead, generalize struct to all textures probably, DONT try to abuse scratchSema.nextSignal as frame tracker, because there can be "cached" draws where no submits happen.
+		bool usedThisFrame : 1u;
+
+		bool isValid() const { return image.get() != nullptr; }
+		void evict()
+		{
+			image = nullptr;
+			uploadedToGPU = false;
+			usedThisFrame = false;
+		}
+	};
 
 	// NOTE: Most probably Going to get removed soon with a single draw call in GPU-driven rendering
 	struct DrawCallData
@@ -283,29 +300,58 @@ struct DrawResourcesFiller
 		bool isDTMRendering;
 	};
 
-	const std::vector<DrawCallData>& getDrawCalls() const { return drawCalls; }
-
-	// ! This is all the textures and buffers that were staged on CPU and eventually copied to GPU in a single submit
-	// ! This data is prepped and ready to be consumed by GPU with no further transformations applied on the data.
-	// ! You can back this up,  and replay your scene without having to traverse your scene and do AddXXX, DrawXXX all over again.
-	struct DrawResourcesCache
+	const std::vector<DrawCallData>& getDrawCalls() const;
+
+	/// @brief Stores all CPU-side resources that were staged and prepared for a single GPU submission.
+	///
+	/// *** This cache includes anything used or referenced from DrawResourcesFiller in the Draw Submit:
+	/// - Buffer data (geometry, indices, etc.)
+	/// - MSDF CPU images
+	/// - Draw call metadata
+	/// - Active MainObject Index --> this is another state of the submit that we need to store 
+	///
+	/// The data is fully preprocessed and ready to be pushed to the GPU with no further transformation.
+	/// This enables efficient replays without traversing or re-generating scene content.
+	struct ReplayCache
 	{
-		// TODO: Resources Colletion
-		// TODO: MSDFs Staging Cache
-		// TODO: Draw Calls Data
-		// TODO: Get total memory consumption
+		ResourcesCollection resourcesCollection;
+		std::vector<MSDFStagedCPUImage> msdfStagedCPUImages;
+		std::vector<DrawCallData> drawCallsData;
+		uint32_t activeMainObjectIndex = InvalidMainObjectIdx;
+		// TODO: non msdf general CPU Images
+		// TODO: Get total memory consumption for logging?
 	};
 
-	// TODO: Backup which gives DrawResourcesCache
-	// TODO: Restore which gets DrawResourcesCache
+	/// @brief Creates a snapshot of all currently staged CPU-side resourcesCollection for future replay or deferred submission.
+	/// 
+	/// @warning This cache corresponds to a **single intended GPU submit**. 
+	/// If your frame submission overflows into multiple submits due to staging memory limits or batching,
+	/// you are responsible for creating **multiple ReplayCache instances**, one per submit.
+	///
+	/// @return A heap-allocated ReplayCache containing a copy of all staged CPU-side resourcesCollection and draw call data.
+	std::unique_ptr<ReplayCache> createReplayCache();
+
+	/// @brief Redirects all subsequent resource upload and getters to use an external ReplayCache.
+	///
+	/// After calling this function, staging, resource getters, and upload mechanisms will pull data from the given ReplayCache
+	/// instead of the internal accumulation cache.
+	///
+	/// User is responsible for management of cache and making sure it's alive in the ReplayCache scope
+	void setReplayCache(ReplayCache* cache);
+	
+	/// @brief Reverts internal logic to use the default internal staging and resource accumulation cache.
+	/// Must be called once per corresponding `pushReplayCacheUse()`.
+	void unsetReplayCache();
 
 protected:
 
 	SubmitFunc submitDraws;
-	
-	bool finalizeBufferCopies(SIntendedSubmitInfo& intendedNextSubmit);
 
-	bool finalizeMSDFImagesCopies(SIntendedSubmitInfo& intendedNextSubmit);
+	/// @brief Records GPU copy commands for all staged buffer resourcesCollection into the active command buffer.
+	bool pushBufferUploads(SIntendedSubmitInfo& intendedNextSubmit, ResourcesCollection& resourcesCollection);
+	
+	/// @brief Records GPU copy commands for all staged msdf images into the active command buffer.
+	bool pushMSDFImagesUploads(SIntendedSubmitInfo& intendedNextSubmit, std::vector<MSDFStagedCPUImage>& stagedMSDFCPUImages);
 
 	const size_t calculateRemainingResourcesSize() const;
 
@@ -513,6 +559,9 @@ struct DrawResourcesFiller
 	// Flushes Current Draw Call and adds to drawCalls
 	void flushDrawObjects();
 
+	// Replay Cache override
+	ReplayCache* currentReplayCache = nullptr;
+
 	// DrawCalls Data
 	uint64_t drawObjectsFlushedToDrawCalls = 0ull;
 	std::vector<DrawCallData> drawCalls; // either dtms or objects
@@ -544,22 +593,6 @@ struct DrawResourcesFiller
 	std::deque<WorldClipRect> activeClipRects; // stack of clips stored so we can resubmit them if geometry buffer got reset.
 	std::deque<uint32_t> activeClipRectIndices; // stack of clips gpu addresses in geometry buffer. to keep track of them in push/pops
 
-	struct MSDFStagedCPUImage
-	{
-		core::smart_refctd_ptr<ICPUImage> image;
-		bool uploadedToGPU : 1u;
-		// TODO: Use frame counter instead, generalize struct to all textures probably, DONT try to abuse scratchSema.nextSignal as frame tracker, because there can be "cached" draws where no submits happen.
-		bool usedThisFrame : 1u;
-
-		bool isValid() const { return image.get() != nullptr; }
-		void evict()
-		{
-			image = nullptr;
-			uploadedToGPU = false;
-			usedThisFrame = false;
-		}
-	};
-
 	GetGlyphMSDFTextureFunc getGlyphMSDF;
 	GetHatchFillPatternMSDFTextureFunc getHatchFillPatternMSDF;
 
@@ -568,7 +601,6 @@ struct DrawResourcesFiller
 	smart_refctd_ptr<IndexAllocator>	msdfTextureArrayIndexAllocator;
 	std::unique_ptr<MSDFsLRUCache>		msdfLRUCache; // LRU Cache to evict Least Recently Used in case of overflow
 
-	// TODO: Maybe move this to Resources Collection?
 	std::vector<MSDFStagedCPUImage>		msdfStagedCPUImages = {}; // cached cpu imaged + their status, size equals to LRUCache size
 	static constexpr asset::E_FORMAT	MSDFTextureFormat = asset::E_FORMAT::EF_R8G8B8A8_SNORM;
 
diff --git a/62_CAD/main.cpp b/62_CAD/main.cpp
index 9ab67ffe2..c7fe04603 100644
--- a/62_CAD/main.cpp
+++ b/62_CAD/main.cpp
@@ -45,6 +45,7 @@ static constexpr bool DebugModeWireframe = false;
 static constexpr bool DebugRotatingViewProj = false;
 static constexpr bool FragmentShaderPixelInterlock = true;
 static constexpr bool LargeGeoTextureStreaming = true;
+static constexpr bool CacheAndReplay = false; // caches first frame resources (buffers and images) from DrawResourcesFiller  and replays in future frames, skiping CPU Logic
 
 enum class ExampleMode
 {
@@ -77,7 +78,7 @@ constexpr std::array<float, (uint32_t)ExampleMode::CASE_COUNT> cameraExtents =
 	10.0	// CASE_BUG
 };
 
-constexpr ExampleMode mode = ExampleMode::CASE_2;
+constexpr ExampleMode mode = ExampleMode::CASE_9;
 
 class Camera2D
 {
@@ -240,7 +241,7 @@ class CSwapchainResources : public ISimpleManagedSurface::ISwapchainResources
 			std::fill(m_framebuffers.begin(),m_framebuffers.end(),nullptr);
 		}
 
-		// For creating extra per-image or swapchain resources you might need
+		// For creating extra per-image or swapchain resourcesCollection you might need
 		virtual inline bool onCreateSwapchain_impl(const uint8_t qFam)
 		{
 			auto device = const_cast<ILogicalDevice*>(m_renderpass->getOriginDevice());
@@ -286,10 +287,10 @@ class ComputerAidedDesign final : public examples::SimpleWindowedApplication, pu
 	constexpr static uint32_t MaxSubmitsInFlight = 16u;
 public:
 
-	void allocateResources(uint32_t maxObjects)
+	void allocateResources()
 	{
 		drawResourcesFiller = DrawResourcesFiller(core::smart_refctd_ptr(m_utils), getGraphicsQueue());
-
+		
 		size_t bufferSize = 512u * 1024u * 1024u; // 512 MB
 		drawResourcesFiller.allocateResourcesBuffer(m_device.get(), bufferSize);
 		drawResourcesFiller.allocateMSDFTextures(m_device.get(), 256u, uint32_t2(MSDFSize, MSDFSize));
@@ -626,7 +627,7 @@ class ComputerAidedDesign final : public examples::SimpleWindowedApplication, pu
 	double dt = 0;
 	double m_timeElapsed = 0.0;
 	std::chrono::steady_clock::time_point lastTime;
-	uint32_t m_hatchDebugStep = 0u;
+	uint32_t m_hatchDebugStep = 10u;
 	E_HEIGHT_SHADING_MODE m_shadingModeExample = E_HEIGHT_SHADING_MODE::DISCRETE_VARIABLE_LENGTH_INTERVALS;
 
 	inline bool onAppInitialized(smart_refctd_ptr<ISystem>&& system) override
@@ -657,7 +658,7 @@ class ComputerAidedDesign final : public examples::SimpleWindowedApplication, pu
 		if (!m_surface->init(getGraphicsQueue(),std::move(scResources),{}))
 			return logFail("Could not initialize the Surface!");
 
-		allocateResources(1024 * 1024u);
+		allocateResources();
 
 		const bitflag<IGPUDescriptorSetLayout::SBinding::E_CREATE_FLAGS> bindlessTextureFlags =
 			IGPUDescriptorSetLayout::SBinding::E_CREATE_FLAGS::ECF_UPDATE_AFTER_BIND_BIT |
@@ -1089,6 +1090,14 @@ class ComputerAidedDesign final : public examples::SimpleWindowedApplication, pu
 			}
 		, m_logger.get());
 		
+		const bool isCachingDraw = CacheAndReplay && m_realFrameIx == 0u;
+		if (isCachingDraw)
+		{
+			SIntendedSubmitInfo invalidSubmit = {};
+			addObjects(invalidSubmit); // if any overflows happen here, it will add to our replay cache and not submit anything
+			replayCaches.push_back(drawResourcesFiller.createReplayCache());
+			finishedCachingDraw = true;
+		}
 
 		if (!beginFrameRender())
 			return;
@@ -1109,10 +1118,28 @@ class ComputerAidedDesign final : public examples::SimpleWindowedApplication, pu
 		IQueue::SSubmitInfo::SSemaphoreInfo waitSems[2u] = { acquired, prevFrameRendered };
 		m_intendedNextSubmit.waitSemaphores = waitSems;
 		
-		addObjects(m_intendedNextSubmit);
-		
+		if (CacheAndReplay)
+		{
+			// to size-1u because we only want to submit overflows here.
+			for (uint32_t i = 0u; i < replayCaches.size() - 1u; ++i)
+			{
+				drawResourcesFiller.setReplayCache(replayCaches[i].get());
+				submitDraws(m_intendedNextSubmit, true);
+				drawResourcesFiller.unsetReplayCache();
+			}
+			if (!replayCaches.empty())
+				drawResourcesFiller.setReplayCache(replayCaches.back().get());
+		}
+		else
+		{
+			addObjects(m_intendedNextSubmit);
+		}
+
 		endFrameRender(m_intendedNextSubmit);
 
+		if (CacheAndReplay)
+			drawResourcesFiller.unsetReplayCache();
+
 #ifdef BENCHMARK_TILL_FIRST_FRAME
 		if (!stopBenchamrkFlag)
 		{
@@ -1187,17 +1214,19 @@ class ComputerAidedDesign final : public examples::SimpleWindowedApplication, pu
 	
 	void submitDraws(SIntendedSubmitInfo& intendedSubmitInfo, bool inBetweenSubmit)
 	{
-		// TODO: Remove this check later
-		if (inBetweenSubmit)
+		const bool isCachingDraw = CacheAndReplay && m_realFrameIx == 0u && !finishedCachingDraw;
+		if (isCachingDraw)
 		{
-			m_logger->log("Temporarily Disabled. Auto-Submission shouldn't happen (for Demo)", ILogger::ELL_ERROR);
-			assert(!inBetweenSubmit);
+			replayCaches.push_back(drawResourcesFiller.createReplayCache());
+			return; // we don't record, submit or do anything, just caching the draw resources
 		}
 
+		drawResourcesFiller.pushAllUploads(intendedSubmitInfo);
+
 		// Use the current recording command buffer of the intendedSubmitInfos scratchCommandBuffers, it should be in recording state
 		auto* cb = m_currentRecordingCommandBufferInfo->cmdbuf;
 		
-		const auto& resources = drawResourcesFiller.getResourcesCollection();
+		const auto& resourcesCollection = drawResourcesFiller.getResourcesCollection();
 		const auto& resourcesGPUBuffer = drawResourcesFiller.getResourcesGPUBuffer();
 
 		float64_t3x3 projectionToNDC;
@@ -1206,13 +1235,13 @@ class ComputerAidedDesign final : public examples::SimpleWindowedApplication, pu
 		Globals globalData = {};
 		uint64_t baseAddress = resourcesGPUBuffer->getDeviceAddress();
 		globalData.pointers = {
-			.lineStyles				= baseAddress + resources.lineStyles.bufferOffset,
-			.dtmSettings			= baseAddress + resources.dtmSettings.bufferOffset,
-			.customProjections		= baseAddress + resources.customProjections.bufferOffset,
-			.customClipRects		= baseAddress + resources.customClipRects.bufferOffset,
-			.mainObjects			= baseAddress + resources.mainObjects.bufferOffset,
-			.drawObjects			= baseAddress + resources.drawObjects.bufferOffset,
-			.geometryBuffer			= baseAddress + resources.geometryInfo.bufferOffset,
+			.lineStyles				= baseAddress + resourcesCollection.lineStyles.bufferOffset,
+			.dtmSettings			= baseAddress + resourcesCollection.dtmSettings.bufferOffset,
+			.customProjections		= baseAddress + resourcesCollection.customProjections.bufferOffset,
+			.customClipRects		= baseAddress + resourcesCollection.customClipRects.bufferOffset,
+			.mainObjects			= baseAddress + resourcesCollection.mainObjects.bufferOffset,
+			.drawObjects			= baseAddress + resourcesCollection.drawObjects.bufferOffset,
+			.geometryBuffer			= baseAddress + resourcesCollection.geometryInfo.bufferOffset,
 		};
 		globalData.antiAliasingFactor = 1.0;// +abs(cos(m_timeElapsed * 0.0008)) * 20.0f;
 		globalData.resolution = uint32_t2{ m_window->getWidth(), m_window->getHeight() };
@@ -1253,7 +1282,7 @@ class ComputerAidedDesign final : public examples::SimpleWindowedApplication, pu
 			uint32_t bufferBarriersCount = 0u;
 			IGPUCommandBuffer::SPipelineBarrierDependencyInfo::buffer_barrier_t bufferBarriers[MaxBufferBarriersCount];
 			
-			const auto& resources = drawResourcesFiller.getResourcesCollection();
+			const auto& resourcesCollection = drawResourcesFiller.getResourcesCollection();
 
 			if (m_globalsBuffer->getSize() > 0u)
 			{
@@ -1311,14 +1340,14 @@ class ComputerAidedDesign final : public examples::SimpleWindowedApplication, pu
 		
 		cb->bindGraphicsPipeline(graphicsPipeline.get());
 
-		for (auto& drawCall : drawResourcesFiller.drawCalls)
+		for (auto& drawCall : drawResourcesFiller.getDrawCalls())
 		{
 			if (drawCall.isDTMRendering)
 			{
-				cb->bindIndexBuffer({ .offset = resources.geometryInfo.bufferOffset + drawCall.dtm.indexBufferOffset, .buffer = drawResourcesFiller.getResourcesGPUBuffer().get()}, asset::EIT_32BIT);
+				cb->bindIndexBuffer({ .offset = resourcesCollection.geometryInfo.bufferOffset + drawCall.dtm.indexBufferOffset, .buffer = drawResourcesFiller.getResourcesGPUBuffer().get()}, asset::EIT_32BIT);
 
 				PushConstants pc = {
-					.triangleMeshVerticesBaseAddress = drawCall.dtm.triangleMeshVerticesBaseAddress + resourcesGPUBuffer->getDeviceAddress() + resources.geometryInfo.bufferOffset,
+					.triangleMeshVerticesBaseAddress = drawCall.dtm.triangleMeshVerticesBaseAddress + resourcesGPUBuffer->getDeviceAddress() + resourcesCollection.geometryInfo.bufferOffset,
 					.triangleMeshMainObjectIndex = drawCall.dtm.triangleMeshMainObjectIndex,
 					.isDTMRendering = true
 				};
@@ -1336,8 +1365,8 @@ class ComputerAidedDesign final : public examples::SimpleWindowedApplication, pu
 				const uint64_t indexOffset = drawCall.drawObj.drawObjectStart * 6u;
 				const uint64_t indexCount = drawCall.drawObj.drawObjectCount * 6u;
 
-				// assert(currentIndexCount == resources.indexBuffer.getCount());
-				cb->bindIndexBuffer({ .offset = resources.indexBuffer.bufferOffset + indexOffset * sizeof(uint32_t), .buffer = resourcesGPUBuffer.get()}, asset::EIT_32BIT);
+				// assert(currentIndexCount == resourcesCollection.indexBuffer.getCount());
+				cb->bindIndexBuffer({ .offset = resourcesCollection.indexBuffer.bufferOffset + indexOffset * sizeof(uint32_t), .buffer = resourcesGPUBuffer.get()}, asset::EIT_32BIT);
 				cb->drawIndexed(indexCount, 1u, 0u, 0u, 0u);
 			}
 		}
@@ -1350,7 +1379,7 @@ class ComputerAidedDesign final : public examples::SimpleWindowedApplication, pu
 
 		if constexpr (DebugModeWireframe)
 		{
-			const uint32_t indexCount = resources.drawObjects.getCount() * 6u;
+			const uint32_t indexCount = resourcesCollection.drawObjects.getCount() * 6u;
 			cb->bindGraphicsPipeline(debugGraphicsPipeline.get());
 			cb->drawIndexed(indexCount, 1u, 0u, 0u, 0u);
 		}
@@ -1448,22 +1477,6 @@ class ComputerAidedDesign final : public examples::SimpleWindowedApplication, pu
 	
 	void addObjects(SIntendedSubmitInfo& intendedNextSubmit)
 	{
-		// we record upload of our objects and if we failed to allocate we submit everything
-		if (!intendedNextSubmit.valid())
-		{
-			// log("intendedNextSubmit is invalid.", nbl::system::ILogger::ELL_ERROR);
-			assert(false);
-			return;
-		}
-
-		// Use the current recording command buffer of the intendedSubmitInfos scratchCommandBuffers, it should be in recording state
-		auto* cmdbuf = m_currentRecordingCommandBufferInfo->cmdbuf;
-
-		assert(cmdbuf->getState() == video::IGPUCommandBuffer::STATE::RECORDING && cmdbuf->isResettable());
-		assert(cmdbuf->getRecordingFlags().hasFlags(video::IGPUCommandBuffer::USAGE::ONE_TIME_SUBMIT_BIT));
-
-		auto* cmdpool = cmdbuf->getPool();
-
 		drawResourcesFiller.setSubmitDrawsFunction(
 			[&](SIntendedSubmitInfo& intendedNextSubmit)
 			{
@@ -2822,6 +2835,23 @@ class ComputerAidedDesign final : public examples::SimpleWindowedApplication, pu
 		{
 			if (m_realFrameIx == 0u)
 			{
+				// we record upload of our objects and if we failed to allocate we submit everything
+				if (!intendedNextSubmit.valid())
+				{
+					// log("intendedNextSubmit is invalid.", nbl::system::ILogger::ELL_ERROR);
+					assert(false);
+					return;
+				}
+
+				// Use the current recording command buffer of the intendedSubmitInfos scratchCommandBuffers, it should be in recording state
+				auto* cmdbuf = m_currentRecordingCommandBufferInfo->cmdbuf;
+
+				assert(cmdbuf->getState() == video::IGPUCommandBuffer::STATE::RECORDING && cmdbuf->isResettable());
+				assert(cmdbuf->getRecordingFlags().hasFlags(video::IGPUCommandBuffer::USAGE::ONE_TIME_SUBMIT_BIT));
+
+				auto* cmdpool = cmdbuf->getPool();
+
+
 				// Load image
 				system::path m_loadCWD = "..";
 				std::string imagePath = "../../media/color_space_test/R8G8B8A8_1.png";
@@ -3416,8 +3446,6 @@ class ComputerAidedDesign final : public examples::SimpleWindowedApplication, pu
 				drawResourcesFiller.drawFixedGeometryPolyline(polyline, style, transformation, TransformationType::TT_FIXED_SCREENSPACE_SIZE, intendedNextSubmit);
 			}
 		}
-
-		drawResourcesFiller.finalizeAllCopiesToGPU(intendedNextSubmit);
 	}
 
 	double getScreenToWorldRatio(const float64_t3x3& viewProjectionMatrix, uint32_t2 windowSize)
@@ -3433,6 +3461,9 @@ class ComputerAidedDesign final : public examples::SimpleWindowedApplication, pu
 	std::chrono::seconds timeout = std::chrono::seconds(0x7fffFFFFu);
 	clock_t::time_point start;
 
+	std::vector<std::unique_ptr<DrawResourcesFiller::ReplayCache>> replayCaches = {}; // vector because there can be overflow submits
+	bool finishedCachingDraw = false;
+
 	bool fragmentShaderInterlockEnabled = false;
 
 	core::smart_refctd_ptr<InputSystem> m_inputSystem;

From 8677f1fc20c10a41603f75a445a9901132e54321 Mon Sep 17 00:00:00 2001
From: Erfan Ahmadi <ahmadierfan99@gmail.com>
Date: Mon, 5 May 2025 16:48:49 +0400
Subject: [PATCH 227/529] sync with builtin-shaders work

---
 62_CAD/shaders/globals.hlsl                    | 18 +++++++++++-------
 .../shaders/main_pipeline/fragment_shader.hlsl |  5 ++---
 .../shaders/main_pipeline/resolve_alphas.hlsl  |  3 +--
 .../shaders/main_pipeline/vertex_shader.hlsl   |  3 +--
 62_CAD/shaders/runtimeDeviceConfigCaps.hlsl    |  6 ++++++
 5 files changed, 21 insertions(+), 14 deletions(-)
 create mode 100644 62_CAD/shaders/runtimeDeviceConfigCaps.hlsl

diff --git a/62_CAD/shaders/globals.hlsl b/62_CAD/shaders/globals.hlsl
index 69346ee14..b565ff4ff 100644
--- a/62_CAD/shaders/globals.hlsl
+++ b/62_CAD/shaders/globals.hlsl
@@ -1,7 +1,14 @@
 #ifndef _CAD_EXAMPLE_GLOBALS_HLSL_INCLUDED_
 #define _CAD_EXAMPLE_GLOBALS_HLSL_INCLUDED_
 
-// #define NBL_FORCE_EMULATED_FLOAT_64
+#ifdef __HLSL_VERSION
+#ifndef NBL_USE_SPIRV_BUILTINS
+#include "runtimeDeviceConfigCaps.hlsl" // defines DeviceConfigCaps, uses JIT device caps
+#endif
+#endif
+
+// TODO[Erfan]: Turn off in the future, but keep enabled to test
+#define NBL_FORCE_EMULATED_FLOAT_64
 
 #include <nbl/builtin/hlsl/portable/float64_t.hlsl>
 #include <nbl/builtin/hlsl/portable/vector_t.hlsl>
@@ -13,16 +20,14 @@
 
 #ifdef __HLSL_VERSION
 #include <nbl/builtin/hlsl/math/equations/quadratic.hlsl>
-#include <nbl/builtin/hlsl/jit/device_capabilities.hlsl>
 #endif
 
 using namespace nbl::hlsl;
 
-// because we can't use jit/device_capabilities.hlsl in c++ code
 #ifdef __HLSL_VERSION
-using pfloat64_t = portable_float64_t<jit::device_capabilities>;
-using pfloat64_t2 = portable_float64_t2<jit::device_capabilities>;
-using pfloat64_t3 = portable_float64_t3<jit::device_capabilities>;
+using pfloat64_t = portable_float64_t<DeviceConfigCaps>;
+using pfloat64_t2 = portable_float64_t2<DeviceConfigCaps>;
+using pfloat64_t3 = portable_float64_t3<DeviceConfigCaps>;
 #else
 using pfloat64_t = float64_t;
 using pfloat64_t2 = nbl::hlsl::vector<float64_t, 2>;
@@ -501,7 +506,6 @@ NBL_CONSTEXPR uint32_t InvalidMainObjectIdx = MaxIndexableMainObjects;
 NBL_CONSTEXPR uint32_t InvalidCustomProjectionIndex = nbl::hlsl::numeric_limits<uint32_t>::max;
 NBL_CONSTEXPR uint32_t InvalidCustomClipRectIndex = nbl::hlsl::numeric_limits<uint32_t>::max;
 NBL_CONSTEXPR uint32_t InvalidTextureIdx = nbl::hlsl::numeric_limits<uint32_t>::max;
-NBL_CONSTEXPR uint32_t InvalidMSDFImageIdx = nbl::hlsl::numeric_limits<uint32_t>::max;
 
 // Hatches
 NBL_CONSTEXPR MajorAxis SelectedMajorAxis = MajorAxis::MAJOR_Y;
diff --git a/62_CAD/shaders/main_pipeline/fragment_shader.hlsl b/62_CAD/shaders/main_pipeline/fragment_shader.hlsl
index 326c4cf0d..6475faeff 100644
--- a/62_CAD/shaders/main_pipeline/fragment_shader.hlsl
+++ b/62_CAD/shaders/main_pipeline/fragment_shader.hlsl
@@ -7,7 +7,6 @@
 #include <nbl/builtin/hlsl/math/equations/quadratic.hlsl>
 #include <nbl/builtin/hlsl/math/geometry.hlsl>
 #include <nbl/builtin/hlsl/spirv_intrinsics/fragment_shader_pixel_interlock.hlsl>
-#include <nbl/builtin/hlsl/jit/device_capabilities.hlsl>
 #include <nbl/builtin/hlsl/text_rendering/msdf.hlsl>
 //#include <nbl/builtin/hlsl/spirv_intrinsics/fragment_shader_barycentric.hlsl>
 
@@ -159,7 +158,7 @@ float4 fragMain(PSInput input) : SV_TARGET
         localAlpha = dtmColor.a;
 
         gammaUncorrect(textureColor); // want to output to SRGB without gamma correction
-        return calculateFinalColor<nbl::hlsl::jit::device_capabilities::fragmentShaderPixelInterlock>(uint2(input.position.xy), localAlpha, currentMainObjectIdx, textureColor, true);
+        return calculateFinalColor<DeviceConfigCaps::fragmentShaderPixelInterlock>(uint2(input.position.xy), localAlpha, currentMainObjectIdx, textureColor, true);
     }
     else
     {
@@ -416,6 +415,6 @@ float4 fragMain(PSInput input) : SV_TARGET
         // TODO[Przemek]: But make sure you're still calling this, correctly calculating alpha and texture color.
         // you can add 1 main object and push via DrawResourcesFiller like we already do for other objects (this go in the mainObjects StorageBuffer) and then set the currentMainObjectIdx to 0 here
         // having 1 main object temporarily means that all triangle meshes will be treated as a unified object in blending operations. 
-        return calculateFinalColor<nbl::hlsl::jit::device_capabilities::fragmentShaderPixelInterlock>(fragCoord, localAlpha, currentMainObjectIdx, textureColor, colorFromTexture);
+        return calculateFinalColor<DeviceConfigCaps::fragmentShaderPixelInterlock>(fragCoord, localAlpha, currentMainObjectIdx, textureColor, colorFromTexture);
     }
 }
diff --git a/62_CAD/shaders/main_pipeline/resolve_alphas.hlsl b/62_CAD/shaders/main_pipeline/resolve_alphas.hlsl
index 86257428f..69bab6bde 100644
--- a/62_CAD/shaders/main_pipeline/resolve_alphas.hlsl
+++ b/62_CAD/shaders/main_pipeline/resolve_alphas.hlsl
@@ -1,6 +1,5 @@
 #include "common.hlsl"
 #include <nbl/builtin/hlsl/spirv_intrinsics/fragment_shader_pixel_interlock.hlsl>
-#include <nbl/builtin/hlsl/jit/device_capabilities.hlsl>
 
 template<bool FragmentShaderPixelInterlock>
 float32_t4 calculateFinalColor(const uint2 fragCoord);
@@ -78,5 +77,5 @@ float32_t4 calculateFinalColor<true>(const uint2 fragCoord)
 [shader("pixel")]
 float4 resolveAlphaMain(float4 position : SV_Position) : SV_TARGET
 {
-    return calculateFinalColor<nbl::hlsl::jit::device_capabilities::fragmentShaderPixelInterlock>(position.xy);
+    return calculateFinalColor<DeviceConfigCaps::fragmentShaderPixelInterlock>(position.xy);
 }
diff --git a/62_CAD/shaders/main_pipeline/vertex_shader.hlsl b/62_CAD/shaders/main_pipeline/vertex_shader.hlsl
index 478ad964f..73225e3c0 100644
--- a/62_CAD/shaders/main_pipeline/vertex_shader.hlsl
+++ b/62_CAD/shaders/main_pipeline/vertex_shader.hlsl
@@ -5,7 +5,6 @@
 #include <nbl/builtin/hlsl/math/equations/quadratic.hlsl>
 #include <nbl/builtin/hlsl/limits.hlsl>
 #include <nbl/builtin/hlsl/algorithm.hlsl>
-#include <nbl/builtin/hlsl/jit/device_capabilities.hlsl>
 
 // TODO[Lucas]: Move these functions to builtin hlsl functions (Even the shadertoy obb and aabb ones)
 float cross2D(float2 a, float2 b)
@@ -518,7 +517,7 @@ PSInput main(uint vertexID : SV_VertexID)
             const float2 dilateRate = pixelsToIncreaseOnEachSide / screenSpaceAabbExtents; // float sufficient to hold the dilate rect? 
             float2 dilateVec;
             float2 dilatedUV;
-            dilateHatch<jit::device_capabilities::fragmentShaderPixelInterlock>(dilateVec, dilatedUV, undilatedCorner, dilateRate, ndcAxisU, ndcAxisV);
+            dilateHatch<DeviceConfigCaps::fragmentShaderPixelInterlock>(dilateVec, dilatedUV, undilatedCorner, dilateRate, ndcAxisU, ndcAxisV);
 
             // doing interpolation this way to ensure correct endpoints and 0 and 1, we can alternatively use branches to set current corner based on vertexIdx
             const pfloat64_t2 currentCorner = curveBox.aabbMin * (_static_cast<pfloat64_t2>(float2(1.0f, 1.0f)) - undilatedCornerF64) +
diff --git a/62_CAD/shaders/runtimeDeviceConfigCaps.hlsl b/62_CAD/shaders/runtimeDeviceConfigCaps.hlsl
new file mode 100644
index 000000000..96647c0e7
--- /dev/null
+++ b/62_CAD/shaders/runtimeDeviceConfigCaps.hlsl
@@ -0,0 +1,6 @@
+#ifndef _RUNTIME_DEVICE_CONFIG_CAPS_HLSL_INCLUDED_
+#define _RUNTIME_DEVICE_CONFIG_CAPS_HLSL_INCLUDED_
+
+#include <nbl/builtin/hlsl/jit/device_capabilities.hlsl>
+using DeviceConfigCaps = nbl::hlsl::jit::device_capabilities;
+#endif // _RUNTIME_DEVICE_CONFIG_CAPS_HLSL_INCLUDED_

From 8c76367c1c226cce3d66f1c60f540e29a501a1cb Mon Sep 17 00:00:00 2001
From: devsh <devsh@devsh.eu>
Date: Tue, 6 May 2025 15:05:59 +0200
Subject: [PATCH 228/529] update the Acceleration Structure Position fetch code
 in one example after AS-refactor

---
 71_RayTracingPipeline/main.cpp | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/71_RayTracingPipeline/main.cpp b/71_RayTracingPipeline/main.cpp
index 219a7aacb..e31f5c280 100644
--- a/71_RayTracingPipeline/main.cpp
+++ b/71_RayTracingPipeline/main.cpp
@@ -1483,7 +1483,7 @@ class RaytracingPipelineApp final : public examples::SimpleWindowedApplication,
 
       auto blasFlags = bitflag(IGPUBottomLevelAccelerationStructure::BUILD_FLAGS::PREFER_FAST_TRACE_BIT) | IGPUBottomLevelAccelerationStructure::BUILD_FLAGS::ALLOW_COMPACTION_BIT;
       if (m_physicalDevice->getProperties().limits.rayTracingPositionFetch)
-        blasFlags |= IGPUBottomLevelAccelerationStructure::BUILD_FLAGS::ALLOW_DATA_ACCESS_KHR;
+        blasFlags |= IGPUBottomLevelAccelerationStructure::BUILD_FLAGS::ALLOW_DATA_ACCESS;
 
       IGPUBottomLevelAccelerationStructure::DeviceBuildInfo initBuildInfo;
       initBuildInfo.buildFlags = blasFlags;

From e8c2831c4117b2daaa0d1d61654c271496705f80 Mon Sep 17 00:00:00 2001
From: keptsecret <sorchon@gmail.com>
Date: Wed, 7 May 2025 09:51:43 +0700
Subject: [PATCH 229/529] move all tests into new example

---
 .../app_resources/shaderCommon.hlsl           |  36 ++--
 .../app_resources/testSubgroup.comp.hlsl      |   6 +-
 74a_Workgroup2ScanTest/main.cpp               | 179 +++++++++++++-----
 3 files changed, 152 insertions(+), 69 deletions(-)

diff --git a/74a_Workgroup2ScanTest/app_resources/shaderCommon.hlsl b/74a_Workgroup2ScanTest/app_resources/shaderCommon.hlsl
index 79bf74e71..376f69579 100644
--- a/74a_Workgroup2ScanTest/app_resources/shaderCommon.hlsl
+++ b/74a_Workgroup2ScanTest/app_resources/shaderCommon.hlsl
@@ -2,6 +2,7 @@
 
 #include "nbl/builtin/hlsl/glsl_compat/core.hlsl"
 #include "nbl/builtin/hlsl/subgroup/basic.hlsl"
+#include "nbl/builtin/hlsl/subgroup/arithmetic_portability.hlsl"
 #include "nbl/builtin/hlsl/subgroup2/arithmetic_portability.hlsl"
 
 #include "nbl/builtin/hlsl/jit/device_capabilities.hlsl"
@@ -28,33 +29,40 @@ bool canStore();
 #ifndef OPERATION
 #error "Define OPERATION!"
 #endif
+
 #ifndef SUBGROUP_SIZE_LOG2
 #error "Define SUBGROUP_SIZE_LOG2!"
 #endif
-template<template<class> class binop>
+template<template<class> class binop, typename T, uint32_t N>
 static void subtest(NBL_CONST_REF_ARG(type_t) sourceVal)
 {
+    // TODO static assert vector<T, N> == type_t
+    //using type_t = vector<T, N>;
+    using config_t = nbl::hlsl::subgroup2::Configuration<SUBGROUP_SIZE_LOG2>;
+    using params_t = nbl::hlsl::subgroup2::ArithmeticParams<config_t, typename binop<T>::base_t, N, nbl::hlsl::jit::device_capabilities>;
+
     if (globalIndex()==0u)
-        output[binop<type_t>::BindingIndex].template Store<uint32_t>(0,nbl::hlsl::glsl::gl_SubgroupSize());
+        output[binop<T>::BindingIndex].template Store<uint32_t>(0,nbl::hlsl::glsl::gl_SubgroupSize());
         
-    operation_t<typename binop<type_t>::base_t,nbl::hlsl::jit::device_capabilities> func;
+    operation_t<params_t> func;
     if (canStore())
-        output[binop<type_t>::BindingIndex].template Store<type_t>(sizeof(uint32_t)+sizeof(type_t)*globalIndex(),func(sourceVal));
+        output[binop<T>::BindingIndex].template Store<type_t>(sizeof(uint32_t)+sizeof(type_t)*globalIndex(),func(sourceVal));
 }
 
 
 type_t test()
 {
-    const type_t sourceVal = inputValue[globalIndex()];
-
-    subtest<bit_and>(sourceVal);
-    subtest<bit_xor>(sourceVal);
-    subtest<bit_or>(sourceVal);
-    subtest<plus>(sourceVal);
-    subtest<multiplies>(sourceVal);
-    subtest<minimum>(sourceVal);
-    subtest<maximum>(sourceVal);
+    const uint32_t idx = globalIndex();
+    type_t sourceVal = inputValue[idx];
+
+    subtest<bit_and, uint32_t, ITEMS_PER_INVOCATION>(sourceVal);
+    subtest<bit_xor, uint32_t, ITEMS_PER_INVOCATION>(sourceVal);
+    subtest<bit_or, uint32_t, ITEMS_PER_INVOCATION>(sourceVal);
+    subtest<plus, uint32_t, ITEMS_PER_INVOCATION>(sourceVal);
+    subtest<multiplies, uint32_t, ITEMS_PER_INVOCATION>(sourceVal);
+    subtest<minimum, uint32_t, ITEMS_PER_INVOCATION>(sourceVal);
+    subtest<maximum, uint32_t, ITEMS_PER_INVOCATION>(sourceVal);
     return sourceVal;
 }
 
-#include "nbl/builtin/hlsl/workgroup/basic.hlsl"
\ No newline at end of file
+#include "nbl/builtin/hlsl/workgroup/basic.hlsl"
diff --git a/74a_Workgroup2ScanTest/app_resources/testSubgroup.comp.hlsl b/74a_Workgroup2ScanTest/app_resources/testSubgroup.comp.hlsl
index 479265d73..2cc1ccb60 100644
--- a/74a_Workgroup2ScanTest/app_resources/testSubgroup.comp.hlsl
+++ b/74a_Workgroup2ScanTest/app_resources/testSubgroup.comp.hlsl
@@ -6,7 +6,7 @@
 
 uint32_t globalIndex()
 {
-	return nbl::hlsl::glsl::gl_WorkGroupID().x*WORKGROUP_SIZE+nbl::hlsl::workgroup::SubgroupContiguousIndex();
+    return nbl::hlsl::glsl::gl_WorkGroupID().x*WORKGROUP_SIZE+nbl::hlsl::workgroup::SubgroupContiguousIndex();
 }
 
 bool canStore() {return true;}
@@ -14,5 +14,5 @@ bool canStore() {return true;}
 [numthreads(WORKGROUP_SIZE,1,1)]
 void main()
 {
-	test();
-}
\ No newline at end of file
+    test();
+}
diff --git a/74a_Workgroup2ScanTest/main.cpp b/74a_Workgroup2ScanTest/main.cpp
index 57e70bf68..bde717d7b 100644
--- a/74a_Workgroup2ScanTest/main.cpp
+++ b/74a_Workgroup2ScanTest/main.cpp
@@ -154,6 +154,7 @@ class Workgroup2ScanTestApp final : public application_templates::BasicMultiQueu
 			return smart_refctd_ptr_static_cast<ICPUShader>(firstAssetInBundle);
 		};
 
+		auto subgroupTestSource = getShaderSource("app_resources/testSubgroup.comp.hlsl");
 		auto workgroupTestSource = getShaderSource("app_resources/testWorkgroup.comp.hlsl");
 		// now create or retrieve final resources to run our tests
 		sema = m_device->createSemaphore(timelineValue);
@@ -168,7 +169,8 @@ class Workgroup2ScanTestApp final : public application_templates::BasicMultiQueu
 		}
 
 		const auto MaxWorkgroupSize = m_physicalDevice->getLimits().maxComputeWorkGroupInvocations;
-		const std::array<uint32_t, 3> WorkgroupSizes = { 64, 512, 1024 };
+		const std::array<uint32_t, 4> WorkgroupSizes = { 128, 256, 512, 1024 };
+		const std::array<uint32_t, 3> ItemsPerInvocations = { 1, 2, 4 };
 		const auto MinSubgroupSize = m_physicalDevice->getLimits().minSubgroupSize;
 		const auto MaxSubgroupSize = m_physicalDevice->getLimits().maxSubgroupSize;
 		for (auto subgroupSize=MinSubgroupSize; subgroupSize <= MaxSubgroupSize; subgroupSize *= 2u)
@@ -181,15 +183,27 @@ class Workgroup2ScanTestApp final : public application_templates::BasicMultiQueu
 				m_api->startCapture();
 				m_logger->log("Testing Workgroup Size %u with Subgroup Size %u", ILogger::ELL_INFO, workgroupSize, subgroupSize);
 
-				bool passed = true;
-				const uint32_t itemsPerWG = workgroupSize <= 4 * subgroupSize ? workgroupSize : ItemsPerInvocation * max(workgroupSize >> subgroupSizeLog2, subgroupSize) << subgroupSizeLog2;	// TODO use Config somehow
-				m_logger->log("Testing Item Count %u", ILogger::ELL_INFO, itemsPerWG);
-				passed = runTest<emulatedReduction, true>(workgroupTestSource, elementCount, subgroupSizeLog2, workgroupSize, itemsPerWG) && passed;
-				logTestOutcome(passed, itemsPerWG);
-				passed = runTest<emulatedScanInclusive, true>(workgroupTestSource, elementCount, subgroupSizeLog2, workgroupSize, itemsPerWG) && passed;
-				logTestOutcome(passed, itemsPerWG);
-				passed = runTest<emulatedScanExclusive, true>(workgroupTestSource, elementCount, subgroupSizeLog2, workgroupSize, itemsPerWG) && passed;
-				logTestOutcome(passed, itemsPerWG);
+				for (uint32_t j = 0; j < ItemsPerInvocations.size(); j++)
+				{
+					const uint32_t itemsPerInvocation = ItemsPerInvocations[j];
+					m_logger->log("Testing Items per Invocation %u", ILogger::ELL_INFO, itemsPerInvocation);
+					bool passed = true;
+					passed = runTest<emulatedReduction, false>(subgroupTestSource, elementCount, subgroupSizeLog2, workgroupSize, ~0u, itemsPerInvocation) && passed;
+					logTestOutcome(passed, workgroupSize);
+					passed = runTest<emulatedScanInclusive, false>(subgroupTestSource, elementCount, subgroupSizeLog2, workgroupSize, ~0u, itemsPerInvocation) && passed;
+					logTestOutcome(passed, workgroupSize);
+					passed = runTest<emulatedScanExclusive, false>(subgroupTestSource, elementCount, subgroupSizeLog2, workgroupSize, ~0u, itemsPerInvocation) && passed;
+					logTestOutcome(passed, workgroupSize);
+
+					const uint32_t itemsPerWG = workgroupSize <= 4 * subgroupSize ? workgroupSize : itemsPerInvocation * max(workgroupSize >> subgroupSizeLog2, subgroupSize) << subgroupSizeLog2;	// TODO use Config somehow
+					m_logger->log("Testing Item Count %u", ILogger::ELL_INFO, itemsPerWG);
+					passed = runTest<emulatedReduction, true>(workgroupTestSource, elementCount, subgroupSizeLog2, workgroupSize, itemsPerWG, itemsPerInvocation) && passed;
+					logTestOutcome(passed, itemsPerWG);
+					passed = runTest<emulatedScanInclusive, true>(workgroupTestSource, elementCount, subgroupSizeLog2, workgroupSize, itemsPerWG, itemsPerInvocation) && passed;
+					logTestOutcome(passed, itemsPerWG);
+					passed = runTest<emulatedScanExclusive, true>(workgroupTestSource, elementCount, subgroupSizeLog2, workgroupSize, itemsPerWG, itemsPerInvocation) && passed;
+					logTestOutcome(passed, itemsPerWG);
+				}
 				m_api->endCapture();
 			}
 		}
@@ -243,7 +257,7 @@ class Workgroup2ScanTestApp final : public application_templates::BasicMultiQueu
 	}
 
 	template<template<class> class Arithmetic, bool WorkgroupTest>
-	bool runTest(const smart_refctd_ptr<const ICPUShader>& source, const uint32_t elementCount, const uint8_t subgroupSizeLog2, const uint32_t workgroupSize, uint32_t itemsPerWG = ~0u)
+	bool runTest(const smart_refctd_ptr<const ICPUShader>& source, const uint32_t elementCount, const uint8_t subgroupSizeLog2, const uint32_t workgroupSize, uint32_t itemsPerWG = ~0u, uint32_t itemsPerInvoc = 1u)
 	{
 		std::string arith_name = Arithmetic<bit_xor<float>>::name;
 		const uint32_t workgroupSizeLog2 = hlsl::findMSB(workgroupSize);
@@ -267,29 +281,59 @@ class Workgroup2ScanTestApp final : public application_templates::BasicMultiQueu
 		includeFinder->addSearchPath("nbl/builtin/hlsl/jit", core::make_smart_refctd_ptr<CJITIncludeLoader>(m_physicalDevice->getLimits(), m_device->getEnabledFeatures()));
 		options.preprocessorOptions.includeFinder = includeFinder;
 
-		const std::string definitions[5] = {
-			"workgroup2::" + arith_name,
-			std::to_string(workgroupSizeLog2),
-			std::to_string(itemsPerWG),
-			std::to_string(ItemsPerInvocation),
-			std::to_string(subgroupSizeLog2)
-		};
-
-		const IShaderCompiler::SMacroDefinition defines[5] = {
-			{ "OPERATION", definitions[0] },
-			{ "WORKGROUP_SIZE_LOG2", definitions[1] },
-			{ "ITEMS_PER_WG", definitions[2] },
-			{ "ITEMS_PER_INVOCATION", definitions[3] },
-			{ "SUBGROUP_SIZE_LOG2", definitions[4] }
-		};
-		options.preprocessorOptions.extraDefines = { defines, defines + 5 };
-
-		smart_refctd_ptr<ICPUShader> overriddenUnspecialized = compiler->compileToSPIRV((const char*)source->getContent()->getPointer(), options);
+		smart_refctd_ptr<ICPUShader> overriddenUnspecialized;
+		if constexpr (WorkgroupTest)
+		{
+			const std::string definitions[5] = {
+				"workgroup2::" + arith_name,
+				std::to_string(workgroupSizeLog2),
+				std::to_string(itemsPerWG),
+				std::to_string(itemsPerInvoc),
+				std::to_string(subgroupSizeLog2)
+			};
+
+			const IShaderCompiler::SMacroDefinition defines[5] = {
+				{ "OPERATION", definitions[0] },
+				{ "WORKGROUP_SIZE_LOG2", definitions[1] },
+				{ "ITEMS_PER_WG", definitions[2] },
+				{ "ITEMS_PER_INVOCATION", definitions[3] },
+				{ "SUBGROUP_SIZE_LOG2", definitions[4] }
+			};
+			options.preprocessorOptions.extraDefines = { defines, defines + 5 };
+
+			overriddenUnspecialized = compiler->compileToSPIRV((const char*)source->getContent()->getPointer(), options);
+		}
+		else
+		{
+			const std::string definitions[4] = { 
+				"subgroup2::" + arith_name,
+				std::to_string(workgroupSize),
+				std::to_string(itemsPerInvoc),
+				std::to_string(subgroupSizeLog2)
+			};
+
+			const IShaderCompiler::SMacroDefinition defines[4] = {
+				{ "OPERATION", definitions[0] },
+				{ "WORKGROUP_SIZE", definitions[1] },
+				{ "ITEMS_PER_INVOCATION", definitions[2] },
+				{ "SUBGROUP_SIZE_LOG2", definitions[3] }
+			};
+			options.preprocessorOptions.extraDefines = { defines, defines + 4 };
+
+			overriddenUnspecialized = compiler->compileToSPIRV((const char*)source->getContent()->getPointer(), options);
+		}
 
 		auto pipeline = createPipeline(overriddenUnspecialized.get(),subgroupSizeLog2);
 
 		// TODO: overlap dispatches with memory readbacks (requires multiple copies of `buffers`)
-		const uint32_t workgroupCount = elementCount / itemsPerWG;
+		uint32_t workgroupCount;
+		if constexpr (WorkgroupTest)
+			workgroupCount = elementCount / itemsPerWG;
+		else
+		{
+			itemsPerWG = workgroupSize;
+			workgroupCount = elementCount / (itemsPerWG * itemsPerInvoc);
+		}	
 		cmdbuf->begin(IGPUCommandBuffer::USAGE::NONE);
 		cmdbuf->bindComputePipeline(pipeline.get());
 		cmdbuf->bindDescriptorSets(EPBP_COMPUTE, pipeline->getLayout(), 0u, 1u, &descriptorSet.get());
@@ -324,20 +368,20 @@ class Workgroup2ScanTestApp final : public application_templates::BasicMultiQueu
 		m_device->blockForSemaphores(wait);
 
 		// check results
-		bool passed = validateResults<Arithmetic, bit_and<uint32_t>, WorkgroupTest>(itemsPerWG, workgroupCount);
-		passed = validateResults<Arithmetic, bit_xor<uint32_t>, WorkgroupTest>(itemsPerWG, workgroupCount) && passed;
-		passed = validateResults<Arithmetic, bit_or<uint32_t>, WorkgroupTest>(itemsPerWG, workgroupCount) && passed;
-		passed = validateResults<Arithmetic, plus<uint32_t>, WorkgroupTest>(itemsPerWG, workgroupCount) && passed;
-		passed = validateResults<Arithmetic, multiplies<uint32_t>, WorkgroupTest>(itemsPerWG, workgroupCount) && passed;
-		passed = validateResults<Arithmetic, minimum<uint32_t>, WorkgroupTest>(itemsPerWG, workgroupCount) && passed;
-		passed = validateResults<Arithmetic, maximum<uint32_t>, WorkgroupTest>(itemsPerWG, workgroupCount) && passed;
+		bool passed = validateResults<Arithmetic, bit_and<uint32_t>, WorkgroupTest>(itemsPerWG, workgroupCount, itemsPerInvoc);
+		passed = validateResults<Arithmetic, bit_xor<uint32_t>, WorkgroupTest>(itemsPerWG, workgroupCount, itemsPerInvoc) && passed;
+		passed = validateResults<Arithmetic, bit_or<uint32_t>, WorkgroupTest>(itemsPerWG, workgroupCount, itemsPerInvoc) && passed;
+		passed = validateResults<Arithmetic, plus<uint32_t>, WorkgroupTest>(itemsPerWG, workgroupCount, itemsPerInvoc) && passed;
+		passed = validateResults<Arithmetic, multiplies<uint32_t>, WorkgroupTest>(itemsPerWG, workgroupCount, itemsPerInvoc) && passed;
+		passed = validateResults<Arithmetic, minimum<uint32_t>, WorkgroupTest>(itemsPerWG, workgroupCount, itemsPerInvoc) && passed;
+		passed = validateResults<Arithmetic, maximum<uint32_t>, WorkgroupTest>(itemsPerWG, workgroupCount, itemsPerInvoc) && passed;
 
 		return passed;
 	}
 
 	//returns true if result matches
 	template<template<class> class Arithmetic, class Binop, bool WorkgroupTest>
-	bool validateResults(const uint32_t itemsPerWG, const uint32_t workgroupCount)
+	bool validateResults(const uint32_t itemsPerWG, const uint32_t workgroupCount, const uint32_t itemsPerInvoc)
 	{
 		bool success = true;
 
@@ -361,22 +405,53 @@ class Workgroup2ScanTestApp final : public application_templates::BasicMultiQueu
 		for (uint32_t workgroupID = 0u; success && workgroupID < workgroupCount; workgroupID++)
 		{
 			const auto workgroupOffset = workgroupID * itemsPerWG;
-			Arithmetic<Binop>::impl(tmp, inputData + workgroupOffset, itemsPerWG);
 
-			for (uint32_t localInvocationIndex = 0u; localInvocationIndex < itemsPerWG; localInvocationIndex++)
+			if constexpr (WorkgroupTest)
+			{
+				Arithmetic<Binop>::impl(tmp, inputData + workgroupOffset, itemsPerWG);
+
+				for (uint32_t localInvocationIndex = 0u; localInvocationIndex < itemsPerWG; localInvocationIndex++)
+				{
+					const auto globalInvocationIndex = workgroupOffset + localInvocationIndex;
+					const auto cpuVal = tmp[localInvocationIndex];
+					const auto gpuVal = testData[globalInvocationIndex];
+					if (cpuVal != gpuVal)
+					{
+						m_logger->log(
+							"Failed test #%d  (%s)  (%s) Expected %u got %u for workgroup %d and localinvoc %d",
+							ILogger::ELL_ERROR, itemsPerWG, WorkgroupTest ? "workgroup" : "subgroup", Binop::name,
+							cpuVal, gpuVal, workgroupID, localInvocationIndex
+						);
+						success = false;
+						break;
+					}
+				}
+			}
+			else
 			{
-				const auto globalInvocationIndex = workgroupOffset + localInvocationIndex;
-				const auto cpuVal = tmp[localInvocationIndex];
-				const auto gpuVal = testData[globalInvocationIndex];
-				if (cpuVal != gpuVal)
+				for (uint32_t pseudoSubgroupID = 0u; pseudoSubgroupID < itemsPerWG; pseudoSubgroupID += subgroupSize)
+					Arithmetic<Binop>::impl(tmp + pseudoSubgroupID * itemsPerInvoc, inputData + workgroupOffset + pseudoSubgroupID * itemsPerInvoc, subgroupSize * itemsPerInvoc);
+
+				for (uint32_t localInvocationIndex = 0u; localInvocationIndex < itemsPerWG; localInvocationIndex++)
 				{
-					m_logger->log(
-						"Failed test #%d  (%s)  (%s) Expected %u got %u for workgroup %d and localinvoc %d",
-						ILogger::ELL_ERROR, itemsPerWG, WorkgroupTest ? "workgroup" : "subgroup", Binop::name,
-						cpuVal, gpuVal, workgroupID, localInvocationIndex
-					);
-					success = false;
-					break;
+					const auto localOffset = localInvocationIndex * itemsPerInvoc;
+					const auto globalInvocationIndex = workgroupOffset + localOffset;
+
+					for (uint32_t itemInvocationIndex = 0u; itemInvocationIndex < itemsPerInvoc; itemInvocationIndex++)
+					{
+						const auto cpuVal = tmp[localOffset + itemInvocationIndex];
+						const auto gpuVal = testData[globalInvocationIndex + itemInvocationIndex];
+						if (cpuVal != gpuVal)
+						{
+							m_logger->log(
+								"Failed test #%d  (%s)  (%s) Expected %u got %u for workgroup %d and localinvoc %d and iteminvoc %d",
+								ILogger::ELL_ERROR, itemsPerWG, WorkgroupTest ? "workgroup" : "subgroup", Binop::name,
+								cpuVal, gpuVal, workgroupID, localInvocationIndex, itemInvocationIndex
+							);
+							success = false;
+							break;
+						}
+					}
 				}
 			}
 		}
@@ -401,7 +476,7 @@ class Workgroup2ScanTestApp final : public application_templates::BasicMultiQueu
 
 	uint32_t totalFailCount = 0;
 
-	uint32_t ItemsPerInvocation = 4u;
+	//uint32_t ItemsPerInvocation = 4u;
 };
 
 NBL_MAIN_FUNC(Workgroup2ScanTestApp)
\ No newline at end of file

From 2ba2b824213e0730813bf61f55680226b52c2479 Mon Sep 17 00:00:00 2001
From: keptsecret <sorchon@gmail.com>
Date: Wed, 7 May 2025 15:10:56 +0700
Subject: [PATCH 230/529] workgroup scan benchmark, renamed examples

---
 .../CMakeLists.txt                            |   0
 .../benchmarkWorkgroup.comp.hlsl              |   0
 .../app_resources/common.hlsl                 |   0
 .../app_resources/shaderCommon.hlsl           |   0
 .../app_resources/testSubgroup.comp.hlsl      |   0
 .../app_resources/testWorkgroup.comp.hlsl     |   0
 .../app_resources/workgroupCommon.hlsl        |   0
 .../config.json.template                      |   0
 .../main.cpp                                  |   2 -
 .../pipeline.groovy                           |   0
 .../CMakeLists.txt                            |   0
 .../app_resources/benchmarkSubgroup.comp.hlsl |   0
 .../benchmarkWorkgroup.comp.hlsl              |  93 ++++++
 .../app_resources/common.hlsl                 |   0
 .../app_resources/shaderCommon.hlsl           |   0
 .../app_resources/testSubgroup.comp.hlsl      |   0
 .../app_resources/testWorkgroup.comp.hlsl     |   0
 .../app_resources/workgroupCommon.hlsl        |  69 ++++
 .../config.json.template                      |   0
 .../imgui.ini                                 |   0
 .../main.cpp                                  | 303 +++++-------------
 .../pipeline.groovy                           |   0
 CMakeLists.txt                                |   4 +-
 23 files changed, 244 insertions(+), 227 deletions(-)
 rename {73_ArithmeticBench => 73_Arithmetic2UnitTest}/CMakeLists.txt (100%)
 rename {74a_Workgroup2ScanTest => 73_Arithmetic2UnitTest}/app_resources/benchmarkWorkgroup.comp.hlsl (100%)
 rename {74a_Workgroup2ScanTest => 73_Arithmetic2UnitTest}/app_resources/common.hlsl (100%)
 rename {73_ArithmeticBench => 73_Arithmetic2UnitTest}/app_resources/shaderCommon.hlsl (100%)
 rename {73_ArithmeticBench => 73_Arithmetic2UnitTest}/app_resources/testSubgroup.comp.hlsl (100%)
 rename {74a_Workgroup2ScanTest => 73_Arithmetic2UnitTest}/app_resources/testWorkgroup.comp.hlsl (100%)
 rename {74a_Workgroup2ScanTest => 73_Arithmetic2UnitTest}/app_resources/workgroupCommon.hlsl (100%)
 rename {73_ArithmeticBench => 73_Arithmetic2UnitTest}/config.json.template (100%)
 rename {74a_Workgroup2ScanTest => 73_Arithmetic2UnitTest}/main.cpp (99%)
 rename {73_ArithmeticBench => 73_Arithmetic2UnitTest}/pipeline.groovy (100%)
 rename {74a_Workgroup2ScanTest => 74_Arithmetic2Bench}/CMakeLists.txt (100%)
 rename {73_ArithmeticBench => 74_Arithmetic2Bench}/app_resources/benchmarkSubgroup.comp.hlsl (100%)
 create mode 100644 74_Arithmetic2Bench/app_resources/benchmarkWorkgroup.comp.hlsl
 rename {73_ArithmeticBench => 74_Arithmetic2Bench}/app_resources/common.hlsl (100%)
 rename {74a_Workgroup2ScanTest => 74_Arithmetic2Bench}/app_resources/shaderCommon.hlsl (100%)
 rename {74a_Workgroup2ScanTest => 74_Arithmetic2Bench}/app_resources/testSubgroup.comp.hlsl (100%)
 rename {73_ArithmeticBench => 74_Arithmetic2Bench}/app_resources/testWorkgroup.comp.hlsl (100%)
 create mode 100644 74_Arithmetic2Bench/app_resources/workgroupCommon.hlsl
 rename {74a_Workgroup2ScanTest => 74_Arithmetic2Bench}/config.json.template (100%)
 rename {73_ArithmeticBench => 74_Arithmetic2Bench}/imgui.ini (100%)
 rename {73_ArithmeticBench => 74_Arithmetic2Bench}/main.cpp (68%)
 rename {74a_Workgroup2ScanTest => 74_Arithmetic2Bench}/pipeline.groovy (100%)

diff --git a/73_ArithmeticBench/CMakeLists.txt b/73_Arithmetic2UnitTest/CMakeLists.txt
similarity index 100%
rename from 73_ArithmeticBench/CMakeLists.txt
rename to 73_Arithmetic2UnitTest/CMakeLists.txt
diff --git a/74a_Workgroup2ScanTest/app_resources/benchmarkWorkgroup.comp.hlsl b/73_Arithmetic2UnitTest/app_resources/benchmarkWorkgroup.comp.hlsl
similarity index 100%
rename from 74a_Workgroup2ScanTest/app_resources/benchmarkWorkgroup.comp.hlsl
rename to 73_Arithmetic2UnitTest/app_resources/benchmarkWorkgroup.comp.hlsl
diff --git a/74a_Workgroup2ScanTest/app_resources/common.hlsl b/73_Arithmetic2UnitTest/app_resources/common.hlsl
similarity index 100%
rename from 74a_Workgroup2ScanTest/app_resources/common.hlsl
rename to 73_Arithmetic2UnitTest/app_resources/common.hlsl
diff --git a/73_ArithmeticBench/app_resources/shaderCommon.hlsl b/73_Arithmetic2UnitTest/app_resources/shaderCommon.hlsl
similarity index 100%
rename from 73_ArithmeticBench/app_resources/shaderCommon.hlsl
rename to 73_Arithmetic2UnitTest/app_resources/shaderCommon.hlsl
diff --git a/73_ArithmeticBench/app_resources/testSubgroup.comp.hlsl b/73_Arithmetic2UnitTest/app_resources/testSubgroup.comp.hlsl
similarity index 100%
rename from 73_ArithmeticBench/app_resources/testSubgroup.comp.hlsl
rename to 73_Arithmetic2UnitTest/app_resources/testSubgroup.comp.hlsl
diff --git a/74a_Workgroup2ScanTest/app_resources/testWorkgroup.comp.hlsl b/73_Arithmetic2UnitTest/app_resources/testWorkgroup.comp.hlsl
similarity index 100%
rename from 74a_Workgroup2ScanTest/app_resources/testWorkgroup.comp.hlsl
rename to 73_Arithmetic2UnitTest/app_resources/testWorkgroup.comp.hlsl
diff --git a/74a_Workgroup2ScanTest/app_resources/workgroupCommon.hlsl b/73_Arithmetic2UnitTest/app_resources/workgroupCommon.hlsl
similarity index 100%
rename from 74a_Workgroup2ScanTest/app_resources/workgroupCommon.hlsl
rename to 73_Arithmetic2UnitTest/app_resources/workgroupCommon.hlsl
diff --git a/73_ArithmeticBench/config.json.template b/73_Arithmetic2UnitTest/config.json.template
similarity index 100%
rename from 73_ArithmeticBench/config.json.template
rename to 73_Arithmetic2UnitTest/config.json.template
diff --git a/74a_Workgroup2ScanTest/main.cpp b/73_Arithmetic2UnitTest/main.cpp
similarity index 99%
rename from 74a_Workgroup2ScanTest/main.cpp
rename to 73_Arithmetic2UnitTest/main.cpp
index bde717d7b..31eb4ab8f 100644
--- a/74a_Workgroup2ScanTest/main.cpp
+++ b/73_Arithmetic2UnitTest/main.cpp
@@ -475,8 +475,6 @@ class Workgroup2ScanTestApp final : public application_templates::BasicMultiQueu
 	smart_refctd_ptr<ICPUBuffer> resultsBuffer;
 
 	uint32_t totalFailCount = 0;
-
-	//uint32_t ItemsPerInvocation = 4u;
 };
 
 NBL_MAIN_FUNC(Workgroup2ScanTestApp)
\ No newline at end of file
diff --git a/73_ArithmeticBench/pipeline.groovy b/73_Arithmetic2UnitTest/pipeline.groovy
similarity index 100%
rename from 73_ArithmeticBench/pipeline.groovy
rename to 73_Arithmetic2UnitTest/pipeline.groovy
diff --git a/74a_Workgroup2ScanTest/CMakeLists.txt b/74_Arithmetic2Bench/CMakeLists.txt
similarity index 100%
rename from 74a_Workgroup2ScanTest/CMakeLists.txt
rename to 74_Arithmetic2Bench/CMakeLists.txt
diff --git a/73_ArithmeticBench/app_resources/benchmarkSubgroup.comp.hlsl b/74_Arithmetic2Bench/app_resources/benchmarkSubgroup.comp.hlsl
similarity index 100%
rename from 73_ArithmeticBench/app_resources/benchmarkSubgroup.comp.hlsl
rename to 74_Arithmetic2Bench/app_resources/benchmarkSubgroup.comp.hlsl
diff --git a/74_Arithmetic2Bench/app_resources/benchmarkWorkgroup.comp.hlsl b/74_Arithmetic2Bench/app_resources/benchmarkWorkgroup.comp.hlsl
new file mode 100644
index 000000000..ed56dd766
--- /dev/null
+++ b/74_Arithmetic2Bench/app_resources/benchmarkWorkgroup.comp.hlsl
@@ -0,0 +1,93 @@
+#pragma shader_stage(compute)
+
+#include "workgroupCommon.hlsl"
+
+// NOTE added dummy output image to be able to profile with Nsight, which still doesn't support profiling headless compute shaders
+[[vk::binding(2, 0)]] RWTexture2D<float32_t4> outImage; // dummy
+
+template<class Config, class Binop>
+struct DataProxy
+{
+    using dtype_t = vector<uint32_t, Config::ItemsPerInvocation_0>;
+    static_assert(nbl::hlsl::is_same_v<dtype_t, type_t>);
+
+    dtype_t get(const uint32_t ix)
+    {
+        return inputValue[ix];
+    }
+    void set(const uint32_t ix, const dtype_t value)
+    {
+        output[Binop::BindingIndex].template Store<type_t>(sizeof(uint32_t) + sizeof(type_t) * ix, value);
+    }
+
+    void workgroupExecutionAndMemoryBarrier()
+    {
+        nbl::hlsl::glsl::barrier();
+        //nbl::hlsl::glsl::memoryBarrierShared(); implied by the above
+    }
+};
+
+static ScratchProxy arithmeticAccessor;
+
+template<class Binop, class device_capabilities>
+struct operation_t
+{
+    using binop_base_t = typename Binop::base_t;
+    using otype_t = typename Binop::type_t;
+
+    void operator()()
+    {
+        DataProxy<config_t,Binop> dataAccessor;
+        nbl::hlsl::OPERATION<config_t,binop_base_t,device_capabilities>::template __call<DataProxy<config_t,Binop>, ScratchProxy>(dataAccessor,arithmeticAccessor);
+        // we barrier before because we alias the accessors for Binop
+        arithmeticAccessor.workgroupExecutionAndMemoryBarrier();
+    }
+};
+
+#ifndef NUM_LOOPS
+#error "Define NUM_LOOPS!"
+#endif
+
+template<template<class> class binop, typename T, uint32_t N>
+static void subbench(NBL_CONST_REF_ARG(type_t) sourceVal)
+{
+    if (globalIndex()==0u)
+        output[binop<T>::BindingIndex].template Store<uint32_t>(0,nbl::hlsl::glsl::gl_SubgroupSize());
+
+    operation_t<binop<T>,nbl::hlsl::jit::device_capabilities> func;
+    // TODO separate out store/load from DataProxy? so we don't do too many RW in benchmark
+    for (uint32_t i = 0; i < NUM_LOOPS; i++)
+        func(); // store is done with data accessor now
+}
+
+
+type_t benchmark()
+{
+    const type_t sourceVal = inputValue[globalIndex()];
+
+    subbench<bit_and, uint32_t, ITEMS_PER_INVOCATION>(sourceVal);
+    subbench<bit_xor, uint32_t, ITEMS_PER_INVOCATION>(sourceVal);
+    subbench<bit_or, uint32_t, ITEMS_PER_INVOCATION>(sourceVal);
+    subbench<plus, uint32_t, ITEMS_PER_INVOCATION>(sourceVal);
+    subbench<multiplies, uint32_t, ITEMS_PER_INVOCATION>(sourceVal);
+    subbench<minimum, uint32_t, ITEMS_PER_INVOCATION>(sourceVal);
+    subbench<maximum, uint32_t, ITEMS_PER_INVOCATION>(sourceVal);
+    return sourceVal;
+}
+
+
+uint32_t globalIndex()
+{
+    return nbl::hlsl::glsl::gl_WorkGroupID().x*ITEMS_PER_WG+nbl::hlsl::workgroup::SubgroupContiguousIndex();
+}
+
+bool canStore()
+{
+    return nbl::hlsl::workgroup::SubgroupContiguousIndex()<ITEMS_PER_WG;
+}
+
+[numthreads(WORKGROUP_SIZE,1,1)]
+void main()
+{
+    const type_t sourceVal = benchmark();
+}
diff --git a/73_ArithmeticBench/app_resources/common.hlsl b/74_Arithmetic2Bench/app_resources/common.hlsl
similarity index 100%
rename from 73_ArithmeticBench/app_resources/common.hlsl
rename to 74_Arithmetic2Bench/app_resources/common.hlsl
diff --git a/74a_Workgroup2ScanTest/app_resources/shaderCommon.hlsl b/74_Arithmetic2Bench/app_resources/shaderCommon.hlsl
similarity index 100%
rename from 74a_Workgroup2ScanTest/app_resources/shaderCommon.hlsl
rename to 74_Arithmetic2Bench/app_resources/shaderCommon.hlsl
diff --git a/74a_Workgroup2ScanTest/app_resources/testSubgroup.comp.hlsl b/74_Arithmetic2Bench/app_resources/testSubgroup.comp.hlsl
similarity index 100%
rename from 74a_Workgroup2ScanTest/app_resources/testSubgroup.comp.hlsl
rename to 74_Arithmetic2Bench/app_resources/testSubgroup.comp.hlsl
diff --git a/73_ArithmeticBench/app_resources/testWorkgroup.comp.hlsl b/74_Arithmetic2Bench/app_resources/testWorkgroup.comp.hlsl
similarity index 100%
rename from 73_ArithmeticBench/app_resources/testWorkgroup.comp.hlsl
rename to 74_Arithmetic2Bench/app_resources/testWorkgroup.comp.hlsl
diff --git a/74_Arithmetic2Bench/app_resources/workgroupCommon.hlsl b/74_Arithmetic2Bench/app_resources/workgroupCommon.hlsl
new file mode 100644
index 000000000..7e8512e72
--- /dev/null
+++ b/74_Arithmetic2Bench/app_resources/workgroupCommon.hlsl
@@ -0,0 +1,69 @@
+#include "nbl/builtin/hlsl/workgroup/scratch_size.hlsl"
+
+#include "nbl/builtin/hlsl/workgroup2/arithmetic.hlsl"
+
+#include "nbl/builtin/hlsl/workgroup/broadcast.hlsl"
+
+#include "nbl/builtin/hlsl/glsl_compat/core.hlsl"
+#include "nbl/builtin/hlsl/subgroup/basic.hlsl"
+#include "nbl/builtin/hlsl/subgroup2/arithmetic_portability.hlsl"
+
+#include "nbl/builtin/hlsl/jit/device_capabilities.hlsl"
+
+#include "common.hlsl"
+
+static const uint32_t WORKGROUP_SIZE = 1u << WORKGROUP_SIZE_LOG2;
+
+// https://github.com/microsoft/DirectXShaderCompiler/issues/6144
+uint32_t3 nbl::hlsl::glsl::gl_WorkGroupSize() {return uint32_t3(WORKGROUP_SIZE,1,1);}
+
+#ifndef ITEMS_PER_INVOCATION
+#error "Define ITEMS_PER_INVOCATION!"
+#endif
+
+using config_t = nbl::hlsl::workgroup2::Configuration<WORKGROUP_SIZE_LOG2, SUBGROUP_SIZE_LOG2, ITEMS_PER_INVOCATION>;
+
+typedef vector<uint32_t, config_t::ItemsPerInvocation_0> type_t;
+
+// unfortunately DXC chokes on descriptors as static members
+// https://github.com/microsoft/DirectXShaderCompiler/issues/5940
+[[vk::binding(0, 0)]] StructuredBuffer<type_t> inputValue;
+[[vk::binding(1, 0)]] RWByteAddressBuffer output[8];
+
+// because subgroups don't match `gl_LocalInvocationIndex` snake curve addressing, we also can't load inputs that way
+uint32_t globalIndex();
+// since we test ITEMS_PER_WG<WorkgroupSize we need this so workgroups don't overwrite each other's outputs
+bool canStore();
+
+#ifndef OPERATION
+#error "Define OPERATION!"
+#endif
+#ifndef SUBGROUP_SIZE_LOG2
+#error "Define SUBGROUP_SIZE_LOG2!"
+#endif
+
+// final (level 1/2) scan needs to fit in one subgroup exactly
+groupshared uint32_t scratch[config_t::SubgroupsPerVirtualWorkgroupLog2*config_t::ItemsPerInvocation_1];
+
+struct ScratchProxy
+{
+    void get(const uint32_t ix, NBL_REF_ARG(uint32_t) value)
+    {
+        value = scratch[ix];
+    }
+    void set(const uint32_t ix, const uint32_t value)
+    {
+        scratch[ix] = value;
+    }
+
+    uint32_t atomicOr(const uint32_t ix, const uint32_t value)
+    {
+        return nbl::hlsl::glsl::atomicOr(scratch[ix],value);
+    }
+
+    void workgroupExecutionAndMemoryBarrier()
+    {
+        nbl::hlsl::glsl::barrier();
+        //nbl::hlsl::glsl::memoryBarrierShared(); implied by the above
+    }
+};
diff --git a/74a_Workgroup2ScanTest/config.json.template b/74_Arithmetic2Bench/config.json.template
similarity index 100%
rename from 74a_Workgroup2ScanTest/config.json.template
rename to 74_Arithmetic2Bench/config.json.template
diff --git a/73_ArithmeticBench/imgui.ini b/74_Arithmetic2Bench/imgui.ini
similarity index 100%
rename from 73_ArithmeticBench/imgui.ini
rename to 74_Arithmetic2Bench/imgui.ini
diff --git a/73_ArithmeticBench/main.cpp b/74_Arithmetic2Bench/main.cpp
similarity index 68%
rename from 73_ArithmeticBench/main.cpp
rename to 74_Arithmetic2Bench/main.cpp
index d129cfaf9..abbae38fb 100644
--- a/73_ArithmeticBench/main.cpp
+++ b/74_Arithmetic2Bench/main.cpp
@@ -265,9 +265,8 @@ class ArithmeticBenchApp final : public examples::SimpleWindowedApplication, pub
 			return smart_refctd_ptr_static_cast<ICPUShader>(firstAssetInBundle);
 		};
 
-		auto subgroupTestSource = getShaderSource("app_resources/testSubgroup.comp.hlsl");
 		auto subgroupBenchSource = getShaderSource("app_resources/benchmarkSubgroup.comp.hlsl");
-		//auto workgroupTestSource = getShaderSource("app_resources/testWorkgroup.comp.hlsl");
+		auto workgroupBenchSource = getShaderSource("app_resources/benchmarkWorkgroup.comp.hlsl");
 		// now create or retrieve final resources to run our tests
 		sema = m_device->createSemaphore(timelineValue);
 		resultsBuffer = ICPUBuffer::create({ outputBuffers[0]->getSize() });
@@ -280,25 +279,22 @@ class ArithmeticBenchApp final : public examples::SimpleWindowedApplication, pub
 				return false;
 			}
 		}
-		
-		// TODO variable items per invocation?
-		const uint32_t NumLoops = 1000u;
-		const std::array<uint32_t, 3> workgroupSizes = { 256, 512, 1024 };
+
 		// const auto MaxWorkgroupSize = m_physicalDevice->getLimits().maxComputeWorkGroupInvocations;
 		const auto MinSubgroupSize = m_physicalDevice->getLimits().minSubgroupSize;
 		const auto MaxSubgroupSize = m_physicalDevice->getLimits().maxSubgroupSize;
-		
-		if (b_runTests)
-		{
-			runTests(cmdbuf.get(), subgroupTestSource, elementCount, ItemsPerInvocation, MinSubgroupSize, MaxSubgroupSize, workgroupSizes);
-
-			m_logger->log("==========Result==========", ILogger::ELL_INFO);
-			m_logger->log("Fail Count: %u", ILogger::ELL_INFO, totalFailCount);
-		}
 
 		// for each workgroup size (manually adjust items per invoc, operation else uses up a lot of ram)
-		for (uint32_t i = 0; i < workgroupSizes.size(); i++)
-			benchSets[i] = createBenchmarkPipelines<ArithmeticOp>(subgroupBenchSource, benchPplnLayout.get(), elementCount, hlsl::findMSB(MinSubgroupSize), workgroupSizes[i], ItemsPerInvocation, NumLoops);
+		if constexpr (DoWorkgroupBenchmarks)
+		{
+			for (uint32_t i = 0; i < workgroupSizes.size(); i++)
+				benchSets[i] = createBenchmarkPipelines<ArithmeticOp, DoWorkgroupBenchmarks>(workgroupBenchSource, benchPplnLayout.get(), elementCount, hlsl::findMSB(MinSubgroupSize), workgroupSizes[i], ItemsPerInvocation, NumLoops);
+		}
+		else
+		{
+			for (uint32_t i = 0; i < workgroupSizes.size(); i++)
+				benchSets[i] = createBenchmarkPipelines<ArithmeticOp, DoWorkgroupBenchmarks>(subgroupBenchSource, benchPplnLayout.get(), elementCount, hlsl::findMSB(MinSubgroupSize), workgroupSizes[i], ItemsPerInvocation, NumLoops);
+		}
 
 		m_winMgr->show(m_window.get());
 
@@ -399,7 +395,7 @@ class ArithmeticBenchApp final : public examples::SimpleWindowedApplication, pub
 		cmdbuf->bindDescriptorSets(EPBP_COMPUTE, benchSets[0].pipeline->getLayout(), 0u, 1u, &benchDs.get());
 
 		for (uint32_t i = 0; i < benchSets.size(); i++)
-			runBenchmark(cmdbuf, benchSets[i], elementCount, SubgroupSizeLog2);
+			runBenchmark<DoWorkgroupBenchmarks>(cmdbuf, benchSets[0], elementCount, SubgroupSizeLog2);
 
 
 		// blit
@@ -570,40 +566,6 @@ class ArithmeticBenchApp final : public examples::SimpleWindowedApplication, pub
 		}
 	}
 
-	void runTests(IGPUCommandBuffer* cmdbuf, smart_refctd_ptr<ICPUShader> subgroupTestSource, uint32_t elementCount, uint32_t itemsPerInvocation, uint32_t MinSubgroupSize, uint32_t MaxSubgroupSize, const std::array<uint32_t, 3>& workgroupSizes)
-	{
-		for (auto subgroupSize = MinSubgroupSize; subgroupSize <= MaxSubgroupSize; subgroupSize *= 2u)
-		{
-			const uint8_t subgroupSizeLog2 = hlsl::findMSB(subgroupSize);
-			for (const auto& workgroupSize : workgroupSizes)
-			{
-				// make sure renderdoc captures everything for debugging
-				m_api->startCapture();
-				m_logger->log("Testing Workgroup Size %u with Subgroup Size %u", ILogger::ELL_INFO, workgroupSize, subgroupSize);
-
-				bool passed = true;
-				// TODO async the testing
-				passed = runTest<emulatedReduction, false>(cmdbuf, subgroupTestSource, elementCount, subgroupSizeLog2, workgroupSize, ~0u, itemsPerInvocation) && passed;
-				logTestOutcome(passed, workgroupSize);
-				passed = runTest<emulatedScanInclusive, false>(cmdbuf, subgroupTestSource, elementCount, subgroupSizeLog2, workgroupSize, ~0u, itemsPerInvocation) && passed;
-				logTestOutcome(passed, workgroupSize);
-				passed = runTest<emulatedScanExclusive, false>(cmdbuf, subgroupTestSource, elementCount, subgroupSizeLog2, workgroupSize, ~0u, itemsPerInvocation) && passed;
-				logTestOutcome(passed, workgroupSize);
-				//for (uint32_t itemsPerWG = workgroupSize; itemsPerWG > workgroupSize - subgroupSize; itemsPerWG--)
-				//{
-				//	m_logger->log("Testing Item Count %u", ILogger::ELL_INFO, itemsPerWG);
-				//	passed = runTest<emulatedReduction, true>(workgroupTestSource, elementCount, subgroupSizeLog2, workgroupSize, itemsPerWG) && passed;
-				//	logTestOutcome(passed, itemsPerWG);
-				//	passed = runTest<emulatedScanInclusive, true>(workgroupTestSource, elementCount, subgroupSizeLog2, workgroupSize, itemsPerWG) && passed;
-				//	logTestOutcome(passed, itemsPerWG);
-				//	passed = runTest<emulatedScanExclusive, true>(workgroupTestSource, elementCount, subgroupSizeLog2, workgroupSize, itemsPerWG) && passed;
-				//	logTestOutcome(passed, itemsPerWG);
-				//}
-				m_api->endCapture();
-			}
-		}
-	}
-
 	// create pipeline (specialized every test) [TODO: turn into a future/async]
 	smart_refctd_ptr<IGPUComputePipeline> createPipeline(const ICPUShader* overridenUnspecialized, const IGPUPipelineLayout* layout, const uint8_t subgroupSizeLog2)
 	{
@@ -630,15 +592,10 @@ class ArithmeticBenchApp final : public examples::SimpleWindowedApplication, pub
 		uint32_t itemsPerInvocation;
 	};
 
-	template<template<class> class Arithmetic>
+	template<template<class> class Arithmetic, bool WorkgroupBench>
 	BenchmarkSet createBenchmarkPipelines(const smart_refctd_ptr<const ICPUShader>&source, const IGPUPipelineLayout* layout, const uint32_t elementCount, const uint8_t subgroupSizeLog2, const uint32_t workgroupSize, uint32_t itemsPerInvoc = 1u, uint32_t numLoops = 8u)
 	{
-		std::string arith_name = Arithmetic<plus<uint32_t>>::name;	// TODO all operations
-
-		//smart_refctd_ptr<ICPUShader> overridenUnspecialized = CHLSLCompiler::createOverridenCopy(
-		//	source.get(), "#define OPERATION %s\n#define WORKGROUP_SIZE %d\n#define ITEMS_PER_INVOCATION %d\n#define SUBGROUP_SIZE_LOG2 %d\n",
-		//	(("subgroup2::") + arith_name).c_str(), workgroupSize, itemsPerInvoc, subgroupSizeLog2
-		//);
+		std::string arith_name = Arithmetic<plus<uint32_t>>::name;
 
 		auto compiler = make_smart_refctd_ptr<asset::CHLSLCompiler>(smart_refctd_ptr(m_system));
 		CHLSLCompiler::SOptions options = {};
@@ -659,182 +616,78 @@ class ArithmeticBenchApp final : public examples::SimpleWindowedApplication, pub
 		includeFinder->addSearchPath("nbl/builtin/hlsl/jit", core::make_smart_refctd_ptr<CJITIncludeLoader>(m_physicalDevice->getLimits(), m_device->getEnabledFeatures()));
 		options.preprocessorOptions.includeFinder = includeFinder;
 
-		const std::string definitions[5] = { 
-			"subgroup2::" + arith_name,
-			std::to_string(workgroupSize),
-			std::to_string(itemsPerInvoc),
-			std::to_string(subgroupSizeLog2),
-			std::to_string(numLoops)
-		};
-
-		const IShaderCompiler::SMacroDefinition defines[5] = {
-			{ "OPERATION", definitions[0] },
-			{ "WORKGROUP_SIZE", definitions[1] },
-			{ "ITEMS_PER_INVOCATION", definitions[2] },
-			{ "SUBGROUP_SIZE_LOG2", definitions[3] },
-			{ "NUM_LOOPS", definitions[4] },
-		};
-		options.preprocessorOptions.extraDefines = { defines, defines + 5 };
-
-		smart_refctd_ptr<ICPUShader> overridenUnspecialized = compiler->compileToSPIRV((const char*)source->getContent()->getPointer(), options);
-
-		BenchmarkSet set;
-		set.pipeline = createPipeline(overridenUnspecialized.get(), layout, subgroupSizeLog2);
-		set.workgroupSize = workgroupSize;
-		set.itemsPerInvocation = itemsPerInvoc;
-
-		return set;
-	};
-
-	template<template<class> class Arithmetic, bool WorkgroupTest>
-	bool runTest(IGPUCommandBuffer* cmdbuf, const smart_refctd_ptr<const ICPUShader>& source, const uint32_t elementCount, const uint8_t subgroupSizeLog2, const uint32_t workgroupSize, uint32_t itemsPerWG = ~0u, uint32_t itemsPerInvoc = 1u)
-	{
-		std::string arith_name = Arithmetic<bit_xor<uint32_t>>::name;
-
-		smart_refctd_ptr<ICPUShader> overridenUnspecialized;
-		//if constexpr (WorkgroupTest)
-		//{
-		//	overridenUnspecialized = CHLSLCompiler::createOverridenCopy(
-		//		source.get(), "#define OPERATION %s\n#define WORKGROUP_SIZE %d\n#define ITEMS_PER_WG %d\n",
-		//		(("workgroup::") + arith_name).c_str(), workgroupSize, itemsPerWG
-		//	);
-		//}
-		//else
-		//{
-			itemsPerWG = workgroupSize;
-			overridenUnspecialized = CHLSLCompiler::createOverridenCopy(
-				source.get(), "#define OPERATION %s\n#define WORKGROUP_SIZE %d\n#define ITEMS_PER_INVOCATION %d\n#define SUBGROUP_SIZE_LOG2 %d\n",
-				(("subgroup2::") + arith_name).c_str(), workgroupSize, itemsPerInvoc, subgroupSizeLog2
-			);
-		//}
-		auto pipeline = createPipeline(overridenUnspecialized.get(),testPplnLayout.get(), subgroupSizeLog2);
-
-		// TODO: overlap dispatches with memory readbacks (requires multiple copies of `buffers`)
-		const uint32_t workgroupCount = elementCount / (itemsPerWG * itemsPerInvoc);
-		cmdbuf->begin(IGPUCommandBuffer::USAGE::NONE);
-		cmdbuf->bindComputePipeline(pipeline.get());
-		cmdbuf->bindDescriptorSets(EPBP_COMPUTE, pipeline->getLayout(), 0u, 1u, &testDs.get());
-		cmdbuf->dispatch(workgroupCount, 1, 1);
+		const uint32_t subgroupSize = 0x1u << subgroupSizeLog2;
+		const uint32_t itemsPerWG = workgroupSize <= 4 * subgroupSize ? workgroupSize : itemsPerInvoc * max(workgroupSize >> subgroupSizeLog2, subgroupSize) << subgroupSizeLog2;	// TODO use Config somehow
+		smart_refctd_ptr<ICPUShader> overriddenUnspecialized;
+		if constexpr (WorkgroupBench)
 		{
-			IGPUCommandBuffer::SPipelineBarrierDependencyInfo::buffer_barrier_t memoryBarrier[OutputBufferCount];
-			for (auto i=0u; i<OutputBufferCount; i++)
-			{
-				memoryBarrier[i] = {
-					.barrier = {
-						.dep = {
-							.srcStageMask = PIPELINE_STAGE_FLAGS::COMPUTE_SHADER_BIT,
-							.srcAccessMask = ACCESS_FLAGS::SHADER_WRITE_BITS,
-							// in theory we don't need the HOST BITS cause we block on a semaphore but might as well add them
-							.dstStageMask = PIPELINE_STAGE_FLAGS::COMPUTE_SHADER_BIT|PIPELINE_STAGE_FLAGS::HOST_BIT,
-							.dstAccessMask = ACCESS_FLAGS::SHADER_WRITE_BITS|ACCESS_FLAGS::HOST_READ_BIT
-						}
-					},
-					.range = {0ull,outputBuffers[i]->getSize(),outputBuffers[i]}
-				};
-			}
-			IGPUCommandBuffer::SPipelineBarrierDependencyInfo info = {.memBarriers={},.bufBarriers=memoryBarrier};
-			cmdbuf->pipelineBarrier(asset::E_DEPENDENCY_FLAGS::EDF_NONE,info);
-		}
-		cmdbuf->end();
+			const uint32_t workgroupSizeLog2 = hlsl::findMSB(workgroupSize);
+			const std::string definitions[6] = {
+				"workgroup2::" + arith_name,
+				std::to_string(workgroupSizeLog2),
+				std::to_string(itemsPerWG),
+				std::to_string(itemsPerInvoc),
+				std::to_string(subgroupSizeLog2),
+				std::to_string(numLoops)
+			};
 
-		const IQueue::SSubmitInfo::SSemaphoreInfo signal[1] = {{.semaphore=sema.get(),.value=++timelineValue}};
-		const IQueue::SSubmitInfo::SCommandBufferInfo cmdbufs[1] = {{.cmdbuf=cmdbuf}};
-		const IQueue::SSubmitInfo submits[1] = {{.commandBuffers=cmdbufs,.signalSemaphores=signal}};
-		computeQueue->submit(submits);
-		const ISemaphore::SWaitInfo wait[1] = {{.semaphore=sema.get(),.value=timelineValue}};
-		m_device->blockForSemaphores(wait);
-
-		// check results
-		bool passed = validateResults<Arithmetic, bit_and<uint32_t>, WorkgroupTest>(itemsPerWG, workgroupCount, itemsPerInvoc);
-		passed = validateResults<Arithmetic, bit_xor<uint32_t>, WorkgroupTest>(itemsPerWG, workgroupCount, itemsPerInvoc) && passed;
-		passed = validateResults<Arithmetic, bit_or<uint32_t>, WorkgroupTest>(itemsPerWG, workgroupCount, itemsPerInvoc) && passed;
-		passed = validateResults<Arithmetic, plus<uint32_t>, WorkgroupTest>(itemsPerWG, workgroupCount, itemsPerInvoc) && passed;
-		passed = validateResults<Arithmetic, multiplies<uint32_t>, WorkgroupTest>(itemsPerWG, workgroupCount, itemsPerInvoc) && passed;
-		passed = validateResults<Arithmetic, minimum<uint32_t>, WorkgroupTest>(itemsPerWG, workgroupCount, itemsPerInvoc) && passed;
-		passed = validateResults<Arithmetic, maximum<uint32_t>, WorkgroupTest>(itemsPerWG, workgroupCount, itemsPerInvoc) && passed;
-		//if constexpr (WorkgroupTest)
-		//	passed = validateResults<Arithmetic, ballot<uint32_t>, WorkgroupTest>(itemsPerWG, workgroupCount) && passed;
-
-		return passed;
-	}
+			const IShaderCompiler::SMacroDefinition defines[6] = {
+				{ "OPERATION", definitions[0] },
+				{ "WORKGROUP_SIZE_LOG2", definitions[1] },
+				{ "ITEMS_PER_WG", definitions[2] },
+				{ "ITEMS_PER_INVOCATION", definitions[3] },
+				{ "SUBGROUP_SIZE_LOG2", definitions[4] },
+				{ "NUM_LOOPS", definitions[5] }
+			};
+			options.preprocessorOptions.extraDefines = { defines, defines + 6 };
 
-	//returns true if result matches
-	template<template<class> class Arithmetic, class Binop, bool WorkgroupTest>
-	bool validateResults(const uint32_t itemsPerWG, const uint32_t workgroupCount, uint32_t itemsPerInvoc = 1u)
-	{
-		bool success = true;
+			overriddenUnspecialized = compiler->compileToSPIRV((const char*)source->getContent()->getPointer(), options);
+		}
+		else
+		{
+			const std::string definitions[5] = { 
+				"subgroup2::" + arith_name,
+				std::to_string(workgroupSize),
+				std::to_string(itemsPerInvoc),
+				std::to_string(subgroupSizeLog2),
+				std::to_string(numLoops)
+			};
 
-		// download data
-		const SBufferRange<IGPUBuffer> bufferRange = {0u, resultsBuffer->getSize(), outputBuffers[Binop::BindingIndex]};
-		m_utils->downloadBufferRangeViaStagingBufferAutoSubmit(SIntendedSubmitInfo{.queue=transferDownQueue},bufferRange,resultsBuffer->getPointer());
+			const IShaderCompiler::SMacroDefinition defines[5] = {
+				{ "OPERATION", definitions[0] },
+				{ "WORKGROUP_SIZE", definitions[1] },
+				{ "ITEMS_PER_INVOCATION", definitions[2] },
+				{ "SUBGROUP_SIZE_LOG2", definitions[3] },
+				{ "NUM_LOOPS", definitions[4] }
+			};
+			options.preprocessorOptions.extraDefines = { defines, defines + 5 };
 
-		using type_t = typename Binop::type_t;
-		const auto dataFromBuffer = reinterpret_cast<const uint32_t*>(resultsBuffer->getPointer());
-		const auto subgroupSize = dataFromBuffer[0];
-		if (subgroupSize<nbl::hlsl::subgroup::MinSubgroupSize || subgroupSize>nbl::hlsl::subgroup::MaxSubgroupSize)
+			overriddenUnspecialized = compiler->compileToSPIRV((const char*)source->getContent()->getPointer(), options);
+		}
+		
+		BenchmarkSet set;
+		set.pipeline = createPipeline(overriddenUnspecialized.get(), layout, subgroupSizeLog2);
+		if constexpr (WorkgroupBench)
 		{
-			m_logger->log("Unexpected Subgroup Size %u", ILogger::ELL_ERROR, subgroupSize);
-			return false;
+			set.workgroupSize = itemsPerWG;
 		}
-
-		const auto testData = reinterpret_cast<const type_t*>(dataFromBuffer + 1);
-		// TODO: parallel for (the temporary values need to be threadlocal or what?)
-		// now check if the data obtained has valid values
-		type_t* tmp = new type_t[itemsPerWG * itemsPerInvoc];
-		//type_t* ballotInput = new type_t[itemsPerWG];
-		for (uint32_t workgroupID = 0u; success && workgroupID < workgroupCount; workgroupID++)
+		else
 		{
-			const auto workgroupOffset = workgroupID * itemsPerWG * itemsPerInvoc;
-
-			//if constexpr (WorkgroupTest)
-			//{
-			//	if constexpr (std::is_same_v<ballot<type_t>, Binop>)
-			//	{
-			//		for (auto i = 0u; i < itemsPerWG; i++)
-			//			ballotInput[i] = inputData[i + workgroupOffset] & 0x1u;
-			//		Arithmetic<Binop>::impl(tmp, ballotInput, itemsPerWG);
-			//	}
-			//	else
-			//		Arithmetic<Binop>::impl(tmp, inputData + workgroupOffset, itemsPerWG);
-			//}
-			//else
-			//{
-				for (uint32_t pseudoSubgroupID = 0u; pseudoSubgroupID < itemsPerWG; pseudoSubgroupID += subgroupSize)
-					Arithmetic<Binop>::impl(tmp + pseudoSubgroupID * itemsPerInvoc, inputData + workgroupOffset + pseudoSubgroupID * itemsPerInvoc, subgroupSize * itemsPerInvoc);
-			//}
-
-			for (uint32_t localInvocationIndex = 0u; localInvocationIndex < itemsPerWG; localInvocationIndex++)
-			{
-				const auto localOffset = localInvocationIndex * itemsPerInvoc;
-				const auto globalInvocationIndex = workgroupOffset + localOffset;
-
-				for (uint32_t itemInvocationIndex = 0u; itemInvocationIndex < itemsPerInvoc; itemInvocationIndex++)
-				{
-					const auto cpuVal = tmp[localOffset + itemInvocationIndex];
-					const auto gpuVal = testData[globalInvocationIndex + itemInvocationIndex];
-					if (cpuVal != gpuVal)
-					{
-						m_logger->log(
-							"Failed test #%d  (%s)  (%s) Expected %u got %u for workgroup %d and localinvoc %d and iteminvoc %d",
-							ILogger::ELL_ERROR, itemsPerWG, WorkgroupTest ? "workgroup" : "subgroup", Binop::name,
-							cpuVal, gpuVal, workgroupID, localInvocationIndex, itemInvocationIndex
-						);
-						success = false;
-						break;
-					}
-				}
-			}
+			set.workgroupSize = workgroupSize;
 		}
-		//delete[] ballotInput;
-		delete[] tmp;
-
-		return success;
-	}
+		set.itemsPerInvocation = itemsPerInvoc;
 
+		return set;
+	};
 
+	template<bool WorkgroupBench>
 	void runBenchmark(IGPUCommandBuffer* cmdbuf, const BenchmarkSet& set, const uint32_t elementCount, const uint8_t subgroupSizeLog2)
 	{
-		const uint32_t workgroupCount = elementCount / (set.workgroupSize * set.itemsPerInvocation);
+		uint32_t workgroupCount;
+		if constexpr (WorkgroupBench)
+			workgroupCount = elementCount / set.workgroupSize;
+		else
+			workgroupCount = elementCount / (set.workgroupSize * set.itemsPerInvocation);
 
 		cmdbuf->bindComputePipeline(set.pipeline.get());
 		cmdbuf->dispatch(workgroupCount, 1, 1);
@@ -884,12 +737,16 @@ class ArithmeticBenchApp final : public examples::SimpleWindowedApplication, pub
 	constexpr static inline uint32_t MaxNumSubmits = 30;
 	uint32_t numSubmits = 0;
 
+	/* PARAMETERS TO CHANGE FOR DIFFERENT BENCHMARKS */
+	constexpr static inline bool DoWorkgroupBenchmarks = true;
+	uint32_t ItemsPerInvocation = 4u;
+	constexpr static inline uint32_t NumLoops = 1000u;
+	constexpr static inline std::array<uint32_t, 3> workgroupSizes = { 256, 512, 1024 };
 	template<class BinOp>
 	using ArithmeticOp = emulatedReduction<BinOp>;	// change this to test other arithmetic ops
 
-	bool b_runTests = false;
+
 	uint32_t* inputData = nullptr;
-	uint32_t ItemsPerInvocation = 4u;
 	constexpr static inline uint32_t OutputBufferCount = 8u;
 	smart_refctd_ptr<IGPUBuffer> outputBuffers[OutputBufferCount];
 
diff --git a/74a_Workgroup2ScanTest/pipeline.groovy b/74_Arithmetic2Bench/pipeline.groovy
similarity index 100%
rename from 74a_Workgroup2ScanTest/pipeline.groovy
rename to 74_Arithmetic2Bench/pipeline.groovy
diff --git a/CMakeLists.txt b/CMakeLists.txt
index 5d7369560..dc6b74de1 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -91,8 +91,8 @@ if(NBL_BUILD_EXAMPLES)
   	add_subdirectory(70_FLIPFluids EXCLUDE_FROM_ALL)
 	add_subdirectory(71_RayTracingPipeline EXCLUDE_FROM_ALL)
 
-	add_subdirectory(73_ArithmeticBench EXCLUDE_FROM_ALL)
-	add_subdirectory(74a_Workgroup2ScanTest EXCLUDE_FROM_ALL)
+	add_subdirectory(73_Arithmetic2UnitTest EXCLUDE_FROM_ALL)
+	add_subdirectory(74_Arithmetic2Bench EXCLUDE_FROM_ALL)
 
 	NBL_HOOK_COMMON_API("${NBL_COMMON_API_TARGETS}")
 endif()

From d567e716682695ec5ebcdff17e144e25576cd1f0 Mon Sep 17 00:00:00 2001
From: keptsecret <sorchon@gmail.com>
Date: Wed, 7 May 2025 15:16:57 +0700
Subject: [PATCH 231/529] removed obsolete files

---
 .../benchmarkWorkgroup.comp.hlsl              |  97 ----------------
 .../app_resources/testSubgroup.comp.hlsl      |  18 ---
 .../app_resources/testWorkgroup.comp.hlsl     | 107 ------------------
 74_Arithmetic2Bench/imgui.ini                 |   5 -
 4 files changed, 227 deletions(-)
 delete mode 100644 73_Arithmetic2UnitTest/app_resources/benchmarkWorkgroup.comp.hlsl
 delete mode 100644 74_Arithmetic2Bench/app_resources/testSubgroup.comp.hlsl
 delete mode 100644 74_Arithmetic2Bench/app_resources/testWorkgroup.comp.hlsl
 delete mode 100644 74_Arithmetic2Bench/imgui.ini

diff --git a/73_Arithmetic2UnitTest/app_resources/benchmarkWorkgroup.comp.hlsl b/73_Arithmetic2UnitTest/app_resources/benchmarkWorkgroup.comp.hlsl
deleted file mode 100644
index e20e528d7..000000000
--- a/73_Arithmetic2UnitTest/app_resources/benchmarkWorkgroup.comp.hlsl
+++ /dev/null
@@ -1,97 +0,0 @@
-#pragma shader_stage(compute)
-
-#include "workgroupCommon.hlsl"
-
-template<class Config, class Binop>
-struct DataProxy
-{
-    using dtype_t = vector<uint32_t, Config::ItemsPerInvocation_0>;
-    static_assert(nbl::hlsl::is_same_v<dtype_t, type_t>);
-
-    dtype_t get(const uint32_t ix)
-    {
-        // return inputValue[ix];
-        return inputVal;
-    }
-    void set(const uint32_t ix, const dtype_t value)
-    {
-        // output[Binop::BindingIndex].template Store<type_t>(sizeof(uint32_t) + sizeof(type_t) * ix, value);
-        outputVal = value;
-    }
-
-    void workgroupExecutionAndMemoryBarrier()
-    {
-        nbl::hlsl::glsl::barrier();
-        //nbl::hlsl::glsl::memoryBarrierShared(); implied by the above
-    }
-
-    // to avoid multiple load/store in benchmark, also values not that important?
-    dtype_t inputVal;
-    dtype_t outputVal;
-};
-
-static ScratchProxy arithmeticAccessor;
-
-template<class Binop, class device_capabilities>
-struct operation_t
-{
-    using binop_base_t = typename Binop::base_t;
-    using otype_t = typename Binop::type_t;
-
-    otype_t operator()()
-    {
-        DataProxy<config_t,Binop> dataAccessor;
-        dataAccessor.inputVal = inputValue[globalIndex()];
-        nbl::hlsl::OPERATION<config_t,binop_base_t,device_capabilities>::template __call<DataProxy<config_t,Binop>, ScratchProxy<config_t> >(dataAccessor,arithmeticAccessor);
-        // we barrier before because we alias the accessors for Binop
-        arithmeticAccessor.workgroupExecutionAndMemoryBarrier();
-        return dataAccessor.outputVal;
-    }
-};
-
-
-template<template<class> class binop, typename T, uint32_t N>
-static void subtest(NBL_CONST_REF_ARG(type_t) sourceVal)
-{
-    if (globalIndex()==0u)
-        output[binop<T>::BindingIndex].template Store<uint32_t>(0,nbl::hlsl::glsl::gl_SubgroupSize());
-
-    type_t value;
-    operation_t<binop<T>,nbl::hlsl::jit::device_capabilities> func;
-    for (uint32_t i = 0; i < NUM_LOOPS; i++)
-        value = func(); // store is done with data accessor now
-
-    output[binop<T>::BindingIndex].template Store<type_t>(sizeof(uint32_t) + sizeof(type_t) * globalIndex(), value);
-}
-
-
-type_t test()
-{
-    const type_t sourceVal = inputValue[globalIndex()];
-
-    subtest<bit_and, uint32_t, ITEMS_PER_INVOCATION>(sourceVal);
-    subtest<bit_xor, uint32_t, ITEMS_PER_INVOCATION>(sourceVal);
-    subtest<bit_or, uint32_t, ITEMS_PER_INVOCATION>(sourceVal);
-    subtest<plus, uint32_t, ITEMS_PER_INVOCATION>(sourceVal);
-    subtest<multiplies, uint32_t, ITEMS_PER_INVOCATION>(sourceVal);
-    subtest<minimum, uint32_t, ITEMS_PER_INVOCATION>(sourceVal);
-    subtest<maximum, uint32_t, ITEMS_PER_INVOCATION>(sourceVal);
-    return sourceVal;
-}
-
-
-uint32_t globalIndex()
-{
-    return nbl::hlsl::glsl::gl_WorkGroupID().x*ITEMS_PER_WG+nbl::hlsl::workgroup::SubgroupContiguousIndex();
-}
-
-bool canStore()
-{
-    return nbl::hlsl::workgroup::SubgroupContiguousIndex()<ITEMS_PER_WG;
-}
-
-[numthreads(WORKGROUP_SIZE,1,1)]
-void main()
-{
-    const type_t sourceVal = test();
-}
\ No newline at end of file
diff --git a/74_Arithmetic2Bench/app_resources/testSubgroup.comp.hlsl b/74_Arithmetic2Bench/app_resources/testSubgroup.comp.hlsl
deleted file mode 100644
index 2cc1ccb60..000000000
--- a/74_Arithmetic2Bench/app_resources/testSubgroup.comp.hlsl
+++ /dev/null
@@ -1,18 +0,0 @@
-#pragma shader_stage(compute)
-
-#define operation_t nbl::hlsl::OPERATION
-
-#include "shaderCommon.hlsl"
-
-uint32_t globalIndex()
-{
-    return nbl::hlsl::glsl::gl_WorkGroupID().x*WORKGROUP_SIZE+nbl::hlsl::workgroup::SubgroupContiguousIndex();
-}
-
-bool canStore() {return true;}
-
-[numthreads(WORKGROUP_SIZE,1,1)]
-void main()
-{
-    test();
-}
diff --git a/74_Arithmetic2Bench/app_resources/testWorkgroup.comp.hlsl b/74_Arithmetic2Bench/app_resources/testWorkgroup.comp.hlsl
deleted file mode 100644
index 9bafae47f..000000000
--- a/74_Arithmetic2Bench/app_resources/testWorkgroup.comp.hlsl
+++ /dev/null
@@ -1,107 +0,0 @@
-#pragma shader_stage(compute)
-
-
-#include "nbl/builtin/hlsl/workgroup/scratch_size.hlsl"
-
-static const uint32_t ArithmeticSz = nbl::hlsl::workgroup::scratch_size_arithmetic<ITEMS_PER_WG>::value;
-static const uint32_t BallotSz = nbl::hlsl::workgroup::scratch_size_ballot<ITEMS_PER_WG>::value;
-static const uint32_t ScratchSz = ArithmeticSz+BallotSz;
-
-// TODO: Can we make it a static variable in the ScratchProxy struct?
-groupshared uint32_t scratch[ScratchSz];
-
-
-#include "nbl/builtin/hlsl/workgroup/arithmetic.hlsl"
-
-
-template<uint16_t offset>
-struct ScratchProxy
-{
-	void get(const uint32_t ix, NBL_REF_ARG(uint32_t) value)
-	{
-		value = scratch[ix+offset];
-	}
-	void set(const uint32_t ix, const uint32_t value)
-	{
-		scratch[ix+offset] = value;
-	}
-
-	uint32_t atomicOr(const uint32_t ix, const uint32_t value)
-	{
-		return nbl::hlsl::glsl::atomicOr(scratch[ix],value);
-	}
-
-	void workgroupExecutionAndMemoryBarrier()
-	{
-		nbl::hlsl::glsl::barrier();
-		//nbl::hlsl::glsl::memoryBarrierShared(); implied by the above
-	}
-};
-
-static ScratchProxy<0> arithmeticAccessor;
-
-
-#include "nbl/builtin/hlsl/workgroup/broadcast.hlsl"
-
-
-template<class Binop, class device_capabilities>
-struct operation_t
-{
-	using type_t = typename Binop::type_t;
-
-	type_t operator()(type_t value)
-	{
-		type_t retval = nbl::hlsl::OPERATION<Binop,ITEMS_PER_WG,device_capabilities>::template __call<ScratchProxy<0> >(value,arithmeticAccessor);
-		// we barrier before because we alias the accessors for Binop
-		arithmeticAccessor.workgroupExecutionAndMemoryBarrier();
-		return retval;
-	}
-};
-
-
-#include "shaderCommon.hlsl"
-
-static ScratchProxy<ArithmeticSz> ballotAccessor;
-
-
-uint32_t globalIndex()
-{
-	return nbl::hlsl::glsl::gl_WorkGroupID().x*ITEMS_PER_WG+nbl::hlsl::workgroup::SubgroupContiguousIndex();
-}
-
-bool canStore()
-{
-	return nbl::hlsl::workgroup::SubgroupContiguousIndex()<ITEMS_PER_WG;
-}
-
-[numthreads(WORKGROUP_SIZE,1,1)]
-void main()
-{
-	const type_t sourceVal = test();
-	if (globalIndex()==0u)
-		output[ballot<type_t>::BindingIndex].template Store<uint32_t>(0,nbl::hlsl::glsl::gl_SubgroupSize());
-
-	// we can only ballot booleans, so low bit
-	nbl::hlsl::workgroup::ballot<ScratchProxy<ArithmeticSz> >(bool(sourceVal & 0x1u), ballotAccessor);
-	// need to barrier between ballot and usages of a ballot by myself
-	ballotAccessor.workgroupExecutionAndMemoryBarrier();
-
-	uint32_t destVal = 0xdeadbeefu;
-#define CONSTEXPR_OP_TYPE_TEST(IS_OP) nbl::hlsl::is_same<nbl::hlsl::OPERATION<nbl::hlsl::bit_xor<float>,0x45>,nbl::hlsl::workgroup::IS_OP<nbl::hlsl::bit_xor<float>,0x45> >::value
-#define BALLOT_TEMPLATE_ARGS ITEMS_PER_WG,decltype(ballotAccessor),decltype(arithmeticAccessor),nbl::hlsl::jit::device_capabilities
-	if (CONSTEXPR_OP_TYPE_TEST(reduction))
-		destVal = nbl::hlsl::workgroup::ballotBitCount<BALLOT_TEMPLATE_ARGS>(ballotAccessor,arithmeticAccessor);
-	else if (CONSTEXPR_OP_TYPE_TEST(inclusive_scan))
-		destVal = nbl::hlsl::workgroup::ballotInclusiveBitCount<BALLOT_TEMPLATE_ARGS>(ballotAccessor,arithmeticAccessor);
-	else if (CONSTEXPR_OP_TYPE_TEST(exclusive_scan))
-		destVal = nbl::hlsl::workgroup::ballotExclusiveBitCount<BALLOT_TEMPLATE_ARGS>(ballotAccessor,arithmeticAccessor);
-	else
-	{
-		assert(false);
-	}
-#undef BALLOT_TEMPLATE_ARGS
-#undef CONSTEXPR_OP_TYPE_TEST
-
-	if (canStore())
-		output[ballot<type_t>::BindingIndex].template Store<type_t>(sizeof(uint32_t)+sizeof(type_t)*globalIndex(),destVal);
-}
\ No newline at end of file
diff --git a/74_Arithmetic2Bench/imgui.ini b/74_Arithmetic2Bench/imgui.ini
deleted file mode 100644
index 4a5c20148..000000000
--- a/74_Arithmetic2Bench/imgui.ini
+++ /dev/null
@@ -1,5 +0,0 @@
-[Window][Debug##Default]
-Pos=60,60
-Size=400,400
-Collapsed=0
-

From 54acf2a433f3d5d4abaedfc8b0a33a435b45977e Mon Sep 17 00:00:00 2001
From: keptsecret <sorchon@gmail.com>
Date: Wed, 7 May 2025 15:26:03 +0700
Subject: [PATCH 232/529] replaced old ex 23 unit test with new tests

---
 .../CMakeLists.txt                            |   0
 .../app_resources/common.hlsl                 |   0
 .../app_resources/shaderCommon.hlsl           |   0
 .../app_resources/testSubgroup.comp.hlsl      |   0
 .../app_resources/testWorkgroup.comp.hlsl     |   0
 .../app_resources/workgroupCommon.hlsl        |   0
 .../config.json.template                      |   0
 .../main.cpp                                  |   0
 .../pipeline.groovy                           |   0
 .../app_resources/shaderCommon.hlsl           |  55 ---
 .../app_resources/testSubgroup.comp.hlsl      |  18 -
 .../app_resources/testWorkgroup.comp.hlsl     | 107 ----
 23_ArithmeticUnitTest/main.cpp                | 462 ------------------
 73_Arithmetic2UnitTest/CMakeLists.txt         |  25 -
 .../app_resources/common.hlsl                 |  96 ----
 73_Arithmetic2UnitTest/config.json.template   |  28 --
 73_Arithmetic2UnitTest/pipeline.groovy        |  50 --
 17 files changed, 841 deletions(-)
 rename {23_ArithmeticUnitTest => 23_Arithmetic2UnitTest}/CMakeLists.txt (100%)
 rename {23_ArithmeticUnitTest => 23_Arithmetic2UnitTest}/app_resources/common.hlsl (100%)
 rename {73_Arithmetic2UnitTest => 23_Arithmetic2UnitTest}/app_resources/shaderCommon.hlsl (100%)
 rename {73_Arithmetic2UnitTest => 23_Arithmetic2UnitTest}/app_resources/testSubgroup.comp.hlsl (100%)
 rename {73_Arithmetic2UnitTest => 23_Arithmetic2UnitTest}/app_resources/testWorkgroup.comp.hlsl (100%)
 rename {73_Arithmetic2UnitTest => 23_Arithmetic2UnitTest}/app_resources/workgroupCommon.hlsl (100%)
 rename {23_ArithmeticUnitTest => 23_Arithmetic2UnitTest}/config.json.template (100%)
 rename {73_Arithmetic2UnitTest => 23_Arithmetic2UnitTest}/main.cpp (100%)
 rename {23_ArithmeticUnitTest => 23_Arithmetic2UnitTest}/pipeline.groovy (100%)
 delete mode 100644 23_ArithmeticUnitTest/app_resources/shaderCommon.hlsl
 delete mode 100644 23_ArithmeticUnitTest/app_resources/testSubgroup.comp.hlsl
 delete mode 100644 23_ArithmeticUnitTest/app_resources/testWorkgroup.comp.hlsl
 delete mode 100644 23_ArithmeticUnitTest/main.cpp
 delete mode 100644 73_Arithmetic2UnitTest/CMakeLists.txt
 delete mode 100644 73_Arithmetic2UnitTest/app_resources/common.hlsl
 delete mode 100644 73_Arithmetic2UnitTest/config.json.template
 delete mode 100644 73_Arithmetic2UnitTest/pipeline.groovy

diff --git a/23_ArithmeticUnitTest/CMakeLists.txt b/23_Arithmetic2UnitTest/CMakeLists.txt
similarity index 100%
rename from 23_ArithmeticUnitTest/CMakeLists.txt
rename to 23_Arithmetic2UnitTest/CMakeLists.txt
diff --git a/23_ArithmeticUnitTest/app_resources/common.hlsl b/23_Arithmetic2UnitTest/app_resources/common.hlsl
similarity index 100%
rename from 23_ArithmeticUnitTest/app_resources/common.hlsl
rename to 23_Arithmetic2UnitTest/app_resources/common.hlsl
diff --git a/73_Arithmetic2UnitTest/app_resources/shaderCommon.hlsl b/23_Arithmetic2UnitTest/app_resources/shaderCommon.hlsl
similarity index 100%
rename from 73_Arithmetic2UnitTest/app_resources/shaderCommon.hlsl
rename to 23_Arithmetic2UnitTest/app_resources/shaderCommon.hlsl
diff --git a/73_Arithmetic2UnitTest/app_resources/testSubgroup.comp.hlsl b/23_Arithmetic2UnitTest/app_resources/testSubgroup.comp.hlsl
similarity index 100%
rename from 73_Arithmetic2UnitTest/app_resources/testSubgroup.comp.hlsl
rename to 23_Arithmetic2UnitTest/app_resources/testSubgroup.comp.hlsl
diff --git a/73_Arithmetic2UnitTest/app_resources/testWorkgroup.comp.hlsl b/23_Arithmetic2UnitTest/app_resources/testWorkgroup.comp.hlsl
similarity index 100%
rename from 73_Arithmetic2UnitTest/app_resources/testWorkgroup.comp.hlsl
rename to 23_Arithmetic2UnitTest/app_resources/testWorkgroup.comp.hlsl
diff --git a/73_Arithmetic2UnitTest/app_resources/workgroupCommon.hlsl b/23_Arithmetic2UnitTest/app_resources/workgroupCommon.hlsl
similarity index 100%
rename from 73_Arithmetic2UnitTest/app_resources/workgroupCommon.hlsl
rename to 23_Arithmetic2UnitTest/app_resources/workgroupCommon.hlsl
diff --git a/23_ArithmeticUnitTest/config.json.template b/23_Arithmetic2UnitTest/config.json.template
similarity index 100%
rename from 23_ArithmeticUnitTest/config.json.template
rename to 23_Arithmetic2UnitTest/config.json.template
diff --git a/73_Arithmetic2UnitTest/main.cpp b/23_Arithmetic2UnitTest/main.cpp
similarity index 100%
rename from 73_Arithmetic2UnitTest/main.cpp
rename to 23_Arithmetic2UnitTest/main.cpp
diff --git a/23_ArithmeticUnitTest/pipeline.groovy b/23_Arithmetic2UnitTest/pipeline.groovy
similarity index 100%
rename from 23_ArithmeticUnitTest/pipeline.groovy
rename to 23_Arithmetic2UnitTest/pipeline.groovy
diff --git a/23_ArithmeticUnitTest/app_resources/shaderCommon.hlsl b/23_ArithmeticUnitTest/app_resources/shaderCommon.hlsl
deleted file mode 100644
index 13ee8d21e..000000000
--- a/23_ArithmeticUnitTest/app_resources/shaderCommon.hlsl
+++ /dev/null
@@ -1,55 +0,0 @@
-#include "common.hlsl"
-
-#include "nbl/builtin/hlsl/glsl_compat/core.hlsl"
-#include "nbl/builtin/hlsl/subgroup/basic.hlsl"
-#include "nbl/builtin/hlsl/subgroup/arithmetic_portability.hlsl"
-
-#include "nbl/builtin/hlsl/jit/device_capabilities.hlsl"
-
-// https://github.com/microsoft/DirectXShaderCompiler/issues/6144
-uint32_t3 nbl::hlsl::glsl::gl_WorkGroupSize() {return uint32_t3(WORKGROUP_SIZE,1,1);}
-
-// unfortunately DXC chokes on descriptors as static members
-// https://github.com/microsoft/DirectXShaderCompiler/issues/5940
-[[vk::binding(0, 0)]] StructuredBuffer<uint32_t> inputValue;
-[[vk::binding(1, 0)]] RWByteAddressBuffer output[8];
-
-// because subgroups don't match `gl_LocalInvocationIndex` snake curve addressing, we also can't load inputs that way
-uint32_t globalIndex();
-// since we test ITEMS_PER_WG<WorkgroupSize we need this so workgroups don't overwrite each other's outputs
-bool canStore();
-
-//typedef decltype(inputValue[0]) type_t;
-typedef uint32_t type_t;
-
-
-#ifndef OPERATION
-#error "Define OPERATION!"
-#endif
-template<template<class> class binop>
-static void subtest(NBL_CONST_REF_ARG(type_t) sourceVal)
-{
-	if (globalIndex()==0u)
-		output[binop<type_t>::BindingIndex].template Store<uint32_t>(0,nbl::hlsl::glsl::gl_SubgroupSize());
-		
-	operation_t<typename binop<type_t>::base_t,nbl::hlsl::jit::device_capabilities> func;
-	if (canStore())
-		output[binop<type_t>::BindingIndex].template Store<type_t>(sizeof(uint32_t)+sizeof(type_t)*globalIndex(),func(sourceVal));
-}
-
-
-type_t test()
-{
-	const type_t sourceVal = inputValue[globalIndex()];
-
-	subtest<bit_and>(sourceVal);
-	subtest<bit_xor>(sourceVal);
-	subtest<bit_or>(sourceVal);
-	subtest<plus>(sourceVal);
-	subtest<multiplies>(sourceVal);
-	subtest<minimum>(sourceVal);
-	subtest<maximum>(sourceVal);
-	return sourceVal;
-}
-
-#include "nbl/builtin/hlsl/workgroup/basic.hlsl"
\ No newline at end of file
diff --git a/23_ArithmeticUnitTest/app_resources/testSubgroup.comp.hlsl b/23_ArithmeticUnitTest/app_resources/testSubgroup.comp.hlsl
deleted file mode 100644
index 479265d73..000000000
--- a/23_ArithmeticUnitTest/app_resources/testSubgroup.comp.hlsl
+++ /dev/null
@@ -1,18 +0,0 @@
-#pragma shader_stage(compute)
-
-#define operation_t nbl::hlsl::OPERATION
-
-#include "shaderCommon.hlsl"
-
-uint32_t globalIndex()
-{
-	return nbl::hlsl::glsl::gl_WorkGroupID().x*WORKGROUP_SIZE+nbl::hlsl::workgroup::SubgroupContiguousIndex();
-}
-
-bool canStore() {return true;}
-
-[numthreads(WORKGROUP_SIZE,1,1)]
-void main()
-{
-	test();
-}
\ No newline at end of file
diff --git a/23_ArithmeticUnitTest/app_resources/testWorkgroup.comp.hlsl b/23_ArithmeticUnitTest/app_resources/testWorkgroup.comp.hlsl
deleted file mode 100644
index 9bafae47f..000000000
--- a/23_ArithmeticUnitTest/app_resources/testWorkgroup.comp.hlsl
+++ /dev/null
@@ -1,107 +0,0 @@
-#pragma shader_stage(compute)
-
-
-#include "nbl/builtin/hlsl/workgroup/scratch_size.hlsl"
-
-static const uint32_t ArithmeticSz = nbl::hlsl::workgroup::scratch_size_arithmetic<ITEMS_PER_WG>::value;
-static const uint32_t BallotSz = nbl::hlsl::workgroup::scratch_size_ballot<ITEMS_PER_WG>::value;
-static const uint32_t ScratchSz = ArithmeticSz+BallotSz;
-
-// TODO: Can we make it a static variable in the ScratchProxy struct?
-groupshared uint32_t scratch[ScratchSz];
-
-
-#include "nbl/builtin/hlsl/workgroup/arithmetic.hlsl"
-
-
-template<uint16_t offset>
-struct ScratchProxy
-{
-	void get(const uint32_t ix, NBL_REF_ARG(uint32_t) value)
-	{
-		value = scratch[ix+offset];
-	}
-	void set(const uint32_t ix, const uint32_t value)
-	{
-		scratch[ix+offset] = value;
-	}
-
-	uint32_t atomicOr(const uint32_t ix, const uint32_t value)
-	{
-		return nbl::hlsl::glsl::atomicOr(scratch[ix],value);
-	}
-
-	void workgroupExecutionAndMemoryBarrier()
-	{
-		nbl::hlsl::glsl::barrier();
-		//nbl::hlsl::glsl::memoryBarrierShared(); implied by the above
-	}
-};
-
-static ScratchProxy<0> arithmeticAccessor;
-
-
-#include "nbl/builtin/hlsl/workgroup/broadcast.hlsl"
-
-
-template<class Binop, class device_capabilities>
-struct operation_t
-{
-	using type_t = typename Binop::type_t;
-
-	type_t operator()(type_t value)
-	{
-		type_t retval = nbl::hlsl::OPERATION<Binop,ITEMS_PER_WG,device_capabilities>::template __call<ScratchProxy<0> >(value,arithmeticAccessor);
-		// we barrier before because we alias the accessors for Binop
-		arithmeticAccessor.workgroupExecutionAndMemoryBarrier();
-		return retval;
-	}
-};
-
-
-#include "shaderCommon.hlsl"
-
-static ScratchProxy<ArithmeticSz> ballotAccessor;
-
-
-uint32_t globalIndex()
-{
-	return nbl::hlsl::glsl::gl_WorkGroupID().x*ITEMS_PER_WG+nbl::hlsl::workgroup::SubgroupContiguousIndex();
-}
-
-bool canStore()
-{
-	return nbl::hlsl::workgroup::SubgroupContiguousIndex()<ITEMS_PER_WG;
-}
-
-[numthreads(WORKGROUP_SIZE,1,1)]
-void main()
-{
-	const type_t sourceVal = test();
-	if (globalIndex()==0u)
-		output[ballot<type_t>::BindingIndex].template Store<uint32_t>(0,nbl::hlsl::glsl::gl_SubgroupSize());
-
-	// we can only ballot booleans, so low bit
-	nbl::hlsl::workgroup::ballot<ScratchProxy<ArithmeticSz> >(bool(sourceVal & 0x1u), ballotAccessor);
-	// need to barrier between ballot and usages of a ballot by myself
-	ballotAccessor.workgroupExecutionAndMemoryBarrier();
-
-	uint32_t destVal = 0xdeadbeefu;
-#define CONSTEXPR_OP_TYPE_TEST(IS_OP) nbl::hlsl::is_same<nbl::hlsl::OPERATION<nbl::hlsl::bit_xor<float>,0x45>,nbl::hlsl::workgroup::IS_OP<nbl::hlsl::bit_xor<float>,0x45> >::value
-#define BALLOT_TEMPLATE_ARGS ITEMS_PER_WG,decltype(ballotAccessor),decltype(arithmeticAccessor),nbl::hlsl::jit::device_capabilities
-	if (CONSTEXPR_OP_TYPE_TEST(reduction))
-		destVal = nbl::hlsl::workgroup::ballotBitCount<BALLOT_TEMPLATE_ARGS>(ballotAccessor,arithmeticAccessor);
-	else if (CONSTEXPR_OP_TYPE_TEST(inclusive_scan))
-		destVal = nbl::hlsl::workgroup::ballotInclusiveBitCount<BALLOT_TEMPLATE_ARGS>(ballotAccessor,arithmeticAccessor);
-	else if (CONSTEXPR_OP_TYPE_TEST(exclusive_scan))
-		destVal = nbl::hlsl::workgroup::ballotExclusiveBitCount<BALLOT_TEMPLATE_ARGS>(ballotAccessor,arithmeticAccessor);
-	else
-	{
-		assert(false);
-	}
-#undef BALLOT_TEMPLATE_ARGS
-#undef CONSTEXPR_OP_TYPE_TEST
-
-	if (canStore())
-		output[ballot<type_t>::BindingIndex].template Store<type_t>(sizeof(uint32_t)+sizeof(type_t)*globalIndex(),destVal);
-}
\ No newline at end of file
diff --git a/23_ArithmeticUnitTest/main.cpp b/23_ArithmeticUnitTest/main.cpp
deleted file mode 100644
index 147d231e2..000000000
--- a/23_ArithmeticUnitTest/main.cpp
+++ /dev/null
@@ -1,462 +0,0 @@
-#include "nbl/application_templates/BasicMultiQueueApplication.hpp"
-#include "nbl/application_templates/MonoAssetManagerAndBuiltinResourceApplication.hpp"
-#include "app_resources/common.hlsl"
-
-using namespace nbl;
-using namespace core;
-using namespace asset;
-using namespace system;
-using namespace video;
-
-// method emulations on the CPU, to verify the results of the GPU methods
-template<class Binop>
-struct emulatedReduction
-{
-	using type_t = typename Binop::type_t;
-
-	static inline void impl(type_t* out, const type_t* in, const uint32_t itemCount)
-	{
-		const type_t red = std::reduce(in,in+itemCount,Binop::identity,Binop());
-		std::fill(out,out+itemCount,red);
-	}
-
-	static inline constexpr const char* name = "reduction";
-};
-template<class Binop>
-struct emulatedScanInclusive
-{
-	using type_t = typename Binop::type_t;
-
-	static inline void impl(type_t* out, const type_t* in, const uint32_t itemCount)
-	{
-		std::inclusive_scan(in,in+itemCount,out,Binop());
-	}
-	static inline constexpr const char* name = "inclusive_scan";
-};
-template<class Binop>
-struct emulatedScanExclusive
-{
-	using type_t = typename Binop::type_t;
-
-	static inline void impl(type_t* out, const type_t* in, const uint32_t itemCount)
-	{
-		std::exclusive_scan(in,in+itemCount,out,Binop::identity,Binop());
-	}
-	static inline constexpr const char* name = "exclusive_scan";
-};
-
-class ArithmeticUnitTestApp final : public application_templates::BasicMultiQueueApplication, public application_templates::MonoAssetManagerAndBuiltinResourceApplication
-{
-	using device_base_t = application_templates::BasicMultiQueueApplication;
-	using asset_base_t = application_templates::MonoAssetManagerAndBuiltinResourceApplication;
-
-public:
-	ArithmeticUnitTestApp(const path& _localInputCWD, const path& _localOutputCWD, const path& _sharedInputCWD, const path& _sharedOutputCWD) :
-		system::IApplicationFramework(_localInputCWD, _localOutputCWD, _sharedInputCWD, _sharedOutputCWD) {}
-
-	bool onAppInitialized(smart_refctd_ptr<ISystem>&& system) override
-	{
-		if (!device_base_t::onAppInitialized(std::move(system)))
-			return false;
-		if (!asset_base_t::onAppInitialized(std::move(system)))
-			return false;
-
-		transferDownQueue = getTransferDownQueue();
-		computeQueue = getComputeQueue();
-
-		// TODO: get the element count from argv
-		const uint32_t elementCount = Output<>::ScanElementCount;
-		// populate our random data buffer on the CPU and create a GPU copy
-		inputData = new uint32_t[elementCount];
-		smart_refctd_ptr<IGPUBuffer> gpuinputDataBuffer;
-		{
-			std::mt19937 randGenerator(0xdeadbeefu);
-			for (uint32_t i = 0u; i < elementCount; i++)
-				inputData[i] = randGenerator(); // TODO: change to using xoroshiro, then we can skip having the input buffer at all
-
-			IGPUBuffer::SCreationParams inputDataBufferCreationParams = {};
-			inputDataBufferCreationParams.size = sizeof(Output<>::data[0]) * elementCount;
-			inputDataBufferCreationParams.usage = IGPUBuffer::EUF_STORAGE_BUFFER_BIT | IGPUBuffer::EUF_TRANSFER_DST_BIT;
-			m_utils->createFilledDeviceLocalBufferOnDedMem(
-				SIntendedSubmitInfo{.queue=getTransferUpQueue()},
-				std::move(inputDataBufferCreationParams),
-				inputData
-			).move_into(gpuinputDataBuffer);
-		}
-
-		// create 8 buffers for 8 operations
-		for (auto i=0u; i<OutputBufferCount; i++)
-		{
-			IGPUBuffer::SCreationParams params = {};
-			params.size = sizeof(uint32_t) + gpuinputDataBuffer->getSize();
-			params.usage = bitflag(IGPUBuffer::EUF_STORAGE_BUFFER_BIT) | IGPUBuffer::EUF_TRANSFER_SRC_BIT;
-
-			outputBuffers[i] = m_device->createBuffer(std::move(params));
-			auto mreq = outputBuffers[i]->getMemoryReqs();
-			mreq.memoryTypeBits &= m_physicalDevice->getDeviceLocalMemoryTypeBits();
-			assert(mreq.memoryTypeBits);
-
-			auto bufferMem = m_device->allocate(mreq, outputBuffers[i].get());
-			assert(bufferMem.isValid());
-		}
-
-		// create Descriptor Set and Pipeline Layout
-		{
-			// create Descriptor Set Layout
-			smart_refctd_ptr<IGPUDescriptorSetLayout> dsLayout;
-			{
-				IGPUDescriptorSetLayout::SBinding binding[2];
-				for (uint32_t i = 0u; i < 2; i++)
-					binding[i] = {{},i,IDescriptor::E_TYPE::ET_STORAGE_BUFFER,IGPUDescriptorSetLayout::SBinding::E_CREATE_FLAGS::ECF_NONE,IShader::E_SHADER_STAGE::ESS_COMPUTE,1u,nullptr };
-				binding[1].count = OutputBufferCount;
-				dsLayout = m_device->createDescriptorSetLayout(binding);
-			}
-
-			// set and transient pool
-			auto descPool = m_device->createDescriptorPoolForDSLayouts(IDescriptorPool::ECF_NONE,{&dsLayout.get(),1});
-			descriptorSet = descPool->createDescriptorSet(smart_refctd_ptr(dsLayout));
-			{
-				IGPUDescriptorSet::SDescriptorInfo infos[1+OutputBufferCount];
-				infos[0].desc = gpuinputDataBuffer;
-				infos[0].info.buffer = { 0u,gpuinputDataBuffer->getSize() };
-				for (uint32_t i = 1u; i <= OutputBufferCount; i++)
-				{
-					auto buff = outputBuffers[i - 1];
-					infos[i].info.buffer = { 0u,buff->getSize() };
-					infos[i].desc = std::move(buff); // save an atomic in the refcount
-
-				}
-
-				IGPUDescriptorSet::SWriteDescriptorSet writes[2];
-				for (uint32_t i=0u; i<2; i++)
-					writes[i] = {descriptorSet.get(),i,0u,1u,infos+i};
-				writes[1].count = OutputBufferCount;
-
-				m_device->updateDescriptorSets(2, writes, 0u, nullptr);
-			}
-
-			pipelineLayout = m_device->createPipelineLayout({},std::move(dsLayout));
-		}
-
-		const auto spirv_isa_cache_path = localOutputCWD/"spirv_isa_cache.bin";
-		// enclose to make sure file goes out of scope and we can reopen it
-		{
-			smart_refctd_ptr<const IFile> spirv_isa_cache_input;
-			// try to load SPIR-V to ISA cache
-			{
-				ISystem::future_t<smart_refctd_ptr<IFile>> fileCreate;
-				m_system->createFile(fileCreate,spirv_isa_cache_path,IFile::ECF_READ|IFile::ECF_MAPPABLE|IFile::ECF_COHERENT);
-				if (auto lock=fileCreate.acquire())
-					spirv_isa_cache_input = *lock;
-			}
-			// create the cache
-			{
-				std::span<const uint8_t> spirv_isa_cache_data = {};
-				if (spirv_isa_cache_input)
-					spirv_isa_cache_data = {reinterpret_cast<const uint8_t*>(spirv_isa_cache_input->getMappedPointer()),spirv_isa_cache_input->getSize()};
-				else
-					m_logger->log("Failed to load SPIR-V 2 ISA cache!",ILogger::ELL_PERFORMANCE);
-				// Normally we'd deserialize a `ICPUPipelineCache` properly and pass that instead
-				m_spirv_isa_cache = m_device->createPipelineCache(spirv_isa_cache_data);
-			}
-		}
-		{
-			// TODO: rename `deleteDirectory` to just `delete`? and a `IFile::setSize()` ?
-			m_system->deleteDirectory(spirv_isa_cache_path);
-			ISystem::future_t<smart_refctd_ptr<IFile>> fileCreate;
-			m_system->createFile(fileCreate,spirv_isa_cache_path,IFile::ECF_WRITE);
-			// I can be relatively sure I'll succeed to acquire the future, the pointer to created file might be null though.
-			m_spirv_isa_cache_output=*fileCreate.acquire();
-			if (!m_spirv_isa_cache_output)
-				logFail("Failed to Create SPIR-V to ISA cache file.");
-		}
-
-		// load shader source from file
-		auto getShaderSource = [&](const char* filePath) -> auto
-		{
-			IAssetLoader::SAssetLoadParams lparams = {};
-			lparams.logger = m_logger.get();
-			lparams.workingDirectory = "";
-			auto bundle = m_assetMgr->getAsset(filePath, lparams);
-			if (bundle.getContents().empty() || bundle.getAssetType()!=IAsset::ET_SHADER)
-			{
-				m_logger->log("Shader %s not found!", ILogger::ELL_ERROR, filePath);
-				exit(-1);
-			}
-			auto firstAssetInBundle = bundle.getContents()[0];
-			return smart_refctd_ptr_static_cast<ICPUShader>(firstAssetInBundle);
-		};
-
-		auto subgroupTestSource = getShaderSource("app_resources/testSubgroup.comp.hlsl");
-		auto workgroupTestSource = getShaderSource("app_resources/testWorkgroup.comp.hlsl");
-		// now create or retrieve final resources to run our tests
-		sema = m_device->createSemaphore(timelineValue);
-		resultsBuffer = ICPUBuffer::create({ outputBuffers[0]->getSize() });
-		{
-			smart_refctd_ptr<nbl::video::IGPUCommandPool> cmdpool = m_device->createCommandPool(computeQueue->getFamilyIndex(),IGPUCommandPool::CREATE_FLAGS::RESET_COMMAND_BUFFER_BIT);
-			if (!cmdpool->createCommandBuffers(IGPUCommandPool::BUFFER_LEVEL::PRIMARY,{&cmdbuf,1}))
-			{
-				logFail("Failed to create Command Buffers!\n");
-				return false;
-			}
-		}
-
-		const auto MaxWorkgroupSize = m_physicalDevice->getLimits().maxComputeWorkGroupInvocations;
-		const auto MinSubgroupSize = m_physicalDevice->getLimits().minSubgroupSize;
-		const auto MaxSubgroupSize = m_physicalDevice->getLimits().maxSubgroupSize;
-		for (auto subgroupSize=MinSubgroupSize; subgroupSize <= MaxSubgroupSize; subgroupSize *= 2u)
-		{
-			const uint8_t subgroupSizeLog2 = hlsl::findMSB(subgroupSize);
-			for (uint32_t workgroupSize = subgroupSize; workgroupSize <= MaxWorkgroupSize; workgroupSize += subgroupSize)
-			{
-				// make sure renderdoc captures everything for debugging
-				m_api->startCapture();
-				m_logger->log("Testing Workgroup Size %u with Subgroup Size %u", ILogger::ELL_INFO, workgroupSize, subgroupSize);
-
-				bool passed = true;
-				// TODO async the testing
-				passed = runTest<emulatedReduction, false>(subgroupTestSource, elementCount, subgroupSizeLog2, workgroupSize) && passed;
-				logTestOutcome(passed, workgroupSize);
-				passed = runTest<emulatedScanInclusive, false>(subgroupTestSource, elementCount, subgroupSizeLog2, workgroupSize) && passed;
-				logTestOutcome(passed, workgroupSize);
-				passed = runTest<emulatedScanExclusive, false>(subgroupTestSource, elementCount, subgroupSizeLog2, workgroupSize) && passed;
-				logTestOutcome(passed, workgroupSize);
-				for (uint32_t itemsPerWG = workgroupSize; itemsPerWG > workgroupSize - subgroupSize; itemsPerWG--)
-				{
-					m_logger->log("Testing Item Count %u", ILogger::ELL_INFO, itemsPerWG);
-					passed = runTest<emulatedReduction, true>(workgroupTestSource, elementCount, subgroupSizeLog2, workgroupSize, itemsPerWG) && passed;
-					logTestOutcome(passed, itemsPerWG);
-					passed = runTest<emulatedScanInclusive, true>(workgroupTestSource, elementCount, subgroupSizeLog2, workgroupSize, itemsPerWG) && passed;
-					logTestOutcome(passed, itemsPerWG);
-					passed = runTest<emulatedScanExclusive, true>(workgroupTestSource, elementCount, subgroupSizeLog2, workgroupSize, itemsPerWG) && passed;
-					logTestOutcome(passed, itemsPerWG);
-				}
-				m_api->endCapture();
-
-				// save cache every now and then	
-				{
-					auto cpu = m_spirv_isa_cache->convertToCPUCache();
-					// Normally we'd beautifully JSON serialize the thing, allow multiple devices & drivers + metadata
-					auto bin = cpu->getEntries().begin()->second.bin;
-					IFile::success_t success;
-					m_spirv_isa_cache_output->write(success,bin->data(),0ull,bin->size());
-					if (!success)
-						logFail("Could not write Create SPIR-V to ISA cache to disk!");
-				}
-			}
-		}
-
-		return true;
-	}
-
-	virtual bool onAppTerminated() override
-	{
-		m_logger->log("==========Result==========", ILogger::ELL_INFO);
-		m_logger->log("Fail Count: %u", ILogger::ELL_INFO, totalFailCount);
-		delete[] inputData;
-		return true;
-	}
-
-	// the unit test is carried out on init
-	void workLoopBody() override {}
-
-	//
-	bool keepRunning() override { return false; }
-
-private:
-	void logTestOutcome(bool passed, uint32_t workgroupSize)
-	{
-		if (passed)
-			m_logger->log("Passed test #%u", ILogger::ELL_INFO, workgroupSize);
-		else
-		{
-			totalFailCount++;
-			m_logger->log("Failed test #%u", ILogger::ELL_ERROR, workgroupSize);
-		}
-	}
-
-	// create pipeline (specialized every test) [TODO: turn into a future/async]
-	smart_refctd_ptr<IGPUComputePipeline> createPipeline(const ICPUShader* overridenUnspecialized, const uint8_t subgroupSizeLog2)
-	{
-		auto shader = m_device->createShader(overridenUnspecialized);
-		IGPUComputePipeline::SCreationParams params = {};
-		params.layout = pipelineLayout.get();
-		params.shader = {
-			.entryPoint = "main",
-			.shader = shader.get(),
-			.entries = nullptr,
-			.requiredSubgroupSize = static_cast<IGPUShader::SSpecInfo::SUBGROUP_SIZE>(subgroupSizeLog2),
-			.requireFullSubgroups = true
-		};
-		core::smart_refctd_ptr<IGPUComputePipeline> pipeline;
-		if (!m_device->createComputePipelines(m_spirv_isa_cache.get(),{&params,1},&pipeline))
-			return nullptr;
-		return pipeline;
-	}
-
-	/*template<template<class> class Arithmetic, bool WorkgroupTest>
-	bool runTest(const smart_refctd_ptr<const ICPUShader>& source, const uint32_t elementCount, const uint32_t workgroupSize, uint32_t itemsPerWG = ~0u)
-	{
-		return true;
-	}*/
-
-	template<template<class> class Arithmetic, bool WorkgroupTest>
-	bool runTest(const smart_refctd_ptr<const ICPUShader>& source, const uint32_t elementCount, const uint8_t subgroupSizeLog2, const uint32_t workgroupSize, uint32_t itemsPerWG = ~0u)
-	{
-		std::string arith_name = Arithmetic<bit_xor<float>>::name;
-
-		smart_refctd_ptr<ICPUShader> overridenUnspecialized;
-		if constexpr (WorkgroupTest)
-		{
-			overridenUnspecialized = CHLSLCompiler::createOverridenCopy(
-				source.get(), "#define OPERATION %s\n#define WORKGROUP_SIZE %d\n#define ITEMS_PER_WG %d\n",
-				(("workgroup::") + arith_name).c_str(), workgroupSize, itemsPerWG
-			);
-		}
-		else
-		{
-			itemsPerWG = workgroupSize;
-			overridenUnspecialized = CHLSLCompiler::createOverridenCopy(
-				source.get(), "#define OPERATION %s\n#define WORKGROUP_SIZE %d\n",
-				(("subgroup::") + arith_name).c_str(), workgroupSize
-			);
-		}
-		auto pipeline = createPipeline(overridenUnspecialized.get(),subgroupSizeLog2);
-
-		// TODO: overlap dispatches with memory readbacks (requires multiple copies of `buffers`)
-		const uint32_t workgroupCount = elementCount / itemsPerWG;
-		cmdbuf->begin(IGPUCommandBuffer::USAGE::NONE);
-		cmdbuf->bindComputePipeline(pipeline.get());
-		cmdbuf->bindDescriptorSets(EPBP_COMPUTE, pipeline->getLayout(), 0u, 1u, &descriptorSet.get());
-		cmdbuf->dispatch(workgroupCount, 1, 1);
-		{
-			IGPUCommandBuffer::SPipelineBarrierDependencyInfo::buffer_barrier_t memoryBarrier[OutputBufferCount];
-			for (auto i=0u; i<OutputBufferCount; i++)
-			{
-				memoryBarrier[i] = {
-					.barrier = {
-						.dep = {
-							.srcStageMask = PIPELINE_STAGE_FLAGS::COMPUTE_SHADER_BIT,
-							.srcAccessMask = ACCESS_FLAGS::SHADER_WRITE_BITS,
-							// in theory we don't need the HOST BITS cause we block on a semaphore but might as well add them
-							.dstStageMask = PIPELINE_STAGE_FLAGS::COMPUTE_SHADER_BIT|PIPELINE_STAGE_FLAGS::HOST_BIT,
-							.dstAccessMask = ACCESS_FLAGS::SHADER_WRITE_BITS|ACCESS_FLAGS::HOST_READ_BIT
-						}
-					},
-					.range = {0ull,outputBuffers[i]->getSize(),outputBuffers[i]}
-				};
-			}
-			IGPUCommandBuffer::SPipelineBarrierDependencyInfo info = {.memBarriers={},.bufBarriers=memoryBarrier};
-			cmdbuf->pipelineBarrier(asset::E_DEPENDENCY_FLAGS::EDF_NONE,info);
-		}
-		cmdbuf->end();
-
-		const IQueue::SSubmitInfo::SSemaphoreInfo signal[1] = {{.semaphore=sema.get(),.value=++timelineValue}};
-		const IQueue::SSubmitInfo::SCommandBufferInfo cmdbufs[1] = {{.cmdbuf=cmdbuf.get()}};
-		const IQueue::SSubmitInfo submits[1] = {{.commandBuffers=cmdbufs,.signalSemaphores=signal}};
-		computeQueue->submit(submits);
-		const ISemaphore::SWaitInfo wait[1] = {{.semaphore=sema.get(),.value=timelineValue}};
-		m_device->blockForSemaphores(wait);
-
-		// check results
-		bool passed = validateResults<Arithmetic, bit_and<uint32_t>, WorkgroupTest>(itemsPerWG, workgroupCount);
-		passed = validateResults<Arithmetic, bit_xor<uint32_t>, WorkgroupTest>(itemsPerWG, workgroupCount) && passed;
-		passed = validateResults<Arithmetic, bit_or<uint32_t>, WorkgroupTest>(itemsPerWG, workgroupCount) && passed;
-		passed = validateResults<Arithmetic, plus<uint32_t>, WorkgroupTest>(itemsPerWG, workgroupCount) && passed;
-		passed = validateResults<Arithmetic, multiplies<uint32_t>, WorkgroupTest>(itemsPerWG, workgroupCount) && passed;
-		passed = validateResults<Arithmetic, minimum<uint32_t>, WorkgroupTest>(itemsPerWG, workgroupCount) && passed;
-		passed = validateResults<Arithmetic, maximum<uint32_t>, WorkgroupTest>(itemsPerWG, workgroupCount) && passed;
-		if constexpr (WorkgroupTest)
-			passed = validateResults<Arithmetic, ballot<uint32_t>, WorkgroupTest>(itemsPerWG, workgroupCount) && passed;
-
-		return passed;
-	}
-
-	//returns true if result matches
-	template<template<class> class Arithmetic, class Binop, bool WorkgroupTest>
-	bool validateResults(const uint32_t itemsPerWG, const uint32_t workgroupCount)
-	{
-		bool success = true;
-
-		// download data
-		const SBufferRange<IGPUBuffer> bufferRange = {0u, resultsBuffer->getSize(), outputBuffers[Binop::BindingIndex]};
-		m_utils->downloadBufferRangeViaStagingBufferAutoSubmit(SIntendedSubmitInfo{.queue=transferDownQueue},bufferRange,resultsBuffer->getPointer());
-
-		using type_t = typename Binop::type_t;
-		const auto dataFromBuffer = reinterpret_cast<const uint32_t*>(resultsBuffer->getPointer());
-		const auto subgroupSize = dataFromBuffer[0];
-		if (subgroupSize<nbl::hlsl::subgroup::MinSubgroupSize || subgroupSize>nbl::hlsl::subgroup::MaxSubgroupSize)
-		{
-			m_logger->log("Unexpected Subgroup Size %u", ILogger::ELL_ERROR, subgroupSize);
-			return false;
-		}
-
-		const auto testData = reinterpret_cast<const type_t*>(dataFromBuffer + 1);
-		// TODO: parallel for (the temporary values need to be threadlocal or what?)
-		// now check if the data obtained has valid values
-		type_t* tmp = new type_t[itemsPerWG];
-		type_t* ballotInput = new type_t[itemsPerWG];
-		for (uint32_t workgroupID = 0u; success && workgroupID < workgroupCount; workgroupID++)
-		{
-			const auto workgroupOffset = workgroupID * itemsPerWG;
-
-			if constexpr (WorkgroupTest)
-			{
-				if constexpr (std::is_same_v<ballot<type_t>, Binop>)
-				{
-					for (auto i = 0u; i < itemsPerWG; i++)
-						ballotInput[i] = inputData[i + workgroupOffset] & 0x1u;
-					Arithmetic<Binop>::impl(tmp, ballotInput, itemsPerWG);
-				}
-				else
-					Arithmetic<Binop>::impl(tmp, inputData + workgroupOffset, itemsPerWG);
-			}
-			else
-			{
-				for (uint32_t pseudoSubgroupID = 0u; pseudoSubgroupID < itemsPerWG; pseudoSubgroupID += subgroupSize)
-					Arithmetic<Binop>::impl(tmp + pseudoSubgroupID, inputData + workgroupOffset + pseudoSubgroupID, subgroupSize);
-			}
-
-			for (uint32_t localInvocationIndex = 0u; localInvocationIndex < itemsPerWG; localInvocationIndex++)
-			{
-				const auto globalInvocationIndex = workgroupOffset + localInvocationIndex;
-				const auto cpuVal = tmp[localInvocationIndex];
-				const auto gpuVal = testData[globalInvocationIndex];
-				if (cpuVal != gpuVal)
-				{
-					m_logger->log(
-						"Failed test #%d  (%s)  (%s) Expected %u got %u for workgroup %d and localinvoc %d",
-						ILogger::ELL_ERROR, itemsPerWG, WorkgroupTest ? "workgroup" : "subgroup", Binop::name,
-						cpuVal, gpuVal, workgroupID, localInvocationIndex
-					);
-					success = false;
-					break;
-				}
-			}
-		}
-		delete[] ballotInput;
-		delete[] tmp;
-
-		return success;
-	}
-
-	IQueue* transferDownQueue;
-	IQueue* computeQueue;
-	smart_refctd_ptr<IGPUPipelineCache> m_spirv_isa_cache;
-	smart_refctd_ptr<IFile> m_spirv_isa_cache_output;
-
-	uint32_t* inputData = nullptr;
-	constexpr static inline uint32_t OutputBufferCount = 8u;
-	smart_refctd_ptr<IGPUBuffer> outputBuffers[OutputBufferCount];
-	smart_refctd_ptr<IGPUDescriptorSet> descriptorSet;
-	smart_refctd_ptr<IGPUPipelineLayout> pipelineLayout;
-
-	smart_refctd_ptr<ISemaphore> sema;
-	uint64_t timelineValue = 0;
-	smart_refctd_ptr<IGPUCommandBuffer> cmdbuf;
-	smart_refctd_ptr<ICPUBuffer> resultsBuffer;
-
-	uint32_t totalFailCount = 0;
-};
-
-NBL_MAIN_FUNC(ArithmeticUnitTestApp)
\ No newline at end of file
diff --git a/73_Arithmetic2UnitTest/CMakeLists.txt b/73_Arithmetic2UnitTest/CMakeLists.txt
deleted file mode 100644
index 0724366c9..000000000
--- a/73_Arithmetic2UnitTest/CMakeLists.txt
+++ /dev/null
@@ -1,25 +0,0 @@
-
-include(common RESULT_VARIABLE RES)
-if(NOT RES)
-	message(FATAL_ERROR "common.cmake not found. Should be in {repo_root}/cmake directory")
-endif()
-
-nbl_create_executable_project("" "" "" "" "${NBL_EXECUTABLE_PROJECT_CREATION_PCH_TARGET}")
-
-if(NBL_EMBED_BUILTIN_RESOURCES)
-	set(_BR_TARGET_ ${EXECUTABLE_NAME}_builtinResourceData)
-	set(RESOURCE_DIR "app_resources")
-
-	get_filename_component(_SEARCH_DIRECTORIES_ "${CMAKE_CURRENT_SOURCE_DIR}" ABSOLUTE)
-	get_filename_component(_OUTPUT_DIRECTORY_SOURCE_ "${CMAKE_CURRENT_BINARY_DIR}/src" ABSOLUTE)
-	get_filename_component(_OUTPUT_DIRECTORY_HEADER_ "${CMAKE_CURRENT_BINARY_DIR}/include" ABSOLUTE)
-
-    file(GLOB_RECURSE BUILTIN_RESOURCE_FILES RELATIVE "${CMAKE_CURRENT_SOURCE_DIR}/${RESOURCE_DIR}" CONFIGURE_DEPENDS "${CMAKE_CURRENT_SOURCE_DIR}/${RESOURCE_DIR}/*")
-    foreach(RES_FILE ${BUILTIN_RESOURCE_FILES})
-      LIST_BUILTIN_RESOURCE(RESOURCES_TO_EMBED "${RES_FILE}")
-    endforeach()
-
-	ADD_CUSTOM_BUILTIN_RESOURCES(${_BR_TARGET_} RESOURCES_TO_EMBED "${_SEARCH_DIRECTORIES_}" "${RESOURCE_DIR}" "nbl::this_example::builtin" "${_OUTPUT_DIRECTORY_HEADER_}" "${_OUTPUT_DIRECTORY_SOURCE_}")
-
-	LINK_BUILTIN_RESOURCES_TO_TARGET(${EXECUTABLE_NAME} ${_BR_TARGET_})
-endif()
\ No newline at end of file
diff --git a/73_Arithmetic2UnitTest/app_resources/common.hlsl b/73_Arithmetic2UnitTest/app_resources/common.hlsl
deleted file mode 100644
index 10892a2b9..000000000
--- a/73_Arithmetic2UnitTest/app_resources/common.hlsl
+++ /dev/null
@@ -1,96 +0,0 @@
-#include "nbl/builtin/hlsl/cpp_compat.hlsl"
-#include "nbl/builtin/hlsl/functional.hlsl"
-
-template<uint32_t kScanElementCount=1024*1024>
-struct Output
-{
-	NBL_CONSTEXPR_STATIC_INLINE uint32_t ScanElementCount = kScanElementCount;
-
-	uint32_t subgroupSize;
-	uint32_t data[ScanElementCount];
-};
-
-// Thanks to our unified HLSL/C++ STD lib we're able to remove a whole load of code
-template<typename T>
-struct bit_and : nbl::hlsl::bit_and<T>
-{
-	using base_t = nbl::hlsl::bit_and<T>;
-
-	NBL_CONSTEXPR_STATIC_INLINE uint16_t BindingIndex = 0;
-#ifndef __HLSL_VERSION
-	static inline constexpr const char* name = "bit_and";
-#endif
-};
-template<typename T>
-struct bit_or : nbl::hlsl::bit_or<T>
-{
-	using base_t = nbl::hlsl::bit_or<T>;
-
-	NBL_CONSTEXPR_STATIC_INLINE uint16_t BindingIndex = 1;
-#ifndef __HLSL_VERSION
-	static inline constexpr const char* name = "bit_xor";
-#endif
-};
-template<typename T>
-struct bit_xor : nbl::hlsl::bit_xor<T>
-{
-	using base_t = nbl::hlsl::bit_xor<T>;
-
-	NBL_CONSTEXPR_STATIC_INLINE uint16_t BindingIndex = 2;
-#ifndef __HLSL_VERSION
-	static inline constexpr const char* name = "bit_or";
-#endif
-};
-template<typename T>
-struct plus : nbl::hlsl::plus<T>
-{
-	using base_t = nbl::hlsl::plus<T>;
-
-	NBL_CONSTEXPR_STATIC_INLINE uint16_t BindingIndex = 3;
-#ifndef __HLSL_VERSION
-	static inline constexpr const char* name = "plus";
-#endif
-};
-template<typename T>
-struct multiplies : nbl::hlsl::multiplies<T>
-{
-	using base_t = nbl::hlsl::multiplies<T>;
-
-	NBL_CONSTEXPR_STATIC_INLINE uint16_t BindingIndex = 4;
-#ifndef __HLSL_VERSION
-	static inline constexpr const char* name = "multiplies";
-#endif
-};
-template<typename T>
-struct minimum : nbl::hlsl::minimum<T>
-{
-	using base_t = nbl::hlsl::minimum<T>;
-
-	NBL_CONSTEXPR_STATIC_INLINE uint16_t BindingIndex = 5;
-#ifndef __HLSL_VERSION
-	static inline constexpr const char* name = "minimum";
-#endif
-};
-template<typename T>
-struct maximum : nbl::hlsl::maximum<T>
-{
-	using base_t = nbl::hlsl::maximum<T>;
-
-	NBL_CONSTEXPR_STATIC_INLINE uint16_t BindingIndex = 6;
-#ifndef __HLSL_VERSION
-	static inline constexpr const char* name = "maximum";
-#endif
-};
-
-template<typename T>
-struct ballot : nbl::hlsl::plus<T>
-{
-	using base_t = nbl::hlsl::plus<T>;
-
-	NBL_CONSTEXPR_STATIC_INLINE uint16_t BindingIndex = 7;
-#ifndef __HLSL_VERSION
-	static inline constexpr const char* name = "bitcount";
-#endif
-};
-
-#include "nbl/builtin/hlsl/subgroup/basic.hlsl"
\ No newline at end of file
diff --git a/73_Arithmetic2UnitTest/config.json.template b/73_Arithmetic2UnitTest/config.json.template
deleted file mode 100644
index f961745c1..000000000
--- a/73_Arithmetic2UnitTest/config.json.template
+++ /dev/null
@@ -1,28 +0,0 @@
-{
-  "enableParallelBuild": true,
-  "threadsPerBuildProcess" : 2,
-  "isExecuted": false,
-  "scriptPath": "",
-  "cmake": {
-    "configurations": [ "Release", "Debug", "RelWithDebInfo" ],
-    "buildModes": [],
-    "requiredOptions": []
-  }, 
-  "profiles": [
-    {
-      "backend": "vulkan",
-      "platform": "windows",
-      "buildModes": [],
-      "runConfiguration": "Release",
-      "gpuArchitectures": []
-    }
-  ],
-  "dependencies": [],
-  "data": [
-    {
-      "dependencies": [],
-      "command": [""],
-      "outputs": []
-    }
-  ]
-}
\ No newline at end of file
diff --git a/73_Arithmetic2UnitTest/pipeline.groovy b/73_Arithmetic2UnitTest/pipeline.groovy
deleted file mode 100644
index 7ea9947e0..000000000
--- a/73_Arithmetic2UnitTest/pipeline.groovy
+++ /dev/null
@@ -1,50 +0,0 @@
-import org.DevshGraphicsProgramming.Agent
-import org.DevshGraphicsProgramming.BuilderInfo
-import org.DevshGraphicsProgramming.IBuilder
-
-class CArithemticUnitTestBuilder extends IBuilder
-{
-	public CArithemticUnitTestBuilder(Agent _agent, _info)
-	{
-		super(_agent, _info)
-	}
-	
-	@Override
-	public boolean prepare(Map axisMapping)
-	{
-		return true
-	}
-	
-	@Override
-  	public boolean build(Map axisMapping)
-	{
-		IBuilder.CONFIGURATION config = axisMapping.get("CONFIGURATION")
-		IBuilder.BUILD_TYPE buildType = axisMapping.get("BUILD_TYPE")
-		
-		def nameOfBuildDirectory = getNameOfBuildDirectory(buildType)
-		def nameOfConfig = getNameOfConfig(config)
-		
-		agent.execute("cmake --build ${info.rootProjectPath}/${nameOfBuildDirectory}/${info.targetProjectPathRelativeToRoot} --target ${info.targetBaseName} --config ${nameOfConfig} -j12 -v")
-		
-		return true
-	}
-	
-	@Override
-  	public boolean test(Map axisMapping)
-	{
-		return true
-	}
-	
-	@Override
-	public boolean install(Map axisMapping)
-	{
-		return true
-	}
-}
-
-def create(Agent _agent, _info)
-{
-	return new CArithemticUnitTestBuilder(_agent, _info)
-}
-
-return this
\ No newline at end of file

From 030d6227ff20939ea838f51ce82969f96cbd12ca Mon Sep 17 00:00:00 2001
From: keptsecret <sorchon@gmail.com>
Date: Wed, 7 May 2025 16:54:59 +0700
Subject: [PATCH 233/529] minor fixes

---
 23_Arithmetic2UnitTest/app_resources/testWorkgroup.comp.hlsl  | 4 ++--
 .../app_resources/benchmarkWorkgroup.comp.hlsl                | 4 ++--
 CMakeLists.txt                                                | 3 +--
 3 files changed, 5 insertions(+), 6 deletions(-)

diff --git a/23_Arithmetic2UnitTest/app_resources/testWorkgroup.comp.hlsl b/23_Arithmetic2UnitTest/app_resources/testWorkgroup.comp.hlsl
index f9453a165..7f1b5dcbe 100644
--- a/23_Arithmetic2UnitTest/app_resources/testWorkgroup.comp.hlsl
+++ b/23_Arithmetic2UnitTest/app_resources/testWorkgroup.comp.hlsl
@@ -8,9 +8,9 @@ struct DataProxy
     using dtype_t = vector<uint32_t, Config::ItemsPerInvocation_0>;
     static_assert(nbl::hlsl::is_same_v<dtype_t, type_t>);
 
-    dtype_t get(const uint32_t ix)
+    void get(const uint32_t ix, NBL_REF_ARG(dtype_t) value)
     {
-        return inputValue[ix];
+        value = inputValue[ix];
     }
     void set(const uint32_t ix, const dtype_t value)
     {
diff --git a/74_Arithmetic2Bench/app_resources/benchmarkWorkgroup.comp.hlsl b/74_Arithmetic2Bench/app_resources/benchmarkWorkgroup.comp.hlsl
index ed56dd766..aa0717112 100644
--- a/74_Arithmetic2Bench/app_resources/benchmarkWorkgroup.comp.hlsl
+++ b/74_Arithmetic2Bench/app_resources/benchmarkWorkgroup.comp.hlsl
@@ -11,9 +11,9 @@ struct DataProxy
     using dtype_t = vector<uint32_t, Config::ItemsPerInvocation_0>;
     static_assert(nbl::hlsl::is_same_v<dtype_t, type_t>);
 
-    dtype_t get(const uint32_t ix)
+    void get(const uint32_t ix, NBL_REF_ARG(dtype_t) value)
     {
-        return inputValue[ix];
+        value = inputValue[ix];
     }
     void set(const uint32_t ix, const dtype_t value)
     {
diff --git a/CMakeLists.txt b/CMakeLists.txt
index dc6b74de1..ed3992203 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -58,7 +58,7 @@ if(NBL_BUILD_EXAMPLES)
 	add_subdirectory(20_AllocatorTest EXCLUDE_FROM_ALL)
 	add_subdirectory(21_LRUCacheUnitTest EXCLUDE_FROM_ALL)
 	add_subdirectory(22_CppCompat EXCLUDE_FROM_ALL)
-	add_subdirectory(23_ArithmeticUnitTest EXCLUDE_FROM_ALL)
+	add_subdirectory(23_Arithmetic2UnitTest EXCLUDE_FROM_ALL)
 	add_subdirectory(24_ColorSpaceTest EXCLUDE_FROM_ALL)
 	add_subdirectory(25_FilterTest EXCLUDE_FROM_ALL)
 	add_subdirectory(26_Blur EXCLUDE_FROM_ALL)
@@ -91,7 +91,6 @@ if(NBL_BUILD_EXAMPLES)
   	add_subdirectory(70_FLIPFluids EXCLUDE_FROM_ALL)
 	add_subdirectory(71_RayTracingPipeline EXCLUDE_FROM_ALL)
 
-	add_subdirectory(73_Arithmetic2UnitTest EXCLUDE_FROM_ALL)
 	add_subdirectory(74_Arithmetic2Bench EXCLUDE_FROM_ALL)
 
 	NBL_HOOK_COMMON_API("${NBL_COMMON_API_TARGETS}")

From ca71a39db753938a7ae90a8445cb4186efe7fa56 Mon Sep 17 00:00:00 2001
From: keptsecret <sorchon@gmail.com>
Date: Thu, 8 May 2025 14:14:59 +0700
Subject: [PATCH 234/529] minor fixes to workgroup benchmark

---
 .../app_resources/benchmarkWorkgroup.comp.hlsl              | 6 ++++--
 74_Arithmetic2Bench/main.cpp                                | 4 ++--
 2 files changed, 6 insertions(+), 4 deletions(-)

diff --git a/74_Arithmetic2Bench/app_resources/benchmarkWorkgroup.comp.hlsl b/74_Arithmetic2Bench/app_resources/benchmarkWorkgroup.comp.hlsl
index aa0717112..ec3f9b295 100644
--- a/74_Arithmetic2Bench/app_resources/benchmarkWorkgroup.comp.hlsl
+++ b/74_Arithmetic2Bench/app_resources/benchmarkWorkgroup.comp.hlsl
@@ -11,13 +11,15 @@ struct DataProxy
     using dtype_t = vector<uint32_t, Config::ItemsPerInvocation_0>;
     static_assert(nbl::hlsl::is_same_v<dtype_t, type_t>);
 
+    // we don't want to write/read storage multiple times in loop; doesn't seem optimized out in generated spirv
     void get(const uint32_t ix, NBL_REF_ARG(dtype_t) value)
     {
-        value = inputValue[ix];
+        // value = inputValue[ix];
+        value = globalIndex();
     }
     void set(const uint32_t ix, const dtype_t value)
     {
-        output[Binop::BindingIndex].template Store<type_t>(sizeof(uint32_t) + sizeof(type_t) * ix, value);
+        // output[Binop::BindingIndex].template Store<type_t>(sizeof(uint32_t) + sizeof(type_t) * ix, value);
     }
 
     void workgroupExecutionAndMemoryBarrier()
diff --git a/74_Arithmetic2Bench/main.cpp b/74_Arithmetic2Bench/main.cpp
index abbae38fb..b6bffb2b4 100644
--- a/74_Arithmetic2Bench/main.cpp
+++ b/74_Arithmetic2Bench/main.cpp
@@ -395,7 +395,7 @@ class ArithmeticBenchApp final : public examples::SimpleWindowedApplication, pub
 		cmdbuf->bindDescriptorSets(EPBP_COMPUTE, benchSets[0].pipeline->getLayout(), 0u, 1u, &benchDs.get());
 
 		for (uint32_t i = 0; i < benchSets.size(); i++)
-			runBenchmark<DoWorkgroupBenchmarks>(cmdbuf, benchSets[0], elementCount, SubgroupSizeLog2);
+			runBenchmark<DoWorkgroupBenchmarks>(cmdbuf, benchSets[i], elementCount, SubgroupSizeLog2);
 
 
 		// blit
@@ -741,7 +741,7 @@ class ArithmeticBenchApp final : public examples::SimpleWindowedApplication, pub
 	constexpr static inline bool DoWorkgroupBenchmarks = true;
 	uint32_t ItemsPerInvocation = 4u;
 	constexpr static inline uint32_t NumLoops = 1000u;
-	constexpr static inline std::array<uint32_t, 3> workgroupSizes = { 256, 512, 1024 };
+	constexpr static inline std::array<uint32_t, 3> workgroupSizes = { 128, 512, 1024 };
 	template<class BinOp>
 	using ArithmeticOp = emulatedReduction<BinOp>;	// change this to test other arithmetic ops
 

From 6018e9a7e0bd5cb4eeaa47571610ff7dbb0ce054 Mon Sep 17 00:00:00 2001
From: keptsecret <sorchon@gmail.com>
Date: Thu, 8 May 2025 17:03:09 +0700
Subject: [PATCH 235/529] more minor fixes

---
 .../benchmarkWorkgroup.comp.hlsl              |  2 +-
 74_Arithmetic2Bench/main.cpp                  | 37 +++----------------
 2 files changed, 7 insertions(+), 32 deletions(-)

diff --git a/74_Arithmetic2Bench/app_resources/benchmarkWorkgroup.comp.hlsl b/74_Arithmetic2Bench/app_resources/benchmarkWorkgroup.comp.hlsl
index ec3f9b295..ac6ea7fd8 100644
--- a/74_Arithmetic2Bench/app_resources/benchmarkWorkgroup.comp.hlsl
+++ b/74_Arithmetic2Bench/app_resources/benchmarkWorkgroup.comp.hlsl
@@ -15,7 +15,7 @@ struct DataProxy
     void get(const uint32_t ix, NBL_REF_ARG(dtype_t) value)
     {
         // value = inputValue[ix];
-        value = globalIndex();
+        value = nbl::hlsl::promote<dtype_t>(globalIndex());
     }
     void set(const uint32_t ix, const dtype_t value)
     {
diff --git a/74_Arithmetic2Bench/main.cpp b/74_Arithmetic2Bench/main.cpp
index b6bffb2b4..1d8e41a24 100644
--- a/74_Arithmetic2Bench/main.cpp
+++ b/74_Arithmetic2Bench/main.cpp
@@ -192,29 +192,6 @@ class ArithmeticBenchApp final : public examples::SimpleWindowedApplication, pub
 			}
 
 			// set and transient pool
-			auto descPool = m_device->createDescriptorPoolForDSLayouts(IDescriptorPool::ECF_NONE,{&dsLayout.get(),1});
-			testDs = descPool->createDescriptorSet(smart_refctd_ptr(dsLayout));
-			{
-				IGPUDescriptorSet::SDescriptorInfo infos[1+OutputBufferCount];
-				infos[0].desc = gpuinputDataBuffer;
-				infos[0].info.buffer = { 0u,gpuinputDataBuffer->getSize() };
-				for (uint32_t i = 1u; i <= OutputBufferCount; i++)
-				{
-					auto buff = outputBuffers[i - 1];
-					infos[i].info.buffer = { 0u,buff->getSize() };
-					infos[i].desc = std::move(buff); // save an atomic in the refcount
-				}
-
-				IGPUDescriptorSet::SWriteDescriptorSet writes[2];
-				for (uint32_t i=0u; i<2; i++)
-					writes[i] = {testDs.get(),i,0u,1u,infos+i};
-				writes[1].count = OutputBufferCount;
-
-				m_device->updateDescriptorSets(2, writes, 0u, nullptr);
-			}
-			testPplnLayout = m_device->createPipelineLayout({}, std::move(dsLayout));
-
-
 			smart_refctd_ptr<IGPUDescriptorSetLayout> benchLayout;
 			{
 				IGPUDescriptorSetLayout::SBinding binding[3];
@@ -727,24 +704,22 @@ class ArithmeticBenchApp final : public examples::SimpleWindowedApplication, pub
 
 	smart_refctd_ptr<IGPUImage> dummyImg;
 
-	std::array<BenchmarkSet, 3> benchSets;
-	smart_refctd_ptr<IDescriptorPool> benchPool;
-	smart_refctd_ptr<IGPUDescriptorSet> benchDs;
-
-	smart_refctd_ptr<IGPUDescriptorSet> testDs;
-	smart_refctd_ptr<IGPUPipelineLayout> testPplnLayout;
-
 	constexpr static inline uint32_t MaxNumSubmits = 30;
 	uint32_t numSubmits = 0;
 
 	/* PARAMETERS TO CHANGE FOR DIFFERENT BENCHMARKS */
+
 	constexpr static inline bool DoWorkgroupBenchmarks = true;
 	uint32_t ItemsPerInvocation = 4u;
 	constexpr static inline uint32_t NumLoops = 1000u;
-	constexpr static inline std::array<uint32_t, 3> workgroupSizes = { 128, 512, 1024 };
+	constexpr static inline uint32_t NumBenchmarks = 6u;
+	constexpr static inline std::array<uint32_t, NumBenchmarks> workgroupSizes = { 32, 64, 128, 256, 512, 1024 };
 	template<class BinOp>
 	using ArithmeticOp = emulatedReduction<BinOp>;	// change this to test other arithmetic ops
 
+	std::array<BenchmarkSet, NumBenchmarks> benchSets;
+	smart_refctd_ptr<IDescriptorPool> benchPool;
+	smart_refctd_ptr<IGPUDescriptorSet> benchDs;
 
 	uint32_t* inputData = nullptr;
 	constexpr static inline uint32_t OutputBufferCount = 8u;

From 3a9758c176c55652831ced820904282a76be03db Mon Sep 17 00:00:00 2001
From: keptsecret <sorchon@gmail.com>
Date: Fri, 9 May 2025 14:03:07 +0700
Subject: [PATCH 236/529] some fixes to using config vars

---
 23_Arithmetic2UnitTest/app_resources/workgroupCommon.hlsl | 2 +-
 23_Arithmetic2UnitTest/main.cpp                           | 2 +-
 74_Arithmetic2Bench/app_resources/workgroupCommon.hlsl    | 2 +-
 74_Arithmetic2Bench/main.cpp                              | 2 +-
 4 files changed, 4 insertions(+), 4 deletions(-)

diff --git a/23_Arithmetic2UnitTest/app_resources/workgroupCommon.hlsl b/23_Arithmetic2UnitTest/app_resources/workgroupCommon.hlsl
index 7e8512e72..702fcbe25 100644
--- a/23_Arithmetic2UnitTest/app_resources/workgroupCommon.hlsl
+++ b/23_Arithmetic2UnitTest/app_resources/workgroupCommon.hlsl
@@ -43,7 +43,7 @@ bool canStore();
 #endif
 
 // final (level 1/2) scan needs to fit in one subgroup exactly
-groupshared uint32_t scratch[config_t::SubgroupsPerVirtualWorkgroupLog2*config_t::ItemsPerInvocation_1];
+groupshared uint32_t scratch[config_t::SubgroupsPerVirtualWorkgroup*config_t::ItemsPerInvocation_1];
 
 struct ScratchProxy
 {
diff --git a/23_Arithmetic2UnitTest/main.cpp b/23_Arithmetic2UnitTest/main.cpp
index 31eb4ab8f..d5a251f39 100644
--- a/23_Arithmetic2UnitTest/main.cpp
+++ b/23_Arithmetic2UnitTest/main.cpp
@@ -195,7 +195,7 @@ class Workgroup2ScanTestApp final : public application_templates::BasicMultiQueu
 					passed = runTest<emulatedScanExclusive, false>(subgroupTestSource, elementCount, subgroupSizeLog2, workgroupSize, ~0u, itemsPerInvocation) && passed;
 					logTestOutcome(passed, workgroupSize);
 
-					const uint32_t itemsPerWG = workgroupSize <= 4 * subgroupSize ? workgroupSize : itemsPerInvocation * max(workgroupSize >> subgroupSizeLog2, subgroupSize) << subgroupSizeLog2;	// TODO use Config somehow
+					const uint32_t itemsPerWG = workgroupSize <= subgroupSize ? workgroupSize * itemsPerInvocation : itemsPerInvocation * max(workgroupSize >> subgroupSizeLog2, subgroupSize) << subgroupSizeLog2;	// TODO use Config somehow
 					m_logger->log("Testing Item Count %u", ILogger::ELL_INFO, itemsPerWG);
 					passed = runTest<emulatedReduction, true>(workgroupTestSource, elementCount, subgroupSizeLog2, workgroupSize, itemsPerWG, itemsPerInvocation) && passed;
 					logTestOutcome(passed, itemsPerWG);
diff --git a/74_Arithmetic2Bench/app_resources/workgroupCommon.hlsl b/74_Arithmetic2Bench/app_resources/workgroupCommon.hlsl
index 7e8512e72..702fcbe25 100644
--- a/74_Arithmetic2Bench/app_resources/workgroupCommon.hlsl
+++ b/74_Arithmetic2Bench/app_resources/workgroupCommon.hlsl
@@ -43,7 +43,7 @@ bool canStore();
 #endif
 
 // final (level 1/2) scan needs to fit in one subgroup exactly
-groupshared uint32_t scratch[config_t::SubgroupsPerVirtualWorkgroupLog2*config_t::ItemsPerInvocation_1];
+groupshared uint32_t scratch[config_t::SubgroupsPerVirtualWorkgroup*config_t::ItemsPerInvocation_1];
 
 struct ScratchProxy
 {
diff --git a/74_Arithmetic2Bench/main.cpp b/74_Arithmetic2Bench/main.cpp
index 1d8e41a24..bf20d5faa 100644
--- a/74_Arithmetic2Bench/main.cpp
+++ b/74_Arithmetic2Bench/main.cpp
@@ -594,7 +594,7 @@ class ArithmeticBenchApp final : public examples::SimpleWindowedApplication, pub
 		options.preprocessorOptions.includeFinder = includeFinder;
 
 		const uint32_t subgroupSize = 0x1u << subgroupSizeLog2;
-		const uint32_t itemsPerWG = workgroupSize <= 4 * subgroupSize ? workgroupSize : itemsPerInvoc * max(workgroupSize >> subgroupSizeLog2, subgroupSize) << subgroupSizeLog2;	// TODO use Config somehow
+		const uint32_t itemsPerWG = workgroupSize <= subgroupSize ? workgroupSize * itemsPerInvoc : itemsPerInvoc * max(workgroupSize >> subgroupSizeLog2, subgroupSize) << subgroupSizeLog2;	// TODO use Config somehow
 		smart_refctd_ptr<ICPUShader> overriddenUnspecialized;
 		if constexpr (WorkgroupBench)
 		{

From e496e987296338a3ba7492b18f0249c7cca56d68 Mon Sep 17 00:00:00 2001
From: keptsecret <sorchon@gmail.com>
Date: Mon, 12 May 2025 11:00:12 +0700
Subject: [PATCH 237/529] fixes to test mem errors

---
 23_Arithmetic2UnitTest/main.cpp | 15 ++++++++++-----
 1 file changed, 10 insertions(+), 5 deletions(-)

diff --git a/23_Arithmetic2UnitTest/main.cpp b/23_Arithmetic2UnitTest/main.cpp
index d5a251f39..49cba28d1 100644
--- a/23_Arithmetic2UnitTest/main.cpp
+++ b/23_Arithmetic2UnitTest/main.cpp
@@ -169,8 +169,6 @@ class Workgroup2ScanTestApp final : public application_templates::BasicMultiQueu
 		}
 
 		const auto MaxWorkgroupSize = m_physicalDevice->getLimits().maxComputeWorkGroupInvocations;
-		const std::array<uint32_t, 4> WorkgroupSizes = { 128, 256, 512, 1024 };
-		const std::array<uint32_t, 3> ItemsPerInvocations = { 1, 2, 4 };
 		const auto MinSubgroupSize = m_physicalDevice->getLimits().minSubgroupSize;
 		const auto MaxSubgroupSize = m_physicalDevice->getLimits().maxSubgroupSize;
 		for (auto subgroupSize=MinSubgroupSize; subgroupSize <= MaxSubgroupSize; subgroupSize *= 2u)
@@ -401,13 +399,16 @@ class Workgroup2ScanTestApp final : public application_templates::BasicMultiQueu
 		const auto testData = reinterpret_cast<const type_t*>(dataFromBuffer + 1);
 		// TODO: parallel for (the temporary values need to be threadlocal or what?)
 		// now check if the data obtained has valid values
-		type_t* tmp = new type_t[itemsPerWG];
+		type_t* tmp;
+		if constexpr (WorkgroupTest)
+			tmp = new type_t[itemsPerWG];
+		else
+			tmp = new type_t[itemsPerWG * itemsPerInvoc];
 		for (uint32_t workgroupID = 0u; success && workgroupID < workgroupCount; workgroupID++)
 		{
-			const auto workgroupOffset = workgroupID * itemsPerWG;
-
 			if constexpr (WorkgroupTest)
 			{
+				const auto workgroupOffset = workgroupID * itemsPerWG;
 				Arithmetic<Binop>::impl(tmp, inputData + workgroupOffset, itemsPerWG);
 
 				for (uint32_t localInvocationIndex = 0u; localInvocationIndex < itemsPerWG; localInvocationIndex++)
@@ -429,6 +430,7 @@ class Workgroup2ScanTestApp final : public application_templates::BasicMultiQueu
 			}
 			else
 			{
+				const auto workgroupOffset = workgroupID * itemsPerWG * itemsPerInvoc;
 				for (uint32_t pseudoSubgroupID = 0u; pseudoSubgroupID < itemsPerWG; pseudoSubgroupID += subgroupSize)
 					Arithmetic<Binop>::impl(tmp + pseudoSubgroupID * itemsPerInvoc, inputData + workgroupOffset + pseudoSubgroupID * itemsPerInvoc, subgroupSize * itemsPerInvoc);
 
@@ -475,6 +477,9 @@ class Workgroup2ScanTestApp final : public application_templates::BasicMultiQueu
 	smart_refctd_ptr<ICPUBuffer> resultsBuffer;
 
 	uint32_t totalFailCount = 0;
+
+	constexpr static inline std::array<uint32_t, 4> WorkgroupSizes = { 32, 256, 512, 1024 };
+	constexpr static inline std::array<uint32_t, 3> ItemsPerInvocations = { 1, 2, 4 };
 };
 
 NBL_MAIN_FUNC(Workgroup2ScanTestApp)
\ No newline at end of file

From 8b5659980c4b00ca6c05356a43ec004743f0b0ab Mon Sep 17 00:00:00 2001
From: Erfan Ahmadi <ahmadierfan99@gmail.com>
Date: Mon, 12 May 2025 08:37:40 +0400
Subject: [PATCH 238/529] Small Fix to avoid msdf evictions of glyphs that were
 re-used

---
 62_CAD/DrawResourcesFiller.h | 7 +++++++
 1 file changed, 7 insertions(+)

diff --git a/62_CAD/DrawResourcesFiller.h b/62_CAD/DrawResourcesFiller.h
index a10379e1a..9be4a5b57 100644
--- a/62_CAD/DrawResourcesFiller.h
+++ b/62_CAD/DrawResourcesFiller.h
@@ -548,6 +548,13 @@ struct DrawResourcesFiller
 		{
 			textureIdx = tRef->alloc_idx;
 			tRef->lastUsedSemaphoreValue = intendedNextSubmit.getFutureScratchSemaphore().value; // update this because the texture will get used on the next submit
+			if (textureIdx >= 0u && textureIdx <= msdfStagedCPUImages.size())
+				msdfStagedCPUImages[textureIdx].usedThisFrame = true;
+			else
+			{
+				// shouldn't happen, TODO: Log
+				assert(false);
+			}
 		}
 		return textureIdx;
 	}

From 48b3125a32c722ef46a8fae79aa985670b07f12a Mon Sep 17 00:00:00 2001
From: Erfan Ahmadi <ahmadierfan99@gmail.com>
Date: Mon, 12 May 2025 08:38:14 +0400
Subject: [PATCH 239/529] small typo fix, oops

---
 62_CAD/DrawResourcesFiller.h | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/62_CAD/DrawResourcesFiller.h b/62_CAD/DrawResourcesFiller.h
index 9be4a5b57..aec6e3b8e 100644
--- a/62_CAD/DrawResourcesFiller.h
+++ b/62_CAD/DrawResourcesFiller.h
@@ -548,7 +548,7 @@ struct DrawResourcesFiller
 		{
 			textureIdx = tRef->alloc_idx;
 			tRef->lastUsedSemaphoreValue = intendedNextSubmit.getFutureScratchSemaphore().value; // update this because the texture will get used on the next submit
-			if (textureIdx >= 0u && textureIdx <= msdfStagedCPUImages.size())
+			if (textureIdx >= 0u && textureIdx < msdfStagedCPUImages.size())
 				msdfStagedCPUImages[textureIdx].usedThisFrame = true;
 			else
 			{

From 20011f5fdd3e8454bb830ded6f4221ec75036809 Mon Sep 17 00:00:00 2001
From: keptsecret <sorchon@gmail.com>
Date: Mon, 12 May 2025 16:17:01 +0700
Subject: [PATCH 240/529] config struct changes

---
 74_Arithmetic2Bench/app_resources/workgroupCommon.hlsl | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/74_Arithmetic2Bench/app_resources/workgroupCommon.hlsl b/74_Arithmetic2Bench/app_resources/workgroupCommon.hlsl
index 702fcbe25..026687cfa 100644
--- a/74_Arithmetic2Bench/app_resources/workgroupCommon.hlsl
+++ b/74_Arithmetic2Bench/app_resources/workgroupCommon.hlsl
@@ -43,7 +43,7 @@ bool canStore();
 #endif
 
 // final (level 1/2) scan needs to fit in one subgroup exactly
-groupshared uint32_t scratch[config_t::SubgroupsPerVirtualWorkgroup*config_t::ItemsPerInvocation_1];
+groupshared uint32_t scratch[config_t::SharedMemSize];
 
 struct ScratchProxy
 {

From daf68102835a819250e907079106d366fd4cd47f Mon Sep 17 00:00:00 2001
From: Erfan Ahmadi <ahmadierfan99@gmail.com>
Date: Tue, 13 May 2025 15:52:15 +0400
Subject: [PATCH 241/529] Support for StaticImages [VRAM Limitation Eviction
 Untested] + MSDF Image Eviction and Caching Fixes

---
 62_CAD/DrawResourcesFiller.cpp                | 483 +++++++++++++++---
 62_CAD/DrawResourcesFiller.h                  | 107 ++--
 62_CAD/main.cpp                               | 223 +++-----
 62_CAD/shaders/globals.hlsl                   |   3 +-
 62_CAD/shaders/main_pipeline/common.hlsl      |   2 +-
 .../main_pipeline/fragment_shader.hlsl        |   6 +-
 6 files changed, 546 insertions(+), 278 deletions(-)

diff --git a/62_CAD/DrawResourcesFiller.cpp b/62_CAD/DrawResourcesFiller.cpp
index c83055f0e..d5babd393 100644
--- a/62_CAD/DrawResourcesFiller.cpp
+++ b/62_CAD/DrawResourcesFiller.cpp
@@ -6,7 +6,9 @@ DrawResourcesFiller::DrawResourcesFiller()
 DrawResourcesFiller::DrawResourcesFiller(smart_refctd_ptr<IUtilities>&& utils, IQueue* copyQueue) :
 	m_utilities(utils),
 	m_copyQueue(copyQueue)
-{}
+{
+	imagesUsageCache = std::unique_ptr<ImagesUsageCache>(new ImagesUsageCache(ImagesBindingArraySize));
+}
 
 // function is called when buffer is filled and we should submit draws and clear the buffers and continue filling
 
@@ -15,6 +17,13 @@ void DrawResourcesFiller::setSubmitDrawsFunction(const SubmitFunc& func)
 	submitDraws = func;
 }
 
+// DrawResourcesFiller needs to access these in order to allocate GPUImages and write the to their correct descriptor set binding
+void DrawResourcesFiller::setTexturesDescriptorSetAndBinding(core::smart_refctd_ptr<video::IGPUDescriptorSet>&& descriptorSet, uint32_t binding)
+{
+	imagesArrayBinding = binding;
+	suballocatedDescriptorSet = core::make_smart_refctd_ptr<SubAllocatedDescriptorSet>(std::move(descriptorSet));
+}
+
 void DrawResourcesFiller::allocateResourcesBuffer(ILogicalDevice* logicalDevice, size_t size)
 {
 	// TODO: Make this function failable and report insufficient memory if less that getMinimumRequiredResourcesBufferSize, TODO: Have retry mechanism to allocate less mem
@@ -239,14 +248,14 @@ void DrawResourcesFiller::drawHatch(
 	if (color.a == 0.0f) // not visible
 		return;
 
-	uint32_t textureIdx = InvalidTextureIdx;
+	uint32_t textureIdx = InvalidTextureIndex;
 	if (fillPattern != HatchFillPattern::SOLID_FILL)
 	{
 		MSDFInputInfo msdfInfo = MSDFInputInfo(fillPattern);
 		textureIdx = getMSDFIndexFromInputInfo(msdfInfo, intendedNextSubmit);
-		if (textureIdx == InvalidTextureIdx)
+		if (textureIdx == InvalidTextureIndex)
 			textureIdx = addMSDFTexture(msdfInfo, getHatchFillPatternMSDF(fillPattern), intendedNextSubmit);
-		_NBL_DEBUG_BREAK_IF(textureIdx == InvalidTextureIdx); // probably getHatchFillPatternMSDF returned nullptr
+		_NBL_DEBUG_BREAK_IF(textureIdx == InvalidTextureIndex); // probably getHatchFillPatternMSDF returned nullptr
 	}
 
 	LineStyleInfo lineStyle = {};
@@ -282,16 +291,16 @@ void DrawResourcesFiller::drawFontGlyph(
 		float32_t2 minUV,
 		SIntendedSubmitInfo& intendedNextSubmit)
 {
-	uint32_t textureIdx = InvalidTextureIdx;
+	uint32_t textureIdx = InvalidTextureIndex;
 	const MSDFInputInfo msdfInput = MSDFInputInfo(fontFace->getHash(), glyphIdx);
 	textureIdx = getMSDFIndexFromInputInfo(msdfInput, intendedNextSubmit);
-	if (textureIdx == InvalidTextureIdx)
+	if (textureIdx == InvalidTextureIndex)
 		textureIdx = addMSDFTexture(msdfInput, getGlyphMSDF(fontFace, glyphIdx), intendedNextSubmit);
 
 	uint32_t mainObjIdx = acquireActiveMainObjectIndex_SubmitIfNeeded(intendedNextSubmit);
 	assert(mainObjIdx != InvalidMainObjectIdx);
 
-	if (textureIdx != InvalidTextureIdx)
+	if (textureIdx != InvalidTextureIndex)
 	{
 		GlyphInfo glyphInfo = GlyphInfo(topLeft, dirU, aspectRatio, textureIdx, minUV);
 		if (!addFontGlyph_Internal(glyphInfo, mainObjIdx))
@@ -309,48 +318,167 @@ void DrawResourcesFiller::drawFontGlyph(
 	}
 }
 
-// TODO[Przemek]: similar to other drawXXX and drawXXX_internal functions that create mainobjects, drawObjects and push additional info in geometry buffer, input to function would be a GridDTMInfo
-// We don't have an allocator or memory management for texture updates yet, see how `_test_addImageObject` is being temporarily used (Descriptor updates and pipeline barriers) to upload an image into gpu and update a descriptor slot (it will become more sophisticated but doesn't block you)
-
-void DrawResourcesFiller::_test_addImageObject(float64_t2 topLeftPos, float32_t2 size, float32_t rotation, SIntendedSubmitInfo& intendedNextSubmit)
+uint32_t DrawResourcesFiller::addStaticImage2D(image_id imageID, const core::smart_refctd_ptr<ICPUImage>& cpuImage, SIntendedSubmitInfo& intendedNextSubmit)
 {
-	auto addImageObject_Internal = [&](const ImageObjectInfo& imageObjectInfo, uint32_t mainObjIdx) -> bool
+	/*
+	 * The `suballocatedDescriptorSet` manages indices (slots) into a array of textures binding.
+	 * This callback is invoked on eviction, and must:
+	 *   - Ensure safe deallocation of the slot.
+	 *   - Submit any pending draw calls if the evicted image was scheduled to be used in the upcoming submission.
+	 */
+	auto evictionCallback = [&](const ImageReference& evicted)
+	{
+		// Prepare wait info to defer index deallocation until the GPU has finished using the resource.
+		// Because we will be writing to the descriptor set location which might be in use.
+		ISemaphore::SWaitInfo deallocationWaitInfo = { .semaphore = intendedNextSubmit.getFutureScratchSemaphore().semaphore, .value = evicted.lastUsedSemaphoreValue };
+
+		const bool imageUsedForNextIntendedSubmit = (evicted.lastUsedSemaphoreValue == intendedNextSubmit.getFutureScratchSemaphore().value);
+
+		if (imageUsedForNextIntendedSubmit)
 		{
-			const size_t remainingResourcesSize = calculateRemainingResourcesSize();
-			
-			const uint32_t uploadableObjects = (remainingResourcesSize) / (sizeof(ImageObjectInfo) + sizeof(DrawObject) + sizeof(uint32_t) * 6u);
-			// TODO[ERFAN]: later take into account: our maximum indexable vertex 
+			// The evicted image is scheduled for use in the upcoming submit.
+			// To avoid rendering artifacts, we must flush the current draw queue now.
+			// After submission, we reset state so that data referencing the evicted slot can be re-uploaded.
+			suballocatedDescriptorSet->multi_deallocate(imagesArrayBinding, 1u, &evicted.index, deallocationWaitInfo);
+			submitDraws(intendedNextSubmit);
+			reset(); // resets everything, things referenced through mainObj and other shit will be pushed again through acquireXXX_SubmitIfNeeded
+		} 
+		else
+		{
+			// The image is not used in the current frame (intended next submit), so we can deallocate without submitting any draws.
+			// Still wait on the semaphore to ensure past GPU usage is complete.
+			suballocatedDescriptorSet->multi_deallocate(imagesArrayBinding, 1u, &evicted.index, deallocationWaitInfo);
+		}
+	};
 	
-			if (uploadableObjects <= 0u)
-				return false;
 
-			// Add Geometry
-			size_t geometryBufferOffset = resourcesCollection.geometryInfo.increaseSizeAndGetOffset(sizeof(ImageObjectInfo), alignof(ImageObjectInfo));
-			void* dst = resourcesCollection.geometryInfo.data() + geometryBufferOffset;
-			memcpy(dst, &imageObjectInfo, sizeof(ImageObjectInfo));
-
-			// Push Indices, remove later when compute fills this
-			uint32_t* indexBufferToBeFilled = resourcesCollection.indexBuffer.increaseCountAndGetPtr(6u * 1u);
-			const uint32_t startObj = resourcesCollection.drawObjects.getCount();
-			uint32_t i = 0u;
-			indexBufferToBeFilled[i*6]		= (startObj+i)*4u + 1u;
-			indexBufferToBeFilled[i*6 + 1u]	= (startObj+i)*4u + 0u;
-			indexBufferToBeFilled[i*6 + 2u]	= (startObj+i)*4u + 2u;
-			indexBufferToBeFilled[i*6 + 3u]	= (startObj+i)*4u + 1u;
-			indexBufferToBeFilled[i*6 + 4u]	= (startObj+i)*4u + 2u;
-			indexBufferToBeFilled[i*6 + 5u]	= (startObj+i)*4u + 3u;
-
-			// Add DrawObjs
-			DrawObject* drawObjectsToBeFilled = resourcesCollection.drawObjects.increaseCountAndGetPtr(1u);
-			DrawObject drawObj = {};
-			drawObj.mainObjIndex = mainObjIdx;
-			drawObj.type_subsectionIdx = uint32_t(static_cast<uint16_t>(ObjectType::IMAGE) | (0 << 16)); // TODO: use custom pack/unpack function
-			drawObj.geometryAddress = geometryBufferOffset;
-			drawObjectsToBeFilled[0u] = drawObj;
+	// Try inserting or updating the image usage in the cache.
+	// If the image is already present, updates its semaphore value.
+	ImageReference* inserted = imagesUsageCache->insert(imageID, intendedNextSubmit.getFutureScratchSemaphore().value, evictionCallback);
+	inserted->lastUsedSemaphoreValue = intendedNextSubmit.getFutureScratchSemaphore().value; // in case there was an eviction + auto-submit, we need to update AGAIN
 
-			return true;
-		};
+	// if inserted->index was not InvalidTextureIndex then it means we had a cache hit and updated the value of our sema
+	// in which case we don't queue anything for upload, and return the idx
+	if (inserted->index == InvalidTextureIndex)
+	{
+		// This is a new image (cache miss). Allocate a descriptor index for it.
+		inserted->index = video::SubAllocatedDescriptorSet::AddressAllocator::invalid_address;
+		// Blocking allocation attempt; if the descriptor pool is exhausted, this may stall.
+		suballocatedDescriptorSet->multi_allocate(std::chrono::time_point<std::chrono::steady_clock>::max(), imagesArrayBinding, 1u, &inserted->index); // if the prev submit causes DEVICE_LOST then we'll get a deadlock here since we're using max timepoint
+
+		if (inserted->index != video::SubAllocatedDescriptorSet::AddressAllocator::invalid_address)
+		{
+			auto* device = m_utilities->getLogicalDevice();
+			auto* physDev = m_utilities->getLogicalDevice()->getPhysicalDevice();
+
+			IGPUImage::SCreationParams imageParams = {};
+			imageParams = cpuImage->getCreationParameters();
+			imageParams.usage |= IGPUImage::EUF_TRANSFER_DST_BIT|IGPUImage::EUF_SAMPLED_BIT;
+
+			// promote format because RGB8 and friends don't actually exist in HW
+			{
+				const IPhysicalDevice::SImageFormatPromotionRequest request = {
+					.originalFormat = imageParams.format,
+					.usages = IPhysicalDevice::SFormatImageUsages::SUsage(imageParams.usage)
+				};
+				imageParams.format = physDev->promoteImageFormat(request,imageParams.tiling);
+			}
+
+			// Attempt to create a GPU image and image view for this texture.
+			core::smart_refctd_ptr<IGPUImageView> gpuImageView = nullptr;
+
+			// Attempt to create a GPU image and corresponding image view for this texture.
+			// If creation or memory allocation fails (likely due to VRAM exhaustion),
+			// we'll evict another texture from the LRU cache and retry until successful, or until only the currently-inserted image remains.
+			while (imagesUsageCache->size() > 0u)
+			{
+				// Try creating the image and allocating memory for it:
+				auto gpuImg = device->createImage(std::move(imageParams));
+				if (!gpuImg || !device->allocate(gpuImg->getMemoryReqs(), gpuImg.get()).isValid())
+				{
+					// Failed creating or allocating the image, evict and retry.
+					if (imagesUsageCache->size() == 1u)
+					{
+						// Nothing else to evict; give up.
+						// We probably have evicted almost every other texture except the one we just allocated an index for
+						break;
+					}
+
+					assert(imagesUsageCache->size() > 1u);
+
+					const image_id evictionCandidate = imagesUsageCache->select_eviction_candidate();
+					ImageReference* imageRef = imagesUsageCache->peek(evictionCandidate);
+					if (imageRef)
+						evictionCallback(*imageRef);
+					imagesUsageCache->erase(evictionCandidate);
+					suballocatedDescriptorSet->cull_frees(); // to make sure deallocation requests in eviction callback are waited for.
 
+					// we don't hold any references to the GPUImageView or GPUImage so descriptor binding will be the last reference
+					// hopefully by here the suballocated descriptor set freed some VRAM by dropping the image last ref and it's dedicated allocation.
+
+					continue; // Retry allocation after evicting.
+				}
+				
+				IGPUImageView::SCreationParams viewParams = {
+					.image = gpuImg,
+					.viewType = IGPUImageView::ET_2D,
+					.format = gpuImg->getCreationParameters().format
+				};
+				gpuImg->setObjectDebugName((std::to_string(imageID) + " Static Image 2D").c_str());
+				gpuImageView = device->createImageView(std::move(viewParams));
+				if (!gpuImageView)
+				{
+					// TODO[LOG]: that's rare, image view creation failed.
+					break;
+				}
+
+				gpuImageView->setObjectDebugName((std::to_string(imageID) + " Static Image View 2D").c_str());
+				break;
+			}
+
+			if (gpuImageView)
+			{
+				StaticImagesCopy copyToStage = 
+				{
+					.cpuImage = cpuImage,
+					.gpuImageView = gpuImageView,
+					.arrayIndex = inserted->index,
+				};
+				staticImagesStagedCopies.push_back(copyToStage);
+			}
+			else
+			{
+				// All attempts to create the GPU image and its corresponding view have failed.
+				// Most likely cause: insufficient GPU memory or unsupported image parameters.
+				// TODO: Log a warning or error here � `addStaticImage2D` failed, likely due to low VRAM.
+
+				if (inserted->index != InvalidTextureIndex)
+				{
+					// We previously allocated a descriptor index, but failed to create a usable GPU image.
+					// It's crucial to deallocate this index to avoid leaks and preserve descriptor pool space.
+					// No semaphore wait needed here, as the GPU never got to use this slot.
+					suballocatedDescriptorSet->multi_deallocate(imagesArrayBinding, 1u, &inserted->index, {});
+					inserted->index = InvalidTextureIndex;
+				}
+			}
+		}
+		else
+		{
+			// TODO: log here, index allocation failed.
+			inserted->index = InvalidTextureIndex;
+		}
+	}
+	
+	assert(inserted->index != InvalidTextureIndex); // shouldn't happen, because we're using LRU cache, so worst case eviction will happen + multi-deallocate and next next multi_allocate should definitely succeed
+
+	return inserted->index;
+}
+
+// TODO[Przemek]: similar to other drawXXX and drawXXX_internal functions that create mainobjects, drawObjects and push additional info in geometry buffer, input to function would be a GridDTMInfo
+// We don't have an allocator or memory management for texture updates yet, see how `_test_addImageObject` is being temporarily used (Descriptor updates and pipeline barriers) to upload an image into gpu and update a descriptor slot (it will become more sophisticated but doesn't block you)
+
+void DrawResourcesFiller::addImageObject(image_id imageID, float64_t2 topLeftPos, float32_t2 size, float32_t rotation, SIntendedSubmitInfo& intendedNextSubmit)
+{
 	beginMainObject(MainObjectType::IMAGE);
 
 	uint32_t mainObjIdx = acquireActiveMainObjectIndex_SubmitIfNeeded(intendedNextSubmit);
@@ -359,7 +487,7 @@ void DrawResourcesFiller::_test_addImageObject(float64_t2 topLeftPos, float32_t2
 	info.topLeft = topLeftPos;
 	info.dirU = float32_t2(size.x * cos(rotation), size.x * sin(rotation)); // 
 	info.aspectRatio = size.y / size.x;
-	info.textureID = 0u;
+	info.textureID = getImageIndexFromID(imageID, intendedNextSubmit); // for this to be valid and safe, this function needs to be called immediately after `addStaticImage` function to make sure image is in memory
 	if (!addImageObject_Internal(info, mainObjIdx))
 	{
 		// single image object couldn't fit into memory to push to gpu, so we submit rendering current objects and reset geometry buffer and draw objects
@@ -376,7 +504,7 @@ bool DrawResourcesFiller::pushAllUploads(SIntendedSubmitInfo& intendedNextSubmit
 	if (!intendedNextSubmit.valid())
 	{
 		// It is a caching submit without command buffer, just for the purpose of accumulation of staging resources
-		// In that case we don't push any uploads (i.e. we don't record any copy commmand in active command buffer, because there is no active command buffer)
+		// In that case we don't push any uploads (i.e. we don't record any stagedStaticImage commmand in active command buffer, because there is no active command buffer)
 		return false;
 	}
 
@@ -386,12 +514,14 @@ bool DrawResourcesFiller::pushAllUploads(SIntendedSubmitInfo& intendedNextSubmit
 		// This means we're in a replay cache scope, use the replay cache to push to GPU instead of internal accumulation
 		success &= pushBufferUploads(intendedNextSubmit, currentReplayCache->resourcesCollection);
 		success &= pushMSDFImagesUploads(intendedNextSubmit, currentReplayCache->msdfStagedCPUImages);
+		// TODO: pushStaticImagesUploads
 	}
 	else
 	{
 		flushDrawObjects();
 		success &= pushBufferUploads(intendedNextSubmit, resourcesCollection);
 		success &= pushMSDFImagesUploads(intendedNextSubmit, msdfStagedCPUImages);
+		success &= pushStaticImagesUploads(intendedNextSubmit);
 	}
 	return success;
 }
@@ -483,10 +613,7 @@ std::unique_ptr<DrawResourcesFiller::ReplayCache> DrawResourcesFiller::createRep
 	ret->resourcesCollection = resourcesCollection;
 	ret->msdfStagedCPUImages = msdfStagedCPUImages;
 	for (auto& stagedMSDF : ret->msdfStagedCPUImages)
-	{
 		stagedMSDF.uploadedToGPU = false; // to trigger upload for all msdf functions again.
-		stagedMSDF.usedThisFrame = false;
-	}
 	ret->drawCallsData = drawCalls;
 	ret->activeMainObjectIndex = activeMainObjectIndex;
 	return ret;
@@ -568,7 +695,7 @@ bool DrawResourcesFiller::pushMSDFImagesUploads(SIntendedSubmitInfo& intendedNex
 
 		auto msdfImage = msdfTextureArray->getCreationParameters().image;
 
-		// preparing msdfs for copy
+		// preparing msdfs for stagedStaticImage
 		using image_barrier_t = IGPUCommandBuffer::SPipelineBarrierDependencyInfo::image_barrier_t;
 		image_barrier_t beforeTransferImageBarrier[] =
 		{
@@ -682,6 +809,138 @@ bool DrawResourcesFiller::pushMSDFImagesUploads(SIntendedSubmitInfo& intendedNex
 	}
 }
 
+bool DrawResourcesFiller::pushStaticImagesUploads(SIntendedSubmitInfo& intendedNextSubmit)
+{
+	auto* device = m_utilities->getLogicalDevice();
+	auto* physDev = m_utilities->getLogicalDevice()->getPhysicalDevice();
+	auto* descriptorSet = suballocatedDescriptorSet->getDescriptorSet();
+	auto* cmdBuffInfo = intendedNextSubmit.getCommandBufferForRecording();
+	
+	if (cmdBuffInfo)
+	{
+		bool success = true;
+
+		if (staticImagesStagedCopies.size() > 0ull)
+		{
+			IGPUCommandBuffer* commandBuffer = cmdBuffInfo->cmdbuf;
+
+			// DescriptorSet Updates
+			std::vector<video::IGPUDescriptorSet::SDescriptorInfo> descriptorInfos;
+			std::vector<IGPUDescriptorSet::SWriteDescriptorSet> descriptorWrites;
+			descriptorInfos.resize(staticImagesStagedCopies.size());
+			descriptorWrites.resize(staticImagesStagedCopies.size());
+			for (uint32_t i = 0u; i < staticImagesStagedCopies.size(); ++i)
+			{
+				auto& stagedStaticImage = staticImagesStagedCopies[i];
+				// Bind gpu image view to descriptor set
+				descriptorInfos[i].info.image.imageLayout = IImage::LAYOUT::READ_ONLY_OPTIMAL;
+				descriptorInfos[i].desc = stagedStaticImage.gpuImageView;
+
+				// consider batching contiguous writes, if descriptor set updating was a hotspot
+				descriptorWrites[i].dstSet = descriptorSet,
+					descriptorWrites[i].binding = imagesArrayBinding;
+				descriptorWrites[i].arrayElement = stagedStaticImage.arrayIndex;
+				descriptorWrites[i].count = 1u;
+				descriptorWrites[i].info = &descriptorInfos[i];
+			}
+
+			success &= device->updateDescriptorSets(descriptorWrites.size(), descriptorWrites.data(), 0u, nullptr);
+
+			std::vector<IGPUCommandBuffer::SPipelineBarrierDependencyInfo::image_barrier_t> beforeCopyImageBarriers;
+			beforeCopyImageBarriers.resize(staticImagesStagedCopies.size());
+
+			// Pipeline Barriers before stagedStaticImage
+			for (uint32_t i = 0u; i < staticImagesStagedCopies.size(); ++i)
+			{
+				auto& stagedStaticImage = staticImagesStagedCopies[i];
+				const auto& gpuImg = stagedStaticImage.gpuImageView->getCreationParameters().image;
+				beforeCopyImageBarriers[i] =
+				{
+					.barrier = {
+						.dep = {
+							.srcStageMask = PIPELINE_STAGE_FLAGS::NONE, // previous top of pipe -> top_of_pipe in first scope = none
+							.srcAccessMask = ACCESS_FLAGS::NONE,
+							.dstStageMask = PIPELINE_STAGE_FLAGS::COPY_BIT,
+							.dstAccessMask = ACCESS_FLAGS::TRANSFER_WRITE_BIT,
+						}
+						// .ownershipOp. No queueFam ownership transfer
+					},
+					.image = gpuImg.get(),
+					.subresourceRange = {
+						.aspectMask = IImage::E_ASPECT_FLAGS::EAF_COLOR_BIT,
+						.baseMipLevel = 0u,
+						.levelCount = ICPUImageView::remaining_mip_levels,
+						.baseArrayLayer = 0u,
+						.layerCount = ICPUImageView::remaining_array_layers
+					},
+					.oldLayout = IImage::LAYOUT::UNDEFINED,
+					.newLayout = IImage::LAYOUT::TRANSFER_DST_OPTIMAL,
+				};
+			}
+			success &= commandBuffer->pipelineBarrier(E_DEPENDENCY_FLAGS::EDF_NONE, { .imgBarriers = beforeCopyImageBarriers });
+
+			for (uint32_t i = 0u; i < staticImagesStagedCopies.size(); ++i)
+			{
+				auto& stagedStaticImage = staticImagesStagedCopies[i];
+				auto& gpuImg = stagedStaticImage.gpuImageView->getCreationParameters().image;
+				success &= m_utilities->updateImageViaStagingBuffer(
+					intendedNextSubmit,
+					stagedStaticImage.cpuImage->getBuffer()->getPointer(), stagedStaticImage.cpuImage->getCreationParameters().format,
+					gpuImg.get(), IImage::LAYOUT::TRANSFER_DST_OPTIMAL,
+					stagedStaticImage.cpuImage->getRegions());
+			}
+
+			std::vector<IGPUCommandBuffer::SPipelineBarrierDependencyInfo::image_barrier_t> afterCopyImageBarriers;
+			afterCopyImageBarriers.resize(staticImagesStagedCopies.size());
+
+			// Pipeline Barriers before stagedStaticImage
+			for (uint32_t i = 0u; i < staticImagesStagedCopies.size(); ++i)
+			{
+				auto& stagedStaticImage = staticImagesStagedCopies[i];
+				const auto& gpuImg = stagedStaticImage.gpuImageView->getCreationParameters().image;
+				afterCopyImageBarriers[i] =
+				{
+					.barrier = {
+						.dep = {
+							.srcStageMask = PIPELINE_STAGE_FLAGS::COPY_BIT, // previous top of pipe -> top_of_pipe in first scope = none
+							.srcAccessMask = ACCESS_FLAGS::TRANSFER_WRITE_BIT,
+							.dstStageMask = PIPELINE_STAGE_FLAGS::FRAGMENT_SHADER_BIT,
+							.dstAccessMask = ACCESS_FLAGS::SHADER_READ_BITS,
+						}
+						// .ownershipOp. No queueFam ownership transfer
+					},
+					.image = gpuImg.get(),
+					.subresourceRange = {
+						.aspectMask = IImage::E_ASPECT_FLAGS::EAF_COLOR_BIT,
+						.baseMipLevel = 0u,
+						.levelCount = ICPUImageView::remaining_mip_levels,
+						.baseArrayLayer = 0u,
+						.layerCount = ICPUImageView::remaining_array_layers
+					},
+					.oldLayout = IImage::LAYOUT::TRANSFER_DST_OPTIMAL,
+					.newLayout = IImage::LAYOUT::READ_ONLY_OPTIMAL,
+				};
+			}
+			success &= commandBuffer->pipelineBarrier(E_DEPENDENCY_FLAGS::EDF_NONE, { .imgBarriers = afterCopyImageBarriers });
+		}
+
+		staticImagesStagedCopies.clear();
+		if (!success)
+		{
+			// TODO: Log
+			_NBL_DEBUG_BREAK_IF(true);
+		}
+		return success;
+
+	}
+	else
+	{
+		// TODO: Log
+			_NBL_DEBUG_BREAK_IF(true);
+		return false;
+	}
+}
+
 const size_t DrawResourcesFiller::calculateRemainingResourcesSize() const
 {
 	assert(resourcesGPUBuffer->getSize() >= resourcesCollection.calculateTotalConsumption());
@@ -1188,6 +1447,55 @@ bool DrawResourcesFiller::addFontGlyph_Internal(const GlyphInfo& glyphInfo, uint
 	return true;
 }
 
+bool DrawResourcesFiller::addImageObject_Internal(const ImageObjectInfo& imageObjectInfo, uint32_t mainObjIdx)
+{
+	const size_t remainingResourcesSize = calculateRemainingResourcesSize();
+
+	const uint32_t uploadableObjects = (remainingResourcesSize) / (sizeof(ImageObjectInfo) + sizeof(DrawObject) + sizeof(uint32_t) * 6u);
+	// TODO[ERFAN]: later take into account: our maximum indexable vertex 
+
+	if (uploadableObjects <= 0u)
+		return false;
+
+	// Add Geometry
+	size_t geometryBufferOffset = resourcesCollection.geometryInfo.increaseSizeAndGetOffset(sizeof(ImageObjectInfo), alignof(ImageObjectInfo));
+	void* dst = resourcesCollection.geometryInfo.data() + geometryBufferOffset;
+	memcpy(dst, &imageObjectInfo, sizeof(ImageObjectInfo));
+
+	// Push Indices, remove later when compute fills this
+	uint32_t* indexBufferToBeFilled = resourcesCollection.indexBuffer.increaseCountAndGetPtr(6u * 1u);
+	const uint32_t startObj = resourcesCollection.drawObjects.getCount();
+	uint32_t i = 0u;
+	indexBufferToBeFilled[i * 6] = (startObj + i) * 4u + 1u;
+	indexBufferToBeFilled[i * 6 + 1u] = (startObj + i) * 4u + 0u;
+	indexBufferToBeFilled[i * 6 + 2u] = (startObj + i) * 4u + 2u;
+	indexBufferToBeFilled[i * 6 + 3u] = (startObj + i) * 4u + 1u;
+	indexBufferToBeFilled[i * 6 + 4u] = (startObj + i) * 4u + 2u;
+	indexBufferToBeFilled[i * 6 + 5u] = (startObj + i) * 4u + 3u;
+
+	// Add DrawObjs
+	DrawObject* drawObjectsToBeFilled = resourcesCollection.drawObjects.increaseCountAndGetPtr(1u);
+	DrawObject drawObj = {};
+	drawObj.mainObjIndex = mainObjIdx;
+	drawObj.type_subsectionIdx = uint32_t(static_cast<uint16_t>(ObjectType::IMAGE) | (0 << 16)); // TODO: use custom pack/unpack function
+	drawObj.geometryAddress = geometryBufferOffset;
+	drawObjectsToBeFilled[0u] = drawObj;
+
+	return true;
+}
+
+uint32_t DrawResourcesFiller::getImageIndexFromID(image_id imageID, const SIntendedSubmitInfo& intendedNextSubmit)
+{
+	uint32_t textureIdx = InvalidTextureIndex;
+	ImageReference* imageRef = imagesUsageCache->get(imageID);
+	if (imageRef)
+	{
+		textureIdx = imageRef->index;
+		imageRef->lastUsedSemaphoreValue = intendedNextSubmit.getFutureScratchSemaphore().value; // update this because the texture will get used on the next submit
+	}
+	return textureIdx;
+}
+
 void DrawResourcesFiller::setGlyphMSDFTextureFunction(const GetGlyphMSDFTextureFunc& func)
 {
 	getGlyphMSDF = func;
@@ -1198,43 +1506,76 @@ void DrawResourcesFiller::setHatchFillMSDFTextureFunction(const GetHatchFillPatt
 	getHatchFillPatternMSDF = func;
 }
 
+uint32_t DrawResourcesFiller::getMSDFIndexFromInputInfo(const MSDFInputInfo& msdfInfo, const SIntendedSubmitInfo& intendedNextSubmit)
+{
+	uint32_t textureIdx = InvalidTextureIndex;
+	MSDFReference* tRef = msdfLRUCache->get(msdfInfo);
+	if (tRef)
+	{
+		textureIdx = tRef->alloc_idx;
+		tRef->lastUsedSemaphoreValue = intendedNextSubmit.getFutureScratchSemaphore().value; // update this because the texture will get used on the next submit
+	}
+	return textureIdx;
+}
+
 uint32_t DrawResourcesFiller::addMSDFTexture(const MSDFInputInfo& msdfInput, core::smart_refctd_ptr<ICPUImage>&& cpuImage, SIntendedSubmitInfo& intendedNextSubmit)
 {
 	if (!cpuImage)
-		return InvalidTextureIdx; // TODO: Log
+		return InvalidTextureIndex; // TODO: Log
 
 	const auto cpuImageSize = cpuImage->getMipSize(0);
 	const bool sizeMatch = cpuImageSize.x == getMSDFResolution().x && cpuImageSize.y == getMSDFResolution().y && cpuImageSize.z == 1u;
 	if (!sizeMatch)
-		return InvalidTextureIdx; // TODO: Log
-
-	// TextureReferences hold the semaValue related to the "scratch semaphore" in IntendedSubmitInfo
-	// Every single submit increases this value by 1
-	// The reason for holding on to the lastUsedSema is deferred dealloc, which we call in the case of eviction, making sure we get rid of the entry inside the allocator only when the texture is done being used
-	const auto nextSemaSignal = intendedNextSubmit.getFutureScratchSemaphore();
-
+		return InvalidTextureIndex; // TODO: Log
+
+	/*
+	 * The `msdfTextureArrayIndexAllocator` manages indices (slots) into a texture array for MSDF images.
+	 * When all slots are occupied, the least recently used entry is evicted via `msdfLRUCache`.
+	 * This callback is invoked on eviction, and must:
+	 *   - Ensure safe deallocation of the slot.
+	 *   - Submit any pending draw calls if the evicted MSDF was scheduled to be used in the upcoming submission.
+	 */
 	auto evictionCallback = [&](const MSDFReference& evicted)
 	{
-		if (msdfStagedCPUImages[evicted.alloc_idx].usedThisFrame)
+		// Prepare wait info to defer index deallocation until the GPU has finished using the resource.
+		// NOTE: This wait is currently *not* required for correctness because:
+		//   - Both the image upload (stagedStaticImage) and usage occur within the same timeline (`intendedNextSubmit`).
+		//   - timeline semaphores guarantee proper ordering: the next submit's stagedStaticImage will wait on the prior usage.
+		//   - Therefore, we can safely overwrite or reallocate the slot without waiting for explicit GPU completion.
+		//
+		// However, this `deallocationWaitInfo` *will* become essential if we start interacting with MSDF images
+		// outside the `intendedNextSubmit` timeline � for example, issuing uploads via a transfer queue or using a separate command buffer and timeline.
+		ISemaphore::SWaitInfo deallocationWaitInfo = { .semaphore = intendedNextSubmit.getFutureScratchSemaphore().semaphore, .value = evicted.lastUsedSemaphoreValue };
+
+		const bool imageUsedForNextIntendedSubmit = (evicted.lastUsedSemaphoreValue == intendedNextSubmit.getFutureScratchSemaphore().value);
+
+		if (imageUsedForNextIntendedSubmit)
 		{
-			// Dealloc once submission is finished
-			msdfTextureArrayIndexAllocator->multi_deallocate(1u, &evicted.alloc_idx, nextSemaSignal);
+			// The evicted image is scheduled for use in the upcoming submit.
+			// To avoid rendering artifacts, we must flush the current draw queue now.
+			// After submission, we reset state so that data referencing the evicted slot can be re-uploaded.
+			msdfTextureArrayIndexAllocator->multi_deallocate(1u, &evicted.alloc_idx, deallocationWaitInfo);
 			submitDraws(intendedNextSubmit);
 			reset(); // resets everything, things referenced through mainObj and other shit will be pushed again through acquireXXX_SubmitIfNeeded
 		} 
 		else
 		{
-			// We didn't use it this frame, so it's safe to dealloc now, withou needing to "overflow" submit
-			msdfTextureArrayIndexAllocator->multi_deallocate(1u, &evicted.alloc_idx);
+			// The image is not used in the current frame (intended next submit), so we can deallocate without submitting any draws.
+			// Still wait on the semaphore to ensure past GPU usage is complete (read note above).
+			msdfTextureArrayIndexAllocator->multi_deallocate(1u, &evicted.alloc_idx, deallocationWaitInfo);
 		}
+		
+		// Clear CPU-side metadata associated with the evicted slot.
 		msdfStagedCPUImages[evicted.alloc_idx].evict();
 	};
 	
 	// We pass nextSemaValue instead of constructing a new MSDFReference and passing it into `insert` that's because we might get a cache hit and only update the value of the nextSema
-	MSDFReference* inserted = msdfLRUCache->insert(msdfInput, nextSemaSignal.value, evictionCallback);
+	MSDFReference* inserted = msdfLRUCache->insert(msdfInput, intendedNextSubmit.getFutureScratchSemaphore().value, evictionCallback);
 	
-	// if inserted->alloc_idx was not InvalidTextureIdx then it means we had a cache hit and updated the value of our sema, in which case we don't queue anything for upload, and return the idx
-	if (inserted->alloc_idx == InvalidTextureIdx)
+	inserted->lastUsedSemaphoreValue = intendedNextSubmit.getFutureScratchSemaphore().value; // in case there was an eviction + auto-submit, we need to update AGAIN
+
+	// if inserted->alloc_idx was not InvalidTextureIndex then it means we had a cache hit and updated the value of our sema, in which case we don't queue anything for upload, and return the idx
+	if (inserted->alloc_idx == InvalidTextureIndex)
 	{
 		// New insertion == cache miss happened and insertion was successfull
 		inserted->alloc_idx = IndexAllocator::AddressAllocator::invalid_address;
@@ -1242,22 +1583,18 @@ uint32_t DrawResourcesFiller::addMSDFTexture(const MSDFInputInfo& msdfInput, cor
 
 		if (inserted->alloc_idx != IndexAllocator::AddressAllocator::invalid_address)
 		{
-			// We stage copy, pushMSDFImagesUploads will push it into GPU
+			// We stage stagedStaticImage, pushMSDFImagesUploads will push it into GPU
 			msdfStagedCPUImages[inserted->alloc_idx].image = std::move(cpuImage);
 			msdfStagedCPUImages[inserted->alloc_idx].uploadedToGPU = false;
 		}
 		else
 		{
 			// TODO: log here, assert will be called in a few lines
-			inserted->alloc_idx = InvalidTextureIdx;
+			inserted->alloc_idx = InvalidTextureIndex;
 		}
 	}
 	
-	assert(inserted->alloc_idx != InvalidTextureIdx); // shouldn't happen, because we're using LRU cache, so worst case eviction will happen + multi-deallocate and next next multi_allocate should definitely succeed
-	if (inserted->alloc_idx != InvalidTextureIdx)
-	{
-		msdfStagedCPUImages[inserted->alloc_idx].usedThisFrame = true;
-	}
+	assert(inserted->alloc_idx != InvalidTextureIndex); // shouldn't happen, because we're using LRU cache, so worst case eviction will happen + multi-deallocate and next next multi_allocate should definitely succeed
 
 	return inserted->alloc_idx;
 }
@@ -1273,4 +1610,4 @@ void DrawResourcesFiller::flushDrawObjects()
 		drawCalls.push_back(drawCall);
 		drawObjectsFlushedToDrawCalls = resourcesCollection.drawObjects.getCount();
 	}
-}
+}
\ No newline at end of file
diff --git a/62_CAD/DrawResourcesFiller.h b/62_CAD/DrawResourcesFiller.h
index aec6e3b8e..1a86c09e2 100644
--- a/62_CAD/DrawResourcesFiller.h
+++ b/62_CAD/DrawResourcesFiller.h
@@ -3,10 +3,11 @@
 #include "CTriangleMesh.h"
 #include "Hatch.h"
 #include "IndexAllocator.h"
+#include "ImagesUsageCache.h"
 #include <nbl/video/utilities/SIntendedSubmitInfo.h>
 #include <nbl/core/containers/LRUCache.h>  
 #include <nbl/ext/TextRendering/TextRendering.h>
-
+// #include <nbl/video/alloc/SubAllocatedDescriptorSet.h>
 using namespace nbl;
 using namespace nbl::video;
 using namespace nbl::core;
@@ -126,6 +127,9 @@ struct DrawResourcesFiller
 	typedef std::function<void(SIntendedSubmitInfo&)> SubmitFunc;
 	void setSubmitDrawsFunction(const SubmitFunc& func);
 	
+	// DrawResourcesFiller needs to access these in order to allocate GPUImages and write the to their correct descriptor set binding
+	void setTexturesDescriptorSetAndBinding(core::smart_refctd_ptr<video::IGPUDescriptorSet>&& descriptorSet, uint32_t binding);
+
 	/// @brief Get minimum required size for resources buffer (containing objects and geometry info and their settings)
 	static constexpr size_t getMinimumRequiredResourcesBufferSize()
 	{
@@ -200,11 +204,38 @@ struct DrawResourcesFiller
 		float32_t2 minUV,
 		SIntendedSubmitInfo& intendedNextSubmit);
 	
-	void _test_addImageObject(
-		float64_t2 topLeftPos,
-		float32_t2 size,
-		float32_t rotation,
-		SIntendedSubmitInfo& intendedNextSubmit);
+	/**
+	 * @brief Adds a static 2D image to the draw resource set for rendering.
+	 *
+	 * This function ensures that a given image is available as a GPU-resident texture for future draw submissions.
+	 * It uses an LRU cache to manage descriptor set slots and evicts old images if necessary to make room for new ones.
+	 *
+	 * If the image is already cached and its slot is valid, it returns the slot index directly.
+	 * Otherwise, it performs the following:
+	 *   - Allocates a new descriptor set slot.
+	 *   - Promotes the image format to be GPU-compatible.
+	 *   - Creates a GPU image and GPU image view.
+	 *   - Queues the image for uploading via staging in the next submit.
+	 *   - If memory is constrained, attempts to evict other images to free up space.
+	 *
+	 * @param imageID              Unique identifier for the image resource.
+	 * @param cpuImage             The CPU-side image resource to (possibly) upload.
+	 * @param intendedNextSubmit   Struct representing the upcoming submission, including a semaphore for safe scheduling.
+	 *
+	 * @return The index (slot) into the descriptor set array where the image is or will be bound.
+	 *         Returns `InvalidTextureIndex` only if all fallback and eviction attempts failed.
+	 *
+	 * @note This function ensures that the descriptor slot is not reused while the GPU may still be reading from it.
+	 *       If an eviction is required and the evicted image is scheduled to be used in the next submit, it triggers
+	 *       a flush of pending draws to preserve correctness.
+	 *
+	 * @note The function uses the `imagesUsageCache` LRU cache to track usage and validity of texture slots.
+	 *       If an insertion leads to an eviction, a callback ensures proper deallocation and synchronization.
+	*/
+	uint32_t addStaticImage2D(image_id imageID, const core::smart_refctd_ptr<ICPUImage>& cpuImage, SIntendedSubmitInfo& intendedNextSubmit);
+
+	// This function must be called immediately after `addStaticImage` for the same imageID.
+	void addImageObject(image_id imageID, float64_t2 topLeftPos, float32_t2 size, float32_t rotation, SIntendedSubmitInfo& intendedNextSubmit);
 	
 	/// @brief call this function before submitting to ensure all buffer and textures resourcesCollection requested via drawing calls are copied to GPU
 	/// records copy command into intendedNextSubmit's active command buffer and might possibly submits if fails allocation on staging upload memory.
@@ -219,7 +250,6 @@ struct DrawResourcesFiller
 		resetCustomClipRects();
 		resetLineStyles();
 		resetDTMSettings();
-		resetMSDFsUsageState();
 
 		drawObjectsFlushedToDrawCalls = 0ull;
 		drawCalls.clear();
@@ -267,15 +297,12 @@ struct DrawResourcesFiller
 	{
 		core::smart_refctd_ptr<ICPUImage> image;
 		bool uploadedToGPU : 1u;
-		// TODO: Use frame counter instead, generalize struct to all textures probably, DONT try to abuse scratchSema.nextSignal as frame tracker, because there can be "cached" draws where no submits happen.
-		bool usedThisFrame : 1u;
 
 		bool isValid() const { return image.get() != nullptr; }
 		void evict()
 		{
 			image = nullptr;
 			uploadedToGPU = false;
-			usedThisFrame = false;
 		}
 	};
 
@@ -353,6 +380,10 @@ struct DrawResourcesFiller
 	/// @brief Records GPU copy commands for all staged msdf images into the active command buffer.
 	bool pushMSDFImagesUploads(SIntendedSubmitInfo& intendedNextSubmit, std::vector<MSDFStagedCPUImage>& stagedMSDFCPUImages);
 
+	/// @brief Records GPU copy commands for all staged msdf images into the active command buffer.
+	/// TODO: Handle for cache&replay mode later
+	bool pushStaticImagesUploads(SIntendedSubmitInfo& intendedNextSubmit);
+
 	const size_t calculateRemainingResourcesSize() const;
 
 	/// @brief Internal Function to call whenever we overflow when we can't fill all of mainObject's drawObjects
@@ -416,6 +447,11 @@ struct DrawResourcesFiller
 	/// Attempts to upload a single GlyphInfo considering resource limitations
 	bool addFontGlyph_Internal(const GlyphInfo& glyphInfo, uint32_t mainObjIdx);
 	
+	/// Attempts to upload a single image object considering resource limitations (not accounting for the resource image added using addStaticImage2D function)
+	bool addImageObject_Internal(const ImageObjectInfo& imageObjectInfo, uint32_t mainObjIdx);;
+	
+	uint32_t getImageIndexFromID(image_id imageID, const SIntendedSubmitInfo& intendedNextSubmit);
+
 	void resetMainObjects()
 	{
 		resourcesCollection.mainObjects.vector.clear();
@@ -460,12 +496,6 @@ struct DrawResourcesFiller
 		activeDTMSettingsIndex = InvalidDTMSettingsIdx;
 	}
 	
-	void resetMSDFsUsageState()
-	{
-		for (auto& stagedMSDF : msdfStagedCPUImages)
-			stagedMSDF.usedThisFrame = false;
-	}
-
 	// MSDF Hashing and Caching Internal Functions 
 	enum class MSDFType : uint8_t
 	{
@@ -526,41 +556,22 @@ struct DrawResourcesFiller
 	};
 
 	struct MSDFInputInfoHash { std::size_t operator()(const MSDFInputInfo& info) const { return info.lookupHash; } };
-
+	
 	struct MSDFReference
 	{
 		uint32_t alloc_idx;
 		uint64_t lastUsedSemaphoreValue;
 
 		MSDFReference(uint32_t alloc_idx, uint64_t semaphoreVal) : alloc_idx(alloc_idx), lastUsedSemaphoreValue(semaphoreVal) {}
-		MSDFReference(uint64_t semaphoreVal) : MSDFReference(InvalidTextureIdx, semaphoreVal) {}
-		MSDFReference() : MSDFReference(InvalidTextureIdx, ~0ull) {}
+		MSDFReference(uint64_t semaphoreVal) : MSDFReference(InvalidTextureIndex, semaphoreVal) {}
+		MSDFReference() : MSDFReference(InvalidTextureIndex, ~0ull) {}
 
 		// In LRU Cache `insert` function, in case of cache hit, we need to assign semaphore value to MSDFReference without changing `alloc_idx`
 		inline MSDFReference& operator=(uint64_t semamphoreVal) { lastUsedSemaphoreValue = semamphoreVal; return *this;  }
 	};
 	
-	uint32_t getMSDFIndexFromInputInfo(const MSDFInputInfo& msdfInfo, SIntendedSubmitInfo& intendedNextSubmit)
-	{
-		uint32_t textureIdx = InvalidTextureIdx;
-		MSDFReference* tRef = msdfLRUCache->get(msdfInfo);
-		if (tRef)
-		{
-			textureIdx = tRef->alloc_idx;
-			tRef->lastUsedSemaphoreValue = intendedNextSubmit.getFutureScratchSemaphore().value; // update this because the texture will get used on the next submit
-			if (textureIdx >= 0u && textureIdx < msdfStagedCPUImages.size())
-				msdfStagedCPUImages[textureIdx].usedThisFrame = true;
-			else
-			{
-				// shouldn't happen, TODO: Log
-				assert(false);
-			}
-		}
-		return textureIdx;
-	}
+	uint32_t getMSDFIndexFromInputInfo(const MSDFInputInfo& msdfInfo, const SIntendedSubmitInfo& intendedNextSubmit);
 	
-	// ! mainObjIdx: make sure to pass your mainObjIdx to it if you want it to stay synced/updated if some overflow submit occured which would potentially erase what your mainObject points at.
-	// If you haven't created a mainObject yet, then pass InvalidMainObjectIdx
 	uint32_t addMSDFTexture(const MSDFInputInfo& msdfInput, core::smart_refctd_ptr<ICPUImage>&& cpuImage, SIntendedSubmitInfo& intendedNextSubmit);
 	
 	// Flushes Current Draw Call and adds to drawCalls
@@ -593,6 +604,7 @@ struct DrawResourcesFiller
 	TransformationType activeMainObjectTransformationType;
 
 	uint32_t activeMainObjectIndex = InvalidMainObjectIdx;
+
 	// The ClipRects & Projections are stack, because user can push/pop ClipRects & Projections in any order
 	std::deque<float64_t3x3> activeProjections; // stack of projections stored so we can resubmit them if geometry buffer got reset.
 	std::deque<uint32_t> activeProjectionIndices; // stack of projection gpu addresses in geometry buffer. to keep track of them in push/pops
@@ -603,14 +615,29 @@ struct DrawResourcesFiller
 	GetGlyphMSDFTextureFunc getGlyphMSDF;
 	GetHatchFillPatternMSDFTextureFunc getHatchFillPatternMSDF;
 
-	using MSDFsLRUCache = core::LRUCache<MSDFInputInfo, MSDFReference, MSDFInputInfoHash>;
+	using MSDFsLRUCache = core::ResizableLRUCache<MSDFInputInfo, MSDFReference, MSDFInputInfoHash>;
 	smart_refctd_ptr<IGPUImageView>		msdfTextureArray; // view to the resource holding all the msdfs in it's layers
 	smart_refctd_ptr<IndexAllocator>	msdfTextureArrayIndexAllocator;
 	std::unique_ptr<MSDFsLRUCache>		msdfLRUCache; // LRU Cache to evict Least Recently Used in case of overflow
 
 	std::vector<MSDFStagedCPUImage>		msdfStagedCPUImages = {}; // cached cpu imaged + their status, size equals to LRUCache size
 	static constexpr asset::E_FORMAT	MSDFTextureFormat = asset::E_FORMAT::EF_R8G8B8A8_SNORM;
+	
+	// Images:
+	std::unique_ptr<ImagesUsageCache> imagesUsageCache;
+	smart_refctd_ptr<SubAllocatedDescriptorSet> suballocatedDescriptorSet;
+	uint32_t imagesArrayBinding = 0u;
+	
+	// static images (not streamable):
+	struct StaticImagesCopy
+	{
+		core::smart_refctd_ptr<ICPUImage> cpuImage;
+		core::smart_refctd_ptr<IGPUImageView> gpuImageView;
+		uint32_t arrayIndex;
+	};
+	std::vector<StaticImagesCopy> staticImagesStagedCopies;
 
+	
 	bool m_hasInitializedMSDFTextureArrays = false;
 };
 
diff --git a/62_CAD/main.cpp b/62_CAD/main.cpp
index c7fe04603..0aad1669e 100644
--- a/62_CAD/main.cpp
+++ b/62_CAD/main.cpp
@@ -78,7 +78,7 @@ constexpr std::array<float, (uint32_t)ExampleMode::CASE_COUNT> cameraExtents =
 	10.0	// CASE_BUG
 };
 
-constexpr ExampleMode mode = ExampleMode::CASE_9;
+constexpr ExampleMode mode = ExampleMode::CASE_7;
 
 class Camera2D
 {
@@ -666,6 +666,7 @@ class ComputerAidedDesign final : public examples::SimpleWindowedApplication, pu
 			IGPUDescriptorSetLayout::SBinding::E_CREATE_FLAGS::ECF_PARTIALLY_BOUND_BIT;
 
 		// Create DescriptorSetLayout, PipelineLayout and update DescriptorSets
+		const uint32_t imagesBinding = 3u;
 		{
 			video::IGPUDescriptorSetLayout::SBinding bindingsSet0[] = {
 				{
@@ -690,11 +691,11 @@ class ComputerAidedDesign final : public examples::SimpleWindowedApplication, pu
 					.count = 1u,
 				},
 				{
-					.binding = 3u,
+					.binding = imagesBinding,
 					.type = asset::IDescriptor::E_TYPE::ET_SAMPLED_IMAGE,
 					.createFlags = bindlessTextureFlags,
 					.stageFlags = asset::IShader::E_SHADER_STAGE::ESS_FRAGMENT,
-					.count = 128u,
+					.count = ImagesBindingArraySize,
 				},
 			};
 			descriptorSetLayout0 = m_device->createDescriptorSetLayout(bindingsSet0);
@@ -813,6 +814,8 @@ class ComputerAidedDesign final : public examples::SimpleWindowedApplication, pu
 			pipelineLayout = m_device->createPipelineLayout({ &range,1 }, core::smart_refctd_ptr(descriptorSetLayout0), core::smart_refctd_ptr(descriptorSetLayout1), nullptr, nullptr);
 		}
 
+		drawResourcesFiller.setTexturesDescriptorSetAndBinding(core::smart_refctd_ptr(descriptorSet0), imagesBinding);
+
 		smart_refctd_ptr<IGPUShader> mainPipelineFragmentShaders = {};
 		smart_refctd_ptr<IGPUShader> mainPipelineVertexShader = {};
 		std::array<smart_refctd_ptr<IGPUShader>, 2u> geoTexturePipelineShaders = {};
@@ -1035,6 +1038,55 @@ class ComputerAidedDesign final : public examples::SimpleWindowedApplication, pu
 		m_intendedNextSubmit.scratchCommandBuffers = m_commandBufferInfos;
 		m_currentRecordingCommandBufferInfo = &m_commandBufferInfos[0];
 
+		// Load image
+		system::path m_loadCWD = "..";
+		std::string imagePath = "../../media/color_space_test/R8G8B8A8_1.png";
+
+		constexpr auto cachingFlags = static_cast<IAssetLoader::E_CACHING_FLAGS>(IAssetLoader::ECF_DONT_CACHE_REFERENCES & IAssetLoader::ECF_DONT_CACHE_TOP_LEVEL);
+		const IAssetLoader::SAssetLoadParams loadParams(0ull, nullptr, cachingFlags, IAssetLoader::ELPF_NONE, m_logger.get(), m_loadCWD);
+		auto bundle = m_assetMgr->getAsset(imagePath, loadParams);
+		auto contents = bundle.getContents();
+		if (contents.empty())
+		{
+			m_logger->log("Failed to load image with path %s, skipping!", ILogger::ELL_ERROR, (m_loadCWD / imagePath).c_str());
+		}
+
+		smart_refctd_ptr<ICPUImageView> cpuImgView;
+		const auto& asset = contents[0];
+		switch (asset->getAssetType())
+		{
+		case IAsset::ET_IMAGE:
+		{
+			auto image = smart_refctd_ptr_static_cast<ICPUImage>(asset);
+			const auto format = image->getCreationParameters().format;
+
+			ICPUImageView::SCreationParams viewParams = {
+				.flags = ICPUImageView::E_CREATE_FLAGS::ECF_NONE,
+				.image = std::move(image),
+				.viewType = IImageView<ICPUImage>::E_TYPE::ET_2D,
+				.format = format,
+				.subresourceRange = {
+					.aspectMask = IImage::E_ASPECT_FLAGS::EAF_COLOR_BIT,
+					.baseMipLevel = 0u,
+					.levelCount = ICPUImageView::remaining_mip_levels,
+					.baseArrayLayer = 0u,
+					.layerCount = ICPUImageView::remaining_array_layers
+				}
+			};
+
+			cpuImgView = ICPUImageView::create(std::move(viewParams));
+		} break;
+
+		case IAsset::ET_IMAGE_VIEW:
+			cpuImgView = smart_refctd_ptr_static_cast<ICPUImageView>(asset);
+			break;
+		default:
+			m_logger->log("Failed to load ICPUImage or ICPUImageView got some other Asset Type, skipping!", ILogger::ELL_ERROR);
+		}
+				
+		const auto cpuImage = cpuImgView->getCreationParameters().image;
+		sampleImages.push_back(cpuImage);
+
 		return true;
 	}
 
@@ -1218,6 +1270,7 @@ class ComputerAidedDesign final : public examples::SimpleWindowedApplication, pu
 		if (isCachingDraw)
 		{
 			replayCaches.push_back(drawResourcesFiller.createReplayCache());
+			intendedSubmitInfo.scratchSemaphore.value++; // fake advance needed for Texture and MSDF LRU caches and evictions to work
 			return; // we don't record, submit or do anything, just caching the draw resources
 		}
 
@@ -2833,165 +2886,13 @@ class ComputerAidedDesign final : public examples::SimpleWindowedApplication, pu
 		}
 		else if (mode == ExampleMode::CASE_7)
 		{
-			if (m_realFrameIx == 0u)
+			for (uint32_t i = 0; i < sampleImages.size(); ++i)
 			{
-				// we record upload of our objects and if we failed to allocate we submit everything
-				if (!intendedNextSubmit.valid())
-				{
-					// log("intendedNextSubmit is invalid.", nbl::system::ILogger::ELL_ERROR);
-					assert(false);
-					return;
-				}
-
-				// Use the current recording command buffer of the intendedSubmitInfos scratchCommandBuffers, it should be in recording state
-				auto* cmdbuf = m_currentRecordingCommandBufferInfo->cmdbuf;
-
-				assert(cmdbuf->getState() == video::IGPUCommandBuffer::STATE::RECORDING && cmdbuf->isResettable());
-				assert(cmdbuf->getRecordingFlags().hasFlags(video::IGPUCommandBuffer::USAGE::ONE_TIME_SUBMIT_BIT));
-
-				auto* cmdpool = cmdbuf->getPool();
-
-
-				// Load image
-				system::path m_loadCWD = "..";
-				std::string imagePath = "../../media/color_space_test/R8G8B8A8_1.png";
-				
-				constexpr auto cachingFlags = static_cast<IAssetLoader::E_CACHING_FLAGS>(IAssetLoader::ECF_DONT_CACHE_REFERENCES & IAssetLoader::ECF_DONT_CACHE_TOP_LEVEL);
-				const IAssetLoader::SAssetLoadParams loadParams(0ull, nullptr, cachingFlags, IAssetLoader::ELPF_NONE, m_logger.get(),m_loadCWD);
-				auto bundle = m_assetMgr->getAsset(imagePath,loadParams);
-				auto contents = bundle.getContents();
-				if (contents.empty())
-				{
-					m_logger->log("Failed to load image with path %s, skipping!",ILogger::ELL_ERROR,(m_loadCWD/imagePath).c_str());
-				}
-				
-				smart_refctd_ptr<ICPUImageView> cpuImgView;
-				const auto& asset = contents[0];
-				switch (asset->getAssetType())
-				{
-					case IAsset::ET_IMAGE:
-					{
-						auto image = smart_refctd_ptr_static_cast<ICPUImage>(asset);
-						const auto format = image->getCreationParameters().format;
-
-						ICPUImageView::SCreationParams viewParams = {
-							.flags = ICPUImageView::E_CREATE_FLAGS::ECF_NONE,
-							.image = std::move(image),
-							.viewType = IImageView<ICPUImage>::E_TYPE::ET_2D,
-							.format = format,
-							.subresourceRange = {
-								.aspectMask = IImage::E_ASPECT_FLAGS::EAF_COLOR_BIT,
-								.baseMipLevel = 0u,
-								.levelCount = ICPUImageView::remaining_mip_levels,
-								.baseArrayLayer = 0u,
-								.layerCount = ICPUImageView::remaining_array_layers
-							}
-						};
-
-						cpuImgView = ICPUImageView::create(std::move(viewParams));
-					} break;
-
-					case IAsset::ET_IMAGE_VIEW:
-						cpuImgView = smart_refctd_ptr_static_cast<ICPUImageView>(asset);
-						break;
-					default:
-						m_logger->log("Failed to load ICPUImage or ICPUImageView got some other Asset Type, skipping!",ILogger::ELL_ERROR);
-				}
-
-				// create matching size gpu image
-				smart_refctd_ptr<IGPUImage> gpuImg;
-				const auto& origParams = cpuImgView->getCreationParameters();
-				const auto origImage = origParams.image;
-				IGPUImage::SCreationParams imageParams = {};
-				imageParams = origImage->getCreationParameters();
-				imageParams.usage |= IGPUImage::EUF_TRANSFER_DST_BIT|IGPUImage::EUF_SAMPLED_BIT;
-				// promote format because RGB8 and friends don't actually exist in HW
-				{
-					const IPhysicalDevice::SImageFormatPromotionRequest request = {
-						.originalFormat = imageParams.format,
-						.usages = IPhysicalDevice::SFormatImageUsages::SUsage(imageParams.usage)
-					};
-					imageParams.format = m_physicalDevice->promoteImageFormat(request,imageParams.tiling);
-				}
-				gpuImg = m_device->createImage(std::move(imageParams));
-				if (!gpuImg || !m_device->allocate(gpuImg->getMemoryReqs(),gpuImg.get()).isValid())
-					m_logger->log("Failed to create or allocate gpu image!",ILogger::ELL_ERROR);
-				gpuImg->setObjectDebugName(imagePath.c_str());
-				
-				IGPUImageView::SCreationParams viewParams = {
-					.image = gpuImg,
-					.viewType = IGPUImageView::ET_2D,
-					.format = gpuImg->getCreationParameters().format
-				};
-				auto gpuImgView = m_device->createImageView(std::move(viewParams));
-
-				// Bind gpu image view to descriptor set
-				video::IGPUDescriptorSet::SDescriptorInfo dsInfo;
-				dsInfo.info.image.imageLayout = IImage::LAYOUT::READ_ONLY_OPTIMAL;
-				dsInfo.desc = gpuImgView;
-
-				IGPUDescriptorSet::SWriteDescriptorSet dsWrites[1u] =
-				{
-					{
-						.dstSet = descriptorSet0.get(),
-						.binding = 3u,
-						.arrayElement = 0u,
-						.count = 1u,
-						.info = &dsInfo,
-					}
-				};
-				m_device->updateDescriptorSets(1u, dsWrites, 0u, nullptr);
-
-				// Upload Loaded CPUImageData to GPU
-				IGPUCommandBuffer::SPipelineBarrierDependencyInfo::image_barrier_t beforeCopyImageBarriers[] =
-				{
-					{
-						.barrier = {
-							.dep = {
-								.srcStageMask = PIPELINE_STAGE_FLAGS::NONE, // previous top of pipe -> top_of_pipe in first scope = none
-								.srcAccessMask = ACCESS_FLAGS::NONE,
-								.dstStageMask = PIPELINE_STAGE_FLAGS::COPY_BIT,
-								.dstAccessMask = ACCESS_FLAGS::TRANSFER_WRITE_BIT,
-							}
-							// .ownershipOp. No queueFam ownership transfer
-						},
-						.image = gpuImg.get(),
-						.subresourceRange = origParams.subresourceRange,
-						.oldLayout = IImage::LAYOUT::UNDEFINED,
-						.newLayout = IImage::LAYOUT::TRANSFER_DST_OPTIMAL,
-					}
-				};
-
-				cmdbuf->pipelineBarrier(E_DEPENDENCY_FLAGS::EDF_NONE,  { .imgBarriers = beforeCopyImageBarriers  });
-				m_utils->updateImageViaStagingBuffer(
-					intendedNextSubmit, 
-					origImage->getBuffer()->getPointer(), origImage->getCreationParameters().format,
-					gpuImg.get(), IImage::LAYOUT::TRANSFER_DST_OPTIMAL, 
-					origImage->getRegions());
-
-				IGPUCommandBuffer::SPipelineBarrierDependencyInfo::image_barrier_t afterCopyImageBarriers[] =
-				{
-					{
-						.barrier = {
-							.dep = {
-								.srcStageMask = PIPELINE_STAGE_FLAGS::COPY_BIT, // previous top of pipe -> top_of_pipe in first scope = none
-								.srcAccessMask = ACCESS_FLAGS::TRANSFER_WRITE_BIT,
-								.dstStageMask = PIPELINE_STAGE_FLAGS::FRAGMENT_SHADER_BIT,
-								.dstAccessMask = ACCESS_FLAGS::SHADER_READ_BITS,
-							}
-							// .ownershipOp. No queueFam ownership transfer
-						},
-						.image = gpuImg.get(),
-						.subresourceRange = origParams.subresourceRange,
-						.oldLayout = IImage::LAYOUT::TRANSFER_DST_OPTIMAL,
-						.newLayout = IImage::LAYOUT::READ_ONLY_OPTIMAL,
-					}
-				};
-				cmdbuf->pipelineBarrier(E_DEPENDENCY_FLAGS::EDF_NONE,  { .imgBarriers = afterCopyImageBarriers  });
+				uint64_t imageID = i * 69ull; // it can be hash or something of the file path the image was loaded from
+				drawResourcesFiller.addStaticImage2D(imageID, sampleImages[i], intendedNextSubmit);
+				drawResourcesFiller.addImageObject(imageID, { 0.0, 0.0 }, { 100.0, 100.0 }, 0.0, intendedNextSubmit);
+				drawResourcesFiller.addImageObject(imageID, { 40.0, +40.0 }, { 100.0, 100.0 }, 0.0, intendedNextSubmit);
 			}
-			drawResourcesFiller._test_addImageObject({ 0.0, 0.0 }, { 100.0, 100.0 }, 0.0, intendedNextSubmit);
-			drawResourcesFiller._test_addImageObject({ 40.0, +40.0 }, { 100.0, 100.0 }, 0.0, intendedNextSubmit);
-			
 			LineStyleInfo lineStyle = 
 			{
 				.color = float32_t4(1.0f, 0.1f, 0.1f, 0.9f),
@@ -3517,6 +3418,8 @@ class ComputerAidedDesign final : public examples::SimpleWindowedApplication, pu
 	
 	std::vector<std::unique_ptr<msdfgen::Shape>> m_shapeMSDFImages = {};
 
+	std::vector<smart_refctd_ptr<ICPUImage>> sampleImages;
+
 	static constexpr char FirstGeneratedCharacter = ' ';
 	static constexpr char LastGeneratedCharacter = '~';
 
diff --git a/62_CAD/shaders/globals.hlsl b/62_CAD/shaders/globals.hlsl
index b565ff4ff..3cf575b27 100644
--- a/62_CAD/shaders/globals.hlsl
+++ b/62_CAD/shaders/globals.hlsl
@@ -497,6 +497,7 @@ inline bool operator==(const DTMSettings& lhs, const DTMSettings& rhs)
 }
 #endif
 
+NBL_CONSTEXPR uint32_t ImagesBindingArraySize = 128;
 NBL_CONSTEXPR uint32_t MainObjectIdxBits = 24u; // It will be packed next to alpha in a texture
 NBL_CONSTEXPR uint32_t AlphaBits = 32u - MainObjectIdxBits;
 NBL_CONSTEXPR uint32_t MaxIndexableMainObjects = (1u << MainObjectIdxBits) - 1u;
@@ -505,7 +506,7 @@ NBL_CONSTEXPR uint32_t InvalidDTMSettingsIdx = nbl::hlsl::numeric_limits<uint32_
 NBL_CONSTEXPR uint32_t InvalidMainObjectIdx = MaxIndexableMainObjects;
 NBL_CONSTEXPR uint32_t InvalidCustomProjectionIndex = nbl::hlsl::numeric_limits<uint32_t>::max;
 NBL_CONSTEXPR uint32_t InvalidCustomClipRectIndex = nbl::hlsl::numeric_limits<uint32_t>::max;
-NBL_CONSTEXPR uint32_t InvalidTextureIdx = nbl::hlsl::numeric_limits<uint32_t>::max;
+NBL_CONSTEXPR uint32_t InvalidTextureIndex = nbl::hlsl::numeric_limits<uint32_t>::max;
 
 // Hatches
 NBL_CONSTEXPR MajorAxis SelectedMajorAxis = MajorAxis::MAJOR_Y;
diff --git a/62_CAD/shaders/main_pipeline/common.hlsl b/62_CAD/shaders/main_pipeline/common.hlsl
index 631e421b9..8b8cf5bad 100644
--- a/62_CAD/shaders/main_pipeline/common.hlsl
+++ b/62_CAD/shaders/main_pipeline/common.hlsl
@@ -242,7 +242,7 @@ struct PSInput
 [[vk::combinedImageSampler]][[vk::binding(1, 0)]] SamplerState msdfSampler : register(s4);
 
 [[vk::binding(2, 0)]] SamplerState textureSampler : register(s5);
-[[vk::binding(3, 0)]] Texture2D textures[128] : register(t5);
+[[vk::binding(3, 0)]] Texture2D textures[ImagesBindingArraySize] : register(t5);
 
 // Set 1 - Window dependant data which has higher update frequency due to multiple windows and resize need image recreation and descriptor writes
 [[vk::binding(0, 1)]] globallycoherent RWTexture2D<uint> pseudoStencil : register(u0);
diff --git a/62_CAD/shaders/main_pipeline/fragment_shader.hlsl b/62_CAD/shaders/main_pipeline/fragment_shader.hlsl
index 6475faeff..d5949afcf 100644
--- a/62_CAD/shaders/main_pipeline/fragment_shader.hlsl
+++ b/62_CAD/shaders/main_pipeline/fragment_shader.hlsl
@@ -335,7 +335,7 @@ float4 fragMain(PSInput input) : SV_TARGET
 
             LineStyle style = loadLineStyle(mainObj.styleIdx);
             uint32_t textureId = asuint(style.screenSpaceLineWidth);
-            if (textureId != InvalidTextureIdx)
+            if (textureId != InvalidTextureIndex)
             {
                 // For Hatch fiils we sample the first mip as we don't fill the others, because they are constant in screenspace and render as expected
                 // If later on we decided that we can have different sizes here, we should do computations similar to FONT_GLYPH
@@ -349,7 +349,7 @@ float4 fragMain(PSInput input) : SV_TARGET
             const float2 uv = input.getFontGlyphUV();
             const uint32_t textureId = input.getFontGlyphTextureId();
 
-            if (textureId != InvalidTextureIdx)
+            if (textureId != InvalidTextureIndex)
             {
                 float mipLevel = msdfTextures.CalculateLevelOfDetail(msdfSampler, uv);
                 float3 msdfSample = msdfTextures.SampleLevel(msdfSampler, float3(uv, float(textureId)), mipLevel);
@@ -380,7 +380,7 @@ float4 fragMain(PSInput input) : SV_TARGET
             const float2 uv = input.getImageUV();
             const uint32_t textureId = input.getImageTextureId();
 
-            if (textureId != InvalidTextureIdx)
+            if (textureId != InvalidTextureIndex)
             {
                 float4 colorSample = textures[NonUniformResourceIndex(textureId)].Sample(textureSampler, float2(uv.x, uv.y));
                 textureColor = colorSample.rgb;

From 55a9e135ed1cf7e66a4d3b5bc0a161408949008f Mon Sep 17 00:00:00 2001
From: keptsecret <sorchon@gmail.com>
Date: Wed, 14 May 2025 13:36:08 +0700
Subject: [PATCH 242/529] block to test tlas/blas asset converter

---
 67_RayQueryGeometry/main.cpp | 237 +++++++++++++++++++++++++++++++++++
 1 file changed, 237 insertions(+)

diff --git a/67_RayQueryGeometry/main.cpp b/67_RayQueryGeometry/main.cpp
index aff687742..a6f6dfcc0 100644
--- a/67_RayQueryGeometry/main.cpp
+++ b/67_RayQueryGeometry/main.cpp
@@ -4,6 +4,8 @@
 
 #include "common.hpp"
 
+#define TEST_ASSET_CONV_AS
+
 class RayQueryGeometryApp final : public examples::SimpleWindowedApplication, public application_templates::MonoAssetManagerAndBuiltinResourceApplication
 {
 		using device_base_t = examples::SimpleWindowedApplication;
@@ -126,6 +128,10 @@ class RayQueryGeometryApp final : public examples::SimpleWindowedApplication, pu
 
 			auto cQueue = getComputeQueue();
 
+#ifdef TEST_ASSET_CONV_AS
+			if (!createAccelerationStructuresFromGeometry(cQueue, geometryCreator))
+				return logFail("Could not create acceleration structures from provided geometry creator");
+#else
 			// create geometry objects
 			if (!createGeometries(gQueue, geometryCreator))
 				return logFail("Could not create geometries from geometry creator");
@@ -147,6 +153,7 @@ class RayQueryGeometryApp final : public examples::SimpleWindowedApplication, pu
 			if (!createAccelerationStructures(cQueue))
 #endif
 				return logFail("Could not create acceleration structures");
+#endif	// TEST_ASSET_CONV_AS
 
 			// create pipelines
 			{
@@ -590,6 +597,235 @@ class RayQueryGeometryApp final : public examples::SimpleWindowedApplication, pu
 			}
 		}
 
+#ifdef TEST_ASSET_CONV_AS
+		bool createAccelerationStructuresFromGeometry(video::CThreadSafeQueueAdapter* queue, const IGeometryCreator* gc)
+		{
+			// get geometries in ICPUBuffers
+			std::array<ReferenceObjectCpu, OT_COUNT> objectsCpu;
+			objectsCpu[OT_CUBE] = ReferenceObjectCpu{ .meta = {.type = OT_CUBE, .name = "Cube Mesh" }, .shadersType = GP_BASIC, .data = gc->createCubeMesh(nbl::core::vector3df(1.f, 1.f, 1.f)) };
+			objectsCpu[OT_SPHERE] = ReferenceObjectCpu{ .meta = {.type = OT_SPHERE, .name = "Sphere Mesh" }, .shadersType = GP_BASIC, .data = gc->createSphereMesh(2, 16, 16) };
+			objectsCpu[OT_CYLINDER] = ReferenceObjectCpu{ .meta = {.type = OT_CYLINDER, .name = "Cylinder Mesh" }, .shadersType = GP_BASIC, .data = gc->createCylinderMesh(2, 2, 20) };
+			objectsCpu[OT_RECTANGLE] = ReferenceObjectCpu{ .meta = {.type = OT_RECTANGLE, .name = "Rectangle Mesh" }, .shadersType = GP_BASIC, .data = gc->createRectangleMesh(nbl::core::vector2df_SIMD(1.5, 3)) };
+			objectsCpu[OT_DISK] = ReferenceObjectCpu{ .meta = {.type = OT_DISK, .name = "Disk Mesh" }, .shadersType = GP_BASIC, .data = gc->createDiskMesh(2, 30) };
+			objectsCpu[OT_ARROW] = ReferenceObjectCpu{ .meta = {.type = OT_ARROW, .name = "Arrow Mesh" }, .shadersType = GP_BASIC, .data = gc->createArrowMesh() };
+			objectsCpu[OT_CONE] = ReferenceObjectCpu{ .meta = {.type = OT_CONE, .name = "Cone Mesh" }, .shadersType = GP_CONE, .data = gc->createConeMesh(2, 3, 10) };
+			objectsCpu[OT_ICOSPHERE] = ReferenceObjectCpu{ .meta = {.type = OT_ICOSPHERE, .name = "Icosphere Mesh" }, .shadersType = GP_ICO, .data = gc->createIcoSphere(1, 3, true) };
+
+			auto geomInfoBuffer = ICPUBuffer::create({ OT_COUNT * sizeof(SGeomInfo) });
+
+			SGeomInfo* geomInfos = reinterpret_cast<SGeomInfo*>(geomInfoBuffer->getPointer());
+			const uint32_t byteOffsets[OT_COUNT] = { 18, 24, 24, 20, 20, 24, 16, 12 };	// based on normals data position
+			const uint32_t smoothNormals[OT_COUNT] = { 0, 1, 1, 0, 0, 1, 1, 1 };
+
+			struct ScratchVIBindings
+			{
+				nbl::asset::SBufferBinding<ICPUBuffer> vertex, index;
+			};
+			std::array<ScratchVIBindings, OT_COUNT> scratchBuffers;
+
+			for (uint32_t i = 0; i < scratchBuffers.size(); i++)
+			{
+				const auto& geom = objectsCpu[i];
+				auto& scratchObj = scratchBuffers[i];
+				const bool useIndex = geom.data.indexType != EIT_UNKNOWN;
+
+				auto vBuffer = smart_refctd_ptr(geom.data.bindings[0].buffer); // no offset
+				auto vUsage = bitflag(IGPUBuffer::EUF_STORAGE_BUFFER_BIT) | IGPUBuffer::EUF_TRANSFER_DST_BIT | IGPUBuffer::EUF_INLINE_UPDATE_VIA_CMDBUF |
+					IGPUBuffer::EUF_ACCELERATION_STRUCTURE_BUILD_INPUT_READ_ONLY_BIT | IGPUBuffer::EUF_SHADER_DEVICE_ADDRESS_BIT;
+
+				auto iBuffer = smart_refctd_ptr(geom.data.indexBuffer.buffer); // no offset
+				auto iUsage = bitflag(IGPUBuffer::EUF_STORAGE_BUFFER_BIT) | IGPUBuffer::EUF_TRANSFER_DST_BIT | IGPUBuffer::EUF_INLINE_UPDATE_VIA_CMDBUF |
+					IGPUBuffer::EUF_ACCELERATION_STRUCTURE_BUILD_INPUT_READ_ONLY_BIT | IGPUBuffer::EUF_SHADER_DEVICE_ADDRESS_BIT;
+
+				vBuffer->addUsageFlags(vUsage);
+				vBuffer->setContentHash(vBuffer->computeContentHash());
+				scratchObj.vertex = { .offset = 0, .buffer = vBuffer };
+
+				if (useIndex)
+					if (iBuffer)
+					{
+						iBuffer->addUsageFlags(iUsage);
+						iBuffer->setContentHash(iBuffer->computeContentHash());
+					}
+				scratchObj.index = { .offset = 0, .buffer = iBuffer };
+			}
+
+			// get ICPUBuffers into ICPUBottomLevelAccelerationStructures
+			std::array<smart_refctd_ptr<ICPUBottomLevelAccelerationStructure>, OT_COUNT> cpuBlas;
+			for (uint32_t i = 0; i < cpuBlas.size(); i++)
+			{
+				auto triangles = make_refctd_dynamic_array<smart_refctd_dynamic_array<ICPUBottomLevelAccelerationStructure::Triangles<ICPUBuffer>>>(1u);
+				auto primitiveCounts = make_refctd_dynamic_array<smart_refctd_dynamic_array<uint32_t>>(1u);
+
+				auto& tri = triangles->front();
+				auto& primCount = primitiveCounts->front();
+				const auto& geom = objectsCpu[i];
+				const auto& scratchObj = scratchBuffers[i];
+
+				const bool useIndex = geom.data.indexType != EIT_UNKNOWN;
+				const uint32_t vertexStride = geom.data.inputParams.bindings[0].stride;
+				const uint32_t numVertices = scratchObj.vertex.buffer->getSize() / vertexStride;
+
+				if (useIndex)
+					primCount = geom.data.indexCount / 3;
+				else
+					primCount = numVertices / 3;
+
+				geomInfos[i].indexType = geom.data.indexType;
+				geomInfos[i].vertexStride = vertexStride;
+				geomInfos[i].smoothNormals = smoothNormals[i];
+
+				tri.vertexData[0] = scratchObj.vertex;
+				tri.indexData = useIndex ? scratchObj.index : scratchObj.vertex;
+				tri.maxVertex = numVertices - 1;
+				tri.vertexStride = vertexStride;
+				tri.vertexFormat = EF_R32G32B32_SFLOAT;
+				tri.indexType = geom.data.indexType;
+				tri.geometryFlags = IGPUBottomLevelAccelerationStructure::GEOMETRY_FLAGS::OPAQUE_BIT;
+
+				auto& blas = cpuBlas[i];
+				blas->setGeometries(std::move(triangles), std::move(primitiveCounts));
+
+				auto blasFlags = bitflag(IGPUBottomLevelAccelerationStructure::BUILD_FLAGS::PREFER_FAST_TRACE_BIT) | IGPUBottomLevelAccelerationStructure::BUILD_FLAGS::ALLOW_COMPACTION_BIT;
+				if (m_physicalDevice->getProperties().limits.rayTracingPositionFetch)
+					blasFlags |= IGPUBottomLevelAccelerationStructure::BUILD_FLAGS::ALLOW_DATA_ACCESS;
+
+				blas->setBuildFlags(blasFlags);
+				blas->setContentHash(blas->computeContentHash());
+			}
+
+			// TODO: when does compact blas happen?
+
+			// get ICPUBottomLevelAccelerationStructure into ICPUTopLevelAccelerationStructure
+			auto geomInstances = make_refctd_dynamic_array<smart_refctd_dynamic_array<ICPUTopLevelAccelerationStructure::PolymorphicInstance>>(OT_COUNT);
+			{
+				uint32_t i = 0;
+				for (auto instance = geomInstances->begin(); instance != geomInstances->end(); instance++, i++)
+				{
+					ICPUTopLevelAccelerationStructure::StaticInstance inst;
+					inst.base.blas = cpuBlas[i];
+					inst.base.flags = static_cast<uint32_t>(IGPUTopLevelAccelerationStructure::INSTANCE_FLAGS::TRIANGLE_FACING_CULL_DISABLE_BIT);
+					inst.base.instanceCustomIndex = i;
+					inst.base.instanceShaderBindingTableRecordOffset = 0;
+					inst.base.mask = 0xFF;
+
+					core::matrix3x4SIMD transform;
+					transform.setTranslation(nbl::core::vectorSIMDf(5.f * i, 0, 0, 0));
+					inst.transform = transform;
+					
+					instance->instance = inst;
+				}
+			}
+
+			smart_refctd_ptr<ICPUTopLevelAccelerationStructure> cpuTlas;
+			cpuTlas->setInstances(std::move(geomInstances));
+			cpuTlas->setBuildFlags(IGPUTopLevelAccelerationStructure::BUILD_FLAGS::PREFER_FAST_TRACE_BIT);
+
+			// convert with asset converter
+			auto pool = m_device->createCommandPool(queue->getFamilyIndex(), IGPUCommandPool::CREATE_FLAGS::RESET_COMMAND_BUFFER_BIT);
+			if (!pool)
+				return logFail("Couldn't create Command Pool for geometry creation!");
+			auto cmdbuf = getSingleUseCommandBufferAndBegin(pool);
+			cmdbuf->beginDebugMarker("Build geometry vertex and index buffers");
+
+			smart_refctd_ptr<CAssetConverter> converter = CAssetConverter::create({ .device = m_device.get(), .optimizer = {} });
+			CAssetConverter::SInputs inputs = {};
+			inputs.logger = m_logger.get();
+
+			std::array<ICPUTopLevelAccelerationStructure*, 1u> tmpTlas;
+			std::array<ICPUBuffer*, OT_COUNT * 2u> tmpBuffers;
+			{
+				tmpTlas[0] = cpuTlas.get();
+				for (uint32_t i = 0; i < objectsCpu.size(); i++)
+				{
+					tmpBuffers[2 * i + 0] = scratchBuffers[i].vertex.buffer.get();
+					tmpBuffers[2 * i + 1] = scratchBuffers[i].index.buffer.get();
+				}
+
+				std::get<CAssetConverter::SInputs::asset_span_t<ICPUTopLevelAccelerationStructure>>(inputs.assets) = tmpTlas;
+				std::get<CAssetConverter::SInputs::asset_span_t<ICPUBuffer>>(inputs.assets) = tmpBuffers;
+			}
+
+			auto reservation = converter->reserve(inputs);
+			{
+				auto prepass = [&]<typename asset_type_t>(const auto & references) -> bool
+				{
+					auto objects = reservation.getGPUObjects<asset_type_t>();
+					uint32_t counter = {};
+					for (auto& object : objects)
+					{
+						auto gpu = object.value;
+						auto* reference = references[counter];
+
+						if (reference)
+						{
+							if (!gpu)
+							{
+								m_logger->log("Failed to convert a CPU object to GPU!", ILogger::ELL_ERROR);
+								return false;
+							}
+						}
+						counter++;
+					}
+					return true;
+				};
+
+				prepass.template operator() < ICPUTopLevelAccelerationStructure > (tmpTlas);
+				prepass.template operator() < ICPUBuffer > (tmpBuffers);
+			}
+
+			auto semaphore = m_device->createSemaphore(0u);
+
+			std::array<IQueue::SSubmitInfo::SCommandBufferInfo, 1> cmdbufs = {};
+			cmdbufs.front().cmdbuf = cmdbuf.get();
+
+			SIntendedSubmitInfo transfer = {};
+			transfer.queue = queue;
+			transfer.scratchCommandBuffers = cmdbufs;
+			transfer.scratchSemaphore = {
+				.semaphore = semaphore.get(),
+				.value = 0u,
+				.stageMask = PIPELINE_STAGE_FLAGS::ALL_TRANSFER_BITS	// TODO mask for AS?
+			};
+			// convert
+			{
+				CAssetConverter::SConvertParams params = {};
+				params.utilities = m_utils.get();
+				params.transfer = &transfer;
+
+				auto future = reservation.convert(params);
+				if (future.copy() != IQueue::RESULT::SUCCESS)
+				{
+					m_logger->log("Failed to await submission feature!", ILogger::ELL_ERROR);
+					return false;
+				}
+
+				// assign gpu objects to output
+				auto&& tlases = reservation.getGPUObjects<ICPUTopLevelAccelerationStructure>();
+				gpuTlas = tlases[0].value;
+				auto&& buffers = reservation.getGPUObjects<ICPUBuffer>();
+				for (uint32_t i = 0; i < objectsCpu.size(); i++)
+				{
+					auto vBuffer = buffers[2 * i + 0].value;
+					auto iBuffer = buffers[2 * i + 1].value;
+					const auto& geom = objectsCpu[i];
+					const bool useIndex = geom.data.indexType != EIT_UNKNOWN;
+
+					geomInfos[i].vertexBufferAddress = vBuffer->getDeviceAddress() + byteOffsets[i];
+					geomInfos[i].indexBufferAddress = useIndex ? iBuffer->getDeviceAddress() : geomInfos[i].vertexBufferAddress;
+				}
+			}
+
+			{
+				IGPUBuffer::SCreationParams params;
+				params.usage = IGPUBuffer::EUF_STORAGE_BUFFER_BIT | IGPUBuffer::EUF_TRANSFER_DST_BIT | IGPUBuffer::EUF_INLINE_UPDATE_VIA_CMDBUF | IGPUBuffer::EUF_SHADER_DEVICE_ADDRESS_BIT;
+				params.size = OT_COUNT * sizeof(SGeomInfo);
+				m_utils->createFilledDeviceLocalBufferOnDedMem(SIntendedSubmitInfo{ .queue = queue }, std::move(params), geomInfos).move_into(geometryInfoBuffer);
+			}
+
+			return true;
+		}
+#else
 		bool createGeometries(video::CThreadSafeQueueAdapter* queue, const IGeometryCreator* gc)
 		{
 			auto pool = m_device->createCommandPool(queue->getFamilyIndex(), IGPUCommandPool::CREATE_FLAGS::RESET_COMMAND_BUFFER_BIT);
@@ -1057,6 +1293,7 @@ class RayQueryGeometryApp final : public examples::SimpleWindowedApplication, pu
 
 			return true;
 		}
+#endif // TEST_ASSET_CONV_AS
 
 
 		smart_refctd_ptr<IWindow> m_window;

From 4a951b307b09ecf4a054f7ac27d4dac01f5e8fb9 Mon Sep 17 00:00:00 2001
From: keptsecret <sorchon@gmail.com>
Date: Wed, 14 May 2025 15:27:54 +0700
Subject: [PATCH 243/529] more test case coverage

---
 23_Arithmetic2UnitTest/main.cpp | 6 ++----
 1 file changed, 2 insertions(+), 4 deletions(-)

diff --git a/23_Arithmetic2UnitTest/main.cpp b/23_Arithmetic2UnitTest/main.cpp
index 49cba28d1..a3c274160 100644
--- a/23_Arithmetic2UnitTest/main.cpp
+++ b/23_Arithmetic2UnitTest/main.cpp
@@ -174,9 +174,8 @@ class Workgroup2ScanTestApp final : public application_templates::BasicMultiQueu
 		for (auto subgroupSize=MinSubgroupSize; subgroupSize <= MaxSubgroupSize; subgroupSize *= 2u)
 		{
 			const uint8_t subgroupSizeLog2 = hlsl::findMSB(subgroupSize);
-			for (uint32_t i = 0; i < WorkgroupSizes.size(); i++)
+			for (uint32_t workgroupSize = subgroupSize; workgroupSize < MaxWorkgroupSize; workgroupSize *= 2)
 			{
-				const uint32_t workgroupSize = WorkgroupSizes[i];
 				// make sure renderdoc captures everything for debugging
 				m_api->startCapture();
 				m_logger->log("Testing Workgroup Size %u with Subgroup Size %u", ILogger::ELL_INFO, workgroupSize, subgroupSize);
@@ -478,8 +477,7 @@ class Workgroup2ScanTestApp final : public application_templates::BasicMultiQueu
 
 	uint32_t totalFailCount = 0;
 
-	constexpr static inline std::array<uint32_t, 4> WorkgroupSizes = { 32, 256, 512, 1024 };
-	constexpr static inline std::array<uint32_t, 3> ItemsPerInvocations = { 1, 2, 4 };
+	constexpr static inline std::array<uint32_t, 4> ItemsPerInvocations = { 1, 2, 3, 4 };
 };
 
 NBL_MAIN_FUNC(Workgroup2ScanTestApp)
\ No newline at end of file

From 16b7349f55344cafc8ec9ab28ce72e129fe938bd Mon Sep 17 00:00:00 2001
From: keptsecret <sorchon@gmail.com>
Date: Wed, 14 May 2025 16:41:59 +0700
Subject: [PATCH 244/529] some fixes + log debug

---
 67_RayQueryGeometry/main.cpp | 52 +++++++++++++++++++++++-------------
 1 file changed, 33 insertions(+), 19 deletions(-)

diff --git a/67_RayQueryGeometry/main.cpp b/67_RayQueryGeometry/main.cpp
index a6f6dfcc0..cec4e5270 100644
--- a/67_RayQueryGeometry/main.cpp
+++ b/67_RayQueryGeometry/main.cpp
@@ -617,29 +617,27 @@ class RayQueryGeometryApp final : public examples::SimpleWindowedApplication, pu
 			const uint32_t byteOffsets[OT_COUNT] = { 18, 24, 24, 20, 20, 24, 16, 12 };	// based on normals data position
 			const uint32_t smoothNormals[OT_COUNT] = { 0, 1, 1, 0, 0, 1, 1, 1 };
 
-			struct ScratchVIBindings
+			struct CPUBufferBindings
 			{
 				nbl::asset::SBufferBinding<ICPUBuffer> vertex, index;
 			};
-			std::array<ScratchVIBindings, OT_COUNT> scratchBuffers;
+			std::array<CPUBufferBindings, OT_COUNT> cpuBuffers;
 
-			for (uint32_t i = 0; i < scratchBuffers.size(); i++)
+			for (uint32_t i = 0; i < cpuBuffers.size(); i++)
 			{
 				const auto& geom = objectsCpu[i];
-				auto& scratchObj = scratchBuffers[i];
+				auto& cpuObj = cpuBuffers[i];
 				const bool useIndex = geom.data.indexType != EIT_UNKNOWN;
 
 				auto vBuffer = smart_refctd_ptr(geom.data.bindings[0].buffer); // no offset
-				auto vUsage = bitflag(IGPUBuffer::EUF_STORAGE_BUFFER_BIT) | IGPUBuffer::EUF_TRANSFER_DST_BIT | IGPUBuffer::EUF_INLINE_UPDATE_VIA_CMDBUF |
-					IGPUBuffer::EUF_ACCELERATION_STRUCTURE_BUILD_INPUT_READ_ONLY_BIT | IGPUBuffer::EUF_SHADER_DEVICE_ADDRESS_BIT;
+				auto vUsage = bitflag(IGPUBuffer::EUF_STORAGE_BUFFER_BIT) | IGPUBuffer::EUF_SHADER_DEVICE_ADDRESS_BIT;
 
 				auto iBuffer = smart_refctd_ptr(geom.data.indexBuffer.buffer); // no offset
-				auto iUsage = bitflag(IGPUBuffer::EUF_STORAGE_BUFFER_BIT) | IGPUBuffer::EUF_TRANSFER_DST_BIT | IGPUBuffer::EUF_INLINE_UPDATE_VIA_CMDBUF |
-					IGPUBuffer::EUF_ACCELERATION_STRUCTURE_BUILD_INPUT_READ_ONLY_BIT | IGPUBuffer::EUF_SHADER_DEVICE_ADDRESS_BIT;
+				auto iUsage = bitflag(IGPUBuffer::EUF_STORAGE_BUFFER_BIT) | IGPUBuffer::EUF_SHADER_DEVICE_ADDRESS_BIT;
 
 				vBuffer->addUsageFlags(vUsage);
 				vBuffer->setContentHash(vBuffer->computeContentHash());
-				scratchObj.vertex = { .offset = 0, .buffer = vBuffer };
+				cpuObj.vertex = { .offset = 0, .buffer = vBuffer };
 
 				if (useIndex)
 					if (iBuffer)
@@ -647,7 +645,7 @@ class RayQueryGeometryApp final : public examples::SimpleWindowedApplication, pu
 						iBuffer->addUsageFlags(iUsage);
 						iBuffer->setContentHash(iBuffer->computeContentHash());
 					}
-				scratchObj.index = { .offset = 0, .buffer = iBuffer };
+				cpuObj.index = { .offset = 0, .buffer = iBuffer };
 			}
 
 			// get ICPUBuffers into ICPUBottomLevelAccelerationStructures
@@ -660,11 +658,11 @@ class RayQueryGeometryApp final : public examples::SimpleWindowedApplication, pu
 				auto& tri = triangles->front();
 				auto& primCount = primitiveCounts->front();
 				const auto& geom = objectsCpu[i];
-				const auto& scratchObj = scratchBuffers[i];
+				const auto& cpuBuf = cpuBuffers[i];
 
 				const bool useIndex = geom.data.indexType != EIT_UNKNOWN;
 				const uint32_t vertexStride = geom.data.inputParams.bindings[0].stride;
-				const uint32_t numVertices = scratchObj.vertex.buffer->getSize() / vertexStride;
+				const uint32_t numVertices = cpuBuf.vertex.buffer->getSize() / vertexStride;
 
 				if (useIndex)
 					primCount = geom.data.indexCount / 3;
@@ -675,8 +673,8 @@ class RayQueryGeometryApp final : public examples::SimpleWindowedApplication, pu
 				geomInfos[i].vertexStride = vertexStride;
 				geomInfos[i].smoothNormals = smoothNormals[i];
 
-				tri.vertexData[0] = scratchObj.vertex;
-				tri.indexData = useIndex ? scratchObj.index : scratchObj.vertex;
+				tri.vertexData[0] = cpuBuf.vertex;
+				tri.indexData = useIndex ? cpuBuf.index : cpuBuf.vertex;
 				tri.maxVertex = numVertices - 1;
 				tri.vertexStride = vertexStride;
 				tri.vertexFormat = EF_R32G32B32_SFLOAT;
@@ -684,6 +682,7 @@ class RayQueryGeometryApp final : public examples::SimpleWindowedApplication, pu
 				tri.geometryFlags = IGPUBottomLevelAccelerationStructure::GEOMETRY_FLAGS::OPAQUE_BIT;
 
 				auto& blas = cpuBlas[i];
+				blas = make_smart_refctd_ptr<ICPUBottomLevelAccelerationStructure>();
 				blas->setGeometries(std::move(triangles), std::move(primitiveCounts));
 
 				auto blasFlags = bitflag(IGPUBottomLevelAccelerationStructure::BUILD_FLAGS::PREFER_FAST_TRACE_BIT) | IGPUBottomLevelAccelerationStructure::BUILD_FLAGS::ALLOW_COMPACTION_BIT;
@@ -717,7 +716,7 @@ class RayQueryGeometryApp final : public examples::SimpleWindowedApplication, pu
 				}
 			}
 
-			smart_refctd_ptr<ICPUTopLevelAccelerationStructure> cpuTlas;
+			auto cpuTlas = make_smart_refctd_ptr<ICPUTopLevelAccelerationStructure>();
 			cpuTlas->setInstances(std::move(geomInstances));
 			cpuTlas->setBuildFlags(IGPUTopLevelAccelerationStructure::BUILD_FLAGS::PREFER_FAST_TRACE_BIT);
 
@@ -726,7 +725,6 @@ class RayQueryGeometryApp final : public examples::SimpleWindowedApplication, pu
 			if (!pool)
 				return logFail("Couldn't create Command Pool for geometry creation!");
 			auto cmdbuf = getSingleUseCommandBufferAndBegin(pool);
-			cmdbuf->beginDebugMarker("Build geometry vertex and index buffers");
 
 			smart_refctd_ptr<CAssetConverter> converter = CAssetConverter::create({ .device = m_device.get(), .optimizer = {} });
 			CAssetConverter::SInputs inputs = {};
@@ -738,8 +736,8 @@ class RayQueryGeometryApp final : public examples::SimpleWindowedApplication, pu
 				tmpTlas[0] = cpuTlas.get();
 				for (uint32_t i = 0; i < objectsCpu.size(); i++)
 				{
-					tmpBuffers[2 * i + 0] = scratchBuffers[i].vertex.buffer.get();
-					tmpBuffers[2 * i + 1] = scratchBuffers[i].index.buffer.get();
+					tmpBuffers[2 * i + 0] = cpuBuffers[i].vertex.buffer.get();
+					tmpBuffers[2 * i + 1] = cpuBuffers[i].index.buffer.get();
 				}
 
 				std::get<CAssetConverter::SInputs::asset_span_t<ICPUTopLevelAccelerationStructure>>(inputs.assets) = tmpTlas;
@@ -774,6 +772,13 @@ class RayQueryGeometryApp final : public examples::SimpleWindowedApplication, pu
 				prepass.template operator() < ICPUBuffer > (tmpBuffers);
 			}
 
+			// TODO wait for convert
+			m_logger->log("willDeviceASBuild: %d, willHostASBuild: %d\nminASBuildScratchSize: %d, maxASBuildScratchSize: %d\nminCompactedASAllocatorSpace: %d, requiredQueueFlags: %d\n", ILogger::ELL_INFO,
+				reservation.willDeviceASBuild(), reservation.willHostASBuild(),
+				reservation.getMinASBuildScratchSize(false), reservation.getMaxASBuildScratchSize(false),
+				reservation.getMinCompactedASAllocatorSpace(), reservation.getRequiredQueueFlags(false));
+			return false;
+
 			auto semaphore = m_device->createSemaphore(0u);
 
 			std::array<IQueue::SSubmitInfo::SCommandBufferInfo, 1> cmdbufs = {};
@@ -785,13 +790,22 @@ class RayQueryGeometryApp final : public examples::SimpleWindowedApplication, pu
 			transfer.scratchSemaphore = {
 				.semaphore = semaphore.get(),
 				.value = 0u,
-				.stageMask = PIPELINE_STAGE_FLAGS::ALL_TRANSFER_BITS	// TODO mask for AS?
+				.stageMask = PIPELINE_STAGE_FLAGS::ALL_TRANSFER_BITS
+			};
+			SIntendedSubmitInfo compute = {};
+			compute.queue = queue;
+			compute.scratchCommandBuffers = cmdbufs;
+			compute.scratchSemaphore = {
+				.semaphore = semaphore.get(),
+				.value = 0u,
+				.stageMask = PIPELINE_STAGE_FLAGS::ACCELERATION_STRUCTURE_BUILD_BIT	// TODO correct mask?
 			};
 			// convert
 			{
 				CAssetConverter::SConvertParams params = {};
 				params.utilities = m_utils.get();
 				params.transfer = &transfer;
+				params.compute = &compute;
 
 				auto future = reservation.convert(params);
 				if (future.copy() != IQueue::RESULT::SUCCESS)

From 825c73d5d8307efef2488f0b6ce82b69c32855ea Mon Sep 17 00:00:00 2001
From: Arkadiusz Lachowicz <areklachowicz@gmail.com>
Date: Wed, 14 May 2025 11:56:11 +0200
Subject: [PATCH 245/529] update media submodule

---
 media | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/media b/media
index a98646358..68dbe85b9 160000
--- a/media
+++ b/media
@@ -1 +1 @@
-Subproject commit a9864635879e5a616ac400eecd8b6451b498fbf1
+Subproject commit 68dbe85b9849c9b094760428a3639f5c8917d85e

From ce29c74f5258851d5cafa105758c45b07ed8f516 Mon Sep 17 00:00:00 2001
From: Erfan Ahmadi <ahmadierfan99@gmail.com>
Date: Wed, 14 May 2025 14:05:49 +0400
Subject: [PATCH 246/529] [WIP] Images.h and ImagesMemorySubAllocator, TODO:
 postDestroyCleanup and deallocate image from the suballocator

---
 62_CAD/DrawResourcesFiller.cpp | 153 ++++++++++++++++++++++++++-------
 62_CAD/DrawResourcesFiller.h   |   6 +-
 62_CAD/Images.h                | 153 +++++++++++++++++++++++++++++++++
 3 files changed, 278 insertions(+), 34 deletions(-)
 create mode 100644 62_CAD/Images.h

diff --git a/62_CAD/DrawResourcesFiller.cpp b/62_CAD/DrawResourcesFiller.cpp
index d5babd393..5948535a9 100644
--- a/62_CAD/DrawResourcesFiller.cpp
+++ b/62_CAD/DrawResourcesFiller.cpp
@@ -27,6 +27,7 @@ void DrawResourcesFiller::setTexturesDescriptorSetAndBinding(core::smart_refctd_
 void DrawResourcesFiller::allocateResourcesBuffer(ILogicalDevice* logicalDevice, size_t size)
 {
 	// TODO: Make this function failable and report insufficient memory if less that getMinimumRequiredResourcesBufferSize, TODO: Have retry mechanism to allocate less mem
+	// TODO: Allocate buffer memory and image memory with 1 allocation, so that failure and retries are more straightforward.
 	size = core::alignUp(size, ResourcesMaxNaturalAlignment);
 	size = core::max(size, getMinimumRequiredResourcesBufferSize());
 	// size = 368u; STRESS TEST
@@ -39,6 +40,46 @@ void DrawResourcesFiller::allocateResourcesBuffer(ILogicalDevice* logicalDevice,
 	IDeviceMemoryBacked::SDeviceMemoryRequirements memReq = resourcesGPUBuffer->getMemoryReqs();
 	memReq.memoryTypeBits &= logicalDevice->getPhysicalDevice()->getDeviceLocalMemoryTypeBits();
 	auto mem = logicalDevice->allocate(memReq, resourcesGPUBuffer.get(), IDeviceMemoryAllocation::EMAF_DEVICE_ADDRESS_BIT);
+
+	// Allocate for Images  
+	{
+		const auto& memoryProperties = logicalDevice->getPhysicalDevice()->getMemoryProperties();
+		uint32_t memoryTypeIdx = ~0u;
+		for (uint32_t i = 0u; i < memoryProperties.memoryTypeCount; ++i)
+		{
+			if (memoryProperties.memoryTypes[i].propertyFlags.hasFlags(IDeviceMemoryAllocation::EMPF_DEVICE_LOCAL_BIT))
+			{
+				memoryTypeIdx = i;
+				break;
+			}
+		}
+
+		if (memoryTypeIdx == ~0u)
+		{
+			// TODO: Log, no device local memory found?! weird
+			assert(false);
+		}
+
+		IDeviceMemoryAllocator::SAllocateInfo allocationInfo =
+		{
+			.size = 512 * 1024 * 1024, // 512 MB
+			.flags = IDeviceMemoryAllocation::E_MEMORY_ALLOCATE_FLAGS::EMAF_NONE,
+			.memoryTypeIndex = memoryTypeIdx,
+			.dedication = nullptr,
+		};
+		imagesMemoryArena = logicalDevice->allocate(allocationInfo);
+
+		if (imagesMemoryArena.isValid())
+		{
+			imagesMemorySubAllocator = std::unique_ptr<ImagesMemorySubAllocator>(new ImagesMemorySubAllocator(allocationInfo.size));
+		}
+		else
+		{
+			// LOG: Allocation failure to allocate memory arena for images 
+			assert(false);
+		}
+	}
+
 }
 
 void DrawResourcesFiller::allocateMSDFTextures(ILogicalDevice* logicalDevice, uint32_t maxMSDFs, uint32_t2 msdfsExtent)
@@ -350,7 +391,6 @@ uint32_t DrawResourcesFiller::addStaticImage2D(image_id imageID, const core::sma
 			suballocatedDescriptorSet->multi_deallocate(imagesArrayBinding, 1u, &evicted.index, deallocationWaitInfo);
 		}
 	};
-	
 
 	// Try inserting or updating the image usage in the cache.
 	// If the image is already present, updates its semaphore value.
@@ -393,47 +433,94 @@ uint32_t DrawResourcesFiller::addStaticImage2D(image_id imageID, const core::sma
 			while (imagesUsageCache->size() > 0u)
 			{
 				// Try creating the image and allocating memory for it:
-				auto gpuImg = device->createImage(std::move(imageParams));
-				if (!gpuImg || !device->allocate(gpuImg->getMemoryReqs(), gpuImg.get()).isValid())
+				auto gpuImage = device->createImage(std::move(imageParams));
+				
+				if (gpuImage)
 				{
-					// Failed creating or allocating the image, evict and retry.
-					if (imagesUsageCache->size() == 1u)
+					nbl::video::IDeviceMemoryBacked::SDeviceMemoryRequirements gpuImageMemoryRequirements = gpuImage->getMemoryReqs();
+					const bool imageMemoryRequirementsMatch = 
+						(physDev->getDeviceLocalMemoryTypeBits() & gpuImageMemoryRequirements.memoryTypeBits) != 0 && // should have device local memory compatible
+						(gpuImageMemoryRequirements.requiresDedicatedAllocation == false); // should not require dedicated allocation
+
+					if (imageMemoryRequirementsMatch)
 					{
-						// Nothing else to evict; give up.
-						// We probably have evicted almost every other texture except the one we just allocated an index for
+						uint64_t allocationOffset = imagesMemorySubAllocator->allocate(gpuImageMemoryRequirements);
+						const bool allocationFromImagesMemoryArenaSuccessfull = allocationOffset != ImagesMemorySubAllocator::InvalidAddress;
+						if (allocationFromImagesMemoryArenaSuccessfull)
+						{
+							nbl::video::ILogicalDevice::SBindImageMemoryInfo bindImageMemoryInfo =
+							{
+								.image = gpuImage.get(),
+								.binding = {.memory = imagesMemoryArena.memory.get(), .offset = imagesMemoryArena.offset + allocationOffset }
+							};
+							const bool boundToMemorySuccessfully = device->bindImageMemory({ &bindImageMemoryInfo, 1u });
+							if (boundToMemorySuccessfully)
+							{
+								IGPUImageView::SCreationParams viewParams = {
+									.image = gpuImage,
+									.viewType = IGPUImageView::ET_2D,
+									.format = gpuImage->getCreationParameters().format
+								};
+								gpuImage->setObjectDebugName((std::to_string(imageID) + " Static Image 2D").c_str());
+								gpuImageView = device->createImageView(std::move(viewParams));
+								if (gpuImageView)
+								{
+									gpuImageView->setObjectDebugName((std::to_string(imageID) + " Static Image View 2D").c_str());
+								}
+								else
+								{
+									// irrecoverable error if simple image creation fails.
+									// TODO[LOG]: that's rare, image view creation failed.
+								}
+
+								// succcessful with everything, just break and get out of this retry loop
+								break;
+							}
+							else
+							{
+								// irrecoverable error if simple bindImageMemory fails.
+								// TODO: LOG
+								break;
+							}
+						}
+						else
+						{
+							// recoverable error when allocation fails, we don't log anything, next code will try evicting other images and retry
+						}
+					}
+					else
+					{
+						// irrecoverable error if memory requirements of the image don't match our preallocated devicememory
+						// TODO: LOG
 						break;
 					}
-
-					assert(imagesUsageCache->size() > 1u);
-
-					const image_id evictionCandidate = imagesUsageCache->select_eviction_candidate();
-					ImageReference* imageRef = imagesUsageCache->peek(evictionCandidate);
-					if (imageRef)
-						evictionCallback(*imageRef);
-					imagesUsageCache->erase(evictionCandidate);
-					suballocatedDescriptorSet->cull_frees(); // to make sure deallocation requests in eviction callback are waited for.
-
-					// we don't hold any references to the GPUImageView or GPUImage so descriptor binding will be the last reference
-					// hopefully by here the suballocated descriptor set freed some VRAM by dropping the image last ref and it's dedicated allocation.
-
-					continue; // Retry allocation after evicting.
 				}
-				
-				IGPUImageView::SCreationParams viewParams = {
-					.image = gpuImg,
-					.viewType = IGPUImageView::ET_2D,
-					.format = gpuImg->getCreationParameters().format
-				};
-				gpuImg->setObjectDebugName((std::to_string(imageID) + " Static Image 2D").c_str());
-				gpuImageView = device->createImageView(std::move(viewParams));
-				if (!gpuImageView)
+				else
 				{
-					// TODO[LOG]: that's rare, image view creation failed.
+					// irrecoverable error if simple image creation fails.
+					// TODO: LOG
 					break;
 				}
 
-				gpuImageView->setObjectDebugName((std::to_string(imageID) + " Static Image View 2D").c_str());
-				break;
+				// Getting here means we failed creating or allocating the image, evict and retry.
+				if (imagesUsageCache->size() == 1u)
+					{
+						// Nothing else to evict; give up.
+						// We probably have evicted almost every other texture except the one we just allocated an index for
+						break;
+					}
+
+				assert(imagesUsageCache->size() > 1u);
+
+				const image_id evictionCandidate = imagesUsageCache->select_eviction_candidate();
+				ImageReference* imageRef = imagesUsageCache->peek(evictionCandidate);
+				if (imageRef)
+					evictionCallback(*imageRef);
+				imagesUsageCache->erase(evictionCandidate);
+				suballocatedDescriptorSet->cull_frees(); // to make sure deallocation requests in eviction callback are waited for.
+
+				// we don't hold any references to the GPUImageView or GPUImage so descriptor binding will be the last reference
+				// hopefully by here the suballocated descriptor set freed some VRAM by dropping the image last ref and it's dedicated allocation.
 			}
 
 			if (gpuImageView)
diff --git a/62_CAD/DrawResourcesFiller.h b/62_CAD/DrawResourcesFiller.h
index 1a86c09e2..594c0fba3 100644
--- a/62_CAD/DrawResourcesFiller.h
+++ b/62_CAD/DrawResourcesFiller.h
@@ -3,7 +3,7 @@
 #include "CTriangleMesh.h"
 #include "Hatch.h"
 #include "IndexAllocator.h"
-#include "ImagesUsageCache.h"
+#include "Images.h"
 #include <nbl/video/utilities/SIntendedSubmitInfo.h>
 #include <nbl/core/containers/LRUCache.h>  
 #include <nbl/ext/TextRendering/TextRendering.h>
@@ -589,6 +589,10 @@ struct DrawResourcesFiller
 	nbl::core::smart_refctd_ptr<IGPUBuffer> resourcesGPUBuffer;
 	size_t copiedResourcesSize;
 
+	// GPUImages Memory Arena + AddressAllocator
+	IDeviceMemoryAllocator::SAllocation imagesMemoryArena;
+	std::unique_ptr<ImagesMemorySubAllocator> imagesMemorySubAllocator;
+
 	// Members
 	smart_refctd_ptr<IUtilities> m_utilities;
 	IQueue* m_copyQueue;
diff --git a/62_CAD/Images.h b/62_CAD/Images.h
new file mode 100644
index 000000000..8b4309669
--- /dev/null
+++ b/62_CAD/Images.h
@@ -0,0 +1,153 @@
+#pragma once
+using namespace nbl;
+using namespace nbl::video;
+using namespace nbl::core;
+using namespace nbl::asset;
+
+using image_id = uint64_t; // Could later be templated or replaced with a stronger type or hash key.
+	
+struct ImageReference
+{
+	static constexpr uint32_t InvalidTextureIndex = nbl::hlsl::numeric_limits<uint32_t>::max;
+	uint32_t index = InvalidTextureIndex; // index in our array of textures binding
+	uint64_t lastUsedSemaphoreValue = 0ull; // last used semaphore value on this image
+	uint64_t memoryUsage = 0ull; // TODO: to be considered later
+
+	ImageReference() 
+		: index(InvalidTextureIndex)
+		, lastUsedSemaphoreValue(0ull)
+		, memoryUsage(0ull)
+	{}
+	
+	// In LRU Cache `insert` function, in case of cache miss, we need to construct the refereence with semaphore value
+	ImageReference(uint64_t semamphoreVal) 
+		: index(InvalidTextureIndex)
+		, lastUsedSemaphoreValue(semamphoreVal)
+		, memoryUsage(0ull)
+	{}
+
+	// In LRU Cache `insert` function, in case of cache hit, we need to assign semaphore value without changing `index`
+	inline ImageReference& operator=(uint64_t semamphoreVal) { lastUsedSemaphoreValue = semamphoreVal; return *this;  }
+};
+
+// A resource-aware image cache with an LRU eviction policy.
+// This cache tracks image usage by ID and provides hooks for eviction logic, such as releasing descriptor slots and deallocating GPU memory.
+// Currently, eviction is purely LRU-based. In the future, eviction decisions may incorporate additional factors:
+//   - memory usage per image.
+//   - lastUsedSemaphoreValue.
+// This class does not own GPU resources directly, but helps coordinate their lifetimes in sync with GPU usage via eviction callbacks.
+class ImagesUsageCache
+{
+public:
+	ImagesUsageCache(size_t capacity) 
+		: lruCache(ImagesLRUCache(capacity))
+	{}
+
+	// Attempts to insert a new image into the cache.
+	// If the cache is full, invokes the provided `evictCallback` to evict an image.
+	// Returns a pointer to the inserted or existing ImageReference.
+	template<std::invocable<const ImageReference&> EvictionCallback>
+	inline ImageReference* insert(image_id imageID, uint64_t lastUsedSema, EvictionCallback&& evictCallback)
+	{
+		return lruCache.insert(imageID, lastUsedSema, std::forward<EvictionCallback>(evictCallback));
+	}
+	
+	// Retrieves the image associated with `imageID`, updating its LRU position.
+	inline ImageReference* get(image_id imageID)
+	{
+		return lruCache.get(imageID);
+	}
+	
+	// Retrieves the ImageReference without updating LRU order.
+	inline ImageReference* peek(image_id imageID)
+	{
+		return lruCache.peek(imageID);
+	}
+
+	inline size_t size() const { return lruCache.size(); }
+	
+	// Selects an eviction candidate based on LRU policy.
+	// In the future, this could factor in memory pressure or semaphore sync requirements.
+	inline image_id select_eviction_candidate() 
+	{
+		const image_id* lru = lruCache.get_least_recently_used();
+		if (lru)
+			return *lru;
+		else
+		{
+			// we shouldn't select eviction candidate if lruCache is empty
+			_NBL_DEBUG_BREAK_IF(true);
+			return 0ull;
+		}
+	}
+	
+	// Removes a specific image from the cache (manual eviction).
+	inline void erase(image_id imageID)
+	{
+		lruCache.erase(imageID);
+	}
+
+private:
+	using ImagesLRUCache = core::ResizableLRUCache<image_id, ImageReference>;
+	ImagesLRUCache lruCache; // TODO: for now, work with simple lru cache, later on consider resource usage along with lastUsedSema value
+};
+
+/**
+ * @class ImagesMemorySubAllocator
+ * @brief A memory sub-allocator designed for managing sub-allocations within a pre-allocated GPU memory arena for images.
+ * 
+ * This class wraps around `nbl::core::GeneralpurposeAddressAllocator` to provide offset-based memory allocation
+ * for image resources within a contiguous block of GPU memory.
+ *
+ * @note This class only manages address offsets. The actual memory must be bound separately.
+ */
+class ImagesMemorySubAllocator
+{
+public:
+	using AddressAllocator = nbl::core::GeneralpurposeAddressAllocator<uint64_t>;
+	using ReservedAllocator = nbl::core::allocator<uint8_t>;
+	static constexpr uint64_t InvalidAddress = AddressAllocator::invalid_address;
+
+	ImagesMemorySubAllocator() = default;
+	
+	ImagesMemorySubAllocator(const uint64_t memoryArenaSize)
+	{
+		constexpr uint64_t MaxAlignment = 4096u; // safe choice based on hardware reports
+		constexpr uint64_t MinAllocSize = 128 * 1024u; // 128KB, the larger this is the better
+		m_reservedAllocSize = AddressAllocator::reserved_size(MaxAlignment, memoryArenaSize, MinAllocSize);
+		m_reservedAllocator = std::unique_ptr<ReservedAllocator>(new ReservedAllocator());
+		m_reservedAlloc = m_reservedAllocator->allocate(m_reservedAllocSize, _NBL_SIMD_ALIGNMENT);
+		m_addressAllocator = std::unique_ptr<AddressAllocator>(new AddressAllocator(
+			m_reservedAlloc, 0u, 0u, MaxAlignment, memoryArenaSize, MinAllocSize
+		));
+
+		// m_addressAllocator->alloc_addr(bytes, alignment);
+		// m_addressAllocator->free_addr(addr, bytes)
+	}
+
+	// return offset, will return InvalidAddress if failed
+	uint64_t allocate(const nbl::video::IDeviceMemoryBacked::SDeviceMemoryRequirements& imageMemoryRequirements)
+	{
+		return m_addressAllocator->alloc_addr(imageMemoryRequirements.size, 1u << imageMemoryRequirements.alignmentLog2);
+	}
+
+	void deallocate(uint64_t addr, uint64_t size)
+	{
+		m_addressAllocator->free_addr(addr, size);
+	}
+
+	~ImagesMemorySubAllocator()
+	{
+		if (m_reservedAlloc)
+			m_reservedAllocator->deallocate(reinterpret_cast<uint8_t*>(m_reservedAlloc), m_reservedAllocSize);
+	}
+	
+private:
+	std::unique_ptr<AddressAllocator> m_addressAllocator = nullptr;
+
+	// Memory Allocation Required for the AddressAllocator
+	std::unique_ptr<ReservedAllocator> m_reservedAllocator = nullptr;
+	void* m_reservedAlloc = nullptr;
+	size_t m_reservedAllocSize = 0;
+
+};
\ No newline at end of file

From bd9ee8e18c28e5fc444b732e59e8c68a8f2c97db Mon Sep 17 00:00:00 2001
From: Erfan Ahmadi <ahmadierfan99@gmail.com>
Date: Wed, 14 May 2025 17:01:08 +0400
Subject: [PATCH 247/529] Better Image Test + PostCleanup free + Fixing command
 buffer usage after potential auto-submit

---
 62_CAD/DrawResourcesFiller.cpp | 30 ++++++++---
 62_CAD/DrawResourcesFiller.h   | 24 +++++++--
 62_CAD/Images.h                | 21 +++-----
 62_CAD/main.cpp                | 96 +++++++++++++++++++---------------
 4 files changed, 106 insertions(+), 65 deletions(-)

diff --git a/62_CAD/DrawResourcesFiller.cpp b/62_CAD/DrawResourcesFiller.cpp
index 5948535a9..411cb356c 100644
--- a/62_CAD/DrawResourcesFiller.cpp
+++ b/62_CAD/DrawResourcesFiller.cpp
@@ -71,7 +71,7 @@ void DrawResourcesFiller::allocateResourcesBuffer(ILogicalDevice* logicalDevice,
 
 		if (imagesMemoryArena.isValid())
 		{
-			imagesMemorySubAllocator = std::unique_ptr<ImagesMemorySubAllocator>(new ImagesMemorySubAllocator(allocationInfo.size));
+			imagesMemorySubAllocator = core::make_smart_refctd_ptr<ImagesMemorySubAllocator>(static_cast<uint64_t>(allocationInfo.size));
 		}
 		else
 		{
@@ -414,7 +414,6 @@ uint32_t DrawResourcesFiller::addStaticImage2D(image_id imageID, const core::sma
 			IGPUImage::SCreationParams imageParams = {};
 			imageParams = cpuImage->getCreationParameters();
 			imageParams.usage |= IGPUImage::EUF_TRANSFER_DST_BIT|IGPUImage::EUF_SAMPLED_BIT;
-
 			// promote format because RGB8 and friends don't actually exist in HW
 			{
 				const IPhysicalDevice::SImageFormatPromotionRequest request = {
@@ -432,19 +431,28 @@ uint32_t DrawResourcesFiller::addStaticImage2D(image_id imageID, const core::sma
 			// we'll evict another texture from the LRU cache and retry until successful, or until only the currently-inserted image remains.
 			while (imagesUsageCache->size() > 0u)
 			{
+				// Pre-create the cleanup object that will later be used to release the image's memory range.
+				// Ownership will be passed to the GPU image, but we retain a temporary raw pointer
+				// so we can configure the cleanup object *after* allocation succeeds.
+				std::unique_ptr<ImageCleanup> cleanupObject = std::make_unique<ImageCleanup>();
+				ImageCleanup* currentImageCleanup = cleanupObject.get();
+				imageParams.postDestroyCleanup = std::move(cleanupObject);
+
 				// Try creating the image and allocating memory for it:
 				auto gpuImage = device->createImage(std::move(imageParams));
 				
 				if (gpuImage)
 				{
 					nbl::video::IDeviceMemoryBacked::SDeviceMemoryRequirements gpuImageMemoryRequirements = gpuImage->getMemoryReqs();
+					uint32_t actualAlignment = 1u << gpuImageMemoryRequirements.alignmentLog2;
 					const bool imageMemoryRequirementsMatch = 
 						(physDev->getDeviceLocalMemoryTypeBits() & gpuImageMemoryRequirements.memoryTypeBits) != 0 && // should have device local memory compatible
-						(gpuImageMemoryRequirements.requiresDedicatedAllocation == false); // should not require dedicated allocation
+						(gpuImageMemoryRequirements.requiresDedicatedAllocation == false) && // should not require dedicated allocation
+						((ImagesMemorySubAllocator::MaxMemoryAlignment % actualAlignment) == 0u); // should be consistent with our suballocator's max alignment
 
 					if (imageMemoryRequirementsMatch)
 					{
-						uint64_t allocationOffset = imagesMemorySubAllocator->allocate(gpuImageMemoryRequirements);
+						uint64_t allocationOffset = imagesMemorySubAllocator->allocate(gpuImageMemoryRequirements.size, 1u << gpuImageMemoryRequirements.alignmentLog2);
 						const bool allocationFromImagesMemoryArenaSuccessfull = allocationOffset != ImagesMemorySubAllocator::InvalidAddress;
 						if (allocationFromImagesMemoryArenaSuccessfull)
 						{
@@ -465,6 +473,10 @@ uint32_t DrawResourcesFiller::addStaticImage2D(image_id imageID, const core::sma
 								gpuImageView = device->createImageView(std::move(viewParams));
 								if (gpuImageView)
 								{
+									// SUCESS!
+									currentImageCleanup->imagesMemorySuballocator = imagesMemorySubAllocator;
+									currentImageCleanup->addr = allocationOffset;
+									currentImageCleanup->size = gpuImageMemoryRequirements.size;
 									gpuImageView->setObjectDebugName((std::to_string(imageID) + " Static Image View 2D").c_str());
 								}
 								else
@@ -778,7 +790,7 @@ bool DrawResourcesFiller::pushMSDFImagesUploads(SIntendedSubmitInfo& intendedNex
 	
 	if (cmdBuffInfo)
 	{
-		IGPUCommandBuffer* cmdBuff = cmdBuffInfo->cmdbuf;
+		IGPUCommandBuffer* commandBuffer = cmdBuffInfo->cmdbuf;
 
 		auto msdfImage = msdfTextureArray->getCreationParameters().image;
 
@@ -808,7 +820,7 @@ bool DrawResourcesFiller::pushMSDFImagesUploads(SIntendedSubmitInfo& intendedNex
 				.newLayout = IImage::LAYOUT::TRANSFER_DST_OPTIMAL,
 			}
 		};
-		cmdBuff->pipelineBarrier(E_DEPENDENCY_FLAGS::EDF_NONE, { .imgBarriers = beforeTransferImageBarrier });
+		commandBuffer->pipelineBarrier(E_DEPENDENCY_FLAGS::EDF_NONE, { .imgBarriers = beforeTransferImageBarrier });
 
 		// Do the copies and advance the iterator.
 		// this is the pattern we use for iterating when entries will get erased if processed successfully, but may get skipped for later.
@@ -857,6 +869,8 @@ bool DrawResourcesFiller::pushMSDFImagesUploads(SIntendedSubmitInfo& intendedNex
 			}
 		}
 
+		commandBuffer = intendedNextSubmit.getCommandBufferForRecording()->cmdbuf; // overflow-submit in utilities calls might've cause current recording command buffer to change
+
 		// preparing msdfs for use
 		image_barrier_t afterTransferImageBarrier[] =
 		{
@@ -882,7 +896,7 @@ bool DrawResourcesFiller::pushMSDFImagesUploads(SIntendedSubmitInfo& intendedNex
 				.newLayout = IImage::LAYOUT::READ_ONLY_OPTIMAL,
 			}
 		};
-		cmdBuff->pipelineBarrier(E_DEPENDENCY_FLAGS::EDF_NONE, { .imgBarriers = afterTransferImageBarrier });
+		commandBuffer->pipelineBarrier(E_DEPENDENCY_FLAGS::EDF_NONE, { .imgBarriers = afterTransferImageBarrier });
 		
 		if (!m_hasInitializedMSDFTextureArrays)
 			m_hasInitializedMSDFTextureArrays = true;
@@ -977,6 +991,8 @@ bool DrawResourcesFiller::pushStaticImagesUploads(SIntendedSubmitInfo& intendedN
 					stagedStaticImage.cpuImage->getRegions());
 			}
 
+			commandBuffer = intendedNextSubmit.getCommandBufferForRecording()->cmdbuf; // overflow-submit in utilities calls might've cause current recording command buffer to change
+
 			std::vector<IGPUCommandBuffer::SPipelineBarrierDependencyInfo::image_barrier_t> afterCopyImageBarriers;
 			afterCopyImageBarriers.resize(staticImagesStagedCopies.size());
 
diff --git a/62_CAD/DrawResourcesFiller.h b/62_CAD/DrawResourcesFiller.h
index 594c0fba3..ea60b33cd 100644
--- a/62_CAD/DrawResourcesFiller.h
+++ b/62_CAD/DrawResourcesFiller.h
@@ -591,7 +591,7 @@ struct DrawResourcesFiller
 
 	// GPUImages Memory Arena + AddressAllocator
 	IDeviceMemoryAllocator::SAllocation imagesMemoryArena;
-	std::unique_ptr<ImagesMemorySubAllocator> imagesMemorySubAllocator;
+	smart_refctd_ptr<ImagesMemorySubAllocator> imagesMemorySubAllocator;
 
 	// Members
 	smart_refctd_ptr<IUtilities> m_utilities;
@@ -626,6 +626,7 @@ struct DrawResourcesFiller
 
 	std::vector<MSDFStagedCPUImage>		msdfStagedCPUImages = {}; // cached cpu imaged + their status, size equals to LRUCache size
 	static constexpr asset::E_FORMAT	MSDFTextureFormat = asset::E_FORMAT::EF_R8G8B8A8_SNORM;
+	bool m_hasInitializedMSDFTextureArrays = false;
 	
 	// Images:
 	std::unique_ptr<ImagesUsageCache> imagesUsageCache;
@@ -641,7 +642,24 @@ struct DrawResourcesFiller
 	};
 	std::vector<StaticImagesCopy> staticImagesStagedCopies;
 
-	
-	bool m_hasInitializedMSDFTextureArrays = false;
+	struct ImageCleanup : nbl::video::ICleanup
+	{
+		ImageCleanup()
+			: imagesMemorySuballocator(nullptr)
+			, addr(ImagesMemorySubAllocator::InvalidAddress)
+			, size(0ull)
+		{}
+
+		~ImageCleanup() override
+		{
+			if (imagesMemorySuballocator && addr != ImagesMemorySubAllocator::InvalidAddress)
+				imagesMemorySuballocator->deallocate(addr, size);
+		}
+
+		smart_refctd_ptr<ImagesMemorySubAllocator> imagesMemorySuballocator;
+		uint64_t addr;
+		uint64_t size;
+
+	};
 };
 
diff --git a/62_CAD/Images.h b/62_CAD/Images.h
index 8b4309669..b2772d217 100644
--- a/62_CAD/Images.h
+++ b/62_CAD/Images.h
@@ -101,34 +101,29 @@ class ImagesUsageCache
  *
  * @note This class only manages address offsets. The actual memory must be bound separately.
  */
-class ImagesMemorySubAllocator
+class ImagesMemorySubAllocator : public core::IReferenceCounted 
 {
 public:
 	using AddressAllocator = nbl::core::GeneralpurposeAddressAllocator<uint64_t>;
 	using ReservedAllocator = nbl::core::allocator<uint8_t>;
 	static constexpr uint64_t InvalidAddress = AddressAllocator::invalid_address;
+	static constexpr uint64_t MaxMemoryAlignment = 4096u; // safe choice based on hardware reports
+	static constexpr uint64_t MinAllocSize = 128 * 1024u; // 128KB, the larger this is the better
 
-	ImagesMemorySubAllocator() = default;
-	
-	ImagesMemorySubAllocator(const uint64_t memoryArenaSize)
+	ImagesMemorySubAllocator(uint64_t memoryArenaSize)
 	{
-		constexpr uint64_t MaxAlignment = 4096u; // safe choice based on hardware reports
-		constexpr uint64_t MinAllocSize = 128 * 1024u; // 128KB, the larger this is the better
-		m_reservedAllocSize = AddressAllocator::reserved_size(MaxAlignment, memoryArenaSize, MinAllocSize);
+		m_reservedAllocSize = AddressAllocator::reserved_size(MaxMemoryAlignment, memoryArenaSize, MinAllocSize);
 		m_reservedAllocator = std::unique_ptr<ReservedAllocator>(new ReservedAllocator());
 		m_reservedAlloc = m_reservedAllocator->allocate(m_reservedAllocSize, _NBL_SIMD_ALIGNMENT);
 		m_addressAllocator = std::unique_ptr<AddressAllocator>(new AddressAllocator(
-			m_reservedAlloc, 0u, 0u, MaxAlignment, memoryArenaSize, MinAllocSize
+			m_reservedAlloc, 0u, 0u, MaxMemoryAlignment, memoryArenaSize, MinAllocSize
 		));
-
-		// m_addressAllocator->alloc_addr(bytes, alignment);
-		// m_addressAllocator->free_addr(addr, bytes)
 	}
 
 	// return offset, will return InvalidAddress if failed
-	uint64_t allocate(const nbl::video::IDeviceMemoryBacked::SDeviceMemoryRequirements& imageMemoryRequirements)
+	uint64_t allocate(uint64_t size, uint64_t alignment)
 	{
-		return m_addressAllocator->alloc_addr(imageMemoryRequirements.size, 1u << imageMemoryRequirements.alignmentLog2);
+		return m_addressAllocator->alloc_addr(size, alignment);
 	}
 
 	void deallocate(uint64_t addr, uint64_t size)
diff --git a/62_CAD/main.cpp b/62_CAD/main.cpp
index 0aad1669e..8c6e5c33f 100644
--- a/62_CAD/main.cpp
+++ b/62_CAD/main.cpp
@@ -1040,52 +1040,62 @@ class ComputerAidedDesign final : public examples::SimpleWindowedApplication, pu
 
 		// Load image
 		system::path m_loadCWD = "..";
-		std::string imagePath = "../../media/color_space_test/R8G8B8A8_1.png";
-
-		constexpr auto cachingFlags = static_cast<IAssetLoader::E_CACHING_FLAGS>(IAssetLoader::ECF_DONT_CACHE_REFERENCES & IAssetLoader::ECF_DONT_CACHE_TOP_LEVEL);
-		const IAssetLoader::SAssetLoadParams loadParams(0ull, nullptr, cachingFlags, IAssetLoader::ELPF_NONE, m_logger.get(), m_loadCWD);
-		auto bundle = m_assetMgr->getAsset(imagePath, loadParams);
-		auto contents = bundle.getContents();
-		if (contents.empty())
+		constexpr uint32_t SampleImagesCount = 4u;
+		std::string imagePaths[SampleImagesCount] =
 		{
-			m_logger->log("Failed to load image with path %s, skipping!", ILogger::ELL_ERROR, (m_loadCWD / imagePath).c_str());
-		}
+			"../../media/color_space_test/R8G8B8A8_1.png",
+			"../../media/color_space_test/R8G8B8A8_2.png",
+			"../../media/color_space_test/R8G8B8_1.png",
+			"../../media/color_space_test/R8G8B8_1.jpg",
+		};
 
-		smart_refctd_ptr<ICPUImageView> cpuImgView;
-		const auto& asset = contents[0];
-		switch (asset->getAssetType())
+		for (uint32_t i = 0; i < SampleImagesCount; ++i)
 		{
-		case IAsset::ET_IMAGE:
-		{
-			auto image = smart_refctd_ptr_static_cast<ICPUImage>(asset);
-			const auto format = image->getCreationParameters().format;
-
-			ICPUImageView::SCreationParams viewParams = {
-				.flags = ICPUImageView::E_CREATE_FLAGS::ECF_NONE,
-				.image = std::move(image),
-				.viewType = IImageView<ICPUImage>::E_TYPE::ET_2D,
-				.format = format,
-				.subresourceRange = {
-					.aspectMask = IImage::E_ASPECT_FLAGS::EAF_COLOR_BIT,
-					.baseMipLevel = 0u,
-					.levelCount = ICPUImageView::remaining_mip_levels,
-					.baseArrayLayer = 0u,
-					.layerCount = ICPUImageView::remaining_array_layers
-				}
-			};
+			constexpr auto cachingFlags = static_cast<IAssetLoader::E_CACHING_FLAGS>(IAssetLoader::ECF_DONT_CACHE_REFERENCES & IAssetLoader::ECF_DONT_CACHE_TOP_LEVEL);
+			const IAssetLoader::SAssetLoadParams loadParams(0ull, nullptr, cachingFlags, IAssetLoader::ELPF_NONE, m_logger.get(), m_loadCWD);
+			auto bundle = m_assetMgr->getAsset(imagePaths[i], loadParams);
+			auto contents = bundle.getContents();
+			if (contents.empty())
+			{
+				m_logger->log("Failed to load image with path %s, skipping!", ILogger::ELL_ERROR, (m_loadCWD / imagePaths[i]).c_str());
+			}
+
+			smart_refctd_ptr<ICPUImageView> cpuImgView;
+			const auto& asset = contents[0];
+			switch (asset->getAssetType())
+			{
+			case IAsset::ET_IMAGE:
+			{
+				auto image = smart_refctd_ptr_static_cast<ICPUImage>(asset);
+				const auto format = image->getCreationParameters().format;
+
+				ICPUImageView::SCreationParams viewParams = {
+					.flags = ICPUImageView::E_CREATE_FLAGS::ECF_NONE,
+					.image = std::move(image),
+					.viewType = IImageView<ICPUImage>::E_TYPE::ET_2D,
+					.format = format,
+					.subresourceRange = {
+						.aspectMask = IImage::E_ASPECT_FLAGS::EAF_COLOR_BIT,
+						.baseMipLevel = 0u,
+						.levelCount = ICPUImageView::remaining_mip_levels,
+						.baseArrayLayer = 0u,
+						.layerCount = ICPUImageView::remaining_array_layers
+					}
+				};
+
+				cpuImgView = ICPUImageView::create(std::move(viewParams));
+			} break;
 
-			cpuImgView = ICPUImageView::create(std::move(viewParams));
-		} break;
+			case IAsset::ET_IMAGE_VIEW:
+				cpuImgView = smart_refctd_ptr_static_cast<ICPUImageView>(asset);
+				break;
+			default:
+				m_logger->log("Failed to load ICPUImage or ICPUImageView got some other Asset Type, skipping!", ILogger::ELL_ERROR);
+			}
 
-		case IAsset::ET_IMAGE_VIEW:
-			cpuImgView = smart_refctd_ptr_static_cast<ICPUImageView>(asset);
-			break;
-		default:
-			m_logger->log("Failed to load ICPUImage or ICPUImageView got some other Asset Type, skipping!", ILogger::ELL_ERROR);
+			const auto cpuImage = cpuImgView->getCreationParameters().image;
+			sampleImages.push_back(cpuImage);
 		}
-				
-		const auto cpuImage = cpuImgView->getCreationParameters().image;
-		sampleImages.push_back(cpuImage);
 
 		return true;
 	}
@@ -1276,6 +1286,8 @@ class ComputerAidedDesign final : public examples::SimpleWindowedApplication, pu
 
 		drawResourcesFiller.pushAllUploads(intendedSubmitInfo);
 
+		m_currentRecordingCommandBufferInfo = intendedSubmitInfo.getCommandBufferForRecording(); // drawResourcesFiller.pushAllUploads might've overflow submitted and changed the current recording command buffer
+
 		// Use the current recording command buffer of the intendedSubmitInfos scratchCommandBuffers, it should be in recording state
 		auto* cb = m_currentRecordingCommandBufferInfo->cmdbuf;
 		
@@ -2890,8 +2902,8 @@ class ComputerAidedDesign final : public examples::SimpleWindowedApplication, pu
 			{
 				uint64_t imageID = i * 69ull; // it can be hash or something of the file path the image was loaded from
 				drawResourcesFiller.addStaticImage2D(imageID, sampleImages[i], intendedNextSubmit);
-				drawResourcesFiller.addImageObject(imageID, { 0.0, 0.0 }, { 100.0, 100.0 }, 0.0, intendedNextSubmit);
-				drawResourcesFiller.addImageObject(imageID, { 40.0, +40.0 }, { 100.0, 100.0 }, 0.0, intendedNextSubmit);
+				drawResourcesFiller.addImageObject(imageID, { 0.0 + i * 100.0, 0.0 }, { 100.0 , 100.0 }, 0.0, intendedNextSubmit);
+				// drawResourcesFiller.addImageObject(imageID, { 40.0, +40.0 }, { 100.0, 100.0 }, 0.0, intendedNextSubmit);
 			}
 			LineStyleInfo lineStyle = 
 			{

From fcff2ac68ea5dafcff8873bd379bd691c003bd59 Mon Sep 17 00:00:00 2001
From: Erfan Ahmadi <ahmadierfan99@gmail.com>
Date: Wed, 14 May 2025 18:32:29 +0400
Subject: [PATCH 248/529] Small edits and refactor

---
 62_CAD/DrawResourcesFiller.h | 20 --------------------
 62_CAD/Images.h              | 20 ++++++++++++++++++++
 62_CAD/main.cpp              |  9 ++++-----
 3 files changed, 24 insertions(+), 25 deletions(-)

diff --git a/62_CAD/DrawResourcesFiller.h b/62_CAD/DrawResourcesFiller.h
index ea60b33cd..e87ede3b6 100644
--- a/62_CAD/DrawResourcesFiller.h
+++ b/62_CAD/DrawResourcesFiller.h
@@ -641,25 +641,5 @@ struct DrawResourcesFiller
 		uint32_t arrayIndex;
 	};
 	std::vector<StaticImagesCopy> staticImagesStagedCopies;
-
-	struct ImageCleanup : nbl::video::ICleanup
-	{
-		ImageCleanup()
-			: imagesMemorySuballocator(nullptr)
-			, addr(ImagesMemorySubAllocator::InvalidAddress)
-			, size(0ull)
-		{}
-
-		~ImageCleanup() override
-		{
-			if (imagesMemorySuballocator && addr != ImagesMemorySubAllocator::InvalidAddress)
-				imagesMemorySuballocator->deallocate(addr, size);
-		}
-
-		smart_refctd_ptr<ImagesMemorySubAllocator> imagesMemorySuballocator;
-		uint64_t addr;
-		uint64_t size;
-
-	};
 };
 
diff --git a/62_CAD/Images.h b/62_CAD/Images.h
index b2772d217..6e13d19cd 100644
--- a/62_CAD/Images.h
+++ b/62_CAD/Images.h
@@ -145,4 +145,24 @@ class ImagesMemorySubAllocator : public core::IReferenceCounted
 	void* m_reservedAlloc = nullptr;
 	size_t m_reservedAllocSize = 0;
 
+};
+
+struct ImageCleanup : nbl::video::ICleanup
+{
+	ImageCleanup()
+		: imagesMemorySuballocator(nullptr)
+		, addr(ImagesMemorySubAllocator::InvalidAddress)
+		, size(0ull)
+	{}
+
+	~ImageCleanup() override
+	{
+		if (imagesMemorySuballocator && addr != ImagesMemorySubAllocator::InvalidAddress)
+			imagesMemorySuballocator->deallocate(addr, size);
+	}
+
+	smart_refctd_ptr<ImagesMemorySubAllocator> imagesMemorySuballocator;
+	uint64_t addr;
+	uint64_t size;
+
 };
\ No newline at end of file
diff --git a/62_CAD/main.cpp b/62_CAD/main.cpp
index 8c6e5c33f..016571fa8 100644
--- a/62_CAD/main.cpp
+++ b/62_CAD/main.cpp
@@ -1040,8 +1040,7 @@ class ComputerAidedDesign final : public examples::SimpleWindowedApplication, pu
 
 		// Load image
 		system::path m_loadCWD = "..";
-		constexpr uint32_t SampleImagesCount = 4u;
-		std::string imagePaths[SampleImagesCount] =
+		std::string imagePaths[] =
 		{
 			"../../media/color_space_test/R8G8B8A8_1.png",
 			"../../media/color_space_test/R8G8B8A8_2.png",
@@ -1049,15 +1048,15 @@ class ComputerAidedDesign final : public examples::SimpleWindowedApplication, pu
 			"../../media/color_space_test/R8G8B8_1.jpg",
 		};
 
-		for (uint32_t i = 0; i < SampleImagesCount; ++i)
+		for (const auto& imagePath : imagePaths)
 		{
 			constexpr auto cachingFlags = static_cast<IAssetLoader::E_CACHING_FLAGS>(IAssetLoader::ECF_DONT_CACHE_REFERENCES & IAssetLoader::ECF_DONT_CACHE_TOP_LEVEL);
 			const IAssetLoader::SAssetLoadParams loadParams(0ull, nullptr, cachingFlags, IAssetLoader::ELPF_NONE, m_logger.get(), m_loadCWD);
-			auto bundle = m_assetMgr->getAsset(imagePaths[i], loadParams);
+			auto bundle = m_assetMgr->getAsset(imagePath, loadParams);
 			auto contents = bundle.getContents();
 			if (contents.empty())
 			{
-				m_logger->log("Failed to load image with path %s, skipping!", ILogger::ELL_ERROR, (m_loadCWD / imagePaths[i]).c_str());
+				m_logger->log("Failed to load image with path %s, skipping!", ILogger::ELL_ERROR, (m_loadCWD / imagePath).c_str());
 			}
 
 			smart_refctd_ptr<ICPUImageView> cpuImgView;

From 052148f0d1611df0ae2e9cb4d9ee4edc08e3f351 Mon Sep 17 00:00:00 2001
From: devsh <devsh@devsh.eu>
Date: Wed, 14 May 2025 16:55:37 +0200
Subject: [PATCH 249/529] disable more old code, use two queues for BLAS & TLAS
 convert and multiple command buffers

add code to test ReBAR uploads
---
 67_RayQueryGeometry/main.cpp | 140 ++++++++++++++++++++++++++++-------
 1 file changed, 115 insertions(+), 25 deletions(-)

diff --git a/67_RayQueryGeometry/main.cpp b/67_RayQueryGeometry/main.cpp
index cec4e5270..0d7494e9c 100644
--- a/67_RayQueryGeometry/main.cpp
+++ b/67_RayQueryGeometry/main.cpp
@@ -538,6 +538,7 @@ class RayQueryGeometryApp final : public examples::SimpleWindowedApplication, pu
 			return buffer;
 		}
 
+#ifndef TEST_ASSET_CONV_AS
 		smart_refctd_ptr<IGPUCommandBuffer> getSingleUseCommandBufferAndBegin(smart_refctd_ptr<IGPUCommandPool> pool)
 		{
 			smart_refctd_ptr<IGPUCommandBuffer> cmdbuf;
@@ -596,6 +597,7 @@ class RayQueryGeometryApp final : public examples::SimpleWindowedApplication, pu
 				m_device->blockForSemaphores(info);
 			}
 		}
+#endif
 
 #ifdef TEST_ASSET_CONV_AS
 		bool createAccelerationStructuresFromGeometry(video::CThreadSafeQueueAdapter* queue, const IGeometryCreator* gc)
@@ -720,15 +722,41 @@ class RayQueryGeometryApp final : public examples::SimpleWindowedApplication, pu
 			cpuTlas->setInstances(std::move(geomInstances));
 			cpuTlas->setBuildFlags(IGPUTopLevelAccelerationStructure::BUILD_FLAGS::PREFER_FAST_TRACE_BIT);
 
+//#define TEST_REBAR_FALLBACK
 			// convert with asset converter
-			auto pool = m_device->createCommandPool(queue->getFamilyIndex(), IGPUCommandPool::CREATE_FLAGS::RESET_COMMAND_BUFFER_BIT);
-			if (!pool)
-				return logFail("Couldn't create Command Pool for geometry creation!");
-			auto cmdbuf = getSingleUseCommandBufferAndBegin(pool);
-
 			smart_refctd_ptr<CAssetConverter> converter = CAssetConverter::create({ .device = m_device.get(), .optimizer = {} });
-			CAssetConverter::SInputs inputs = {};
+			struct MyInputs : CAssetConverter::SInputs
+			{
+#ifndef TEST_REBAR_FALLBACK
+				inline uint32_t constrainMemoryTypeBits(const size_t groupCopyID, const IAsset* canonicalAsset, const blake3_hash_t& contentHash, const IDeviceMemoryBacked* memoryBacked) const override
+				{
+					assert(memoryBacked);
+					return memoryBacked->getObjectType()!=IDeviceMemoryBacked::EOT_BUFFER ? (~0u):rebarMemoryTypes;
+				}
+#endif
+				uint32_t rebarMemoryTypes;
+			} inputs = {};
 			inputs.logger = m_logger.get();
+			inputs.rebarMemoryTypes = m_physicalDevice->getDirectVRAMAccessMemoryTypeBits();
+#ifndef TEST_REBAR_FALLBACK
+			struct MyAllocator final : public IDeviceMemoryAllocator
+			{
+				ILogicalDevice* getDeviceForAllocations() const override {return device;}
+
+				SAllocation allocate(const SAllocateInfo& info) override
+				{
+					auto retval = device->allocate(info);
+					// map what is mappable by default so ReBAR checks succeed
+					if (retval.isValid() && retval.memory->isMappable())
+						retval.memory->map({.offset=0,.length=info.size});
+					return retval;
+				}
+
+				ILogicalDevice* device;
+			} myalloc;
+			myalloc.device = m_device.get();
+			inputs.allocator = &myalloc;
+#endif
 
 			std::array<ICPUTopLevelAccelerationStructure*, 1u> tmpTlas;
 			std::array<ICPUBuffer*, OT_COUNT * 2u> tmpBuffers;
@@ -772,40 +800,102 @@ class RayQueryGeometryApp final : public examples::SimpleWindowedApplication, pu
 				prepass.template operator() < ICPUBuffer > (tmpBuffers);
 			}
 
-			// TODO wait for convert
-			m_logger->log("willDeviceASBuild: %d, willHostASBuild: %d\nminASBuildScratchSize: %d, maxASBuildScratchSize: %d\nminCompactedASAllocatorSpace: %d, requiredQueueFlags: %d\n", ILogger::ELL_INFO,
-				reservation.willDeviceASBuild(), reservation.willHostASBuild(),
-				reservation.getMinASBuildScratchSize(false), reservation.getMaxASBuildScratchSize(false),
-				reservation.getMinCompactedASAllocatorSpace(), reservation.getRequiredQueueFlags(false));
-			return false;
-
-			auto semaphore = m_device->createSemaphore(0u);
-
-			std::array<IQueue::SSubmitInfo::SCommandBufferInfo, 1> cmdbufs = {};
-			cmdbufs.front().cmdbuf = cmdbuf.get();
 
+			constexpr auto XferBufferCount = 2;
+			std::array<smart_refctd_ptr<IGPUCommandBuffer>,XferBufferCount> xferBufs = {};
+			std::array<IQueue::SSubmitInfo::SCommandBufferInfo,XferBufferCount> xferBufInfos = {};
+			{
+				auto pool = m_device->createCommandPool(getTransferUpQueue()->getFamilyIndex(),IGPUCommandPool::CREATE_FLAGS::RESET_COMMAND_BUFFER_BIT | IGPUCommandPool::CREATE_FLAGS::TRANSIENT_BIT);
+				pool->createCommandBuffers(IGPUCommandPool::BUFFER_LEVEL::PRIMARY,xferBufs);
+				xferBufs.front()->begin(IGPUCommandBuffer::USAGE::ONE_TIME_SUBMIT_BIT);
+				for (auto i=0; i<XferBufferCount; i++)
+					xferBufInfos[i].cmdbuf = xferBufs[i].get();
+			}
+			auto xferSema = m_device->createSemaphore(0u);
 			SIntendedSubmitInfo transfer = {};
-			transfer.queue = queue;
-			transfer.scratchCommandBuffers = cmdbufs;
+			transfer.queue = getTransferUpQueue();
+			transfer.scratchCommandBuffers = xferBufInfos;
 			transfer.scratchSemaphore = {
-				.semaphore = semaphore.get(),
+				.semaphore = xferSema.get(),
 				.value = 0u,
 				.stageMask = PIPELINE_STAGE_FLAGS::ALL_TRANSFER_BITS
 			};
+			
+			constexpr auto CompBufferCount = 2;
+			std::array<smart_refctd_ptr<IGPUCommandBuffer>,CompBufferCount> compBufs = {};
+			std::array<IQueue::SSubmitInfo::SCommandBufferInfo,CompBufferCount> compBufInfos = {};
+			{
+				auto pool = m_device->createCommandPool(getComputeQueue()->getFamilyIndex(),IGPUCommandPool::CREATE_FLAGS::RESET_COMMAND_BUFFER_BIT|IGPUCommandPool::CREATE_FLAGS::TRANSIENT_BIT);
+				pool->createCommandBuffers(IGPUCommandPool::BUFFER_LEVEL::PRIMARY,compBufs);
+				compBufs.front()->begin(IGPUCommandBuffer::USAGE::ONE_TIME_SUBMIT_BIT);
+				for (auto i=0; i<CompBufferCount; i++)
+					compBufInfos[i].cmdbuf = compBufs[i].get();
+			}
+			auto compSema = m_device->createSemaphore(0u);
 			SIntendedSubmitInfo compute = {};
-			compute.queue = queue;
-			compute.scratchCommandBuffers = cmdbufs;
+			compute.queue = getComputeQueue();
+			compute.scratchCommandBuffers = compBufInfos;
 			compute.scratchSemaphore = {
-				.semaphore = semaphore.get(),
+				.semaphore = compSema.get(),
 				.value = 0u,
-				.stageMask = PIPELINE_STAGE_FLAGS::ACCELERATION_STRUCTURE_BUILD_BIT	// TODO correct mask?
+				.stageMask = PIPELINE_STAGE_FLAGS::ACCELERATION_STRUCTURE_BUILD_BIT|PIPELINE_STAGE_FLAGS::ACCELERATION_STRUCTURE_COPY_BIT
 			};
 			// convert
 			{
-				CAssetConverter::SConvertParams params = {};
+				smart_refctd_ptr<CAssetConverter::SConvertParams::scratch_for_device_AS_build_t> scratchAlloc;
+				{
+					constexpr auto MaxAlignment = 256;
+					constexpr auto MinAllocationSize = 1024;
+					const auto scratchSize = core::alignUp(reservation.getMinASBuildScratchSize(false),MaxAlignment);
+					
+
+					IGPUBuffer::SCreationParams creationParams = {};
+					creationParams.size = scratchSize;
+					creationParams.usage = IGPUBuffer::EUF_ACCELERATION_STRUCTURE_BUILD_INPUT_READ_ONLY_BIT|IGPUBuffer::EUF_SHADER_DEVICE_ADDRESS_BIT|IGPUBuffer::EUF_STORAGE_BUFFER_BIT;
+#ifdef TEST_REBAR_FALLBACK
+					creationParams.usage |= IGPUBuffer::EUF_TRANSFER_DST_BIT;
+					core::unordered_set<uint32_t> sharingSet = {compute.queue->getFamilyIndex(),transfer.queue->getFamilyIndex()};
+					core::vector<uint32_t> sharingIndices(sharingSet.begin(),sharingSet.end());
+					if (sharingIndices.size()>1)
+						creationParams.queueFamilyIndexCount = sharingIndices.size();
+					creationParams.queueFamilyIndices = sharingIndices.data();
+#endif
+					auto scratchBuffer = m_device->createBuffer(std::move(creationParams));
+
+					auto reqs = scratchBuffer->getMemoryReqs();
+#ifndef TEST_REBAR_FALLBACK
+					reqs.memoryTypeBits &= m_physicalDevice->getDirectVRAMAccessMemoryTypeBits();
+#endif
+					auto allocation = m_device->allocate(reqs,scratchBuffer.get(),IDeviceMemoryAllocation::EMAF_DEVICE_ADDRESS_BIT);
+#ifndef TEST_REBAR_FALLBACK
+					allocation.memory->map({.offset=0,.length=reqs.size});
+#endif
+
+					scratchAlloc = make_smart_refctd_ptr<CAssetConverter::SConvertParams::scratch_for_device_AS_build_t>(
+						SBufferRange<video::IGPUBuffer>{0ull,scratchSize,std::move(scratchBuffer)},
+						core::allocator<uint8_t>(),MaxAlignment,MinAllocationSize
+					);
+				}
+
+				struct MyParams final : CAssetConverter::SConvertParams
+				{
+					inline uint32_t getFinalOwnerQueueFamily(const IGPUBuffer* buffer, const core::blake3_hash_t& createdFrom) override
+					{
+						return finalUser;
+					}
+					inline uint32_t getFinalOwnerQueueFamily(const IGPUAccelerationStructure* image, const core::blake3_hash_t& createdFrom) override
+					{
+						return finalUser;
+					}
+
+					uint8_t finalUser;
+				} params = {};
+#undef TEST_REBAR_FALLBACK
 				params.utilities = m_utils.get();
 				params.transfer = &transfer;
 				params.compute = &compute;
+				params.scratchForDeviceASBuild = scratchAlloc.get();
+				params.finalUser = queue->getFamilyIndex();
 
 				auto future = reservation.convert(params);
 				if (future.copy() != IQueue::RESULT::SUCCESS)

From 908abd110c387d48110ce8aeb67f0e0f2dd68943 Mon Sep 17 00:00:00 2001
From: keptsecret <sorchon@gmail.com>
Date: Thu, 15 May 2025 10:37:32 +0700
Subject: [PATCH 250/529] refactor name changes

---
 23_Arithmetic2UnitTest/app_resources/workgroupCommon.hlsl | 2 +-
 74_Arithmetic2Bench/app_resources/workgroupCommon.hlsl    | 2 +-
 2 files changed, 2 insertions(+), 2 deletions(-)

diff --git a/23_Arithmetic2UnitTest/app_resources/workgroupCommon.hlsl b/23_Arithmetic2UnitTest/app_resources/workgroupCommon.hlsl
index 702fcbe25..031a34d08 100644
--- a/23_Arithmetic2UnitTest/app_resources/workgroupCommon.hlsl
+++ b/23_Arithmetic2UnitTest/app_resources/workgroupCommon.hlsl
@@ -21,7 +21,7 @@ uint32_t3 nbl::hlsl::glsl::gl_WorkGroupSize() {return uint32_t3(WORKGROUP_SIZE,1
 #error "Define ITEMS_PER_INVOCATION!"
 #endif
 
-using config_t = nbl::hlsl::workgroup2::Configuration<WORKGROUP_SIZE_LOG2, SUBGROUP_SIZE_LOG2, ITEMS_PER_INVOCATION>;
+using config_t = nbl::hlsl::workgroup2::ArithmeticConfiguration<WORKGROUP_SIZE_LOG2, SUBGROUP_SIZE_LOG2, ITEMS_PER_INVOCATION>;
 
 typedef vector<uint32_t, config_t::ItemsPerInvocation_0> type_t;
 
diff --git a/74_Arithmetic2Bench/app_resources/workgroupCommon.hlsl b/74_Arithmetic2Bench/app_resources/workgroupCommon.hlsl
index 026687cfa..19e5893f0 100644
--- a/74_Arithmetic2Bench/app_resources/workgroupCommon.hlsl
+++ b/74_Arithmetic2Bench/app_resources/workgroupCommon.hlsl
@@ -21,7 +21,7 @@ uint32_t3 nbl::hlsl::glsl::gl_WorkGroupSize() {return uint32_t3(WORKGROUP_SIZE,1
 #error "Define ITEMS_PER_INVOCATION!"
 #endif
 
-using config_t = nbl::hlsl::workgroup2::Configuration<WORKGROUP_SIZE_LOG2, SUBGROUP_SIZE_LOG2, ITEMS_PER_INVOCATION>;
+using config_t = nbl::hlsl::workgroup2::ArithmeticConfiguration<WORKGROUP_SIZE_LOG2, SUBGROUP_SIZE_LOG2, ITEMS_PER_INVOCATION>;
 
 typedef vector<uint32_t, config_t::ItemsPerInvocation_0> type_t;
 

From 81238adaecbd8d717bdab0dd73e08e2938a794c6 Mon Sep 17 00:00:00 2001
From: keptsecret <sorchon@gmail.com>
Date: Thu, 15 May 2025 14:41:16 +0700
Subject: [PATCH 251/529] minor refactor

---
 23_Arithmetic2UnitTest/app_resources/workgroupCommon.hlsl | 2 +-
 23_Arithmetic2UnitTest/main.cpp                           | 2 +-
 74_Arithmetic2Bench/app_resources/workgroupCommon.hlsl    | 2 +-
 3 files changed, 3 insertions(+), 3 deletions(-)

diff --git a/23_Arithmetic2UnitTest/app_resources/workgroupCommon.hlsl b/23_Arithmetic2UnitTest/app_resources/workgroupCommon.hlsl
index 031a34d08..69aa11ecc 100644
--- a/23_Arithmetic2UnitTest/app_resources/workgroupCommon.hlsl
+++ b/23_Arithmetic2UnitTest/app_resources/workgroupCommon.hlsl
@@ -43,7 +43,7 @@ bool canStore();
 #endif
 
 // final (level 1/2) scan needs to fit in one subgroup exactly
-groupshared uint32_t scratch[config_t::SubgroupsPerVirtualWorkgroup*config_t::ItemsPerInvocation_1];
+groupshared uint32_t scratch[config_t::ElementCount];
 
 struct ScratchProxy
 {
diff --git a/23_Arithmetic2UnitTest/main.cpp b/23_Arithmetic2UnitTest/main.cpp
index a3c274160..e7dfcefa1 100644
--- a/23_Arithmetic2UnitTest/main.cpp
+++ b/23_Arithmetic2UnitTest/main.cpp
@@ -174,7 +174,7 @@ class Workgroup2ScanTestApp final : public application_templates::BasicMultiQueu
 		for (auto subgroupSize=MinSubgroupSize; subgroupSize <= MaxSubgroupSize; subgroupSize *= 2u)
 		{
 			const uint8_t subgroupSizeLog2 = hlsl::findMSB(subgroupSize);
-			for (uint32_t workgroupSize = subgroupSize; workgroupSize < MaxWorkgroupSize; workgroupSize *= 2)
+			for (uint32_t workgroupSize = subgroupSize; workgroupSize <= MaxWorkgroupSize; workgroupSize *= 2u)
 			{
 				// make sure renderdoc captures everything for debugging
 				m_api->startCapture();
diff --git a/74_Arithmetic2Bench/app_resources/workgroupCommon.hlsl b/74_Arithmetic2Bench/app_resources/workgroupCommon.hlsl
index 19e5893f0..69aa11ecc 100644
--- a/74_Arithmetic2Bench/app_resources/workgroupCommon.hlsl
+++ b/74_Arithmetic2Bench/app_resources/workgroupCommon.hlsl
@@ -43,7 +43,7 @@ bool canStore();
 #endif
 
 // final (level 1/2) scan needs to fit in one subgroup exactly
-groupshared uint32_t scratch[config_t::SharedMemSize];
+groupshared uint32_t scratch[config_t::ElementCount];
 
 struct ScratchProxy
 {

From 093099f7c2b524507aa7f1dc0b1e9c089f4921ae Mon Sep 17 00:00:00 2001
From: Erfan Ahmadi <ahmadierfan99@gmail.com>
Date: Thu, 15 May 2025 12:21:23 +0400
Subject: [PATCH 252/529] Test and verify overflow submit works with images
 bound to non dedicated memory arena

---
 62_CAD/DrawResourcesFiller.cpp |  48 ++++++----
 62_CAD/Images.h                | 161 +++++++++++++++++----------------
 62_CAD/main.cpp                |   1 +
 3 files changed, 112 insertions(+), 98 deletions(-)

diff --git a/62_CAD/DrawResourcesFiller.cpp b/62_CAD/DrawResourcesFiller.cpp
index 411cb356c..91d3f22ce 100644
--- a/62_CAD/DrawResourcesFiller.cpp
+++ b/62_CAD/DrawResourcesFiller.cpp
@@ -62,7 +62,8 @@ void DrawResourcesFiller::allocateResourcesBuffer(ILogicalDevice* logicalDevice,
 
 		IDeviceMemoryAllocator::SAllocateInfo allocationInfo =
 		{
-			.size = 512 * 1024 * 1024, // 512 MB
+			// TODO: Get from user side.
+			.size = 70 * 1024 * 1024, // 70 MB
 			.flags = IDeviceMemoryAllocation::E_MEMORY_ALLOCATE_FLAGS::EMAF_NONE,
 			.memoryTypeIndex = memoryTypeIdx,
 			.dedication = nullptr,
@@ -372,6 +373,12 @@ uint32_t DrawResourcesFiller::addStaticImage2D(image_id imageID, const core::sma
 		// Prepare wait info to defer index deallocation until the GPU has finished using the resource.
 		// Because we will be writing to the descriptor set location which might be in use.
 		ISemaphore::SWaitInfo deallocationWaitInfo = { .semaphore = intendedNextSubmit.getFutureScratchSemaphore().semaphore, .value = evicted.lastUsedSemaphoreValue };
+		
+		// will later be used to release the image's memory range.
+		core::smart_refctd_ptr<ImageCleanup> cleanupObject = core::make_smart_refctd_ptr<ImageCleanup>();
+		cleanupObject->imagesMemorySuballocator = imagesMemorySubAllocator;
+		cleanupObject->addr = evicted.allocationOffset;
+		cleanupObject->size = evicted.allocationSize;
 
 		const bool imageUsedForNextIntendedSubmit = (evicted.lastUsedSemaphoreValue == intendedNextSubmit.getFutureScratchSemaphore().value);
 
@@ -380,7 +387,7 @@ uint32_t DrawResourcesFiller::addStaticImage2D(image_id imageID, const core::sma
 			// The evicted image is scheduled for use in the upcoming submit.
 			// To avoid rendering artifacts, we must flush the current draw queue now.
 			// After submission, we reset state so that data referencing the evicted slot can be re-uploaded.
-			suballocatedDescriptorSet->multi_deallocate(imagesArrayBinding, 1u, &evicted.index, deallocationWaitInfo);
+			suballocatedDescriptorSet->multi_deallocate(imagesArrayBinding, 1u, &evicted.index, deallocationWaitInfo, &cleanupObject.get());
 			submitDraws(intendedNextSubmit);
 			reset(); // resets everything, things referenced through mainObj and other shit will be pushed again through acquireXXX_SubmitIfNeeded
 		} 
@@ -388,7 +395,7 @@ uint32_t DrawResourcesFiller::addStaticImage2D(image_id imageID, const core::sma
 		{
 			// The image is not used in the current frame (intended next submit), so we can deallocate without submitting any draws.
 			// Still wait on the semaphore to ensure past GPU usage is complete.
-			suballocatedDescriptorSet->multi_deallocate(imagesArrayBinding, 1u, &evicted.index, deallocationWaitInfo);
+			suballocatedDescriptorSet->multi_deallocate(imagesArrayBinding, 1u, &evicted.index, deallocationWaitInfo, &cleanupObject.get());
 		}
 	};
 
@@ -431,13 +438,6 @@ uint32_t DrawResourcesFiller::addStaticImage2D(image_id imageID, const core::sma
 			// we'll evict another texture from the LRU cache and retry until successful, or until only the currently-inserted image remains.
 			while (imagesUsageCache->size() > 0u)
 			{
-				// Pre-create the cleanup object that will later be used to release the image's memory range.
-				// Ownership will be passed to the GPU image, but we retain a temporary raw pointer
-				// so we can configure the cleanup object *after* allocation succeeds.
-				std::unique_ptr<ImageCleanup> cleanupObject = std::make_unique<ImageCleanup>();
-				ImageCleanup* currentImageCleanup = cleanupObject.get();
-				imageParams.postDestroyCleanup = std::move(cleanupObject);
-
 				// Try creating the image and allocating memory for it:
 				auto gpuImage = device->createImage(std::move(imageParams));
 				
@@ -452,14 +452,15 @@ uint32_t DrawResourcesFiller::addStaticImage2D(image_id imageID, const core::sma
 
 					if (imageMemoryRequirementsMatch)
 					{
-						uint64_t allocationOffset = imagesMemorySubAllocator->allocate(gpuImageMemoryRequirements.size, 1u << gpuImageMemoryRequirements.alignmentLog2);
-						const bool allocationFromImagesMemoryArenaSuccessfull = allocationOffset != ImagesMemorySubAllocator::InvalidAddress;
+						inserted->allocationOffset = imagesMemorySubAllocator->allocate(gpuImageMemoryRequirements.size, 1u << gpuImageMemoryRequirements.alignmentLog2);
+						const bool allocationFromImagesMemoryArenaSuccessfull = inserted->allocationOffset != ImagesMemorySubAllocator::InvalidAddress;
 						if (allocationFromImagesMemoryArenaSuccessfull)
 						{
+							inserted->allocationSize = gpuImageMemoryRequirements.size;
 							nbl::video::ILogicalDevice::SBindImageMemoryInfo bindImageMemoryInfo =
 							{
 								.image = gpuImage.get(),
-								.binding = {.memory = imagesMemoryArena.memory.get(), .offset = imagesMemoryArena.offset + allocationOffset }
+								.binding = {.memory = imagesMemoryArena.memory.get(), .offset = imagesMemoryArena.offset + inserted->allocationOffset }
 							};
 							const bool boundToMemorySuccessfully = device->bindImageMemory({ &bindImageMemoryInfo, 1u });
 							if (boundToMemorySuccessfully)
@@ -473,10 +474,7 @@ uint32_t DrawResourcesFiller::addStaticImage2D(image_id imageID, const core::sma
 								gpuImageView = device->createImageView(std::move(viewParams));
 								if (gpuImageView)
 								{
-									// SUCESS!
-									currentImageCleanup->imagesMemorySuballocator = imagesMemorySubAllocator;
-									currentImageCleanup->addr = allocationOffset;
-									currentImageCleanup->size = gpuImageMemoryRequirements.size;
+									// SUCCESS!
 									gpuImageView->setObjectDebugName((std::to_string(imageID) + " Static Image View 2D").c_str());
 								}
 								else
@@ -543,13 +541,23 @@ uint32_t DrawResourcesFiller::addStaticImage2D(image_id imageID, const core::sma
 					.gpuImageView = gpuImageView,
 					.arrayIndex = inserted->index,
 				};
+				
 				staticImagesStagedCopies.push_back(copyToStage);
 			}
 			else
 			{
 				// All attempts to create the GPU image and its corresponding view have failed.
 				// Most likely cause: insufficient GPU memory or unsupported image parameters.
-				// TODO: Log a warning or error here � `addStaticImage2D` failed, likely due to low VRAM.
+				// TODO: Log a warning or error here � `addStaticImage2D` failed, likely due to low VRAM.
+				// assert(false);
+				
+				if (inserted->allocationOffset != ImagesMemorySubAllocator::InvalidAddress)
+				{
+					// We previously successfully create and allocated memory for the Image
+					// but failed to bind and create image view
+					// It's crucial to deallocate the offset+size form our images memory suballocator
+					imagesMemorySubAllocator->deallocate(inserted->allocationOffset, inserted->allocationSize);
+				}
 
 				if (inserted->index != InvalidTextureIndex)
 				{
@@ -568,7 +576,7 @@ uint32_t DrawResourcesFiller::addStaticImage2D(image_id imageID, const core::sma
 		}
 	}
 	
-	assert(inserted->index != InvalidTextureIndex); // shouldn't happen, because we're using LRU cache, so worst case eviction will happen + multi-deallocate and next next multi_allocate should definitely succeed
+	// assert(inserted->index != InvalidTextureIndex); // shouldn't happen, because we're using LRU cache, so worst case eviction will happen + multi-deallocate and next next multi_allocate should definitely succeed
 
 	return inserted->index;
 }
@@ -1647,7 +1655,7 @@ uint32_t DrawResourcesFiller::addMSDFTexture(const MSDFInputInfo& msdfInput, cor
 		//   - Therefore, we can safely overwrite or reallocate the slot without waiting for explicit GPU completion.
 		//
 		// However, this `deallocationWaitInfo` *will* become essential if we start interacting with MSDF images
-		// outside the `intendedNextSubmit` timeline � for example, issuing uploads via a transfer queue or using a separate command buffer and timeline.
+		// outside the `intendedNextSubmit` timeline � for example, issuing uploads via a transfer queue or using a separate command buffer and timeline.
 		ISemaphore::SWaitInfo deallocationWaitInfo = { .semaphore = intendedNextSubmit.getFutureScratchSemaphore().semaphore, .value = evicted.lastUsedSemaphoreValue };
 
 		const bool imageUsedForNextIntendedSubmit = (evicted.lastUsedSemaphoreValue == intendedNextSubmit.getFutureScratchSemaphore().value);
diff --git a/62_CAD/Images.h b/62_CAD/Images.h
index 6e13d19cd..7d9682e36 100644
--- a/62_CAD/Images.h
+++ b/62_CAD/Images.h
@@ -5,25 +5,105 @@ using namespace nbl::core;
 using namespace nbl::asset;
 
 using image_id = uint64_t; // Could later be templated or replaced with a stronger type or hash key.
+
+/**
+ * @class ImagesMemorySubAllocator
+ * @brief A memory sub-allocator designed for managing sub-allocations within a pre-allocated GPU memory arena for images.
+ * 
+ * This class wraps around `nbl::core::GeneralpurposeAddressAllocator` to provide offset-based memory allocation
+ * for image resources within a contiguous block of GPU memory.
+ *
+ * @note This class only manages address offsets. The actual memory must be bound separately.
+ */
+class ImagesMemorySubAllocator : public core::IReferenceCounted 
+{
+public:
+	using AddressAllocator = nbl::core::GeneralpurposeAddressAllocator<uint64_t>;
+	using ReservedAllocator = nbl::core::allocator<uint8_t>;
+	static constexpr uint64_t InvalidAddress = AddressAllocator::invalid_address;
+	static constexpr uint64_t MaxMemoryAlignment = 4096u; // safe choice based on hardware reports
+	static constexpr uint64_t MinAllocSize = 128 * 1024u; // 128KB, the larger this is the better
+
+	ImagesMemorySubAllocator(uint64_t memoryArenaSize)
+	{
+		m_reservedAllocSize = AddressAllocator::reserved_size(MaxMemoryAlignment, memoryArenaSize, MinAllocSize);
+		m_reservedAllocator = std::unique_ptr<ReservedAllocator>(new ReservedAllocator());
+		m_reservedAlloc = m_reservedAllocator->allocate(m_reservedAllocSize, _NBL_SIMD_ALIGNMENT);
+		m_addressAllocator = std::unique_ptr<AddressAllocator>(new AddressAllocator(
+			m_reservedAlloc, 0u, 0u, MaxMemoryAlignment, memoryArenaSize, MinAllocSize
+		));
+	}
+
+	// return offset, will return InvalidAddress if failed
+	uint64_t allocate(uint64_t size, uint64_t alignment)
+	{
+		return m_addressAllocator->alloc_addr(size, alignment);
+	}
+
+	void deallocate(uint64_t addr, uint64_t size)
+	{
+		m_addressAllocator->free_addr(addr, size);
+	}
+
+	~ImagesMemorySubAllocator()
+	{
+		if (m_reservedAlloc)
+			m_reservedAllocator->deallocate(reinterpret_cast<uint8_t*>(m_reservedAlloc), m_reservedAllocSize);
+	}
 	
+private:
+	std::unique_ptr<AddressAllocator> m_addressAllocator = nullptr;
+
+	// Memory Allocation Required for the AddressAllocator
+	std::unique_ptr<ReservedAllocator> m_reservedAllocator = nullptr;
+	void* m_reservedAlloc = nullptr;
+	size_t m_reservedAllocSize = 0;
+
+};
+
+// This will be dropped when the descriptor gets dropped from SuballocatedDescriptorSet.
+// Destructor will then deallocate from GeneralPurposeAllocator, making the previously allocated range of the image available/free again.
+struct ImageCleanup : public core::IReferenceCounted
+{
+	ImageCleanup()
+		: imagesMemorySuballocator(nullptr)
+		, addr(ImagesMemorySubAllocator::InvalidAddress)
+		, size(0ull)
+	{}
+
+	~ImageCleanup() override
+	{
+		if (imagesMemorySuballocator && addr != ImagesMemorySubAllocator::InvalidAddress)
+			imagesMemorySuballocator->deallocate(addr, size);
+	}
+
+	smart_refctd_ptr<ImagesMemorySubAllocator> imagesMemorySuballocator;
+	uint64_t addr;
+	uint64_t size;
+
+};
+
 struct ImageReference
 {
 	static constexpr uint32_t InvalidTextureIndex = nbl::hlsl::numeric_limits<uint32_t>::max;
 	uint32_t index = InvalidTextureIndex; // index in our array of textures binding
 	uint64_t lastUsedSemaphoreValue = 0ull; // last used semaphore value on this image
-	uint64_t memoryUsage = 0ull; // TODO: to be considered later
+	uint64_t allocationOffset = ImagesMemorySubAllocator::InvalidAddress;
+	uint64_t allocationSize = 0ull;
 
 	ImageReference() 
 		: index(InvalidTextureIndex)
 		, lastUsedSemaphoreValue(0ull)
-		, memoryUsage(0ull)
+		, allocationOffset(ImagesMemorySubAllocator::InvalidAddress)
+		, allocationSize(0ull)
 	{}
 	
 	// In LRU Cache `insert` function, in case of cache miss, we need to construct the refereence with semaphore value
 	ImageReference(uint64_t semamphoreVal) 
 		: index(InvalidTextureIndex)
 		, lastUsedSemaphoreValue(semamphoreVal)
-		, memoryUsage(0ull)
+		, allocationOffset(ImagesMemorySubAllocator::InvalidAddress)
+		, allocationSize(0ull)
 	{}
 
 	// In LRU Cache `insert` function, in case of cache hit, we need to assign semaphore value without changing `index`
@@ -91,78 +171,3 @@ class ImagesUsageCache
 	using ImagesLRUCache = core::ResizableLRUCache<image_id, ImageReference>;
 	ImagesLRUCache lruCache; // TODO: for now, work with simple lru cache, later on consider resource usage along with lastUsedSema value
 };
-
-/**
- * @class ImagesMemorySubAllocator
- * @brief A memory sub-allocator designed for managing sub-allocations within a pre-allocated GPU memory arena for images.
- * 
- * This class wraps around `nbl::core::GeneralpurposeAddressAllocator` to provide offset-based memory allocation
- * for image resources within a contiguous block of GPU memory.
- *
- * @note This class only manages address offsets. The actual memory must be bound separately.
- */
-class ImagesMemorySubAllocator : public core::IReferenceCounted 
-{
-public:
-	using AddressAllocator = nbl::core::GeneralpurposeAddressAllocator<uint64_t>;
-	using ReservedAllocator = nbl::core::allocator<uint8_t>;
-	static constexpr uint64_t InvalidAddress = AddressAllocator::invalid_address;
-	static constexpr uint64_t MaxMemoryAlignment = 4096u; // safe choice based on hardware reports
-	static constexpr uint64_t MinAllocSize = 128 * 1024u; // 128KB, the larger this is the better
-
-	ImagesMemorySubAllocator(uint64_t memoryArenaSize)
-	{
-		m_reservedAllocSize = AddressAllocator::reserved_size(MaxMemoryAlignment, memoryArenaSize, MinAllocSize);
-		m_reservedAllocator = std::unique_ptr<ReservedAllocator>(new ReservedAllocator());
-		m_reservedAlloc = m_reservedAllocator->allocate(m_reservedAllocSize, _NBL_SIMD_ALIGNMENT);
-		m_addressAllocator = std::unique_ptr<AddressAllocator>(new AddressAllocator(
-			m_reservedAlloc, 0u, 0u, MaxMemoryAlignment, memoryArenaSize, MinAllocSize
-		));
-	}
-
-	// return offset, will return InvalidAddress if failed
-	uint64_t allocate(uint64_t size, uint64_t alignment)
-	{
-		return m_addressAllocator->alloc_addr(size, alignment);
-	}
-
-	void deallocate(uint64_t addr, uint64_t size)
-	{
-		m_addressAllocator->free_addr(addr, size);
-	}
-
-	~ImagesMemorySubAllocator()
-	{
-		if (m_reservedAlloc)
-			m_reservedAllocator->deallocate(reinterpret_cast<uint8_t*>(m_reservedAlloc), m_reservedAllocSize);
-	}
-	
-private:
-	std::unique_ptr<AddressAllocator> m_addressAllocator = nullptr;
-
-	// Memory Allocation Required for the AddressAllocator
-	std::unique_ptr<ReservedAllocator> m_reservedAllocator = nullptr;
-	void* m_reservedAlloc = nullptr;
-	size_t m_reservedAllocSize = 0;
-
-};
-
-struct ImageCleanup : nbl::video::ICleanup
-{
-	ImageCleanup()
-		: imagesMemorySuballocator(nullptr)
-		, addr(ImagesMemorySubAllocator::InvalidAddress)
-		, size(0ull)
-	{}
-
-	~ImageCleanup() override
-	{
-		if (imagesMemorySuballocator && addr != ImagesMemorySubAllocator::InvalidAddress)
-			imagesMemorySuballocator->deallocate(addr, size);
-	}
-
-	smart_refctd_ptr<ImagesMemorySubAllocator> imagesMemorySuballocator;
-	uint64_t addr;
-	uint64_t size;
-
-};
\ No newline at end of file
diff --git a/62_CAD/main.cpp b/62_CAD/main.cpp
index 016571fa8..b0e4c8d05 100644
--- a/62_CAD/main.cpp
+++ b/62_CAD/main.cpp
@@ -1517,6 +1517,7 @@ class ComputerAidedDesign final : public examples::SimpleWindowedApplication, pu
 	{
 		auto retval = device_base_t::getRequiredDeviceFeatures();
 		retval.fragmentShaderPixelInterlock = FragmentShaderPixelInterlock;
+		retval.nullDescriptor = true;
 		return retval;
 	}
 

From 749658f2027632a73ce1ee9a07f6abe51ae1c0f0 Mon Sep 17 00:00:00 2001
From: keptsecret <sorchon@gmail.com>
Date: Thu, 15 May 2025 15:55:34 +0700
Subject: [PATCH 253/529] manage workgroup in example

---
 .../app_resources/testWorkgroup.comp.hlsl                   | 6 ++++--
 1 file changed, 4 insertions(+), 2 deletions(-)

diff --git a/23_Arithmetic2UnitTest/app_resources/testWorkgroup.comp.hlsl b/23_Arithmetic2UnitTest/app_resources/testWorkgroup.comp.hlsl
index 7f1b5dcbe..eb7d8e936 100644
--- a/23_Arithmetic2UnitTest/app_resources/testWorkgroup.comp.hlsl
+++ b/23_Arithmetic2UnitTest/app_resources/testWorkgroup.comp.hlsl
@@ -10,11 +10,13 @@ struct DataProxy
 
     void get(const uint32_t ix, NBL_REF_ARG(dtype_t) value)
     {
-        value = inputValue[ix];
+        const uint32_t workgroupOffset = nbl::hlsl::glsl::gl_WorkGroupID().x * Config::VirtualWorkgroupSize;
+        value = inputValue[workgroupOffset + ix];
     }
     void set(const uint32_t ix, const dtype_t value)
     {
-        output[Binop::BindingIndex].template Store<type_t>(sizeof(uint32_t) + sizeof(type_t) * ix, value);
+        const uint32_t workgroupOffset = nbl::hlsl::glsl::gl_WorkGroupID().x * Config::VirtualWorkgroupSize;
+        output[Binop::BindingIndex].template Store<type_t>(sizeof(uint32_t) + sizeof(type_t) * (workgroupOffset+ix), value);
     }
 
     void workgroupExecutionAndMemoryBarrier()

From 1de31ddfd725009bd650f1fe80f1c4a8c2e6a14a Mon Sep 17 00:00:00 2001
From: keptsecret <sorchon@gmail.com>
Date: Thu, 15 May 2025 15:57:04 +0700
Subject: [PATCH 254/529] moved benchmark to ex 29

---
 {74_Arithmetic2Bench => 29_Arithmetic2Bench}/CMakeLists.txt    | 0
 .../app_resources/benchmarkSubgroup.comp.hlsl                  | 0
 .../app_resources/benchmarkWorkgroup.comp.hlsl                 | 0
 .../app_resources/common.hlsl                                  | 0
 .../app_resources/shaderCommon.hlsl                            | 0
 .../app_resources/workgroupCommon.hlsl                         | 0
 .../config.json.template                                       | 0
 {74_Arithmetic2Bench => 29_Arithmetic2Bench}/main.cpp          | 0
 {74_Arithmetic2Bench => 29_Arithmetic2Bench}/pipeline.groovy   | 0
 CMakeLists.txt                                                 | 3 +--
 10 files changed, 1 insertion(+), 2 deletions(-)
 rename {74_Arithmetic2Bench => 29_Arithmetic2Bench}/CMakeLists.txt (100%)
 rename {74_Arithmetic2Bench => 29_Arithmetic2Bench}/app_resources/benchmarkSubgroup.comp.hlsl (100%)
 rename {74_Arithmetic2Bench => 29_Arithmetic2Bench}/app_resources/benchmarkWorkgroup.comp.hlsl (100%)
 rename {74_Arithmetic2Bench => 29_Arithmetic2Bench}/app_resources/common.hlsl (100%)
 rename {74_Arithmetic2Bench => 29_Arithmetic2Bench}/app_resources/shaderCommon.hlsl (100%)
 rename {74_Arithmetic2Bench => 29_Arithmetic2Bench}/app_resources/workgroupCommon.hlsl (100%)
 rename {74_Arithmetic2Bench => 29_Arithmetic2Bench}/config.json.template (100%)
 rename {74_Arithmetic2Bench => 29_Arithmetic2Bench}/main.cpp (100%)
 rename {74_Arithmetic2Bench => 29_Arithmetic2Bench}/pipeline.groovy (100%)

diff --git a/74_Arithmetic2Bench/CMakeLists.txt b/29_Arithmetic2Bench/CMakeLists.txt
similarity index 100%
rename from 74_Arithmetic2Bench/CMakeLists.txt
rename to 29_Arithmetic2Bench/CMakeLists.txt
diff --git a/74_Arithmetic2Bench/app_resources/benchmarkSubgroup.comp.hlsl b/29_Arithmetic2Bench/app_resources/benchmarkSubgroup.comp.hlsl
similarity index 100%
rename from 74_Arithmetic2Bench/app_resources/benchmarkSubgroup.comp.hlsl
rename to 29_Arithmetic2Bench/app_resources/benchmarkSubgroup.comp.hlsl
diff --git a/74_Arithmetic2Bench/app_resources/benchmarkWorkgroup.comp.hlsl b/29_Arithmetic2Bench/app_resources/benchmarkWorkgroup.comp.hlsl
similarity index 100%
rename from 74_Arithmetic2Bench/app_resources/benchmarkWorkgroup.comp.hlsl
rename to 29_Arithmetic2Bench/app_resources/benchmarkWorkgroup.comp.hlsl
diff --git a/74_Arithmetic2Bench/app_resources/common.hlsl b/29_Arithmetic2Bench/app_resources/common.hlsl
similarity index 100%
rename from 74_Arithmetic2Bench/app_resources/common.hlsl
rename to 29_Arithmetic2Bench/app_resources/common.hlsl
diff --git a/74_Arithmetic2Bench/app_resources/shaderCommon.hlsl b/29_Arithmetic2Bench/app_resources/shaderCommon.hlsl
similarity index 100%
rename from 74_Arithmetic2Bench/app_resources/shaderCommon.hlsl
rename to 29_Arithmetic2Bench/app_resources/shaderCommon.hlsl
diff --git a/74_Arithmetic2Bench/app_resources/workgroupCommon.hlsl b/29_Arithmetic2Bench/app_resources/workgroupCommon.hlsl
similarity index 100%
rename from 74_Arithmetic2Bench/app_resources/workgroupCommon.hlsl
rename to 29_Arithmetic2Bench/app_resources/workgroupCommon.hlsl
diff --git a/74_Arithmetic2Bench/config.json.template b/29_Arithmetic2Bench/config.json.template
similarity index 100%
rename from 74_Arithmetic2Bench/config.json.template
rename to 29_Arithmetic2Bench/config.json.template
diff --git a/74_Arithmetic2Bench/main.cpp b/29_Arithmetic2Bench/main.cpp
similarity index 100%
rename from 74_Arithmetic2Bench/main.cpp
rename to 29_Arithmetic2Bench/main.cpp
diff --git a/74_Arithmetic2Bench/pipeline.groovy b/29_Arithmetic2Bench/pipeline.groovy
similarity index 100%
rename from 74_Arithmetic2Bench/pipeline.groovy
rename to 29_Arithmetic2Bench/pipeline.groovy
diff --git a/CMakeLists.txt b/CMakeLists.txt
index ed3992203..31ebaddf9 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -64,6 +64,7 @@ if(NBL_BUILD_EXAMPLES)
 	add_subdirectory(26_Blur EXCLUDE_FROM_ALL)
 	add_subdirectory(27_MPMCScheduler EXCLUDE_FROM_ALL)	
 	add_subdirectory(28_FFTBloom EXCLUDE_FROM_ALL)
+	add_subdirectory(29_Arithmetic2Bench EXCLUDE_FROM_ALL)
 	# add_subdirectory(36_CUDAInterop EXCLUDE_FROM_ALL)
 
 	# Showcase compute pathtracing
@@ -91,7 +92,5 @@ if(NBL_BUILD_EXAMPLES)
   	add_subdirectory(70_FLIPFluids EXCLUDE_FROM_ALL)
 	add_subdirectory(71_RayTracingPipeline EXCLUDE_FROM_ALL)
 
-	add_subdirectory(74_Arithmetic2Bench EXCLUDE_FROM_ALL)
-
 	NBL_HOOK_COMMON_API("${NBL_COMMON_API_TARGETS}")
 endif()

From fc20a2940689494cc79c3525fb16e578855199bf Mon Sep 17 00:00:00 2001
From: Przemek <minikers21@gmail.com>
Date: Thu, 15 May 2025 14:10:28 +0200
Subject: [PATCH 255/529] Grid DTM fundamentals

---
 62_CAD/DrawResourcesFiller.cpp                | 60 +++++++++++++-
 62_CAD/DrawResourcesFiller.h                  |  9 ++
 62_CAD/main.cpp                               | 82 ++++++++++++++++++-
 62_CAD/shaders/globals.hlsl                   | 21 +++--
 62_CAD/shaders/main_pipeline/common.hlsl      | 13 ++-
 .../main_pipeline/fragment_shader.hlsl        | 19 +++--
 .../shaders/main_pipeline/vertex_shader.hlsl  | 28 ++++++-
 7 files changed, 214 insertions(+), 18 deletions(-)

diff --git a/62_CAD/DrawResourcesFiller.cpp b/62_CAD/DrawResourcesFiller.cpp
index c83055f0e..cb6e79352 100644
--- a/62_CAD/DrawResourcesFiller.cpp
+++ b/62_CAD/DrawResourcesFiller.cpp
@@ -311,6 +311,27 @@ void DrawResourcesFiller::drawFontGlyph(
 
 // TODO[Przemek]: similar to other drawXXX and drawXXX_internal functions that create mainobjects, drawObjects and push additional info in geometry buffer, input to function would be a GridDTMInfo
 // We don't have an allocator or memory management for texture updates yet, see how `_test_addImageObject` is being temporarily used (Descriptor updates and pipeline barriers) to upload an image into gpu and update a descriptor slot (it will become more sophisticated but doesn't block you)
+void DrawResourcesFiller::drawGridDTM(
+	const float64_t2& topLeft,
+	float64_t height,
+	float64_t width,
+	const DTMSettingsInfo& dtmSettingsInfo,
+	SIntendedSubmitInfo& intendedNextSubmit)
+{
+	GridDTMInfo gridDTMInfo;
+	gridDTMInfo.topLeft = topLeft;
+	gridDTMInfo.height = height;
+	gridDTMInfo.width = width;
+
+	beginMainObject(MainObjectType::GRID_DTM);
+
+	uint32_t mainObjectIdx = acquireActiveMainObjectIndex_SubmitIfNeeded(intendedNextSubmit);
+	assert(mainObjectIdx != InvalidMainObjectIdx);
+
+	addGridDTM_Internal(gridDTMInfo, mainObjectIdx);
+
+	endMainObject();
+}
 
 void DrawResourcesFiller::_test_addImageObject(float64_t2 topLeftPos, float32_t2 size, float32_t rotation, SIntendedSubmitInfo& intendedNextSubmit)
 {
@@ -827,7 +848,7 @@ uint32_t DrawResourcesFiller::acquireActiveMainObjectIndex_SubmitIfNeeded(SInten
 		(activeMainObjectType == MainObjectType::POLYLINE) ||
 		(activeMainObjectType == MainObjectType::HATCH) ||
 		(activeMainObjectType == MainObjectType::TEXT);
-	const bool needsDTMSettings = (activeMainObjectType == MainObjectType::DTM);
+	const bool needsDTMSettings = (activeMainObjectType == MainObjectType::DTM || activeMainObjectType == MainObjectType::GRID_DTM);
 	const bool needsCustomProjection = (!activeProjectionIndices.empty());
 	const bool needsCustomClipRect = (!activeClipRectIndices.empty());
 
@@ -1188,6 +1209,43 @@ bool DrawResourcesFiller::addFontGlyph_Internal(const GlyphInfo& glyphInfo, uint
 	return true;
 }
 
+bool DrawResourcesFiller::addGridDTM_Internal(const GridDTMInfo& gridDTMInfo, uint32_t mainObjIdx)
+{
+	const size_t remainingResourcesSize = calculateRemainingResourcesSize();
+
+	const uint32_t uploadableObjects = (remainingResourcesSize) / (sizeof(GridDTMInfo) + sizeof(DrawObject) + sizeof(uint32_t) * 6u);
+	// TODO[ERFAN]: later take into account: our maximum indexable vertex 
+
+	if (uploadableObjects <= 0u)
+		return false;
+
+	// Add Geometry
+	size_t geometryBufferOffset = resourcesCollection.geometryInfo.increaseSizeAndGetOffset(sizeof(GridDTMInfo), alignof(GridDTMInfo));
+	void* dst = resourcesCollection.geometryInfo.data() + geometryBufferOffset;
+	memcpy(dst, &gridDTMInfo, sizeof(GridDTMInfo));
+
+	// Push Indices, remove later when compute fills this
+	uint32_t* indexBufferToBeFilled = resourcesCollection.indexBuffer.increaseCountAndGetPtr(6u);
+	const uint32_t startObj = resourcesCollection.drawObjects.getCount();
+	uint32_t i = 0u;
+	indexBufferToBeFilled[i * 6] = (startObj + i) * 4u + 1u;
+	indexBufferToBeFilled[i * 6 + 1u] = (startObj + i) * 4u + 0u;
+	indexBufferToBeFilled[i * 6 + 2u] = (startObj + i) * 4u + 2u;
+	indexBufferToBeFilled[i * 6 + 3u] = (startObj + i) * 4u + 1u;
+	indexBufferToBeFilled[i * 6 + 4u] = (startObj + i) * 4u + 2u;
+	indexBufferToBeFilled[i * 6 + 5u] = (startObj + i) * 4u + 3u;
+
+	// Add DrawObjs
+	DrawObject* drawObjectsToBeFilled = resourcesCollection.drawObjects.increaseCountAndGetPtr(1u);
+	DrawObject drawObj = {};
+	drawObj.mainObjIndex = mainObjIdx;
+	drawObj.type_subsectionIdx = uint32_t(static_cast<uint16_t>(ObjectType::GRID_DTM) | (0 << 16));
+	//drawObj.geometryAddress = 0;
+	drawObjectsToBeFilled[0u] = drawObj;
+
+	return true;
+}
+
 void DrawResourcesFiller::setGlyphMSDFTextureFunction(const GetGlyphMSDFTextureFunc& func)
 {
 	getGlyphMSDF = func;
diff --git a/62_CAD/DrawResourcesFiller.h b/62_CAD/DrawResourcesFiller.h
index a10379e1a..15c4ad9a8 100644
--- a/62_CAD/DrawResourcesFiller.h
+++ b/62_CAD/DrawResourcesFiller.h
@@ -199,6 +199,12 @@ struct DrawResourcesFiller
 		float32_t  aspectRatio,
 		float32_t2 minUV,
 		SIntendedSubmitInfo& intendedNextSubmit);
+
+	void drawGridDTM(const float64_t2& topLeft,
+		float64_t height,
+		float64_t width,
+		const DTMSettingsInfo& dtmSettingsInfo,
+		SIntendedSubmitInfo& intendedNextSubmit);
 	
 	void _test_addImageObject(
 		float64_t2 topLeftPos,
@@ -416,6 +422,9 @@ struct DrawResourcesFiller
 	/// Attempts to upload a single GlyphInfo considering resource limitations
 	bool addFontGlyph_Internal(const GlyphInfo& glyphInfo, uint32_t mainObjIdx);
 	
+	/// Attempts to upload a single GridDTMInfo considering resource limitations
+	bool addGridDTM_Internal(const GridDTMInfo& gridDTMInfo, uint32_t mainObjIdx);
+
 	void resetMainObjects()
 	{
 		resourcesCollection.mainObjects.vector.clear();
diff --git a/62_CAD/main.cpp b/62_CAD/main.cpp
index c7fe04603..842712632 100644
--- a/62_CAD/main.cpp
+++ b/62_CAD/main.cpp
@@ -60,6 +60,7 @@ enum class ExampleMode
 	CASE_8, // MSDF and Text
 	CASE_9, // DTM
 	CASE_BUG, // Bug Repro, after fix, rename to CASE_10 and comment should be: testing fixed geometry and emulated fp64 corner cases
+	CASE_11, // grid DTM
 	CASE_COUNT
 };
 
@@ -75,10 +76,11 @@ constexpr std::array<float, (uint32_t)ExampleMode::CASE_COUNT> cameraExtents =
 	10.0,	// CASE_7
 	600.0,	// CASE_8
 	600.0,	// CASE_9
-	10.0	// CASE_BUG
+	10.0,	// CASE_BUG
+	600.0	// CASE_11
 };
 
-constexpr ExampleMode mode = ExampleMode::CASE_9;
+constexpr ExampleMode mode = ExampleMode::CASE_11;
 
 class Camera2D
 {
@@ -3446,6 +3448,82 @@ class ComputerAidedDesign final : public examples::SimpleWindowedApplication, pu
 				drawResourcesFiller.drawFixedGeometryPolyline(polyline, style, transformation, TransformationType::TT_FIXED_SCREENSPACE_SIZE, intendedNextSubmit);
 			}
 		}
+		else if (mode == ExampleMode::CASE_11)
+		{
+			DTMSettingsInfo dtmInfo{};
+			//dtmInfo.mode |= E_DTM_MODE::OUTLINE;
+			dtmInfo.mode |= E_DTM_MODE::HEIGHT_SHADING;
+			dtmInfo.mode |= E_DTM_MODE::CONTOUR;
+
+			dtmInfo.outlineStyleInfo.screenSpaceLineWidth = 0.0f;
+			dtmInfo.outlineStyleInfo.worldSpaceLineWidth = 1.0f;
+			dtmInfo.outlineStyleInfo.color = float32_t4(0.0f, 0.39f, 0.0f, 1.0f);
+			std::array<double, 4> outlineStipplePattern = { 0.0f, -5.0f, 20.0f, -5.0f };
+			dtmInfo.outlineStyleInfo.setStipplePatternData(outlineStipplePattern);
+
+			dtmInfo.contourSettingsCount = 2u;
+			dtmInfo.contourSettings[0u].startHeight = 20;
+			dtmInfo.contourSettings[0u].endHeight = 90;
+			dtmInfo.contourSettings[0u].heightInterval = 10;
+			dtmInfo.contourSettings[0u].lineStyleInfo.screenSpaceLineWidth = 0.0f;
+			dtmInfo.contourSettings[0u].lineStyleInfo.worldSpaceLineWidth = 1.0f;
+			dtmInfo.contourSettings[0u].lineStyleInfo.color = float32_t4(0.0f, 0.0f, 1.0f, 0.7f);
+			std::array<double, 4> contourStipplePattern = { 0.0f, -5.0f, 10.0f, -5.0f };
+			dtmInfo.contourSettings[0u].lineStyleInfo.setStipplePatternData(contourStipplePattern);
+
+			dtmInfo.contourSettings[1u] = dtmInfo.contourSettings[0u];
+			dtmInfo.contourSettings[1u].startHeight += 5.0f;
+			dtmInfo.contourSettings[1u].heightInterval = 13.0f;
+			dtmInfo.contourSettings[1u].lineStyleInfo.color = float32_t4(0.8f, 0.4f, 0.3f, 1.0f);
+
+			// PRESS 1, 2, 3 TO SWITCH HEIGHT SHADING MODE
+			// 1 - DISCRETE_VARIABLE_LENGTH_INTERVALS
+			// 2 - DISCRETE_FIXED_LENGTH_INTERVALS
+			// 3 - CONTINOUS_INTERVALS
+			float animatedAlpha = (std::cos(m_timeElapsed * 0.0005) + 1.0) * 0.5;
+			switch (m_shadingModeExample)
+			{
+				case E_HEIGHT_SHADING_MODE::DISCRETE_VARIABLE_LENGTH_INTERVALS:
+				{
+					dtmInfo.heightShadingInfo.heightShadingMode = E_HEIGHT_SHADING_MODE::DISCRETE_VARIABLE_LENGTH_INTERVALS;
+
+					dtmInfo.heightShadingInfo.addHeightColorMapEntry(-10.0f, float32_t4(0.5f, 1.0f, 1.0f, 1.0f));
+					dtmInfo.heightShadingInfo.addHeightColorMapEntry(20.0f, float32_t4(0.0f, 1.0f, 0.0f, 1.0f));
+					dtmInfo.heightShadingInfo.addHeightColorMapEntry(25.0f, float32_t4(1.0f, 1.0f, 0.0f, animatedAlpha));
+					dtmInfo.heightShadingInfo.addHeightColorMapEntry(70.0f, float32_t4(1.0f, 0.0f, 0.0f, 1.0f));
+					dtmInfo.heightShadingInfo.addHeightColorMapEntry(90.0f, float32_t4(1.0f, 0.0f, 0.0f, 1.0f));
+
+					break;
+				}
+				case E_HEIGHT_SHADING_MODE::DISCRETE_FIXED_LENGTH_INTERVALS:
+				{
+					dtmInfo.heightShadingInfo.intervalLength = 10.0f;
+					dtmInfo.heightShadingInfo.intervalIndexToHeightMultiplier = dtmInfo.heightShadingInfo.intervalLength;
+					dtmInfo.heightShadingInfo.isCenteredShading = false;
+					dtmInfo.heightShadingInfo.heightShadingMode = E_HEIGHT_SHADING_MODE::DISCRETE_FIXED_LENGTH_INTERVALS;
+					dtmInfo.heightShadingInfo.addHeightColorMapEntry(0.0f, float32_t4(0.0f, 0.0f, 1.0f, animatedAlpha));
+					dtmInfo.heightShadingInfo.addHeightColorMapEntry(25.0f, float32_t4(0.0f, 1.0f, 1.0f, animatedAlpha));
+					dtmInfo.heightShadingInfo.addHeightColorMapEntry(50.0f, float32_t4(0.0f, 1.0f, 0.0f, animatedAlpha));
+					dtmInfo.heightShadingInfo.addHeightColorMapEntry(75.0f, float32_t4(1.0f, 1.0f, 0.0f, animatedAlpha));
+					dtmInfo.heightShadingInfo.addHeightColorMapEntry(100.0f, float32_t4(1.0f, 0.0f, 0.0f, animatedAlpha));
+
+					break;
+				}
+				case E_HEIGHT_SHADING_MODE::CONTINOUS_INTERVALS:
+				{
+					dtmInfo.heightShadingInfo.heightShadingMode = E_HEIGHT_SHADING_MODE::CONTINOUS_INTERVALS;
+					dtmInfo.heightShadingInfo.addHeightColorMapEntry(0.0f, float32_t4(0.0f, 0.0f, 1.0f, animatedAlpha));
+					dtmInfo.heightShadingInfo.addHeightColorMapEntry(25.0f, float32_t4(0.0f, 1.0f, 1.0f, animatedAlpha));
+					dtmInfo.heightShadingInfo.addHeightColorMapEntry(50.0f, float32_t4(0.0f, 1.0f, 0.0f, animatedAlpha));
+					dtmInfo.heightShadingInfo.addHeightColorMapEntry(75.0f, float32_t4(1.0f, 1.0f, 0.0f, animatedAlpha));
+					dtmInfo.heightShadingInfo.addHeightColorMapEntry(90.0f, float32_t4(1.0f, 0.0f, 0.0f, animatedAlpha));
+
+					break;
+				}
+			}
+
+			drawResourcesFiller.drawGridDTM({ 0.0f, 200.0f }, 400.0f, 800.0f, dtmInfo, intendedNextSubmit);
+		}
 	}
 
 	double getScreenToWorldRatio(const float64_t3x3& viewProjectionMatrix, uint32_t2 windowSize)
diff --git a/62_CAD/shaders/globals.hlsl b/62_CAD/shaders/globals.hlsl
index b565ff4ff..acbd55d40 100644
--- a/62_CAD/shaders/globals.hlsl
+++ b/62_CAD/shaders/globals.hlsl
@@ -120,6 +120,7 @@ enum class MainObjectType : uint32_t
     TEXT,
     IMAGE,
     DTM,
+    GRID_DTM
 };
 
 enum class ObjectType : uint32_t
@@ -130,7 +131,8 @@ enum class ObjectType : uint32_t
     POLYLINE_CONNECTOR = 3u,
     FONT_GLYPH = 4u,
     IMAGE = 5u,
-    TRIANGLE_MESH = 6u
+    TRIANGLE_MESH = 6u,
+    GRID_DTM = 7u
 };
 
 enum class MajorAxis : uint32_t
@@ -232,16 +234,23 @@ struct GlyphInfo
 // Goes into geometry buffer, needs to be aligned by 8
 struct ImageObjectInfo
 {
-    pfloat64_t2  topLeft; // 2 * 8 = 16 bytes (16)
+    pfloat64_t2 topLeft; // 2 * 8 = 16 bytes (16)
     float32_t2 dirU; // 2 * 4 = 8 bytes (24)
     float32_t aspectRatio; // 4 bytes (28)
     uint32_t textureID; // 4 bytes (32)
 };
 
-/*
-GRID DTM Info similar to `ImageObjectInfo`
-other than textureID, there will be dtmSettingsIdx referencing a dtmSettings
-*/
+// Goes into geometry buffer, needs to be aligned by 8
+struct GridDTMInfo
+{
+    pfloat64_t2 topLeft; // 2 * 8 = 16 bytes (16)
+    pfloat64_t height; // 8 bytes (24)
+    pfloat64_t width; // 8 bytes (32)
+    uint32_t textureID; // 4 bytes (36)
+    uint32_t dtmInfoID; // 4 bytes (40)
+    float gridCellWidth; // 4 bytes (44)
+    float _padding; // 4 bytes (48)
+};
 
 static uint32_t packR11G11B10_UNORM(float32_t3 color)
 {
diff --git a/62_CAD/shaders/main_pipeline/common.hlsl b/62_CAD/shaders/main_pipeline/common.hlsl
index 631e421b9..e348ca0c2 100644
--- a/62_CAD/shaders/main_pipeline/common.hlsl
+++ b/62_CAD/shaders/main_pipeline/common.hlsl
@@ -229,7 +229,18 @@ struct PSInput
     void setScreenSpaceVertexAttribs(float3 pos) { vertexScreenSpacePos = pos; }
 #else // fragment shader
     float3 getScreenSpaceVertexAttribs(uint32_t vertexIndex) { return vertexScreenSpacePos[vertexIndex]; }
-#endif 
+#endif
+
+    /* GRID DTM */
+    uint getHeightMapTextureID() { return data1.x; }
+    uint getDTMSettingsID() { return data1.y; }
+    float getGridDTMScreenSpaceCellWidth() { return data2.x; }
+    float2 getGridDTMScreenSpacePosition() { return interp_data5.zw; }
+
+    void setHeightMapTextureID(uint heightMapTextureID) { data1.x = heightMapTextureID; }
+    void setDTMSettingsID(uint dtmSettingsID) { data1.y = dtmSettingsID; }
+    void setGridDTMScreenSpaceCellWidth(float screenSpaceGridWidth) { data2.x = screenSpaceGridWidth; }
+    void setGridDTMScreenSpacePosition(float2 screenSpacePosition) { interp_data5.zw = screenSpacePosition; }
 };
 
 // Set 0 - Scene Data and Globals, buffer bindings don't change the buffers only get updated
diff --git a/62_CAD/shaders/main_pipeline/fragment_shader.hlsl b/62_CAD/shaders/main_pipeline/fragment_shader.hlsl
index 6475faeff..6a3953bf2 100644
--- a/62_CAD/shaders/main_pipeline/fragment_shader.hlsl
+++ b/62_CAD/shaders/main_pipeline/fragment_shader.hlsl
@@ -387,7 +387,7 @@ float4 fragMain(PSInput input) : SV_TARGET
                 localAlpha = colorSample.a;
             }
         }
-        // objType GRID_DTM here
+        else if (objType == ObjectType::GRID_DTM)
         {
             // NOTE: create and read from a texture as a last step, you can generate the height values procedurally from a function while you're working on the sdf stuff.
             
@@ -402,6 +402,16 @@ float4 fragMain(PSInput input) : SV_TARGET
             
             // TODO: we need to emulate dilation and do sdf of neighbouring cells as well. because contours, outlines and shading can bleed into other cells for AA.
             // [NOTE] Do dilation as last step, when everything else works fine
+
+            textureColor = float4(1.0f, 1.0f, 1.0f, 1.0f);
+            float2 uv = input.getImageUV();
+            float scalar = uv.x * uv.x * 0.25f + uv.y * uv.y * 0.25f;
+            textureColor *= scalar;
+            localAlpha = 1.0f;
+
+            //return outputColor;
+            printf("uv = %f, %f", uv.x, uv.y);
+
         }
         
 
@@ -410,11 +420,8 @@ float4 fragMain(PSInput input) : SV_TARGET
         if (localAlpha <= 0)
             discard;
         
-        const bool colorFromTexture = objType == ObjectType::IMAGE;
-        
-        // TODO[Przemek]: But make sure you're still calling this, correctly calculating alpha and texture color.
-        // you can add 1 main object and push via DrawResourcesFiller like we already do for other objects (this go in the mainObjects StorageBuffer) and then set the currentMainObjectIdx to 0 here
-        // having 1 main object temporarily means that all triangle meshes will be treated as a unified object in blending operations. 
+        const bool colorFromTexture = objType == ObjectType::IMAGE || objType == ObjectType::GRID_DTM;
+
         return calculateFinalColor<DeviceConfigCaps::fragmentShaderPixelInterlock>(fragCoord, localAlpha, currentMainObjectIdx, textureColor, colorFromTexture);
     }
 }
diff --git a/62_CAD/shaders/main_pipeline/vertex_shader.hlsl b/62_CAD/shaders/main_pipeline/vertex_shader.hlsl
index 73225e3c0..4b55c1e30 100644
--- a/62_CAD/shaders/main_pipeline/vertex_shader.hlsl
+++ b/62_CAD/shaders/main_pipeline/vertex_shader.hlsl
@@ -119,7 +119,7 @@ PSInput main(uint vertexID : SV_VertexID)
     outV.data3 = float4(0, 0, 0, 0);
     outV.data4 = float4(0, 0, 0, 0);
     outV.interp_data5 = float2(0, 0);
-    
+
     if (pc.isDTMRendering)
     {
         outV.setObjType(ObjectType::TRIANGLE_MESH);
@@ -645,7 +645,31 @@ PSInput main(uint vertexID : SV_VertexID)
             outV.setImageUV(uv);
             outV.setImageTextureId(textureID);
         }
-        // TODO: Przemek objType GRID_DTM, Similar transformations to IMAGE
+        else if (objType == ObjectType::GRID_DTM)
+        {
+            pfloat64_t2 topLeft = vk::RawBufferLoad<pfloat64_t2>(globals.pointers.geometryBuffer + drawObj.geometryAddress, 8u);
+            pfloat64_t height = vk::RawBufferLoad<pfloat64_t>(globals.pointers.geometryBuffer + drawObj.geometryAddress + sizeof(pfloat64_t2), 8u);
+            pfloat64_t width = vk::RawBufferLoad<pfloat64_t>(globals.pointers.geometryBuffer + drawObj.geometryAddress + sizeof(pfloat64_t2) + sizeof(pfloat64_t), 8u);
+            uint32_t textureID = vk::RawBufferLoad<uint32_t>(globals.pointers.geometryBuffer + drawObj.geometryAddress + sizeof(pfloat64_t2) + 2 * sizeof(pfloat64_t), 8u);
+            uint32_t dtmSettingsID = vk::RawBufferLoad<uint32_t>(globals.pointers.geometryBuffer + drawObj.geometryAddress + sizeof(pfloat64_t2) + 2 * sizeof(pfloat64_t) + sizeof(uint32_t), 8u);
+            float gridCellWidth = vk::RawBufferLoad<float>(globals.pointers.geometryBuffer + drawObj.geometryAddress + sizeof(pfloat64_t2) + 2 * sizeof(pfloat64_t) + 2 * sizeof(uint32_t), 8u);
+
+            const float2 corner = float2(bool2(vertexIdx & 0x1u, vertexIdx >> 1));
+            pfloat64_t2 vtxPos = topLeft;
+            if (corner.x)
+                vtxPos.x = vtxPos.x + width;
+            if (corner.y)
+                vtxPos.y = vtxPos.y - height;
+
+            float2 ndcVtxPos = _static_cast<float2>(transformPointNdc(clipProjectionData.projectionToNDC, vtxPos));
+            outV.position = float4(ndcVtxPos, 0.0f, 1.0f);
+
+            outV.setHeightMapTextureID(textureID);
+            outV.setDTMSettingsID(dtmSettingsID);
+            outV.setGridDTMScreenSpaceCellWidth(gridCellWidth); // TODO: is input world space?
+            outV.setGridDTMScreenSpacePosition(transformPointScreenSpace(clipProjectionData.projectionToNDC, globals.resolution, transformPointScreenSpace(clipProjectionData.projectionToNDC, globals.resolution, vtxPos)));
+            outV.setImageUV(corner);
+        }
 
     // Make the cage fullscreen for testing: 
 #if 0

From 0b010ea5503ab5f5460183d48d6aefb806c03baf Mon Sep 17 00:00:00 2001
From: Przemek <minikers21@gmail.com>
Date: Thu, 15 May 2025 15:17:09 +0200
Subject: [PATCH 256/529] Updated media submodule

---
 media | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/media b/media
index 68dbe85b9..a98646358 160000
--- a/media
+++ b/media
@@ -1 +1 @@
-Subproject commit 68dbe85b9849c9b094760428a3639f5c8917d85e
+Subproject commit a9864635879e5a616ac400eecd8b6451b498fbf1

From ab6dd2e3bbca153072f6cec0c84117429599e789 Mon Sep 17 00:00:00 2001
From: Przemek <minikers21@gmail.com>
Date: Thu, 15 May 2025 15:48:07 +0200
Subject: [PATCH 257/529] Corrections

---
 62_CAD/shaders/main_pipeline/common.hlsl        | 2 +-
 62_CAD/shaders/main_pipeline/vertex_shader.hlsl | 4 ++--
 2 files changed, 3 insertions(+), 3 deletions(-)

diff --git a/62_CAD/shaders/main_pipeline/common.hlsl b/62_CAD/shaders/main_pipeline/common.hlsl
index 23dc7b421..ccc30b1b8 100644
--- a/62_CAD/shaders/main_pipeline/common.hlsl
+++ b/62_CAD/shaders/main_pipeline/common.hlsl
@@ -82,7 +82,7 @@ struct PSInput
     [[vk::location(2)]] nointerpolation float4 data3 : COLOR3;
     [[vk::location(3)]] nointerpolation float4 data4 : COLOR4;
     // Data segments that need interpolation, mostly for hatches
-    [[vk::location(5)]] float2 interp_data5 : COLOR5;
+    [[vk::location(5)]] float4 interp_data5 : COLOR5;
 #ifdef FRAGMENT_SHADER_INPUT
     [[vk::location(6)]] [[vk::ext_decorate(/*spv::DecoratePerVertexKHR*/5285)]] float3 vertexScreenSpacePos[3] : COLOR6;
 #else
diff --git a/62_CAD/shaders/main_pipeline/vertex_shader.hlsl b/62_CAD/shaders/main_pipeline/vertex_shader.hlsl
index 4b55c1e30..e92a8d33b 100644
--- a/62_CAD/shaders/main_pipeline/vertex_shader.hlsl
+++ b/62_CAD/shaders/main_pipeline/vertex_shader.hlsl
@@ -118,7 +118,7 @@ PSInput main(uint vertexID : SV_VertexID)
     outV.data2 = float4(0, 0, 0, 0);
     outV.data3 = float4(0, 0, 0, 0);
     outV.data4 = float4(0, 0, 0, 0);
-    outV.interp_data5 = float2(0, 0);
+    outV.interp_data5 = float4(0, 0, 0, 0);
 
     if (pc.isDTMRendering)
     {
@@ -667,7 +667,7 @@ PSInput main(uint vertexID : SV_VertexID)
             outV.setHeightMapTextureID(textureID);
             outV.setDTMSettingsID(dtmSettingsID);
             outV.setGridDTMScreenSpaceCellWidth(gridCellWidth); // TODO: is input world space?
-            outV.setGridDTMScreenSpacePosition(transformPointScreenSpace(clipProjectionData.projectionToNDC, globals.resolution, transformPointScreenSpace(clipProjectionData.projectionToNDC, globals.resolution, vtxPos)));
+            outV.setGridDTMScreenSpacePosition(transformPointScreenSpace(clipProjectionData.projectionToNDC, globals.resolution, vtxPos));
             outV.setImageUV(corner);
         }
 

From e828dc49ef0a223dcbb8b4af8d722974747f29ee Mon Sep 17 00:00:00 2001
From: keptsecret <sorchon@gmail.com>
Date: Fri, 16 May 2025 11:18:11 +0700
Subject: [PATCH 258/529] fit accessors to concept

---
 .../app_resources/testWorkgroup.comp.hlsl                   | 2 ++
 23_Arithmetic2UnitTest/app_resources/workgroupCommon.hlsl   | 6 ++++--
 .../app_resources/benchmarkWorkgroup.comp.hlsl              | 2 ++
 29_Arithmetic2Bench/app_resources/workgroupCommon.hlsl      | 6 ++++--
 4 files changed, 12 insertions(+), 4 deletions(-)

diff --git a/23_Arithmetic2UnitTest/app_resources/testWorkgroup.comp.hlsl b/23_Arithmetic2UnitTest/app_resources/testWorkgroup.comp.hlsl
index eb7d8e936..3aafc0aa7 100644
--- a/23_Arithmetic2UnitTest/app_resources/testWorkgroup.comp.hlsl
+++ b/23_Arithmetic2UnitTest/app_resources/testWorkgroup.comp.hlsl
@@ -8,11 +8,13 @@ struct DataProxy
     using dtype_t = vector<uint32_t, Config::ItemsPerInvocation_0>;
     static_assert(nbl::hlsl::is_same_v<dtype_t, type_t>);
 
+    template<typename AccessType>
     void get(const uint32_t ix, NBL_REF_ARG(dtype_t) value)
     {
         const uint32_t workgroupOffset = nbl::hlsl::glsl::gl_WorkGroupID().x * Config::VirtualWorkgroupSize;
         value = inputValue[workgroupOffset + ix];
     }
+    template<typename AccessType>
     void set(const uint32_t ix, const dtype_t value)
     {
         const uint32_t workgroupOffset = nbl::hlsl::glsl::gl_WorkGroupID().x * Config::VirtualWorkgroupSize;
diff --git a/23_Arithmetic2UnitTest/app_resources/workgroupCommon.hlsl b/23_Arithmetic2UnitTest/app_resources/workgroupCommon.hlsl
index 69aa11ecc..b0ccbf295 100644
--- a/23_Arithmetic2UnitTest/app_resources/workgroupCommon.hlsl
+++ b/23_Arithmetic2UnitTest/app_resources/workgroupCommon.hlsl
@@ -47,11 +47,13 @@ groupshared uint32_t scratch[config_t::ElementCount];
 
 struct ScratchProxy
 {
-    void get(const uint32_t ix, NBL_REF_ARG(uint32_t) value)
+    template<typename AccessType>
+    void get(const uint32_t ix, NBL_REF_ARG(AccessType) value)
     {
         value = scratch[ix];
     }
-    void set(const uint32_t ix, const uint32_t value)
+    template<typename AccessType>
+    void set(const uint32_t ix, const AccessType value)
     {
         scratch[ix] = value;
     }
diff --git a/29_Arithmetic2Bench/app_resources/benchmarkWorkgroup.comp.hlsl b/29_Arithmetic2Bench/app_resources/benchmarkWorkgroup.comp.hlsl
index ac6ea7fd8..6e32bedbd 100644
--- a/29_Arithmetic2Bench/app_resources/benchmarkWorkgroup.comp.hlsl
+++ b/29_Arithmetic2Bench/app_resources/benchmarkWorkgroup.comp.hlsl
@@ -12,11 +12,13 @@ struct DataProxy
     static_assert(nbl::hlsl::is_same_v<dtype_t, type_t>);
 
     // we don't want to write/read storage multiple times in loop; doesn't seem optimized out in generated spirv
+    template<typename AccessType>
     void get(const uint32_t ix, NBL_REF_ARG(dtype_t) value)
     {
         // value = inputValue[ix];
         value = nbl::hlsl::promote<dtype_t>(globalIndex());
     }
+    template<typename AccessType>
     void set(const uint32_t ix, const dtype_t value)
     {
         // output[Binop::BindingIndex].template Store<type_t>(sizeof(uint32_t) + sizeof(type_t) * ix, value);
diff --git a/29_Arithmetic2Bench/app_resources/workgroupCommon.hlsl b/29_Arithmetic2Bench/app_resources/workgroupCommon.hlsl
index 69aa11ecc..b0ccbf295 100644
--- a/29_Arithmetic2Bench/app_resources/workgroupCommon.hlsl
+++ b/29_Arithmetic2Bench/app_resources/workgroupCommon.hlsl
@@ -47,11 +47,13 @@ groupshared uint32_t scratch[config_t::ElementCount];
 
 struct ScratchProxy
 {
-    void get(const uint32_t ix, NBL_REF_ARG(uint32_t) value)
+    template<typename AccessType>
+    void get(const uint32_t ix, NBL_REF_ARG(AccessType) value)
     {
         value = scratch[ix];
     }
-    void set(const uint32_t ix, const uint32_t value)
+    template<typename AccessType>
+    void set(const uint32_t ix, const AccessType value)
     {
         scratch[ix] = value;
     }

From 34a61a337035d8f2b053f1fed002b5f2c85ffde5 Mon Sep 17 00:00:00 2001
From: Erfan Ahmadi <ahmadierfan99@gmail.com>
Date: Fri, 16 May 2025 08:58:48 +0400
Subject: [PATCH 259/529] Fix overflow submits in a beautiful way

---
 62_CAD/DrawResourcesFiller.cpp | 88 +++++++++++++++++++++-------------
 62_CAD/DrawResourcesFiller.h   | 20 +++++---
 62_CAD/Images.h                | 13 ++---
 62_CAD/main.cpp                | 16 ++++---
 4 files changed, 87 insertions(+), 50 deletions(-)

diff --git a/62_CAD/DrawResourcesFiller.cpp b/62_CAD/DrawResourcesFiller.cpp
index 105374493..c7a074d2f 100644
--- a/62_CAD/DrawResourcesFiller.cpp
+++ b/62_CAD/DrawResourcesFiller.cpp
@@ -370,31 +370,35 @@ uint32_t DrawResourcesFiller::addStaticImage2D(image_id imageID, const core::sma
 	 */
 	auto evictionCallback = [&](const ImageReference& evicted)
 	{
-		// Prepare wait info to defer index deallocation until the GPU has finished using the resource.
-		// Because we will be writing to the descriptor set location which might be in use.
-		ISemaphore::SWaitInfo deallocationWaitInfo = { .semaphore = intendedNextSubmit.getFutureScratchSemaphore().semaphore, .value = evicted.lastUsedSemaphoreValue };
-		
-		// will later be used to release the image's memory range.
+		// Later used to release the image's memory range.
 		core::smart_refctd_ptr<ImageCleanup> cleanupObject = core::make_smart_refctd_ptr<ImageCleanup>();
 		cleanupObject->imagesMemorySuballocator = imagesMemorySubAllocator;
 		cleanupObject->addr = evicted.allocationOffset;
 		cleanupObject->size = evicted.allocationSize;
+		
 
-		const bool imageUsedForNextIntendedSubmit = (evicted.lastUsedSemaphoreValue == intendedNextSubmit.getFutureScratchSemaphore().value);
-
+		const bool imageUsedForNextIntendedSubmit = (evicted.lastUsedFrameIndex == currentFrameIndex);
+		
+		// NOTE: `deallocationWaitInfo` is crucial for both paths, we need to make sure we'll write to a descriptor arrayIndex when it's 100% done with previous usages.
 		if (imageUsedForNextIntendedSubmit)
 		{
 			// The evicted image is scheduled for use in the upcoming submit.
 			// To avoid rendering artifacts, we must flush the current draw queue now.
 			// After submission, we reset state so that data referencing the evicted slot can be re-uploaded.
-			suballocatedDescriptorSet->multi_deallocate(imagesArrayBinding, 1u, &evicted.index, deallocationWaitInfo, &cleanupObject.get());
 			submitDraws(intendedNextSubmit);
 			reset(); // resets everything, things referenced through mainObj and other shit will be pushed again through acquireXXX_SubmitIfNeeded
+			
+			// Prepare wait info to defer index deallocation until the GPU has finished using the resource.
+			// we wait on the signal semaphore for the submit we just did above.
+			ISemaphore::SWaitInfo deallocationWaitInfo = { .semaphore = intendedNextSubmit.scratchSemaphore.semaphore, .value = intendedNextSubmit.scratchSemaphore.value };
+			suballocatedDescriptorSet->multi_deallocate(imagesArrayBinding, 1u, &evicted.index, deallocationWaitInfo, &cleanupObject.get());
 		} 
 		else
 		{
-			// The image is not used in the current frame (intended next submit), so we can deallocate without submitting any draws.
+			// The image is not used in the current frame, so we can deallocate without submitting any draws.
 			// Still wait on the semaphore to ensure past GPU usage is complete.
+			// TODO: We don't know which semaphore value the frame with `evicted.lastUsedFrameIndex` index was submitted with, so we wait for the worst case value which is the immediate prev submit.
+			ISemaphore::SWaitInfo deallocationWaitInfo = { .semaphore = intendedNextSubmit.scratchSemaphore.semaphore, .value = intendedNextSubmit.scratchSemaphore.value };
 			suballocatedDescriptorSet->multi_deallocate(imagesArrayBinding, 1u, &evicted.index, deallocationWaitInfo, &cleanupObject.get());
 		}
 	};
@@ -402,7 +406,7 @@ uint32_t DrawResourcesFiller::addStaticImage2D(image_id imageID, const core::sma
 	// Try inserting or updating the image usage in the cache.
 	// If the image is already present, updates its semaphore value.
 	ImageReference* inserted = imagesUsageCache->insert(imageID, intendedNextSubmit.getFutureScratchSemaphore().value, evictionCallback);
-	inserted->lastUsedSemaphoreValue = intendedNextSubmit.getFutureScratchSemaphore().value; // in case there was an eviction + auto-submit, we need to update AGAIN
+	inserted->lastUsedFrameIndex = currentFrameIndex; // in case there was an eviction + auto-submit, we need to update AGAIN
 
 	// if inserted->index was not InvalidTextureIndex then it means we had a cache hit and updated the value of our sema
 	// in which case we don't queue anything for upload, and return the idx
@@ -481,6 +485,7 @@ uint32_t DrawResourcesFiller::addStaticImage2D(image_id imageID, const core::sma
 								{
 									// irrecoverable error if simple image creation fails.
 									// TODO[LOG]: that's rare, image view creation failed.
+									_NBL_DEBUG_BREAK_IF(true);
 								}
 
 								// succcessful with everything, just break and get out of this retry loop
@@ -490,11 +495,13 @@ uint32_t DrawResourcesFiller::addStaticImage2D(image_id imageID, const core::sma
 							{
 								// irrecoverable error if simple bindImageMemory fails.
 								// TODO: LOG
+								_NBL_DEBUG_BREAK_IF(true);
 								break;
 							}
 						}
 						else
 						{
+							// printf(std::format("Allocation Failed, Trying again, ImageID={} Size={} \n", imageID, gpuImageMemoryRequirements.size).c_str());
 							// recoverable error when allocation fails, we don't log anything, next code will try evicting other images and retry
 						}
 					}
@@ -502,6 +509,7 @@ uint32_t DrawResourcesFiller::addStaticImage2D(image_id imageID, const core::sma
 					{
 						// irrecoverable error if memory requirements of the image don't match our preallocated devicememory
 						// TODO: LOG
+						_NBL_DEBUG_BREAK_IF(true);
 						break;
 					}
 				}
@@ -509,6 +517,7 @@ uint32_t DrawResourcesFiller::addStaticImage2D(image_id imageID, const core::sma
 				{
 					// irrecoverable error if simple image creation fails.
 					// TODO: LOG
+					_NBL_DEBUG_BREAK_IF(true);
 					break;
 				}
 
@@ -517,6 +526,7 @@ uint32_t DrawResourcesFiller::addStaticImage2D(image_id imageID, const core::sma
 					{
 						// Nothing else to evict; give up.
 						// We probably have evicted almost every other texture except the one we just allocated an index for
+						_NBL_DEBUG_BREAK_IF(true);
 						break;
 					}
 
@@ -527,7 +537,7 @@ uint32_t DrawResourcesFiller::addStaticImage2D(image_id imageID, const core::sma
 				if (imageRef)
 					evictionCallback(*imageRef);
 				imagesUsageCache->erase(evictionCandidate);
-				suballocatedDescriptorSet->cull_frees(); // to make sure deallocation requests in eviction callback are waited for.
+				while (suballocatedDescriptorSet->cull_frees()) {}; // to make sure deallocation requests in eviction callback are blocked for.
 
 				// we don't hold any references to the GPUImageView or GPUImage so descriptor binding will be the last reference
 				// hopefully by here the suballocated descriptor set freed some VRAM by dropping the image last ref and it's dedicated allocation.
@@ -535,13 +545,15 @@ uint32_t DrawResourcesFiller::addStaticImage2D(image_id imageID, const core::sma
 
 			if (gpuImageView)
 			{
+				inserted->lastUsedFrameIndex = currentFrameIndex; // there was an eviction + auto-submit, we need to update AGAIN
+
 				StaticImagesCopy copyToStage = 
 				{
 					.cpuImage = cpuImage,
 					.gpuImageView = gpuImageView,
 					.arrayIndex = inserted->index,
 				};
-				
+				// printf(std::format("Everything success, ImageID={} ArrayIndex={} \n", imageID, inserted->index).c_str());
 				staticImagesStagedCopies.push_back(copyToStage);
 			}
 			else
@@ -549,8 +561,8 @@ uint32_t DrawResourcesFiller::addStaticImage2D(image_id imageID, const core::sma
 				// All attempts to create the GPU image and its corresponding view have failed.
 				// Most likely cause: insufficient GPU memory or unsupported image parameters.
 				// TODO: Log a warning or error here � `addStaticImage2D` failed, likely due to low VRAM.
-				// assert(false);
-				
+				_NBL_DEBUG_BREAK_IF(true);
+
 				if (inserted->allocationOffset != ImagesMemorySubAllocator::InvalidAddress)
 				{
 					// We previously successfully create and allocated memory for the Image
@@ -576,7 +588,7 @@ uint32_t DrawResourcesFiller::addStaticImage2D(image_id imageID, const core::sma
 		}
 	}
 	
-	// assert(inserted->index != InvalidTextureIndex); // shouldn't happen, because we're using LRU cache, so worst case eviction will happen + multi-deallocate and next next multi_allocate should definitely succeed
+	assert(inserted->index != InvalidTextureIndex); // shouldn't happen, because we're using LRU cache, so worst case eviction will happen + multi-deallocate and next next multi_allocate should definitely succeed
 
 	return inserted->index;
 }
@@ -967,8 +979,8 @@ bool DrawResourcesFiller::pushStaticImagesUploads(SIntendedSubmitInfo& intendedN
 				descriptorInfos[i].desc = stagedStaticImage.gpuImageView;
 
 				// consider batching contiguous writes, if descriptor set updating was a hotspot
-				descriptorWrites[i].dstSet = descriptorSet,
-					descriptorWrites[i].binding = imagesArrayBinding;
+				descriptorWrites[i].dstSet = descriptorSet;
+				descriptorWrites[i].binding = imagesArrayBinding;
 				descriptorWrites[i].arrayElement = stagedStaticImage.arrayIndex;
 				descriptorWrites[i].count = 1u;
 				descriptorWrites[i].info = &descriptorInfos[i];
@@ -1660,7 +1672,7 @@ uint32_t DrawResourcesFiller::getImageIndexFromID(image_id imageID, const SInten
 	if (imageRef)
 	{
 		textureIdx = imageRef->index;
-		imageRef->lastUsedSemaphoreValue = intendedNextSubmit.getFutureScratchSemaphore().value; // update this because the texture will get used on the next submit
+		imageRef->lastUsedFrameIndex = currentFrameIndex; // update this because the texture will get used on the next frane
 	}
 	return textureIdx;
 }
@@ -1675,6 +1687,13 @@ void DrawResourcesFiller::setHatchFillMSDFTextureFunction(const GetHatchFillPatt
 	getHatchFillPatternMSDF = func;
 }
 
+void DrawResourcesFiller::markFrameUsageComplete(uint64_t drawSubmitWaitValue)
+{
+	currentFrameIndex++;
+	// TODO[LATER]: take into account that currentFrameIndex was submitted with drawSubmitWaitValue; Use that value when deallocating the resources marked with this frame index
+	//				Currently, for evictions the worst case value will be waited for, as there is no way yet to know which semaphoroe value will signal the completion of the (to be evicted) resource's usage
+}
+
 uint32_t DrawResourcesFiller::getMSDFIndexFromInputInfo(const MSDFInputInfo& msdfInfo, const SIntendedSubmitInfo& intendedNextSubmit)
 {
 	uint32_t textureIdx = InvalidTextureIndex;
@@ -1682,7 +1701,7 @@ uint32_t DrawResourcesFiller::getMSDFIndexFromInputInfo(const MSDFInputInfo& msd
 	if (tRef)
 	{
 		textureIdx = tRef->alloc_idx;
-		tRef->lastUsedSemaphoreValue = intendedNextSubmit.getFutureScratchSemaphore().value; // update this because the texture will get used on the next submit
+		tRef->lastUsedFrameIndex = currentFrameIndex; // update this because the texture will get used on the next frame
 	}
 	return textureIdx;
 }
@@ -1706,31 +1725,36 @@ uint32_t DrawResourcesFiller::addMSDFTexture(const MSDFInputInfo& msdfInput, cor
 	 */
 	auto evictionCallback = [&](const MSDFReference& evicted)
 	{
-		// Prepare wait info to defer index deallocation until the GPU has finished using the resource.
-		// NOTE: This wait is currently *not* required for correctness because:
-		//   - Both the image upload (stagedStaticImage) and usage occur within the same timeline (`intendedNextSubmit`).
-		//   - timeline semaphores guarantee proper ordering: the next submit's stagedStaticImage will wait on the prior usage.
+		// `deallocationWaitInfo` is used to prepare wait info to defer index deallocation until the GPU has finished using the resource.
+		// NOTE: `deallocationWaitInfo` is currently *not* required for correctness because:
+		//   - Both the image upload (msdfStagedCPUImages) and usage occur within the same timeline (`intendedNextSubmit`).
+		//   - timeline semaphores guarantee proper ordering: the next submit's msdfStagedCPUImages will wait on the prior usage.
 		//   - Therefore, we can safely overwrite or reallocate the slot without waiting for explicit GPU completion.
 		//
 		// However, this `deallocationWaitInfo` *will* become essential if we start interacting with MSDF images
-		// outside the `intendedNextSubmit` timeline � for example, issuing uploads via a transfer queue or using a separate command buffer and timeline.
-		ISemaphore::SWaitInfo deallocationWaitInfo = { .semaphore = intendedNextSubmit.getFutureScratchSemaphore().semaphore, .value = evicted.lastUsedSemaphoreValue };
+		// outside the `intendedNextSubmit` timeline for example, issuing uploads via a transfer queue or using a separate command buffer and timeline.
 
-		const bool imageUsedForNextIntendedSubmit = (evicted.lastUsedSemaphoreValue == intendedNextSubmit.getFutureScratchSemaphore().value);
+		const bool imageUsedForNextIntendedSubmit = (evicted.lastUsedFrameIndex == currentFrameIndex);
 
 		if (imageUsedForNextIntendedSubmit)
 		{
 			// The evicted image is scheduled for use in the upcoming submit.
 			// To avoid rendering artifacts, we must flush the current draw queue now.
 			// After submission, we reset state so that data referencing the evicted slot can be re-uploaded.
-			msdfTextureArrayIndexAllocator->multi_deallocate(1u, &evicted.alloc_idx, deallocationWaitInfo);
 			submitDraws(intendedNextSubmit);
 			reset(); // resets everything, things referenced through mainObj and other shit will be pushed again through acquireXXX_SubmitIfNeeded
+
+			// Prepare wait info to defer index deallocation until the GPU has finished using the resource.
+			// we wait on the signal semaphore for the submit we just did above.
+			ISemaphore::SWaitInfo deallocationWaitInfo = { .semaphore = intendedNextSubmit.scratchSemaphore.semaphore, .value = intendedNextSubmit.scratchSemaphore.value };
+			msdfTextureArrayIndexAllocator->multi_deallocate(1u, &evicted.alloc_idx, deallocationWaitInfo);
 		} 
 		else
 		{
-			// The image is not used in the current frame (intended next submit), so we can deallocate without submitting any draws.
-			// Still wait on the semaphore to ensure past GPU usage is complete (read note above).
+			// The image is not used in the current frame, so we can deallocate without submitting any draws.
+			// Still wait on the semaphore to ensure past GPU usage is complete.
+			// TODO: We don't know which semaphore value the frame with `evicted.lastUsedFrameIndex` index was submitted with, so we wait for the worst case value which is the immediate prev submit (scratchSemaphore.value).
+			ISemaphore::SWaitInfo deallocationWaitInfo = { .semaphore = intendedNextSubmit.scratchSemaphore.semaphore, .value = intendedNextSubmit.scratchSemaphore.value };
 			msdfTextureArrayIndexAllocator->multi_deallocate(1u, &evicted.alloc_idx, deallocationWaitInfo);
 		}
 		
@@ -1739,9 +1763,9 @@ uint32_t DrawResourcesFiller::addMSDFTexture(const MSDFInputInfo& msdfInput, cor
 	};
 	
 	// We pass nextSemaValue instead of constructing a new MSDFReference and passing it into `insert` that's because we might get a cache hit and only update the value of the nextSema
-	MSDFReference* inserted = msdfLRUCache->insert(msdfInput, intendedNextSubmit.getFutureScratchSemaphore().value, evictionCallback);
+	MSDFReference* inserted = msdfLRUCache->insert(msdfInput, currentFrameIndex, evictionCallback);
 	
-	inserted->lastUsedSemaphoreValue = intendedNextSubmit.getFutureScratchSemaphore().value; // in case there was an eviction + auto-submit, we need to update AGAIN
+	inserted->lastUsedFrameIndex = currentFrameIndex; // in case there was an eviction + auto-submit, we need to update AGAIN
 
 	// if inserted->alloc_idx was not InvalidTextureIndex then it means we had a cache hit and updated the value of our sema, in which case we don't queue anything for upload, and return the idx
 	if (inserted->alloc_idx == InvalidTextureIndex)
@@ -1752,7 +1776,7 @@ uint32_t DrawResourcesFiller::addMSDFTexture(const MSDFInputInfo& msdfInput, cor
 
 		if (inserted->alloc_idx != IndexAllocator::AddressAllocator::invalid_address)
 		{
-			// We stage stagedStaticImage, pushMSDFImagesUploads will push it into GPU
+			// We stage msdfStagedCPUImages, pushMSDFImagesUploads will push it into GPU
 			msdfStagedCPUImages[inserted->alloc_idx].image = std::move(cpuImage);
 			msdfStagedCPUImages[inserted->alloc_idx].uploadedToGPU = false;
 		}
diff --git a/62_CAD/DrawResourcesFiller.h b/62_CAD/DrawResourcesFiller.h
index 801dc41c2..a9b5da172 100644
--- a/62_CAD/DrawResourcesFiller.h
+++ b/62_CAD/DrawResourcesFiller.h
@@ -149,16 +149,21 @@ struct DrawResourcesFiller
 	void setGlyphMSDFTextureFunction(const GetGlyphMSDFTextureFunc& func);
 	void setHatchFillMSDFTextureFunction(const GetHatchFillPatternMSDFTextureFunc& func);
 
+	// Must be called at the end of each frame.
+	// right before submitting the main draw that uses the currently queued geometry, images, or other objects/resources.
+	// Registers the semaphore/value that will signal completion of this frame�s draw,
+	// This allows future frames to safely deallocate or evict resources used in the current frame by waiting on this signal before reuse or destruction.
+	// `drawSubmitWaitValue` should reference the wait value of the draw submission finishing this frame using the `intendedNextSubmit`; 
+	void markFrameUsageComplete(uint64_t drawSubmitWaitValue);
+
 	// TODO[Przemek]: try to draft up a `CTriangleMesh` Class in it's own header (like CPolyline), simplest form is basically two cpu buffers (1 array of uint index buffer, 1 array of float64_t3 vertexBuffer)
 	// TODO[Przemek]: Then have a `drawMesh` function here similar to drawXXX's below, this will fit both vertex and index buffer in the `geometryBuffer`.
 	// take a `SIntendedSubmitInfo` like others, but don't use it as I don't want you to handle anything regarding autoSubmit
 	// somehow retrieve or calculate the geometry buffer offsets of your vertex and index buffer to be used outside for binding purposes
 
-	
 	//! this function fills buffers required for drawing a polyline and submits a draw through provided callback when there is not enough memory.
 	void drawPolyline(const CPolylineBase& polyline, const LineStyleInfo& lineStyleInfo, SIntendedSubmitInfo& intendedNextSubmit);
 
-
 	//! Draws a fixed-geometry polyline using a custom transformation.
 	//! TODO: Change `polyline` input to an ID referencing a possibly cached instance in our buffers, allowing reuse and avoiding redundant uploads.
 	void drawFixedGeometryPolyline(const CPolylineBase& polyline, const LineStyleInfo& lineStyleInfo, const float64_t3x3& transformation, TransformationType transformationType, SIntendedSubmitInfo& intendedNextSubmit);
@@ -568,14 +573,14 @@ struct DrawResourcesFiller
 	struct MSDFReference
 	{
 		uint32_t alloc_idx;
-		uint64_t lastUsedSemaphoreValue;
+		uint64_t lastUsedFrameIndex;
 
-		MSDFReference(uint32_t alloc_idx, uint64_t semaphoreVal) : alloc_idx(alloc_idx), lastUsedSemaphoreValue(semaphoreVal) {}
-		MSDFReference(uint64_t semaphoreVal) : MSDFReference(InvalidTextureIndex, semaphoreVal) {}
+		MSDFReference(uint32_t alloc_idx, uint64_t semaphoreVal) : alloc_idx(alloc_idx), lastUsedFrameIndex(semaphoreVal) {}
+		MSDFReference(uint64_t currentFrameIndex) : MSDFReference(InvalidTextureIndex, currentFrameIndex) {}
 		MSDFReference() : MSDFReference(InvalidTextureIndex, ~0ull) {}
 
 		// In LRU Cache `insert` function, in case of cache hit, we need to assign semaphore value to MSDFReference without changing `alloc_idx`
-		inline MSDFReference& operator=(uint64_t semamphoreVal) { lastUsedSemaphoreValue = semamphoreVal; return *this;  }
+		inline MSDFReference& operator=(uint64_t currentFrameIndex) { lastUsedFrameIndex = currentFrameIndex; return *this;  }
 	};
 	
 	uint32_t getMSDFIndexFromInputInfo(const MSDFInputInfo& msdfInfo, const SIntendedSubmitInfo& intendedNextSubmit);
@@ -585,6 +590,9 @@ struct DrawResourcesFiller
 	// Flushes Current Draw Call and adds to drawCalls
 	void flushDrawObjects();
 
+	// FrameIndex used as a criteria for resource/image eviction in case of limitations
+	uint32_t currentFrameIndex = 0u;
+
 	// Replay Cache override
 	ReplayCache* currentReplayCache = nullptr;
 
diff --git a/62_CAD/Images.h b/62_CAD/Images.h
index 7d9682e36..7c9609161 100644
--- a/62_CAD/Images.h
+++ b/62_CAD/Images.h
@@ -73,6 +73,7 @@ struct ImageCleanup : public core::IReferenceCounted
 
 	~ImageCleanup() override
 	{
+		// printf(std::format("Actual Eviction size={}, offset={} \n", size, addr).c_str());
 		if (imagesMemorySuballocator && addr != ImagesMemorySubAllocator::InvalidAddress)
 			imagesMemorySuballocator->deallocate(addr, size);
 	}
@@ -87,34 +88,34 @@ struct ImageReference
 {
 	static constexpr uint32_t InvalidTextureIndex = nbl::hlsl::numeric_limits<uint32_t>::max;
 	uint32_t index = InvalidTextureIndex; // index in our array of textures binding
-	uint64_t lastUsedSemaphoreValue = 0ull; // last used semaphore value on this image
+	uint64_t lastUsedFrameIndex = 0ull; // last used semaphore value on this image
 	uint64_t allocationOffset = ImagesMemorySubAllocator::InvalidAddress;
 	uint64_t allocationSize = 0ull;
 
 	ImageReference() 
 		: index(InvalidTextureIndex)
-		, lastUsedSemaphoreValue(0ull)
+		, lastUsedFrameIndex(0ull)
 		, allocationOffset(ImagesMemorySubAllocator::InvalidAddress)
 		, allocationSize(0ull)
 	{}
 	
 	// In LRU Cache `insert` function, in case of cache miss, we need to construct the refereence with semaphore value
-	ImageReference(uint64_t semamphoreVal) 
+	ImageReference(uint64_t currentFrameIndex) 
 		: index(InvalidTextureIndex)
-		, lastUsedSemaphoreValue(semamphoreVal)
+		, lastUsedFrameIndex(currentFrameIndex)
 		, allocationOffset(ImagesMemorySubAllocator::InvalidAddress)
 		, allocationSize(0ull)
 	{}
 
 	// In LRU Cache `insert` function, in case of cache hit, we need to assign semaphore value without changing `index`
-	inline ImageReference& operator=(uint64_t semamphoreVal) { lastUsedSemaphoreValue = semamphoreVal; return *this;  }
+	inline ImageReference& operator=(uint64_t currentFrameIndex) { lastUsedFrameIndex = currentFrameIndex; return *this;  }
 };
 
 // A resource-aware image cache with an LRU eviction policy.
 // This cache tracks image usage by ID and provides hooks for eviction logic, such as releasing descriptor slots and deallocating GPU memory.
 // Currently, eviction is purely LRU-based. In the future, eviction decisions may incorporate additional factors:
 //   - memory usage per image.
-//   - lastUsedSemaphoreValue.
+//   - lastUsedFrameIndex.
 // This class does not own GPU resources directly, but helps coordinate their lifetimes in sync with GPU usage via eviction callbacks.
 class ImagesUsageCache
 {
diff --git a/62_CAD/main.cpp b/62_CAD/main.cpp
index 356ff23aa..238dbedb6 100644
--- a/62_CAD/main.cpp
+++ b/62_CAD/main.cpp
@@ -1044,10 +1044,10 @@ class ComputerAidedDesign final : public examples::SimpleWindowedApplication, pu
 		system::path m_loadCWD = "..";
 		std::string imagePaths[] =
 		{
-			"../../media/color_space_test/R8G8B8A8_1.png",
-			"../../media/color_space_test/R8G8B8A8_2.png",
-			"../../media/color_space_test/R8G8B8_1.png",
 			"../../media/color_space_test/R8G8B8_1.jpg",
+			"../../media/color_space_test/R8G8B8_1.png",
+			"../../media/color_space_test/R8G8B8A8_2.png",
+			"../../media/color_space_test/R8G8B8A8_1.png",
 		};
 
 		for (const auto& imagePath : imagePaths)
@@ -1280,8 +1280,8 @@ class ComputerAidedDesign final : public examples::SimpleWindowedApplication, pu
 		const bool isCachingDraw = CacheAndReplay && m_realFrameIx == 0u && !finishedCachingDraw;
 		if (isCachingDraw)
 		{
+			drawResourcesFiller.markFrameUsageComplete(intendedSubmitInfo.getFutureScratchSemaphore().value);
 			replayCaches.push_back(drawResourcesFiller.createReplayCache());
-			intendedSubmitInfo.scratchSemaphore.value++; // fake advance needed for Texture and MSDF LRU caches and evictions to work
 			return; // we don't record, submit or do anything, just caching the draw resources
 		}
 
@@ -1454,6 +1454,8 @@ class ComputerAidedDesign final : public examples::SimpleWindowedApplication, pu
 
 		if (!inBetweenSubmit)
 			cb->endDebugMarker();
+		
+		drawResourcesFiller.markFrameUsageComplete(intendedSubmitInfo.getFutureScratchSemaphore().value);
 
 		if (inBetweenSubmit)
 		{
@@ -2903,9 +2905,11 @@ class ComputerAidedDesign final : public examples::SimpleWindowedApplication, pu
 			for (uint32_t i = 0; i < sampleImages.size(); ++i)
 			{
 				uint64_t imageID = i * 69ull; // it can be hash or something of the file path the image was loaded from
+				//printf(std::format("\n Image {} \n", i).c_str());
 				drawResourcesFiller.addStaticImage2D(imageID, sampleImages[i], intendedNextSubmit);
-				drawResourcesFiller.addImageObject(imageID, { 0.0 + i * 100.0, 0.0 }, { 100.0 , 100.0 }, 0.0, intendedNextSubmit);
+				drawResourcesFiller.addImageObject(imageID, { 0.0 + (i) * 3.0, 0.0 }, { 3.0 , 3.0 }, 0.0, intendedNextSubmit);
 				// drawResourcesFiller.addImageObject(imageID, { 40.0, +40.0 }, { 100.0, 100.0 }, 0.0, intendedNextSubmit);
+				//printf("\n");
 			}
 			LineStyleInfo lineStyle = 
 			{
@@ -2922,7 +2926,7 @@ class ComputerAidedDesign final : public examples::SimpleWindowedApplication, pu
 				linePoints.push_back({ 100.0, -100.0 });
 				polyline.addLinePoints(linePoints);
 			}
-			drawResourcesFiller.drawPolyline(polyline, lineStyle, intendedNextSubmit);
+			// drawResourcesFiller.drawPolyline(polyline, lineStyle, intendedNextSubmit);
 		}
 		else if (mode == ExampleMode::CASE_8)
 		{

From c41617b0506e4c1830c5d9f90b4827df1a807d33 Mon Sep 17 00:00:00 2001
From: keptsecret <sorchon@gmail.com>
Date: Fri, 16 May 2025 15:45:25 +0700
Subject: [PATCH 260/529] moved stuff around, check inputs in imgui

---
 71_RayTracingPipeline/main.cpp | 3342 ++++++++++++++++----------------
 1 file changed, 1675 insertions(+), 1667 deletions(-)

diff --git a/71_RayTracingPipeline/main.cpp b/71_RayTracingPipeline/main.cpp
index e31f5c280..ad13b4a5d 100644
--- a/71_RayTracingPipeline/main.cpp
+++ b/71_RayTracingPipeline/main.cpp
@@ -6,6 +6,8 @@
 #include "nbl/ext/FullScreenTriangle/FullScreenTriangle.h"
 #include "nbl/builtin/hlsl/indirect_commands.hlsl"
 
+#define TEST_ASSET_CONV_AS
+
 class RaytracingPipelineApp final : public examples::SimpleWindowedApplication, public application_templates::MonoAssetManagerAndBuiltinResourceApplication
 {
   using device_base_t = examples::SimpleWindowedApplication;
@@ -18,768 +20,768 @@ class RaytracingPipelineApp final : public examples::SimpleWindowedApplication,
   constexpr static inline uint32_t NumberOfProceduralGeometries = 5;
 
   static constexpr const char* s_lightTypeNames[E_LIGHT_TYPE::ELT_COUNT] = {
-    "Directional",
-    "Point",
-    "Spot"
+	"Directional",
+	"Point",
+	"Spot"
   };
 
   constexpr static inline clock_t::duration DisplayImageDuration = std::chrono::milliseconds(900);
 
   struct ShaderBindingTable
   {
-    SBufferRange<IGPUBuffer> raygenGroupRange;
-    SBufferRange<IGPUBuffer> hitGroupsRange;
-    uint32_t hitGroupsStride;
-    SBufferRange<IGPUBuffer> missGroupsRange;
-    uint32_t missGroupsStride;
-    SBufferRange<IGPUBuffer> callableGroupsRange;
-    uint32_t callableGroupsStride;
+	SBufferRange<IGPUBuffer> raygenGroupRange;
+	SBufferRange<IGPUBuffer> hitGroupsRange;
+	uint32_t hitGroupsStride;
+	SBufferRange<IGPUBuffer> missGroupsRange;
+	uint32_t missGroupsStride;
+	SBufferRange<IGPUBuffer> callableGroupsRange;
+	uint32_t callableGroupsStride;
   };
 
 
 public:
   inline RaytracingPipelineApp(const path& _localInputCWD, const path& _localOutputCWD, const path& _sharedInputCWD, const path& _sharedOutputCWD)
-    : IApplicationFramework(_localInputCWD, _localOutputCWD, _sharedInputCWD, _sharedOutputCWD)
+	: IApplicationFramework(_localInputCWD, _localOutputCWD, _sharedInputCWD, _sharedOutputCWD)
   {
   }
 
   inline SPhysicalDeviceFeatures getRequiredDeviceFeatures() const override
   {
-    auto retval = device_base_t::getRequiredDeviceFeatures();
-    retval.rayTracingPipeline = true;
-    retval.accelerationStructure = true;
-    retval.rayQuery = true;
-    return retval;
+	auto retval = device_base_t::getRequiredDeviceFeatures();
+	retval.rayTracingPipeline = true;
+	retval.accelerationStructure = true;
+	retval.rayQuery = true;
+	return retval;
   }
 
   inline SPhysicalDeviceFeatures getPreferredDeviceFeatures() const override
   {
-    auto retval = device_base_t::getPreferredDeviceFeatures();
-    retval.accelerationStructureHostCommands = true;
-    return retval;
+	auto retval = device_base_t::getPreferredDeviceFeatures();
+	retval.accelerationStructureHostCommands = true;
+	return retval;
   }
 
   inline core::vector<video::SPhysicalDeviceFilter::SurfaceCompatibility> getSurfaces() const override
   {
-    if (!m_surface)
-    {
-      {
-        auto windowCallback = core::make_smart_refctd_ptr<CEventCallback>(smart_refctd_ptr(m_inputSystem), smart_refctd_ptr(m_logger));
-        IWindow::SCreationParams params = {};
-        params.callback = core::make_smart_refctd_ptr<ISimpleManagedSurface::ICallback>();
-        params.width = WIN_W;
-        params.height = WIN_H;
-        params.x = 32;
-        params.y = 32;
-        params.flags = ui::IWindow::ECF_HIDDEN | IWindow::ECF_BORDERLESS | IWindow::ECF_RESIZABLE;
-        params.windowCaption = "RaytracingPipelineApp";
-        params.callback = windowCallback;
-        const_cast<std::remove_const_t<decltype(m_window)>&>(m_window) = m_winMgr->createWindow(std::move(params));
-      }
-
-      auto surface = CSurfaceVulkanWin32::create(smart_refctd_ptr(m_api), smart_refctd_ptr_static_cast<IWindowWin32>(m_window));
-      const_cast<std::remove_const_t<decltype(m_surface)>&>(m_surface) = CSimpleResizeSurface<ISimpleManagedSurface::ISwapchainResources>::create(std::move(surface));
-    }
-
-    if (m_surface)
-      return { {m_surface->getSurface()/*,EQF_NONE*/} };
-
-    return {};
+	if (!m_surface)
+	{
+	  {
+		auto windowCallback = core::make_smart_refctd_ptr<CEventCallback>(smart_refctd_ptr(m_inputSystem), smart_refctd_ptr(m_logger));
+		IWindow::SCreationParams params = {};
+		params.callback = core::make_smart_refctd_ptr<ISimpleManagedSurface::ICallback>();
+		params.width = WIN_W;
+		params.height = WIN_H;
+		params.x = 32;
+		params.y = 32;
+		params.flags = ui::IWindow::ECF_HIDDEN | IWindow::ECF_BORDERLESS | IWindow::ECF_RESIZABLE;
+		params.windowCaption = "RaytracingPipelineApp";
+		params.callback = windowCallback;
+		const_cast<std::remove_const_t<decltype(m_window)>&>(m_window) = m_winMgr->createWindow(std::move(params));
+	  }
+
+	  auto surface = CSurfaceVulkanWin32::create(smart_refctd_ptr(m_api), smart_refctd_ptr_static_cast<IWindowWin32>(m_window));
+	  const_cast<std::remove_const_t<decltype(m_surface)>&>(m_surface) = CSimpleResizeSurface<ISimpleManagedSurface::ISwapchainResources>::create(std::move(surface));
+	}
+
+	if (m_surface)
+	  return { {m_surface->getSurface()/*,EQF_NONE*/} };
+
+	return {};
   }
 
   // so that we can use the same queue for asset converter and rendering
   inline core::vector<queue_req_t> getQueueRequirements() const override
   {
-    auto reqs = device_base_t::getQueueRequirements();
-    reqs.front().requiredFlags |= IQueue::FAMILY_FLAGS::TRANSFER_BIT;
-    return reqs;
+	auto reqs = device_base_t::getQueueRequirements();
+	reqs.front().requiredFlags |= IQueue::FAMILY_FLAGS::TRANSFER_BIT;
+	return reqs;
   }
 
   inline bool onAppInitialized(smart_refctd_ptr<ISystem>&& system) override
   {
-    m_inputSystem = make_smart_refctd_ptr<InputSystem>(logger_opt_smart_ptr(smart_refctd_ptr(m_logger)));
-
-    if (!device_base_t::onAppInitialized(smart_refctd_ptr(system)))
-      return false;
-
-    if (!asset_base_t::onAppInitialized(smart_refctd_ptr(system)))
-      return false;
-
-    smart_refctd_ptr<IShaderCompiler::CCache> shaderReadCache = nullptr;
-    smart_refctd_ptr<IShaderCompiler::CCache> shaderWriteCache = core::make_smart_refctd_ptr<IShaderCompiler::CCache>();
-    auto shaderCachePath = localOutputCWD / "main_pipeline_shader_cache.bin";
-
-    {
-        core::smart_refctd_ptr<system::IFile> shaderReadCacheFile;
-        {
-            system::ISystem::future_t<core::smart_refctd_ptr<system::IFile>> future;
-            m_system->createFile(future, shaderCachePath.c_str(), system::IFile::ECF_READ);
-            if (future.wait())
-            {
-                future.acquire().move_into(shaderReadCacheFile);
-                if (shaderReadCacheFile)
-                {
-                    const size_t size = shaderReadCacheFile->getSize();
-                    if (size > 0ull)
-                    {
-                        std::vector<uint8_t> contents(size);
-                        system::IFile::success_t succ;
-                        shaderReadCacheFile->read(succ, contents.data(), 0, size);
-                        if (succ)
-                            shaderReadCache = IShaderCompiler::CCache::deserialize(contents);
-                    }
-                }
-            }
-            else
-                m_logger->log("Failed Openning Shader Cache File.", ILogger::ELL_ERROR);
-        }
-
-    }
-
-    // Load Custom Shader
-    auto loadCompileAndCreateShader = [&](const std::string& relPath) -> smart_refctd_ptr<IGPUShader>
-        {
-            IAssetLoader::SAssetLoadParams lp = {};
-            lp.logger = m_logger.get();
-            lp.workingDirectory = ""; // virtual root
-            auto assetBundle = m_assetMgr->getAsset(relPath, lp);
-            const auto assets = assetBundle.getContents();
-            if (assets.empty())
-                return nullptr;
-
-            // lets go straight from ICPUSpecializedShader to IGPUSpecializedShader
-            auto sourceRaw = IAsset::castDown<ICPUShader>(assets[0]);
-            if (!sourceRaw)
-                return nullptr;
-
-            return m_device->createShader({ sourceRaw.get(), nullptr, shaderReadCache.get(), shaderWriteCache.get() });
-        };
-
-    // load shaders
-    const auto raygenShader = loadCompileAndCreateShader("app_resources/raytrace.rgen.hlsl");
-    const auto closestHitShader = loadCompileAndCreateShader("app_resources/raytrace.rchit.hlsl");
-    const auto proceduralClosestHitShader = loadCompileAndCreateShader("app_resources/raytrace_procedural.rchit.hlsl");
-    const auto intersectionHitShader = loadCompileAndCreateShader("app_resources/raytrace.rint.hlsl");
-    const auto anyHitShaderColorPayload = loadCompileAndCreateShader("app_resources/raytrace.rahit.hlsl");
-    const auto anyHitShaderShadowPayload = loadCompileAndCreateShader("app_resources/raytrace_shadow.rahit.hlsl");
-    const auto missShader = loadCompileAndCreateShader("app_resources/raytrace.rmiss.hlsl");
-    const auto missShadowShader = loadCompileAndCreateShader("app_resources/raytrace_shadow.rmiss.hlsl");
-    const auto directionalLightCallShader = loadCompileAndCreateShader("app_resources/light_directional.rcall.hlsl");
-    const auto pointLightCallShader = loadCompileAndCreateShader("app_resources/light_point.rcall.hlsl");
-    const auto spotLightCallShader = loadCompileAndCreateShader("app_resources/light_spot.rcall.hlsl");
-    const auto fragmentShader = loadCompileAndCreateShader("app_resources/present.frag.hlsl");
-
-    core::smart_refctd_ptr<system::IFile> shaderWriteCacheFile;
-    {
-        system::ISystem::future_t<core::smart_refctd_ptr<system::IFile>> future;
-        m_system->deleteFile(shaderCachePath); // temp solution instead of trimming, to make sure we won't have corrupted json
-        m_system->createFile(future, shaderCachePath.c_str(), system::IFile::ECF_WRITE);
-        if (future.wait())
-        {
-            future.acquire().move_into(shaderWriteCacheFile);
-            if (shaderWriteCacheFile)
-            {
-                auto serializedCache = shaderWriteCache->serialize();
-                if (shaderWriteCacheFile)
-                {
-                    system::IFile::success_t succ;
-                    shaderWriteCacheFile->write(succ, serializedCache->getPointer(), 0, serializedCache->getSize());
-                    if (!succ)
-                        m_logger->log("Failed Writing To Shader Cache File.", ILogger::ELL_ERROR);
-                }
-            }
-            else
-                m_logger->log("Failed Creating Shader Cache File.", ILogger::ELL_ERROR);
-        }
-        else
-            m_logger->log("Failed Creating Shader Cache File.", ILogger::ELL_ERROR);
-    }
-
-    m_semaphore = m_device->createSemaphore(m_realFrameIx);
-    if (!m_semaphore)
-      return logFail("Failed to Create a Semaphore!");
-
-    auto gQueue = getGraphicsQueue();
-
-    // Create renderpass and init surface
-    nbl::video::IGPURenderpass* renderpass;
-    {
-      ISwapchain::SCreationParams swapchainParams = { .surface = smart_refctd_ptr<ISurface>(m_surface->getSurface()) };
-      if (!swapchainParams.deduceFormat(m_physicalDevice))
-        return logFail("Could not choose a Surface Format for the Swapchain!");
-
-      const static IGPURenderpass::SCreationParams::SSubpassDependency dependencies[] =
-      {
-        {
-          .srcSubpass = IGPURenderpass::SCreationParams::SSubpassDependency::External,
-          .dstSubpass = 0,
-          .memoryBarrier =
-          {
-            .srcStageMask = asset::PIPELINE_STAGE_FLAGS::COPY_BIT,
-            .srcAccessMask = asset::ACCESS_FLAGS::TRANSFER_WRITE_BIT,
-            .dstStageMask = asset::PIPELINE_STAGE_FLAGS::COLOR_ATTACHMENT_OUTPUT_BIT,
-            .dstAccessMask = asset::ACCESS_FLAGS::COLOR_ATTACHMENT_WRITE_BIT
-          }
-        },
-        {
-          .srcSubpass = 0,
-          .dstSubpass = IGPURenderpass::SCreationParams::SSubpassDependency::External,
-          .memoryBarrier =
-          {
-            .srcStageMask = asset::PIPELINE_STAGE_FLAGS::COLOR_ATTACHMENT_OUTPUT_BIT,
-            .srcAccessMask = asset::ACCESS_FLAGS::COLOR_ATTACHMENT_WRITE_BIT
-          }
-        },
-        IGPURenderpass::SCreationParams::DependenciesEnd
-      };
-
-      auto scResources = std::make_unique<CDefaultSwapchainFramebuffers>(m_device.get(), swapchainParams.surfaceFormat.format, dependencies);
-      renderpass = scResources->getRenderpass();
-
-      if (!renderpass)
-        return logFail("Failed to create Renderpass!");
-
-      if (!m_surface || !m_surface->init(gQueue, std::move(scResources), swapchainParams.sharedParams))
-        return logFail("Could not create Window & Surface or initialize the Surface!");
-    }
-
-    auto pool = m_device->createCommandPool(gQueue->getFamilyIndex(), IGPUCommandPool::CREATE_FLAGS::RESET_COMMAND_BUFFER_BIT);
-
-    m_converter = CAssetConverter::create({ .device = m_device.get(), .optimizer = {} });
-
-    for (auto i = 0u; i < MaxFramesInFlight; i++)
-    {
-      if (!pool)
-        return logFail("Couldn't create Command Pool!");
-      if (!pool->createCommandBuffers(IGPUCommandPool::BUFFER_LEVEL::PRIMARY, { m_cmdBufs.data() + i, 1 }))
-        return logFail("Couldn't create Command Buffer!");
-    }
-
-    m_winMgr->setWindowSize(m_window.get(), WIN_W, WIN_H);
-    m_surface->recreateSwapchain();
-
-
-    // create output images
-    m_hdrImage = m_device->createImage({
-        {
-          .type = IGPUImage::ET_2D,
-          .samples = ICPUImage::ESCF_1_BIT,
-          .format = EF_R16G16B16A16_SFLOAT,
-          .extent = {WIN_W, WIN_H, 1},
-          .mipLevels = 1,
-          .arrayLayers = 1,
-          .flags = IImage::ECF_NONE,
-          .usage = bitflag(IImage::EUF_STORAGE_BIT) | IImage::EUF_TRANSFER_SRC_BIT | IImage::EUF_SAMPLED_BIT
-        }
-      });
-
-    if (!m_hdrImage || !m_device->allocate(m_hdrImage->getMemoryReqs(), m_hdrImage.get()).isValid())
-      return logFail("Could not create HDR Image");
-
-    m_hdrImageView = m_device->createImageView({
-      .flags = IGPUImageView::ECF_NONE,
-      .subUsages = IGPUImage::E_USAGE_FLAGS::EUF_STORAGE_BIT | IGPUImage::E_USAGE_FLAGS::EUF_SAMPLED_BIT,
-      .image = m_hdrImage,
-      .viewType = IGPUImageView::E_TYPE::ET_2D,
-      .format = asset::EF_R16G16B16A16_SFLOAT
-    });
-
-
-
-    // ray trace pipeline and descriptor set layout setup
-    {
-      const IGPUDescriptorSetLayout::SBinding bindings[] = {
-        {
-          .binding = 0,
-          .type = asset::IDescriptor::E_TYPE::ET_ACCELERATION_STRUCTURE,
-          .createFlags = IGPUDescriptorSetLayout::SBinding::E_CREATE_FLAGS::ECF_NONE,
-          .stageFlags = asset::IShader::E_SHADER_STAGE::ESS_RAYGEN,
-          .count = 1,
-        },
-        {
-          .binding = 1,
-          .type = asset::IDescriptor::E_TYPE::ET_STORAGE_IMAGE,
-          .createFlags = IGPUDescriptorSetLayout::SBinding::E_CREATE_FLAGS::ECF_NONE,
-          .stageFlags = asset::IShader::E_SHADER_STAGE::ESS_RAYGEN,
-          .count = 1,
-        }
-      };
-      const auto descriptorSetLayout = m_device->createDescriptorSetLayout(bindings);
-
-      const std::array<IGPUDescriptorSetLayout*, ICPUPipelineLayout::DESCRIPTOR_SET_COUNT> dsLayoutPtrs = { descriptorSetLayout.get() };
-      m_rayTracingDsPool = m_device->createDescriptorPoolForDSLayouts(IDescriptorPool::ECF_UPDATE_AFTER_BIND_BIT, std::span(dsLayoutPtrs.begin(), dsLayoutPtrs.end()));
-      m_rayTracingDs = m_rayTracingDsPool->createDescriptorSet(descriptorSetLayout);
-
-      const SPushConstantRange pcRange = {
-        .stageFlags = IShader::E_SHADER_STAGE::ESS_ALL_RAY_TRACING,
-        .offset = 0u,
-        .size = sizeof(SPushConstants),
-      };
-      const auto pipelineLayout = m_device->createPipelineLayout({ &pcRange, 1 }, smart_refctd_ptr(descriptorSetLayout), nullptr, nullptr, nullptr);
-
-      IGPURayTracingPipeline::SCreationParams params = {};
-
-      enum RtDemoShader
-      {
-        RTDS_RAYGEN,
-        RTDS_MISS,
-        RTDS_MISS_SHADOW,
-        RTDS_CLOSEST_HIT,
-        RTDS_SPHERE_CLOSEST_HIT,
-        RTDS_ANYHIT_PRIMARY,
-        RTDS_ANYHIT_SHADOW,
-        RTDS_INTERSECTION,
-        RTDS_DIRECTIONAL_CALL,
-        RTDS_POINT_CALL,
-        RTDS_SPOT_CALL,
-        RTDS_COUNT
-      };
-
-      IGPUShader::SSpecInfo shaders[RTDS_COUNT];
-      shaders[RTDS_RAYGEN] = {.shader = raygenShader.get()};
-      shaders[RTDS_MISS] = {.shader = missShader.get()};
-      shaders[RTDS_MISS_SHADOW] = { .shader = missShadowShader.get() };
-      shaders[RTDS_CLOSEST_HIT] = {.shader = closestHitShader.get()};
-      shaders[RTDS_SPHERE_CLOSEST_HIT] = {.shader = proceduralClosestHitShader.get()};
-      shaders[RTDS_ANYHIT_PRIMARY] = {.shader = anyHitShaderColorPayload.get()};
-      shaders[RTDS_ANYHIT_SHADOW] = {.shader = anyHitShaderShadowPayload.get()};
-      shaders[RTDS_INTERSECTION] = {.shader = intersectionHitShader.get() };
-      shaders[RTDS_DIRECTIONAL_CALL] = {.shader = directionalLightCallShader.get()};
-      shaders[RTDS_POINT_CALL] = {.shader = pointLightCallShader.get()};
-      shaders[RTDS_SPOT_CALL] = {.shader = spotLightCallShader.get()};
-
-      params.layout = pipelineLayout.get();
-      params.shaders = std::span(shaders);
-      using RayTracingFlags = IGPURayTracingPipeline::SCreationParams::FLAGS;
-      params.flags = core::bitflag(RayTracingFlags::NO_NULL_MISS_SHADERS) |
-        RayTracingFlags::NO_NULL_INTERSECTION_SHADERS | 
-        RayTracingFlags::NO_NULL_ANY_HIT_SHADERS;
-
-      auto& shaderGroups = params.shaderGroups;
-
-      shaderGroups.raygen = { .index = RTDS_RAYGEN };
-
-      IRayTracingPipelineBase::SGeneralShaderGroup missGroups[EMT_COUNT];
-      missGroups[EMT_PRIMARY] = { .index = RTDS_MISS };
-      missGroups[EMT_OCCLUSION] = { .index = RTDS_MISS_SHADOW };
-      shaderGroups.misses = missGroups;
-
-      auto getHitGroupIndex = [](E_GEOM_TYPE geomType, E_RAY_TYPE rayType)
-        {
-          return geomType * ERT_COUNT + rayType;
-        };
-      IRayTracingPipelineBase::SHitShaderGroup hitGroups[E_RAY_TYPE::ERT_COUNT * E_GEOM_TYPE::EGT_COUNT];
-      hitGroups[getHitGroupIndex(EGT_TRIANGLES, ERT_PRIMARY)] = {
-        .closestHit = RTDS_CLOSEST_HIT,
-        .anyHit = RTDS_ANYHIT_PRIMARY,
-      };
-      hitGroups[getHitGroupIndex(EGT_TRIANGLES, ERT_OCCLUSION)] = {
-        .closestHit = IGPURayTracingPipeline::SGeneralShaderGroup::Unused,
-        .anyHit = RTDS_ANYHIT_SHADOW,
-      };
-      hitGroups[getHitGroupIndex(EGT_PROCEDURAL, ERT_PRIMARY)] = {
-        .closestHit = RTDS_SPHERE_CLOSEST_HIT,
-        .anyHit = RTDS_ANYHIT_PRIMARY,
-        .intersection = RTDS_INTERSECTION,
-      };
-      hitGroups[getHitGroupIndex(EGT_PROCEDURAL, ERT_OCCLUSION)] = {
-        .closestHit = IGPURayTracingPipeline::SGeneralShaderGroup::Unused,
-        .anyHit = RTDS_ANYHIT_SHADOW,
-        .intersection = RTDS_INTERSECTION,
-      };
-      shaderGroups.hits = hitGroups;
-
-      IRayTracingPipelineBase::SGeneralShaderGroup callableGroups[ELT_COUNT];
-      callableGroups[ELT_DIRECTIONAL] = { .index = RTDS_DIRECTIONAL_CALL };
-      callableGroups[ELT_POINT] = { .index = RTDS_POINT_CALL };
-      callableGroups[ELT_SPOT] = { .index = RTDS_SPOT_CALL };
-      shaderGroups.callables = callableGroups;
-
-      params.cached.maxRecursionDepth = 1;
-      params.cached.dynamicStackSize = true;
-
-      if (!m_device->createRayTracingPipelines(nullptr, { &params, 1 }, &m_rayTracingPipeline))
-        return logFail("Failed to create ray tracing pipeline");
-
-      calculateRayTracingStackSize(m_rayTracingPipeline);
-      
-      if (!createShaderBindingTable(gQueue, m_rayTracingPipeline))
-        return logFail("Could not create shader binding table");
-
-    }
-
-    auto assetManager = make_smart_refctd_ptr<nbl::asset::IAssetManager>(smart_refctd_ptr(system));
-    auto* geometryCreator = assetManager->getGeometryCreator();
-
-    if (!createIndirectBuffer(gQueue))
-      return logFail("Could not create indirect buffer");
-
-    // create geometry objects
-    if (!createGeometries(gQueue, geometryCreator))
-      return logFail("Could not create geometries from geometry creator");
-
-    if (!createAccelerationStructures(getComputeQueue()))
-      return logFail("Could not create acceleration structures");
-
-    ISampler::SParams samplerParams = {
-      .AnisotropicFilter = 0
-    };
-    auto defaultSampler = m_device->createSampler(samplerParams);
-
-    {
-      const IGPUDescriptorSetLayout::SBinding bindings[] = {
-        {
-          .binding = 0u,
-          .type = nbl::asset::IDescriptor::E_TYPE::ET_COMBINED_IMAGE_SAMPLER,
-          .createFlags = ICPUDescriptorSetLayout::SBinding::E_CREATE_FLAGS::ECF_NONE,
-          .stageFlags = IShader::E_SHADER_STAGE::ESS_FRAGMENT,
-          .count = 1u,
-          .immutableSamplers = &defaultSampler
-        }
-      };
-      auto gpuPresentDescriptorSetLayout = m_device->createDescriptorSetLayout(bindings);
-      const video::IGPUDescriptorSetLayout* const layouts[] = { gpuPresentDescriptorSetLayout.get() };
-      const uint32_t setCounts[] = { 1u };
-      m_presentDsPool = m_device->createDescriptorPoolForDSLayouts(IDescriptorPool::E_CREATE_FLAGS::ECF_NONE, layouts, setCounts);
-      m_presentDs = m_presentDsPool->createDescriptorSet(gpuPresentDescriptorSetLayout);
-
-      auto scRes = static_cast<CDefaultSwapchainFramebuffers*>(m_surface->getSwapchainResources());
-      ext::FullScreenTriangle::ProtoPipeline fsTriProtoPPln(m_assetMgr.get(), m_device.get(), m_logger.get());
-      if (!fsTriProtoPPln)
-        return logFail("Failed to create Full Screen Triangle protopipeline or load its vertex shader!");
-
-      const IGPUShader::SSpecInfo fragSpec = {
-        .entryPoint = "main",
-        .shader = fragmentShader.get()
-      };
-
-      auto presentLayout = m_device->createPipelineLayout(
-        {},
-        core::smart_refctd_ptr(gpuPresentDescriptorSetLayout),
-        nullptr,
-        nullptr,
-        nullptr
-      );
-      m_presentPipeline = fsTriProtoPPln.createPipeline(fragSpec, presentLayout.get(), scRes->getRenderpass());
-      if (!m_presentPipeline)
-        return logFail("Could not create Graphics Pipeline!");
-    }
-
-    // write descriptors
-    IGPUDescriptorSet::SDescriptorInfo infos[3];
-    infos[0].desc = m_gpuTlas;
-
-    infos[1].desc = m_hdrImageView;
-    if (!infos[1].desc)
-      return logFail("Failed to create image view");
-    infos[1].info.image.imageLayout = IImage::LAYOUT::GENERAL;
-
-    infos[2].desc = m_hdrImageView;
-    infos[2].info.image.imageLayout = IImage::LAYOUT::READ_ONLY_OPTIMAL;
-
-    IGPUDescriptorSet::SWriteDescriptorSet writes[] = {
-        {.dstSet = m_rayTracingDs.get(), .binding = 0, .arrayElement = 0, .count = 1, .info = &infos[0]},
-        {.dstSet = m_rayTracingDs.get(), .binding = 1, .arrayElement = 0, .count = 1, .info = &infos[1]},
-        {.dstSet = m_presentDs.get(), .binding = 0, .arrayElement = 0, .count = 1, .info = &infos[2] },
-    };
-    m_device->updateDescriptorSets(std::span(writes), {});
-
-    // gui descriptor setup
-    {
-      using binding_flags_t = IGPUDescriptorSetLayout::SBinding::E_CREATE_FLAGS;
-      {
-        IGPUSampler::SParams params;
-        params.AnisotropicFilter = 1u;
-        params.TextureWrapU = ETC_REPEAT;
-        params.TextureWrapV = ETC_REPEAT;
-        params.TextureWrapW = ETC_REPEAT;
-
-        m_ui.samplers.gui = m_device->createSampler(params);
-        m_ui.samplers.gui->setObjectDebugName("Nabla IMGUI UI Sampler");
-      }
-
-      std::array<core::smart_refctd_ptr<IGPUSampler>, 69u> immutableSamplers;
-      for (auto& it : immutableSamplers)
-        it = smart_refctd_ptr(m_ui.samplers.scene);
-
-      immutableSamplers[nbl::ext::imgui::UI::FontAtlasTexId] = smart_refctd_ptr(m_ui.samplers.gui);
-
-      nbl::ext::imgui::UI::SCreationParameters params;
-
-      params.resources.texturesInfo = { .setIx = 0u, .bindingIx = 0u };
-      params.resources.samplersInfo = { .setIx = 0u, .bindingIx = 1u };
-      params.assetManager = m_assetMgr;
-      params.pipelineCache = nullptr;
-      params.pipelineLayout = nbl::ext::imgui::UI::createDefaultPipelineLayout(m_utils->getLogicalDevice(), params.resources.texturesInfo, params.resources.samplersInfo, MaxUITextureCount);
-      params.renderpass = smart_refctd_ptr<IGPURenderpass>(renderpass);
-      params.streamingBuffer = nullptr;
-      params.subpassIx = 0u;
-      params.transfer = getTransferUpQueue();
-      params.utilities = m_utils;
-      {
-        m_ui.manager = ext::imgui::UI::create(std::move(params));
-
-        // note that we use default layout provided by our extension, but you are free to create your own by filling nbl::ext::imgui::UI::S_CREATION_PARAMETERS::resources
-        const auto* descriptorSetLayout = m_ui.manager->getPipeline()->getLayout()->getDescriptorSetLayout(0u);
-        const auto& params = m_ui.manager->getCreationParameters();
-
-        IDescriptorPool::SCreateInfo descriptorPoolInfo = {};
-        descriptorPoolInfo.maxDescriptorCount[static_cast<uint32_t>(asset::IDescriptor::E_TYPE::ET_SAMPLER)] = (uint32_t)nbl::ext::imgui::UI::DefaultSamplerIx::COUNT;
-        descriptorPoolInfo.maxDescriptorCount[static_cast<uint32_t>(asset::IDescriptor::E_TYPE::ET_SAMPLED_IMAGE)] = MaxUITextureCount;
-        descriptorPoolInfo.maxSets = 1u;
-        descriptorPoolInfo.flags = IDescriptorPool::E_CREATE_FLAGS::ECF_UPDATE_AFTER_BIND_BIT;
-
-        m_guiDescriptorSetPool = m_device->createDescriptorPool(std::move(descriptorPoolInfo));
-        assert(m_guiDescriptorSetPool);
-
-        m_guiDescriptorSetPool->createDescriptorSets(1u, &descriptorSetLayout, &m_ui.descriptorSet);
-        assert(m_ui.descriptorSet);
-      }
-    }
-
-    m_ui.manager->registerListener(
-      [this]() -> void {
-        ImGuiIO& io = ImGui::GetIO();
-
-        m_camera.setProjectionMatrix([&]()
-        {
-          static matrix4SIMD projection;
-
-          projection = matrix4SIMD::buildProjectionMatrixPerspectiveFovRH(
-            core::radians(m_cameraSetting.fov), 
-            io.DisplaySize.x / io.DisplaySize.y, 
-            m_cameraSetting.zNear, 
-            m_cameraSetting.zFar);
-
-          return projection;
-        }());
-
-        ImGui::SetNextWindowPos(ImVec2(1024, 100), ImGuiCond_Appearing);
-        ImGui::SetNextWindowSize(ImVec2(256, 256), ImGuiCond_Appearing);
-
-        // create a window and insert the inspector
-        ImGui::SetNextWindowPos(ImVec2(10, 10), ImGuiCond_Appearing);
-        ImGui::SetNextWindowSize(ImVec2(320, 340), ImGuiCond_Appearing);
-        ImGui::Begin("Controls");
-
-        ImGui::SameLine();
-
-        ImGui::Text("Camera");
-
-        ImGui::SliderFloat("Move speed", &m_cameraSetting.moveSpeed, 0.1f, 10.f);
-        ImGui::SliderFloat("Rotate speed", &m_cameraSetting.rotateSpeed, 0.1f, 10.f);
-        ImGui::SliderFloat("Fov", &m_cameraSetting.fov, 20.f, 150.f);
-        ImGui::SliderFloat("zNear", &m_cameraSetting.zNear, 0.1f, 100.f);
-        ImGui::SliderFloat("zFar", &m_cameraSetting.zFar, 110.f, 10000.f);
-        Light m_oldLight = m_light;
-        int light_type = m_light.type;
-        ImGui::ListBox("LightType", &light_type, s_lightTypeNames, ELT_COUNT);
-        m_light.type = static_cast<E_LIGHT_TYPE>(light_type);
-        if (m_light.type == ELT_DIRECTIONAL)
-        {
-          ImGui::SliderFloat3("Light Direction", &m_light.direction.x, -1.f, 1.f);
-        } else if (m_light.type == ELT_POINT)
-        {
-          ImGui::SliderFloat3("Light Position", &m_light.position.x, -20.f, 20.f);
-        } else if (m_light.type == ELT_SPOT)
-        {
-          ImGui::SliderFloat3("Light Direction", &m_light.direction.x, -1.f, 1.f);
-          ImGui::SliderFloat3("Light Position", &m_light.position.x, -20.f, 20.f);
-
-          float32_t dOuterCutoff = hlsl::degrees(acos(m_light.outerCutoff));
-          if (ImGui::SliderFloat("Light Outer Cutoff", &dOuterCutoff, 0.0f, 45.0f))
-          {
-            m_light.outerCutoff = cos(hlsl::radians(dOuterCutoff));
-          }
-        }
-        ImGui::Checkbox("Use Indirect Command", &m_useIndirectCommand);
-        if (m_light != m_oldLight)
-        {
-          m_frameAccumulationCounter = 0;
-        }
-
-        ImGui::Text("X: %f Y: %f", io.MousePos.x, io.MousePos.y);
-
-        ImGui::End();
-      }
-    );
-
-    // Set Camera
-    {
-      core::vectorSIMDf cameraPosition(0, 5, -10);
-      matrix4SIMD proj = matrix4SIMD::buildProjectionMatrixPerspectiveFovRH(
-        core::radians(60.0f),
-        WIN_W / WIN_H,
-        0.01f,
-        500.0f
-      );
-      m_camera = Camera(cameraPosition, core::vectorSIMDf(0, 0, 0), proj);
-    }
-
-    m_winMgr->setWindowSize(m_window.get(), WIN_W, WIN_H);
-    m_surface->recreateSwapchain();
-    m_winMgr->show(m_window.get());
-    m_oracle.reportBeginFrameRecord();
-    m_camera.mapKeysToWASD();
-
-    return true;
+	m_inputSystem = make_smart_refctd_ptr<InputSystem>(logger_opt_smart_ptr(smart_refctd_ptr(m_logger)));
+
+	if (!device_base_t::onAppInitialized(smart_refctd_ptr(system)))
+	  return false;
+
+	if (!asset_base_t::onAppInitialized(smart_refctd_ptr(system)))
+	  return false;
+
+	smart_refctd_ptr<IShaderCompiler::CCache> shaderReadCache = nullptr;
+	smart_refctd_ptr<IShaderCompiler::CCache> shaderWriteCache = core::make_smart_refctd_ptr<IShaderCompiler::CCache>();
+	auto shaderCachePath = localOutputCWD / "main_pipeline_shader_cache.bin";
+
+	{
+		core::smart_refctd_ptr<system::IFile> shaderReadCacheFile;
+		{
+			system::ISystem::future_t<core::smart_refctd_ptr<system::IFile>> future;
+			m_system->createFile(future, shaderCachePath.c_str(), system::IFile::ECF_READ);
+			if (future.wait())
+			{
+				future.acquire().move_into(shaderReadCacheFile);
+				if (shaderReadCacheFile)
+				{
+					const size_t size = shaderReadCacheFile->getSize();
+					if (size > 0ull)
+					{
+						std::vector<uint8_t> contents(size);
+						system::IFile::success_t succ;
+						shaderReadCacheFile->read(succ, contents.data(), 0, size);
+						if (succ)
+							shaderReadCache = IShaderCompiler::CCache::deserialize(contents);
+					}
+				}
+			}
+			else
+				m_logger->log("Failed Openning Shader Cache File.", ILogger::ELL_ERROR);
+		}
+
+	}
+
+	// Load Custom Shader
+	auto loadCompileAndCreateShader = [&](const std::string& relPath) -> smart_refctd_ptr<IGPUShader>
+		{
+			IAssetLoader::SAssetLoadParams lp = {};
+			lp.logger = m_logger.get();
+			lp.workingDirectory = ""; // virtual root
+			auto assetBundle = m_assetMgr->getAsset(relPath, lp);
+			const auto assets = assetBundle.getContents();
+			if (assets.empty())
+				return nullptr;
+
+			// lets go straight from ICPUSpecializedShader to IGPUSpecializedShader
+			auto sourceRaw = IAsset::castDown<ICPUShader>(assets[0]);
+			if (!sourceRaw)
+				return nullptr;
+
+			return m_device->createShader({ sourceRaw.get(), nullptr, shaderReadCache.get(), shaderWriteCache.get() });
+		};
+
+	// load shaders
+	const auto raygenShader = loadCompileAndCreateShader("app_resources/raytrace.rgen.hlsl");
+	const auto closestHitShader = loadCompileAndCreateShader("app_resources/raytrace.rchit.hlsl");
+	const auto proceduralClosestHitShader = loadCompileAndCreateShader("app_resources/raytrace_procedural.rchit.hlsl");
+	const auto intersectionHitShader = loadCompileAndCreateShader("app_resources/raytrace.rint.hlsl");
+	const auto anyHitShaderColorPayload = loadCompileAndCreateShader("app_resources/raytrace.rahit.hlsl");
+	const auto anyHitShaderShadowPayload = loadCompileAndCreateShader("app_resources/raytrace_shadow.rahit.hlsl");
+	const auto missShader = loadCompileAndCreateShader("app_resources/raytrace.rmiss.hlsl");
+	const auto missShadowShader = loadCompileAndCreateShader("app_resources/raytrace_shadow.rmiss.hlsl");
+	const auto directionalLightCallShader = loadCompileAndCreateShader("app_resources/light_directional.rcall.hlsl");
+	const auto pointLightCallShader = loadCompileAndCreateShader("app_resources/light_point.rcall.hlsl");
+	const auto spotLightCallShader = loadCompileAndCreateShader("app_resources/light_spot.rcall.hlsl");
+	const auto fragmentShader = loadCompileAndCreateShader("app_resources/present.frag.hlsl");
+
+	core::smart_refctd_ptr<system::IFile> shaderWriteCacheFile;
+	{
+		system::ISystem::future_t<core::smart_refctd_ptr<system::IFile>> future;
+		m_system->deleteFile(shaderCachePath); // temp solution instead of trimming, to make sure we won't have corrupted json
+		m_system->createFile(future, shaderCachePath.c_str(), system::IFile::ECF_WRITE);
+		if (future.wait())
+		{
+			future.acquire().move_into(shaderWriteCacheFile);
+			if (shaderWriteCacheFile)
+			{
+				auto serializedCache = shaderWriteCache->serialize();
+				if (shaderWriteCacheFile)
+				{
+					system::IFile::success_t succ;
+					shaderWriteCacheFile->write(succ, serializedCache->getPointer(), 0, serializedCache->getSize());
+					if (!succ)
+						m_logger->log("Failed Writing To Shader Cache File.", ILogger::ELL_ERROR);
+				}
+			}
+			else
+				m_logger->log("Failed Creating Shader Cache File.", ILogger::ELL_ERROR);
+		}
+		else
+			m_logger->log("Failed Creating Shader Cache File.", ILogger::ELL_ERROR);
+	}
+
+	m_semaphore = m_device->createSemaphore(m_realFrameIx);
+	if (!m_semaphore)
+	  return logFail("Failed to Create a Semaphore!");
+
+	auto gQueue = getGraphicsQueue();
+
+	// Create renderpass and init surface
+	nbl::video::IGPURenderpass* renderpass;
+	{
+	  ISwapchain::SCreationParams swapchainParams = { .surface = smart_refctd_ptr<ISurface>(m_surface->getSurface()) };
+	  if (!swapchainParams.deduceFormat(m_physicalDevice))
+		return logFail("Could not choose a Surface Format for the Swapchain!");
+
+	  const static IGPURenderpass::SCreationParams::SSubpassDependency dependencies[] =
+	  {
+		{
+		  .srcSubpass = IGPURenderpass::SCreationParams::SSubpassDependency::External,
+		  .dstSubpass = 0,
+		  .memoryBarrier =
+		  {
+			.srcStageMask = asset::PIPELINE_STAGE_FLAGS::COPY_BIT,
+			.srcAccessMask = asset::ACCESS_FLAGS::TRANSFER_WRITE_BIT,
+			.dstStageMask = asset::PIPELINE_STAGE_FLAGS::COLOR_ATTACHMENT_OUTPUT_BIT,
+			.dstAccessMask = asset::ACCESS_FLAGS::COLOR_ATTACHMENT_WRITE_BIT
+		  }
+		},
+		{
+		  .srcSubpass = 0,
+		  .dstSubpass = IGPURenderpass::SCreationParams::SSubpassDependency::External,
+		  .memoryBarrier =
+		  {
+			.srcStageMask = asset::PIPELINE_STAGE_FLAGS::COLOR_ATTACHMENT_OUTPUT_BIT,
+			.srcAccessMask = asset::ACCESS_FLAGS::COLOR_ATTACHMENT_WRITE_BIT
+		  }
+		},
+		IGPURenderpass::SCreationParams::DependenciesEnd
+	  };
+
+	  auto scResources = std::make_unique<CDefaultSwapchainFramebuffers>(m_device.get(), swapchainParams.surfaceFormat.format, dependencies);
+	  renderpass = scResources->getRenderpass();
+
+	  if (!renderpass)
+		return logFail("Failed to create Renderpass!");
+
+	  if (!m_surface || !m_surface->init(gQueue, std::move(scResources), swapchainParams.sharedParams))
+		return logFail("Could not create Window & Surface or initialize the Surface!");
+	}
+
+	auto pool = m_device->createCommandPool(gQueue->getFamilyIndex(), IGPUCommandPool::CREATE_FLAGS::RESET_COMMAND_BUFFER_BIT);
+
+	m_converter = CAssetConverter::create({ .device = m_device.get(), .optimizer = {} });
+
+	for (auto i = 0u; i < MaxFramesInFlight; i++)
+	{
+	  if (!pool)
+		return logFail("Couldn't create Command Pool!");
+	  if (!pool->createCommandBuffers(IGPUCommandPool::BUFFER_LEVEL::PRIMARY, { m_cmdBufs.data() + i, 1 }))
+		return logFail("Couldn't create Command Buffer!");
+	}
+
+	m_winMgr->setWindowSize(m_window.get(), WIN_W, WIN_H);
+	m_surface->recreateSwapchain();
+
+
+	// create output images
+	m_hdrImage = m_device->createImage({
+		{
+		  .type = IGPUImage::ET_2D,
+		  .samples = ICPUImage::ESCF_1_BIT,
+		  .format = EF_R16G16B16A16_SFLOAT,
+		  .extent = {WIN_W, WIN_H, 1},
+		  .mipLevels = 1,
+		  .arrayLayers = 1,
+		  .flags = IImage::ECF_NONE,
+		  .usage = bitflag(IImage::EUF_STORAGE_BIT) | IImage::EUF_TRANSFER_SRC_BIT | IImage::EUF_SAMPLED_BIT
+		}
+	  });
+
+	if (!m_hdrImage || !m_device->allocate(m_hdrImage->getMemoryReqs(), m_hdrImage.get()).isValid())
+	  return logFail("Could not create HDR Image");
+
+	m_hdrImageView = m_device->createImageView({
+	  .flags = IGPUImageView::ECF_NONE,
+	  .subUsages = IGPUImage::E_USAGE_FLAGS::EUF_STORAGE_BIT | IGPUImage::E_USAGE_FLAGS::EUF_SAMPLED_BIT,
+	  .image = m_hdrImage,
+	  .viewType = IGPUImageView::E_TYPE::ET_2D,
+	  .format = asset::EF_R16G16B16A16_SFLOAT
+	});
+
+
+
+	// ray trace pipeline and descriptor set layout setup
+	{
+	  const IGPUDescriptorSetLayout::SBinding bindings[] = {
+		{
+		  .binding = 0,
+		  .type = asset::IDescriptor::E_TYPE::ET_ACCELERATION_STRUCTURE,
+		  .createFlags = IGPUDescriptorSetLayout::SBinding::E_CREATE_FLAGS::ECF_NONE,
+		  .stageFlags = asset::IShader::E_SHADER_STAGE::ESS_RAYGEN,
+		  .count = 1,
+		},
+		{
+		  .binding = 1,
+		  .type = asset::IDescriptor::E_TYPE::ET_STORAGE_IMAGE,
+		  .createFlags = IGPUDescriptorSetLayout::SBinding::E_CREATE_FLAGS::ECF_NONE,
+		  .stageFlags = asset::IShader::E_SHADER_STAGE::ESS_RAYGEN,
+		  .count = 1,
+		}
+	  };
+	  const auto descriptorSetLayout = m_device->createDescriptorSetLayout(bindings);
+
+	  const std::array<IGPUDescriptorSetLayout*, ICPUPipelineLayout::DESCRIPTOR_SET_COUNT> dsLayoutPtrs = { descriptorSetLayout.get() };
+	  m_rayTracingDsPool = m_device->createDescriptorPoolForDSLayouts(IDescriptorPool::ECF_UPDATE_AFTER_BIND_BIT, std::span(dsLayoutPtrs.begin(), dsLayoutPtrs.end()));
+	  m_rayTracingDs = m_rayTracingDsPool->createDescriptorSet(descriptorSetLayout);
+
+	  const SPushConstantRange pcRange = {
+		.stageFlags = IShader::E_SHADER_STAGE::ESS_ALL_RAY_TRACING,
+		.offset = 0u,
+		.size = sizeof(SPushConstants),
+	  };
+	  const auto pipelineLayout = m_device->createPipelineLayout({ &pcRange, 1 }, smart_refctd_ptr(descriptorSetLayout), nullptr, nullptr, nullptr);
+
+	  IGPURayTracingPipeline::SCreationParams params = {};
+
+	  enum RtDemoShader
+	  {
+		RTDS_RAYGEN,
+		RTDS_MISS,
+		RTDS_MISS_SHADOW,
+		RTDS_CLOSEST_HIT,
+		RTDS_SPHERE_CLOSEST_HIT,
+		RTDS_ANYHIT_PRIMARY,
+		RTDS_ANYHIT_SHADOW,
+		RTDS_INTERSECTION,
+		RTDS_DIRECTIONAL_CALL,
+		RTDS_POINT_CALL,
+		RTDS_SPOT_CALL,
+		RTDS_COUNT
+	  };
+
+	  IGPUShader::SSpecInfo shaders[RTDS_COUNT];
+	  shaders[RTDS_RAYGEN] = {.shader = raygenShader.get()};
+	  shaders[RTDS_MISS] = {.shader = missShader.get()};
+	  shaders[RTDS_MISS_SHADOW] = { .shader = missShadowShader.get() };
+	  shaders[RTDS_CLOSEST_HIT] = {.shader = closestHitShader.get()};
+	  shaders[RTDS_SPHERE_CLOSEST_HIT] = {.shader = proceduralClosestHitShader.get()};
+	  shaders[RTDS_ANYHIT_PRIMARY] = {.shader = anyHitShaderColorPayload.get()};
+	  shaders[RTDS_ANYHIT_SHADOW] = {.shader = anyHitShaderShadowPayload.get()};
+	  shaders[RTDS_INTERSECTION] = {.shader = intersectionHitShader.get() };
+	  shaders[RTDS_DIRECTIONAL_CALL] = {.shader = directionalLightCallShader.get()};
+	  shaders[RTDS_POINT_CALL] = {.shader = pointLightCallShader.get()};
+	  shaders[RTDS_SPOT_CALL] = {.shader = spotLightCallShader.get()};
+
+	  params.layout = pipelineLayout.get();
+	  params.shaders = std::span(shaders);
+	  using RayTracingFlags = IGPURayTracingPipeline::SCreationParams::FLAGS;
+	  params.flags = core::bitflag(RayTracingFlags::NO_NULL_MISS_SHADERS) |
+		RayTracingFlags::NO_NULL_INTERSECTION_SHADERS | 
+		RayTracingFlags::NO_NULL_ANY_HIT_SHADERS;
+
+	  auto& shaderGroups = params.shaderGroups;
+
+	  shaderGroups.raygen = { .index = RTDS_RAYGEN };
+
+	  IRayTracingPipelineBase::SGeneralShaderGroup missGroups[EMT_COUNT];
+	  missGroups[EMT_PRIMARY] = { .index = RTDS_MISS };
+	  missGroups[EMT_OCCLUSION] = { .index = RTDS_MISS_SHADOW };
+	  shaderGroups.misses = missGroups;
+
+	  auto getHitGroupIndex = [](E_GEOM_TYPE geomType, E_RAY_TYPE rayType)
+		{
+		  return geomType * ERT_COUNT + rayType;
+		};
+	  IRayTracingPipelineBase::SHitShaderGroup hitGroups[E_RAY_TYPE::ERT_COUNT * E_GEOM_TYPE::EGT_COUNT];
+	  hitGroups[getHitGroupIndex(EGT_TRIANGLES, ERT_PRIMARY)] = {
+		.closestHit = RTDS_CLOSEST_HIT,
+		.anyHit = RTDS_ANYHIT_PRIMARY,
+	  };
+	  hitGroups[getHitGroupIndex(EGT_TRIANGLES, ERT_OCCLUSION)] = {
+		.closestHit = IGPURayTracingPipeline::SGeneralShaderGroup::Unused,
+		.anyHit = RTDS_ANYHIT_SHADOW,
+	  };
+	  hitGroups[getHitGroupIndex(EGT_PROCEDURAL, ERT_PRIMARY)] = {
+		.closestHit = RTDS_SPHERE_CLOSEST_HIT,
+		.anyHit = RTDS_ANYHIT_PRIMARY,
+		.intersection = RTDS_INTERSECTION,
+	  };
+	  hitGroups[getHitGroupIndex(EGT_PROCEDURAL, ERT_OCCLUSION)] = {
+		.closestHit = IGPURayTracingPipeline::SGeneralShaderGroup::Unused,
+		.anyHit = RTDS_ANYHIT_SHADOW,
+		.intersection = RTDS_INTERSECTION,
+	  };
+	  shaderGroups.hits = hitGroups;
+
+	  IRayTracingPipelineBase::SGeneralShaderGroup callableGroups[ELT_COUNT];
+	  callableGroups[ELT_DIRECTIONAL] = { .index = RTDS_DIRECTIONAL_CALL };
+	  callableGroups[ELT_POINT] = { .index = RTDS_POINT_CALL };
+	  callableGroups[ELT_SPOT] = { .index = RTDS_SPOT_CALL };
+	  shaderGroups.callables = callableGroups;
+
+	  params.cached.maxRecursionDepth = 1;
+	  params.cached.dynamicStackSize = true;
+
+	  if (!m_device->createRayTracingPipelines(nullptr, { &params, 1 }, &m_rayTracingPipeline))
+		return logFail("Failed to create ray tracing pipeline");
+
+	  calculateRayTracingStackSize(m_rayTracingPipeline);
+	  
+	  if (!createShaderBindingTable(gQueue, m_rayTracingPipeline))
+		return logFail("Could not create shader binding table");
+
+	}
+
+	auto assetManager = make_smart_refctd_ptr<nbl::asset::IAssetManager>(smart_refctd_ptr(system));
+	auto* geometryCreator = assetManager->getGeometryCreator();
+
+	if (!createIndirectBuffer(gQueue))
+	  return logFail("Could not create indirect buffer");
+
+	// create geometry objects
+	if (!createGeometries(gQueue, geometryCreator))
+	  return logFail("Could not create geometries from geometry creator");
+
+	if (!createAccelerationStructures(getComputeQueue()))
+	  return logFail("Could not create acceleration structures");
+
+	ISampler::SParams samplerParams = {
+	  .AnisotropicFilter = 0
+	};
+	auto defaultSampler = m_device->createSampler(samplerParams);
+
+	{
+	  const IGPUDescriptorSetLayout::SBinding bindings[] = {
+		{
+		  .binding = 0u,
+		  .type = nbl::asset::IDescriptor::E_TYPE::ET_COMBINED_IMAGE_SAMPLER,
+		  .createFlags = ICPUDescriptorSetLayout::SBinding::E_CREATE_FLAGS::ECF_NONE,
+		  .stageFlags = IShader::E_SHADER_STAGE::ESS_FRAGMENT,
+		  .count = 1u,
+		  .immutableSamplers = &defaultSampler
+		}
+	  };
+	  auto gpuPresentDescriptorSetLayout = m_device->createDescriptorSetLayout(bindings);
+	  const video::IGPUDescriptorSetLayout* const layouts[] = { gpuPresentDescriptorSetLayout.get() };
+	  const uint32_t setCounts[] = { 1u };
+	  m_presentDsPool = m_device->createDescriptorPoolForDSLayouts(IDescriptorPool::E_CREATE_FLAGS::ECF_NONE, layouts, setCounts);
+	  m_presentDs = m_presentDsPool->createDescriptorSet(gpuPresentDescriptorSetLayout);
+
+	  auto scRes = static_cast<CDefaultSwapchainFramebuffers*>(m_surface->getSwapchainResources());
+	  ext::FullScreenTriangle::ProtoPipeline fsTriProtoPPln(m_assetMgr.get(), m_device.get(), m_logger.get());
+	  if (!fsTriProtoPPln)
+		return logFail("Failed to create Full Screen Triangle protopipeline or load its vertex shader!");
+
+	  const IGPUShader::SSpecInfo fragSpec = {
+		.entryPoint = "main",
+		.shader = fragmentShader.get()
+	  };
+
+	  auto presentLayout = m_device->createPipelineLayout(
+		{},
+		core::smart_refctd_ptr(gpuPresentDescriptorSetLayout),
+		nullptr,
+		nullptr,
+		nullptr
+	  );
+	  m_presentPipeline = fsTriProtoPPln.createPipeline(fragSpec, presentLayout.get(), scRes->getRenderpass());
+	  if (!m_presentPipeline)
+		return logFail("Could not create Graphics Pipeline!");
+	}
+
+	// write descriptors
+	IGPUDescriptorSet::SDescriptorInfo infos[3];
+	infos[0].desc = m_gpuTlas;
+
+	infos[1].desc = m_hdrImageView;
+	if (!infos[1].desc)
+	  return logFail("Failed to create image view");
+	infos[1].info.image.imageLayout = IImage::LAYOUT::GENERAL;
+
+	infos[2].desc = m_hdrImageView;
+	infos[2].info.image.imageLayout = IImage::LAYOUT::READ_ONLY_OPTIMAL;
+
+	IGPUDescriptorSet::SWriteDescriptorSet writes[] = {
+		{.dstSet = m_rayTracingDs.get(), .binding = 0, .arrayElement = 0, .count = 1, .info = &infos[0]},
+		{.dstSet = m_rayTracingDs.get(), .binding = 1, .arrayElement = 0, .count = 1, .info = &infos[1]},
+		{.dstSet = m_presentDs.get(), .binding = 0, .arrayElement = 0, .count = 1, .info = &infos[2] },
+	};
+	m_device->updateDescriptorSets(std::span(writes), {});
+
+	// gui descriptor setup
+	{
+	  using binding_flags_t = IGPUDescriptorSetLayout::SBinding::E_CREATE_FLAGS;
+	  {
+		IGPUSampler::SParams params;
+		params.AnisotropicFilter = 1u;
+		params.TextureWrapU = ETC_REPEAT;
+		params.TextureWrapV = ETC_REPEAT;
+		params.TextureWrapW = ETC_REPEAT;
+
+		m_ui.samplers.gui = m_device->createSampler(params);
+		m_ui.samplers.gui->setObjectDebugName("Nabla IMGUI UI Sampler");
+	  }
+
+	  std::array<core::smart_refctd_ptr<IGPUSampler>, 69u> immutableSamplers;
+	  for (auto& it : immutableSamplers)
+		it = smart_refctd_ptr(m_ui.samplers.scene);
+
+	  immutableSamplers[nbl::ext::imgui::UI::FontAtlasTexId] = smart_refctd_ptr(m_ui.samplers.gui);
+
+	  nbl::ext::imgui::UI::SCreationParameters params;
+
+	  params.resources.texturesInfo = { .setIx = 0u, .bindingIx = 0u };
+	  params.resources.samplersInfo = { .setIx = 0u, .bindingIx = 1u };
+	  params.assetManager = m_assetMgr;
+	  params.pipelineCache = nullptr;
+	  params.pipelineLayout = nbl::ext::imgui::UI::createDefaultPipelineLayout(m_utils->getLogicalDevice(), params.resources.texturesInfo, params.resources.samplersInfo, MaxUITextureCount);
+	  params.renderpass = smart_refctd_ptr<IGPURenderpass>(renderpass);
+	  params.streamingBuffer = nullptr;
+	  params.subpassIx = 0u;
+	  params.transfer = getTransferUpQueue();
+	  params.utilities = m_utils;
+	  {
+		m_ui.manager = ext::imgui::UI::create(std::move(params));
+
+		// note that we use default layout provided by our extension, but you are free to create your own by filling nbl::ext::imgui::UI::S_CREATION_PARAMETERS::resources
+		const auto* descriptorSetLayout = m_ui.manager->getPipeline()->getLayout()->getDescriptorSetLayout(0u);
+		const auto& params = m_ui.manager->getCreationParameters();
+
+		IDescriptorPool::SCreateInfo descriptorPoolInfo = {};
+		descriptorPoolInfo.maxDescriptorCount[static_cast<uint32_t>(asset::IDescriptor::E_TYPE::ET_SAMPLER)] = (uint32_t)nbl::ext::imgui::UI::DefaultSamplerIx::COUNT;
+		descriptorPoolInfo.maxDescriptorCount[static_cast<uint32_t>(asset::IDescriptor::E_TYPE::ET_SAMPLED_IMAGE)] = MaxUITextureCount;
+		descriptorPoolInfo.maxSets = 1u;
+		descriptorPoolInfo.flags = IDescriptorPool::E_CREATE_FLAGS::ECF_UPDATE_AFTER_BIND_BIT;
+
+		m_guiDescriptorSetPool = m_device->createDescriptorPool(std::move(descriptorPoolInfo));
+		assert(m_guiDescriptorSetPool);
+
+		m_guiDescriptorSetPool->createDescriptorSets(1u, &descriptorSetLayout, &m_ui.descriptorSet);
+		assert(m_ui.descriptorSet);
+	  }
+	}
+
+	m_ui.manager->registerListener(
+	  [this]() -> void {
+		ImGuiIO& io = ImGui::GetIO();
+
+		m_camera.setProjectionMatrix([&]()
+		{
+		  static matrix4SIMD projection;
+
+		  projection = matrix4SIMD::buildProjectionMatrixPerspectiveFovRH(
+			core::radians(m_cameraSetting.fov), 
+			io.DisplaySize.x / io.DisplaySize.y, 
+			m_cameraSetting.zNear, 
+			m_cameraSetting.zFar);
+
+		  return projection;
+		}());
+
+		ImGui::SetNextWindowPos(ImVec2(1024, 100), ImGuiCond_Appearing);
+		ImGui::SetNextWindowSize(ImVec2(256, 256), ImGuiCond_Appearing);
+
+		// create a window and insert the inspector
+		ImGui::SetNextWindowPos(ImVec2(10, 10), ImGuiCond_Appearing);
+		ImGui::SetNextWindowSize(ImVec2(320, 340), ImGuiCond_Appearing);
+		ImGui::Begin("Controls");
+
+		ImGui::SameLine();
+
+		ImGui::Text("Camera");
+
+		ImGui::SliderFloat("Move speed", &m_cameraSetting.moveSpeed, 0.1f, 10.f);
+		ImGui::SliderFloat("Rotate speed", &m_cameraSetting.rotateSpeed, 0.1f, 10.f);
+		ImGui::SliderFloat("Fov", &m_cameraSetting.fov, 20.f, 150.f);
+		ImGui::SliderFloat("zNear", &m_cameraSetting.zNear, 0.1f, 100.f);
+		ImGui::SliderFloat("zFar", &m_cameraSetting.zFar, 110.f, 10000.f);
+		Light m_oldLight = m_light;
+		int light_type = m_light.type;
+		ImGui::ListBox("LightType", &light_type, s_lightTypeNames, ELT_COUNT);
+		m_light.type = static_cast<E_LIGHT_TYPE>(light_type);
+		if (m_light.type == ELT_DIRECTIONAL)
+		{
+		  ImGui::SliderFloat3("Light Direction", &m_light.direction.x, -1.f, 1.f);
+		} else if (m_light.type == ELT_POINT)
+		{
+		  ImGui::SliderFloat3("Light Position", &m_light.position.x, -20.f, 20.f);
+		} else if (m_light.type == ELT_SPOT)
+		{
+		  ImGui::SliderFloat3("Light Direction", &m_light.direction.x, -1.f, 1.f);
+		  ImGui::SliderFloat3("Light Position", &m_light.position.x, -20.f, 20.f);
+
+		  float32_t dOuterCutoff = hlsl::degrees(acos(m_light.outerCutoff));
+		  if (ImGui::SliderFloat("Light Outer Cutoff", &dOuterCutoff, 0.0f, 45.0f))
+		  {
+			m_light.outerCutoff = cos(hlsl::radians(dOuterCutoff));
+		  }
+		}
+		ImGui::Checkbox("Use Indirect Command", &m_useIndirectCommand);
+		if (m_light != m_oldLight)
+		{
+		  m_frameAccumulationCounter = 0;
+		}
+
+		ImGui::Text("X: %f Y: %f", io.MousePos.x, io.MousePos.y);
+
+		ImGui::End();
+	  }
+	);
+
+	// Set Camera
+	{
+	  core::vectorSIMDf cameraPosition(0, 5, -10);
+	  matrix4SIMD proj = matrix4SIMD::buildProjectionMatrixPerspectiveFovRH(
+		core::radians(60.0f),
+		WIN_W / WIN_H,
+		0.01f,
+		500.0f
+	  );
+	  m_camera = Camera(cameraPosition, core::vectorSIMDf(0, 0, 0), proj);
+	}
+
+	m_winMgr->setWindowSize(m_window.get(), WIN_W, WIN_H);
+	m_surface->recreateSwapchain();
+	m_winMgr->show(m_window.get());
+	m_oracle.reportBeginFrameRecord();
+	m_camera.mapKeysToWASD();
+
+	return true;
   }
 
   bool updateGUIDescriptorSet()
   {
-    // texture atlas, note we don't create info & write pair for the font sampler because UI extension's is immutable and baked into DS layout
-    static std::array<IGPUDescriptorSet::SDescriptorInfo, MaxUITextureCount> descriptorInfo;
-    static IGPUDescriptorSet::SWriteDescriptorSet writes[MaxUITextureCount];
-
-    descriptorInfo[nbl::ext::imgui::UI::FontAtlasTexId].info.image.imageLayout = IImage::LAYOUT::READ_ONLY_OPTIMAL;
-    descriptorInfo[nbl::ext::imgui::UI::FontAtlasTexId].desc = smart_refctd_ptr<IGPUImageView>(m_ui.manager->getFontAtlasView());
-
-    for (uint32_t i = 0; i < descriptorInfo.size(); ++i)
-    {
-      writes[i].dstSet = m_ui.descriptorSet.get();
-      writes[i].binding = 0u;
-      writes[i].arrayElement = i;
-      writes[i].count = 1u;
-    }
-    writes[nbl::ext::imgui::UI::FontAtlasTexId].info = descriptorInfo.data() + nbl::ext::imgui::UI::FontAtlasTexId;
-
-    return m_device->updateDescriptorSets(writes, {});
+	// texture atlas, note we don't create info & write pair for the font sampler because UI extension's is immutable and baked into DS layout
+	static std::array<IGPUDescriptorSet::SDescriptorInfo, MaxUITextureCount> descriptorInfo;
+	static IGPUDescriptorSet::SWriteDescriptorSet writes[MaxUITextureCount];
+
+	descriptorInfo[nbl::ext::imgui::UI::FontAtlasTexId].info.image.imageLayout = IImage::LAYOUT::READ_ONLY_OPTIMAL;
+	descriptorInfo[nbl::ext::imgui::UI::FontAtlasTexId].desc = smart_refctd_ptr<IGPUImageView>(m_ui.manager->getFontAtlasView());
+
+	for (uint32_t i = 0; i < descriptorInfo.size(); ++i)
+	{
+	  writes[i].dstSet = m_ui.descriptorSet.get();
+	  writes[i].binding = 0u;
+	  writes[i].arrayElement = i;
+	  writes[i].count = 1u;
+	}
+	writes[nbl::ext::imgui::UI::FontAtlasTexId].info = descriptorInfo.data() + nbl::ext::imgui::UI::FontAtlasTexId;
+
+	return m_device->updateDescriptorSets(writes, {});
   }
 
   inline void workLoopBody() override
   {
-    // framesInFlight: ensuring safe execution of command buffers and acquires, `framesInFlight` only affect semaphore waits, don't use this to index your resources because it can change with swapchain recreation.
-    const uint32_t framesInFlight = core::min(MaxFramesInFlight, m_surface->getMaxAcquiresInFlight());
-    // We block for semaphores for 2 reasons here:
-      // A) Resource: Can't use resource like a command buffer BEFORE previous use is finished! [MaxFramesInFlight]
-      // B) Acquire: Can't have more acquires in flight than a certain threshold returned by swapchain or your surface helper class. [MaxAcquiresInFlight]
-    if (m_realFrameIx >= framesInFlight)
-    {
-      const ISemaphore::SWaitInfo cbDonePending[] = 
-      {
-        {
-          .semaphore = m_semaphore.get(),
-          .value = m_realFrameIx + 1 - framesInFlight
-        }
-      };
-      if (m_device->blockForSemaphores(cbDonePending) != ISemaphore::WAIT_RESULT::SUCCESS)
-        return;
-    }
-    const auto resourceIx = m_realFrameIx % MaxFramesInFlight;
-
-    m_api->startCapture();
-
-    update();
-
-    auto queue = getGraphicsQueue();
-    auto cmdbuf = m_cmdBufs[resourceIx].get();
-
-    if (!keepRunning())
-      return;
-
-    cmdbuf->reset(IGPUCommandBuffer::RESET_FLAGS::RELEASE_RESOURCES_BIT);
-    cmdbuf->begin(IGPUCommandBuffer::USAGE::ONE_TIME_SUBMIT_BIT);
-    cmdbuf->beginDebugMarker("RaytracingPipelineApp Frame");
-
-    const auto viewMatrix = m_camera.getViewMatrix();
-    const auto projectionMatrix = m_camera.getProjectionMatrix();
-    const auto viewProjectionMatrix = m_camera.getConcatenatedMatrix();
-
-    core::matrix3x4SIMD modelMatrix;
-    modelMatrix.setTranslation(nbl::core::vectorSIMDf(0, 0, 0, 0));
-    modelMatrix.setRotation(quaternion(0, 0, 0));
-
-    core::matrix4SIMD modelViewProjectionMatrix = core::concatenateBFollowedByA(viewProjectionMatrix, modelMatrix);
-    if (m_cachedModelViewProjectionMatrix != modelViewProjectionMatrix)
-    {
-      m_frameAccumulationCounter = 0;
-      m_cachedModelViewProjectionMatrix = modelViewProjectionMatrix;
-    }
-    core::matrix4SIMD invModelViewProjectionMatrix;
-    modelViewProjectionMatrix.getInverseTransform(invModelViewProjectionMatrix);
-
-    {
-      IGPUCommandBuffer::SPipelineBarrierDependencyInfo::image_barrier_t imageBarriers[1];
-      imageBarriers[0].barrier = {
-         .dep = {
-           .srcStageMask = PIPELINE_STAGE_FLAGS::FRAGMENT_SHADER_BIT, // previous frame read from framgent shader
-           .srcAccessMask = ACCESS_FLAGS::SHADER_READ_BITS,
-           .dstStageMask = PIPELINE_STAGE_FLAGS::RAY_TRACING_SHADER_BIT,
-           .dstAccessMask = ACCESS_FLAGS::SHADER_WRITE_BITS
-        }
-      };
-      imageBarriers[0].image = m_hdrImage.get();
-      imageBarriers[0].subresourceRange = {
-        .aspectMask = IImage::EAF_COLOR_BIT,
-        .baseMipLevel = 0u,
-        .levelCount = 1u,
-        .baseArrayLayer = 0u,
-        .layerCount = 1u
-      };
-      imageBarriers[0].oldLayout = m_frameAccumulationCounter == 0 ? IImage::LAYOUT::UNDEFINED : IImage::LAYOUT::READ_ONLY_OPTIMAL;
-      imageBarriers[0].newLayout = IImage::LAYOUT::GENERAL;
-      cmdbuf->pipelineBarrier(E_DEPENDENCY_FLAGS::EDF_NONE, { .imgBarriers = imageBarriers });
-    }
-
-    // Trace Rays Pass
-    {
-      SPushConstants pc;
-      pc.light = m_light;
-      pc.proceduralGeomInfoBuffer = m_proceduralGeomInfoBuffer->getDeviceAddress();
-      pc.triangleGeomInfoBuffer = m_triangleGeomInfoBuffer->getDeviceAddress();
-      pc.frameCounter = m_frameAccumulationCounter;
-      const core::vector3df camPos = m_camera.getPosition().getAsVector3df();
-      pc.camPos = { camPos.X, camPos.Y, camPos.Z };
-      memcpy(&pc.invMVP, invModelViewProjectionMatrix.pointer(), sizeof(pc.invMVP));
-
-      cmdbuf->bindRayTracingPipeline(m_rayTracingPipeline.get());
-      cmdbuf->setRayTracingPipelineStackSize(m_rayTracingStackSize);
-      cmdbuf->pushConstants(m_rayTracingPipeline->getLayout(), IShader::E_SHADER_STAGE::ESS_ALL_RAY_TRACING, 0, sizeof(SPushConstants), &pc);
-      cmdbuf->bindDescriptorSets(EPBP_RAY_TRACING, m_rayTracingPipeline->getLayout(), 0, 1, &m_rayTracingDs.get());
-      if (m_useIndirectCommand)
-      {
-        cmdbuf->traceRaysIndirect(
-          SBufferBinding<const IGPUBuffer>{
-            .offset = 0,
-            .buffer = m_indirectBuffer,
-          });
-      }else
-      {
-        cmdbuf->traceRays(
-          m_shaderBindingTable.raygenGroupRange,
-          m_shaderBindingTable.missGroupsRange, m_shaderBindingTable.missGroupsStride,
-          m_shaderBindingTable.hitGroupsRange, m_shaderBindingTable.hitGroupsStride,
-          m_shaderBindingTable.callableGroupsRange, m_shaderBindingTable.callableGroupsStride,
-          WIN_W, WIN_H, 1);
-      }
-    }
-
-    // pipeline barrier
-    {
-      IGPUCommandBuffer::SPipelineBarrierDependencyInfo::image_barrier_t imageBarriers[1];
-      imageBarriers[0].barrier = {
-        .dep = {
-          .srcStageMask = PIPELINE_STAGE_FLAGS::RAY_TRACING_SHADER_BIT,
-          .srcAccessMask = ACCESS_FLAGS::SHADER_WRITE_BITS,
-          .dstStageMask = PIPELINE_STAGE_FLAGS::COLOR_ATTACHMENT_OUTPUT_BIT,
-          .dstAccessMask = ACCESS_FLAGS::COLOR_ATTACHMENT_WRITE_BIT
-        }
-      };
-      imageBarriers[0].image = m_hdrImage.get();
-      imageBarriers[0].subresourceRange = {
-        .aspectMask = IImage::EAF_COLOR_BIT,
-        .baseMipLevel = 0u,
-        .levelCount = 1u,
-        .baseArrayLayer = 0u,
-        .layerCount = 1u
-      };
-      imageBarriers[0].oldLayout = IImage::LAYOUT::GENERAL;
-      imageBarriers[0].newLayout = IImage::LAYOUT::READ_ONLY_OPTIMAL;
-
-      cmdbuf->pipelineBarrier(E_DEPENDENCY_FLAGS::EDF_NONE, { .imgBarriers = imageBarriers });
-    }
-
-    {
+	// framesInFlight: ensuring safe execution of command buffers and acquires, `framesInFlight` only affect semaphore waits, don't use this to index your resources because it can change with swapchain recreation.
+	const uint32_t framesInFlight = core::min(MaxFramesInFlight, m_surface->getMaxAcquiresInFlight());
+	// We block for semaphores for 2 reasons here:
+	  // A) Resource: Can't use resource like a command buffer BEFORE previous use is finished! [MaxFramesInFlight]
+	  // B) Acquire: Can't have more acquires in flight than a certain threshold returned by swapchain or your surface helper class. [MaxAcquiresInFlight]
+	if (m_realFrameIx >= framesInFlight)
+	{
+	  const ISemaphore::SWaitInfo cbDonePending[] = 
+	  {
+		{
+		  .semaphore = m_semaphore.get(),
+		  .value = m_realFrameIx + 1 - framesInFlight
+		}
+	  };
+	  if (m_device->blockForSemaphores(cbDonePending) != ISemaphore::WAIT_RESULT::SUCCESS)
+		return;
+	}
+	const auto resourceIx = m_realFrameIx % MaxFramesInFlight;
+
+	m_api->startCapture();
+
+	update();
+
+	auto queue = getGraphicsQueue();
+	auto cmdbuf = m_cmdBufs[resourceIx].get();
+
+	if (!keepRunning())
+	  return;
+
+	cmdbuf->reset(IGPUCommandBuffer::RESET_FLAGS::RELEASE_RESOURCES_BIT);
+	cmdbuf->begin(IGPUCommandBuffer::USAGE::ONE_TIME_SUBMIT_BIT);
+	cmdbuf->beginDebugMarker("RaytracingPipelineApp Frame");
+
+	const auto viewMatrix = m_camera.getViewMatrix();
+	const auto projectionMatrix = m_camera.getProjectionMatrix();
+	const auto viewProjectionMatrix = m_camera.getConcatenatedMatrix();
+
+	core::matrix3x4SIMD modelMatrix;
+	modelMatrix.setTranslation(nbl::core::vectorSIMDf(0, 0, 0, 0));
+	modelMatrix.setRotation(quaternion(0, 0, 0));
+
+	core::matrix4SIMD modelViewProjectionMatrix = core::concatenateBFollowedByA(viewProjectionMatrix, modelMatrix);
+	if (m_cachedModelViewProjectionMatrix != modelViewProjectionMatrix)
+	{
+	  m_frameAccumulationCounter = 0;
+	  m_cachedModelViewProjectionMatrix = modelViewProjectionMatrix;
+	}
+	core::matrix4SIMD invModelViewProjectionMatrix;
+	modelViewProjectionMatrix.getInverseTransform(invModelViewProjectionMatrix);
+
+	{
+	  IGPUCommandBuffer::SPipelineBarrierDependencyInfo::image_barrier_t imageBarriers[1];
+	  imageBarriers[0].barrier = {
+		 .dep = {
+		   .srcStageMask = PIPELINE_STAGE_FLAGS::FRAGMENT_SHADER_BIT, // previous frame read from framgent shader
+		   .srcAccessMask = ACCESS_FLAGS::SHADER_READ_BITS,
+		   .dstStageMask = PIPELINE_STAGE_FLAGS::RAY_TRACING_SHADER_BIT,
+		   .dstAccessMask = ACCESS_FLAGS::SHADER_WRITE_BITS
+		}
+	  };
+	  imageBarriers[0].image = m_hdrImage.get();
+	  imageBarriers[0].subresourceRange = {
+		.aspectMask = IImage::EAF_COLOR_BIT,
+		.baseMipLevel = 0u,
+		.levelCount = 1u,
+		.baseArrayLayer = 0u,
+		.layerCount = 1u
+	  };
+	  imageBarriers[0].oldLayout = m_frameAccumulationCounter == 0 ? IImage::LAYOUT::UNDEFINED : IImage::LAYOUT::READ_ONLY_OPTIMAL;
+	  imageBarriers[0].newLayout = IImage::LAYOUT::GENERAL;
+	  cmdbuf->pipelineBarrier(E_DEPENDENCY_FLAGS::EDF_NONE, { .imgBarriers = imageBarriers });
+	}
+
+	// Trace Rays Pass
+	{
+	  SPushConstants pc;
+	  pc.light = m_light;
+	  pc.proceduralGeomInfoBuffer = m_proceduralGeomInfoBuffer->getDeviceAddress();
+	  pc.triangleGeomInfoBuffer = m_triangleGeomInfoBuffer->getDeviceAddress();
+	  pc.frameCounter = m_frameAccumulationCounter;
+	  const core::vector3df camPos = m_camera.getPosition().getAsVector3df();
+	  pc.camPos = { camPos.X, camPos.Y, camPos.Z };
+	  memcpy(&pc.invMVP, invModelViewProjectionMatrix.pointer(), sizeof(pc.invMVP));
+
+	  cmdbuf->bindRayTracingPipeline(m_rayTracingPipeline.get());
+	  cmdbuf->setRayTracingPipelineStackSize(m_rayTracingStackSize);
+	  cmdbuf->pushConstants(m_rayTracingPipeline->getLayout(), IShader::E_SHADER_STAGE::ESS_ALL_RAY_TRACING, 0, sizeof(SPushConstants), &pc);
+	  cmdbuf->bindDescriptorSets(EPBP_RAY_TRACING, m_rayTracingPipeline->getLayout(), 0, 1, &m_rayTracingDs.get());
+	  if (m_useIndirectCommand)
+	  {
+		cmdbuf->traceRaysIndirect(
+		  SBufferBinding<const IGPUBuffer>{
+			.offset = 0,
+			.buffer = m_indirectBuffer,
+		  });
+	  }else
+	  {
+		cmdbuf->traceRays(
+		  m_shaderBindingTable.raygenGroupRange,
+		  m_shaderBindingTable.missGroupsRange, m_shaderBindingTable.missGroupsStride,
+		  m_shaderBindingTable.hitGroupsRange, m_shaderBindingTable.hitGroupsStride,
+		  m_shaderBindingTable.callableGroupsRange, m_shaderBindingTable.callableGroupsStride,
+		  WIN_W, WIN_H, 1);
+	  }
+	}
+
+	// pipeline barrier
+	{
+	  IGPUCommandBuffer::SPipelineBarrierDependencyInfo::image_barrier_t imageBarriers[1];
+	  imageBarriers[0].barrier = {
+		.dep = {
+		  .srcStageMask = PIPELINE_STAGE_FLAGS::RAY_TRACING_SHADER_BIT,
+		  .srcAccessMask = ACCESS_FLAGS::SHADER_WRITE_BITS,
+		  .dstStageMask = PIPELINE_STAGE_FLAGS::COLOR_ATTACHMENT_OUTPUT_BIT,
+		  .dstAccessMask = ACCESS_FLAGS::COLOR_ATTACHMENT_WRITE_BIT
+		}
+	  };
+	  imageBarriers[0].image = m_hdrImage.get();
+	  imageBarriers[0].subresourceRange = {
+		.aspectMask = IImage::EAF_COLOR_BIT,
+		.baseMipLevel = 0u,
+		.levelCount = 1u,
+		.baseArrayLayer = 0u,
+		.layerCount = 1u
+	  };
+	  imageBarriers[0].oldLayout = IImage::LAYOUT::GENERAL;
+	  imageBarriers[0].newLayout = IImage::LAYOUT::READ_ONLY_OPTIMAL;
+
+	  cmdbuf->pipelineBarrier(E_DEPENDENCY_FLAGS::EDF_NONE, { .imgBarriers = imageBarriers });
+	}
+
+	{
 			asset::SViewport viewport;
 			{
 				viewport.minDepth = 1.f;
@@ -795,993 +797,999 @@ class RaytracingPipelineApp final : public examples::SimpleWindowedApplication,
 			VkRect2D defaultScisors[] = { {.offset = {(int32_t)viewport.x, (int32_t)viewport.y}, .extent = {(uint32_t)viewport.width, (uint32_t)viewport.height}} };
 			cmdbuf->setScissor(defaultScisors);
 
-      auto scRes = static_cast<CDefaultSwapchainFramebuffers*>(m_surface->getSwapchainResources());
-      const VkRect2D currentRenderArea =
-      {
-        .offset = {0,0},
-        .extent = {m_window->getWidth(),m_window->getHeight()}
-      };
-      const IGPUCommandBuffer::SClearColorValue clearColor = { .float32 = {0.f,0.f,0.f,1.f} };
-      const IGPUCommandBuffer::SRenderpassBeginInfo info =
-      {
-        .framebuffer = scRes->getFramebuffer(m_currentImageAcquire.imageIndex),
-        .colorClearValues = &clearColor,
-        .depthStencilClearValues = nullptr,
-        .renderArea = currentRenderArea
-      };
-      nbl::video::ISemaphore::SWaitInfo waitInfo = { .semaphore = m_semaphore.get(), .value = m_realFrameIx + 1u };
-
-      cmdbuf->beginRenderPass(info, IGPUCommandBuffer::SUBPASS_CONTENTS::INLINE);
-
-      cmdbuf->bindGraphicsPipeline(m_presentPipeline.get());
-      cmdbuf->bindDescriptorSets(EPBP_GRAPHICS, m_presentPipeline->getLayout(), 0, 1u, &m_presentDs.get());
-      ext::FullScreenTriangle::recordDrawCall(cmdbuf);
-
-      const auto uiParams = m_ui.manager->getCreationParameters();
-      auto* uiPipeline = m_ui.manager->getPipeline();
-      cmdbuf->bindGraphicsPipeline(uiPipeline);
-      cmdbuf->bindDescriptorSets(EPBP_GRAPHICS, uiPipeline->getLayout(), uiParams.resources.texturesInfo.setIx, 1u, &m_ui.descriptorSet.get());
-      m_ui.manager->render(cmdbuf, waitInfo);
-
-      cmdbuf->endRenderPass();
-
-    }
-
-    cmdbuf->endDebugMarker();
-    cmdbuf->end();
-
-    {
-      const IQueue::SSubmitInfo::SSemaphoreInfo rendered[] =
-      {
-        {
-          .semaphore = m_semaphore.get(),
-          .value = ++m_realFrameIx,
-          .stageMask = PIPELINE_STAGE_FLAGS::ALL_TRANSFER_BITS
-        }
-      };
-      {
-        {
-          const IQueue::SSubmitInfo::SCommandBufferInfo commandBuffers[] =
-          {
-            {.cmdbuf = cmdbuf }
-          };
-
-          const IQueue::SSubmitInfo::SSemaphoreInfo acquired[] =
-          {
-            {
-              .semaphore = m_currentImageAcquire.semaphore,
-              .value = m_currentImageAcquire.acquireCount,
-              .stageMask = PIPELINE_STAGE_FLAGS::NONE
-            }
-          };
-          const IQueue::SSubmitInfo infos[] =
-          {
-            {
-              .waitSemaphores = acquired,
-              .commandBuffers = commandBuffers,
-              .signalSemaphores = rendered
-            }
-          };
-
-          updateGUIDescriptorSet();
-
-          if (queue->submit(infos) != IQueue::RESULT::SUCCESS)
-            m_realFrameIx--;
-        }
-      }
-
-      m_window->setCaption("[Nabla Engine] Ray Tracing Pipeline");
-      m_surface->present(m_currentImageAcquire.imageIndex, rendered);
-    }
-    m_api->endCapture();
-    m_frameAccumulationCounter++;
-  }
-
-  inline void update()
-  {
-    m_camera.setMoveSpeed(m_cameraSetting.moveSpeed);
-    m_camera.setRotateSpeed(m_cameraSetting.rotateSpeed);
-
-    static std::chrono::microseconds previousEventTimestamp{};
-
-    m_inputSystem->getDefaultMouse(&m_mouse);
-    m_inputSystem->getDefaultKeyboard(&m_keyboard);
-
-    auto updatePresentationTimestamp = [&]()
-      {
-        m_currentImageAcquire = m_surface->acquireNextImage();
-
-        m_oracle.reportEndFrameRecord();
-        const auto timestamp = m_oracle.getNextPresentationTimeStamp();
-        m_oracle.reportBeginFrameRecord();
-
-        return timestamp;
-      };
-
-    const auto nextPresentationTimestamp = updatePresentationTimestamp();
-
-    struct
-    {
-      std::vector<SMouseEvent> mouse{};
-      std::vector<SKeyboardEvent> keyboard{};
-    } capturedEvents;
-
-    m_camera.beginInputProcessing(nextPresentationTimestamp);
-    {
-      m_mouse.consumeEvents([&](const IMouseEventChannel::range_t& events) -> void
-        {
-          m_camera.mouseProcess(events); // don't capture the events, only let camera handle them with its impl
-
-          for (const auto& e : events) // here capture
-          {
-            if (e.timeStamp < previousEventTimestamp)
-              continue;
-
-            previousEventTimestamp = e.timeStamp;
-            capturedEvents.mouse.emplace_back(e);
-
-          }
-        }, m_logger.get());
-
-      m_keyboard.consumeEvents([&](const IKeyboardEventChannel::range_t& events) -> void
-        {
-          m_camera.keyboardProcess(events); // don't capture the events, only let camera handle them with its impl
-
-          for (const auto& e : events) // here capture
-          {
-            if (e.timeStamp < previousEventTimestamp)
-              continue;
-
-            previousEventTimestamp = e.timeStamp;
-            capturedEvents.keyboard.emplace_back(e);
-          }
-        }, m_logger.get());
+	  auto scRes = static_cast<CDefaultSwapchainFramebuffers*>(m_surface->getSwapchainResources());
+	  const VkRect2D currentRenderArea =
+	  {
+		.offset = {0,0},
+		.extent = {m_window->getWidth(),m_window->getHeight()}
+	  };
+	  const IGPUCommandBuffer::SClearColorValue clearColor = { .float32 = {0.f,0.f,0.f,1.f} };
+	  const IGPUCommandBuffer::SRenderpassBeginInfo info =
+	  {
+		.framebuffer = scRes->getFramebuffer(m_currentImageAcquire.imageIndex),
+		.colorClearValues = &clearColor,
+		.depthStencilClearValues = nullptr,
+		.renderArea = currentRenderArea
+	  };
+	  nbl::video::ISemaphore::SWaitInfo waitInfo = { .semaphore = m_semaphore.get(), .value = m_realFrameIx + 1u };
+
+	  cmdbuf->beginRenderPass(info, IGPUCommandBuffer::SUBPASS_CONTENTS::INLINE);
+
+	  cmdbuf->bindGraphicsPipeline(m_presentPipeline.get());
+	  cmdbuf->bindDescriptorSets(EPBP_GRAPHICS, m_presentPipeline->getLayout(), 0, 1u, &m_presentDs.get());
+	  ext::FullScreenTriangle::recordDrawCall(cmdbuf);
+
+	  const auto uiParams = m_ui.manager->getCreationParameters();
+	  auto* uiPipeline = m_ui.manager->getPipeline();
+	  cmdbuf->bindGraphicsPipeline(uiPipeline);
+	  cmdbuf->bindDescriptorSets(EPBP_GRAPHICS, uiPipeline->getLayout(), uiParams.resources.texturesInfo.setIx, 1u, &m_ui.descriptorSet.get());
+	  m_ui.manager->render(cmdbuf, waitInfo);
+
+	  cmdbuf->endRenderPass();
+
+	}
+
+	cmdbuf->endDebugMarker();
+	cmdbuf->end();
+
+	{
+	  const IQueue::SSubmitInfo::SSemaphoreInfo rendered[] =
+	  {
+		{
+		  .semaphore = m_semaphore.get(),
+		  .value = ++m_realFrameIx,
+		  .stageMask = PIPELINE_STAGE_FLAGS::ALL_TRANSFER_BITS
+		}
+	  };
+	  {
+		{
+		  const IQueue::SSubmitInfo::SCommandBufferInfo commandBuffers[] =
+		  {
+			{.cmdbuf = cmdbuf }
+		  };
+
+		  const IQueue::SSubmitInfo::SSemaphoreInfo acquired[] =
+		  {
+			{
+			  .semaphore = m_currentImageAcquire.semaphore,
+			  .value = m_currentImageAcquire.acquireCount,
+			  .stageMask = PIPELINE_STAGE_FLAGS::NONE
+			}
+		  };
+		  const IQueue::SSubmitInfo infos[] =
+		  {
+			{
+			  .waitSemaphores = acquired,
+			  .commandBuffers = commandBuffers,
+			  .signalSemaphores = rendered
+			}
+		  };
 
-    }
-    m_camera.endInputProcessing(nextPresentationTimestamp);
+		  updateGUIDescriptorSet();
 
-    const core::SRange<const nbl::ui::SMouseEvent> mouseEvents(capturedEvents.mouse.data(), capturedEvents.mouse.data() + capturedEvents.mouse.size());
-    const core::SRange<const nbl::ui::SKeyboardEvent> keyboardEvents(capturedEvents.keyboard.data(), capturedEvents.keyboard.data() + capturedEvents.keyboard.size());
-    const auto cursorPosition = m_window->getCursorControl()->getPosition();
-    const auto mousePosition = float32_t2(cursorPosition.x, cursorPosition.y) - float32_t2(m_window->getX(), m_window->getY());
+		  if (queue->submit(infos) != IQueue::RESULT::SUCCESS)
+			m_realFrameIx--;
+		}
+	  }
 
-    const ext::imgui::UI::SUpdateParameters params =
-    {
-      .mousePosition = mousePosition,
-      .displaySize = { m_window->getWidth(), m_window->getHeight() },
-      .mouseEvents = mouseEvents,
-      .keyboardEvents = keyboardEvents
-    };
+	  m_window->setCaption("[Nabla Engine] Ray Tracing Pipeline");
+	  m_surface->present(m_currentImageAcquire.imageIndex, rendered);
+	}
+	m_api->endCapture();
+	m_frameAccumulationCounter++;
+  }
 
-    m_ui.manager->update(params);
+  inline void update()
+  {
+	m_camera.setMoveSpeed(m_cameraSetting.moveSpeed);
+	m_camera.setRotateSpeed(m_cameraSetting.rotateSpeed);
+
+	static std::chrono::microseconds previousEventTimestamp{};
+
+	m_inputSystem->getDefaultMouse(&m_mouse);
+	m_inputSystem->getDefaultKeyboard(&m_keyboard);
+
+	auto updatePresentationTimestamp = [&]()
+	  {
+		m_currentImageAcquire = m_surface->acquireNextImage();
+
+		m_oracle.reportEndFrameRecord();
+		const auto timestamp = m_oracle.getNextPresentationTimeStamp();
+		m_oracle.reportBeginFrameRecord();
+
+		return timestamp;
+	  };
+
+	const auto nextPresentationTimestamp = updatePresentationTimestamp();
+
+	struct
+	{
+	  std::vector<SMouseEvent> mouse{};
+	  std::vector<SKeyboardEvent> keyboard{};
+	} capturedEvents;
+
+	m_camera.beginInputProcessing(nextPresentationTimestamp);
+	{
+	  const auto& io = ImGui::GetIO();
+	  m_mouse.consumeEvents([&](const IMouseEventChannel::range_t& events) -> void
+		{
+		  if (!io.WantCaptureMouse)
+			m_camera.mouseProcess(events); // don't capture the events, only let camera handle them with its impl
+
+		  for (const auto& e : events) // here capture
+		  {
+			if (e.timeStamp < previousEventTimestamp)
+			  continue;
+
+			previousEventTimestamp = e.timeStamp;
+			capturedEvents.mouse.emplace_back(e);
+
+		  }
+		}, m_logger.get());
+
+	  m_keyboard.consumeEvents([&](const IKeyboardEventChannel::range_t& events) -> void
+		{
+		  if (!io.WantCaptureKeyboard)
+			  m_camera.keyboardProcess(events); // don't capture the events, only let camera handle them with its impl
+
+		  for (const auto& e : events) // here capture
+		  {
+			if (e.timeStamp < previousEventTimestamp)
+			  continue;
+
+			previousEventTimestamp = e.timeStamp;
+			capturedEvents.keyboard.emplace_back(e);
+		  }
+		}, m_logger.get());
+
+	}
+	m_camera.endInputProcessing(nextPresentationTimestamp);
+
+	const core::SRange<const nbl::ui::SMouseEvent> mouseEvents(capturedEvents.mouse.data(), capturedEvents.mouse.data() + capturedEvents.mouse.size());
+	const core::SRange<const nbl::ui::SKeyboardEvent> keyboardEvents(capturedEvents.keyboard.data(), capturedEvents.keyboard.data() + capturedEvents.keyboard.size());
+	const auto cursorPosition = m_window->getCursorControl()->getPosition();
+	const auto mousePosition = float32_t2(cursorPosition.x, cursorPosition.y) - float32_t2(m_window->getX(), m_window->getY());
+
+	const ext::imgui::UI::SUpdateParameters params =
+	{
+	  .mousePosition = mousePosition,
+	  .displaySize = { m_window->getWidth(), m_window->getHeight() },
+	  .mouseEvents = mouseEvents,
+	  .keyboardEvents = keyboardEvents
+	};
+
+	m_ui.manager->update(params);
   }
 
   inline bool keepRunning() override
   {
-    if (m_surface->irrecoverable())
-      return false;
+	if (m_surface->irrecoverable())
+	  return false;
 
-    return true;
+	return true;
   }
 
   inline bool onAppTerminated() override
   {
-    return device_base_t::onAppTerminated();
+	return device_base_t::onAppTerminated();
   }
 
 private:
   uint32_t getWorkgroupCount(uint32_t dim, uint32_t size)
   {
-    return (dim + size - 1) / size;
+	return (dim + size - 1) / size;
   }
 
   smart_refctd_ptr<IGPUBuffer> createBuffer(IGPUBuffer::SCreationParams& params)
   {
-    smart_refctd_ptr<IGPUBuffer> buffer;
-    buffer = m_device->createBuffer(std::move(params));
-    auto bufReqs = buffer->getMemoryReqs();
-    bufReqs.memoryTypeBits &= m_physicalDevice->getDeviceLocalMemoryTypeBits();
-    m_device->allocate(bufReqs, buffer.get(), IDeviceMemoryAllocation::EMAF_DEVICE_ADDRESS_BIT);
+	smart_refctd_ptr<IGPUBuffer> buffer;
+	buffer = m_device->createBuffer(std::move(params));
+	auto bufReqs = buffer->getMemoryReqs();
+	bufReqs.memoryTypeBits &= m_physicalDevice->getDeviceLocalMemoryTypeBits();
+	m_device->allocate(bufReqs, buffer.get(), IDeviceMemoryAllocation::EMAF_DEVICE_ADDRESS_BIT);
 
-    return buffer;
+	return buffer;
   }
 
   smart_refctd_ptr<IGPUCommandBuffer> getSingleUseCommandBufferAndBegin(smart_refctd_ptr<IGPUCommandPool> pool)
   {
-    smart_refctd_ptr<IGPUCommandBuffer> cmdbuf;
-    if (!pool->createCommandBuffers(IGPUCommandPool::BUFFER_LEVEL::PRIMARY, 1u, &cmdbuf))
-      return nullptr;
+	smart_refctd_ptr<IGPUCommandBuffer> cmdbuf;
+	if (!pool->createCommandBuffers(IGPUCommandPool::BUFFER_LEVEL::PRIMARY, 1u, &cmdbuf))
+	  return nullptr;
 
-    cmdbuf->reset(IGPUCommandBuffer::RESET_FLAGS::RELEASE_RESOURCES_BIT);
-    cmdbuf->begin(IGPUCommandBuffer::USAGE::ONE_TIME_SUBMIT_BIT);
+	cmdbuf->reset(IGPUCommandBuffer::RESET_FLAGS::RELEASE_RESOURCES_BIT);
+	cmdbuf->begin(IGPUCommandBuffer::USAGE::ONE_TIME_SUBMIT_BIT);
 
-    return cmdbuf;
+	return cmdbuf;
   }
 
   void cmdbufSubmitAndWait(smart_refctd_ptr<IGPUCommandBuffer> cmdbuf, CThreadSafeQueueAdapter* queue, uint64_t startValue)
   {
-    cmdbuf->end();
-
-    uint64_t finishedValue = startValue + 1;
-
-    // submit builds
-    {
-      auto completed = m_device->createSemaphore(startValue);
-
-      std::array<IQueue::SSubmitInfo::SSemaphoreInfo, 1u> signals;
-      {
-        auto& signal = signals.front();
-        signal.value = finishedValue;
-        signal.stageMask = bitflag(PIPELINE_STAGE_FLAGS::ALL_TRANSFER_BITS);
-        signal.semaphore = completed.get();
-      }
-
-      const IQueue::SSubmitInfo::SCommandBufferInfo commandBuffers[1] = { {
-        .cmdbuf = cmdbuf.get()
-      } };
-
-      const IQueue::SSubmitInfo infos[] =
-      {
-        {
-          .waitSemaphores = {},
-          .commandBuffers = commandBuffers,
-          .signalSemaphores = signals
-        }
-      };
-
-      if (queue->submit(infos) != IQueue::RESULT::SUCCESS)
-      {
-        m_logger->log("Failed to submit geometry transfer upload operations!", ILogger::ELL_ERROR);
-        return;
-      }
-
-      const ISemaphore::SWaitInfo info[] =
-      { {
-        .semaphore = completed.get(),
-        .value = finishedValue
-      } };
-
-      m_device->blockForSemaphores(info);
-    }
+	cmdbuf->end();
+
+	uint64_t finishedValue = startValue + 1;
+
+	// submit builds
+	{
+	  auto completed = m_device->createSemaphore(startValue);
+
+	  std::array<IQueue::SSubmitInfo::SSemaphoreInfo, 1u> signals;
+	  {
+		auto& signal = signals.front();
+		signal.value = finishedValue;
+		signal.stageMask = bitflag(PIPELINE_STAGE_FLAGS::ALL_TRANSFER_BITS);
+		signal.semaphore = completed.get();
+	  }
+
+	  const IQueue::SSubmitInfo::SCommandBufferInfo commandBuffers[1] = { {
+		.cmdbuf = cmdbuf.get()
+	  } };
+
+	  const IQueue::SSubmitInfo infos[] =
+	  {
+		{
+		  .waitSemaphores = {},
+		  .commandBuffers = commandBuffers,
+		  .signalSemaphores = signals
+		}
+	  };
+
+	  if (queue->submit(infos) != IQueue::RESULT::SUCCESS)
+	  {
+		m_logger->log("Failed to submit geometry transfer upload operations!", ILogger::ELL_ERROR);
+		return;
+	  }
+
+	  const ISemaphore::SWaitInfo info[] =
+	  { {
+		.semaphore = completed.get(),
+		.value = finishedValue
+	  } };
+
+	  m_device->blockForSemaphores(info);
+	}
   }
 
   bool createIndirectBuffer(video::CThreadSafeQueueAdapter* queue)
   {
-    const auto getBufferRangeAddress = [](const SBufferRange<IGPUBuffer>& range)
-      {
-        return range.buffer->getDeviceAddress() + range.offset;
-      };
-    const auto command = TraceRaysIndirectCommand_t{
-      .raygenShaderRecordAddress = getBufferRangeAddress(m_shaderBindingTable.raygenGroupRange),
-      .raygenShaderRecordSize = m_shaderBindingTable.raygenGroupRange.size,
-      .missShaderBindingTableAddress = getBufferRangeAddress(m_shaderBindingTable.missGroupsRange),
-      .missShaderBindingTableSize = m_shaderBindingTable.missGroupsRange.size,
-      .missShaderBindingTableStride = m_shaderBindingTable.missGroupsStride,
-      .hitShaderBindingTableAddress = getBufferRangeAddress(m_shaderBindingTable.hitGroupsRange),
-      .hitShaderBindingTableSize = m_shaderBindingTable.hitGroupsRange.size,
-      .hitShaderBindingTableStride = m_shaderBindingTable.hitGroupsStride,
-      .callableShaderBindingTableAddress = getBufferRangeAddress(m_shaderBindingTable.callableGroupsRange),
-      .callableShaderBindingTableSize = m_shaderBindingTable.callableGroupsRange.size,
-      .callableShaderBindingTableStride = m_shaderBindingTable.callableGroupsStride,
-      .width = WIN_W,
-      .height = WIN_H,
-      .depth = 1,
-    };
-    IGPUBuffer::SCreationParams params;
-    params.usage = IGPUBuffer::EUF_TRANSFER_DST_BIT | IGPUBuffer::EUF_INDIRECT_BUFFER_BIT | IGPUBuffer::EUF_SHADER_DEVICE_ADDRESS_BIT;
-    params.size = sizeof(TraceRaysIndirectCommand_t);
-    m_utils->createFilledDeviceLocalBufferOnDedMem(SIntendedSubmitInfo{ .queue = queue }, std::move(params), &command).move_into(m_indirectBuffer);
-    return true;
+	const auto getBufferRangeAddress = [](const SBufferRange<IGPUBuffer>& range)
+	  {
+		return range.buffer->getDeviceAddress() + range.offset;
+	  };
+	const auto command = TraceRaysIndirectCommand_t{
+	  .raygenShaderRecordAddress = getBufferRangeAddress(m_shaderBindingTable.raygenGroupRange),
+	  .raygenShaderRecordSize = m_shaderBindingTable.raygenGroupRange.size,
+	  .missShaderBindingTableAddress = getBufferRangeAddress(m_shaderBindingTable.missGroupsRange),
+	  .missShaderBindingTableSize = m_shaderBindingTable.missGroupsRange.size,
+	  .missShaderBindingTableStride = m_shaderBindingTable.missGroupsStride,
+	  .hitShaderBindingTableAddress = getBufferRangeAddress(m_shaderBindingTable.hitGroupsRange),
+	  .hitShaderBindingTableSize = m_shaderBindingTable.hitGroupsRange.size,
+	  .hitShaderBindingTableStride = m_shaderBindingTable.hitGroupsStride,
+	  .callableShaderBindingTableAddress = getBufferRangeAddress(m_shaderBindingTable.callableGroupsRange),
+	  .callableShaderBindingTableSize = m_shaderBindingTable.callableGroupsRange.size,
+	  .callableShaderBindingTableStride = m_shaderBindingTable.callableGroupsStride,
+	  .width = WIN_W,
+	  .height = WIN_H,
+	  .depth = 1,
+	};
+	IGPUBuffer::SCreationParams params;
+	params.usage = IGPUBuffer::EUF_TRANSFER_DST_BIT | IGPUBuffer::EUF_INDIRECT_BUFFER_BIT | IGPUBuffer::EUF_SHADER_DEVICE_ADDRESS_BIT;
+	params.size = sizeof(TraceRaysIndirectCommand_t);
+	m_utils->createFilledDeviceLocalBufferOnDedMem(SIntendedSubmitInfo{ .queue = queue }, std::move(params), &command).move_into(m_indirectBuffer);
+	return true;
   }
 
-  bool createGeometries(video::CThreadSafeQueueAdapter* queue, const IGeometryCreator* gc)
+  void calculateRayTracingStackSize(const smart_refctd_ptr<video::IGPURayTracingPipeline>& pipeline)
   {
-    auto pool = m_device->createCommandPool(queue->getFamilyIndex(), IGPUCommandPool::CREATE_FLAGS::RESET_COMMAND_BUFFER_BIT);
-    if (!pool)
-      return logFail("Couldn't create Command Pool for geometry creation!");
-
-    const auto defaultMaterial = Material{
-      .ambient = {0.2, 0.1, 0.1},
-      .diffuse = {0.8, 0.3, 0.3},
-      .specular = {0.8, 0.8, 0.8},
-      .shininess = 1.0f,
-      .alpha = 1.0f,
-    };
-
-    auto getTranslationMatrix = [](float32_t x, float32_t y, float32_t z)
-      {
-        core::matrix3x4SIMD transform;
-        transform.setTranslation(nbl::core::vectorSIMDf(x, y, z, 0));
-        return transform;
-      };
-
-    core::matrix3x4SIMD planeTransform;
-    planeTransform.setRotation(quaternion::fromAngleAxis(core::radians(-90.0f), vector3df_SIMD{ 1, 0, 0 }));
-
-    const auto cpuObjects = std::array{
-      ReferenceObjectCpu {
-        .meta = {.type = OT_RECTANGLE, .name = "Plane Mesh"},
-        .data = gc->createRectangleMesh(nbl::core::vector2df_SIMD(10, 10)),
-        .material = defaultMaterial,
-        .transform = planeTransform,
-      },
-      ReferenceObjectCpu {
-        .meta = {.type = OT_CUBE, .name = "Cube Mesh"},
-        .data = gc->createCubeMesh(nbl::core::vector3df(1, 1, 1)),
-        .material = defaultMaterial,
-        .transform = getTranslationMatrix(0, 0.5f, 0),
-      },
-      ReferenceObjectCpu {
-        .meta = {.type = OT_CUBE, .name = "Cube Mesh 2"},
-        .data = gc->createCubeMesh(nbl::core::vector3df(1.5, 1.5, 1.5)),
-        .material = Material{
-          .ambient = {0.1, 0.1, 0.2},
-          .diffuse = {0.2, 0.2, 0.8},
-          .specular = {0.8, 0.8, 0.8},
-          .shininess = 1.0f,
-        },
-        .transform = getTranslationMatrix(-5.0f, 1.0f, 0),
-      },
-      ReferenceObjectCpu {
-        .meta = {.type = OT_CUBE, .name = "Transparent Cube Mesh"},
-        .data = gc->createCubeMesh(nbl::core::vector3df(1.5, 1.5, 1.5)),
-        .material = Material{
-          .ambient = {0.1, 0.2, 0.1},
-          .diffuse = {0.2, 0.8, 0.2},
-          .specular = {0.8, 0.8, 0.8},
-          .shininess = 1.0f,
-          .alpha = 0.2,
-        },
-        .transform = getTranslationMatrix(5.0f, 1.0f, 0),
-      },
-    };
-
-    struct ScratchVIBindings
-    {
-      nbl::asset::SBufferBinding<ICPUBuffer> vertex, index;
-    };
-    std::array<ScratchVIBindings, std::size(cpuObjects)> scratchBuffers;
-
-    for (uint32_t i = 0; i < cpuObjects.size(); i++)
-    {
-      const auto& cpuObject = cpuObjects[i];
-
-      auto vBuffer = smart_refctd_ptr(cpuObject.data.bindings[0].buffer); // no offset
-      auto vUsage = bitflag(IGPUBuffer::EUF_STORAGE_BUFFER_BIT) | IGPUBuffer::EUF_TRANSFER_DST_BIT | IGPUBuffer::EUF_INLINE_UPDATE_VIA_CMDBUF |
-        IGPUBuffer::EUF_ACCELERATION_STRUCTURE_BUILD_INPUT_READ_ONLY_BIT | IGPUBuffer::EUF_SHADER_DEVICE_ADDRESS_BIT;
-      vBuffer->addUsageFlags(vUsage);
-      vBuffer->setContentHash(vBuffer->computeContentHash());
-
-      auto iBuffer = smart_refctd_ptr(cpuObject.data.indexBuffer.buffer); // no offset
-      auto iUsage = bitflag(IGPUBuffer::EUF_STORAGE_BUFFER_BIT) | IGPUBuffer::EUF_TRANSFER_DST_BIT | IGPUBuffer::EUF_INLINE_UPDATE_VIA_CMDBUF |
-        IGPUBuffer::EUF_ACCELERATION_STRUCTURE_BUILD_INPUT_READ_ONLY_BIT | IGPUBuffer::EUF_SHADER_DEVICE_ADDRESS_BIT;
-
-      if (cpuObject.data.indexType != EIT_UNKNOWN)
-        if (iBuffer)
-        {
-          iBuffer->addUsageFlags(iUsage);
-          iBuffer->setContentHash(iBuffer->computeContentHash());
-        }
-
-      scratchBuffers[i] = {
-        .vertex = {.offset = 0, .buffer = vBuffer},
-        .index = {.offset = 0, .buffer = iBuffer},
-      };
-
-    }
-
-    auto cmdbuf = getSingleUseCommandBufferAndBegin(pool);
-    cmdbuf->beginDebugMarker("Build geometry vertex and index buffers");
-
-    CAssetConverter::SInputs inputs = {};
-    inputs.logger = m_logger.get();
-    std::array<ICPUBuffer*, std::size(cpuObjects) * 2u> tmpBuffers;
-    {
-      for (uint32_t i = 0; i < cpuObjects.size(); i++)
-      {
-        tmpBuffers[2 * i + 0] = scratchBuffers[i].vertex.buffer.get();
-        tmpBuffers[2 * i + 1] = scratchBuffers[i].index.buffer.get();
-      }
-
-      std::get<CAssetConverter::SInputs::asset_span_t<ICPUBuffer>>(inputs.assets) = tmpBuffers;
-    }
-
-    auto reservation = m_converter->reserve(inputs);
-    {
-      auto prepass = [&]<typename asset_type_t>(const auto & references) -> bool
-      {
-        auto objects = reservation.getGPUObjects<asset_type_t>();
-        uint32_t counter = {};
-        for (auto& object : objects)
-        {
-          auto gpu = object.value;
-          auto* reference = references[counter];
-
-          if (reference)
-          {
-            if (!gpu)
-            {
-              m_logger->log("Failed to convert a CPU object to GPU!", ILogger::ELL_ERROR);
-              return false;
-            }
-          }
-          counter++;
-        }
-        return true;
-      };
-
-      prepass.template operator() < ICPUBuffer > (tmpBuffers);
-    }
-
-    auto geomInfoBuffer = ICPUBuffer::create({ std::size(cpuObjects) * sizeof(STriangleGeomInfo) });
-    STriangleGeomInfo* geomInfos = reinterpret_cast<STriangleGeomInfo*>(geomInfoBuffer->getPointer());
-
-    m_gpuTriangleGeometries.reserve(std::size(cpuObjects));
-    // convert
-    {
-      // not sure if need this (probably not, originally for transition img view)
-      auto semaphore = m_device->createSemaphore(0u);
-
-      std::array<IQueue::SSubmitInfo::SCommandBufferInfo, 1> cmdbufs = {};
-      cmdbufs.front().cmdbuf = cmdbuf.get();
-
-      SIntendedSubmitInfo transfer = {};
-      transfer.queue = queue;
-      transfer.scratchCommandBuffers = cmdbufs;
-      transfer.scratchSemaphore = {
-        .semaphore = semaphore.get(),
-        .value = 0u,
-        .stageMask = PIPELINE_STAGE_FLAGS::ALL_TRANSFER_BITS
-      };
-
-      CAssetConverter::SConvertParams params = {};
-      params.utilities = m_utils.get();
-      params.transfer = &transfer;
-
-      auto future = reservation.convert(params);
-      if (future.copy() != IQueue::RESULT::SUCCESS)
-      {
-        m_logger->log("Failed to await submission feature!", ILogger::ELL_ERROR);
-        return false;
-      }
-
-      auto&& buffers = reservation.getGPUObjects<ICPUBuffer>();
-      for (uint32_t i = 0; i < cpuObjects.size(); i++)
-      {
-        auto& cpuObject = cpuObjects[i];
-
-        m_gpuTriangleGeometries.push_back(ReferenceObjectGpu{
-          .meta = cpuObject.meta,
-          .bindings = {
-            .vertex = {.offset = 0, .buffer = buffers[2 * i + 0].value },
-            .index = {.offset = 0, .buffer = buffers[2 * i + 1].value },
-          },
-          .vertexStride = cpuObject.data.inputParams.bindings[0].stride,
-          .indexType = cpuObject.data.indexType,
-          .indexCount = cpuObject.data.indexCount,
-          .material = hlsl::_static_cast<MaterialPacked>(cpuObject.material),
-          .transform = cpuObject.transform,
-          });
-      }
-
-      for (uint32_t i = 0; i < m_gpuTriangleGeometries.size(); i++)
-      {
-        const auto& gpuObject = m_gpuTriangleGeometries[i];
-        const uint64_t vertexBufferAddress = gpuObject.bindings.vertex.buffer->getDeviceAddress();
-        geomInfos[i] = {
-          .material = gpuObject.material,
-          .vertexBufferAddress = vertexBufferAddress,
-          .indexBufferAddress = gpuObject.useIndex() ? gpuObject.bindings.index.buffer->getDeviceAddress() : vertexBufferAddress,
-          .vertexStride = gpuObject.vertexStride,
-          .objType = gpuObject.meta.type,
-          .indexType = gpuObject.indexType,
-          .smoothNormals = s_smoothNormals[gpuObject.meta.type],
-        };
-      }
-    }
-
-    {
-      IGPUBuffer::SCreationParams params;
-      params.usage = IGPUBuffer::EUF_STORAGE_BUFFER_BIT | IGPUBuffer::EUF_TRANSFER_DST_BIT | IGPUBuffer::EUF_INLINE_UPDATE_VIA_CMDBUF | IGPUBuffer::EUF_SHADER_DEVICE_ADDRESS_BIT;
-      params.size = geomInfoBuffer->getSize();
-      m_utils->createFilledDeviceLocalBufferOnDedMem(SIntendedSubmitInfo{ .queue = queue }, std::move(params), geomInfos).move_into(m_triangleGeomInfoBuffer);
-    }
-
-    // intersection geometries setup
-    {
-      core::vector<SProceduralGeomInfo> proceduralGeoms;
-      proceduralGeoms.reserve(NumberOfProceduralGeometries);
-      using Aabb = IGPUBottomLevelAccelerationStructure::AABB_t;
-      core::vector<Aabb> aabbs;
-      aabbs.reserve(NumberOfProceduralGeometries);
-      for (int32_t i = 0; i < NumberOfProceduralGeometries; i++)
-      {
-        const auto middle_i = NumberOfProceduralGeometries / 2.0;
-        SProceduralGeomInfo sphere = {
-          .material = hlsl::_static_cast<MaterialPacked>(Material{
-            .ambient = {0.1, 0.05 * i, 0.1},
-            .diffuse = {0.3, 0.2 * i, 0.3},
-            .specular = {0.8, 0.8, 0.8},
-            .shininess = 1.0f,
-          }),
-          .center = float32_t3((i - middle_i) * 4.0, 2, 5.0),
-          .radius = 1,
-        };
-
-        proceduralGeoms.push_back(sphere);
-        const auto sphereMin = sphere.center - sphere.radius;
-        const auto sphereMax = sphere.center + sphere.radius;
-        aabbs.emplace_back(
-          vector3d(sphereMin.x, sphereMin.y, sphereMin.z), 
-          vector3d(sphereMax.x, sphereMax.y, sphereMax.z));
-      }
-
-      {
-        IGPUBuffer::SCreationParams params;
-        params.usage = IGPUBuffer::EUF_STORAGE_BUFFER_BIT | IGPUBuffer::EUF_TRANSFER_DST_BIT | IGPUBuffer::EUF_INLINE_UPDATE_VIA_CMDBUF | IGPUBuffer::EUF_SHADER_DEVICE_ADDRESS_BIT;
-        params.size = proceduralGeoms.size() * sizeof(SProceduralGeomInfo);
-        m_utils->createFilledDeviceLocalBufferOnDedMem(SIntendedSubmitInfo{ .queue = queue }, std::move(params), proceduralGeoms.data()).move_into(m_proceduralGeomInfoBuffer);
-      }
-
-      {
-        IGPUBuffer::SCreationParams params;
-        params.usage = IGPUBuffer::EUF_TRANSFER_DST_BIT | IGPUBuffer::EUF_SHADER_DEVICE_ADDRESS_BIT | IGPUBuffer::EUF_ACCELERATION_STRUCTURE_BUILD_INPUT_READ_ONLY_BIT;
-        params.size = aabbs.size() * sizeof(Aabb);
-        m_utils->createFilledDeviceLocalBufferOnDedMem(SIntendedSubmitInfo{ .queue = queue }, std::move(params), aabbs.data()).move_into(m_proceduralAabbBuffer);
-      }
-    }
-
-    return true;
+	const auto raygenStackSize = pipeline->getRaygenStackSize();
+	auto getMaxSize = [&](auto ranges, auto valProj) -> uint16_t
+	  {
+		auto maxValue = 0;
+		for (const auto& val : ranges)
+		{
+		  maxValue = std::max<uint16_t>(maxValue, std::invoke(valProj, val));
+		}
+		return maxValue;
+	  };
+
+	const auto closestHitStackMax = getMaxSize(pipeline->getHitStackSizes(), &IGPURayTracingPipeline::SHitGroupStackSize::closestHit);
+	const auto anyHitStackMax = getMaxSize(pipeline->getHitStackSizes(), &IGPURayTracingPipeline::SHitGroupStackSize::anyHit);
+	const auto intersectionStackMax = getMaxSize(pipeline->getHitStackSizes(), &IGPURayTracingPipeline::SHitGroupStackSize::intersection);
+	const auto missStackMax = getMaxSize(pipeline->getMissStackSizes(), std::identity{});
+	const auto callableStackMax = getMaxSize(pipeline->getCallableStackSizes(), std::identity{});
+	auto firstDepthStackSizeMax = std::max(closestHitStackMax, missStackMax);
+	firstDepthStackSizeMax = std::max<uint16_t>(firstDepthStackSizeMax, intersectionStackMax + anyHitStackMax);
+	m_rayTracingStackSize = raygenStackSize + std::max(firstDepthStackSizeMax, callableStackMax);
   }
 
-  void calculateRayTracingStackSize(const smart_refctd_ptr<video::IGPURayTracingPipeline>& pipeline)
+  bool createShaderBindingTable(video::CThreadSafeQueueAdapter* queue, const smart_refctd_ptr<video::IGPURayTracingPipeline>& pipeline)
   {
-    const auto raygenStackSize = pipeline->getRaygenStackSize();
-    auto getMaxSize = [&](auto ranges, auto valProj) -> uint16_t
-      {
-        auto maxValue = 0;
-        for (const auto& val : ranges)
-        {
-          maxValue = std::max<uint16_t>(maxValue, std::invoke(valProj, val));
-        }
-        return maxValue;
-      };
-
-    const auto closestHitStackMax = getMaxSize(pipeline->getHitStackSizes(), &IGPURayTracingPipeline::SHitGroupStackSize::closestHit);
-    const auto anyHitStackMax = getMaxSize(pipeline->getHitStackSizes(), &IGPURayTracingPipeline::SHitGroupStackSize::anyHit);
-    const auto intersectionStackMax = getMaxSize(pipeline->getHitStackSizes(), &IGPURayTracingPipeline::SHitGroupStackSize::intersection);
-    const auto missStackMax = getMaxSize(pipeline->getMissStackSizes(), std::identity{});
-    const auto callableStackMax = getMaxSize(pipeline->getCallableStackSizes(), std::identity{});
-    auto firstDepthStackSizeMax = std::max(closestHitStackMax, missStackMax);
-    firstDepthStackSizeMax = std::max<uint16_t>(firstDepthStackSizeMax, intersectionStackMax + anyHitStackMax);
-    m_rayTracingStackSize = raygenStackSize + std::max(firstDepthStackSizeMax, callableStackMax);
+	const auto& limits = m_device->getPhysicalDevice()->getLimits();
+	const auto handleSize = SPhysicalDeviceLimits::ShaderGroupHandleSize;
+	const auto handleSizeAligned = nbl::core::alignUp(handleSize, limits.shaderGroupHandleAlignment);
+
+	auto& raygenRange = m_shaderBindingTable.raygenGroupRange;
+
+	auto& hitRange = m_shaderBindingTable.hitGroupsRange;
+	const auto hitHandles = pipeline->getHitHandles();
+
+	auto& missRange = m_shaderBindingTable.missGroupsRange;
+	const auto missHandles = pipeline->getMissHandles();
+
+	auto& callableRange = m_shaderBindingTable.callableGroupsRange;
+	const auto callableHandles = pipeline->getCallableHandles();
+
+	raygenRange = {
+	  .offset = 0,
+	  .size = core::alignUp(handleSizeAligned, limits.shaderGroupBaseAlignment)
+	};
+
+	missRange = {
+	  .offset = raygenRange.size,
+	  .size = core::alignUp(missHandles.size() * handleSizeAligned, limits.shaderGroupBaseAlignment),
+	};
+	m_shaderBindingTable.missGroupsStride = handleSizeAligned;
+
+	hitRange = {
+	  .offset = missRange.offset + missRange.size,
+	  .size = core::alignUp(hitHandles.size() * handleSizeAligned, limits.shaderGroupBaseAlignment),
+	};
+	m_shaderBindingTable.hitGroupsStride = handleSizeAligned;
+
+	callableRange = {
+	  .offset = hitRange.offset + hitRange.size,
+	  .size = core::alignUp(callableHandles.size() * handleSizeAligned, limits.shaderGroupBaseAlignment),
+	};
+	m_shaderBindingTable.callableGroupsStride = handleSizeAligned;
+
+	const auto bufferSize = raygenRange.size + missRange.size + hitRange.size + callableRange.size;
+
+	ICPUBuffer::SCreationParams cpuBufferParams;
+	cpuBufferParams.size = bufferSize;
+	auto cpuBuffer = ICPUBuffer::create(std::move(cpuBufferParams));
+	uint8_t* pData = reinterpret_cast<uint8_t*>(cpuBuffer->getPointer());
+
+	// copy raygen region
+	memcpy(pData, &pipeline->getRaygen(), handleSize);
+
+	// copy miss region
+	uint8_t* pMissData = pData + missRange.offset;
+	for (const auto& handle : missHandles)
+	{
+	  memcpy(pMissData, &handle, handleSize);
+	  pMissData += m_shaderBindingTable.missGroupsStride;
+	}
+
+	// copy hit region
+	uint8_t* pHitData = pData + hitRange.offset;
+	for (const auto& handle : hitHandles)
+	{
+	  memcpy(pHitData, &handle, handleSize);
+	  pHitData += m_shaderBindingTable.hitGroupsStride;
+	}
+
+	// copy callable region
+	uint8_t* pCallableData = pData + callableRange.offset;
+	for (const auto& handle : callableHandles)
+	{
+	  memcpy(pCallableData, &handle, handleSize);
+	  pCallableData += m_shaderBindingTable.callableGroupsStride;
+	}
+
+	{
+	  IGPUBuffer::SCreationParams params;
+	  params.usage = IGPUBuffer::EUF_TRANSFER_DST_BIT | IGPUBuffer::EUF_INLINE_UPDATE_VIA_CMDBUF | IGPUBuffer::EUF_SHADER_DEVICE_ADDRESS_BIT | IGPUBuffer::EUF_SHADER_BINDING_TABLE_BIT;
+	  params.size = bufferSize;
+	  m_utils->createFilledDeviceLocalBufferOnDedMem(SIntendedSubmitInfo{ .queue = queue }, std::move(params), pData).move_into(raygenRange.buffer);
+	  missRange.buffer = core::smart_refctd_ptr(raygenRange.buffer);
+	  hitRange.buffer = core::smart_refctd_ptr(raygenRange.buffer);
+	  callableRange.buffer = core::smart_refctd_ptr(raygenRange.buffer);
+	}
+
+	return true;
   }
 
-  bool createShaderBindingTable(video::CThreadSafeQueueAdapter* queue, const smart_refctd_ptr<video::IGPURayTracingPipeline>& pipeline)
+#ifdef TEST_ASSET_CONV_AS
+#else
+  bool createGeometries(video::CThreadSafeQueueAdapter* queue, const IGeometryCreator* gc)
   {
-    const auto& limits = m_device->getPhysicalDevice()->getLimits();
-    const auto handleSize = SPhysicalDeviceLimits::ShaderGroupHandleSize;
-    const auto handleSizeAligned = nbl::core::alignUp(handleSize, limits.shaderGroupHandleAlignment);
-
-    auto& raygenRange = m_shaderBindingTable.raygenGroupRange;
-
-    auto& hitRange = m_shaderBindingTable.hitGroupsRange;
-    const auto hitHandles = pipeline->getHitHandles();
-
-    auto& missRange = m_shaderBindingTable.missGroupsRange;
-    const auto missHandles = pipeline->getMissHandles();
-
-    auto& callableRange = m_shaderBindingTable.callableGroupsRange;
-    const auto callableHandles = pipeline->getCallableHandles();
-
-    raygenRange = {
-      .offset = 0,
-      .size = core::alignUp(handleSizeAligned, limits.shaderGroupBaseAlignment)
-    };
-
-    missRange = {
-      .offset = raygenRange.size,
-      .size = core::alignUp(missHandles.size() * handleSizeAligned, limits.shaderGroupBaseAlignment),
-    };
-    m_shaderBindingTable.missGroupsStride = handleSizeAligned;
-
-    hitRange = {
-      .offset = missRange.offset + missRange.size,
-      .size = core::alignUp(hitHandles.size() * handleSizeAligned, limits.shaderGroupBaseAlignment),
-    };
-    m_shaderBindingTable.hitGroupsStride = handleSizeAligned;
-
-    callableRange = {
-      .offset = hitRange.offset + hitRange.size,
-      .size = core::alignUp(callableHandles.size() * handleSizeAligned, limits.shaderGroupBaseAlignment),
-    };
-    m_shaderBindingTable.callableGroupsStride = handleSizeAligned;
-
-    const auto bufferSize = raygenRange.size + missRange.size + hitRange.size + callableRange.size;
-
-    ICPUBuffer::SCreationParams cpuBufferParams;
-    cpuBufferParams.size = bufferSize;
-    auto cpuBuffer = ICPUBuffer::create(std::move(cpuBufferParams));
-    uint8_t* pData = reinterpret_cast<uint8_t*>(cpuBuffer->getPointer());
-
-    // copy raygen region
-    memcpy(pData, &pipeline->getRaygen(), handleSize);
-
-    // copy miss region
-    uint8_t* pMissData = pData + missRange.offset;
-    for (const auto& handle : missHandles)
-    {
-      memcpy(pMissData, &handle, handleSize);
-      pMissData += m_shaderBindingTable.missGroupsStride;
-    }
-
-    // copy hit region
-    uint8_t* pHitData = pData + hitRange.offset;
-    for (const auto& handle : hitHandles)
-    {
-      memcpy(pHitData, &handle, handleSize);
-      pHitData += m_shaderBindingTable.hitGroupsStride;
-    }
-
-    // copy callable region
-    uint8_t* pCallableData = pData + callableRange.offset;
-    for (const auto& handle : callableHandles)
-    {
-      memcpy(pCallableData, &handle, handleSize);
-      pCallableData += m_shaderBindingTable.callableGroupsStride;
-    }
-
-    {
-      IGPUBuffer::SCreationParams params;
-      params.usage = IGPUBuffer::EUF_TRANSFER_DST_BIT | IGPUBuffer::EUF_INLINE_UPDATE_VIA_CMDBUF | IGPUBuffer::EUF_SHADER_DEVICE_ADDRESS_BIT | IGPUBuffer::EUF_SHADER_BINDING_TABLE_BIT;
-      params.size = bufferSize;
-      m_utils->createFilledDeviceLocalBufferOnDedMem(SIntendedSubmitInfo{ .queue = queue }, std::move(params), pData).move_into(raygenRange.buffer);
-      missRange.buffer = core::smart_refctd_ptr(raygenRange.buffer);
-      hitRange.buffer = core::smart_refctd_ptr(raygenRange.buffer);
-      callableRange.buffer = core::smart_refctd_ptr(raygenRange.buffer);
-    }
-
-    return true;
+	  auto pool = m_device->createCommandPool(queue->getFamilyIndex(), IGPUCommandPool::CREATE_FLAGS::RESET_COMMAND_BUFFER_BIT);
+	  if (!pool)
+		  return logFail("Couldn't create Command Pool for geometry creation!");
+
+	  const auto defaultMaterial = Material{
+		.ambient = {0.2, 0.1, 0.1},
+		.diffuse = {0.8, 0.3, 0.3},
+		.specular = {0.8, 0.8, 0.8},
+		.shininess = 1.0f,
+		.alpha = 1.0f,
+	  };
+
+	  auto getTranslationMatrix = [](float32_t x, float32_t y, float32_t z)
+		  {
+			  core::matrix3x4SIMD transform;
+			  transform.setTranslation(nbl::core::vectorSIMDf(x, y, z, 0));
+			  return transform;
+		  };
+
+	  core::matrix3x4SIMD planeTransform;
+	  planeTransform.setRotation(quaternion::fromAngleAxis(core::radians(-90.0f), vector3df_SIMD{ 1, 0, 0 }));
+
+	  const auto cpuObjects = std::array{
+		ReferenceObjectCpu {
+		  .meta = {.type = OT_RECTANGLE, .name = "Plane Mesh"},
+		  .data = gc->createRectangleMesh(nbl::core::vector2df_SIMD(10, 10)),
+		  .material = defaultMaterial,
+		  .transform = planeTransform,
+		},
+		ReferenceObjectCpu {
+		  .meta = {.type = OT_CUBE, .name = "Cube Mesh"},
+		  .data = gc->createCubeMesh(nbl::core::vector3df(1, 1, 1)),
+		  .material = defaultMaterial,
+		  .transform = getTranslationMatrix(0, 0.5f, 0),
+		},
+		ReferenceObjectCpu {
+		  .meta = {.type = OT_CUBE, .name = "Cube Mesh 2"},
+		  .data = gc->createCubeMesh(nbl::core::vector3df(1.5, 1.5, 1.5)),
+		  .material = Material{
+			.ambient = {0.1, 0.1, 0.2},
+			.diffuse = {0.2, 0.2, 0.8},
+			.specular = {0.8, 0.8, 0.8},
+			.shininess = 1.0f,
+		  },
+		  .transform = getTranslationMatrix(-5.0f, 1.0f, 0),
+		},
+		ReferenceObjectCpu {
+		  .meta = {.type = OT_CUBE, .name = "Transparent Cube Mesh"},
+		  .data = gc->createCubeMesh(nbl::core::vector3df(1.5, 1.5, 1.5)),
+		  .material = Material{
+			.ambient = {0.1, 0.2, 0.1},
+			.diffuse = {0.2, 0.8, 0.2},
+			.specular = {0.8, 0.8, 0.8},
+			.shininess = 1.0f,
+			.alpha = 0.2,
+		  },
+		  .transform = getTranslationMatrix(5.0f, 1.0f, 0),
+		},
+	  };
+
+	  struct ScratchVIBindings
+	  {
+		  nbl::asset::SBufferBinding<ICPUBuffer> vertex, index;
+	  };
+	  std::array<ScratchVIBindings, std::size(cpuObjects)> scratchBuffers;
+
+	  for (uint32_t i = 0; i < cpuObjects.size(); i++)
+	  {
+		  const auto& cpuObject = cpuObjects[i];
+
+		  auto vBuffer = smart_refctd_ptr(cpuObject.data.bindings[0].buffer); // no offset
+		  auto vUsage = bitflag(IGPUBuffer::EUF_STORAGE_BUFFER_BIT) | IGPUBuffer::EUF_TRANSFER_DST_BIT | IGPUBuffer::EUF_INLINE_UPDATE_VIA_CMDBUF |
+			  IGPUBuffer::EUF_ACCELERATION_STRUCTURE_BUILD_INPUT_READ_ONLY_BIT | IGPUBuffer::EUF_SHADER_DEVICE_ADDRESS_BIT;
+		  vBuffer->addUsageFlags(vUsage);
+		  vBuffer->setContentHash(vBuffer->computeContentHash());
+
+		  auto iBuffer = smart_refctd_ptr(cpuObject.data.indexBuffer.buffer); // no offset
+		  auto iUsage = bitflag(IGPUBuffer::EUF_STORAGE_BUFFER_BIT) | IGPUBuffer::EUF_TRANSFER_DST_BIT | IGPUBuffer::EUF_INLINE_UPDATE_VIA_CMDBUF |
+			  IGPUBuffer::EUF_ACCELERATION_STRUCTURE_BUILD_INPUT_READ_ONLY_BIT | IGPUBuffer::EUF_SHADER_DEVICE_ADDRESS_BIT;
+
+		  if (cpuObject.data.indexType != EIT_UNKNOWN)
+			  if (iBuffer)
+			  {
+				  iBuffer->addUsageFlags(iUsage);
+				  iBuffer->setContentHash(iBuffer->computeContentHash());
+			  }
+
+		  scratchBuffers[i] = {
+			.vertex = {.offset = 0, .buffer = vBuffer},
+			.index = {.offset = 0, .buffer = iBuffer},
+		  };
+
+	  }
+
+	  auto cmdbuf = getSingleUseCommandBufferAndBegin(pool);
+	  cmdbuf->beginDebugMarker("Build geometry vertex and index buffers");
+
+	  CAssetConverter::SInputs inputs = {};
+	  inputs.logger = m_logger.get();
+	  std::array<ICPUBuffer*, std::size(cpuObjects) * 2u> tmpBuffers;
+	  {
+		  for (uint32_t i = 0; i < cpuObjects.size(); i++)
+		  {
+			  tmpBuffers[2 * i + 0] = scratchBuffers[i].vertex.buffer.get();
+			  tmpBuffers[2 * i + 1] = scratchBuffers[i].index.buffer.get();
+		  }
+
+		  std::get<CAssetConverter::SInputs::asset_span_t<ICPUBuffer>>(inputs.assets) = tmpBuffers;
+	  }
+
+	  auto reservation = m_converter->reserve(inputs);
+	  {
+		  auto prepass = [&]<typename asset_type_t>(const auto & references) -> bool
+		  {
+			  auto objects = reservation.getGPUObjects<asset_type_t>();
+			  uint32_t counter = {};
+			  for (auto& object : objects)
+			  {
+				  auto gpu = object.value;
+				  auto* reference = references[counter];
+
+				  if (reference)
+				  {
+					  if (!gpu)
+					  {
+						  m_logger->log("Failed to convert a CPU object to GPU!", ILogger::ELL_ERROR);
+						  return false;
+					  }
+				  }
+				  counter++;
+			  }
+			  return true;
+		  };
+
+		  prepass.template operator() < ICPUBuffer > (tmpBuffers);
+	  }
+
+	  auto geomInfoBuffer = ICPUBuffer::create({ std::size(cpuObjects) * sizeof(STriangleGeomInfo) });
+	  STriangleGeomInfo* geomInfos = reinterpret_cast<STriangleGeomInfo*>(geomInfoBuffer->getPointer());
+
+	  m_gpuTriangleGeometries.reserve(std::size(cpuObjects));
+	  // convert
+	  {
+		  // not sure if need this (probably not, originally for transition img view)
+		  auto semaphore = m_device->createSemaphore(0u);
+
+		  std::array<IQueue::SSubmitInfo::SCommandBufferInfo, 1> cmdbufs = {};
+		  cmdbufs.front().cmdbuf = cmdbuf.get();
+
+		  SIntendedSubmitInfo transfer = {};
+		  transfer.queue = queue;
+		  transfer.scratchCommandBuffers = cmdbufs;
+		  transfer.scratchSemaphore = {
+			.semaphore = semaphore.get(),
+			.value = 0u,
+			.stageMask = PIPELINE_STAGE_FLAGS::ALL_TRANSFER_BITS
+		  };
+
+		  CAssetConverter::SConvertParams params = {};
+		  params.utilities = m_utils.get();
+		  params.transfer = &transfer;
+
+		  auto future = reservation.convert(params);
+		  if (future.copy() != IQueue::RESULT::SUCCESS)
+		  {
+			  m_logger->log("Failed to await submission feature!", ILogger::ELL_ERROR);
+			  return false;
+		  }
+
+		  auto&& buffers = reservation.getGPUObjects<ICPUBuffer>();
+		  for (uint32_t i = 0; i < cpuObjects.size(); i++)
+		  {
+			  auto& cpuObject = cpuObjects[i];
+
+			  m_gpuTriangleGeometries.push_back(ReferenceObjectGpu{
+				.meta = cpuObject.meta,
+				.bindings = {
+				  .vertex = {.offset = 0, .buffer = buffers[2 * i + 0].value },
+				  .index = {.offset = 0, .buffer = buffers[2 * i + 1].value },
+				},
+				.vertexStride = cpuObject.data.inputParams.bindings[0].stride,
+				.indexType = cpuObject.data.indexType,
+				.indexCount = cpuObject.data.indexCount,
+				.material = hlsl::_static_cast<MaterialPacked>(cpuObject.material),
+				.transform = cpuObject.transform,
+				  });
+		  }
+
+		  for (uint32_t i = 0; i < m_gpuTriangleGeometries.size(); i++)
+		  {
+			  const auto& gpuObject = m_gpuTriangleGeometries[i];
+			  const uint64_t vertexBufferAddress = gpuObject.bindings.vertex.buffer->getDeviceAddress();
+			  geomInfos[i] = {
+				.material = gpuObject.material,
+				.vertexBufferAddress = vertexBufferAddress,
+				.indexBufferAddress = gpuObject.useIndex() ? gpuObject.bindings.index.buffer->getDeviceAddress() : vertexBufferAddress,
+				.vertexStride = gpuObject.vertexStride,
+				.objType = gpuObject.meta.type,
+				.indexType = gpuObject.indexType,
+				.smoothNormals = s_smoothNormals[gpuObject.meta.type],
+			  };
+		  }
+	  }
+
+	  {
+		  IGPUBuffer::SCreationParams params;
+		  params.usage = IGPUBuffer::EUF_STORAGE_BUFFER_BIT | IGPUBuffer::EUF_TRANSFER_DST_BIT | IGPUBuffer::EUF_INLINE_UPDATE_VIA_CMDBUF | IGPUBuffer::EUF_SHADER_DEVICE_ADDRESS_BIT;
+		  params.size = geomInfoBuffer->getSize();
+		  m_utils->createFilledDeviceLocalBufferOnDedMem(SIntendedSubmitInfo{ .queue = queue }, std::move(params), geomInfos).move_into(m_triangleGeomInfoBuffer);
+	  }
+
+	  // intersection geometries setup
+	  {
+		  core::vector<SProceduralGeomInfo> proceduralGeoms;
+		  proceduralGeoms.reserve(NumberOfProceduralGeometries);
+		  using Aabb = IGPUBottomLevelAccelerationStructure::AABB_t;
+		  core::vector<Aabb> aabbs;
+		  aabbs.reserve(NumberOfProceduralGeometries);
+		  for (int32_t i = 0; i < NumberOfProceduralGeometries; i++)
+		  {
+			  const auto middle_i = NumberOfProceduralGeometries / 2.0;
+			  SProceduralGeomInfo sphere = {
+				.material = hlsl::_static_cast<MaterialPacked>(Material{
+				  .ambient = {0.1, 0.05 * i, 0.1},
+				  .diffuse = {0.3, 0.2 * i, 0.3},
+				  .specular = {0.8, 0.8, 0.8},
+				  .shininess = 1.0f,
+				}),
+				.center = float32_t3((i - middle_i) * 4.0, 2, 5.0),
+				.radius = 1,
+			  };
+
+			  proceduralGeoms.push_back(sphere);
+			  const auto sphereMin = sphere.center - sphere.radius;
+			  const auto sphereMax = sphere.center + sphere.radius;
+			  aabbs.emplace_back(
+				  vector3d(sphereMin.x, sphereMin.y, sphereMin.z),
+				  vector3d(sphereMax.x, sphereMax.y, sphereMax.z));
+		  }
+
+		  {
+			  IGPUBuffer::SCreationParams params;
+			  params.usage = IGPUBuffer::EUF_STORAGE_BUFFER_BIT | IGPUBuffer::EUF_TRANSFER_DST_BIT | IGPUBuffer::EUF_INLINE_UPDATE_VIA_CMDBUF | IGPUBuffer::EUF_SHADER_DEVICE_ADDRESS_BIT;
+			  params.size = proceduralGeoms.size() * sizeof(SProceduralGeomInfo);
+			  m_utils->createFilledDeviceLocalBufferOnDedMem(SIntendedSubmitInfo{ .queue = queue }, std::move(params), proceduralGeoms.data()).move_into(m_proceduralGeomInfoBuffer);
+		  }
+
+		  {
+			  IGPUBuffer::SCreationParams params;
+			  params.usage = IGPUBuffer::EUF_TRANSFER_DST_BIT | IGPUBuffer::EUF_SHADER_DEVICE_ADDRESS_BIT | IGPUBuffer::EUF_ACCELERATION_STRUCTURE_BUILD_INPUT_READ_ONLY_BIT;
+			  params.size = aabbs.size() * sizeof(Aabb);
+			  m_utils->createFilledDeviceLocalBufferOnDedMem(SIntendedSubmitInfo{ .queue = queue }, std::move(params), aabbs.data()).move_into(m_proceduralAabbBuffer);
+		  }
+	  }
+
+	  return true;
   }
 
   bool createAccelerationStructures(video::CThreadSafeQueueAdapter* queue)
   {
-    // plus 1 blas for procedural geometry contains {{var::NumberOfProcedural}}
-    // spheres. Each sphere is a primitive instead one instance or geometry
-    const auto blasCount = m_gpuTriangleGeometries.size() + 1;
-    const auto proceduralBlasIdx = m_gpuTriangleGeometries.size();
+	// plus 1 blas for procedural geometry contains {{var::NumberOfProcedural}}
+	// spheres. Each sphere is a primitive instead one instance or geometry
+	const auto blasCount = m_gpuTriangleGeometries.size() + 1;
+	const auto proceduralBlasIdx = m_gpuTriangleGeometries.size();
 
-    IQueryPool::SCreationParams qParams{ .queryCount = static_cast<uint32_t>(blasCount), .queryType = IQueryPool::ACCELERATION_STRUCTURE_COMPACTED_SIZE };
-    smart_refctd_ptr<IQueryPool> queryPool = m_device->createQueryPool(std::move(qParams));
+	IQueryPool::SCreationParams qParams{ .queryCount = static_cast<uint32_t>(blasCount), .queryType = IQueryPool::ACCELERATION_STRUCTURE_COMPACTED_SIZE };
+	smart_refctd_ptr<IQueryPool> queryPool = m_device->createQueryPool(std::move(qParams));
 
-    auto pool = m_device->createCommandPool(queue->getFamilyIndex(), IGPUCommandPool::CREATE_FLAGS::RESET_COMMAND_BUFFER_BIT | IGPUCommandPool::CREATE_FLAGS::TRANSIENT_BIT);
-    if (!pool)
-      return logFail("Couldn't create Command Pool for blas/tlas creation!");
+	auto pool = m_device->createCommandPool(queue->getFamilyIndex(), IGPUCommandPool::CREATE_FLAGS::RESET_COMMAND_BUFFER_BIT | IGPUCommandPool::CREATE_FLAGS::TRANSIENT_BIT);
+	if (!pool)
+	  return logFail("Couldn't create Command Pool for blas/tlas creation!");
 
-    m_api->startCapture();
+	m_api->startCapture();
 #ifdef TRY_BUILD_FOR_NGFX // NSight is "debugger-challenged" it can't capture anything not happenning "during a frame", so we need to trick it
-    m_currentImageAcquire = m_surface->acquireNextImage();
-    {
-      const IQueue::SSubmitInfo::SSemaphoreInfo acquired[] = { {
-        .semaphore = m_currentImageAcquire.semaphore,
-        .value = m_currentImageAcquire.acquireCount,
-        .stageMask = PIPELINE_STAGE_FLAGS::ALL_COMMANDS_BITS
-      } };
-      m_surface->present(m_currentImageAcquire.imageIndex, acquired);
-    }
-    m_currentImageAcquire = m_surface->acquireNextImage();
+	m_currentImageAcquire = m_surface->acquireNextImage();
+	{
+	  const IQueue::SSubmitInfo::SSemaphoreInfo acquired[] = { {
+		.semaphore = m_currentImageAcquire.semaphore,
+		.value = m_currentImageAcquire.acquireCount,
+		.stageMask = PIPELINE_STAGE_FLAGS::ALL_COMMANDS_BITS
+	  } };
+	  m_surface->present(m_currentImageAcquire.imageIndex, acquired);
+	}
+	m_currentImageAcquire = m_surface->acquireNextImage();
 #endif
-    size_t totalScratchSize = 0;
-    const auto scratchOffsetAlignment = m_device->getPhysicalDevice()->getLimits().minAccelerationStructureScratchOffsetAlignment;
-
-    // build bottom level ASes
-    {
-      core::vector<uint32_t> primitiveCounts(blasCount);
-      core::vector<IGPUBottomLevelAccelerationStructure::Triangles<const IGPUBuffer>> triangles(m_gpuTriangleGeometries.size());
-      core::vector<uint32_t> scratchSizes(blasCount);
-      IGPUBottomLevelAccelerationStructure::AABBs<const IGPUBuffer> aabbs;
-
-      auto blasFlags = bitflag(IGPUBottomLevelAccelerationStructure::BUILD_FLAGS::PREFER_FAST_TRACE_BIT) | IGPUBottomLevelAccelerationStructure::BUILD_FLAGS::ALLOW_COMPACTION_BIT;
-      if (m_physicalDevice->getProperties().limits.rayTracingPositionFetch)
-        blasFlags |= IGPUBottomLevelAccelerationStructure::BUILD_FLAGS::ALLOW_DATA_ACCESS;
-
-      IGPUBottomLevelAccelerationStructure::DeviceBuildInfo initBuildInfo;
-      initBuildInfo.buildFlags = blasFlags;
-      initBuildInfo.geometryCount = 1;	// only 1 geometry object per blas
-      initBuildInfo.srcAS = nullptr;
-      initBuildInfo.dstAS = nullptr;
-      initBuildInfo.scratch = {};
-
-      auto blasBuildInfos = core::vector(blasCount, initBuildInfo);
-
-      m_gpuBlasList.resize(blasCount);
-      // setup blas info for triangle geometries
-      for (uint32_t i = 0; i < blasCount; i++)
-      {
-        const auto isProcedural = i == proceduralBlasIdx;
-        if (isProcedural)
-        {
-          aabbs.data.buffer = smart_refctd_ptr(m_proceduralAabbBuffer);
-          aabbs.data.offset = 0;
-          aabbs.stride = sizeof(IGPUBottomLevelAccelerationStructure::AABB_t);
-          aabbs.geometryFlags = IGPUBottomLevelAccelerationStructure::GEOMETRY_FLAGS::OPAQUE_BIT; // only allow opaque for now
-
-          primitiveCounts[proceduralBlasIdx] = NumberOfProceduralGeometries;
-          blasBuildInfos[proceduralBlasIdx].aabbs = &aabbs;
-          blasBuildInfos[proceduralBlasIdx].buildFlags |= IGPUBottomLevelAccelerationStructure::BUILD_FLAGS::GEOMETRY_TYPE_IS_AABB_BIT;
-        } else
-        {
-          const auto& gpuObject = m_gpuTriangleGeometries[i];
-
-          const uint32_t vertexStride = gpuObject.vertexStride;
-          const uint32_t numVertices = gpuObject.bindings.vertex.buffer->getSize() / vertexStride;
-          if (gpuObject.useIndex())
-            primitiveCounts[i] = gpuObject.indexCount / 3;
-          else
-            primitiveCounts[i] = numVertices / 3;
-
-          triangles[i].vertexData[0] = gpuObject.bindings.vertex;
-          triangles[i].indexData = gpuObject.useIndex() ? gpuObject.bindings.index : gpuObject.bindings.vertex;
-          triangles[i].maxVertex = numVertices - 1;
-          triangles[i].vertexStride = vertexStride;
-          triangles[i].vertexFormat = EF_R32G32B32_SFLOAT;
-          triangles[i].indexType = gpuObject.indexType;
-          triangles[i].geometryFlags = gpuObject.material.isTransparent() ?
-            IGPUBottomLevelAccelerationStructure::GEOMETRY_FLAGS::NO_DUPLICATE_ANY_HIT_INVOCATION_BIT :
-            IGPUBottomLevelAccelerationStructure::GEOMETRY_FLAGS::OPAQUE_BIT;
-
-          blasBuildInfos[i].triangles = &triangles[i];
-        }
-        ILogicalDevice::AccelerationStructureBuildSizes buildSizes;
-        {
-          const uint32_t maxPrimCount[1] = { primitiveCounts[i] };
-          if (isProcedural)
-          {
-            const auto* aabbData = &aabbs;
-            buildSizes = m_device->getAccelerationStructureBuildSizes(blasBuildInfos[i].buildFlags, false, std::span{ aabbData, 1}, maxPrimCount);
-          }
-          else
-          {
-            const auto* trianglesData = triangles.data();
-            buildSizes = m_device->getAccelerationStructureBuildSizes(blasBuildInfos[i].buildFlags, false, std::span{trianglesData,1}, maxPrimCount);
-          }
-          if (!buildSizes)
-            return logFail("Failed to get BLAS build sizes");
-        }
-
-        scratchSizes[i] = buildSizes.buildScratchSize;
-        totalScratchSize = core::alignUp(totalScratchSize, scratchOffsetAlignment);
-        totalScratchSize += buildSizes.buildScratchSize;
-
-        {
-          IGPUBuffer::SCreationParams params;
-          params.usage = bitflag(IGPUBuffer::EUF_SHADER_DEVICE_ADDRESS_BIT) | IGPUBuffer::EUF_ACCELERATION_STRUCTURE_STORAGE_BIT;
-          params.size = buildSizes.accelerationStructureSize;
-          smart_refctd_ptr<IGPUBuffer> asBuffer = createBuffer(params);
-
-          IGPUBottomLevelAccelerationStructure::SCreationParams blasParams;
-          blasParams.bufferRange.buffer = asBuffer;
-          blasParams.bufferRange.offset = 0u;
-          blasParams.bufferRange.size = buildSizes.accelerationStructureSize;
-          blasParams.flags = IGPUBottomLevelAccelerationStructure::SCreationParams::FLAGS::NONE;
-          m_gpuBlasList[i] = m_device->createBottomLevelAccelerationStructure(std::move(blasParams));
-          if (!m_gpuBlasList[i])
-            return logFail("Could not create BLAS");
-        }
-      }
-
-
-      auto cmdbufBlas = getSingleUseCommandBufferAndBegin(pool);
-      cmdbufBlas->beginDebugMarker("Build BLAS");
-
-      cmdbufBlas->resetQueryPool(queryPool.get(), 0, blasCount);
-
-      smart_refctd_ptr<IGPUBuffer> scratchBuffer;
-      {
-        IGPUBuffer::SCreationParams params;
-        params.usage = bitflag(IGPUBuffer::EUF_SHADER_DEVICE_ADDRESS_BIT) | IGPUBuffer::EUF_STORAGE_BUFFER_BIT;
-        params.size = totalScratchSize;
-        scratchBuffer = createBuffer(params);
-      }
-
-      core::vector<IGPUBottomLevelAccelerationStructure::BuildRangeInfo> buildRangeInfos(blasCount);
-      core::vector<IGPUBottomLevelAccelerationStructure::BuildRangeInfo*> pRangeInfos(blasCount);
-      for (uint32_t i = 0; i < blasCount; i++)
-      {
-        blasBuildInfos[i].dstAS = m_gpuBlasList[i].get();
-        blasBuildInfos[i].scratch.buffer = scratchBuffer;
-        if (i == 0)
-        {
-          blasBuildInfos[i].scratch.offset = 0u;
-        } else
-        {
-          const auto unalignedOffset = blasBuildInfos[i - 1].scratch.offset + scratchSizes[i - 1];
-          blasBuildInfos[i].scratch.offset = core::alignUp(unalignedOffset, scratchOffsetAlignment);
-        }
-
-        buildRangeInfos[i].primitiveCount = primitiveCounts[i];
-        buildRangeInfos[i].primitiveByteOffset = 0u;
-        buildRangeInfos[i].firstVertex = 0u;
-        buildRangeInfos[i].transformByteOffset = 0u;
-
-        pRangeInfos[i] = &buildRangeInfos[i];
-      }
-
-      if (!cmdbufBlas->buildAccelerationStructures(std::span(blasBuildInfos), pRangeInfos.data()))
-        return logFail("Failed to build BLAS");
-
-      {
-        SMemoryBarrier memBarrier;
-        memBarrier.srcStageMask = PIPELINE_STAGE_FLAGS::ACCELERATION_STRUCTURE_BUILD_BIT;
-        memBarrier.srcAccessMask = ACCESS_FLAGS::ACCELERATION_STRUCTURE_WRITE_BIT;
-        memBarrier.dstStageMask = PIPELINE_STAGE_FLAGS::ACCELERATION_STRUCTURE_BUILD_BIT;
-        memBarrier.dstAccessMask = ACCESS_FLAGS::ACCELERATION_STRUCTURE_READ_BIT;
-        cmdbufBlas->pipelineBarrier(E_DEPENDENCY_FLAGS::EDF_NONE, { .memBarriers = {&memBarrier, 1} });
-      }
-
-
-      core::vector<const IGPUAccelerationStructure*> ases(blasCount);
-      for (uint32_t i = 0; i < blasCount; i++)
-        ases[i] = m_gpuBlasList[i].get();
-      if (!cmdbufBlas->writeAccelerationStructureProperties(std::span(ases), IQueryPool::ACCELERATION_STRUCTURE_COMPACTED_SIZE,
-        queryPool.get(), 0))
-        return logFail("Failed to write acceleration structure properties!");
-
-      cmdbufBlas->endDebugMarker();
-      cmdbufSubmitAndWait(cmdbufBlas, queue, 39);
-    }
-
-    auto cmdbufCompact = getSingleUseCommandBufferAndBegin(pool);
-    cmdbufCompact->beginDebugMarker("Compact BLAS");
-
-    // compact blas
-    {
-      core::vector<size_t> asSizes(blasCount);
-      if (!m_device->getQueryPoolResults(queryPool.get(), 0, blasCount, asSizes.data(), sizeof(size_t), bitflag(IQueryPool::WAIT_BIT) | IQueryPool::_64_BIT))
-        return logFail("Could not get query pool results for AS sizes");
-
-      core::vector<smart_refctd_ptr<IGPUBottomLevelAccelerationStructure>> cleanupBlas(blasCount);
-      for (uint32_t i = 0; i < blasCount; i++)
-      {
-        if (asSizes[i] == 0) continue;
-        cleanupBlas[i] = m_gpuBlasList[i];
-        {
-          IGPUBuffer::SCreationParams params;
-          params.usage = bitflag(IGPUBuffer::EUF_SHADER_DEVICE_ADDRESS_BIT) | IGPUBuffer::EUF_ACCELERATION_STRUCTURE_STORAGE_BIT;
-          params.size = asSizes[i];
-          smart_refctd_ptr<IGPUBuffer> asBuffer = createBuffer(params);
-
-          IGPUBottomLevelAccelerationStructure::SCreationParams blasParams;
-          blasParams.bufferRange.buffer = asBuffer;
-          blasParams.bufferRange.offset = 0u;
-          blasParams.bufferRange.size = asSizes[i];
-          blasParams.flags = IGPUBottomLevelAccelerationStructure::SCreationParams::FLAGS::NONE;
-          m_gpuBlasList[i] = m_device->createBottomLevelAccelerationStructure(std::move(blasParams));
-          if (!m_gpuBlasList[i])
-            return logFail("Could not create compacted BLAS");
-        }
-
-        IGPUBottomLevelAccelerationStructure::CopyInfo copyInfo;
-        copyInfo.src = cleanupBlas[i].get();
-        copyInfo.dst = m_gpuBlasList[i].get();
-        copyInfo.mode = IGPUBottomLevelAccelerationStructure::COPY_MODE::COMPACT;
-        if (!cmdbufCompact->copyAccelerationStructure(copyInfo))
-          return logFail("Failed to copy AS to compact");
-      }
-    }
-
-    cmdbufCompact->endDebugMarker();
-    cmdbufSubmitAndWait(cmdbufCompact, queue, 40);
-
-    auto cmdbufTlas = getSingleUseCommandBufferAndBegin(pool);
-    cmdbufTlas->beginDebugMarker("Build TLAS");
-
-    // build top level AS
-    {
-      const uint32_t instancesCount = blasCount;
-      core::vector<IGPUTopLevelAccelerationStructure::DeviceStaticInstance> instances(instancesCount);
-      for (uint32_t i = 0; i < instancesCount; i++)
-      {
-        const auto isProceduralInstance = i == proceduralBlasIdx;
-        instances[i].base.blas.deviceAddress = m_gpuBlasList[i]->getReferenceForDeviceOperations().deviceAddress;
-        instances[i].base.mask = 0xFF;
-        instances[i].base.instanceCustomIndex = i;
-        instances[i].base.instanceShaderBindingTableRecordOffset = isProceduralInstance ? 2 : 0;
-        instances[i].base.flags = static_cast<uint32_t>(IGPUTopLevelAccelerationStructure::INSTANCE_FLAGS::TRIANGLE_FACING_CULL_DISABLE_BIT);
-        instances[i].transform = isProceduralInstance ? matrix3x4SIMD() : m_gpuTriangleGeometries[i].transform;
-      }
-
-      {
-        size_t bufSize = instancesCount * sizeof(IGPUTopLevelAccelerationStructure::DeviceStaticInstance);
-        IGPUBuffer::SCreationParams params;
-        params.usage = bitflag(IGPUBuffer::EUF_ACCELERATION_STRUCTURE_BUILD_INPUT_READ_ONLY_BIT) | IGPUBuffer::EUF_STORAGE_BUFFER_BIT |
-          IGPUBuffer::EUF_INLINE_UPDATE_VIA_CMDBUF | IGPUBuffer::EUF_TRANSFER_DST_BIT | IGPUBuffer::EUF_SHADER_DEVICE_ADDRESS_BIT;
-        params.size = bufSize;
-        m_instanceBuffer = createBuffer(params);
-
-        SBufferRange<IGPUBuffer> range = { .offset = 0u, .size = bufSize, .buffer = m_instanceBuffer };
-        cmdbufTlas->updateBuffer(range, instances.data());
-      }
-
-      // make sure instances upload complete first
-      {
-        SMemoryBarrier memBarrier;
-        memBarrier.srcStageMask = PIPELINE_STAGE_FLAGS::ALL_TRANSFER_BITS;
-        memBarrier.srcAccessMask = ACCESS_FLAGS::TRANSFER_WRITE_BIT;
-        memBarrier.dstStageMask = PIPELINE_STAGE_FLAGS::ACCELERATION_STRUCTURE_BUILD_BIT;
-        memBarrier.dstAccessMask = ACCESS_FLAGS::ACCELERATION_STRUCTURE_WRITE_BIT;
-        cmdbufTlas->pipelineBarrier(E_DEPENDENCY_FLAGS::EDF_NONE, { .memBarriers = {&memBarrier, 1} });
-      }
-
-      auto tlasFlags = bitflag(IGPUTopLevelAccelerationStructure::BUILD_FLAGS::PREFER_FAST_TRACE_BIT);
-
-      IGPUTopLevelAccelerationStructure::DeviceBuildInfo tlasBuildInfo;
-      tlasBuildInfo.buildFlags = tlasFlags;
-      tlasBuildInfo.srcAS = nullptr;
-      tlasBuildInfo.dstAS = nullptr;
-      tlasBuildInfo.instanceData.buffer = m_instanceBuffer;
-      tlasBuildInfo.instanceData.offset = 0u;
-      tlasBuildInfo.scratch = {};
-
-      auto buildSizes = m_device->getAccelerationStructureBuildSizes(tlasFlags, false, instancesCount);
-      if (!buildSizes)
-        return logFail("Failed to get TLAS build sizes");
-
-      {
-        IGPUBuffer::SCreationParams params;
-        params.usage = bitflag(IGPUBuffer::EUF_SHADER_DEVICE_ADDRESS_BIT) | IGPUBuffer::EUF_ACCELERATION_STRUCTURE_STORAGE_BIT;
-        params.size = buildSizes.accelerationStructureSize;
-        smart_refctd_ptr<IGPUBuffer> asBuffer = createBuffer(params);
-
-        IGPUTopLevelAccelerationStructure::SCreationParams tlasParams;
-        tlasParams.bufferRange.buffer = asBuffer;
-        tlasParams.bufferRange.offset = 0u;
-        tlasParams.bufferRange.size = buildSizes.accelerationStructureSize;
-        tlasParams.flags = IGPUTopLevelAccelerationStructure::SCreationParams::FLAGS::NONE;
-        m_gpuTlas = m_device->createTopLevelAccelerationStructure(std::move(tlasParams));
-        if (!m_gpuTlas)
-          return logFail("Could not create TLAS");
-      }
-
-      smart_refctd_ptr<IGPUBuffer> scratchBuffer;
-      {
-        IGPUBuffer::SCreationParams params;
-        params.usage = bitflag(IGPUBuffer::EUF_SHADER_DEVICE_ADDRESS_BIT) | IGPUBuffer::EUF_STORAGE_BUFFER_BIT;
-        params.size = buildSizes.buildScratchSize;
-        scratchBuffer = createBuffer(params);
-      }
-
-      tlasBuildInfo.dstAS = m_gpuTlas.get();
-      tlasBuildInfo.scratch.buffer = scratchBuffer;
-      tlasBuildInfo.scratch.offset = 0u;
-
-      IGPUTopLevelAccelerationStructure::BuildRangeInfo buildRangeInfo[1u];
-      buildRangeInfo[0].instanceCount = instancesCount;
-      buildRangeInfo[0].instanceByteOffset = 0u;
-      IGPUTopLevelAccelerationStructure::BuildRangeInfo* pRangeInfos;
-      pRangeInfos = &buildRangeInfo[0];
-
-      if (!cmdbufTlas->buildAccelerationStructures({ &tlasBuildInfo, 1 }, pRangeInfos))
-        return logFail("Failed to build TLAS");
-    }
-
-    cmdbufTlas->endDebugMarker();
-    cmdbufSubmitAndWait(cmdbufTlas, queue, 45);
+	size_t totalScratchSize = 0;
+	const auto scratchOffsetAlignment = m_device->getPhysicalDevice()->getLimits().minAccelerationStructureScratchOffsetAlignment;
+
+	// build bottom level ASes
+	{
+	  core::vector<uint32_t> primitiveCounts(blasCount);
+	  core::vector<IGPUBottomLevelAccelerationStructure::Triangles<const IGPUBuffer>> triangles(m_gpuTriangleGeometries.size());
+	  core::vector<uint32_t> scratchSizes(blasCount);
+	  IGPUBottomLevelAccelerationStructure::AABBs<const IGPUBuffer> aabbs;
+
+	  auto blasFlags = bitflag(IGPUBottomLevelAccelerationStructure::BUILD_FLAGS::PREFER_FAST_TRACE_BIT) | IGPUBottomLevelAccelerationStructure::BUILD_FLAGS::ALLOW_COMPACTION_BIT;
+	  if (m_physicalDevice->getProperties().limits.rayTracingPositionFetch)
+		blasFlags |= IGPUBottomLevelAccelerationStructure::BUILD_FLAGS::ALLOW_DATA_ACCESS;
+
+	  IGPUBottomLevelAccelerationStructure::DeviceBuildInfo initBuildInfo;
+	  initBuildInfo.buildFlags = blasFlags;
+	  initBuildInfo.geometryCount = 1;	// only 1 geometry object per blas
+	  initBuildInfo.srcAS = nullptr;
+	  initBuildInfo.dstAS = nullptr;
+	  initBuildInfo.scratch = {};
+
+	  auto blasBuildInfos = core::vector(blasCount, initBuildInfo);
+
+	  m_gpuBlasList.resize(blasCount);
+	  // setup blas info for triangle geometries
+	  for (uint32_t i = 0; i < blasCount; i++)
+	  {
+		const auto isProcedural = i == proceduralBlasIdx;
+		if (isProcedural)
+		{
+		  aabbs.data.buffer = smart_refctd_ptr(m_proceduralAabbBuffer);
+		  aabbs.data.offset = 0;
+		  aabbs.stride = sizeof(IGPUBottomLevelAccelerationStructure::AABB_t);
+		  aabbs.geometryFlags = IGPUBottomLevelAccelerationStructure::GEOMETRY_FLAGS::OPAQUE_BIT; // only allow opaque for now
+
+		  primitiveCounts[proceduralBlasIdx] = NumberOfProceduralGeometries;
+		  blasBuildInfos[proceduralBlasIdx].aabbs = &aabbs;
+		  blasBuildInfos[proceduralBlasIdx].buildFlags |= IGPUBottomLevelAccelerationStructure::BUILD_FLAGS::GEOMETRY_TYPE_IS_AABB_BIT;
+		} else
+		{
+		  const auto& gpuObject = m_gpuTriangleGeometries[i];
+
+		  const uint32_t vertexStride = gpuObject.vertexStride;
+		  const uint32_t numVertices = gpuObject.bindings.vertex.buffer->getSize() / vertexStride;
+		  if (gpuObject.useIndex())
+			primitiveCounts[i] = gpuObject.indexCount / 3;
+		  else
+			primitiveCounts[i] = numVertices / 3;
+
+		  triangles[i].vertexData[0] = gpuObject.bindings.vertex;
+		  triangles[i].indexData = gpuObject.useIndex() ? gpuObject.bindings.index : gpuObject.bindings.vertex;
+		  triangles[i].maxVertex = numVertices - 1;
+		  triangles[i].vertexStride = vertexStride;
+		  triangles[i].vertexFormat = EF_R32G32B32_SFLOAT;
+		  triangles[i].indexType = gpuObject.indexType;
+		  triangles[i].geometryFlags = gpuObject.material.isTransparent() ?
+			IGPUBottomLevelAccelerationStructure::GEOMETRY_FLAGS::NO_DUPLICATE_ANY_HIT_INVOCATION_BIT :
+			IGPUBottomLevelAccelerationStructure::GEOMETRY_FLAGS::OPAQUE_BIT;
+
+		  blasBuildInfos[i].triangles = &triangles[i];
+		}
+		ILogicalDevice::AccelerationStructureBuildSizes buildSizes;
+		{
+		  const uint32_t maxPrimCount[1] = { primitiveCounts[i] };
+		  if (isProcedural)
+		  {
+			const auto* aabbData = &aabbs;
+			buildSizes = m_device->getAccelerationStructureBuildSizes(blasBuildInfos[i].buildFlags, false, std::span{ aabbData, 1}, maxPrimCount);
+		  }
+		  else
+		  {
+			const auto* trianglesData = triangles.data();
+			buildSizes = m_device->getAccelerationStructureBuildSizes(blasBuildInfos[i].buildFlags, false, std::span{trianglesData,1}, maxPrimCount);
+		  }
+		  if (!buildSizes)
+			return logFail("Failed to get BLAS build sizes");
+		}
+
+		scratchSizes[i] = buildSizes.buildScratchSize;
+		totalScratchSize = core::alignUp(totalScratchSize, scratchOffsetAlignment);
+		totalScratchSize += buildSizes.buildScratchSize;
+
+		{
+		  IGPUBuffer::SCreationParams params;
+		  params.usage = bitflag(IGPUBuffer::EUF_SHADER_DEVICE_ADDRESS_BIT) | IGPUBuffer::EUF_ACCELERATION_STRUCTURE_STORAGE_BIT;
+		  params.size = buildSizes.accelerationStructureSize;
+		  smart_refctd_ptr<IGPUBuffer> asBuffer = createBuffer(params);
+
+		  IGPUBottomLevelAccelerationStructure::SCreationParams blasParams;
+		  blasParams.bufferRange.buffer = asBuffer;
+		  blasParams.bufferRange.offset = 0u;
+		  blasParams.bufferRange.size = buildSizes.accelerationStructureSize;
+		  blasParams.flags = IGPUBottomLevelAccelerationStructure::SCreationParams::FLAGS::NONE;
+		  m_gpuBlasList[i] = m_device->createBottomLevelAccelerationStructure(std::move(blasParams));
+		  if (!m_gpuBlasList[i])
+			return logFail("Could not create BLAS");
+		}
+	  }
+
+
+	  auto cmdbufBlas = getSingleUseCommandBufferAndBegin(pool);
+	  cmdbufBlas->beginDebugMarker("Build BLAS");
+
+	  cmdbufBlas->resetQueryPool(queryPool.get(), 0, blasCount);
+
+	  smart_refctd_ptr<IGPUBuffer> scratchBuffer;
+	  {
+		IGPUBuffer::SCreationParams params;
+		params.usage = bitflag(IGPUBuffer::EUF_SHADER_DEVICE_ADDRESS_BIT) | IGPUBuffer::EUF_STORAGE_BUFFER_BIT;
+		params.size = totalScratchSize;
+		scratchBuffer = createBuffer(params);
+	  }
+
+	  core::vector<IGPUBottomLevelAccelerationStructure::BuildRangeInfo> buildRangeInfos(blasCount);
+	  core::vector<IGPUBottomLevelAccelerationStructure::BuildRangeInfo*> pRangeInfos(blasCount);
+	  for (uint32_t i = 0; i < blasCount; i++)
+	  {
+		blasBuildInfos[i].dstAS = m_gpuBlasList[i].get();
+		blasBuildInfos[i].scratch.buffer = scratchBuffer;
+		if (i == 0)
+		{
+		  blasBuildInfos[i].scratch.offset = 0u;
+		} else
+		{
+		  const auto unalignedOffset = blasBuildInfos[i - 1].scratch.offset + scratchSizes[i - 1];
+		  blasBuildInfos[i].scratch.offset = core::alignUp(unalignedOffset, scratchOffsetAlignment);
+		}
+
+		buildRangeInfos[i].primitiveCount = primitiveCounts[i];
+		buildRangeInfos[i].primitiveByteOffset = 0u;
+		buildRangeInfos[i].firstVertex = 0u;
+		buildRangeInfos[i].transformByteOffset = 0u;
+
+		pRangeInfos[i] = &buildRangeInfos[i];
+	  }
+
+	  if (!cmdbufBlas->buildAccelerationStructures(std::span(blasBuildInfos), pRangeInfos.data()))
+		return logFail("Failed to build BLAS");
+
+	  {
+		SMemoryBarrier memBarrier;
+		memBarrier.srcStageMask = PIPELINE_STAGE_FLAGS::ACCELERATION_STRUCTURE_BUILD_BIT;
+		memBarrier.srcAccessMask = ACCESS_FLAGS::ACCELERATION_STRUCTURE_WRITE_BIT;
+		memBarrier.dstStageMask = PIPELINE_STAGE_FLAGS::ACCELERATION_STRUCTURE_BUILD_BIT;
+		memBarrier.dstAccessMask = ACCESS_FLAGS::ACCELERATION_STRUCTURE_READ_BIT;
+		cmdbufBlas->pipelineBarrier(E_DEPENDENCY_FLAGS::EDF_NONE, { .memBarriers = {&memBarrier, 1} });
+	  }
+
+
+	  core::vector<const IGPUAccelerationStructure*> ases(blasCount);
+	  for (uint32_t i = 0; i < blasCount; i++)
+		ases[i] = m_gpuBlasList[i].get();
+	  if (!cmdbufBlas->writeAccelerationStructureProperties(std::span(ases), IQueryPool::ACCELERATION_STRUCTURE_COMPACTED_SIZE,
+		queryPool.get(), 0))
+		return logFail("Failed to write acceleration structure properties!");
+
+	  cmdbufBlas->endDebugMarker();
+	  cmdbufSubmitAndWait(cmdbufBlas, queue, 39);
+	}
+
+	auto cmdbufCompact = getSingleUseCommandBufferAndBegin(pool);
+	cmdbufCompact->beginDebugMarker("Compact BLAS");
+
+	// compact blas
+	{
+	  core::vector<size_t> asSizes(blasCount);
+	  if (!m_device->getQueryPoolResults(queryPool.get(), 0, blasCount, asSizes.data(), sizeof(size_t), bitflag(IQueryPool::WAIT_BIT) | IQueryPool::_64_BIT))
+		return logFail("Could not get query pool results for AS sizes");
+
+	  core::vector<smart_refctd_ptr<IGPUBottomLevelAccelerationStructure>> cleanupBlas(blasCount);
+	  for (uint32_t i = 0; i < blasCount; i++)
+	  {
+		if (asSizes[i] == 0) continue;
+		cleanupBlas[i] = m_gpuBlasList[i];
+		{
+		  IGPUBuffer::SCreationParams params;
+		  params.usage = bitflag(IGPUBuffer::EUF_SHADER_DEVICE_ADDRESS_BIT) | IGPUBuffer::EUF_ACCELERATION_STRUCTURE_STORAGE_BIT;
+		  params.size = asSizes[i];
+		  smart_refctd_ptr<IGPUBuffer> asBuffer = createBuffer(params);
+
+		  IGPUBottomLevelAccelerationStructure::SCreationParams blasParams;
+		  blasParams.bufferRange.buffer = asBuffer;
+		  blasParams.bufferRange.offset = 0u;
+		  blasParams.bufferRange.size = asSizes[i];
+		  blasParams.flags = IGPUBottomLevelAccelerationStructure::SCreationParams::FLAGS::NONE;
+		  m_gpuBlasList[i] = m_device->createBottomLevelAccelerationStructure(std::move(blasParams));
+		  if (!m_gpuBlasList[i])
+			return logFail("Could not create compacted BLAS");
+		}
+
+		IGPUBottomLevelAccelerationStructure::CopyInfo copyInfo;
+		copyInfo.src = cleanupBlas[i].get();
+		copyInfo.dst = m_gpuBlasList[i].get();
+		copyInfo.mode = IGPUBottomLevelAccelerationStructure::COPY_MODE::COMPACT;
+		if (!cmdbufCompact->copyAccelerationStructure(copyInfo))
+		  return logFail("Failed to copy AS to compact");
+	  }
+	}
+
+	cmdbufCompact->endDebugMarker();
+	cmdbufSubmitAndWait(cmdbufCompact, queue, 40);
+
+	auto cmdbufTlas = getSingleUseCommandBufferAndBegin(pool);
+	cmdbufTlas->beginDebugMarker("Build TLAS");
+
+	// build top level AS
+	{
+	  const uint32_t instancesCount = blasCount;
+	  core::vector<IGPUTopLevelAccelerationStructure::DeviceStaticInstance> instances(instancesCount);
+	  for (uint32_t i = 0; i < instancesCount; i++)
+	  {
+		const auto isProceduralInstance = i == proceduralBlasIdx;
+		instances[i].base.blas.deviceAddress = m_gpuBlasList[i]->getReferenceForDeviceOperations().deviceAddress;
+		instances[i].base.mask = 0xFF;
+		instances[i].base.instanceCustomIndex = i;
+		instances[i].base.instanceShaderBindingTableRecordOffset = isProceduralInstance ? 2 : 0;
+		instances[i].base.flags = static_cast<uint32_t>(IGPUTopLevelAccelerationStructure::INSTANCE_FLAGS::TRIANGLE_FACING_CULL_DISABLE_BIT);
+		instances[i].transform = isProceduralInstance ? matrix3x4SIMD() : m_gpuTriangleGeometries[i].transform;
+	  }
+
+	  {
+		size_t bufSize = instancesCount * sizeof(IGPUTopLevelAccelerationStructure::DeviceStaticInstance);
+		IGPUBuffer::SCreationParams params;
+		params.usage = bitflag(IGPUBuffer::EUF_ACCELERATION_STRUCTURE_BUILD_INPUT_READ_ONLY_BIT) | IGPUBuffer::EUF_STORAGE_BUFFER_BIT |
+		  IGPUBuffer::EUF_INLINE_UPDATE_VIA_CMDBUF | IGPUBuffer::EUF_TRANSFER_DST_BIT | IGPUBuffer::EUF_SHADER_DEVICE_ADDRESS_BIT;
+		params.size = bufSize;
+		m_instanceBuffer = createBuffer(params);
+
+		SBufferRange<IGPUBuffer> range = { .offset = 0u, .size = bufSize, .buffer = m_instanceBuffer };
+		cmdbufTlas->updateBuffer(range, instances.data());
+	  }
+
+	  // make sure instances upload complete first
+	  {
+		SMemoryBarrier memBarrier;
+		memBarrier.srcStageMask = PIPELINE_STAGE_FLAGS::ALL_TRANSFER_BITS;
+		memBarrier.srcAccessMask = ACCESS_FLAGS::TRANSFER_WRITE_BIT;
+		memBarrier.dstStageMask = PIPELINE_STAGE_FLAGS::ACCELERATION_STRUCTURE_BUILD_BIT;
+		memBarrier.dstAccessMask = ACCESS_FLAGS::ACCELERATION_STRUCTURE_WRITE_BIT;
+		cmdbufTlas->pipelineBarrier(E_DEPENDENCY_FLAGS::EDF_NONE, { .memBarriers = {&memBarrier, 1} });
+	  }
+
+	  auto tlasFlags = bitflag(IGPUTopLevelAccelerationStructure::BUILD_FLAGS::PREFER_FAST_TRACE_BIT);
+
+	  IGPUTopLevelAccelerationStructure::DeviceBuildInfo tlasBuildInfo;
+	  tlasBuildInfo.buildFlags = tlasFlags;
+	  tlasBuildInfo.srcAS = nullptr;
+	  tlasBuildInfo.dstAS = nullptr;
+	  tlasBuildInfo.instanceData.buffer = m_instanceBuffer;
+	  tlasBuildInfo.instanceData.offset = 0u;
+	  tlasBuildInfo.scratch = {};
+
+	  auto buildSizes = m_device->getAccelerationStructureBuildSizes(tlasFlags, false, instancesCount);
+	  if (!buildSizes)
+		return logFail("Failed to get TLAS build sizes");
+
+	  {
+		IGPUBuffer::SCreationParams params;
+		params.usage = bitflag(IGPUBuffer::EUF_SHADER_DEVICE_ADDRESS_BIT) | IGPUBuffer::EUF_ACCELERATION_STRUCTURE_STORAGE_BIT;
+		params.size = buildSizes.accelerationStructureSize;
+		smart_refctd_ptr<IGPUBuffer> asBuffer = createBuffer(params);
+
+		IGPUTopLevelAccelerationStructure::SCreationParams tlasParams;
+		tlasParams.bufferRange.buffer = asBuffer;
+		tlasParams.bufferRange.offset = 0u;
+		tlasParams.bufferRange.size = buildSizes.accelerationStructureSize;
+		tlasParams.flags = IGPUTopLevelAccelerationStructure::SCreationParams::FLAGS::NONE;
+		m_gpuTlas = m_device->createTopLevelAccelerationStructure(std::move(tlasParams));
+		if (!m_gpuTlas)
+		  return logFail("Could not create TLAS");
+	  }
+
+	  smart_refctd_ptr<IGPUBuffer> scratchBuffer;
+	  {
+		IGPUBuffer::SCreationParams params;
+		params.usage = bitflag(IGPUBuffer::EUF_SHADER_DEVICE_ADDRESS_BIT) | IGPUBuffer::EUF_STORAGE_BUFFER_BIT;
+		params.size = buildSizes.buildScratchSize;
+		scratchBuffer = createBuffer(params);
+	  }
+
+	  tlasBuildInfo.dstAS = m_gpuTlas.get();
+	  tlasBuildInfo.scratch.buffer = scratchBuffer;
+	  tlasBuildInfo.scratch.offset = 0u;
+
+	  IGPUTopLevelAccelerationStructure::BuildRangeInfo buildRangeInfo[1u];
+	  buildRangeInfo[0].instanceCount = instancesCount;
+	  buildRangeInfo[0].instanceByteOffset = 0u;
+	  IGPUTopLevelAccelerationStructure::BuildRangeInfo* pRangeInfos;
+	  pRangeInfos = &buildRangeInfo[0];
+
+	  if (!cmdbufTlas->buildAccelerationStructures({ &tlasBuildInfo, 1 }, pRangeInfos))
+		return logFail("Failed to build TLAS");
+	}
+
+	cmdbufTlas->endDebugMarker();
+	cmdbufSubmitAndWait(cmdbufTlas, queue, 45);
 
 #ifdef TRY_BUILD_FOR_NGFX
-    {
-      const IQueue::SSubmitInfo::SSemaphoreInfo acquired[] = { {
-        .semaphore = m_currentImageAcquire.semaphore,
-        .value = m_currentImageAcquire.acquireCount,
-        .stageMask = PIPELINE_STAGE_FLAGS::ALL_COMMANDS_BITS
-      } };
-      m_surface->present(m_currentImageAcquire.imageIndex, acquired);
-    }
+	{
+	  const IQueue::SSubmitInfo::SSemaphoreInfo acquired[] = { {
+		.semaphore = m_currentImageAcquire.semaphore,
+		.value = m_currentImageAcquire.acquireCount,
+		.stageMask = PIPELINE_STAGE_FLAGS::ALL_COMMANDS_BITS
+	  } };
+	  m_surface->present(m_currentImageAcquire.imageIndex, acquired);
+	}
 #endif
-    m_api->endCapture();
+	m_api->endCapture();
 
-    return true;
+	return true;
   }
+#endif // TEST_ASSET_CONV_AS
 
 
   smart_refctd_ptr<IWindow> m_window;
@@ -1798,37 +1806,37 @@ class RaytracingPipelineApp final : public examples::SimpleWindowedApplication,
 
   struct CameraSetting
   {
-    float fov = 60.f;
-    float zNear = 0.1f;
-    float zFar = 10000.f;
-    float moveSpeed = 1.f;
-    float rotateSpeed = 1.f;
-    float viewWidth = 10.f;
-    float camYAngle = 165.f / 180.f * 3.14159f;
-    float camXAngle = 32.f / 180.f * 3.14159f;
-    
+	float fov = 60.f;
+	float zNear = 0.1f;
+	float zFar = 10000.f;
+	float moveSpeed = 1.f;
+	float rotateSpeed = 1.f;
+	float viewWidth = 10.f;
+	float camYAngle = 165.f / 180.f * 3.14159f;
+	float camXAngle = 32.f / 180.f * 3.14159f;
+	
   } m_cameraSetting;
   Camera m_camera = Camera(core::vectorSIMDf(0, 0, 0), core::vectorSIMDf(0, 0, 0), core::matrix4SIMD());
 
   Light m_light = {
-    .direction = {-1.0f, -1.0f, -0.4f},
-    .position = {10.0f, 15.0f, 8.0f},
-    .outerCutoff = 0.866025404f, // {cos(radians(30.0f))}, 
-    .type = ELT_DIRECTIONAL
+	.direction = {-1.0f, -1.0f, -0.4f},
+	.position = {10.0f, 15.0f, 8.0f},
+	.outerCutoff = 0.866025404f, // {cos(radians(30.0f))}, 
+	.type = ELT_DIRECTIONAL
   };
 
   video::CDumbPresentationOracle m_oracle;
 
   struct C_UI
   {
-    nbl::core::smart_refctd_ptr<nbl::ext::imgui::UI> manager;
+	nbl::core::smart_refctd_ptr<nbl::ext::imgui::UI> manager;
 
-    struct
-    {
-      core::smart_refctd_ptr<video::IGPUSampler> gui, scene;
-    } samplers;
+	struct
+	{
+	  core::smart_refctd_ptr<video::IGPUSampler> gui, scene;
+	} samplers;
 
-    core::smart_refctd_ptr<IGPUDescriptorSet> descriptorSet;
+	core::smart_refctd_ptr<IGPUDescriptorSet> descriptorSet;
   } m_ui;
   core::smart_refctd_ptr<IDescriptorPool> m_guiDescriptorSetPool;
 

From b498e9cf6a5b2c0badf0ccc528c4306f055a015a Mon Sep 17 00:00:00 2001
From: keptsecret <sorchon@gmail.com>
Date: Fri, 16 May 2025 17:13:38 +0700
Subject: [PATCH 261/529] triangles and aabbs into icpubuffers

---
 71_RayTracingPipeline/main.cpp | 3409 +++++++++++++++++---------------
 1 file changed, 1780 insertions(+), 1629 deletions(-)

diff --git a/71_RayTracingPipeline/main.cpp b/71_RayTracingPipeline/main.cpp
index ad13b4a5d..528b2c314 100644
--- a/71_RayTracingPipeline/main.cpp
+++ b/71_RayTracingPipeline/main.cpp
@@ -10,778 +10,786 @@
 
 class RaytracingPipelineApp final : public examples::SimpleWindowedApplication, public application_templates::MonoAssetManagerAndBuiltinResourceApplication
 {
-  using device_base_t = examples::SimpleWindowedApplication;
-  using asset_base_t = application_templates::MonoAssetManagerAndBuiltinResourceApplication;
-  using clock_t = std::chrono::steady_clock;
-
-  constexpr static inline uint32_t WIN_W = 1280, WIN_H = 720;
-  constexpr static inline uint32_t MaxFramesInFlight = 3u;
-  constexpr static inline uint8_t MaxUITextureCount = 1u;
-  constexpr static inline uint32_t NumberOfProceduralGeometries = 5;
-
-  static constexpr const char* s_lightTypeNames[E_LIGHT_TYPE::ELT_COUNT] = {
-	"Directional",
-	"Point",
-	"Spot"
-  };
-
-  constexpr static inline clock_t::duration DisplayImageDuration = std::chrono::milliseconds(900);
-
-  struct ShaderBindingTable
-  {
-	SBufferRange<IGPUBuffer> raygenGroupRange;
-	SBufferRange<IGPUBuffer> hitGroupsRange;
-	uint32_t hitGroupsStride;
-	SBufferRange<IGPUBuffer> missGroupsRange;
-	uint32_t missGroupsStride;
-	SBufferRange<IGPUBuffer> callableGroupsRange;
-	uint32_t callableGroupsStride;
-  };
+	using device_base_t = examples::SimpleWindowedApplication;
+	using asset_base_t = application_templates::MonoAssetManagerAndBuiltinResourceApplication;
+	using clock_t = std::chrono::steady_clock;
+
+	constexpr static inline uint32_t WIN_W = 1280, WIN_H = 720;
+	constexpr static inline uint32_t MaxFramesInFlight = 3u;
+	constexpr static inline uint8_t MaxUITextureCount = 1u;
+	constexpr static inline uint32_t NumberOfProceduralGeometries = 5;
+
+	static constexpr const char* s_lightTypeNames[E_LIGHT_TYPE::ELT_COUNT] = {
+	  "Directional",
+	  "Point",
+	  "Spot"
+	};
+
+	constexpr static inline clock_t::duration DisplayImageDuration = std::chrono::milliseconds(900);
+
+	struct ShaderBindingTable
+	{
+		SBufferRange<IGPUBuffer> raygenGroupRange;
+		SBufferRange<IGPUBuffer> hitGroupsRange;
+		uint32_t hitGroupsStride;
+		SBufferRange<IGPUBuffer> missGroupsRange;
+		uint32_t missGroupsStride;
+		SBufferRange<IGPUBuffer> callableGroupsRange;
+		uint32_t callableGroupsStride;
+	};
 
 
 public:
-  inline RaytracingPipelineApp(const path& _localInputCWD, const path& _localOutputCWD, const path& _sharedInputCWD, const path& _sharedOutputCWD)
-	: IApplicationFramework(_localInputCWD, _localOutputCWD, _sharedInputCWD, _sharedOutputCWD)
-  {
-  }
-
-  inline SPhysicalDeviceFeatures getRequiredDeviceFeatures() const override
-  {
-	auto retval = device_base_t::getRequiredDeviceFeatures();
-	retval.rayTracingPipeline = true;
-	retval.accelerationStructure = true;
-	retval.rayQuery = true;
-	return retval;
-  }
-
-  inline SPhysicalDeviceFeatures getPreferredDeviceFeatures() const override
-  {
-	auto retval = device_base_t::getPreferredDeviceFeatures();
-	retval.accelerationStructureHostCommands = true;
-	return retval;
-  }
-
-  inline core::vector<video::SPhysicalDeviceFilter::SurfaceCompatibility> getSurfaces() const override
-  {
-	if (!m_surface)
+	inline RaytracingPipelineApp(const path& _localInputCWD, const path& _localOutputCWD, const path& _sharedInputCWD, const path& _sharedOutputCWD)
+		: IApplicationFramework(_localInputCWD, _localOutputCWD, _sharedInputCWD, _sharedOutputCWD)
 	{
-	  {
-		auto windowCallback = core::make_smart_refctd_ptr<CEventCallback>(smart_refctd_ptr(m_inputSystem), smart_refctd_ptr(m_logger));
-		IWindow::SCreationParams params = {};
-		params.callback = core::make_smart_refctd_ptr<ISimpleManagedSurface::ICallback>();
-		params.width = WIN_W;
-		params.height = WIN_H;
-		params.x = 32;
-		params.y = 32;
-		params.flags = ui::IWindow::ECF_HIDDEN | IWindow::ECF_BORDERLESS | IWindow::ECF_RESIZABLE;
-		params.windowCaption = "RaytracingPipelineApp";
-		params.callback = windowCallback;
-		const_cast<std::remove_const_t<decltype(m_window)>&>(m_window) = m_winMgr->createWindow(std::move(params));
-	  }
-
-	  auto surface = CSurfaceVulkanWin32::create(smart_refctd_ptr(m_api), smart_refctd_ptr_static_cast<IWindowWin32>(m_window));
-	  const_cast<std::remove_const_t<decltype(m_surface)>&>(m_surface) = CSimpleResizeSurface<ISimpleManagedSurface::ISwapchainResources>::create(std::move(surface));
 	}
 
-	if (m_surface)
-	  return { {m_surface->getSurface()/*,EQF_NONE*/} };
+	inline SPhysicalDeviceFeatures getRequiredDeviceFeatures() const override
+	{
+		auto retval = device_base_t::getRequiredDeviceFeatures();
+		retval.rayTracingPipeline = true;
+		retval.accelerationStructure = true;
+		retval.rayQuery = true;
+		return retval;
+	}
 
-	return {};
-  }
+	inline SPhysicalDeviceFeatures getPreferredDeviceFeatures() const override
+	{
+		auto retval = device_base_t::getPreferredDeviceFeatures();
+		retval.accelerationStructureHostCommands = true;
+		return retval;
+	}
 
-  // so that we can use the same queue for asset converter and rendering
-  inline core::vector<queue_req_t> getQueueRequirements() const override
-  {
-	auto reqs = device_base_t::getQueueRequirements();
-	reqs.front().requiredFlags |= IQueue::FAMILY_FLAGS::TRANSFER_BIT;
-	return reqs;
-  }
+	inline core::vector<video::SPhysicalDeviceFilter::SurfaceCompatibility> getSurfaces() const override
+	{
+		if (!m_surface)
+		{
+			{
+				auto windowCallback = core::make_smart_refctd_ptr<CEventCallback>(smart_refctd_ptr(m_inputSystem), smart_refctd_ptr(m_logger));
+				IWindow::SCreationParams params = {};
+				params.callback = core::make_smart_refctd_ptr<ISimpleManagedSurface::ICallback>();
+				params.width = WIN_W;
+				params.height = WIN_H;
+				params.x = 32;
+				params.y = 32;
+				params.flags = ui::IWindow::ECF_HIDDEN | IWindow::ECF_BORDERLESS | IWindow::ECF_RESIZABLE;
+				params.windowCaption = "RaytracingPipelineApp";
+				params.callback = windowCallback;
+				const_cast<std::remove_const_t<decltype(m_window)>&>(m_window) = m_winMgr->createWindow(std::move(params));
+			}
 
-  inline bool onAppInitialized(smart_refctd_ptr<ISystem>&& system) override
-  {
-	m_inputSystem = make_smart_refctd_ptr<InputSystem>(logger_opt_smart_ptr(smart_refctd_ptr(m_logger)));
+			auto surface = CSurfaceVulkanWin32::create(smart_refctd_ptr(m_api), smart_refctd_ptr_static_cast<IWindowWin32>(m_window));
+			const_cast<std::remove_const_t<decltype(m_surface)>&>(m_surface) = CSimpleResizeSurface<ISimpleManagedSurface::ISwapchainResources>::create(std::move(surface));
+		}
 
-	if (!device_base_t::onAppInitialized(smart_refctd_ptr(system)))
-	  return false;
+		if (m_surface)
+			return { {m_surface->getSurface()/*,EQF_NONE*/} };
 
-	if (!asset_base_t::onAppInitialized(smart_refctd_ptr(system)))
-	  return false;
+		return {};
+	}
 
-	smart_refctd_ptr<IShaderCompiler::CCache> shaderReadCache = nullptr;
-	smart_refctd_ptr<IShaderCompiler::CCache> shaderWriteCache = core::make_smart_refctd_ptr<IShaderCompiler::CCache>();
-	auto shaderCachePath = localOutputCWD / "main_pipeline_shader_cache.bin";
+	// so that we can use the same queue for asset converter and rendering
+	inline core::vector<queue_req_t> getQueueRequirements() const override
+	{
+		auto reqs = device_base_t::getQueueRequirements();
+		reqs.front().requiredFlags |= IQueue::FAMILY_FLAGS::TRANSFER_BIT;
+		return reqs;
+	}
 
+	inline bool onAppInitialized(smart_refctd_ptr<ISystem>&& system) override
 	{
-		core::smart_refctd_ptr<system::IFile> shaderReadCacheFile;
+		m_inputSystem = make_smart_refctd_ptr<InputSystem>(logger_opt_smart_ptr(smart_refctd_ptr(m_logger)));
+
+		if (!device_base_t::onAppInitialized(smart_refctd_ptr(system)))
+			return false;
+
+		if (!asset_base_t::onAppInitialized(smart_refctd_ptr(system)))
+			return false;
+
+		smart_refctd_ptr<IShaderCompiler::CCache> shaderReadCache = nullptr;
+		smart_refctd_ptr<IShaderCompiler::CCache> shaderWriteCache = core::make_smart_refctd_ptr<IShaderCompiler::CCache>();
+		auto shaderCachePath = localOutputCWD / "main_pipeline_shader_cache.bin";
+
+		{
+			core::smart_refctd_ptr<system::IFile> shaderReadCacheFile;
+			{
+				system::ISystem::future_t<core::smart_refctd_ptr<system::IFile>> future;
+				m_system->createFile(future, shaderCachePath.c_str(), system::IFile::ECF_READ);
+				if (future.wait())
+				{
+					future.acquire().move_into(shaderReadCacheFile);
+					if (shaderReadCacheFile)
+					{
+						const size_t size = shaderReadCacheFile->getSize();
+						if (size > 0ull)
+						{
+							std::vector<uint8_t> contents(size);
+							system::IFile::success_t succ;
+							shaderReadCacheFile->read(succ, contents.data(), 0, size);
+							if (succ)
+								shaderReadCache = IShaderCompiler::CCache::deserialize(contents);
+						}
+					}
+				}
+				else
+					m_logger->log("Failed Openning Shader Cache File.", ILogger::ELL_ERROR);
+			}
+
+		}
+
+		// Load Custom Shader
+		auto loadCompileAndCreateShader = [&](const std::string& relPath) -> smart_refctd_ptr<IGPUShader>
+			{
+				IAssetLoader::SAssetLoadParams lp = {};
+				lp.logger = m_logger.get();
+				lp.workingDirectory = ""; // virtual root
+				auto assetBundle = m_assetMgr->getAsset(relPath, lp);
+				const auto assets = assetBundle.getContents();
+				if (assets.empty())
+					return nullptr;
+
+				// lets go straight from ICPUSpecializedShader to IGPUSpecializedShader
+				auto sourceRaw = IAsset::castDown<ICPUShader>(assets[0]);
+				if (!sourceRaw)
+					return nullptr;
+
+				return m_device->createShader({ sourceRaw.get(), nullptr, shaderReadCache.get(), shaderWriteCache.get() });
+			};
+
+		// load shaders
+		const auto raygenShader = loadCompileAndCreateShader("app_resources/raytrace.rgen.hlsl");
+		const auto closestHitShader = loadCompileAndCreateShader("app_resources/raytrace.rchit.hlsl");
+		const auto proceduralClosestHitShader = loadCompileAndCreateShader("app_resources/raytrace_procedural.rchit.hlsl");
+		const auto intersectionHitShader = loadCompileAndCreateShader("app_resources/raytrace.rint.hlsl");
+		const auto anyHitShaderColorPayload = loadCompileAndCreateShader("app_resources/raytrace.rahit.hlsl");
+		const auto anyHitShaderShadowPayload = loadCompileAndCreateShader("app_resources/raytrace_shadow.rahit.hlsl");
+		const auto missShader = loadCompileAndCreateShader("app_resources/raytrace.rmiss.hlsl");
+		const auto missShadowShader = loadCompileAndCreateShader("app_resources/raytrace_shadow.rmiss.hlsl");
+		const auto directionalLightCallShader = loadCompileAndCreateShader("app_resources/light_directional.rcall.hlsl");
+		const auto pointLightCallShader = loadCompileAndCreateShader("app_resources/light_point.rcall.hlsl");
+		const auto spotLightCallShader = loadCompileAndCreateShader("app_resources/light_spot.rcall.hlsl");
+		const auto fragmentShader = loadCompileAndCreateShader("app_resources/present.frag.hlsl");
+
+		core::smart_refctd_ptr<system::IFile> shaderWriteCacheFile;
 		{
 			system::ISystem::future_t<core::smart_refctd_ptr<system::IFile>> future;
-			m_system->createFile(future, shaderCachePath.c_str(), system::IFile::ECF_READ);
+			m_system->deleteFile(shaderCachePath); // temp solution instead of trimming, to make sure we won't have corrupted json
+			m_system->createFile(future, shaderCachePath.c_str(), system::IFile::ECF_WRITE);
 			if (future.wait())
 			{
-				future.acquire().move_into(shaderReadCacheFile);
-				if (shaderReadCacheFile)
+				future.acquire().move_into(shaderWriteCacheFile);
+				if (shaderWriteCacheFile)
 				{
-					const size_t size = shaderReadCacheFile->getSize();
-					if (size > 0ull)
+					auto serializedCache = shaderWriteCache->serialize();
+					if (shaderWriteCacheFile)
 					{
-						std::vector<uint8_t> contents(size);
 						system::IFile::success_t succ;
-						shaderReadCacheFile->read(succ, contents.data(), 0, size);
-						if (succ)
-							shaderReadCache = IShaderCompiler::CCache::deserialize(contents);
+						shaderWriteCacheFile->write(succ, serializedCache->getPointer(), 0, serializedCache->getSize());
+						if (!succ)
+							m_logger->log("Failed Writing To Shader Cache File.", ILogger::ELL_ERROR);
 					}
 				}
+				else
+					m_logger->log("Failed Creating Shader Cache File.", ILogger::ELL_ERROR);
 			}
 			else
-				m_logger->log("Failed Openning Shader Cache File.", ILogger::ELL_ERROR);
+				m_logger->log("Failed Creating Shader Cache File.", ILogger::ELL_ERROR);
 		}
 
-	}
+		m_semaphore = m_device->createSemaphore(m_realFrameIx);
+		if (!m_semaphore)
+			return logFail("Failed to Create a Semaphore!");
 
-	// Load Custom Shader
-	auto loadCompileAndCreateShader = [&](const std::string& relPath) -> smart_refctd_ptr<IGPUShader>
-		{
-			IAssetLoader::SAssetLoadParams lp = {};
-			lp.logger = m_logger.get();
-			lp.workingDirectory = ""; // virtual root
-			auto assetBundle = m_assetMgr->getAsset(relPath, lp);
-			const auto assets = assetBundle.getContents();
-			if (assets.empty())
-				return nullptr;
-
-			// lets go straight from ICPUSpecializedShader to IGPUSpecializedShader
-			auto sourceRaw = IAsset::castDown<ICPUShader>(assets[0]);
-			if (!sourceRaw)
-				return nullptr;
-
-			return m_device->createShader({ sourceRaw.get(), nullptr, shaderReadCache.get(), shaderWriteCache.get() });
-		};
+		auto gQueue = getGraphicsQueue();
 
-	// load shaders
-	const auto raygenShader = loadCompileAndCreateShader("app_resources/raytrace.rgen.hlsl");
-	const auto closestHitShader = loadCompileAndCreateShader("app_resources/raytrace.rchit.hlsl");
-	const auto proceduralClosestHitShader = loadCompileAndCreateShader("app_resources/raytrace_procedural.rchit.hlsl");
-	const auto intersectionHitShader = loadCompileAndCreateShader("app_resources/raytrace.rint.hlsl");
-	const auto anyHitShaderColorPayload = loadCompileAndCreateShader("app_resources/raytrace.rahit.hlsl");
-	const auto anyHitShaderShadowPayload = loadCompileAndCreateShader("app_resources/raytrace_shadow.rahit.hlsl");
-	const auto missShader = loadCompileAndCreateShader("app_resources/raytrace.rmiss.hlsl");
-	const auto missShadowShader = loadCompileAndCreateShader("app_resources/raytrace_shadow.rmiss.hlsl");
-	const auto directionalLightCallShader = loadCompileAndCreateShader("app_resources/light_directional.rcall.hlsl");
-	const auto pointLightCallShader = loadCompileAndCreateShader("app_resources/light_point.rcall.hlsl");
-	const auto spotLightCallShader = loadCompileAndCreateShader("app_resources/light_spot.rcall.hlsl");
-	const auto fragmentShader = loadCompileAndCreateShader("app_resources/present.frag.hlsl");
-
-	core::smart_refctd_ptr<system::IFile> shaderWriteCacheFile;
-	{
-		system::ISystem::future_t<core::smart_refctd_ptr<system::IFile>> future;
-		m_system->deleteFile(shaderCachePath); // temp solution instead of trimming, to make sure we won't have corrupted json
-		m_system->createFile(future, shaderCachePath.c_str(), system::IFile::ECF_WRITE);
-		if (future.wait())
+		// Create renderpass and init surface
+		nbl::video::IGPURenderpass* renderpass;
 		{
-			future.acquire().move_into(shaderWriteCacheFile);
-			if (shaderWriteCacheFile)
+			ISwapchain::SCreationParams swapchainParams = { .surface = smart_refctd_ptr<ISurface>(m_surface->getSurface()) };
+			if (!swapchainParams.deduceFormat(m_physicalDevice))
+				return logFail("Could not choose a Surface Format for the Swapchain!");
+
+			const static IGPURenderpass::SCreationParams::SSubpassDependency dependencies[] =
 			{
-				auto serializedCache = shaderWriteCache->serialize();
-				if (shaderWriteCacheFile)
+			  {
+				.srcSubpass = IGPURenderpass::SCreationParams::SSubpassDependency::External,
+				.dstSubpass = 0,
+				.memoryBarrier =
 				{
-					system::IFile::success_t succ;
-					shaderWriteCacheFile->write(succ, serializedCache->getPointer(), 0, serializedCache->getSize());
-					if (!succ)
-						m_logger->log("Failed Writing To Shader Cache File.", ILogger::ELL_ERROR);
+				  .srcStageMask = asset::PIPELINE_STAGE_FLAGS::COPY_BIT,
+				  .srcAccessMask = asset::ACCESS_FLAGS::TRANSFER_WRITE_BIT,
+				  .dstStageMask = asset::PIPELINE_STAGE_FLAGS::COLOR_ATTACHMENT_OUTPUT_BIT,
+				  .dstAccessMask = asset::ACCESS_FLAGS::COLOR_ATTACHMENT_WRITE_BIT
 				}
-			}
-			else
-				m_logger->log("Failed Creating Shader Cache File.", ILogger::ELL_ERROR);
-		}
-		else
-			m_logger->log("Failed Creating Shader Cache File.", ILogger::ELL_ERROR);
-	}
+			  },
+			  {
+				.srcSubpass = 0,
+				.dstSubpass = IGPURenderpass::SCreationParams::SSubpassDependency::External,
+				.memoryBarrier =
+				{
+				  .srcStageMask = asset::PIPELINE_STAGE_FLAGS::COLOR_ATTACHMENT_OUTPUT_BIT,
+				  .srcAccessMask = asset::ACCESS_FLAGS::COLOR_ATTACHMENT_WRITE_BIT
+				}
+			  },
+			  IGPURenderpass::SCreationParams::DependenciesEnd
+			};
 
-	m_semaphore = m_device->createSemaphore(m_realFrameIx);
-	if (!m_semaphore)
-	  return logFail("Failed to Create a Semaphore!");
+			auto scResources = std::make_unique<CDefaultSwapchainFramebuffers>(m_device.get(), swapchainParams.surfaceFormat.format, dependencies);
+			renderpass = scResources->getRenderpass();
 
-	auto gQueue = getGraphicsQueue();
+			if (!renderpass)
+				return logFail("Failed to create Renderpass!");
 
-	// Create renderpass and init surface
-	nbl::video::IGPURenderpass* renderpass;
-	{
-	  ISwapchain::SCreationParams swapchainParams = { .surface = smart_refctd_ptr<ISurface>(m_surface->getSurface()) };
-	  if (!swapchainParams.deduceFormat(m_physicalDevice))
-		return logFail("Could not choose a Surface Format for the Swapchain!");
+			if (!m_surface || !m_surface->init(gQueue, std::move(scResources), swapchainParams.sharedParams))
+				return logFail("Could not create Window & Surface or initialize the Surface!");
+		}
 
-	  const static IGPURenderpass::SCreationParams::SSubpassDependency dependencies[] =
-	  {
-		{
-		  .srcSubpass = IGPURenderpass::SCreationParams::SSubpassDependency::External,
-		  .dstSubpass = 0,
-		  .memoryBarrier =
-		  {
-			.srcStageMask = asset::PIPELINE_STAGE_FLAGS::COPY_BIT,
-			.srcAccessMask = asset::ACCESS_FLAGS::TRANSFER_WRITE_BIT,
-			.dstStageMask = asset::PIPELINE_STAGE_FLAGS::COLOR_ATTACHMENT_OUTPUT_BIT,
-			.dstAccessMask = asset::ACCESS_FLAGS::COLOR_ATTACHMENT_WRITE_BIT
-		  }
-		},
+		auto pool = m_device->createCommandPool(gQueue->getFamilyIndex(), IGPUCommandPool::CREATE_FLAGS::RESET_COMMAND_BUFFER_BIT);
+
+		m_converter = CAssetConverter::create({ .device = m_device.get(), .optimizer = {} });
+
+		for (auto i = 0u; i < MaxFramesInFlight; i++)
 		{
-		  .srcSubpass = 0,
-		  .dstSubpass = IGPURenderpass::SCreationParams::SSubpassDependency::External,
-		  .memoryBarrier =
-		  {
-			.srcStageMask = asset::PIPELINE_STAGE_FLAGS::COLOR_ATTACHMENT_OUTPUT_BIT,
-			.srcAccessMask = asset::ACCESS_FLAGS::COLOR_ATTACHMENT_WRITE_BIT
-		  }
-		},
-		IGPURenderpass::SCreationParams::DependenciesEnd
-	  };
-
-	  auto scResources = std::make_unique<CDefaultSwapchainFramebuffers>(m_device.get(), swapchainParams.surfaceFormat.format, dependencies);
-	  renderpass = scResources->getRenderpass();
-
-	  if (!renderpass)
-		return logFail("Failed to create Renderpass!");
-
-	  if (!m_surface || !m_surface->init(gQueue, std::move(scResources), swapchainParams.sharedParams))
-		return logFail("Could not create Window & Surface or initialize the Surface!");
-	}
+			if (!pool)
+				return logFail("Couldn't create Command Pool!");
+			if (!pool->createCommandBuffers(IGPUCommandPool::BUFFER_LEVEL::PRIMARY, { m_cmdBufs.data() + i, 1 }))
+				return logFail("Couldn't create Command Buffer!");
+		}
 
-	auto pool = m_device->createCommandPool(gQueue->getFamilyIndex(), IGPUCommandPool::CREATE_FLAGS::RESET_COMMAND_BUFFER_BIT);
+		m_winMgr->setWindowSize(m_window.get(), WIN_W, WIN_H);
+		m_surface->recreateSwapchain();
 
-	m_converter = CAssetConverter::create({ .device = m_device.get(), .optimizer = {} });
 
-	for (auto i = 0u; i < MaxFramesInFlight; i++)
-	{
-	  if (!pool)
-		return logFail("Couldn't create Command Pool!");
-	  if (!pool->createCommandBuffers(IGPUCommandPool::BUFFER_LEVEL::PRIMARY, { m_cmdBufs.data() + i, 1 }))
-		return logFail("Couldn't create Command Buffer!");
-	}
+		// create output images
+		m_hdrImage = m_device->createImage({
+			{
+			  .type = IGPUImage::ET_2D,
+			  .samples = ICPUImage::ESCF_1_BIT,
+			  .format = EF_R16G16B16A16_SFLOAT,
+			  .extent = {WIN_W, WIN_H, 1},
+			  .mipLevels = 1,
+			  .arrayLayers = 1,
+			  .flags = IImage::ECF_NONE,
+			  .usage = bitflag(IImage::EUF_STORAGE_BIT) | IImage::EUF_TRANSFER_SRC_BIT | IImage::EUF_SAMPLED_BIT
+			}
+			});
+
+		if (!m_hdrImage || !m_device->allocate(m_hdrImage->getMemoryReqs(), m_hdrImage.get()).isValid())
+			return logFail("Could not create HDR Image");
 
-	m_winMgr->setWindowSize(m_window.get(), WIN_W, WIN_H);
-	m_surface->recreateSwapchain();
+		m_hdrImageView = m_device->createImageView({
+		  .flags = IGPUImageView::ECF_NONE,
+		  .subUsages = IGPUImage::E_USAGE_FLAGS::EUF_STORAGE_BIT | IGPUImage::E_USAGE_FLAGS::EUF_SAMPLED_BIT,
+		  .image = m_hdrImage,
+		  .viewType = IGPUImageView::E_TYPE::ET_2D,
+		  .format = asset::EF_R16G16B16A16_SFLOAT
+			});
 
 
-	// create output images
-	m_hdrImage = m_device->createImage({
+
+		// ray trace pipeline and descriptor set layout setup
 		{
-		  .type = IGPUImage::ET_2D,
-		  .samples = ICPUImage::ESCF_1_BIT,
-		  .format = EF_R16G16B16A16_SFLOAT,
-		  .extent = {WIN_W, WIN_H, 1},
-		  .mipLevels = 1,
-		  .arrayLayers = 1,
-		  .flags = IImage::ECF_NONE,
-		  .usage = bitflag(IImage::EUF_STORAGE_BIT) | IImage::EUF_TRANSFER_SRC_BIT | IImage::EUF_SAMPLED_BIT
+			const IGPUDescriptorSetLayout::SBinding bindings[] = {
+			  {
+				.binding = 0,
+				.type = asset::IDescriptor::E_TYPE::ET_ACCELERATION_STRUCTURE,
+				.createFlags = IGPUDescriptorSetLayout::SBinding::E_CREATE_FLAGS::ECF_NONE,
+				.stageFlags = asset::IShader::E_SHADER_STAGE::ESS_RAYGEN,
+				.count = 1,
+			  },
+			  {
+				.binding = 1,
+				.type = asset::IDescriptor::E_TYPE::ET_STORAGE_IMAGE,
+				.createFlags = IGPUDescriptorSetLayout::SBinding::E_CREATE_FLAGS::ECF_NONE,
+				.stageFlags = asset::IShader::E_SHADER_STAGE::ESS_RAYGEN,
+				.count = 1,
+			  }
+			};
+			const auto descriptorSetLayout = m_device->createDescriptorSetLayout(bindings);
+
+			const std::array<IGPUDescriptorSetLayout*, ICPUPipelineLayout::DESCRIPTOR_SET_COUNT> dsLayoutPtrs = { descriptorSetLayout.get() };
+			m_rayTracingDsPool = m_device->createDescriptorPoolForDSLayouts(IDescriptorPool::ECF_UPDATE_AFTER_BIND_BIT, std::span(dsLayoutPtrs.begin(), dsLayoutPtrs.end()));
+			m_rayTracingDs = m_rayTracingDsPool->createDescriptorSet(descriptorSetLayout);
+
+			const SPushConstantRange pcRange = {
+			  .stageFlags = IShader::E_SHADER_STAGE::ESS_ALL_RAY_TRACING,
+			  .offset = 0u,
+			  .size = sizeof(SPushConstants),
+			};
+			const auto pipelineLayout = m_device->createPipelineLayout({ &pcRange, 1 }, smart_refctd_ptr(descriptorSetLayout), nullptr, nullptr, nullptr);
+
+			IGPURayTracingPipeline::SCreationParams params = {};
+
+			enum RtDemoShader
+			{
+				RTDS_RAYGEN,
+				RTDS_MISS,
+				RTDS_MISS_SHADOW,
+				RTDS_CLOSEST_HIT,
+				RTDS_SPHERE_CLOSEST_HIT,
+				RTDS_ANYHIT_PRIMARY,
+				RTDS_ANYHIT_SHADOW,
+				RTDS_INTERSECTION,
+				RTDS_DIRECTIONAL_CALL,
+				RTDS_POINT_CALL,
+				RTDS_SPOT_CALL,
+				RTDS_COUNT
+			};
+
+			IGPUShader::SSpecInfo shaders[RTDS_COUNT];
+			shaders[RTDS_RAYGEN] = { .shader = raygenShader.get() };
+			shaders[RTDS_MISS] = { .shader = missShader.get() };
+			shaders[RTDS_MISS_SHADOW] = { .shader = missShadowShader.get() };
+			shaders[RTDS_CLOSEST_HIT] = { .shader = closestHitShader.get() };
+			shaders[RTDS_SPHERE_CLOSEST_HIT] = { .shader = proceduralClosestHitShader.get() };
+			shaders[RTDS_ANYHIT_PRIMARY] = { .shader = anyHitShaderColorPayload.get() };
+			shaders[RTDS_ANYHIT_SHADOW] = { .shader = anyHitShaderShadowPayload.get() };
+			shaders[RTDS_INTERSECTION] = { .shader = intersectionHitShader.get() };
+			shaders[RTDS_DIRECTIONAL_CALL] = { .shader = directionalLightCallShader.get() };
+			shaders[RTDS_POINT_CALL] = { .shader = pointLightCallShader.get() };
+			shaders[RTDS_SPOT_CALL] = { .shader = spotLightCallShader.get() };
+
+			params.layout = pipelineLayout.get();
+			params.shaders = std::span(shaders);
+			using RayTracingFlags = IGPURayTracingPipeline::SCreationParams::FLAGS;
+			params.flags = core::bitflag(RayTracingFlags::NO_NULL_MISS_SHADERS) |
+				RayTracingFlags::NO_NULL_INTERSECTION_SHADERS |
+				RayTracingFlags::NO_NULL_ANY_HIT_SHADERS;
+
+			auto& shaderGroups = params.shaderGroups;
+
+			shaderGroups.raygen = { .index = RTDS_RAYGEN };
+
+			IRayTracingPipelineBase::SGeneralShaderGroup missGroups[EMT_COUNT];
+			missGroups[EMT_PRIMARY] = { .index = RTDS_MISS };
+			missGroups[EMT_OCCLUSION] = { .index = RTDS_MISS_SHADOW };
+			shaderGroups.misses = missGroups;
+
+			auto getHitGroupIndex = [](E_GEOM_TYPE geomType, E_RAY_TYPE rayType)
+				{
+					return geomType * ERT_COUNT + rayType;
+				};
+			IRayTracingPipelineBase::SHitShaderGroup hitGroups[E_RAY_TYPE::ERT_COUNT * E_GEOM_TYPE::EGT_COUNT];
+			hitGroups[getHitGroupIndex(EGT_TRIANGLES, ERT_PRIMARY)] = {
+			  .closestHit = RTDS_CLOSEST_HIT,
+			  .anyHit = RTDS_ANYHIT_PRIMARY,
+			};
+			hitGroups[getHitGroupIndex(EGT_TRIANGLES, ERT_OCCLUSION)] = {
+			  .closestHit = IGPURayTracingPipeline::SGeneralShaderGroup::Unused,
+			  .anyHit = RTDS_ANYHIT_SHADOW,
+			};
+			hitGroups[getHitGroupIndex(EGT_PROCEDURAL, ERT_PRIMARY)] = {
+			  .closestHit = RTDS_SPHERE_CLOSEST_HIT,
+			  .anyHit = RTDS_ANYHIT_PRIMARY,
+			  .intersection = RTDS_INTERSECTION,
+			};
+			hitGroups[getHitGroupIndex(EGT_PROCEDURAL, ERT_OCCLUSION)] = {
+			  .closestHit = IGPURayTracingPipeline::SGeneralShaderGroup::Unused,
+			  .anyHit = RTDS_ANYHIT_SHADOW,
+			  .intersection = RTDS_INTERSECTION,
+			};
+			shaderGroups.hits = hitGroups;
+
+			IRayTracingPipelineBase::SGeneralShaderGroup callableGroups[ELT_COUNT];
+			callableGroups[ELT_DIRECTIONAL] = { .index = RTDS_DIRECTIONAL_CALL };
+			callableGroups[ELT_POINT] = { .index = RTDS_POINT_CALL };
+			callableGroups[ELT_SPOT] = { .index = RTDS_SPOT_CALL };
+			shaderGroups.callables = callableGroups;
+
+			params.cached.maxRecursionDepth = 1;
+			params.cached.dynamicStackSize = true;
+
+			if (!m_device->createRayTracingPipelines(nullptr, { &params, 1 }, &m_rayTracingPipeline))
+				return logFail("Failed to create ray tracing pipeline");
+
+			calculateRayTracingStackSize(m_rayTracingPipeline);
+
+			if (!createShaderBindingTable(gQueue, m_rayTracingPipeline))
+				return logFail("Could not create shader binding table");
+
 		}
-	  });
 
-	if (!m_hdrImage || !m_device->allocate(m_hdrImage->getMemoryReqs(), m_hdrImage.get()).isValid())
-	  return logFail("Could not create HDR Image");
+		auto assetManager = make_smart_refctd_ptr<nbl::asset::IAssetManager>(smart_refctd_ptr(system));
+		auto* geometryCreator = assetManager->getGeometryCreator();
 
-	m_hdrImageView = m_device->createImageView({
-	  .flags = IGPUImageView::ECF_NONE,
-	  .subUsages = IGPUImage::E_USAGE_FLAGS::EUF_STORAGE_BIT | IGPUImage::E_USAGE_FLAGS::EUF_SAMPLED_BIT,
-	  .image = m_hdrImage,
-	  .viewType = IGPUImageView::E_TYPE::ET_2D,
-	  .format = asset::EF_R16G16B16A16_SFLOAT
-	});
+		if (!createIndirectBuffer(gQueue))
+			return logFail("Could not create indirect buffer");
 
+#ifdef TEST_ASSET_CONV_AS
+		if (!createAccelerationStructuresFromGeometry(getComputeQueue(), geometryCreator))
+			return logFail("Could not create acceleration structures from geometry creator");
+#else
+		// create geometry objects
+		if (!createGeometries(gQueue, geometryCreator))
+			return logFail("Could not create geometries from geometry creator");
 
+		if (!createAccelerationStructures(getComputeQueue()))
+			return logFail("Could not create acceleration structures");
+#endif // TEST_ASSET_CONV_AS
+
+		ISampler::SParams samplerParams = {
+		  .AnisotropicFilter = 0
+		};
+		auto defaultSampler = m_device->createSampler(samplerParams);
 
-	// ray trace pipeline and descriptor set layout setup
-	{
-	  const IGPUDescriptorSetLayout::SBinding bindings[] = {
-		{
-		  .binding = 0,
-		  .type = asset::IDescriptor::E_TYPE::ET_ACCELERATION_STRUCTURE,
-		  .createFlags = IGPUDescriptorSetLayout::SBinding::E_CREATE_FLAGS::ECF_NONE,
-		  .stageFlags = asset::IShader::E_SHADER_STAGE::ESS_RAYGEN,
-		  .count = 1,
-		},
 		{
-		  .binding = 1,
-		  .type = asset::IDescriptor::E_TYPE::ET_STORAGE_IMAGE,
-		  .createFlags = IGPUDescriptorSetLayout::SBinding::E_CREATE_FLAGS::ECF_NONE,
-		  .stageFlags = asset::IShader::E_SHADER_STAGE::ESS_RAYGEN,
-		  .count = 1,
+			const IGPUDescriptorSetLayout::SBinding bindings[] = {
+			  {
+				.binding = 0u,
+				.type = nbl::asset::IDescriptor::E_TYPE::ET_COMBINED_IMAGE_SAMPLER,
+				.createFlags = ICPUDescriptorSetLayout::SBinding::E_CREATE_FLAGS::ECF_NONE,
+				.stageFlags = IShader::E_SHADER_STAGE::ESS_FRAGMENT,
+				.count = 1u,
+				.immutableSamplers = &defaultSampler
+			  }
+			};
+			auto gpuPresentDescriptorSetLayout = m_device->createDescriptorSetLayout(bindings);
+			const video::IGPUDescriptorSetLayout* const layouts[] = { gpuPresentDescriptorSetLayout.get() };
+			const uint32_t setCounts[] = { 1u };
+			m_presentDsPool = m_device->createDescriptorPoolForDSLayouts(IDescriptorPool::E_CREATE_FLAGS::ECF_NONE, layouts, setCounts);
+			m_presentDs = m_presentDsPool->createDescriptorSet(gpuPresentDescriptorSetLayout);
+
+			auto scRes = static_cast<CDefaultSwapchainFramebuffers*>(m_surface->getSwapchainResources());
+			ext::FullScreenTriangle::ProtoPipeline fsTriProtoPPln(m_assetMgr.get(), m_device.get(), m_logger.get());
+			if (!fsTriProtoPPln)
+				return logFail("Failed to create Full Screen Triangle protopipeline or load its vertex shader!");
+
+			const IGPUShader::SSpecInfo fragSpec = {
+			  .entryPoint = "main",
+			  .shader = fragmentShader.get()
+			};
+
+			auto presentLayout = m_device->createPipelineLayout(
+				{},
+				core::smart_refctd_ptr(gpuPresentDescriptorSetLayout),
+				nullptr,
+				nullptr,
+				nullptr
+			);
+			m_presentPipeline = fsTriProtoPPln.createPipeline(fragSpec, presentLayout.get(), scRes->getRenderpass());
+			if (!m_presentPipeline)
+				return logFail("Could not create Graphics Pipeline!");
 		}
-	  };
-	  const auto descriptorSetLayout = m_device->createDescriptorSetLayout(bindings);
-
-	  const std::array<IGPUDescriptorSetLayout*, ICPUPipelineLayout::DESCRIPTOR_SET_COUNT> dsLayoutPtrs = { descriptorSetLayout.get() };
-	  m_rayTracingDsPool = m_device->createDescriptorPoolForDSLayouts(IDescriptorPool::ECF_UPDATE_AFTER_BIND_BIT, std::span(dsLayoutPtrs.begin(), dsLayoutPtrs.end()));
-	  m_rayTracingDs = m_rayTracingDsPool->createDescriptorSet(descriptorSetLayout);
-
-	  const SPushConstantRange pcRange = {
-		.stageFlags = IShader::E_SHADER_STAGE::ESS_ALL_RAY_TRACING,
-		.offset = 0u,
-		.size = sizeof(SPushConstants),
-	  };
-	  const auto pipelineLayout = m_device->createPipelineLayout({ &pcRange, 1 }, smart_refctd_ptr(descriptorSetLayout), nullptr, nullptr, nullptr);
-
-	  IGPURayTracingPipeline::SCreationParams params = {};
-
-	  enum RtDemoShader
-	  {
-		RTDS_RAYGEN,
-		RTDS_MISS,
-		RTDS_MISS_SHADOW,
-		RTDS_CLOSEST_HIT,
-		RTDS_SPHERE_CLOSEST_HIT,
-		RTDS_ANYHIT_PRIMARY,
-		RTDS_ANYHIT_SHADOW,
-		RTDS_INTERSECTION,
-		RTDS_DIRECTIONAL_CALL,
-		RTDS_POINT_CALL,
-		RTDS_SPOT_CALL,
-		RTDS_COUNT
-	  };
-
-	  IGPUShader::SSpecInfo shaders[RTDS_COUNT];
-	  shaders[RTDS_RAYGEN] = {.shader = raygenShader.get()};
-	  shaders[RTDS_MISS] = {.shader = missShader.get()};
-	  shaders[RTDS_MISS_SHADOW] = { .shader = missShadowShader.get() };
-	  shaders[RTDS_CLOSEST_HIT] = {.shader = closestHitShader.get()};
-	  shaders[RTDS_SPHERE_CLOSEST_HIT] = {.shader = proceduralClosestHitShader.get()};
-	  shaders[RTDS_ANYHIT_PRIMARY] = {.shader = anyHitShaderColorPayload.get()};
-	  shaders[RTDS_ANYHIT_SHADOW] = {.shader = anyHitShaderShadowPayload.get()};
-	  shaders[RTDS_INTERSECTION] = {.shader = intersectionHitShader.get() };
-	  shaders[RTDS_DIRECTIONAL_CALL] = {.shader = directionalLightCallShader.get()};
-	  shaders[RTDS_POINT_CALL] = {.shader = pointLightCallShader.get()};
-	  shaders[RTDS_SPOT_CALL] = {.shader = spotLightCallShader.get()};
-
-	  params.layout = pipelineLayout.get();
-	  params.shaders = std::span(shaders);
-	  using RayTracingFlags = IGPURayTracingPipeline::SCreationParams::FLAGS;
-	  params.flags = core::bitflag(RayTracingFlags::NO_NULL_MISS_SHADERS) |
-		RayTracingFlags::NO_NULL_INTERSECTION_SHADERS | 
-		RayTracingFlags::NO_NULL_ANY_HIT_SHADERS;
-
-	  auto& shaderGroups = params.shaderGroups;
-
-	  shaderGroups.raygen = { .index = RTDS_RAYGEN };
-
-	  IRayTracingPipelineBase::SGeneralShaderGroup missGroups[EMT_COUNT];
-	  missGroups[EMT_PRIMARY] = { .index = RTDS_MISS };
-	  missGroups[EMT_OCCLUSION] = { .index = RTDS_MISS_SHADOW };
-	  shaderGroups.misses = missGroups;
-
-	  auto getHitGroupIndex = [](E_GEOM_TYPE geomType, E_RAY_TYPE rayType)
-		{
-		  return geomType * ERT_COUNT + rayType;
+
+		// write descriptors
+		IGPUDescriptorSet::SDescriptorInfo infos[3];
+		infos[0].desc = m_gpuTlas;
+
+		infos[1].desc = m_hdrImageView;
+		if (!infos[1].desc)
+			return logFail("Failed to create image view");
+		infos[1].info.image.imageLayout = IImage::LAYOUT::GENERAL;
+
+		infos[2].desc = m_hdrImageView;
+		infos[2].info.image.imageLayout = IImage::LAYOUT::READ_ONLY_OPTIMAL;
+
+		IGPUDescriptorSet::SWriteDescriptorSet writes[] = {
+			{.dstSet = m_rayTracingDs.get(), .binding = 0, .arrayElement = 0, .count = 1, .info = &infos[0]},
+			{.dstSet = m_rayTracingDs.get(), .binding = 1, .arrayElement = 0, .count = 1, .info = &infos[1]},
+			{.dstSet = m_presentDs.get(), .binding = 0, .arrayElement = 0, .count = 1, .info = &infos[2] },
 		};
-	  IRayTracingPipelineBase::SHitShaderGroup hitGroups[E_RAY_TYPE::ERT_COUNT * E_GEOM_TYPE::EGT_COUNT];
-	  hitGroups[getHitGroupIndex(EGT_TRIANGLES, ERT_PRIMARY)] = {
-		.closestHit = RTDS_CLOSEST_HIT,
-		.anyHit = RTDS_ANYHIT_PRIMARY,
-	  };
-	  hitGroups[getHitGroupIndex(EGT_TRIANGLES, ERT_OCCLUSION)] = {
-		.closestHit = IGPURayTracingPipeline::SGeneralShaderGroup::Unused,
-		.anyHit = RTDS_ANYHIT_SHADOW,
-	  };
-	  hitGroups[getHitGroupIndex(EGT_PROCEDURAL, ERT_PRIMARY)] = {
-		.closestHit = RTDS_SPHERE_CLOSEST_HIT,
-		.anyHit = RTDS_ANYHIT_PRIMARY,
-		.intersection = RTDS_INTERSECTION,
-	  };
-	  hitGroups[getHitGroupIndex(EGT_PROCEDURAL, ERT_OCCLUSION)] = {
-		.closestHit = IGPURayTracingPipeline::SGeneralShaderGroup::Unused,
-		.anyHit = RTDS_ANYHIT_SHADOW,
-		.intersection = RTDS_INTERSECTION,
-	  };
-	  shaderGroups.hits = hitGroups;
-
-	  IRayTracingPipelineBase::SGeneralShaderGroup callableGroups[ELT_COUNT];
-	  callableGroups[ELT_DIRECTIONAL] = { .index = RTDS_DIRECTIONAL_CALL };
-	  callableGroups[ELT_POINT] = { .index = RTDS_POINT_CALL };
-	  callableGroups[ELT_SPOT] = { .index = RTDS_SPOT_CALL };
-	  shaderGroups.callables = callableGroups;
-
-	  params.cached.maxRecursionDepth = 1;
-	  params.cached.dynamicStackSize = true;
-
-	  if (!m_device->createRayTracingPipelines(nullptr, { &params, 1 }, &m_rayTracingPipeline))
-		return logFail("Failed to create ray tracing pipeline");
-
-	  calculateRayTracingStackSize(m_rayTracingPipeline);
-	  
-	  if (!createShaderBindingTable(gQueue, m_rayTracingPipeline))
-		return logFail("Could not create shader binding table");
+		m_device->updateDescriptorSets(std::span(writes), {});
 
-	}
+		// gui descriptor setup
+		{
+			using binding_flags_t = IGPUDescriptorSetLayout::SBinding::E_CREATE_FLAGS;
+			{
+				IGPUSampler::SParams params;
+				params.AnisotropicFilter = 1u;
+				params.TextureWrapU = ETC_REPEAT;
+				params.TextureWrapV = ETC_REPEAT;
+				params.TextureWrapW = ETC_REPEAT;
+
+				m_ui.samplers.gui = m_device->createSampler(params);
+				m_ui.samplers.gui->setObjectDebugName("Nabla IMGUI UI Sampler");
+			}
 
-	auto assetManager = make_smart_refctd_ptr<nbl::asset::IAssetManager>(smart_refctd_ptr(system));
-	auto* geometryCreator = assetManager->getGeometryCreator();
+			std::array<core::smart_refctd_ptr<IGPUSampler>, 69u> immutableSamplers;
+			for (auto& it : immutableSamplers)
+				it = smart_refctd_ptr(m_ui.samplers.scene);
 
-	if (!createIndirectBuffer(gQueue))
-	  return logFail("Could not create indirect buffer");
+			immutableSamplers[nbl::ext::imgui::UI::FontAtlasTexId] = smart_refctd_ptr(m_ui.samplers.gui);
 
-	// create geometry objects
-	if (!createGeometries(gQueue, geometryCreator))
-	  return logFail("Could not create geometries from geometry creator");
+			nbl::ext::imgui::UI::SCreationParameters params;
 
-	if (!createAccelerationStructures(getComputeQueue()))
-	  return logFail("Could not create acceleration structures");
+			params.resources.texturesInfo = { .setIx = 0u, .bindingIx = 0u };
+			params.resources.samplersInfo = { .setIx = 0u, .bindingIx = 1u };
+			params.assetManager = m_assetMgr;
+			params.pipelineCache = nullptr;
+			params.pipelineLayout = nbl::ext::imgui::UI::createDefaultPipelineLayout(m_utils->getLogicalDevice(), params.resources.texturesInfo, params.resources.samplersInfo, MaxUITextureCount);
+			params.renderpass = smart_refctd_ptr<IGPURenderpass>(renderpass);
+			params.streamingBuffer = nullptr;
+			params.subpassIx = 0u;
+			params.transfer = getTransferUpQueue();
+			params.utilities = m_utils;
+			{
+				m_ui.manager = ext::imgui::UI::create(std::move(params));
 
-	ISampler::SParams samplerParams = {
-	  .AnisotropicFilter = 0
-	};
-	auto defaultSampler = m_device->createSampler(samplerParams);
+				// note that we use default layout provided by our extension, but you are free to create your own by filling nbl::ext::imgui::UI::S_CREATION_PARAMETERS::resources
+				const auto* descriptorSetLayout = m_ui.manager->getPipeline()->getLayout()->getDescriptorSetLayout(0u);
+				const auto& params = m_ui.manager->getCreationParameters();
 
-	{
-	  const IGPUDescriptorSetLayout::SBinding bindings[] = {
-		{
-		  .binding = 0u,
-		  .type = nbl::asset::IDescriptor::E_TYPE::ET_COMBINED_IMAGE_SAMPLER,
-		  .createFlags = ICPUDescriptorSetLayout::SBinding::E_CREATE_FLAGS::ECF_NONE,
-		  .stageFlags = IShader::E_SHADER_STAGE::ESS_FRAGMENT,
-		  .count = 1u,
-		  .immutableSamplers = &defaultSampler
-		}
-	  };
-	  auto gpuPresentDescriptorSetLayout = m_device->createDescriptorSetLayout(bindings);
-	  const video::IGPUDescriptorSetLayout* const layouts[] = { gpuPresentDescriptorSetLayout.get() };
-	  const uint32_t setCounts[] = { 1u };
-	  m_presentDsPool = m_device->createDescriptorPoolForDSLayouts(IDescriptorPool::E_CREATE_FLAGS::ECF_NONE, layouts, setCounts);
-	  m_presentDs = m_presentDsPool->createDescriptorSet(gpuPresentDescriptorSetLayout);
-
-	  auto scRes = static_cast<CDefaultSwapchainFramebuffers*>(m_surface->getSwapchainResources());
-	  ext::FullScreenTriangle::ProtoPipeline fsTriProtoPPln(m_assetMgr.get(), m_device.get(), m_logger.get());
-	  if (!fsTriProtoPPln)
-		return logFail("Failed to create Full Screen Triangle protopipeline or load its vertex shader!");
-
-	  const IGPUShader::SSpecInfo fragSpec = {
-		.entryPoint = "main",
-		.shader = fragmentShader.get()
-	  };
-
-	  auto presentLayout = m_device->createPipelineLayout(
-		{},
-		core::smart_refctd_ptr(gpuPresentDescriptorSetLayout),
-		nullptr,
-		nullptr,
-		nullptr
-	  );
-	  m_presentPipeline = fsTriProtoPPln.createPipeline(fragSpec, presentLayout.get(), scRes->getRenderpass());
-	  if (!m_presentPipeline)
-		return logFail("Could not create Graphics Pipeline!");
-	}
+				IDescriptorPool::SCreateInfo descriptorPoolInfo = {};
+				descriptorPoolInfo.maxDescriptorCount[static_cast<uint32_t>(asset::IDescriptor::E_TYPE::ET_SAMPLER)] = (uint32_t)nbl::ext::imgui::UI::DefaultSamplerIx::COUNT;
+				descriptorPoolInfo.maxDescriptorCount[static_cast<uint32_t>(asset::IDescriptor::E_TYPE::ET_SAMPLED_IMAGE)] = MaxUITextureCount;
+				descriptorPoolInfo.maxSets = 1u;
+				descriptorPoolInfo.flags = IDescriptorPool::E_CREATE_FLAGS::ECF_UPDATE_AFTER_BIND_BIT;
 
-	// write descriptors
-	IGPUDescriptorSet::SDescriptorInfo infos[3];
-	infos[0].desc = m_gpuTlas;
+				m_guiDescriptorSetPool = m_device->createDescriptorPool(std::move(descriptorPoolInfo));
+				assert(m_guiDescriptorSetPool);
 
-	infos[1].desc = m_hdrImageView;
-	if (!infos[1].desc)
-	  return logFail("Failed to create image view");
-	infos[1].info.image.imageLayout = IImage::LAYOUT::GENERAL;
+				m_guiDescriptorSetPool->createDescriptorSets(1u, &descriptorSetLayout, &m_ui.descriptorSet);
+				assert(m_ui.descriptorSet);
+			}
+		}
 
-	infos[2].desc = m_hdrImageView;
-	infos[2].info.image.imageLayout = IImage::LAYOUT::READ_ONLY_OPTIMAL;
+		m_ui.manager->registerListener(
+			[this]() -> void {
+				ImGuiIO& io = ImGui::GetIO();
 
-	IGPUDescriptorSet::SWriteDescriptorSet writes[] = {
-		{.dstSet = m_rayTracingDs.get(), .binding = 0, .arrayElement = 0, .count = 1, .info = &infos[0]},
-		{.dstSet = m_rayTracingDs.get(), .binding = 1, .arrayElement = 0, .count = 1, .info = &infos[1]},
-		{.dstSet = m_presentDs.get(), .binding = 0, .arrayElement = 0, .count = 1, .info = &infos[2] },
-	};
-	m_device->updateDescriptorSets(std::span(writes), {});
+				m_camera.setProjectionMatrix([&]()
+					{
+						static matrix4SIMD projection;
+
+						projection = matrix4SIMD::buildProjectionMatrixPerspectiveFovRH(
+							core::radians(m_cameraSetting.fov),
+							io.DisplaySize.x / io.DisplaySize.y,
+							m_cameraSetting.zNear,
+							m_cameraSetting.zFar);
+
+						return projection;
+					}());
+
+				ImGui::SetNextWindowPos(ImVec2(1024, 100), ImGuiCond_Appearing);
+				ImGui::SetNextWindowSize(ImVec2(256, 256), ImGuiCond_Appearing);
+
+				// create a window and insert the inspector
+				ImGui::SetNextWindowPos(ImVec2(10, 10), ImGuiCond_Appearing);
+				ImGui::SetNextWindowSize(ImVec2(320, 340), ImGuiCond_Appearing);
+				ImGui::Begin("Controls");
+
+				ImGui::SameLine();
+
+				ImGui::Text("Camera");
+
+				ImGui::SliderFloat("Move speed", &m_cameraSetting.moveSpeed, 0.1f, 10.f);
+				ImGui::SliderFloat("Rotate speed", &m_cameraSetting.rotateSpeed, 0.1f, 10.f);
+				ImGui::SliderFloat("Fov", &m_cameraSetting.fov, 20.f, 150.f);
+				ImGui::SliderFloat("zNear", &m_cameraSetting.zNear, 0.1f, 100.f);
+				ImGui::SliderFloat("zFar", &m_cameraSetting.zFar, 110.f, 10000.f);
+				Light m_oldLight = m_light;
+				int light_type = m_light.type;
+				ImGui::ListBox("LightType", &light_type, s_lightTypeNames, ELT_COUNT);
+				m_light.type = static_cast<E_LIGHT_TYPE>(light_type);
+				if (m_light.type == ELT_DIRECTIONAL)
+				{
+					ImGui::SliderFloat3("Light Direction", &m_light.direction.x, -1.f, 1.f);
+				}
+				else if (m_light.type == ELT_POINT)
+				{
+					ImGui::SliderFloat3("Light Position", &m_light.position.x, -20.f, 20.f);
+				}
+				else if (m_light.type == ELT_SPOT)
+				{
+					ImGui::SliderFloat3("Light Direction", &m_light.direction.x, -1.f, 1.f);
+					ImGui::SliderFloat3("Light Position", &m_light.position.x, -20.f, 20.f);
 
-	// gui descriptor setup
-	{
-	  using binding_flags_t = IGPUDescriptorSetLayout::SBinding::E_CREATE_FLAGS;
-	  {
-		IGPUSampler::SParams params;
-		params.AnisotropicFilter = 1u;
-		params.TextureWrapU = ETC_REPEAT;
-		params.TextureWrapV = ETC_REPEAT;
-		params.TextureWrapW = ETC_REPEAT;
-
-		m_ui.samplers.gui = m_device->createSampler(params);
-		m_ui.samplers.gui->setObjectDebugName("Nabla IMGUI UI Sampler");
-	  }
-
-	  std::array<core::smart_refctd_ptr<IGPUSampler>, 69u> immutableSamplers;
-	  for (auto& it : immutableSamplers)
-		it = smart_refctd_ptr(m_ui.samplers.scene);
-
-	  immutableSamplers[nbl::ext::imgui::UI::FontAtlasTexId] = smart_refctd_ptr(m_ui.samplers.gui);
-
-	  nbl::ext::imgui::UI::SCreationParameters params;
-
-	  params.resources.texturesInfo = { .setIx = 0u, .bindingIx = 0u };
-	  params.resources.samplersInfo = { .setIx = 0u, .bindingIx = 1u };
-	  params.assetManager = m_assetMgr;
-	  params.pipelineCache = nullptr;
-	  params.pipelineLayout = nbl::ext::imgui::UI::createDefaultPipelineLayout(m_utils->getLogicalDevice(), params.resources.texturesInfo, params.resources.samplersInfo, MaxUITextureCount);
-	  params.renderpass = smart_refctd_ptr<IGPURenderpass>(renderpass);
-	  params.streamingBuffer = nullptr;
-	  params.subpassIx = 0u;
-	  params.transfer = getTransferUpQueue();
-	  params.utilities = m_utils;
-	  {
-		m_ui.manager = ext::imgui::UI::create(std::move(params));
-
-		// note that we use default layout provided by our extension, but you are free to create your own by filling nbl::ext::imgui::UI::S_CREATION_PARAMETERS::resources
-		const auto* descriptorSetLayout = m_ui.manager->getPipeline()->getLayout()->getDescriptorSetLayout(0u);
-		const auto& params = m_ui.manager->getCreationParameters();
-
-		IDescriptorPool::SCreateInfo descriptorPoolInfo = {};
-		descriptorPoolInfo.maxDescriptorCount[static_cast<uint32_t>(asset::IDescriptor::E_TYPE::ET_SAMPLER)] = (uint32_t)nbl::ext::imgui::UI::DefaultSamplerIx::COUNT;
-		descriptorPoolInfo.maxDescriptorCount[static_cast<uint32_t>(asset::IDescriptor::E_TYPE::ET_SAMPLED_IMAGE)] = MaxUITextureCount;
-		descriptorPoolInfo.maxSets = 1u;
-		descriptorPoolInfo.flags = IDescriptorPool::E_CREATE_FLAGS::ECF_UPDATE_AFTER_BIND_BIT;
-
-		m_guiDescriptorSetPool = m_device->createDescriptorPool(std::move(descriptorPoolInfo));
-		assert(m_guiDescriptorSetPool);
-
-		m_guiDescriptorSetPool->createDescriptorSets(1u, &descriptorSetLayout, &m_ui.descriptorSet);
-		assert(m_ui.descriptorSet);
-	  }
-	}
+					float32_t dOuterCutoff = hlsl::degrees(acos(m_light.outerCutoff));
+					if (ImGui::SliderFloat("Light Outer Cutoff", &dOuterCutoff, 0.0f, 45.0f))
+					{
+						m_light.outerCutoff = cos(hlsl::radians(dOuterCutoff));
+					}
+				}
+				ImGui::Checkbox("Use Indirect Command", &m_useIndirectCommand);
+				if (m_light != m_oldLight)
+				{
+					m_frameAccumulationCounter = 0;
+				}
 
-	m_ui.manager->registerListener(
-	  [this]() -> void {
-		ImGuiIO& io = ImGui::GetIO();
+				ImGui::Text("X: %f Y: %f", io.MousePos.x, io.MousePos.y);
 
-		m_camera.setProjectionMatrix([&]()
-		{
-		  static matrix4SIMD projection;
-
-		  projection = matrix4SIMD::buildProjectionMatrixPerspectiveFovRH(
-			core::radians(m_cameraSetting.fov), 
-			io.DisplaySize.x / io.DisplaySize.y, 
-			m_cameraSetting.zNear, 
-			m_cameraSetting.zFar);
-
-		  return projection;
-		}());
-
-		ImGui::SetNextWindowPos(ImVec2(1024, 100), ImGuiCond_Appearing);
-		ImGui::SetNextWindowSize(ImVec2(256, 256), ImGuiCond_Appearing);
-
-		// create a window and insert the inspector
-		ImGui::SetNextWindowPos(ImVec2(10, 10), ImGuiCond_Appearing);
-		ImGui::SetNextWindowSize(ImVec2(320, 340), ImGuiCond_Appearing);
-		ImGui::Begin("Controls");
-
-		ImGui::SameLine();
-
-		ImGui::Text("Camera");
-
-		ImGui::SliderFloat("Move speed", &m_cameraSetting.moveSpeed, 0.1f, 10.f);
-		ImGui::SliderFloat("Rotate speed", &m_cameraSetting.rotateSpeed, 0.1f, 10.f);
-		ImGui::SliderFloat("Fov", &m_cameraSetting.fov, 20.f, 150.f);
-		ImGui::SliderFloat("zNear", &m_cameraSetting.zNear, 0.1f, 100.f);
-		ImGui::SliderFloat("zFar", &m_cameraSetting.zFar, 110.f, 10000.f);
-		Light m_oldLight = m_light;
-		int light_type = m_light.type;
-		ImGui::ListBox("LightType", &light_type, s_lightTypeNames, ELT_COUNT);
-		m_light.type = static_cast<E_LIGHT_TYPE>(light_type);
-		if (m_light.type == ELT_DIRECTIONAL)
-		{
-		  ImGui::SliderFloat3("Light Direction", &m_light.direction.x, -1.f, 1.f);
-		} else if (m_light.type == ELT_POINT)
-		{
-		  ImGui::SliderFloat3("Light Position", &m_light.position.x, -20.f, 20.f);
-		} else if (m_light.type == ELT_SPOT)
-		{
-		  ImGui::SliderFloat3("Light Direction", &m_light.direction.x, -1.f, 1.f);
-		  ImGui::SliderFloat3("Light Position", &m_light.position.x, -20.f, 20.f);
-
-		  float32_t dOuterCutoff = hlsl::degrees(acos(m_light.outerCutoff));
-		  if (ImGui::SliderFloat("Light Outer Cutoff", &dOuterCutoff, 0.0f, 45.0f))
-		  {
-			m_light.outerCutoff = cos(hlsl::radians(dOuterCutoff));
-		  }
-		}
-		ImGui::Checkbox("Use Indirect Command", &m_useIndirectCommand);
-		if (m_light != m_oldLight)
+				ImGui::End();
+			}
+		);
+
+		// Set Camera
 		{
-		  m_frameAccumulationCounter = 0;
+			core::vectorSIMDf cameraPosition(0, 5, -10);
+			matrix4SIMD proj = matrix4SIMD::buildProjectionMatrixPerspectiveFovRH(
+				core::radians(60.0f),
+				WIN_W / WIN_H,
+				0.01f,
+				500.0f
+			);
+			m_camera = Camera(cameraPosition, core::vectorSIMDf(0, 0, 0), proj);
 		}
 
-		ImGui::Text("X: %f Y: %f", io.MousePos.x, io.MousePos.y);
-
-		ImGui::End();
-	  }
-	);
+		m_winMgr->setWindowSize(m_window.get(), WIN_W, WIN_H);
+		m_surface->recreateSwapchain();
+		m_winMgr->show(m_window.get());
+		m_oracle.reportBeginFrameRecord();
+		m_camera.mapKeysToWASD();
 
-	// Set Camera
-	{
-	  core::vectorSIMDf cameraPosition(0, 5, -10);
-	  matrix4SIMD proj = matrix4SIMD::buildProjectionMatrixPerspectiveFovRH(
-		core::radians(60.0f),
-		WIN_W / WIN_H,
-		0.01f,
-		500.0f
-	  );
-	  m_camera = Camera(cameraPosition, core::vectorSIMDf(0, 0, 0), proj);
+		return true;
 	}
 
-	m_winMgr->setWindowSize(m_window.get(), WIN_W, WIN_H);
-	m_surface->recreateSwapchain();
-	m_winMgr->show(m_window.get());
-	m_oracle.reportBeginFrameRecord();
-	m_camera.mapKeysToWASD();
-
-	return true;
-  }
+	bool updateGUIDescriptorSet()
+	{
+		// texture atlas, note we don't create info & write pair for the font sampler because UI extension's is immutable and baked into DS layout
+		static std::array<IGPUDescriptorSet::SDescriptorInfo, MaxUITextureCount> descriptorInfo;
+		static IGPUDescriptorSet::SWriteDescriptorSet writes[MaxUITextureCount];
 
-  bool updateGUIDescriptorSet()
-  {
-	// texture atlas, note we don't create info & write pair for the font sampler because UI extension's is immutable and baked into DS layout
-	static std::array<IGPUDescriptorSet::SDescriptorInfo, MaxUITextureCount> descriptorInfo;
-	static IGPUDescriptorSet::SWriteDescriptorSet writes[MaxUITextureCount];
+		descriptorInfo[nbl::ext::imgui::UI::FontAtlasTexId].info.image.imageLayout = IImage::LAYOUT::READ_ONLY_OPTIMAL;
+		descriptorInfo[nbl::ext::imgui::UI::FontAtlasTexId].desc = smart_refctd_ptr<IGPUImageView>(m_ui.manager->getFontAtlasView());
 
-	descriptorInfo[nbl::ext::imgui::UI::FontAtlasTexId].info.image.imageLayout = IImage::LAYOUT::READ_ONLY_OPTIMAL;
-	descriptorInfo[nbl::ext::imgui::UI::FontAtlasTexId].desc = smart_refctd_ptr<IGPUImageView>(m_ui.manager->getFontAtlasView());
+		for (uint32_t i = 0; i < descriptorInfo.size(); ++i)
+		{
+			writes[i].dstSet = m_ui.descriptorSet.get();
+			writes[i].binding = 0u;
+			writes[i].arrayElement = i;
+			writes[i].count = 1u;
+		}
+		writes[nbl::ext::imgui::UI::FontAtlasTexId].info = descriptorInfo.data() + nbl::ext::imgui::UI::FontAtlasTexId;
 
-	for (uint32_t i = 0; i < descriptorInfo.size(); ++i)
-	{
-	  writes[i].dstSet = m_ui.descriptorSet.get();
-	  writes[i].binding = 0u;
-	  writes[i].arrayElement = i;
-	  writes[i].count = 1u;
+		return m_device->updateDescriptorSets(writes, {});
 	}
-	writes[nbl::ext::imgui::UI::FontAtlasTexId].info = descriptorInfo.data() + nbl::ext::imgui::UI::FontAtlasTexId;
-
-	return m_device->updateDescriptorSets(writes, {});
-  }
-
-  inline void workLoopBody() override
-  {
-	// framesInFlight: ensuring safe execution of command buffers and acquires, `framesInFlight` only affect semaphore waits, don't use this to index your resources because it can change with swapchain recreation.
-	const uint32_t framesInFlight = core::min(MaxFramesInFlight, m_surface->getMaxAcquiresInFlight());
-	// We block for semaphores for 2 reasons here:
-	  // A) Resource: Can't use resource like a command buffer BEFORE previous use is finished! [MaxFramesInFlight]
-	  // B) Acquire: Can't have more acquires in flight than a certain threshold returned by swapchain or your surface helper class. [MaxAcquiresInFlight]
-	if (m_realFrameIx >= framesInFlight)
+
+	inline void workLoopBody() override
 	{
-	  const ISemaphore::SWaitInfo cbDonePending[] = 
-	  {
+		// framesInFlight: ensuring safe execution of command buffers and acquires, `framesInFlight` only affect semaphore waits, don't use this to index your resources because it can change with swapchain recreation.
+		const uint32_t framesInFlight = core::min(MaxFramesInFlight, m_surface->getMaxAcquiresInFlight());
+		// We block for semaphores for 2 reasons here:
+		  // A) Resource: Can't use resource like a command buffer BEFORE previous use is finished! [MaxFramesInFlight]
+		  // B) Acquire: Can't have more acquires in flight than a certain threshold returned by swapchain or your surface helper class. [MaxAcquiresInFlight]
+		if (m_realFrameIx >= framesInFlight)
 		{
-		  .semaphore = m_semaphore.get(),
-		  .value = m_realFrameIx + 1 - framesInFlight
+			const ISemaphore::SWaitInfo cbDonePending[] =
+			{
+			  {
+				.semaphore = m_semaphore.get(),
+				.value = m_realFrameIx + 1 - framesInFlight
+			  }
+			};
+			if (m_device->blockForSemaphores(cbDonePending) != ISemaphore::WAIT_RESULT::SUCCESS)
+				return;
 		}
-	  };
-	  if (m_device->blockForSemaphores(cbDonePending) != ISemaphore::WAIT_RESULT::SUCCESS)
-		return;
-	}
-	const auto resourceIx = m_realFrameIx % MaxFramesInFlight;
+		const auto resourceIx = m_realFrameIx % MaxFramesInFlight;
 
-	m_api->startCapture();
+		m_api->startCapture();
 
-	update();
+		update();
 
-	auto queue = getGraphicsQueue();
-	auto cmdbuf = m_cmdBufs[resourceIx].get();
+		auto queue = getGraphicsQueue();
+		auto cmdbuf = m_cmdBufs[resourceIx].get();
 
-	if (!keepRunning())
-	  return;
+		if (!keepRunning())
+			return;
 
-	cmdbuf->reset(IGPUCommandBuffer::RESET_FLAGS::RELEASE_RESOURCES_BIT);
-	cmdbuf->begin(IGPUCommandBuffer::USAGE::ONE_TIME_SUBMIT_BIT);
-	cmdbuf->beginDebugMarker("RaytracingPipelineApp Frame");
+		cmdbuf->reset(IGPUCommandBuffer::RESET_FLAGS::RELEASE_RESOURCES_BIT);
+		cmdbuf->begin(IGPUCommandBuffer::USAGE::ONE_TIME_SUBMIT_BIT);
+		cmdbuf->beginDebugMarker("RaytracingPipelineApp Frame");
 
-	const auto viewMatrix = m_camera.getViewMatrix();
-	const auto projectionMatrix = m_camera.getProjectionMatrix();
-	const auto viewProjectionMatrix = m_camera.getConcatenatedMatrix();
+		const auto viewMatrix = m_camera.getViewMatrix();
+		const auto projectionMatrix = m_camera.getProjectionMatrix();
+		const auto viewProjectionMatrix = m_camera.getConcatenatedMatrix();
 
-	core::matrix3x4SIMD modelMatrix;
-	modelMatrix.setTranslation(nbl::core::vectorSIMDf(0, 0, 0, 0));
-	modelMatrix.setRotation(quaternion(0, 0, 0));
+		core::matrix3x4SIMD modelMatrix;
+		modelMatrix.setTranslation(nbl::core::vectorSIMDf(0, 0, 0, 0));
+		modelMatrix.setRotation(quaternion(0, 0, 0));
 
-	core::matrix4SIMD modelViewProjectionMatrix = core::concatenateBFollowedByA(viewProjectionMatrix, modelMatrix);
-	if (m_cachedModelViewProjectionMatrix != modelViewProjectionMatrix)
-	{
-	  m_frameAccumulationCounter = 0;
-	  m_cachedModelViewProjectionMatrix = modelViewProjectionMatrix;
-	}
-	core::matrix4SIMD invModelViewProjectionMatrix;
-	modelViewProjectionMatrix.getInverseTransform(invModelViewProjectionMatrix);
+		core::matrix4SIMD modelViewProjectionMatrix = core::concatenateBFollowedByA(viewProjectionMatrix, modelMatrix);
+		if (m_cachedModelViewProjectionMatrix != modelViewProjectionMatrix)
+		{
+			m_frameAccumulationCounter = 0;
+			m_cachedModelViewProjectionMatrix = modelViewProjectionMatrix;
+		}
+		core::matrix4SIMD invModelViewProjectionMatrix;
+		modelViewProjectionMatrix.getInverseTransform(invModelViewProjectionMatrix);
 
-	{
-	  IGPUCommandBuffer::SPipelineBarrierDependencyInfo::image_barrier_t imageBarriers[1];
-	  imageBarriers[0].barrier = {
-		 .dep = {
-		   .srcStageMask = PIPELINE_STAGE_FLAGS::FRAGMENT_SHADER_BIT, // previous frame read from framgent shader
-		   .srcAccessMask = ACCESS_FLAGS::SHADER_READ_BITS,
-		   .dstStageMask = PIPELINE_STAGE_FLAGS::RAY_TRACING_SHADER_BIT,
-		   .dstAccessMask = ACCESS_FLAGS::SHADER_WRITE_BITS
+		{
+			IGPUCommandBuffer::SPipelineBarrierDependencyInfo::image_barrier_t imageBarriers[1];
+			imageBarriers[0].barrier = {
+			   .dep = {
+				 .srcStageMask = PIPELINE_STAGE_FLAGS::FRAGMENT_SHADER_BIT, // previous frame read from framgent shader
+				 .srcAccessMask = ACCESS_FLAGS::SHADER_READ_BITS,
+				 .dstStageMask = PIPELINE_STAGE_FLAGS::RAY_TRACING_SHADER_BIT,
+				 .dstAccessMask = ACCESS_FLAGS::SHADER_WRITE_BITS
+			  }
+			};
+			imageBarriers[0].image = m_hdrImage.get();
+			imageBarriers[0].subresourceRange = {
+			  .aspectMask = IImage::EAF_COLOR_BIT,
+			  .baseMipLevel = 0u,
+			  .levelCount = 1u,
+			  .baseArrayLayer = 0u,
+			  .layerCount = 1u
+			};
+			imageBarriers[0].oldLayout = m_frameAccumulationCounter == 0 ? IImage::LAYOUT::UNDEFINED : IImage::LAYOUT::READ_ONLY_OPTIMAL;
+			imageBarriers[0].newLayout = IImage::LAYOUT::GENERAL;
+			cmdbuf->pipelineBarrier(E_DEPENDENCY_FLAGS::EDF_NONE, { .imgBarriers = imageBarriers });
 		}
-	  };
-	  imageBarriers[0].image = m_hdrImage.get();
-	  imageBarriers[0].subresourceRange = {
-		.aspectMask = IImage::EAF_COLOR_BIT,
-		.baseMipLevel = 0u,
-		.levelCount = 1u,
-		.baseArrayLayer = 0u,
-		.layerCount = 1u
-	  };
-	  imageBarriers[0].oldLayout = m_frameAccumulationCounter == 0 ? IImage::LAYOUT::UNDEFINED : IImage::LAYOUT::READ_ONLY_OPTIMAL;
-	  imageBarriers[0].newLayout = IImage::LAYOUT::GENERAL;
-	  cmdbuf->pipelineBarrier(E_DEPENDENCY_FLAGS::EDF_NONE, { .imgBarriers = imageBarriers });
-	}
 
-	// Trace Rays Pass
-	{
-	  SPushConstants pc;
-	  pc.light = m_light;
-	  pc.proceduralGeomInfoBuffer = m_proceduralGeomInfoBuffer->getDeviceAddress();
-	  pc.triangleGeomInfoBuffer = m_triangleGeomInfoBuffer->getDeviceAddress();
-	  pc.frameCounter = m_frameAccumulationCounter;
-	  const core::vector3df camPos = m_camera.getPosition().getAsVector3df();
-	  pc.camPos = { camPos.X, camPos.Y, camPos.Z };
-	  memcpy(&pc.invMVP, invModelViewProjectionMatrix.pointer(), sizeof(pc.invMVP));
-
-	  cmdbuf->bindRayTracingPipeline(m_rayTracingPipeline.get());
-	  cmdbuf->setRayTracingPipelineStackSize(m_rayTracingStackSize);
-	  cmdbuf->pushConstants(m_rayTracingPipeline->getLayout(), IShader::E_SHADER_STAGE::ESS_ALL_RAY_TRACING, 0, sizeof(SPushConstants), &pc);
-	  cmdbuf->bindDescriptorSets(EPBP_RAY_TRACING, m_rayTracingPipeline->getLayout(), 0, 1, &m_rayTracingDs.get());
-	  if (m_useIndirectCommand)
-	  {
-		cmdbuf->traceRaysIndirect(
-		  SBufferBinding<const IGPUBuffer>{
-			.offset = 0,
-			.buffer = m_indirectBuffer,
-		  });
-	  }else
-	  {
-		cmdbuf->traceRays(
-		  m_shaderBindingTable.raygenGroupRange,
-		  m_shaderBindingTable.missGroupsRange, m_shaderBindingTable.missGroupsStride,
-		  m_shaderBindingTable.hitGroupsRange, m_shaderBindingTable.hitGroupsStride,
-		  m_shaderBindingTable.callableGroupsRange, m_shaderBindingTable.callableGroupsStride,
-		  WIN_W, WIN_H, 1);
-	  }
-	}
+		// Trace Rays Pass
+		{
+			SPushConstants pc;
+			pc.light = m_light;
+			pc.proceduralGeomInfoBuffer = m_proceduralGeomInfoBuffer->getDeviceAddress();
+			pc.triangleGeomInfoBuffer = m_triangleGeomInfoBuffer->getDeviceAddress();
+			pc.frameCounter = m_frameAccumulationCounter;
+			const core::vector3df camPos = m_camera.getPosition().getAsVector3df();
+			pc.camPos = { camPos.X, camPos.Y, camPos.Z };
+			memcpy(&pc.invMVP, invModelViewProjectionMatrix.pointer(), sizeof(pc.invMVP));
+
+			cmdbuf->bindRayTracingPipeline(m_rayTracingPipeline.get());
+			cmdbuf->setRayTracingPipelineStackSize(m_rayTracingStackSize);
+			cmdbuf->pushConstants(m_rayTracingPipeline->getLayout(), IShader::E_SHADER_STAGE::ESS_ALL_RAY_TRACING, 0, sizeof(SPushConstants), &pc);
+			cmdbuf->bindDescriptorSets(EPBP_RAY_TRACING, m_rayTracingPipeline->getLayout(), 0, 1, &m_rayTracingDs.get());
+			if (m_useIndirectCommand)
+			{
+				cmdbuf->traceRaysIndirect(
+					SBufferBinding<const IGPUBuffer>{
+					.offset = 0,
+						.buffer = m_indirectBuffer,
+				});
+			}
+			else
+			{
+				cmdbuf->traceRays(
+					m_shaderBindingTable.raygenGroupRange,
+					m_shaderBindingTable.missGroupsRange, m_shaderBindingTable.missGroupsStride,
+					m_shaderBindingTable.hitGroupsRange, m_shaderBindingTable.hitGroupsStride,
+					m_shaderBindingTable.callableGroupsRange, m_shaderBindingTable.callableGroupsStride,
+					WIN_W, WIN_H, 1);
+			}
+		}
 
-	// pipeline barrier
-	{
-	  IGPUCommandBuffer::SPipelineBarrierDependencyInfo::image_barrier_t imageBarriers[1];
-	  imageBarriers[0].barrier = {
-		.dep = {
-		  .srcStageMask = PIPELINE_STAGE_FLAGS::RAY_TRACING_SHADER_BIT,
-		  .srcAccessMask = ACCESS_FLAGS::SHADER_WRITE_BITS,
-		  .dstStageMask = PIPELINE_STAGE_FLAGS::COLOR_ATTACHMENT_OUTPUT_BIT,
-		  .dstAccessMask = ACCESS_FLAGS::COLOR_ATTACHMENT_WRITE_BIT
+		// pipeline barrier
+		{
+			IGPUCommandBuffer::SPipelineBarrierDependencyInfo::image_barrier_t imageBarriers[1];
+			imageBarriers[0].barrier = {
+			  .dep = {
+				.srcStageMask = PIPELINE_STAGE_FLAGS::RAY_TRACING_SHADER_BIT,
+				.srcAccessMask = ACCESS_FLAGS::SHADER_WRITE_BITS,
+				.dstStageMask = PIPELINE_STAGE_FLAGS::COLOR_ATTACHMENT_OUTPUT_BIT,
+				.dstAccessMask = ACCESS_FLAGS::COLOR_ATTACHMENT_WRITE_BIT
+			  }
+			};
+			imageBarriers[0].image = m_hdrImage.get();
+			imageBarriers[0].subresourceRange = {
+			  .aspectMask = IImage::EAF_COLOR_BIT,
+			  .baseMipLevel = 0u,
+			  .levelCount = 1u,
+			  .baseArrayLayer = 0u,
+			  .layerCount = 1u
+			};
+			imageBarriers[0].oldLayout = IImage::LAYOUT::GENERAL;
+			imageBarriers[0].newLayout = IImage::LAYOUT::READ_ONLY_OPTIMAL;
+
+			cmdbuf->pipelineBarrier(E_DEPENDENCY_FLAGS::EDF_NONE, { .imgBarriers = imageBarriers });
 		}
-	  };
-	  imageBarriers[0].image = m_hdrImage.get();
-	  imageBarriers[0].subresourceRange = {
-		.aspectMask = IImage::EAF_COLOR_BIT,
-		.baseMipLevel = 0u,
-		.levelCount = 1u,
-		.baseArrayLayer = 0u,
-		.layerCount = 1u
-	  };
-	  imageBarriers[0].oldLayout = IImage::LAYOUT::GENERAL;
-	  imageBarriers[0].newLayout = IImage::LAYOUT::READ_ONLY_OPTIMAL;
-
-	  cmdbuf->pipelineBarrier(E_DEPENDENCY_FLAGS::EDF_NONE, { .imgBarriers = imageBarriers });
-	}
 
-	{
+		{
 			asset::SViewport viewport;
 			{
 				viewport.minDepth = 1.f;
@@ -797,1080 +805,1223 @@ class RaytracingPipelineApp final : public examples::SimpleWindowedApplication,
 			VkRect2D defaultScisors[] = { {.offset = {(int32_t)viewport.x, (int32_t)viewport.y}, .extent = {(uint32_t)viewport.width, (uint32_t)viewport.height}} };
 			cmdbuf->setScissor(defaultScisors);
 
-	  auto scRes = static_cast<CDefaultSwapchainFramebuffers*>(m_surface->getSwapchainResources());
-	  const VkRect2D currentRenderArea =
-	  {
-		.offset = {0,0},
-		.extent = {m_window->getWidth(),m_window->getHeight()}
-	  };
-	  const IGPUCommandBuffer::SClearColorValue clearColor = { .float32 = {0.f,0.f,0.f,1.f} };
-	  const IGPUCommandBuffer::SRenderpassBeginInfo info =
-	  {
-		.framebuffer = scRes->getFramebuffer(m_currentImageAcquire.imageIndex),
-		.colorClearValues = &clearColor,
-		.depthStencilClearValues = nullptr,
-		.renderArea = currentRenderArea
-	  };
-	  nbl::video::ISemaphore::SWaitInfo waitInfo = { .semaphore = m_semaphore.get(), .value = m_realFrameIx + 1u };
-
-	  cmdbuf->beginRenderPass(info, IGPUCommandBuffer::SUBPASS_CONTENTS::INLINE);
-
-	  cmdbuf->bindGraphicsPipeline(m_presentPipeline.get());
-	  cmdbuf->bindDescriptorSets(EPBP_GRAPHICS, m_presentPipeline->getLayout(), 0, 1u, &m_presentDs.get());
-	  ext::FullScreenTriangle::recordDrawCall(cmdbuf);
-
-	  const auto uiParams = m_ui.manager->getCreationParameters();
-	  auto* uiPipeline = m_ui.manager->getPipeline();
-	  cmdbuf->bindGraphicsPipeline(uiPipeline);
-	  cmdbuf->bindDescriptorSets(EPBP_GRAPHICS, uiPipeline->getLayout(), uiParams.resources.texturesInfo.setIx, 1u, &m_ui.descriptorSet.get());
-	  m_ui.manager->render(cmdbuf, waitInfo);
-
-	  cmdbuf->endRenderPass();
+			auto scRes = static_cast<CDefaultSwapchainFramebuffers*>(m_surface->getSwapchainResources());
+			const VkRect2D currentRenderArea =
+			{
+			  .offset = {0,0},
+			  .extent = {m_window->getWidth(),m_window->getHeight()}
+			};
+			const IGPUCommandBuffer::SClearColorValue clearColor = { .float32 = {0.f,0.f,0.f,1.f} };
+			const IGPUCommandBuffer::SRenderpassBeginInfo info =
+			{
+			  .framebuffer = scRes->getFramebuffer(m_currentImageAcquire.imageIndex),
+			  .colorClearValues = &clearColor,
+			  .depthStencilClearValues = nullptr,
+			  .renderArea = currentRenderArea
+			};
+			nbl::video::ISemaphore::SWaitInfo waitInfo = { .semaphore = m_semaphore.get(), .value = m_realFrameIx + 1u };
+
+			cmdbuf->beginRenderPass(info, IGPUCommandBuffer::SUBPASS_CONTENTS::INLINE);
 
-	}
+			cmdbuf->bindGraphicsPipeline(m_presentPipeline.get());
+			cmdbuf->bindDescriptorSets(EPBP_GRAPHICS, m_presentPipeline->getLayout(), 0, 1u, &m_presentDs.get());
+			ext::FullScreenTriangle::recordDrawCall(cmdbuf);
 
-	cmdbuf->endDebugMarker();
-	cmdbuf->end();
+			const auto uiParams = m_ui.manager->getCreationParameters();
+			auto* uiPipeline = m_ui.manager->getPipeline();
+			cmdbuf->bindGraphicsPipeline(uiPipeline);
+			cmdbuf->bindDescriptorSets(EPBP_GRAPHICS, uiPipeline->getLayout(), uiParams.resources.texturesInfo.setIx, 1u, &m_ui.descriptorSet.get());
+			m_ui.manager->render(cmdbuf, waitInfo);
+
+			cmdbuf->endRenderPass();
 
-	{
-	  const IQueue::SSubmitInfo::SSemaphoreInfo rendered[] =
-	  {
-		{
-		  .semaphore = m_semaphore.get(),
-		  .value = ++m_realFrameIx,
-		  .stageMask = PIPELINE_STAGE_FLAGS::ALL_TRANSFER_BITS
 		}
-	  };
-	  {
-		{
-		  const IQueue::SSubmitInfo::SCommandBufferInfo commandBuffers[] =
-		  {
-			{.cmdbuf = cmdbuf }
-		  };
 
-		  const IQueue::SSubmitInfo::SSemaphoreInfo acquired[] =
-		  {
+		cmdbuf->endDebugMarker();
+		cmdbuf->end();
+
+		{
+			const IQueue::SSubmitInfo::SSemaphoreInfo rendered[] =
 			{
-			  .semaphore = m_currentImageAcquire.semaphore,
-			  .value = m_currentImageAcquire.acquireCount,
-			  .stageMask = PIPELINE_STAGE_FLAGS::NONE
-			}
-		  };
-		  const IQueue::SSubmitInfo infos[] =
-		  {
+			  {
+				.semaphore = m_semaphore.get(),
+				.value = ++m_realFrameIx,
+				.stageMask = PIPELINE_STAGE_FLAGS::ALL_TRANSFER_BITS
+			  }
+			};
 			{
-			  .waitSemaphores = acquired,
-			  .commandBuffers = commandBuffers,
-			  .signalSemaphores = rendered
+				{
+					const IQueue::SSubmitInfo::SCommandBufferInfo commandBuffers[] =
+					{
+					  {.cmdbuf = cmdbuf }
+					};
+
+					const IQueue::SSubmitInfo::SSemaphoreInfo acquired[] =
+					{
+					  {
+						.semaphore = m_currentImageAcquire.semaphore,
+						.value = m_currentImageAcquire.acquireCount,
+						.stageMask = PIPELINE_STAGE_FLAGS::NONE
+					  }
+					};
+					const IQueue::SSubmitInfo infos[] =
+					{
+					  {
+						.waitSemaphores = acquired,
+						.commandBuffers = commandBuffers,
+						.signalSemaphores = rendered
+					  }
+					};
+
+					updateGUIDescriptorSet();
+
+					if (queue->submit(infos) != IQueue::RESULT::SUCCESS)
+						m_realFrameIx--;
+				}
 			}
-		  };
 
-		  updateGUIDescriptorSet();
+			m_window->setCaption("[Nabla Engine] Ray Tracing Pipeline");
+			m_surface->present(m_currentImageAcquire.imageIndex, rendered);
+		}
+		m_api->endCapture();
+		m_frameAccumulationCounter++;
+	}
+
+	inline void update()
+	{
+		m_camera.setMoveSpeed(m_cameraSetting.moveSpeed);
+		m_camera.setRotateSpeed(m_cameraSetting.rotateSpeed);
+
+		static std::chrono::microseconds previousEventTimestamp{};
+
+		m_inputSystem->getDefaultMouse(&m_mouse);
+		m_inputSystem->getDefaultKeyboard(&m_keyboard);
+
+		auto updatePresentationTimestamp = [&]()
+			{
+				m_currentImageAcquire = m_surface->acquireNextImage();
+
+				m_oracle.reportEndFrameRecord();
+				const auto timestamp = m_oracle.getNextPresentationTimeStamp();
+				m_oracle.reportBeginFrameRecord();
+
+				return timestamp;
+			};
+
+		const auto nextPresentationTimestamp = updatePresentationTimestamp();
+
+		struct
+		{
+			std::vector<SMouseEvent> mouse{};
+			std::vector<SKeyboardEvent> keyboard{};
+		} capturedEvents;
+
+		m_camera.beginInputProcessing(nextPresentationTimestamp);
+		{
+			const auto& io = ImGui::GetIO();
+			m_mouse.consumeEvents([&](const IMouseEventChannel::range_t& events) -> void
+				{
+					if (!io.WantCaptureMouse)
+						m_camera.mouseProcess(events); // don't capture the events, only let camera handle them with its impl
+
+					for (const auto& e : events) // here capture
+					{
+						if (e.timeStamp < previousEventTimestamp)
+							continue;
+
+						previousEventTimestamp = e.timeStamp;
+						capturedEvents.mouse.emplace_back(e);
+
+					}
+				}, m_logger.get());
+
+			m_keyboard.consumeEvents([&](const IKeyboardEventChannel::range_t& events) -> void
+				{
+					if (!io.WantCaptureKeyboard)
+						m_camera.keyboardProcess(events); // don't capture the events, only let camera handle them with its impl
+
+					for (const auto& e : events) // here capture
+					{
+						if (e.timeStamp < previousEventTimestamp)
+							continue;
+
+						previousEventTimestamp = e.timeStamp;
+						capturedEvents.keyboard.emplace_back(e);
+					}
+				}, m_logger.get());
 
-		  if (queue->submit(infos) != IQueue::RESULT::SUCCESS)
-			m_realFrameIx--;
 		}
-	  }
+		m_camera.endInputProcessing(nextPresentationTimestamp);
 
-	  m_window->setCaption("[Nabla Engine] Ray Tracing Pipeline");
-	  m_surface->present(m_currentImageAcquire.imageIndex, rendered);
-	}
-	m_api->endCapture();
-	m_frameAccumulationCounter++;
-  }
+		const core::SRange<const nbl::ui::SMouseEvent> mouseEvents(capturedEvents.mouse.data(), capturedEvents.mouse.data() + capturedEvents.mouse.size());
+		const core::SRange<const nbl::ui::SKeyboardEvent> keyboardEvents(capturedEvents.keyboard.data(), capturedEvents.keyboard.data() + capturedEvents.keyboard.size());
+		const auto cursorPosition = m_window->getCursorControl()->getPosition();
+		const auto mousePosition = float32_t2(cursorPosition.x, cursorPosition.y) - float32_t2(m_window->getX(), m_window->getY());
 
-  inline void update()
-  {
-	m_camera.setMoveSpeed(m_cameraSetting.moveSpeed);
-	m_camera.setRotateSpeed(m_cameraSetting.rotateSpeed);
+		const ext::imgui::UI::SUpdateParameters params =
+		{
+		  .mousePosition = mousePosition,
+		  .displaySize = { m_window->getWidth(), m_window->getHeight() },
+		  .mouseEvents = mouseEvents,
+		  .keyboardEvents = keyboardEvents
+		};
+
+		m_ui.manager->update(params);
+	}
 
-	static std::chrono::microseconds previousEventTimestamp{};
+	inline bool keepRunning() override
+	{
+		if (m_surface->irrecoverable())
+			return false;
 
-	m_inputSystem->getDefaultMouse(&m_mouse);
-	m_inputSystem->getDefaultKeyboard(&m_keyboard);
+		return true;
+	}
 
-	auto updatePresentationTimestamp = [&]()
-	  {
-		m_currentImageAcquire = m_surface->acquireNextImage();
+	inline bool onAppTerminated() override
+	{
+		return device_base_t::onAppTerminated();
+	}
 
-		m_oracle.reportEndFrameRecord();
-		const auto timestamp = m_oracle.getNextPresentationTimeStamp();
-		m_oracle.reportBeginFrameRecord();
+private:
+	uint32_t getWorkgroupCount(uint32_t dim, uint32_t size)
+	{
+		return (dim + size - 1) / size;
+	}
 
-		return timestamp;
-	  };
+	smart_refctd_ptr<IGPUBuffer> createBuffer(IGPUBuffer::SCreationParams& params)
+	{
+		smart_refctd_ptr<IGPUBuffer> buffer;
+		buffer = m_device->createBuffer(std::move(params));
+		auto bufReqs = buffer->getMemoryReqs();
+		bufReqs.memoryTypeBits &= m_physicalDevice->getDeviceLocalMemoryTypeBits();
+		m_device->allocate(bufReqs, buffer.get(), IDeviceMemoryAllocation::EMAF_DEVICE_ADDRESS_BIT);
 
-	const auto nextPresentationTimestamp = updatePresentationTimestamp();
+		return buffer;
+	}
 
-	struct
+	smart_refctd_ptr<IGPUCommandBuffer> getSingleUseCommandBufferAndBegin(smart_refctd_ptr<IGPUCommandPool> pool)
 	{
-	  std::vector<SMouseEvent> mouse{};
-	  std::vector<SKeyboardEvent> keyboard{};
-	} capturedEvents;
+		smart_refctd_ptr<IGPUCommandBuffer> cmdbuf;
+		if (!pool->createCommandBuffers(IGPUCommandPool::BUFFER_LEVEL::PRIMARY, 1u, &cmdbuf))
+			return nullptr;
+
+		cmdbuf->reset(IGPUCommandBuffer::RESET_FLAGS::RELEASE_RESOURCES_BIT);
+		cmdbuf->begin(IGPUCommandBuffer::USAGE::ONE_TIME_SUBMIT_BIT);
+
+		return cmdbuf;
+	}
 
-	m_camera.beginInputProcessing(nextPresentationTimestamp);
+	void cmdbufSubmitAndWait(smart_refctd_ptr<IGPUCommandBuffer> cmdbuf, CThreadSafeQueueAdapter* queue, uint64_t startValue)
 	{
-	  const auto& io = ImGui::GetIO();
-	  m_mouse.consumeEvents([&](const IMouseEventChannel::range_t& events) -> void
+		cmdbuf->end();
+
+		uint64_t finishedValue = startValue + 1;
+
+		// submit builds
 		{
-		  if (!io.WantCaptureMouse)
-			m_camera.mouseProcess(events); // don't capture the events, only let camera handle them with its impl
+			auto completed = m_device->createSemaphore(startValue);
 
-		  for (const auto& e : events) // here capture
-		  {
-			if (e.timeStamp < previousEventTimestamp)
-			  continue;
+			std::array<IQueue::SSubmitInfo::SSemaphoreInfo, 1u> signals;
+			{
+				auto& signal = signals.front();
+				signal.value = finishedValue;
+				signal.stageMask = bitflag(PIPELINE_STAGE_FLAGS::ALL_TRANSFER_BITS);
+				signal.semaphore = completed.get();
+			}
 
-			previousEventTimestamp = e.timeStamp;
-			capturedEvents.mouse.emplace_back(e);
+			const IQueue::SSubmitInfo::SCommandBufferInfo commandBuffers[1] = { {
+			  .cmdbuf = cmdbuf.get()
+			} };
 
-		  }
-		}, m_logger.get());
+			const IQueue::SSubmitInfo infos[] =
+			{
+			  {
+				.waitSemaphores = {},
+				.commandBuffers = commandBuffers,
+				.signalSemaphores = signals
+			  }
+			};
 
-	  m_keyboard.consumeEvents([&](const IKeyboardEventChannel::range_t& events) -> void
-		{
-		  if (!io.WantCaptureKeyboard)
-			  m_camera.keyboardProcess(events); // don't capture the events, only let camera handle them with its impl
+			if (queue->submit(infos) != IQueue::RESULT::SUCCESS)
+			{
+				m_logger->log("Failed to submit geometry transfer upload operations!", ILogger::ELL_ERROR);
+				return;
+			}
 
-		  for (const auto& e : events) // here capture
-		  {
-			if (e.timeStamp < previousEventTimestamp)
-			  continue;
+			const ISemaphore::SWaitInfo info[] =
+			{ {
+			  .semaphore = completed.get(),
+			  .value = finishedValue
+			} };
 
-			previousEventTimestamp = e.timeStamp;
-			capturedEvents.keyboard.emplace_back(e);
-		  }
-		}, m_logger.get());
+			m_device->blockForSemaphores(info);
+		}
+	}
 
+	bool createIndirectBuffer(video::CThreadSafeQueueAdapter* queue)
+	{
+		const auto getBufferRangeAddress = [](const SBufferRange<IGPUBuffer>& range)
+			{
+				return range.buffer->getDeviceAddress() + range.offset;
+			};
+		const auto command = TraceRaysIndirectCommand_t{
+		  .raygenShaderRecordAddress = getBufferRangeAddress(m_shaderBindingTable.raygenGroupRange),
+		  .raygenShaderRecordSize = m_shaderBindingTable.raygenGroupRange.size,
+		  .missShaderBindingTableAddress = getBufferRangeAddress(m_shaderBindingTable.missGroupsRange),
+		  .missShaderBindingTableSize = m_shaderBindingTable.missGroupsRange.size,
+		  .missShaderBindingTableStride = m_shaderBindingTable.missGroupsStride,
+		  .hitShaderBindingTableAddress = getBufferRangeAddress(m_shaderBindingTable.hitGroupsRange),
+		  .hitShaderBindingTableSize = m_shaderBindingTable.hitGroupsRange.size,
+		  .hitShaderBindingTableStride = m_shaderBindingTable.hitGroupsStride,
+		  .callableShaderBindingTableAddress = getBufferRangeAddress(m_shaderBindingTable.callableGroupsRange),
+		  .callableShaderBindingTableSize = m_shaderBindingTable.callableGroupsRange.size,
+		  .callableShaderBindingTableStride = m_shaderBindingTable.callableGroupsStride,
+		  .width = WIN_W,
+		  .height = WIN_H,
+		  .depth = 1,
+		};
+		IGPUBuffer::SCreationParams params;
+		params.usage = IGPUBuffer::EUF_TRANSFER_DST_BIT | IGPUBuffer::EUF_INDIRECT_BUFFER_BIT | IGPUBuffer::EUF_SHADER_DEVICE_ADDRESS_BIT;
+		params.size = sizeof(TraceRaysIndirectCommand_t);
+		m_utils->createFilledDeviceLocalBufferOnDedMem(SIntendedSubmitInfo{ .queue = queue }, std::move(params), &command).move_into(m_indirectBuffer);
+		return true;
 	}
-	m_camera.endInputProcessing(nextPresentationTimestamp);
 
-	const core::SRange<const nbl::ui::SMouseEvent> mouseEvents(capturedEvents.mouse.data(), capturedEvents.mouse.data() + capturedEvents.mouse.size());
-	const core::SRange<const nbl::ui::SKeyboardEvent> keyboardEvents(capturedEvents.keyboard.data(), capturedEvents.keyboard.data() + capturedEvents.keyboard.size());
-	const auto cursorPosition = m_window->getCursorControl()->getPosition();
-	const auto mousePosition = float32_t2(cursorPosition.x, cursorPosition.y) - float32_t2(m_window->getX(), m_window->getY());
+	void calculateRayTracingStackSize(const smart_refctd_ptr<video::IGPURayTracingPipeline>& pipeline)
+	{
+		const auto raygenStackSize = pipeline->getRaygenStackSize();
+		auto getMaxSize = [&](auto ranges, auto valProj) -> uint16_t
+			{
+				auto maxValue = 0;
+				for (const auto& val : ranges)
+				{
+					maxValue = std::max<uint16_t>(maxValue, std::invoke(valProj, val));
+				}
+				return maxValue;
+			};
+
+		const auto closestHitStackMax = getMaxSize(pipeline->getHitStackSizes(), &IGPURayTracingPipeline::SHitGroupStackSize::closestHit);
+		const auto anyHitStackMax = getMaxSize(pipeline->getHitStackSizes(), &IGPURayTracingPipeline::SHitGroupStackSize::anyHit);
+		const auto intersectionStackMax = getMaxSize(pipeline->getHitStackSizes(), &IGPURayTracingPipeline::SHitGroupStackSize::intersection);
+		const auto missStackMax = getMaxSize(pipeline->getMissStackSizes(), std::identity{});
+		const auto callableStackMax = getMaxSize(pipeline->getCallableStackSizes(), std::identity{});
+		auto firstDepthStackSizeMax = std::max(closestHitStackMax, missStackMax);
+		firstDepthStackSizeMax = std::max<uint16_t>(firstDepthStackSizeMax, intersectionStackMax + anyHitStackMax);
+		m_rayTracingStackSize = raygenStackSize + std::max(firstDepthStackSizeMax, callableStackMax);
+	}
 
-	const ext::imgui::UI::SUpdateParameters params =
+	bool createShaderBindingTable(video::CThreadSafeQueueAdapter* queue, const smart_refctd_ptr<video::IGPURayTracingPipeline>& pipeline)
 	{
-	  .mousePosition = mousePosition,
-	  .displaySize = { m_window->getWidth(), m_window->getHeight() },
-	  .mouseEvents = mouseEvents,
-	  .keyboardEvents = keyboardEvents
-	};
+		const auto& limits = m_device->getPhysicalDevice()->getLimits();
+		const auto handleSize = SPhysicalDeviceLimits::ShaderGroupHandleSize;
+		const auto handleSizeAligned = nbl::core::alignUp(handleSize, limits.shaderGroupHandleAlignment);
 
-	m_ui.manager->update(params);
-  }
+		auto& raygenRange = m_shaderBindingTable.raygenGroupRange;
 
-  inline bool keepRunning() override
-  {
-	if (m_surface->irrecoverable())
-	  return false;
+		auto& hitRange = m_shaderBindingTable.hitGroupsRange;
+		const auto hitHandles = pipeline->getHitHandles();
 
-	return true;
-  }
+		auto& missRange = m_shaderBindingTable.missGroupsRange;
+		const auto missHandles = pipeline->getMissHandles();
 
-  inline bool onAppTerminated() override
-  {
-	return device_base_t::onAppTerminated();
-  }
+		auto& callableRange = m_shaderBindingTable.callableGroupsRange;
+		const auto callableHandles = pipeline->getCallableHandles();
 
-private:
-  uint32_t getWorkgroupCount(uint32_t dim, uint32_t size)
-  {
-	return (dim + size - 1) / size;
-  }
+		raygenRange = {
+		  .offset = 0,
+		  .size = core::alignUp(handleSizeAligned, limits.shaderGroupBaseAlignment)
+		};
 
-  smart_refctd_ptr<IGPUBuffer> createBuffer(IGPUBuffer::SCreationParams& params)
-  {
-	smart_refctd_ptr<IGPUBuffer> buffer;
-	buffer = m_device->createBuffer(std::move(params));
-	auto bufReqs = buffer->getMemoryReqs();
-	bufReqs.memoryTypeBits &= m_physicalDevice->getDeviceLocalMemoryTypeBits();
-	m_device->allocate(bufReqs, buffer.get(), IDeviceMemoryAllocation::EMAF_DEVICE_ADDRESS_BIT);
+		missRange = {
+		  .offset = raygenRange.size,
+		  .size = core::alignUp(missHandles.size() * handleSizeAligned, limits.shaderGroupBaseAlignment),
+		};
+		m_shaderBindingTable.missGroupsStride = handleSizeAligned;
 
-	return buffer;
-  }
+		hitRange = {
+		  .offset = missRange.offset + missRange.size,
+		  .size = core::alignUp(hitHandles.size() * handleSizeAligned, limits.shaderGroupBaseAlignment),
+		};
+		m_shaderBindingTable.hitGroupsStride = handleSizeAligned;
 
-  smart_refctd_ptr<IGPUCommandBuffer> getSingleUseCommandBufferAndBegin(smart_refctd_ptr<IGPUCommandPool> pool)
-  {
-	smart_refctd_ptr<IGPUCommandBuffer> cmdbuf;
-	if (!pool->createCommandBuffers(IGPUCommandPool::BUFFER_LEVEL::PRIMARY, 1u, &cmdbuf))
-	  return nullptr;
+		callableRange = {
+		  .offset = hitRange.offset + hitRange.size,
+		  .size = core::alignUp(callableHandles.size() * handleSizeAligned, limits.shaderGroupBaseAlignment),
+		};
+		m_shaderBindingTable.callableGroupsStride = handleSizeAligned;
 
-	cmdbuf->reset(IGPUCommandBuffer::RESET_FLAGS::RELEASE_RESOURCES_BIT);
-	cmdbuf->begin(IGPUCommandBuffer::USAGE::ONE_TIME_SUBMIT_BIT);
+		const auto bufferSize = raygenRange.size + missRange.size + hitRange.size + callableRange.size;
 
-	return cmdbuf;
-  }
+		ICPUBuffer::SCreationParams cpuBufferParams;
+		cpuBufferParams.size = bufferSize;
+		auto cpuBuffer = ICPUBuffer::create(std::move(cpuBufferParams));
+		uint8_t* pData = reinterpret_cast<uint8_t*>(cpuBuffer->getPointer());
 
-  void cmdbufSubmitAndWait(smart_refctd_ptr<IGPUCommandBuffer> cmdbuf, CThreadSafeQueueAdapter* queue, uint64_t startValue)
-  {
-	cmdbuf->end();
+		// copy raygen region
+		memcpy(pData, &pipeline->getRaygen(), handleSize);
 
-	uint64_t finishedValue = startValue + 1;
+		// copy miss region
+		uint8_t* pMissData = pData + missRange.offset;
+		for (const auto& handle : missHandles)
+		{
+			memcpy(pMissData, &handle, handleSize);
+			pMissData += m_shaderBindingTable.missGroupsStride;
+		}
 
-	// submit builds
-	{
-	  auto completed = m_device->createSemaphore(startValue);
-
-	  std::array<IQueue::SSubmitInfo::SSemaphoreInfo, 1u> signals;
-	  {
-		auto& signal = signals.front();
-		signal.value = finishedValue;
-		signal.stageMask = bitflag(PIPELINE_STAGE_FLAGS::ALL_TRANSFER_BITS);
-		signal.semaphore = completed.get();
-	  }
-
-	  const IQueue::SSubmitInfo::SCommandBufferInfo commandBuffers[1] = { {
-		.cmdbuf = cmdbuf.get()
-	  } };
-
-	  const IQueue::SSubmitInfo infos[] =
-	  {
+		// copy hit region
+		uint8_t* pHitData = pData + hitRange.offset;
+		for (const auto& handle : hitHandles)
 		{
-		  .waitSemaphores = {},
-		  .commandBuffers = commandBuffers,
-		  .signalSemaphores = signals
+			memcpy(pHitData, &handle, handleSize);
+			pHitData += m_shaderBindingTable.hitGroupsStride;
 		}
-	  };
 
-	  if (queue->submit(infos) != IQueue::RESULT::SUCCESS)
-	  {
-		m_logger->log("Failed to submit geometry transfer upload operations!", ILogger::ELL_ERROR);
-		return;
-	  }
+		// copy callable region
+		uint8_t* pCallableData = pData + callableRange.offset;
+		for (const auto& handle : callableHandles)
+		{
+			memcpy(pCallableData, &handle, handleSize);
+			pCallableData += m_shaderBindingTable.callableGroupsStride;
+		}
 
-	  const ISemaphore::SWaitInfo info[] =
-	  { {
-		.semaphore = completed.get(),
-		.value = finishedValue
-	  } };
+		{
+			IGPUBuffer::SCreationParams params;
+			params.usage = IGPUBuffer::EUF_TRANSFER_DST_BIT | IGPUBuffer::EUF_INLINE_UPDATE_VIA_CMDBUF | IGPUBuffer::EUF_SHADER_DEVICE_ADDRESS_BIT | IGPUBuffer::EUF_SHADER_BINDING_TABLE_BIT;
+			params.size = bufferSize;
+			m_utils->createFilledDeviceLocalBufferOnDedMem(SIntendedSubmitInfo{ .queue = queue }, std::move(params), pData).move_into(raygenRange.buffer);
+			missRange.buffer = core::smart_refctd_ptr(raygenRange.buffer);
+			hitRange.buffer = core::smart_refctd_ptr(raygenRange.buffer);
+			callableRange.buffer = core::smart_refctd_ptr(raygenRange.buffer);
+		}
 
-	  m_device->blockForSemaphores(info);
+		return true;
 	}
-  }
-
-  bool createIndirectBuffer(video::CThreadSafeQueueAdapter* queue)
-  {
-	const auto getBufferRangeAddress = [](const SBufferRange<IGPUBuffer>& range)
-	  {
-		return range.buffer->getDeviceAddress() + range.offset;
-	  };
-	const auto command = TraceRaysIndirectCommand_t{
-	  .raygenShaderRecordAddress = getBufferRangeAddress(m_shaderBindingTable.raygenGroupRange),
-	  .raygenShaderRecordSize = m_shaderBindingTable.raygenGroupRange.size,
-	  .missShaderBindingTableAddress = getBufferRangeAddress(m_shaderBindingTable.missGroupsRange),
-	  .missShaderBindingTableSize = m_shaderBindingTable.missGroupsRange.size,
-	  .missShaderBindingTableStride = m_shaderBindingTable.missGroupsStride,
-	  .hitShaderBindingTableAddress = getBufferRangeAddress(m_shaderBindingTable.hitGroupsRange),
-	  .hitShaderBindingTableSize = m_shaderBindingTable.hitGroupsRange.size,
-	  .hitShaderBindingTableStride = m_shaderBindingTable.hitGroupsStride,
-	  .callableShaderBindingTableAddress = getBufferRangeAddress(m_shaderBindingTable.callableGroupsRange),
-	  .callableShaderBindingTableSize = m_shaderBindingTable.callableGroupsRange.size,
-	  .callableShaderBindingTableStride = m_shaderBindingTable.callableGroupsStride,
-	  .width = WIN_W,
-	  .height = WIN_H,
-	  .depth = 1,
-	};
-	IGPUBuffer::SCreationParams params;
-	params.usage = IGPUBuffer::EUF_TRANSFER_DST_BIT | IGPUBuffer::EUF_INDIRECT_BUFFER_BIT | IGPUBuffer::EUF_SHADER_DEVICE_ADDRESS_BIT;
-	params.size = sizeof(TraceRaysIndirectCommand_t);
-	m_utils->createFilledDeviceLocalBufferOnDedMem(SIntendedSubmitInfo{ .queue = queue }, std::move(params), &command).move_into(m_indirectBuffer);
-	return true;
-  }
-
-  void calculateRayTracingStackSize(const smart_refctd_ptr<video::IGPURayTracingPipeline>& pipeline)
-  {
-	const auto raygenStackSize = pipeline->getRaygenStackSize();
-	auto getMaxSize = [&](auto ranges, auto valProj) -> uint16_t
-	  {
-		auto maxValue = 0;
-		for (const auto& val : ranges)
+
+#ifdef TEST_ASSET_CONV_AS
+	bool createAccelerationStructuresFromGeometry(video::CThreadSafeQueueAdapter* queue, const IGeometryCreator* gc)
+	{
+		// get geometries into ICPUBuffers
+		auto pool = m_device->createCommandPool(queue->getFamilyIndex(), IGPUCommandPool::CREATE_FLAGS::RESET_COMMAND_BUFFER_BIT);
+		if (!pool)
+			return logFail("Couldn't create Command Pool for geometry creation!");
+
+		const auto defaultMaterial = Material{
+		  .ambient = {0.2, 0.1, 0.1},
+		  .diffuse = {0.8, 0.3, 0.3},
+		  .specular = {0.8, 0.8, 0.8},
+		  .shininess = 1.0f,
+		  .alpha = 1.0f,
+		};
+
+		auto getTranslationMatrix = [](float32_t x, float32_t y, float32_t z)
+			{
+				core::matrix3x4SIMD transform;
+				transform.setTranslation(nbl::core::vectorSIMDf(x, y, z, 0));
+				return transform;
+			};
+
+		core::matrix3x4SIMD planeTransform;
+		planeTransform.setRotation(quaternion::fromAngleAxis(core::radians(-90.0f), vector3df_SIMD{ 1, 0, 0 }));
+
+		// triangles geometries
+		const auto cpuObjects = std::array{
+		  ReferenceObjectCpu {
+			.meta = {.type = OT_RECTANGLE, .name = "Plane Mesh"},
+			.data = gc->createRectangleMesh(nbl::core::vector2df_SIMD(10, 10)),
+			.material = defaultMaterial,
+			.transform = planeTransform,
+		  },
+		  ReferenceObjectCpu {
+			.meta = {.type = OT_CUBE, .name = "Cube Mesh"},
+			.data = gc->createCubeMesh(nbl::core::vector3df(1, 1, 1)),
+			.material = defaultMaterial,
+			.transform = getTranslationMatrix(0, 0.5f, 0),
+		  },
+		  ReferenceObjectCpu {
+			.meta = {.type = OT_CUBE, .name = "Cube Mesh 2"},
+			.data = gc->createCubeMesh(nbl::core::vector3df(1.5, 1.5, 1.5)),
+			.material = Material{
+			  .ambient = {0.1, 0.1, 0.2},
+			  .diffuse = {0.2, 0.2, 0.8},
+			  .specular = {0.8, 0.8, 0.8},
+			  .shininess = 1.0f,
+			},
+			.transform = getTranslationMatrix(-5.0f, 1.0f, 0),
+		  },
+		  ReferenceObjectCpu {
+			.meta = {.type = OT_CUBE, .name = "Transparent Cube Mesh"},
+			.data = gc->createCubeMesh(nbl::core::vector3df(1.5, 1.5, 1.5)),
+			.material = Material{
+			  .ambient = {0.1, 0.2, 0.1},
+			  .diffuse = {0.2, 0.8, 0.2},
+			  .specular = {0.8, 0.8, 0.8},
+			  .shininess = 1.0f,
+			  .alpha = 0.2,
+			},
+			.transform = getTranslationMatrix(5.0f, 1.0f, 0),
+		  },
+		};
+
+		struct CPUTriBufferBindings
 		{
-		  maxValue = std::max<uint16_t>(maxValue, std::invoke(valProj, val));
-		}
-		return maxValue;
-	  };
-
-	const auto closestHitStackMax = getMaxSize(pipeline->getHitStackSizes(), &IGPURayTracingPipeline::SHitGroupStackSize::closestHit);
-	const auto anyHitStackMax = getMaxSize(pipeline->getHitStackSizes(), &IGPURayTracingPipeline::SHitGroupStackSize::anyHit);
-	const auto intersectionStackMax = getMaxSize(pipeline->getHitStackSizes(), &IGPURayTracingPipeline::SHitGroupStackSize::intersection);
-	const auto missStackMax = getMaxSize(pipeline->getMissStackSizes(), std::identity{});
-	const auto callableStackMax = getMaxSize(pipeline->getCallableStackSizes(), std::identity{});
-	auto firstDepthStackSizeMax = std::max(closestHitStackMax, missStackMax);
-	firstDepthStackSizeMax = std::max<uint16_t>(firstDepthStackSizeMax, intersectionStackMax + anyHitStackMax);
-	m_rayTracingStackSize = raygenStackSize + std::max(firstDepthStackSizeMax, callableStackMax);
-  }
-
-  bool createShaderBindingTable(video::CThreadSafeQueueAdapter* queue, const smart_refctd_ptr<video::IGPURayTracingPipeline>& pipeline)
-  {
-	const auto& limits = m_device->getPhysicalDevice()->getLimits();
-	const auto handleSize = SPhysicalDeviceLimits::ShaderGroupHandleSize;
-	const auto handleSizeAligned = nbl::core::alignUp(handleSize, limits.shaderGroupHandleAlignment);
-
-	auto& raygenRange = m_shaderBindingTable.raygenGroupRange;
-
-	auto& hitRange = m_shaderBindingTable.hitGroupsRange;
-	const auto hitHandles = pipeline->getHitHandles();
-
-	auto& missRange = m_shaderBindingTable.missGroupsRange;
-	const auto missHandles = pipeline->getMissHandles();
-
-	auto& callableRange = m_shaderBindingTable.callableGroupsRange;
-	const auto callableHandles = pipeline->getCallableHandles();
-
-	raygenRange = {
-	  .offset = 0,
-	  .size = core::alignUp(handleSizeAligned, limits.shaderGroupBaseAlignment)
-	};
+			nbl::asset::SBufferBinding<ICPUBuffer> vertex, index;
+		};
+		std::array<CPUTriBufferBindings, std::size(cpuObjects)> cpuTriBuffers;
 
-	missRange = {
-	  .offset = raygenRange.size,
-	  .size = core::alignUp(missHandles.size() * handleSizeAligned, limits.shaderGroupBaseAlignment),
-	};
-	m_shaderBindingTable.missGroupsStride = handleSizeAligned;
+		for (uint32_t i = 0; i < cpuObjects.size(); i++)
+		{
+			const auto& cpuObject = cpuObjects[i];
 
-	hitRange = {
-	  .offset = missRange.offset + missRange.size,
-	  .size = core::alignUp(hitHandles.size() * handleSizeAligned, limits.shaderGroupBaseAlignment),
-	};
-	m_shaderBindingTable.hitGroupsStride = handleSizeAligned;
+			auto vBuffer = smart_refctd_ptr(cpuObject.data.bindings[0].buffer); // no offset
+			auto vUsage = bitflag(IGPUBuffer::EUF_STORAGE_BUFFER_BIT) | IGPUBuffer::EUF_TRANSFER_DST_BIT | IGPUBuffer::EUF_INLINE_UPDATE_VIA_CMDBUF |
+				IGPUBuffer::EUF_ACCELERATION_STRUCTURE_BUILD_INPUT_READ_ONLY_BIT | IGPUBuffer::EUF_SHADER_DEVICE_ADDRESS_BIT;
+			vBuffer->addUsageFlags(vUsage);
+			vBuffer->setContentHash(vBuffer->computeContentHash());
 
-	callableRange = {
-	  .offset = hitRange.offset + hitRange.size,
-	  .size = core::alignUp(callableHandles.size() * handleSizeAligned, limits.shaderGroupBaseAlignment),
-	};
-	m_shaderBindingTable.callableGroupsStride = handleSizeAligned;
+			auto iBuffer = smart_refctd_ptr(cpuObject.data.indexBuffer.buffer); // no offset
+			auto iUsage = bitflag(IGPUBuffer::EUF_STORAGE_BUFFER_BIT) | IGPUBuffer::EUF_TRANSFER_DST_BIT | IGPUBuffer::EUF_INLINE_UPDATE_VIA_CMDBUF |
+				IGPUBuffer::EUF_ACCELERATION_STRUCTURE_BUILD_INPUT_READ_ONLY_BIT | IGPUBuffer::EUF_SHADER_DEVICE_ADDRESS_BIT;
 
-	const auto bufferSize = raygenRange.size + missRange.size + hitRange.size + callableRange.size;
+			if (cpuObject.data.indexType != EIT_UNKNOWN)
+				if (iBuffer)
+				{
+					iBuffer->addUsageFlags(iUsage);
+					iBuffer->setContentHash(iBuffer->computeContentHash());
+				}
 
-	ICPUBuffer::SCreationParams cpuBufferParams;
-	cpuBufferParams.size = bufferSize;
-	auto cpuBuffer = ICPUBuffer::create(std::move(cpuBufferParams));
-	uint8_t* pData = reinterpret_cast<uint8_t*>(cpuBuffer->getPointer());
+			cpuTriBuffers[i] = {
+			  .vertex = {.offset = 0, .buffer = vBuffer},
+			  .index = {.offset = 0, .buffer = iBuffer},
+			};
 
-	// copy raygen region
-	memcpy(pData, &pipeline->getRaygen(), handleSize);
+		}
 
-	// copy miss region
-	uint8_t* pMissData = pData + missRange.offset;
-	for (const auto& handle : missHandles)
-	{
-	  memcpy(pMissData, &handle, handleSize);
-	  pMissData += m_shaderBindingTable.missGroupsStride;
-	}
+		// procedural geometries
+		using Aabb = IGPUBottomLevelAccelerationStructure::AABB_t;
 
-	// copy hit region
-	uint8_t* pHitData = pData + hitRange.offset;
-	for (const auto& handle : hitHandles)
-	{
-	  memcpy(pHitData, &handle, handleSize);
-	  pHitData += m_shaderBindingTable.hitGroupsStride;
-	}
+		smart_refctd_ptr<ICPUBuffer> cpuProcBuffer;
+		{
+			ICPUBuffer::SCreationParams params;
+			params.size = NumberOfProceduralGeometries * sizeof(Aabb);
+			cpuProcBuffer = ICPUBuffer::create(std::move(params));
+		}
 
-	// copy callable region
-	uint8_t* pCallableData = pData + callableRange.offset;
-	for (const auto& handle : callableHandles)
-	{
-	  memcpy(pCallableData, &handle, handleSize);
-	  pCallableData += m_shaderBindingTable.callableGroupsStride;
-	}
+		core::vector<SProceduralGeomInfo> proceduralGeoms;
+		proceduralGeoms.reserve(NumberOfProceduralGeometries);
+		auto proceduralGeometries = reinterpret_cast<Aabb*>(cpuProcBuffer->getPointer());
+		for (int32_t i = 0; i < NumberOfProceduralGeometries; i++)
+		{
+			const auto middle_i = NumberOfProceduralGeometries / 2.0;
+			SProceduralGeomInfo sphere = {
+			  .material = hlsl::_static_cast<MaterialPacked>(Material{
+				.ambient = {0.1, 0.05 * i, 0.1},
+				.diffuse = {0.3, 0.2 * i, 0.3},
+				.specular = {0.8, 0.8, 0.8},
+				.shininess = 1.0f,
+			  }),
+			  .center = float32_t3((i - middle_i) * 4.0, 2, 5.0),
+			  .radius = 1,
+			};
+
+			proceduralGeoms.push_back(sphere);
+			const auto sphereMin = sphere.center - sphere.radius;
+			const auto sphereMax = sphere.center + sphere.radius;
+			proceduralGeometries[i] = {
+				vector3d(sphereMin.x, sphereMin.y, sphereMin.z),
+				vector3d(sphereMax.x, sphereMax.y, sphereMax.z)
+			};
+		}
 
-	{
-	  IGPUBuffer::SCreationParams params;
-	  params.usage = IGPUBuffer::EUF_TRANSFER_DST_BIT | IGPUBuffer::EUF_INLINE_UPDATE_VIA_CMDBUF | IGPUBuffer::EUF_SHADER_DEVICE_ADDRESS_BIT | IGPUBuffer::EUF_SHADER_BINDING_TABLE_BIT;
-	  params.size = bufferSize;
-	  m_utils->createFilledDeviceLocalBufferOnDedMem(SIntendedSubmitInfo{ .queue = queue }, std::move(params), pData).move_into(raygenRange.buffer);
-	  missRange.buffer = core::smart_refctd_ptr(raygenRange.buffer);
-	  hitRange.buffer = core::smart_refctd_ptr(raygenRange.buffer);
-	  callableRange.buffer = core::smart_refctd_ptr(raygenRange.buffer);
-	}
+		// get ICPUBuffers into ICPUBLAS
 
-	return true;
-  }
+		// get ICPUBLAS into ICPUTLAS
 
-#ifdef TEST_ASSET_CONV_AS
+		// reserve, convert
+		return true;
+	}
 #else
-  bool createGeometries(video::CThreadSafeQueueAdapter* queue, const IGeometryCreator* gc)
-  {
-	  auto pool = m_device->createCommandPool(queue->getFamilyIndex(), IGPUCommandPool::CREATE_FLAGS::RESET_COMMAND_BUFFER_BIT);
-	  if (!pool)
-		  return logFail("Couldn't create Command Pool for geometry creation!");
-
-	  const auto defaultMaterial = Material{
-		.ambient = {0.2, 0.1, 0.1},
-		.diffuse = {0.8, 0.3, 0.3},
-		.specular = {0.8, 0.8, 0.8},
-		.shininess = 1.0f,
-		.alpha = 1.0f,
-	  };
-
-	  auto getTranslationMatrix = [](float32_t x, float32_t y, float32_t z)
-		  {
-			  core::matrix3x4SIMD transform;
-			  transform.setTranslation(nbl::core::vectorSIMDf(x, y, z, 0));
-			  return transform;
-		  };
-
-	  core::matrix3x4SIMD planeTransform;
-	  planeTransform.setRotation(quaternion::fromAngleAxis(core::radians(-90.0f), vector3df_SIMD{ 1, 0, 0 }));
-
-	  const auto cpuObjects = std::array{
-		ReferenceObjectCpu {
-		  .meta = {.type = OT_RECTANGLE, .name = "Plane Mesh"},
-		  .data = gc->createRectangleMesh(nbl::core::vector2df_SIMD(10, 10)),
-		  .material = defaultMaterial,
-		  .transform = planeTransform,
-		},
-		ReferenceObjectCpu {
-		  .meta = {.type = OT_CUBE, .name = "Cube Mesh"},
-		  .data = gc->createCubeMesh(nbl::core::vector3df(1, 1, 1)),
-		  .material = defaultMaterial,
-		  .transform = getTranslationMatrix(0, 0.5f, 0),
-		},
-		ReferenceObjectCpu {
-		  .meta = {.type = OT_CUBE, .name = "Cube Mesh 2"},
-		  .data = gc->createCubeMesh(nbl::core::vector3df(1.5, 1.5, 1.5)),
-		  .material = Material{
-			.ambient = {0.1, 0.1, 0.2},
-			.diffuse = {0.2, 0.2, 0.8},
-			.specular = {0.8, 0.8, 0.8},
-			.shininess = 1.0f,
+	bool createGeometries(video::CThreadSafeQueueAdapter* queue, const IGeometryCreator* gc)
+	{
+		auto pool = m_device->createCommandPool(queue->getFamilyIndex(), IGPUCommandPool::CREATE_FLAGS::RESET_COMMAND_BUFFER_BIT);
+		if (!pool)
+			return logFail("Couldn't create Command Pool for geometry creation!");
+
+		const auto defaultMaterial = Material{
+		  .ambient = {0.2, 0.1, 0.1},
+		  .diffuse = {0.8, 0.3, 0.3},
+		  .specular = {0.8, 0.8, 0.8},
+		  .shininess = 1.0f,
+		  .alpha = 1.0f,
+		};
+
+		auto getTranslationMatrix = [](float32_t x, float32_t y, float32_t z)
+			{
+				core::matrix3x4SIMD transform;
+				transform.setTranslation(nbl::core::vectorSIMDf(x, y, z, 0));
+				return transform;
+			};
+
+		core::matrix3x4SIMD planeTransform;
+		planeTransform.setRotation(quaternion::fromAngleAxis(core::radians(-90.0f), vector3df_SIMD{ 1, 0, 0 }));
+
+		const auto cpuObjects = std::array{
+		  ReferenceObjectCpu {
+			.meta = {.type = OT_RECTANGLE, .name = "Plane Mesh"},
+			.data = gc->createRectangleMesh(nbl::core::vector2df_SIMD(10, 10)),
+			.material = defaultMaterial,
+			.transform = planeTransform,
 		  },
-		  .transform = getTranslationMatrix(-5.0f, 1.0f, 0),
-		},
-		ReferenceObjectCpu {
-		  .meta = {.type = OT_CUBE, .name = "Transparent Cube Mesh"},
-		  .data = gc->createCubeMesh(nbl::core::vector3df(1.5, 1.5, 1.5)),
-		  .material = Material{
-			.ambient = {0.1, 0.2, 0.1},
-			.diffuse = {0.2, 0.8, 0.2},
-			.specular = {0.8, 0.8, 0.8},
-			.shininess = 1.0f,
-			.alpha = 0.2,
+		  ReferenceObjectCpu {
+			.meta = {.type = OT_CUBE, .name = "Cube Mesh"},
+			.data = gc->createCubeMesh(nbl::core::vector3df(1, 1, 1)),
+			.material = defaultMaterial,
+			.transform = getTranslationMatrix(0, 0.5f, 0),
 		  },
-		  .transform = getTranslationMatrix(5.0f, 1.0f, 0),
-		},
-	  };
-
-	  struct ScratchVIBindings
-	  {
-		  nbl::asset::SBufferBinding<ICPUBuffer> vertex, index;
-	  };
-	  std::array<ScratchVIBindings, std::size(cpuObjects)> scratchBuffers;
-
-	  for (uint32_t i = 0; i < cpuObjects.size(); i++)
-	  {
-		  const auto& cpuObject = cpuObjects[i];
-
-		  auto vBuffer = smart_refctd_ptr(cpuObject.data.bindings[0].buffer); // no offset
-		  auto vUsage = bitflag(IGPUBuffer::EUF_STORAGE_BUFFER_BIT) | IGPUBuffer::EUF_TRANSFER_DST_BIT | IGPUBuffer::EUF_INLINE_UPDATE_VIA_CMDBUF |
-			  IGPUBuffer::EUF_ACCELERATION_STRUCTURE_BUILD_INPUT_READ_ONLY_BIT | IGPUBuffer::EUF_SHADER_DEVICE_ADDRESS_BIT;
-		  vBuffer->addUsageFlags(vUsage);
-		  vBuffer->setContentHash(vBuffer->computeContentHash());
-
-		  auto iBuffer = smart_refctd_ptr(cpuObject.data.indexBuffer.buffer); // no offset
-		  auto iUsage = bitflag(IGPUBuffer::EUF_STORAGE_BUFFER_BIT) | IGPUBuffer::EUF_TRANSFER_DST_BIT | IGPUBuffer::EUF_INLINE_UPDATE_VIA_CMDBUF |
-			  IGPUBuffer::EUF_ACCELERATION_STRUCTURE_BUILD_INPUT_READ_ONLY_BIT | IGPUBuffer::EUF_SHADER_DEVICE_ADDRESS_BIT;
-
-		  if (cpuObject.data.indexType != EIT_UNKNOWN)
-			  if (iBuffer)
-			  {
-				  iBuffer->addUsageFlags(iUsage);
-				  iBuffer->setContentHash(iBuffer->computeContentHash());
-			  }
+		  ReferenceObjectCpu {
+			.meta = {.type = OT_CUBE, .name = "Cube Mesh 2"},
+			.data = gc->createCubeMesh(nbl::core::vector3df(1.5, 1.5, 1.5)),
+			.material = Material{
+			  .ambient = {0.1, 0.1, 0.2},
+			  .diffuse = {0.2, 0.2, 0.8},
+			  .specular = {0.8, 0.8, 0.8},
+			  .shininess = 1.0f,
+			},
+			.transform = getTranslationMatrix(-5.0f, 1.0f, 0),
+		  },
+		  ReferenceObjectCpu {
+			.meta = {.type = OT_CUBE, .name = "Transparent Cube Mesh"},
+			.data = gc->createCubeMesh(nbl::core::vector3df(1.5, 1.5, 1.5)),
+			.material = Material{
+			  .ambient = {0.1, 0.2, 0.1},
+			  .diffuse = {0.2, 0.8, 0.2},
+			  .specular = {0.8, 0.8, 0.8},
+			  .shininess = 1.0f,
+			  .alpha = 0.2,
+			},
+			.transform = getTranslationMatrix(5.0f, 1.0f, 0),
+		  },
+		};
 
-		  scratchBuffers[i] = {
-			.vertex = {.offset = 0, .buffer = vBuffer},
-			.index = {.offset = 0, .buffer = iBuffer},
-		  };
-
-	  }
-
-	  auto cmdbuf = getSingleUseCommandBufferAndBegin(pool);
-	  cmdbuf->beginDebugMarker("Build geometry vertex and index buffers");
-
-	  CAssetConverter::SInputs inputs = {};
-	  inputs.logger = m_logger.get();
-	  std::array<ICPUBuffer*, std::size(cpuObjects) * 2u> tmpBuffers;
-	  {
-		  for (uint32_t i = 0; i < cpuObjects.size(); i++)
-		  {
-			  tmpBuffers[2 * i + 0] = scratchBuffers[i].vertex.buffer.get();
-			  tmpBuffers[2 * i + 1] = scratchBuffers[i].index.buffer.get();
-		  }
-
-		  std::get<CAssetConverter::SInputs::asset_span_t<ICPUBuffer>>(inputs.assets) = tmpBuffers;
-	  }
-
-	  auto reservation = m_converter->reserve(inputs);
-	  {
-		  auto prepass = [&]<typename asset_type_t>(const auto & references) -> bool
-		  {
-			  auto objects = reservation.getGPUObjects<asset_type_t>();
-			  uint32_t counter = {};
-			  for (auto& object : objects)
-			  {
-				  auto gpu = object.value;
-				  auto* reference = references[counter];
+		struct ScratchVIBindings
+		{
+			nbl::asset::SBufferBinding<ICPUBuffer> vertex, index;
+		};
+		std::array<ScratchVIBindings, std::size(cpuObjects)> scratchBuffers;
 
-				  if (reference)
-				  {
-					  if (!gpu)
-					  {
-						  m_logger->log("Failed to convert a CPU object to GPU!", ILogger::ELL_ERROR);
-						  return false;
-					  }
-				  }
-				  counter++;
-			  }
-			  return true;
-		  };
-
-		  prepass.template operator() < ICPUBuffer > (tmpBuffers);
-	  }
-
-	  auto geomInfoBuffer = ICPUBuffer::create({ std::size(cpuObjects) * sizeof(STriangleGeomInfo) });
-	  STriangleGeomInfo* geomInfos = reinterpret_cast<STriangleGeomInfo*>(geomInfoBuffer->getPointer());
-
-	  m_gpuTriangleGeometries.reserve(std::size(cpuObjects));
-	  // convert
-	  {
-		  // not sure if need this (probably not, originally for transition img view)
-		  auto semaphore = m_device->createSemaphore(0u);
-
-		  std::array<IQueue::SSubmitInfo::SCommandBufferInfo, 1> cmdbufs = {};
-		  cmdbufs.front().cmdbuf = cmdbuf.get();
-
-		  SIntendedSubmitInfo transfer = {};
-		  transfer.queue = queue;
-		  transfer.scratchCommandBuffers = cmdbufs;
-		  transfer.scratchSemaphore = {
-			.semaphore = semaphore.get(),
-			.value = 0u,
-			.stageMask = PIPELINE_STAGE_FLAGS::ALL_TRANSFER_BITS
-		  };
-
-		  CAssetConverter::SConvertParams params = {};
-		  params.utilities = m_utils.get();
-		  params.transfer = &transfer;
-
-		  auto future = reservation.convert(params);
-		  if (future.copy() != IQueue::RESULT::SUCCESS)
-		  {
-			  m_logger->log("Failed to await submission feature!", ILogger::ELL_ERROR);
-			  return false;
-		  }
-
-		  auto&& buffers = reservation.getGPUObjects<ICPUBuffer>();
-		  for (uint32_t i = 0; i < cpuObjects.size(); i++)
-		  {
-			  auto& cpuObject = cpuObjects[i];
-
-			  m_gpuTriangleGeometries.push_back(ReferenceObjectGpu{
-				.meta = cpuObject.meta,
-				.bindings = {
-				  .vertex = {.offset = 0, .buffer = buffers[2 * i + 0].value },
-				  .index = {.offset = 0, .buffer = buffers[2 * i + 1].value },
-				},
-				.vertexStride = cpuObject.data.inputParams.bindings[0].stride,
-				.indexType = cpuObject.data.indexType,
-				.indexCount = cpuObject.data.indexCount,
-				.material = hlsl::_static_cast<MaterialPacked>(cpuObject.material),
-				.transform = cpuObject.transform,
-				  });
-		  }
-
-		  for (uint32_t i = 0; i < m_gpuTriangleGeometries.size(); i++)
-		  {
-			  const auto& gpuObject = m_gpuTriangleGeometries[i];
-			  const uint64_t vertexBufferAddress = gpuObject.bindings.vertex.buffer->getDeviceAddress();
-			  geomInfos[i] = {
-				.material = gpuObject.material,
-				.vertexBufferAddress = vertexBufferAddress,
-				.indexBufferAddress = gpuObject.useIndex() ? gpuObject.bindings.index.buffer->getDeviceAddress() : vertexBufferAddress,
-				.vertexStride = gpuObject.vertexStride,
-				.objType = gpuObject.meta.type,
-				.indexType = gpuObject.indexType,
-				.smoothNormals = s_smoothNormals[gpuObject.meta.type],
-			  };
-		  }
-	  }
-
-	  {
-		  IGPUBuffer::SCreationParams params;
-		  params.usage = IGPUBuffer::EUF_STORAGE_BUFFER_BIT | IGPUBuffer::EUF_TRANSFER_DST_BIT | IGPUBuffer::EUF_INLINE_UPDATE_VIA_CMDBUF | IGPUBuffer::EUF_SHADER_DEVICE_ADDRESS_BIT;
-		  params.size = geomInfoBuffer->getSize();
-		  m_utils->createFilledDeviceLocalBufferOnDedMem(SIntendedSubmitInfo{ .queue = queue }, std::move(params), geomInfos).move_into(m_triangleGeomInfoBuffer);
-	  }
-
-	  // intersection geometries setup
-	  {
-		  core::vector<SProceduralGeomInfo> proceduralGeoms;
-		  proceduralGeoms.reserve(NumberOfProceduralGeometries);
-		  using Aabb = IGPUBottomLevelAccelerationStructure::AABB_t;
-		  core::vector<Aabb> aabbs;
-		  aabbs.reserve(NumberOfProceduralGeometries);
-		  for (int32_t i = 0; i < NumberOfProceduralGeometries; i++)
-		  {
-			  const auto middle_i = NumberOfProceduralGeometries / 2.0;
-			  SProceduralGeomInfo sphere = {
-				.material = hlsl::_static_cast<MaterialPacked>(Material{
-				  .ambient = {0.1, 0.05 * i, 0.1},
-				  .diffuse = {0.3, 0.2 * i, 0.3},
-				  .specular = {0.8, 0.8, 0.8},
-				  .shininess = 1.0f,
-				}),
-				.center = float32_t3((i - middle_i) * 4.0, 2, 5.0),
-				.radius = 1,
-			  };
-
-			  proceduralGeoms.push_back(sphere);
-			  const auto sphereMin = sphere.center - sphere.radius;
-			  const auto sphereMax = sphere.center + sphere.radius;
-			  aabbs.emplace_back(
-				  vector3d(sphereMin.x, sphereMin.y, sphereMin.z),
-				  vector3d(sphereMax.x, sphereMax.y, sphereMax.z));
-		  }
-
-		  {
-			  IGPUBuffer::SCreationParams params;
-			  params.usage = IGPUBuffer::EUF_STORAGE_BUFFER_BIT | IGPUBuffer::EUF_TRANSFER_DST_BIT | IGPUBuffer::EUF_INLINE_UPDATE_VIA_CMDBUF | IGPUBuffer::EUF_SHADER_DEVICE_ADDRESS_BIT;
-			  params.size = proceduralGeoms.size() * sizeof(SProceduralGeomInfo);
-			  m_utils->createFilledDeviceLocalBufferOnDedMem(SIntendedSubmitInfo{ .queue = queue }, std::move(params), proceduralGeoms.data()).move_into(m_proceduralGeomInfoBuffer);
-		  }
-
-		  {
-			  IGPUBuffer::SCreationParams params;
-			  params.usage = IGPUBuffer::EUF_TRANSFER_DST_BIT | IGPUBuffer::EUF_SHADER_DEVICE_ADDRESS_BIT | IGPUBuffer::EUF_ACCELERATION_STRUCTURE_BUILD_INPUT_READ_ONLY_BIT;
-			  params.size = aabbs.size() * sizeof(Aabb);
-			  m_utils->createFilledDeviceLocalBufferOnDedMem(SIntendedSubmitInfo{ .queue = queue }, std::move(params), aabbs.data()).move_into(m_proceduralAabbBuffer);
-		  }
-	  }
-
-	  return true;
-  }
-
-  bool createAccelerationStructures(video::CThreadSafeQueueAdapter* queue)
-  {
-	// plus 1 blas for procedural geometry contains {{var::NumberOfProcedural}}
-	// spheres. Each sphere is a primitive instead one instance or geometry
-	const auto blasCount = m_gpuTriangleGeometries.size() + 1;
-	const auto proceduralBlasIdx = m_gpuTriangleGeometries.size();
-
-	IQueryPool::SCreationParams qParams{ .queryCount = static_cast<uint32_t>(blasCount), .queryType = IQueryPool::ACCELERATION_STRUCTURE_COMPACTED_SIZE };
-	smart_refctd_ptr<IQueryPool> queryPool = m_device->createQueryPool(std::move(qParams));
-
-	auto pool = m_device->createCommandPool(queue->getFamilyIndex(), IGPUCommandPool::CREATE_FLAGS::RESET_COMMAND_BUFFER_BIT | IGPUCommandPool::CREATE_FLAGS::TRANSIENT_BIT);
-	if (!pool)
-	  return logFail("Couldn't create Command Pool for blas/tlas creation!");
-
-	m_api->startCapture();
-#ifdef TRY_BUILD_FOR_NGFX // NSight is "debugger-challenged" it can't capture anything not happenning "during a frame", so we need to trick it
-	m_currentImageAcquire = m_surface->acquireNextImage();
-	{
-	  const IQueue::SSubmitInfo::SSemaphoreInfo acquired[] = { {
-		.semaphore = m_currentImageAcquire.semaphore,
-		.value = m_currentImageAcquire.acquireCount,
-		.stageMask = PIPELINE_STAGE_FLAGS::ALL_COMMANDS_BITS
-	  } };
-	  m_surface->present(m_currentImageAcquire.imageIndex, acquired);
-	}
-	m_currentImageAcquire = m_surface->acquireNextImage();
-#endif
-	size_t totalScratchSize = 0;
-	const auto scratchOffsetAlignment = m_device->getPhysicalDevice()->getLimits().minAccelerationStructureScratchOffsetAlignment;
+		for (uint32_t i = 0; i < cpuObjects.size(); i++)
+		{
+			const auto& cpuObject = cpuObjects[i];
 
-	// build bottom level ASes
-	{
-	  core::vector<uint32_t> primitiveCounts(blasCount);
-	  core::vector<IGPUBottomLevelAccelerationStructure::Triangles<const IGPUBuffer>> triangles(m_gpuTriangleGeometries.size());
-	  core::vector<uint32_t> scratchSizes(blasCount);
-	  IGPUBottomLevelAccelerationStructure::AABBs<const IGPUBuffer> aabbs;
-
-	  auto blasFlags = bitflag(IGPUBottomLevelAccelerationStructure::BUILD_FLAGS::PREFER_FAST_TRACE_BIT) | IGPUBottomLevelAccelerationStructure::BUILD_FLAGS::ALLOW_COMPACTION_BIT;
-	  if (m_physicalDevice->getProperties().limits.rayTracingPositionFetch)
-		blasFlags |= IGPUBottomLevelAccelerationStructure::BUILD_FLAGS::ALLOW_DATA_ACCESS;
-
-	  IGPUBottomLevelAccelerationStructure::DeviceBuildInfo initBuildInfo;
-	  initBuildInfo.buildFlags = blasFlags;
-	  initBuildInfo.geometryCount = 1;	// only 1 geometry object per blas
-	  initBuildInfo.srcAS = nullptr;
-	  initBuildInfo.dstAS = nullptr;
-	  initBuildInfo.scratch = {};
-
-	  auto blasBuildInfos = core::vector(blasCount, initBuildInfo);
-
-	  m_gpuBlasList.resize(blasCount);
-	  // setup blas info for triangle geometries
-	  for (uint32_t i = 0; i < blasCount; i++)
-	  {
-		const auto isProcedural = i == proceduralBlasIdx;
-		if (isProcedural)
+			auto vBuffer = smart_refctd_ptr(cpuObject.data.bindings[0].buffer); // no offset
+			auto vUsage = bitflag(IGPUBuffer::EUF_STORAGE_BUFFER_BIT) | IGPUBuffer::EUF_TRANSFER_DST_BIT | IGPUBuffer::EUF_INLINE_UPDATE_VIA_CMDBUF |
+				IGPUBuffer::EUF_ACCELERATION_STRUCTURE_BUILD_INPUT_READ_ONLY_BIT | IGPUBuffer::EUF_SHADER_DEVICE_ADDRESS_BIT;
+			vBuffer->addUsageFlags(vUsage);
+			vBuffer->setContentHash(vBuffer->computeContentHash());
+
+			auto iBuffer = smart_refctd_ptr(cpuObject.data.indexBuffer.buffer); // no offset
+			auto iUsage = bitflag(IGPUBuffer::EUF_STORAGE_BUFFER_BIT) | IGPUBuffer::EUF_TRANSFER_DST_BIT | IGPUBuffer::EUF_INLINE_UPDATE_VIA_CMDBUF |
+				IGPUBuffer::EUF_ACCELERATION_STRUCTURE_BUILD_INPUT_READ_ONLY_BIT | IGPUBuffer::EUF_SHADER_DEVICE_ADDRESS_BIT;
+
+			if (cpuObject.data.indexType != EIT_UNKNOWN)
+				if (iBuffer)
+				{
+					iBuffer->addUsageFlags(iUsage);
+					iBuffer->setContentHash(iBuffer->computeContentHash());
+				}
+
+			scratchBuffers[i] = {
+			  .vertex = {.offset = 0, .buffer = vBuffer},
+			  .index = {.offset = 0, .buffer = iBuffer},
+			};
+
+		}
+
+		auto cmdbuf = getSingleUseCommandBufferAndBegin(pool);
+		cmdbuf->beginDebugMarker("Build geometry vertex and index buffers");
+
+		CAssetConverter::SInputs inputs = {};
+		inputs.logger = m_logger.get();
+		std::array<ICPUBuffer*, std::size(cpuObjects) * 2u> tmpBuffers;
 		{
-		  aabbs.data.buffer = smart_refctd_ptr(m_proceduralAabbBuffer);
-		  aabbs.data.offset = 0;
-		  aabbs.stride = sizeof(IGPUBottomLevelAccelerationStructure::AABB_t);
-		  aabbs.geometryFlags = IGPUBottomLevelAccelerationStructure::GEOMETRY_FLAGS::OPAQUE_BIT; // only allow opaque for now
-
-		  primitiveCounts[proceduralBlasIdx] = NumberOfProceduralGeometries;
-		  blasBuildInfos[proceduralBlasIdx].aabbs = &aabbs;
-		  blasBuildInfos[proceduralBlasIdx].buildFlags |= IGPUBottomLevelAccelerationStructure::BUILD_FLAGS::GEOMETRY_TYPE_IS_AABB_BIT;
-		} else
+			for (uint32_t i = 0; i < cpuObjects.size(); i++)
+			{
+				tmpBuffers[2 * i + 0] = scratchBuffers[i].vertex.buffer.get();
+				tmpBuffers[2 * i + 1] = scratchBuffers[i].index.buffer.get();
+			}
+
+			std::get<CAssetConverter::SInputs::asset_span_t<ICPUBuffer>>(inputs.assets) = tmpBuffers;
+		}
+
+		auto reservation = m_converter->reserve(inputs);
 		{
-		  const auto& gpuObject = m_gpuTriangleGeometries[i];
-
-		  const uint32_t vertexStride = gpuObject.vertexStride;
-		  const uint32_t numVertices = gpuObject.bindings.vertex.buffer->getSize() / vertexStride;
-		  if (gpuObject.useIndex())
-			primitiveCounts[i] = gpuObject.indexCount / 3;
-		  else
-			primitiveCounts[i] = numVertices / 3;
-
-		  triangles[i].vertexData[0] = gpuObject.bindings.vertex;
-		  triangles[i].indexData = gpuObject.useIndex() ? gpuObject.bindings.index : gpuObject.bindings.vertex;
-		  triangles[i].maxVertex = numVertices - 1;
-		  triangles[i].vertexStride = vertexStride;
-		  triangles[i].vertexFormat = EF_R32G32B32_SFLOAT;
-		  triangles[i].indexType = gpuObject.indexType;
-		  triangles[i].geometryFlags = gpuObject.material.isTransparent() ?
-			IGPUBottomLevelAccelerationStructure::GEOMETRY_FLAGS::NO_DUPLICATE_ANY_HIT_INVOCATION_BIT :
-			IGPUBottomLevelAccelerationStructure::GEOMETRY_FLAGS::OPAQUE_BIT;
-
-		  blasBuildInfos[i].triangles = &triangles[i];
+			auto prepass = [&]<typename asset_type_t>(const auto & references) -> bool
+			{
+				auto objects = reservation.getGPUObjects<asset_type_t>();
+				uint32_t counter = {};
+				for (auto& object : objects)
+				{
+					auto gpu = object.value;
+					auto* reference = references[counter];
+
+					if (reference)
+					{
+						if (!gpu)
+						{
+							m_logger->log("Failed to convert a CPU object to GPU!", ILogger::ELL_ERROR);
+							return false;
+						}
+					}
+					counter++;
+				}
+				return true;
+			};
+
+			prepass.template operator() < ICPUBuffer > (tmpBuffers);
 		}
-		ILogicalDevice::AccelerationStructureBuildSizes buildSizes;
+
+		auto geomInfoBuffer = ICPUBuffer::create({ std::size(cpuObjects) * sizeof(STriangleGeomInfo) });
+		STriangleGeomInfo* geomInfos = reinterpret_cast<STriangleGeomInfo*>(geomInfoBuffer->getPointer());
+
+		m_gpuTriangleGeometries.reserve(std::size(cpuObjects));
+		// convert
 		{
-		  const uint32_t maxPrimCount[1] = { primitiveCounts[i] };
-		  if (isProcedural)
-		  {
-			const auto* aabbData = &aabbs;
-			buildSizes = m_device->getAccelerationStructureBuildSizes(blasBuildInfos[i].buildFlags, false, std::span{ aabbData, 1}, maxPrimCount);
-		  }
-		  else
-		  {
-			const auto* trianglesData = triangles.data();
-			buildSizes = m_device->getAccelerationStructureBuildSizes(blasBuildInfos[i].buildFlags, false, std::span{trianglesData,1}, maxPrimCount);
-		  }
-		  if (!buildSizes)
-			return logFail("Failed to get BLAS build sizes");
+			// not sure if need this (probably not, originally for transition img view)
+			auto semaphore = m_device->createSemaphore(0u);
+
+			std::array<IQueue::SSubmitInfo::SCommandBufferInfo, 1> cmdbufs = {};
+			cmdbufs.front().cmdbuf = cmdbuf.get();
+
+			SIntendedSubmitInfo transfer = {};
+			transfer.queue = queue;
+			transfer.scratchCommandBuffers = cmdbufs;
+			transfer.scratchSemaphore = {
+			  .semaphore = semaphore.get(),
+			  .value = 0u,
+			  .stageMask = PIPELINE_STAGE_FLAGS::ALL_TRANSFER_BITS
+			};
+
+			CAssetConverter::SConvertParams params = {};
+			params.utilities = m_utils.get();
+			params.transfer = &transfer;
+
+			auto future = reservation.convert(params);
+			if (future.copy() != IQueue::RESULT::SUCCESS)
+			{
+				m_logger->log("Failed to await submission feature!", ILogger::ELL_ERROR);
+				return false;
+			}
+
+			auto&& buffers = reservation.getGPUObjects<ICPUBuffer>();
+			for (uint32_t i = 0; i < cpuObjects.size(); i++)
+			{
+				auto& cpuObject = cpuObjects[i];
+
+				m_gpuTriangleGeometries.push_back(ReferenceObjectGpu{
+				  .meta = cpuObject.meta,
+				  .bindings = {
+					.vertex = {.offset = 0, .buffer = buffers[2 * i + 0].value },
+					.index = {.offset = 0, .buffer = buffers[2 * i + 1].value },
+				  },
+				  .vertexStride = cpuObject.data.inputParams.bindings[0].stride,
+				  .indexType = cpuObject.data.indexType,
+				  .indexCount = cpuObject.data.indexCount,
+				  .material = hlsl::_static_cast<MaterialPacked>(cpuObject.material),
+				  .transform = cpuObject.transform,
+					});
+			}
+
+			for (uint32_t i = 0; i < m_gpuTriangleGeometries.size(); i++)
+			{
+				const auto& gpuObject = m_gpuTriangleGeometries[i];
+				const uint64_t vertexBufferAddress = gpuObject.bindings.vertex.buffer->getDeviceAddress();
+				geomInfos[i] = {
+				  .material = gpuObject.material,
+				  .vertexBufferAddress = vertexBufferAddress,
+				  .indexBufferAddress = gpuObject.useIndex() ? gpuObject.bindings.index.buffer->getDeviceAddress() : vertexBufferAddress,
+				  .vertexStride = gpuObject.vertexStride,
+				  .objType = gpuObject.meta.type,
+				  .indexType = gpuObject.indexType,
+				  .smoothNormals = s_smoothNormals[gpuObject.meta.type],
+				};
+			}
 		}
 
-		scratchSizes[i] = buildSizes.buildScratchSize;
-		totalScratchSize = core::alignUp(totalScratchSize, scratchOffsetAlignment);
-		totalScratchSize += buildSizes.buildScratchSize;
+		{
+			IGPUBuffer::SCreationParams params;
+			params.usage = IGPUBuffer::EUF_STORAGE_BUFFER_BIT | IGPUBuffer::EUF_TRANSFER_DST_BIT | IGPUBuffer::EUF_INLINE_UPDATE_VIA_CMDBUF | IGPUBuffer::EUF_SHADER_DEVICE_ADDRESS_BIT;
+			params.size = geomInfoBuffer->getSize();
+			m_utils->createFilledDeviceLocalBufferOnDedMem(SIntendedSubmitInfo{ .queue = queue }, std::move(params), geomInfos).move_into(m_triangleGeomInfoBuffer);
+		}
 
+		// intersection geometries setup
 		{
-		  IGPUBuffer::SCreationParams params;
-		  params.usage = bitflag(IGPUBuffer::EUF_SHADER_DEVICE_ADDRESS_BIT) | IGPUBuffer::EUF_ACCELERATION_STRUCTURE_STORAGE_BIT;
-		  params.size = buildSizes.accelerationStructureSize;
-		  smart_refctd_ptr<IGPUBuffer> asBuffer = createBuffer(params);
-
-		  IGPUBottomLevelAccelerationStructure::SCreationParams blasParams;
-		  blasParams.bufferRange.buffer = asBuffer;
-		  blasParams.bufferRange.offset = 0u;
-		  blasParams.bufferRange.size = buildSizes.accelerationStructureSize;
-		  blasParams.flags = IGPUBottomLevelAccelerationStructure::SCreationParams::FLAGS::NONE;
-		  m_gpuBlasList[i] = m_device->createBottomLevelAccelerationStructure(std::move(blasParams));
-		  if (!m_gpuBlasList[i])
-			return logFail("Could not create BLAS");
+			core::vector<SProceduralGeomInfo> proceduralGeoms;
+			proceduralGeoms.reserve(NumberOfProceduralGeometries);
+			using Aabb = IGPUBottomLevelAccelerationStructure::AABB_t;
+			core::vector<Aabb> aabbs;
+			aabbs.reserve(NumberOfProceduralGeometries);
+			for (int32_t i = 0; i < NumberOfProceduralGeometries; i++)
+			{
+				const auto middle_i = NumberOfProceduralGeometries / 2.0;
+				SProceduralGeomInfo sphere = {
+				  .material = hlsl::_static_cast<MaterialPacked>(Material{
+					.ambient = {0.1, 0.05 * i, 0.1},
+					.diffuse = {0.3, 0.2 * i, 0.3},
+					.specular = {0.8, 0.8, 0.8},
+					.shininess = 1.0f,
+				  }),
+				  .center = float32_t3((i - middle_i) * 4.0, 2, 5.0),
+				  .radius = 1,
+				};
+
+				proceduralGeoms.push_back(sphere);
+				const auto sphereMin = sphere.center - sphere.radius;
+				const auto sphereMax = sphere.center + sphere.radius;
+				aabbs.emplace_back(
+					vector3d(sphereMin.x, sphereMin.y, sphereMin.z),
+					vector3d(sphereMax.x, sphereMax.y, sphereMax.z));
+			}
+
+			{
+				IGPUBuffer::SCreationParams params;
+				params.usage = IGPUBuffer::EUF_STORAGE_BUFFER_BIT | IGPUBuffer::EUF_TRANSFER_DST_BIT | IGPUBuffer::EUF_INLINE_UPDATE_VIA_CMDBUF | IGPUBuffer::EUF_SHADER_DEVICE_ADDRESS_BIT;
+				params.size = proceduralGeoms.size() * sizeof(SProceduralGeomInfo);
+				m_utils->createFilledDeviceLocalBufferOnDedMem(SIntendedSubmitInfo{ .queue = queue }, std::move(params), proceduralGeoms.data()).move_into(m_proceduralGeomInfoBuffer);
+			}
+
+			{
+				IGPUBuffer::SCreationParams params;
+				params.usage = IGPUBuffer::EUF_TRANSFER_DST_BIT | IGPUBuffer::EUF_SHADER_DEVICE_ADDRESS_BIT | IGPUBuffer::EUF_ACCELERATION_STRUCTURE_BUILD_INPUT_READ_ONLY_BIT;
+				params.size = aabbs.size() * sizeof(Aabb);
+				m_utils->createFilledDeviceLocalBufferOnDedMem(SIntendedSubmitInfo{ .queue = queue }, std::move(params), aabbs.data()).move_into(m_proceduralAabbBuffer);
+			}
 		}
-	  }
 
+		return true;
+	}
 
-	  auto cmdbufBlas = getSingleUseCommandBufferAndBegin(pool);
-	  cmdbufBlas->beginDebugMarker("Build BLAS");
+	bool createAccelerationStructures(video::CThreadSafeQueueAdapter* queue)
+	{
+		// plus 1 blas for procedural geometry contains {{var::NumberOfProcedural}}
+		// spheres. Each sphere is a primitive instead one instance or geometry
+		const auto blasCount = m_gpuTriangleGeometries.size() + 1;
+		const auto proceduralBlasIdx = m_gpuTriangleGeometries.size();
 
-	  cmdbufBlas->resetQueryPool(queryPool.get(), 0, blasCount);
+		IQueryPool::SCreationParams qParams{ .queryCount = static_cast<uint32_t>(blasCount), .queryType = IQueryPool::ACCELERATION_STRUCTURE_COMPACTED_SIZE };
+		smart_refctd_ptr<IQueryPool> queryPool = m_device->createQueryPool(std::move(qParams));
 
-	  smart_refctd_ptr<IGPUBuffer> scratchBuffer;
-	  {
-		IGPUBuffer::SCreationParams params;
-		params.usage = bitflag(IGPUBuffer::EUF_SHADER_DEVICE_ADDRESS_BIT) | IGPUBuffer::EUF_STORAGE_BUFFER_BIT;
-		params.size = totalScratchSize;
-		scratchBuffer = createBuffer(params);
-	  }
-
-	  core::vector<IGPUBottomLevelAccelerationStructure::BuildRangeInfo> buildRangeInfos(blasCount);
-	  core::vector<IGPUBottomLevelAccelerationStructure::BuildRangeInfo*> pRangeInfos(blasCount);
-	  for (uint32_t i = 0; i < blasCount; i++)
-	  {
-		blasBuildInfos[i].dstAS = m_gpuBlasList[i].get();
-		blasBuildInfos[i].scratch.buffer = scratchBuffer;
-		if (i == 0)
-		{
-		  blasBuildInfos[i].scratch.offset = 0u;
-		} else
+		auto pool = m_device->createCommandPool(queue->getFamilyIndex(), IGPUCommandPool::CREATE_FLAGS::RESET_COMMAND_BUFFER_BIT | IGPUCommandPool::CREATE_FLAGS::TRANSIENT_BIT);
+		if (!pool)
+			return logFail("Couldn't create Command Pool for blas/tlas creation!");
+
+		m_api->startCapture();
+#ifdef TRY_BUILD_FOR_NGFX // NSight is "debugger-challenged" it can't capture anything not happenning "during a frame", so we need to trick it
+		m_currentImageAcquire = m_surface->acquireNextImage();
 		{
-		  const auto unalignedOffset = blasBuildInfos[i - 1].scratch.offset + scratchSizes[i - 1];
-		  blasBuildInfos[i].scratch.offset = core::alignUp(unalignedOffset, scratchOffsetAlignment);
+			const IQueue::SSubmitInfo::SSemaphoreInfo acquired[] = { {
+			  .semaphore = m_currentImageAcquire.semaphore,
+			  .value = m_currentImageAcquire.acquireCount,
+			  .stageMask = PIPELINE_STAGE_FLAGS::ALL_COMMANDS_BITS
+			} };
+			m_surface->present(m_currentImageAcquire.imageIndex, acquired);
 		}
+		m_currentImageAcquire = m_surface->acquireNextImage();
+#endif
+		size_t totalScratchSize = 0;
+		const auto scratchOffsetAlignment = m_device->getPhysicalDevice()->getLimits().minAccelerationStructureScratchOffsetAlignment;
+
+		// build bottom level ASes
+		{
+			core::vector<uint32_t> primitiveCounts(blasCount);
+			core::vector<IGPUBottomLevelAccelerationStructure::Triangles<const IGPUBuffer>> triangles(m_gpuTriangleGeometries.size());
+			core::vector<uint32_t> scratchSizes(blasCount);
+			IGPUBottomLevelAccelerationStructure::AABBs<const IGPUBuffer> aabbs;
+
+			auto blasFlags = bitflag(IGPUBottomLevelAccelerationStructure::BUILD_FLAGS::PREFER_FAST_TRACE_BIT) | IGPUBottomLevelAccelerationStructure::BUILD_FLAGS::ALLOW_COMPACTION_BIT;
+			if (m_physicalDevice->getProperties().limits.rayTracingPositionFetch)
+				blasFlags |= IGPUBottomLevelAccelerationStructure::BUILD_FLAGS::ALLOW_DATA_ACCESS;
+
+			IGPUBottomLevelAccelerationStructure::DeviceBuildInfo initBuildInfo;
+			initBuildInfo.buildFlags = blasFlags;
+			initBuildInfo.geometryCount = 1;	// only 1 geometry object per blas
+			initBuildInfo.srcAS = nullptr;
+			initBuildInfo.dstAS = nullptr;
+			initBuildInfo.scratch = {};
+
+			auto blasBuildInfos = core::vector(blasCount, initBuildInfo);
+
+			m_gpuBlasList.resize(blasCount);
+			// setup blas info for triangle geometries
+			for (uint32_t i = 0; i < blasCount; i++)
+			{
+				const auto isProcedural = i == proceduralBlasIdx;
+				if (isProcedural)
+				{
+					aabbs.data.buffer = smart_refctd_ptr(m_proceduralAabbBuffer);
+					aabbs.data.offset = 0;
+					aabbs.stride = sizeof(IGPUBottomLevelAccelerationStructure::AABB_t);
+					aabbs.geometryFlags = IGPUBottomLevelAccelerationStructure::GEOMETRY_FLAGS::OPAQUE_BIT; // only allow opaque for now
+
+					primitiveCounts[proceduralBlasIdx] = NumberOfProceduralGeometries;
+					blasBuildInfos[proceduralBlasIdx].aabbs = &aabbs;
+					blasBuildInfos[proceduralBlasIdx].buildFlags |= IGPUBottomLevelAccelerationStructure::BUILD_FLAGS::GEOMETRY_TYPE_IS_AABB_BIT;
+				}
+				else
+				{
+					const auto& gpuObject = m_gpuTriangleGeometries[i];
+
+					const uint32_t vertexStride = gpuObject.vertexStride;
+					const uint32_t numVertices = gpuObject.bindings.vertex.buffer->getSize() / vertexStride;
+					if (gpuObject.useIndex())
+						primitiveCounts[i] = gpuObject.indexCount / 3;
+					else
+						primitiveCounts[i] = numVertices / 3;
+
+					triangles[i].vertexData[0] = gpuObject.bindings.vertex;
+					triangles[i].indexData = gpuObject.useIndex() ? gpuObject.bindings.index : gpuObject.bindings.vertex;
+					triangles[i].maxVertex = numVertices - 1;
+					triangles[i].vertexStride = vertexStride;
+					triangles[i].vertexFormat = EF_R32G32B32_SFLOAT;
+					triangles[i].indexType = gpuObject.indexType;
+					triangles[i].geometryFlags = gpuObject.material.isTransparent() ?
+						IGPUBottomLevelAccelerationStructure::GEOMETRY_FLAGS::NO_DUPLICATE_ANY_HIT_INVOCATION_BIT :
+						IGPUBottomLevelAccelerationStructure::GEOMETRY_FLAGS::OPAQUE_BIT;
+
+					blasBuildInfos[i].triangles = &triangles[i];
+				}
+				ILogicalDevice::AccelerationStructureBuildSizes buildSizes;
+				{
+					const uint32_t maxPrimCount[1] = { primitiveCounts[i] };
+					if (isProcedural)
+					{
+						const auto* aabbData = &aabbs;
+						buildSizes = m_device->getAccelerationStructureBuildSizes(blasBuildInfos[i].buildFlags, false, std::span{ aabbData, 1 }, maxPrimCount);
+					}
+					else
+					{
+						const auto* trianglesData = triangles.data();
+						buildSizes = m_device->getAccelerationStructureBuildSizes(blasBuildInfos[i].buildFlags, false, std::span{ trianglesData,1 }, maxPrimCount);
+					}
+					if (!buildSizes)
+						return logFail("Failed to get BLAS build sizes");
+				}
 
-		buildRangeInfos[i].primitiveCount = primitiveCounts[i];
-		buildRangeInfos[i].primitiveByteOffset = 0u;
-		buildRangeInfos[i].firstVertex = 0u;
-		buildRangeInfos[i].transformByteOffset = 0u;
+				scratchSizes[i] = buildSizes.buildScratchSize;
+				totalScratchSize = core::alignUp(totalScratchSize, scratchOffsetAlignment);
+				totalScratchSize += buildSizes.buildScratchSize;
 
-		pRangeInfos[i] = &buildRangeInfos[i];
-	  }
+				{
+					IGPUBuffer::SCreationParams params;
+					params.usage = bitflag(IGPUBuffer::EUF_SHADER_DEVICE_ADDRESS_BIT) | IGPUBuffer::EUF_ACCELERATION_STRUCTURE_STORAGE_BIT;
+					params.size = buildSizes.accelerationStructureSize;
+					smart_refctd_ptr<IGPUBuffer> asBuffer = createBuffer(params);
+
+					IGPUBottomLevelAccelerationStructure::SCreationParams blasParams;
+					blasParams.bufferRange.buffer = asBuffer;
+					blasParams.bufferRange.offset = 0u;
+					blasParams.bufferRange.size = buildSizes.accelerationStructureSize;
+					blasParams.flags = IGPUBottomLevelAccelerationStructure::SCreationParams::FLAGS::NONE;
+					m_gpuBlasList[i] = m_device->createBottomLevelAccelerationStructure(std::move(blasParams));
+					if (!m_gpuBlasList[i])
+						return logFail("Could not create BLAS");
+				}
+			}
 
-	  if (!cmdbufBlas->buildAccelerationStructures(std::span(blasBuildInfos), pRangeInfos.data()))
-		return logFail("Failed to build BLAS");
 
-	  {
-		SMemoryBarrier memBarrier;
-		memBarrier.srcStageMask = PIPELINE_STAGE_FLAGS::ACCELERATION_STRUCTURE_BUILD_BIT;
-		memBarrier.srcAccessMask = ACCESS_FLAGS::ACCELERATION_STRUCTURE_WRITE_BIT;
-		memBarrier.dstStageMask = PIPELINE_STAGE_FLAGS::ACCELERATION_STRUCTURE_BUILD_BIT;
-		memBarrier.dstAccessMask = ACCESS_FLAGS::ACCELERATION_STRUCTURE_READ_BIT;
-		cmdbufBlas->pipelineBarrier(E_DEPENDENCY_FLAGS::EDF_NONE, { .memBarriers = {&memBarrier, 1} });
-	  }
+			auto cmdbufBlas = getSingleUseCommandBufferAndBegin(pool);
+			cmdbufBlas->beginDebugMarker("Build BLAS");
 
+			cmdbufBlas->resetQueryPool(queryPool.get(), 0, blasCount);
 
-	  core::vector<const IGPUAccelerationStructure*> ases(blasCount);
-	  for (uint32_t i = 0; i < blasCount; i++)
-		ases[i] = m_gpuBlasList[i].get();
-	  if (!cmdbufBlas->writeAccelerationStructureProperties(std::span(ases), IQueryPool::ACCELERATION_STRUCTURE_COMPACTED_SIZE,
-		queryPool.get(), 0))
-		return logFail("Failed to write acceleration structure properties!");
+			smart_refctd_ptr<IGPUBuffer> scratchBuffer;
+			{
+				IGPUBuffer::SCreationParams params;
+				params.usage = bitflag(IGPUBuffer::EUF_SHADER_DEVICE_ADDRESS_BIT) | IGPUBuffer::EUF_STORAGE_BUFFER_BIT;
+				params.size = totalScratchSize;
+				scratchBuffer = createBuffer(params);
+			}
 
-	  cmdbufBlas->endDebugMarker();
-	  cmdbufSubmitAndWait(cmdbufBlas, queue, 39);
-	}
+			core::vector<IGPUBottomLevelAccelerationStructure::BuildRangeInfo> buildRangeInfos(blasCount);
+			core::vector<IGPUBottomLevelAccelerationStructure::BuildRangeInfo*> pRangeInfos(blasCount);
+			for (uint32_t i = 0; i < blasCount; i++)
+			{
+				blasBuildInfos[i].dstAS = m_gpuBlasList[i].get();
+				blasBuildInfos[i].scratch.buffer = scratchBuffer;
+				if (i == 0)
+				{
+					blasBuildInfos[i].scratch.offset = 0u;
+				}
+				else
+				{
+					const auto unalignedOffset = blasBuildInfos[i - 1].scratch.offset + scratchSizes[i - 1];
+					blasBuildInfos[i].scratch.offset = core::alignUp(unalignedOffset, scratchOffsetAlignment);
+				}
 
-	auto cmdbufCompact = getSingleUseCommandBufferAndBegin(pool);
-	cmdbufCompact->beginDebugMarker("Compact BLAS");
+				buildRangeInfos[i].primitiveCount = primitiveCounts[i];
+				buildRangeInfos[i].primitiveByteOffset = 0u;
+				buildRangeInfos[i].firstVertex = 0u;
+				buildRangeInfos[i].transformByteOffset = 0u;
 
-	// compact blas
-	{
-	  core::vector<size_t> asSizes(blasCount);
-	  if (!m_device->getQueryPoolResults(queryPool.get(), 0, blasCount, asSizes.data(), sizeof(size_t), bitflag(IQueryPool::WAIT_BIT) | IQueryPool::_64_BIT))
-		return logFail("Could not get query pool results for AS sizes");
-
-	  core::vector<smart_refctd_ptr<IGPUBottomLevelAccelerationStructure>> cleanupBlas(blasCount);
-	  for (uint32_t i = 0; i < blasCount; i++)
-	  {
-		if (asSizes[i] == 0) continue;
-		cleanupBlas[i] = m_gpuBlasList[i];
+				pRangeInfos[i] = &buildRangeInfos[i];
+			}
+
+			if (!cmdbufBlas->buildAccelerationStructures(std::span(blasBuildInfos), pRangeInfos.data()))
+				return logFail("Failed to build BLAS");
+
+			{
+				SMemoryBarrier memBarrier;
+				memBarrier.srcStageMask = PIPELINE_STAGE_FLAGS::ACCELERATION_STRUCTURE_BUILD_BIT;
+				memBarrier.srcAccessMask = ACCESS_FLAGS::ACCELERATION_STRUCTURE_WRITE_BIT;
+				memBarrier.dstStageMask = PIPELINE_STAGE_FLAGS::ACCELERATION_STRUCTURE_BUILD_BIT;
+				memBarrier.dstAccessMask = ACCESS_FLAGS::ACCELERATION_STRUCTURE_READ_BIT;
+				cmdbufBlas->pipelineBarrier(E_DEPENDENCY_FLAGS::EDF_NONE, { .memBarriers = {&memBarrier, 1} });
+			}
+
+
+			core::vector<const IGPUAccelerationStructure*> ases(blasCount);
+			for (uint32_t i = 0; i < blasCount; i++)
+				ases[i] = m_gpuBlasList[i].get();
+			if (!cmdbufBlas->writeAccelerationStructureProperties(std::span(ases), IQueryPool::ACCELERATION_STRUCTURE_COMPACTED_SIZE,
+				queryPool.get(), 0))
+				return logFail("Failed to write acceleration structure properties!");
+
+			cmdbufBlas->endDebugMarker();
+			cmdbufSubmitAndWait(cmdbufBlas, queue, 39);
+		}
+
+		auto cmdbufCompact = getSingleUseCommandBufferAndBegin(pool);
+		cmdbufCompact->beginDebugMarker("Compact BLAS");
+
+		// compact blas
 		{
-		  IGPUBuffer::SCreationParams params;
-		  params.usage = bitflag(IGPUBuffer::EUF_SHADER_DEVICE_ADDRESS_BIT) | IGPUBuffer::EUF_ACCELERATION_STRUCTURE_STORAGE_BIT;
-		  params.size = asSizes[i];
-		  smart_refctd_ptr<IGPUBuffer> asBuffer = createBuffer(params);
-
-		  IGPUBottomLevelAccelerationStructure::SCreationParams blasParams;
-		  blasParams.bufferRange.buffer = asBuffer;
-		  blasParams.bufferRange.offset = 0u;
-		  blasParams.bufferRange.size = asSizes[i];
-		  blasParams.flags = IGPUBottomLevelAccelerationStructure::SCreationParams::FLAGS::NONE;
-		  m_gpuBlasList[i] = m_device->createBottomLevelAccelerationStructure(std::move(blasParams));
-		  if (!m_gpuBlasList[i])
-			return logFail("Could not create compacted BLAS");
+			core::vector<size_t> asSizes(blasCount);
+			if (!m_device->getQueryPoolResults(queryPool.get(), 0, blasCount, asSizes.data(), sizeof(size_t), bitflag(IQueryPool::WAIT_BIT) | IQueryPool::_64_BIT))
+				return logFail("Could not get query pool results for AS sizes");
+
+			core::vector<smart_refctd_ptr<IGPUBottomLevelAccelerationStructure>> cleanupBlas(blasCount);
+			for (uint32_t i = 0; i < blasCount; i++)
+			{
+				if (asSizes[i] == 0) continue;
+				cleanupBlas[i] = m_gpuBlasList[i];
+				{
+					IGPUBuffer::SCreationParams params;
+					params.usage = bitflag(IGPUBuffer::EUF_SHADER_DEVICE_ADDRESS_BIT) | IGPUBuffer::EUF_ACCELERATION_STRUCTURE_STORAGE_BIT;
+					params.size = asSizes[i];
+					smart_refctd_ptr<IGPUBuffer> asBuffer = createBuffer(params);
+
+					IGPUBottomLevelAccelerationStructure::SCreationParams blasParams;
+					blasParams.bufferRange.buffer = asBuffer;
+					blasParams.bufferRange.offset = 0u;
+					blasParams.bufferRange.size = asSizes[i];
+					blasParams.flags = IGPUBottomLevelAccelerationStructure::SCreationParams::FLAGS::NONE;
+					m_gpuBlasList[i] = m_device->createBottomLevelAccelerationStructure(std::move(blasParams));
+					if (!m_gpuBlasList[i])
+						return logFail("Could not create compacted BLAS");
+				}
+
+				IGPUBottomLevelAccelerationStructure::CopyInfo copyInfo;
+				copyInfo.src = cleanupBlas[i].get();
+				copyInfo.dst = m_gpuBlasList[i].get();
+				copyInfo.mode = IGPUBottomLevelAccelerationStructure::COPY_MODE::COMPACT;
+				if (!cmdbufCompact->copyAccelerationStructure(copyInfo))
+					return logFail("Failed to copy AS to compact");
+			}
 		}
 
-		IGPUBottomLevelAccelerationStructure::CopyInfo copyInfo;
-		copyInfo.src = cleanupBlas[i].get();
-		copyInfo.dst = m_gpuBlasList[i].get();
-		copyInfo.mode = IGPUBottomLevelAccelerationStructure::COPY_MODE::COMPACT;
-		if (!cmdbufCompact->copyAccelerationStructure(copyInfo))
-		  return logFail("Failed to copy AS to compact");
-	  }
-	}
+		cmdbufCompact->endDebugMarker();
+		cmdbufSubmitAndWait(cmdbufCompact, queue, 40);
 
-	cmdbufCompact->endDebugMarker();
-	cmdbufSubmitAndWait(cmdbufCompact, queue, 40);
+		auto cmdbufTlas = getSingleUseCommandBufferAndBegin(pool);
+		cmdbufTlas->beginDebugMarker("Build TLAS");
 
-	auto cmdbufTlas = getSingleUseCommandBufferAndBegin(pool);
-	cmdbufTlas->beginDebugMarker("Build TLAS");
+		// build top level AS
+		{
+			const uint32_t instancesCount = blasCount;
+			core::vector<IGPUTopLevelAccelerationStructure::DeviceStaticInstance> instances(instancesCount);
+			for (uint32_t i = 0; i < instancesCount; i++)
+			{
+				const auto isProceduralInstance = i == proceduralBlasIdx;
+				instances[i].base.blas.deviceAddress = m_gpuBlasList[i]->getReferenceForDeviceOperations().deviceAddress;
+				instances[i].base.mask = 0xFF;
+				instances[i].base.instanceCustomIndex = i;
+				instances[i].base.instanceShaderBindingTableRecordOffset = isProceduralInstance ? 2 : 0;
+				instances[i].base.flags = static_cast<uint32_t>(IGPUTopLevelAccelerationStructure::INSTANCE_FLAGS::TRIANGLE_FACING_CULL_DISABLE_BIT);
+				instances[i].transform = isProceduralInstance ? matrix3x4SIMD() : m_gpuTriangleGeometries[i].transform;
+			}
 
-	// build top level AS
-	{
-	  const uint32_t instancesCount = blasCount;
-	  core::vector<IGPUTopLevelAccelerationStructure::DeviceStaticInstance> instances(instancesCount);
-	  for (uint32_t i = 0; i < instancesCount; i++)
-	  {
-		const auto isProceduralInstance = i == proceduralBlasIdx;
-		instances[i].base.blas.deviceAddress = m_gpuBlasList[i]->getReferenceForDeviceOperations().deviceAddress;
-		instances[i].base.mask = 0xFF;
-		instances[i].base.instanceCustomIndex = i;
-		instances[i].base.instanceShaderBindingTableRecordOffset = isProceduralInstance ? 2 : 0;
-		instances[i].base.flags = static_cast<uint32_t>(IGPUTopLevelAccelerationStructure::INSTANCE_FLAGS::TRIANGLE_FACING_CULL_DISABLE_BIT);
-		instances[i].transform = isProceduralInstance ? matrix3x4SIMD() : m_gpuTriangleGeometries[i].transform;
-	  }
-
-	  {
-		size_t bufSize = instancesCount * sizeof(IGPUTopLevelAccelerationStructure::DeviceStaticInstance);
-		IGPUBuffer::SCreationParams params;
-		params.usage = bitflag(IGPUBuffer::EUF_ACCELERATION_STRUCTURE_BUILD_INPUT_READ_ONLY_BIT) | IGPUBuffer::EUF_STORAGE_BUFFER_BIT |
-		  IGPUBuffer::EUF_INLINE_UPDATE_VIA_CMDBUF | IGPUBuffer::EUF_TRANSFER_DST_BIT | IGPUBuffer::EUF_SHADER_DEVICE_ADDRESS_BIT;
-		params.size = bufSize;
-		m_instanceBuffer = createBuffer(params);
-
-		SBufferRange<IGPUBuffer> range = { .offset = 0u, .size = bufSize, .buffer = m_instanceBuffer };
-		cmdbufTlas->updateBuffer(range, instances.data());
-	  }
-
-	  // make sure instances upload complete first
-	  {
-		SMemoryBarrier memBarrier;
-		memBarrier.srcStageMask = PIPELINE_STAGE_FLAGS::ALL_TRANSFER_BITS;
-		memBarrier.srcAccessMask = ACCESS_FLAGS::TRANSFER_WRITE_BIT;
-		memBarrier.dstStageMask = PIPELINE_STAGE_FLAGS::ACCELERATION_STRUCTURE_BUILD_BIT;
-		memBarrier.dstAccessMask = ACCESS_FLAGS::ACCELERATION_STRUCTURE_WRITE_BIT;
-		cmdbufTlas->pipelineBarrier(E_DEPENDENCY_FLAGS::EDF_NONE, { .memBarriers = {&memBarrier, 1} });
-	  }
-
-	  auto tlasFlags = bitflag(IGPUTopLevelAccelerationStructure::BUILD_FLAGS::PREFER_FAST_TRACE_BIT);
-
-	  IGPUTopLevelAccelerationStructure::DeviceBuildInfo tlasBuildInfo;
-	  tlasBuildInfo.buildFlags = tlasFlags;
-	  tlasBuildInfo.srcAS = nullptr;
-	  tlasBuildInfo.dstAS = nullptr;
-	  tlasBuildInfo.instanceData.buffer = m_instanceBuffer;
-	  tlasBuildInfo.instanceData.offset = 0u;
-	  tlasBuildInfo.scratch = {};
-
-	  auto buildSizes = m_device->getAccelerationStructureBuildSizes(tlasFlags, false, instancesCount);
-	  if (!buildSizes)
-		return logFail("Failed to get TLAS build sizes");
-
-	  {
-		IGPUBuffer::SCreationParams params;
-		params.usage = bitflag(IGPUBuffer::EUF_SHADER_DEVICE_ADDRESS_BIT) | IGPUBuffer::EUF_ACCELERATION_STRUCTURE_STORAGE_BIT;
-		params.size = buildSizes.accelerationStructureSize;
-		smart_refctd_ptr<IGPUBuffer> asBuffer = createBuffer(params);
-
-		IGPUTopLevelAccelerationStructure::SCreationParams tlasParams;
-		tlasParams.bufferRange.buffer = asBuffer;
-		tlasParams.bufferRange.offset = 0u;
-		tlasParams.bufferRange.size = buildSizes.accelerationStructureSize;
-		tlasParams.flags = IGPUTopLevelAccelerationStructure::SCreationParams::FLAGS::NONE;
-		m_gpuTlas = m_device->createTopLevelAccelerationStructure(std::move(tlasParams));
-		if (!m_gpuTlas)
-		  return logFail("Could not create TLAS");
-	  }
-
-	  smart_refctd_ptr<IGPUBuffer> scratchBuffer;
-	  {
-		IGPUBuffer::SCreationParams params;
-		params.usage = bitflag(IGPUBuffer::EUF_SHADER_DEVICE_ADDRESS_BIT) | IGPUBuffer::EUF_STORAGE_BUFFER_BIT;
-		params.size = buildSizes.buildScratchSize;
-		scratchBuffer = createBuffer(params);
-	  }
-
-	  tlasBuildInfo.dstAS = m_gpuTlas.get();
-	  tlasBuildInfo.scratch.buffer = scratchBuffer;
-	  tlasBuildInfo.scratch.offset = 0u;
-
-	  IGPUTopLevelAccelerationStructure::BuildRangeInfo buildRangeInfo[1u];
-	  buildRangeInfo[0].instanceCount = instancesCount;
-	  buildRangeInfo[0].instanceByteOffset = 0u;
-	  IGPUTopLevelAccelerationStructure::BuildRangeInfo* pRangeInfos;
-	  pRangeInfos = &buildRangeInfo[0];
-
-	  if (!cmdbufTlas->buildAccelerationStructures({ &tlasBuildInfo, 1 }, pRangeInfos))
-		return logFail("Failed to build TLAS");
-	}
+			{
+				size_t bufSize = instancesCount * sizeof(IGPUTopLevelAccelerationStructure::DeviceStaticInstance);
+				IGPUBuffer::SCreationParams params;
+				params.usage = bitflag(IGPUBuffer::EUF_ACCELERATION_STRUCTURE_BUILD_INPUT_READ_ONLY_BIT) | IGPUBuffer::EUF_STORAGE_BUFFER_BIT |
+					IGPUBuffer::EUF_INLINE_UPDATE_VIA_CMDBUF | IGPUBuffer::EUF_TRANSFER_DST_BIT | IGPUBuffer::EUF_SHADER_DEVICE_ADDRESS_BIT;
+				params.size = bufSize;
+				m_instanceBuffer = createBuffer(params);
+
+				SBufferRange<IGPUBuffer> range = { .offset = 0u, .size = bufSize, .buffer = m_instanceBuffer };
+				cmdbufTlas->updateBuffer(range, instances.data());
+			}
+
+			// make sure instances upload complete first
+			{
+				SMemoryBarrier memBarrier;
+				memBarrier.srcStageMask = PIPELINE_STAGE_FLAGS::ALL_TRANSFER_BITS;
+				memBarrier.srcAccessMask = ACCESS_FLAGS::TRANSFER_WRITE_BIT;
+				memBarrier.dstStageMask = PIPELINE_STAGE_FLAGS::ACCELERATION_STRUCTURE_BUILD_BIT;
+				memBarrier.dstAccessMask = ACCESS_FLAGS::ACCELERATION_STRUCTURE_WRITE_BIT;
+				cmdbufTlas->pipelineBarrier(E_DEPENDENCY_FLAGS::EDF_NONE, { .memBarriers = {&memBarrier, 1} });
+			}
 
-	cmdbufTlas->endDebugMarker();
-	cmdbufSubmitAndWait(cmdbufTlas, queue, 45);
+			auto tlasFlags = bitflag(IGPUTopLevelAccelerationStructure::BUILD_FLAGS::PREFER_FAST_TRACE_BIT);
+
+			IGPUTopLevelAccelerationStructure::DeviceBuildInfo tlasBuildInfo;
+			tlasBuildInfo.buildFlags = tlasFlags;
+			tlasBuildInfo.srcAS = nullptr;
+			tlasBuildInfo.dstAS = nullptr;
+			tlasBuildInfo.instanceData.buffer = m_instanceBuffer;
+			tlasBuildInfo.instanceData.offset = 0u;
+			tlasBuildInfo.scratch = {};
+
+			auto buildSizes = m_device->getAccelerationStructureBuildSizes(tlasFlags, false, instancesCount);
+			if (!buildSizes)
+				return logFail("Failed to get TLAS build sizes");
+
+			{
+				IGPUBuffer::SCreationParams params;
+				params.usage = bitflag(IGPUBuffer::EUF_SHADER_DEVICE_ADDRESS_BIT) | IGPUBuffer::EUF_ACCELERATION_STRUCTURE_STORAGE_BIT;
+				params.size = buildSizes.accelerationStructureSize;
+				smart_refctd_ptr<IGPUBuffer> asBuffer = createBuffer(params);
+
+				IGPUTopLevelAccelerationStructure::SCreationParams tlasParams;
+				tlasParams.bufferRange.buffer = asBuffer;
+				tlasParams.bufferRange.offset = 0u;
+				tlasParams.bufferRange.size = buildSizes.accelerationStructureSize;
+				tlasParams.flags = IGPUTopLevelAccelerationStructure::SCreationParams::FLAGS::NONE;
+				m_gpuTlas = m_device->createTopLevelAccelerationStructure(std::move(tlasParams));
+				if (!m_gpuTlas)
+					return logFail("Could not create TLAS");
+			}
+
+			smart_refctd_ptr<IGPUBuffer> scratchBuffer;
+			{
+				IGPUBuffer::SCreationParams params;
+				params.usage = bitflag(IGPUBuffer::EUF_SHADER_DEVICE_ADDRESS_BIT) | IGPUBuffer::EUF_STORAGE_BUFFER_BIT;
+				params.size = buildSizes.buildScratchSize;
+				scratchBuffer = createBuffer(params);
+			}
+
+			tlasBuildInfo.dstAS = m_gpuTlas.get();
+			tlasBuildInfo.scratch.buffer = scratchBuffer;
+			tlasBuildInfo.scratch.offset = 0u;
+
+			IGPUTopLevelAccelerationStructure::BuildRangeInfo buildRangeInfo[1u];
+			buildRangeInfo[0].instanceCount = instancesCount;
+			buildRangeInfo[0].instanceByteOffset = 0u;
+			IGPUTopLevelAccelerationStructure::BuildRangeInfo* pRangeInfos;
+			pRangeInfos = &buildRangeInfo[0];
+
+			if (!cmdbufTlas->buildAccelerationStructures({ &tlasBuildInfo, 1 }, pRangeInfos))
+				return logFail("Failed to build TLAS");
+		}
+
+		cmdbufTlas->endDebugMarker();
+		cmdbufSubmitAndWait(cmdbufTlas, queue, 45);
 
 #ifdef TRY_BUILD_FOR_NGFX
-	{
-	  const IQueue::SSubmitInfo::SSemaphoreInfo acquired[] = { {
-		.semaphore = m_currentImageAcquire.semaphore,
-		.value = m_currentImageAcquire.acquireCount,
-		.stageMask = PIPELINE_STAGE_FLAGS::ALL_COMMANDS_BITS
-	  } };
-	  m_surface->present(m_currentImageAcquire.imageIndex, acquired);
-	}
+		{
+			const IQueue::SSubmitInfo::SSemaphoreInfo acquired[] = { {
+			  .semaphore = m_currentImageAcquire.semaphore,
+			  .value = m_currentImageAcquire.acquireCount,
+			  .stageMask = PIPELINE_STAGE_FLAGS::ALL_COMMANDS_BITS
+			} };
+			m_surface->present(m_currentImageAcquire.imageIndex, acquired);
+		}
 #endif
-	m_api->endCapture();
+		m_api->endCapture();
 
-	return true;
-  }
+		return true;
+	}
 #endif // TEST_ASSET_CONV_AS
 
 
-  smart_refctd_ptr<IWindow> m_window;
-  smart_refctd_ptr<CSimpleResizeSurface<ISimpleManagedSurface::ISwapchainResources>> m_surface;
-  smart_refctd_ptr<ISemaphore> m_semaphore;
-  uint64_t m_realFrameIx = 0;
-  uint32_t m_frameAccumulationCounter = 0;
-  std::array<smart_refctd_ptr<IGPUCommandBuffer>, MaxFramesInFlight> m_cmdBufs;
-  ISimpleManagedSurface::SAcquireResult m_currentImageAcquire = {};
-
-  core::smart_refctd_ptr<InputSystem> m_inputSystem;
-  InputSystem::ChannelReader<IMouseEventChannel> m_mouse;
-  InputSystem::ChannelReader<IKeyboardEventChannel> m_keyboard;
-
-  struct CameraSetting
-  {
-	float fov = 60.f;
-	float zNear = 0.1f;
-	float zFar = 10000.f;
-	float moveSpeed = 1.f;
-	float rotateSpeed = 1.f;
-	float viewWidth = 10.f;
-	float camYAngle = 165.f / 180.f * 3.14159f;
-	float camXAngle = 32.f / 180.f * 3.14159f;
-	
-  } m_cameraSetting;
-  Camera m_camera = Camera(core::vectorSIMDf(0, 0, 0), core::vectorSIMDf(0, 0, 0), core::matrix4SIMD());
-
-  Light m_light = {
-	.direction = {-1.0f, -1.0f, -0.4f},
-	.position = {10.0f, 15.0f, 8.0f},
-	.outerCutoff = 0.866025404f, // {cos(radians(30.0f))}, 
-	.type = ELT_DIRECTIONAL
-  };
-
-  video::CDumbPresentationOracle m_oracle;
-
-  struct C_UI
-  {
-	nbl::core::smart_refctd_ptr<nbl::ext::imgui::UI> manager;
-
-	struct
+	smart_refctd_ptr<IWindow> m_window;
+	smart_refctd_ptr<CSimpleResizeSurface<ISimpleManagedSurface::ISwapchainResources>> m_surface;
+	smart_refctd_ptr<ISemaphore> m_semaphore;
+	uint64_t m_realFrameIx = 0;
+	uint32_t m_frameAccumulationCounter = 0;
+	std::array<smart_refctd_ptr<IGPUCommandBuffer>, MaxFramesInFlight> m_cmdBufs;
+	ISimpleManagedSurface::SAcquireResult m_currentImageAcquire = {};
+
+	core::smart_refctd_ptr<InputSystem> m_inputSystem;
+	InputSystem::ChannelReader<IMouseEventChannel> m_mouse;
+	InputSystem::ChannelReader<IKeyboardEventChannel> m_keyboard;
+
+	struct CameraSetting
 	{
-	  core::smart_refctd_ptr<video::IGPUSampler> gui, scene;
-	} samplers;
+		float fov = 60.f;
+		float zNear = 0.1f;
+		float zFar = 10000.f;
+		float moveSpeed = 1.f;
+		float rotateSpeed = 1.f;
+		float viewWidth = 10.f;
+		float camYAngle = 165.f / 180.f * 3.14159f;
+		float camXAngle = 32.f / 180.f * 3.14159f;
+
+	} m_cameraSetting;
+	Camera m_camera = Camera(core::vectorSIMDf(0, 0, 0), core::vectorSIMDf(0, 0, 0), core::matrix4SIMD());
+
+	Light m_light = {
+	  .direction = {-1.0f, -1.0f, -0.4f},
+	  .position = {10.0f, 15.0f, 8.0f},
+	  .outerCutoff = 0.866025404f, // {cos(radians(30.0f))}, 
+	  .type = ELT_DIRECTIONAL
+	};
+
+	video::CDumbPresentationOracle m_oracle;
+
+	struct C_UI
+	{
+		nbl::core::smart_refctd_ptr<nbl::ext::imgui::UI> manager;
+
+		struct
+		{
+			core::smart_refctd_ptr<video::IGPUSampler> gui, scene;
+		} samplers;
 
-	core::smart_refctd_ptr<IGPUDescriptorSet> descriptorSet;
-  } m_ui;
-  core::smart_refctd_ptr<IDescriptorPool> m_guiDescriptorSetPool;
+		core::smart_refctd_ptr<IGPUDescriptorSet> descriptorSet;
+	} m_ui;
+	core::smart_refctd_ptr<IDescriptorPool> m_guiDescriptorSetPool;
 
-  core::vector<ReferenceObjectGpu> m_gpuTriangleGeometries;
-  core::vector<SProceduralGeomInfo> m_gpuIntersectionSpheres;
-  uint32_t m_intersectionHitGroupIdx;
+	core::vector<ReferenceObjectGpu> m_gpuTriangleGeometries;
+	core::vector<SProceduralGeomInfo> m_gpuIntersectionSpheres;
+	uint32_t m_intersectionHitGroupIdx;
 
-  std::vector<smart_refctd_ptr<IGPUBottomLevelAccelerationStructure>> m_gpuBlasList;
-  smart_refctd_ptr<IGPUTopLevelAccelerationStructure> m_gpuTlas;
-  smart_refctd_ptr<IGPUBuffer> m_instanceBuffer;
+	std::vector<smart_refctd_ptr<IGPUBottomLevelAccelerationStructure>> m_gpuBlasList;
+	smart_refctd_ptr<IGPUTopLevelAccelerationStructure> m_gpuTlas;
+	smart_refctd_ptr<IGPUBuffer> m_instanceBuffer;
 
-  smart_refctd_ptr<IGPUBuffer> m_triangleGeomInfoBuffer;
-  smart_refctd_ptr<IGPUBuffer> m_proceduralGeomInfoBuffer;
-  smart_refctd_ptr<IGPUBuffer> m_proceduralAabbBuffer;
-  smart_refctd_ptr<IGPUBuffer> m_indirectBuffer;
+	smart_refctd_ptr<IGPUBuffer> m_triangleGeomInfoBuffer;
+	smart_refctd_ptr<IGPUBuffer> m_proceduralGeomInfoBuffer;
+	smart_refctd_ptr<IGPUBuffer> m_proceduralAabbBuffer;
+	smart_refctd_ptr<IGPUBuffer> m_indirectBuffer;
 
-  smart_refctd_ptr<IGPUImage> m_hdrImage;
-  smart_refctd_ptr<IGPUImageView> m_hdrImageView;
+	smart_refctd_ptr<IGPUImage> m_hdrImage;
+	smart_refctd_ptr<IGPUImageView> m_hdrImageView;
 
-  smart_refctd_ptr<IDescriptorPool> m_rayTracingDsPool;
-  smart_refctd_ptr<IGPUDescriptorSet> m_rayTracingDs;
-  smart_refctd_ptr<IGPURayTracingPipeline> m_rayTracingPipeline;
-  uint64_t m_rayTracingStackSize;
-  ShaderBindingTable m_shaderBindingTable;
+	smart_refctd_ptr<IDescriptorPool> m_rayTracingDsPool;
+	smart_refctd_ptr<IGPUDescriptorSet> m_rayTracingDs;
+	smart_refctd_ptr<IGPURayTracingPipeline> m_rayTracingPipeline;
+	uint64_t m_rayTracingStackSize;
+	ShaderBindingTable m_shaderBindingTable;
 
-  smart_refctd_ptr<IGPUDescriptorSet> m_presentDs;
-  smart_refctd_ptr<IDescriptorPool> m_presentDsPool;
-  smart_refctd_ptr<IGPUGraphicsPipeline> m_presentPipeline;
+	smart_refctd_ptr<IGPUDescriptorSet> m_presentDs;
+	smart_refctd_ptr<IDescriptorPool> m_presentDsPool;
+	smart_refctd_ptr<IGPUGraphicsPipeline> m_presentPipeline;
 
-  smart_refctd_ptr<CAssetConverter> m_converter;
+	smart_refctd_ptr<CAssetConverter> m_converter;
 
 
-  core::matrix4SIMD m_cachedModelViewProjectionMatrix;
-  bool m_useIndirectCommand = false;
+	core::matrix4SIMD m_cachedModelViewProjectionMatrix;
+	bool m_useIndirectCommand = false;
 
 };
 NBL_MAIN_FUNC(RaytracingPipelineApp)

From ada9c8b565ee428ea7c2077fc46624247b2c74c5 Mon Sep 17 00:00:00 2001
From: Erfan Ahmadi <ahmadierfan99@gmail.com>
Date: Fri, 16 May 2025 14:31:07 +0400
Subject: [PATCH 262/529] Cache&Replay with static images

---
 62_CAD/DrawResourcesFiller.cpp | 221 +++++++++++++++++++++++++--------
 62_CAD/DrawResourcesFiller.h   |  25 ++--
 62_CAD/Images.h                |  34 ++++-
 3 files changed, 209 insertions(+), 71 deletions(-)

diff --git a/62_CAD/DrawResourcesFiller.cpp b/62_CAD/DrawResourcesFiller.cpp
index c7a074d2f..b386f02a8 100644
--- a/62_CAD/DrawResourcesFiller.cpp
+++ b/62_CAD/DrawResourcesFiller.cpp
@@ -63,7 +63,7 @@ void DrawResourcesFiller::allocateResourcesBuffer(ILogicalDevice* logicalDevice,
 		IDeviceMemoryAllocator::SAllocateInfo allocationInfo =
 		{
 			// TODO: Get from user side.
-			.size = 70 * 1024 * 1024, // 70 MB
+			.size = 170 * 1024 * 1024, // 70 MB
 			.flags = IDeviceMemoryAllocation::E_MEMORY_ALLOCATE_FLAGS::EMAF_NONE,
 			.memoryTypeIndex = memoryTypeIdx,
 			.dedication = nullptr,
@@ -131,7 +131,7 @@ void DrawResourcesFiller::allocateMSDFTextures(ILogicalDevice* logicalDevice, ui
 
 	msdfLRUCache = std::unique_ptr<MSDFsLRUCache>(new MSDFsLRUCache(maxMSDFs));
 	msdfTextureArrayIndexAllocator = core::make_smart_refctd_ptr<IndexAllocator>(core::smart_refctd_ptr<ILogicalDevice>(logicalDevice), maxMSDFs);
-	msdfStagedCPUImages.resize(maxMSDFs);
+	msdfImagesState.resize(maxMSDFs);
 }
 
 void DrawResourcesFiller::drawPolyline(const CPolylineBase& polyline, const LineStyleInfo& lineStyleInfo, SIntendedSubmitInfo& intendedNextSubmit)
@@ -368,14 +368,13 @@ uint32_t DrawResourcesFiller::addStaticImage2D(image_id imageID, const core::sma
 	 *   - Ensure safe deallocation of the slot.
 	 *   - Submit any pending draw calls if the evicted image was scheduled to be used in the upcoming submission.
 	 */
-	auto evictionCallback = [&](const ImageReference& evicted)
+	auto evictionCallback = [&](image_id imageID, const ImageReference& evicted)
 	{
 		// Later used to release the image's memory range.
 		core::smart_refctd_ptr<ImageCleanup> cleanupObject = core::make_smart_refctd_ptr<ImageCleanup>();
 		cleanupObject->imagesMemorySuballocator = imagesMemorySubAllocator;
 		cleanupObject->addr = evicted.allocationOffset;
 		cleanupObject->size = evicted.allocationSize;
-		
 
 		const bool imageUsedForNextIntendedSubmit = (evicted.lastUsedFrameIndex == currentFrameIndex);
 		
@@ -401,6 +400,10 @@ uint32_t DrawResourcesFiller::addStaticImage2D(image_id imageID, const core::sma
 			ISemaphore::SWaitInfo deallocationWaitInfo = { .semaphore = intendedNextSubmit.scratchSemaphore.semaphore, .value = intendedNextSubmit.scratchSemaphore.value };
 			suballocatedDescriptorSet->multi_deallocate(imagesArrayBinding, 1u, &evicted.index, deallocationWaitInfo, &cleanupObject.get());
 		}
+
+		// erase imageID from our state map
+		// kindof mirrors the state of the LRUCache
+		staticImagesState.erase(imageID);
 	};
 
 	// Try inserting or updating the image usage in the cache.
@@ -469,12 +472,12 @@ uint32_t DrawResourcesFiller::addStaticImage2D(image_id imageID, const core::sma
 							const bool boundToMemorySuccessfully = device->bindImageMemory({ &bindImageMemoryInfo, 1u });
 							if (boundToMemorySuccessfully)
 							{
+								gpuImage->setObjectDebugName((std::to_string(imageID) + " Static Image 2D").c_str());
 								IGPUImageView::SCreationParams viewParams = {
 									.image = gpuImage,
 									.viewType = IGPUImageView::ET_2D,
 									.format = gpuImage->getCreationParameters().format
 								};
-								gpuImage->setObjectDebugName((std::to_string(imageID) + " Static Image 2D").c_str());
 								gpuImageView = device->createImageView(std::move(viewParams));
 								if (gpuImageView)
 								{
@@ -535,7 +538,7 @@ uint32_t DrawResourcesFiller::addStaticImage2D(image_id imageID, const core::sma
 				const image_id evictionCandidate = imagesUsageCache->select_eviction_candidate();
 				ImageReference* imageRef = imagesUsageCache->peek(evictionCandidate);
 				if (imageRef)
-					evictionCallback(*imageRef);
+					evictionCallback(evictionCandidate, *imageRef);
 				imagesUsageCache->erase(evictionCandidate);
 				while (suballocatedDescriptorSet->cull_frees()) {}; // to make sure deallocation requests in eviction callback are blocked for.
 
@@ -547,14 +550,17 @@ uint32_t DrawResourcesFiller::addStaticImage2D(image_id imageID, const core::sma
 			{
 				inserted->lastUsedFrameIndex = currentFrameIndex; // there was an eviction + auto-submit, we need to update AGAIN
 
-				StaticImagesCopy copyToStage = 
+				StaticImageState newState =
 				{
 					.cpuImage = cpuImage,
 					.gpuImageView = gpuImageView,
+					.allocationOffset = inserted->allocationOffset,
+					.allocationSize = inserted->allocationSize,
 					.arrayIndex = inserted->index,
+					.gpuResident = false,
 				};
 				// printf(std::format("Everything success, ImageID={} ArrayIndex={} \n", imageID, inserted->index).c_str());
-				staticImagesStagedCopies.push_back(copyToStage);
+				staticImagesState.emplace(imageID, newState);
 			}
 			else
 			{
@@ -653,15 +659,126 @@ bool DrawResourcesFiller::pushAllUploads(SIntendedSubmitInfo& intendedNextSubmit
 	{
 		// This means we're in a replay cache scope, use the replay cache to push to GPU instead of internal accumulation
 		success &= pushBufferUploads(intendedNextSubmit, currentReplayCache->resourcesCollection);
-		success &= pushMSDFImagesUploads(intendedNextSubmit, currentReplayCache->msdfStagedCPUImages);
-		// TODO: pushStaticImagesUploads
+		success &= pushMSDFImagesUploads(intendedNextSubmit, currentReplayCache->msdfImagesState);
+
+		// Push Static Images Uploads from replay cache, only those who are not gpu resident
+		auto* device = m_utilities->getLogicalDevice();
+		std::vector<StaticImageCopy> staticImageCopies;
+		for (auto& [id, replayImageState] : currentReplayCache->staticImagesState)
+		{
+			auto it = staticImagesState.find(id);
+			bool alreadyResident = false;
+
+			// compare with existing state, and check whether image id is already resident.
+			if (it != staticImagesState.end())
+			{
+				const StaticImageState& existingState = it->second;
+
+				const bool allocationMatches =
+					existingState.allocationOffset == replayImageState.allocationOffset &&
+					existingState.allocationSize == replayImageState.allocationSize;
+
+				const bool arrayIndexMatches = existingState.arrayIndex == replayImageState.arrayIndex;
+
+				alreadyResident = allocationMatches && arrayIndexMatches && existingState.gpuResident;
+			}
+
+			// if already resident, we don't need to do anything
+			if (alreadyResident)
+				continue;
+
+			bool successCreateNewImage = false;
+
+			// Not already resident, we need to recreate the image and bind the image memory to correct location again, and update the descriptor set and push the uploads
+			auto existingGPUImageViewParams = replayImageState.gpuImageView->getCreationParameters();
+			IGPUImage::SCreationParams imageParams = {};
+			imageParams = existingGPUImageViewParams.image->getCreationParameters();
+
+			auto newGPUImage = device->createImage(std::move(imageParams));
+			if (newGPUImage)
+			{
+				nbl::video::ILogicalDevice::SBindImageMemoryInfo bindImageMemoryInfo =
+				{
+					.image = newGPUImage.get(),
+					.binding = {.memory = imagesMemoryArena.memory.get(), .offset = imagesMemoryArena.offset + replayImageState.allocationOffset }
+				};
+
+				const bool boundToMemorySuccessfully = device->bindImageMemory({ &bindImageMemoryInfo, 1u });
+				if (boundToMemorySuccessfully)
+				{
+					newGPUImage->setObjectDebugName((std::to_string(id) + " Static Image 2D").c_str());
+					IGPUImageView::SCreationParams viewParams = existingGPUImageViewParams;
+					viewParams.image = newGPUImage;
+
+					auto newGPUImageView = device->createImageView(std::move(viewParams));
+					if (newGPUImageView)
+					{
+						successCreateNewImage = true;
+						
+						staticImageCopies.push_back(StaticImageCopy {
+							.cpuImage = replayImageState.cpuImage,
+							.gpuImageView = newGPUImageView,
+							.arrayIndex = replayImageState.arrayIndex
+							});
+
+						newGPUImageView->setObjectDebugName((std::to_string(id) + " Static Image View 2D").c_str());
+					}
+
+				}
+			}
+
+			if (!successCreateNewImage)
+			{
+				// TODO: Log
+				_NBL_DEBUG_BREAK_IF(true);
+				success = false;
+			}
+		}
+
+		bool replayStaticUploadSuccess = true;
+		
+		if (staticImageCopies.size() > 0u)
+		{
+			// We need to block for previous submit in order to safely, rebind image's memory and update the descriptor set array index.
+			// 
+			// [FUTURE_CONSIDERATION]: To avoid stalling the CPU when replaying caches that overflow GPU memory,
+			// we could recreate the image and image view, binding them to entirely new memory locations.
+			// This would require an indirection mechanism in the shader to remap references from cached geometry or objects to the new image array indices.
+			// Note: This isn't a problem if the replayed scene fits in memory and doesn't require overflow submissions due to image memory exhaustion.
+			nbl::video::ISemaphore::SWaitInfo waitInfo = { .semaphore = intendedNextSubmit.scratchSemaphore.semaphore, .value = intendedNextSubmit.scratchSemaphore.value };
+			device->blockForSemaphores({ &waitInfo, 1u });
+			replayStaticUploadSuccess = pushStaticImagesUploads_Internal(intendedNextSubmit, staticImageCopies);
+		}
+
+		if (replayStaticUploadSuccess)
+		{
+			staticImagesState = currentReplayCache->staticImagesState;
+			for (auto& [_, state] : staticImagesState)
+				state.gpuResident = true;
+		}
+
+		success &= replayStaticUploadSuccess;
 	}
 	else
 	{
 		flushDrawObjects();
 		success &= pushBufferUploads(intendedNextSubmit, resourcesCollection);
-		success &= pushMSDFImagesUploads(intendedNextSubmit, msdfStagedCPUImages);
-		success &= pushStaticImagesUploads(intendedNextSubmit);
+		success &= pushMSDFImagesUploads(intendedNextSubmit, msdfImagesState);
+
+		// Push Static Images Uploads, only those who are not gpu resident
+		std::vector<StaticImageCopy> staticImageCopies;
+		for (auto& [id, staticImageState] : staticImagesState)
+		{
+			if (!staticImageState.gpuResident)
+				staticImageCopies.push_back(StaticImageCopy{ .cpuImage = staticImageState.cpuImage, .gpuImageView = staticImageState.gpuImageView, .arrayIndex = staticImageState.arrayIndex });
+		}
+		const bool staticImagesUploadSuccess = pushStaticImagesUploads_Internal(intendedNextSubmit, staticImageCopies);
+		if (staticImagesUploadSuccess)
+		{
+			for (auto& [id, staticImageState] : staticImagesState)
+				staticImageState.gpuResident = true;
+		}
+		success &= staticImagesUploadSuccess;
 	}
 	return success;
 }
@@ -751,11 +868,12 @@ std::unique_ptr<DrawResourcesFiller::ReplayCache> DrawResourcesFiller::createRep
 	flushDrawObjects();
 	std::unique_ptr<ReplayCache> ret = std::unique_ptr<ReplayCache>(new ReplayCache);
 	ret->resourcesCollection = resourcesCollection;
-	ret->msdfStagedCPUImages = msdfStagedCPUImages;
-	for (auto& stagedMSDF : ret->msdfStagedCPUImages)
+	ret->msdfImagesState = msdfImagesState;
+	for (auto& stagedMSDF : ret->msdfImagesState)
 		stagedMSDF.uploadedToGPU = false; // to trigger upload for all msdf functions again.
 	ret->drawCallsData = drawCalls;
 	ret->activeMainObjectIndex = activeMainObjectIndex;
+	ret->staticImagesState = staticImagesState; // copy state of static images
 	return ret;
 }
 
@@ -825,7 +943,7 @@ bool DrawResourcesFiller::pushBufferUploads(SIntendedSubmitInfo& intendedNextSub
 	return true;
 }
 
-bool DrawResourcesFiller::pushMSDFImagesUploads(SIntendedSubmitInfo& intendedNextSubmit, std::vector<MSDFStagedCPUImage>& stagedMSDFCPUImages)
+bool DrawResourcesFiller::pushMSDFImagesUploads(SIntendedSubmitInfo& intendedNextSubmit, std::vector<MSDFImageState>& stagedMSDFCPUImages)
 {
 	auto* cmdBuffInfo = intendedNextSubmit.getCommandBufferForRecording();
 	
@@ -938,7 +1056,7 @@ bool DrawResourcesFiller::pushMSDFImagesUploads(SIntendedSubmitInfo& intendedNex
 			}
 		};
 		commandBuffer->pipelineBarrier(E_DEPENDENCY_FLAGS::EDF_NONE, { .imgBarriers = afterTransferImageBarrier });
-		
+
 		if (!m_hasInitializedMSDFTextureArrays)
 			m_hasInitializedMSDFTextureArrays = true;
 
@@ -951,29 +1069,29 @@ bool DrawResourcesFiller::pushMSDFImagesUploads(SIntendedSubmitInfo& intendedNex
 	}
 }
 
-bool DrawResourcesFiller::pushStaticImagesUploads(SIntendedSubmitInfo& intendedNextSubmit)
+bool DrawResourcesFiller::pushStaticImagesUploads_Internal(SIntendedSubmitInfo& intendedNextSubmit, std::span<StaticImageCopy> staticImagesCopy)
 {
-	auto* device = m_utilities->getLogicalDevice();
-	auto* physDev = m_utilities->getLogicalDevice()->getPhysicalDevice();
-	auto* descriptorSet = suballocatedDescriptorSet->getDescriptorSet();
-	auto* cmdBuffInfo = intendedNextSubmit.getCommandBufferForRecording();
-	
-	if (cmdBuffInfo)
-	{
-		bool success = true;
+	bool success = true;
 
-		if (staticImagesStagedCopies.size() > 0ull)
+	if (staticImagesCopy.size() > 0ull)
+	{
+		auto* device = m_utilities->getLogicalDevice();
+		auto* physDev = m_utilities->getLogicalDevice()->getPhysicalDevice();
+		auto* descriptorSet = suballocatedDescriptorSet->getDescriptorSet();
+		auto* cmdBuffInfo = intendedNextSubmit.getCommandBufferForRecording();
+	
+		if (cmdBuffInfo)
 		{
 			IGPUCommandBuffer* commandBuffer = cmdBuffInfo->cmdbuf;
 
 			// DescriptorSet Updates
 			std::vector<video::IGPUDescriptorSet::SDescriptorInfo> descriptorInfos;
 			std::vector<IGPUDescriptorSet::SWriteDescriptorSet> descriptorWrites;
-			descriptorInfos.resize(staticImagesStagedCopies.size());
-			descriptorWrites.resize(staticImagesStagedCopies.size());
-			for (uint32_t i = 0u; i < staticImagesStagedCopies.size(); ++i)
+			descriptorInfos.resize(staticImagesCopy.size());
+			descriptorWrites.resize(staticImagesCopy.size());
+			for (uint32_t i = 0u; i < staticImagesCopy.size(); ++i)
 			{
-				auto& stagedStaticImage = staticImagesStagedCopies[i];
+				auto& stagedStaticImage = staticImagesCopy[i];
 				// Bind gpu image view to descriptor set
 				descriptorInfos[i].info.image.imageLayout = IImage::LAYOUT::READ_ONLY_OPTIMAL;
 				descriptorInfos[i].desc = stagedStaticImage.gpuImageView;
@@ -989,12 +1107,12 @@ bool DrawResourcesFiller::pushStaticImagesUploads(SIntendedSubmitInfo& intendedN
 			success &= device->updateDescriptorSets(descriptorWrites.size(), descriptorWrites.data(), 0u, nullptr);
 
 			std::vector<IGPUCommandBuffer::SPipelineBarrierDependencyInfo::image_barrier_t> beforeCopyImageBarriers;
-			beforeCopyImageBarriers.resize(staticImagesStagedCopies.size());
+			beforeCopyImageBarriers.resize(staticImagesCopy.size());
 
 			// Pipeline Barriers before stagedStaticImage
-			for (uint32_t i = 0u; i < staticImagesStagedCopies.size(); ++i)
+			for (uint32_t i = 0u; i < staticImagesCopy.size(); ++i)
 			{
-				auto& stagedStaticImage = staticImagesStagedCopies[i];
+				auto& stagedStaticImage = staticImagesCopy[i];
 				const auto& gpuImg = stagedStaticImage.gpuImageView->getCreationParameters().image;
 				beforeCopyImageBarriers[i] =
 				{
@@ -1021,9 +1139,9 @@ bool DrawResourcesFiller::pushStaticImagesUploads(SIntendedSubmitInfo& intendedN
 			}
 			success &= commandBuffer->pipelineBarrier(E_DEPENDENCY_FLAGS::EDF_NONE, { .imgBarriers = beforeCopyImageBarriers });
 
-			for (uint32_t i = 0u; i < staticImagesStagedCopies.size(); ++i)
+			for (uint32_t i = 0u; i < staticImagesCopy.size(); ++i)
 			{
-				auto& stagedStaticImage = staticImagesStagedCopies[i];
+				auto& stagedStaticImage = staticImagesCopy[i];
 				auto& gpuImg = stagedStaticImage.gpuImageView->getCreationParameters().image;
 				success &= m_utilities->updateImageViaStagingBuffer(
 					intendedNextSubmit,
@@ -1035,12 +1153,12 @@ bool DrawResourcesFiller::pushStaticImagesUploads(SIntendedSubmitInfo& intendedN
 			commandBuffer = intendedNextSubmit.getCommandBufferForRecording()->cmdbuf; // overflow-submit in utilities calls might've cause current recording command buffer to change
 
 			std::vector<IGPUCommandBuffer::SPipelineBarrierDependencyInfo::image_barrier_t> afterCopyImageBarriers;
-			afterCopyImageBarriers.resize(staticImagesStagedCopies.size());
+			afterCopyImageBarriers.resize(staticImagesCopy.size());
 
 			// Pipeline Barriers before stagedStaticImage
-			for (uint32_t i = 0u; i < staticImagesStagedCopies.size(); ++i)
+			for (uint32_t i = 0u; i < staticImagesCopy.size(); ++i)
 			{
-				auto& stagedStaticImage = staticImagesStagedCopies[i];
+				auto& stagedStaticImage = staticImagesCopy[i];
 				const auto& gpuImg = stagedStaticImage.gpuImageView->getCreationParameters().image;
 				afterCopyImageBarriers[i] =
 				{
@@ -1067,22 +1185,19 @@ bool DrawResourcesFiller::pushStaticImagesUploads(SIntendedSubmitInfo& intendedN
 			}
 			success &= commandBuffer->pipelineBarrier(E_DEPENDENCY_FLAGS::EDF_NONE, { .imgBarriers = afterCopyImageBarriers });
 		}
-
-		staticImagesStagedCopies.clear();
-		if (!success)
+		else
 		{
-			// TODO: Log
 			_NBL_DEBUG_BREAK_IF(true);
+			success = false;
 		}
-		return success;
-
 	}
-	else
+
+	if (!success)
 	{
 		// TODO: Log
-			_NBL_DEBUG_BREAK_IF(true);
-		return false;
+		_NBL_DEBUG_BREAK_IF(true);
 	}
+	return success;
 }
 
 const size_t DrawResourcesFiller::calculateRemainingResourcesSize() const
@@ -1727,8 +1842,8 @@ uint32_t DrawResourcesFiller::addMSDFTexture(const MSDFInputInfo& msdfInput, cor
 	{
 		// `deallocationWaitInfo` is used to prepare wait info to defer index deallocation until the GPU has finished using the resource.
 		// NOTE: `deallocationWaitInfo` is currently *not* required for correctness because:
-		//   - Both the image upload (msdfStagedCPUImages) and usage occur within the same timeline (`intendedNextSubmit`).
-		//   - timeline semaphores guarantee proper ordering: the next submit's msdfStagedCPUImages will wait on the prior usage.
+		//   - Both the image upload (msdfImagesState) and usage occur within the same timeline (`intendedNextSubmit`).
+		//   - timeline semaphores guarantee proper ordering: the next submit's msdfImagesState will wait on the prior usage.
 		//   - Therefore, we can safely overwrite or reallocate the slot without waiting for explicit GPU completion.
 		//
 		// However, this `deallocationWaitInfo` *will* become essential if we start interacting with MSDF images
@@ -1759,7 +1874,7 @@ uint32_t DrawResourcesFiller::addMSDFTexture(const MSDFInputInfo& msdfInput, cor
 		}
 		
 		// Clear CPU-side metadata associated with the evicted slot.
-		msdfStagedCPUImages[evicted.alloc_idx].evict();
+		msdfImagesState[evicted.alloc_idx].evict();
 	};
 	
 	// We pass nextSemaValue instead of constructing a new MSDFReference and passing it into `insert` that's because we might get a cache hit and only update the value of the nextSema
@@ -1776,9 +1891,9 @@ uint32_t DrawResourcesFiller::addMSDFTexture(const MSDFInputInfo& msdfInput, cor
 
 		if (inserted->alloc_idx != IndexAllocator::AddressAllocator::invalid_address)
 		{
-			// We stage msdfStagedCPUImages, pushMSDFImagesUploads will push it into GPU
-			msdfStagedCPUImages[inserted->alloc_idx].image = std::move(cpuImage);
-			msdfStagedCPUImages[inserted->alloc_idx].uploadedToGPU = false;
+			// We stage msdfImagesState, pushMSDFImagesUploads will push it into GPU
+			msdfImagesState[inserted->alloc_idx].image = std::move(cpuImage);
+			msdfImagesState[inserted->alloc_idx].uploadedToGPU = false;
 		}
 		else
 		{
diff --git a/62_CAD/DrawResourcesFiller.h b/62_CAD/DrawResourcesFiller.h
index a9b5da172..8c95b9a09 100644
--- a/62_CAD/DrawResourcesFiller.h
+++ b/62_CAD/DrawResourcesFiller.h
@@ -304,7 +304,7 @@ struct DrawResourcesFiller
 	/// For advanced use only, (passed to shaders for them to know if we overflow-submitted in the middle if a main obj
 	uint32_t getActiveMainObjectIndex() const;
 
-	struct MSDFStagedCPUImage
+	struct MSDFImageState
 	{
 		core::smart_refctd_ptr<ICPUImage> image;
 		bool uploadedToGPU : 1u;
@@ -352,9 +352,10 @@ struct DrawResourcesFiller
 	/// This enables efficient replays without traversing or re-generating scene content.
 	struct ReplayCache
 	{
-		ResourcesCollection resourcesCollection;
-		std::vector<MSDFStagedCPUImage> msdfStagedCPUImages;
 		std::vector<DrawCallData> drawCallsData;
+		ResourcesCollection resourcesCollection;
+		std::vector<MSDFImageState> msdfImagesState;
+		std::unordered_map<image_id, StaticImageState> staticImagesState;
 		uint32_t activeMainObjectIndex = InvalidMainObjectIdx;
 		// TODO: non msdf general CPU Images
 		// TODO: Get total memory consumption for logging?
@@ -389,11 +390,10 @@ struct DrawResourcesFiller
 	bool pushBufferUploads(SIntendedSubmitInfo& intendedNextSubmit, ResourcesCollection& resourcesCollection);
 	
 	/// @brief Records GPU copy commands for all staged msdf images into the active command buffer.
-	bool pushMSDFImagesUploads(SIntendedSubmitInfo& intendedNextSubmit, std::vector<MSDFStagedCPUImage>& stagedMSDFCPUImages);
+	bool pushMSDFImagesUploads(SIntendedSubmitInfo& intendedNextSubmit, std::vector<MSDFImageState>& msdfImagesState);
 
 	/// @brief Records GPU copy commands for all staged msdf images into the active command buffer.
-	/// TODO: Handle for cache&replay mode later
-	bool pushStaticImagesUploads(SIntendedSubmitInfo& intendedNextSubmit);
+	bool pushStaticImagesUploads_Internal(SIntendedSubmitInfo& intendedNextSubmit, std::span<StaticImageCopy> staticImagesCopy);
 
 	const size_t calculateRemainingResourcesSize() const;
 
@@ -640,7 +640,7 @@ struct DrawResourcesFiller
 	smart_refctd_ptr<IndexAllocator>	msdfTextureArrayIndexAllocator;
 	std::unique_ptr<MSDFsLRUCache>		msdfLRUCache; // LRU Cache to evict Least Recently Used in case of overflow
 
-	std::vector<MSDFStagedCPUImage>		msdfStagedCPUImages = {}; // cached cpu imaged + their status, size equals to LRUCache size
+	std::vector<MSDFImageState>			msdfImagesState = {}; // cached cpu imaged + their status, size equals to LRUCache size
 	static constexpr asset::E_FORMAT	MSDFTextureFormat = asset::E_FORMAT::EF_R8G8B8A8_SNORM;
 	bool m_hasInitializedMSDFTextureArrays = false;
 	
@@ -649,13 +649,8 @@ struct DrawResourcesFiller
 	smart_refctd_ptr<SubAllocatedDescriptorSet> suballocatedDescriptorSet;
 	uint32_t imagesArrayBinding = 0u;
 	
-	// static images (not streamable):
-	struct StaticImagesCopy
-	{
-		core::smart_refctd_ptr<ICPUImage> cpuImage;
-		core::smart_refctd_ptr<IGPUImageView> gpuImageView;
-		uint32_t arrayIndex;
-	};
-	std::vector<StaticImagesCopy> staticImagesStagedCopies;
+	// TODO: consider removing this and just using the `imagesUsageCache` and `ImageReference` when `core::ResizableLRUCache` is copyable and iterable
+	// Current state of the static images, used in `pushStaticImagesUploads` to make StaticImages `gpuResident` and bind them to correct array index
+	std::unordered_map<image_id, StaticImageState> staticImagesState;
 };
 
diff --git a/62_CAD/Images.h b/62_CAD/Images.h
index 7c9609161..d8c6cf864 100644
--- a/62_CAD/Images.h
+++ b/62_CAD/Images.h
@@ -84,6 +84,27 @@ struct ImageCleanup : public core::IReferenceCounted
 
 };
 
+struct StaticImageCopy
+{
+	core::smart_refctd_ptr<ICPUImage> cpuImage;
+	core::smart_refctd_ptr<IGPUImageView> gpuImageView;
+	uint32_t arrayIndex;
+};
+
+// TODO: consider just using the ImagesUsageCache to store this StaticImagesState, i.e. merge this struct with the ImageReference
+//		it will be possible after LRUCache improvements and copyability
+//		for now this will be a mirror of the LRUCache but in an unordered_map
+struct StaticImageState
+{
+	core::smart_refctd_ptr<ICPUImage> cpuImage = nullptr;
+	core::smart_refctd_ptr<IGPUImageView> gpuImageView = nullptr;
+	uint64_t allocationOffset = ImagesMemorySubAllocator::InvalidAddress;
+	uint64_t allocationSize = 0u;
+	uint32_t arrayIndex = ~0u; // in texture array descriptor 
+	bool gpuResident = false;
+};
+
+
 struct ImageReference
 {
 	static constexpr uint32_t InvalidTextureIndex = nbl::hlsl::numeric_limits<uint32_t>::max;
@@ -127,10 +148,17 @@ class ImagesUsageCache
 	// Attempts to insert a new image into the cache.
 	// If the cache is full, invokes the provided `evictCallback` to evict an image.
 	// Returns a pointer to the inserted or existing ImageReference.
-	template<std::invocable<const ImageReference&> EvictionCallback>
+	template<std::invocable<image_id, const ImageReference&> EvictionCallback>
 	inline ImageReference* insert(image_id imageID, uint64_t lastUsedSema, EvictionCallback&& evictCallback)
 	{
-		return lruCache.insert(imageID, lastUsedSema, std::forward<EvictionCallback>(evictCallback));
+		auto lruEvictionCallback = [&](const ImageReference& evicted)
+			{
+				const image_id* evictingKey = lruCache.get_least_recently_used();
+				assert(evictingKey != nullptr);
+				if (evictingKey)
+					evictCallback(*evictingKey, evicted);
+			};
+		return lruCache.insert(imageID, lastUsedSema, lruEvictionCallback);
 	}
 	
 	// Retrieves the image associated with `imageID`, updating its LRU position.
@@ -158,7 +186,7 @@ class ImagesUsageCache
 		{
 			// we shouldn't select eviction candidate if lruCache is empty
 			_NBL_DEBUG_BREAK_IF(true);
-			return 0ull;
+			return ~0ull;
 		}
 	}
 	

From 2632c3abe127a226c1593a47491a381f32762680 Mon Sep 17 00:00:00 2001
From: Erfan Ahmadi <ahmadierfan99@gmail.com>
Date: Sun, 18 May 2025 14:59:00 +0400
Subject: [PATCH 263/529] use OrientedBoundingBox2D for images

---
 62_CAD/DrawResourcesFiller.cpp | 8 ++++----
 62_CAD/DrawResourcesFiller.h   | 2 +-
 62_CAD/main.cpp                | 3 +--
 62_CAD/shaders/globals.hlsl    | 8 ++++++++
 4 files changed, 14 insertions(+), 7 deletions(-)

diff --git a/62_CAD/DrawResourcesFiller.cpp b/62_CAD/DrawResourcesFiller.cpp
index b386f02a8..f50e8f317 100644
--- a/62_CAD/DrawResourcesFiller.cpp
+++ b/62_CAD/DrawResourcesFiller.cpp
@@ -623,16 +623,16 @@ void DrawResourcesFiller::drawGridDTM(
 	endMainObject();
 }
 
-void DrawResourcesFiller::addImageObject(image_id imageID, float64_t2 topLeftPos, float32_t2 size, float32_t rotation, SIntendedSubmitInfo& intendedNextSubmit)
+void DrawResourcesFiller::addImageObject(image_id imageID, const OrientedBoundingBox2D& obb, SIntendedSubmitInfo& intendedNextSubmit)
 {
 	beginMainObject(MainObjectType::IMAGE);
 
 	uint32_t mainObjIdx = acquireActiveMainObjectIndex_SubmitIfNeeded(intendedNextSubmit);
 
 	ImageObjectInfo info = {};
-	info.topLeft = topLeftPos;
-	info.dirU = float32_t2(size.x * cos(rotation), size.x * sin(rotation)); // 
-	info.aspectRatio = size.y / size.x;
+	info.topLeft = obb.topLeft;
+	info.dirU = obb.dirU;
+	info.aspectRatio = obb.aspectRatio;
 	info.textureID = getImageIndexFromID(imageID, intendedNextSubmit); // for this to be valid and safe, this function needs to be called immediately after `addStaticImage` function to make sure image is in memory
 	if (!addImageObject_Internal(info, mainObjIdx))
 	{
diff --git a/62_CAD/DrawResourcesFiller.h b/62_CAD/DrawResourcesFiller.h
index 8c95b9a09..5501e4c84 100644
--- a/62_CAD/DrawResourcesFiller.h
+++ b/62_CAD/DrawResourcesFiller.h
@@ -246,7 +246,7 @@ struct DrawResourcesFiller
 	uint32_t addStaticImage2D(image_id imageID, const core::smart_refctd_ptr<ICPUImage>& cpuImage, SIntendedSubmitInfo& intendedNextSubmit);
 
 	// This function must be called immediately after `addStaticImage` for the same imageID.
-	void addImageObject(image_id imageID, float64_t2 topLeftPos, float32_t2 size, float32_t rotation, SIntendedSubmitInfo& intendedNextSubmit);
+	void addImageObject(image_id imageID, const OrientedBoundingBox2D& obb, SIntendedSubmitInfo& intendedNextSubmit);
 	
 	/// @brief call this function before submitting to ensure all buffer and textures resourcesCollection requested via drawing calls are copied to GPU
 	/// records copy command into intendedNextSubmit's active command buffer and might possibly submits if fails allocation on staging upload memory.
diff --git a/62_CAD/main.cpp b/62_CAD/main.cpp
index 238dbedb6..1394bf719 100644
--- a/62_CAD/main.cpp
+++ b/62_CAD/main.cpp
@@ -2907,8 +2907,7 @@ class ComputerAidedDesign final : public examples::SimpleWindowedApplication, pu
 				uint64_t imageID = i * 69ull; // it can be hash or something of the file path the image was loaded from
 				//printf(std::format("\n Image {} \n", i).c_str());
 				drawResourcesFiller.addStaticImage2D(imageID, sampleImages[i], intendedNextSubmit);
-				drawResourcesFiller.addImageObject(imageID, { 0.0 + (i) * 3.0, 0.0 }, { 3.0 , 3.0 }, 0.0, intendedNextSubmit);
-				// drawResourcesFiller.addImageObject(imageID, { 40.0, +40.0 }, { 100.0, 100.0 }, 0.0, intendedNextSubmit);
+				drawResourcesFiller.addImageObject(imageID, { .topLeft = { 0.0 + (i) * 3.0, 0.0 }, .dirU = { 3.0 , 0.0 }, .aspectRatio = 1.0 }, intendedNextSubmit);
 				//printf("\n");
 			}
 			LineStyleInfo lineStyle = 
diff --git a/62_CAD/shaders/globals.hlsl b/62_CAD/shaders/globals.hlsl
index 538387491..0280b5881 100644
--- a/62_CAD/shaders/globals.hlsl
+++ b/62_CAD/shaders/globals.hlsl
@@ -528,6 +528,14 @@ NBL_CONSTEXPR float MSDFSize = 32.0f;
 NBL_CONSTEXPR uint32_t MSDFMips = 4; 
 NBL_CONSTEXPR float HatchFillMSDFSceenSpaceSize = 8.0; 
 
+// Used in CPU-side only for now
+struct OrientedBoundingBox2D
+{
+    pfloat64_t2 topLeft; // 2 * 8 = 16 bytes (16)
+    float32_t2 dirU; // 2 * 4 = 8 bytes (24)
+    float32_t aspectRatio; // 4 bytes (28)
+};
+
 #ifdef __HLSL_VERSION
 [[vk::binding(0, 0)]] ConstantBuffer<Globals> globals : register(b0);
 

From b4c5ff2f37489357648bc6b629970d4767ceabd9 Mon Sep 17 00:00:00 2001
From: keptsecret <sorchon@gmail.com>
Date: Mon, 19 May 2025 13:51:10 +0700
Subject: [PATCH 264/529] use asset converter to build tlas

---
 71_RayTracingPipeline/main.cpp | 406 +++++++++++++++++++++++++++++----
 1 file changed, 364 insertions(+), 42 deletions(-)

diff --git a/71_RayTracingPipeline/main.cpp b/71_RayTracingPipeline/main.cpp
index 528b2c314..faa392a46 100644
--- a/71_RayTracingPipeline/main.cpp
+++ b/71_RayTracingPipeline/main.cpp
@@ -1225,41 +1225,41 @@ class RaytracingPipelineApp final : public examples::SimpleWindowedApplication,
 
 		// triangles geometries
 		const auto cpuObjects = std::array{
-		  ReferenceObjectCpu {
-			.meta = {.type = OT_RECTANGLE, .name = "Plane Mesh"},
-			.data = gc->createRectangleMesh(nbl::core::vector2df_SIMD(10, 10)),
-			.material = defaultMaterial,
-			.transform = planeTransform,
-		  },
-		  ReferenceObjectCpu {
-			.meta = {.type = OT_CUBE, .name = "Cube Mesh"},
-			.data = gc->createCubeMesh(nbl::core::vector3df(1, 1, 1)),
-			.material = defaultMaterial,
-			.transform = getTranslationMatrix(0, 0.5f, 0),
-		  },
-		  ReferenceObjectCpu {
-			.meta = {.type = OT_CUBE, .name = "Cube Mesh 2"},
-			.data = gc->createCubeMesh(nbl::core::vector3df(1.5, 1.5, 1.5)),
-			.material = Material{
-			  .ambient = {0.1, 0.1, 0.2},
-			  .diffuse = {0.2, 0.2, 0.8},
-			  .specular = {0.8, 0.8, 0.8},
-			  .shininess = 1.0f,
+			ReferenceObjectCpu {
+				.meta = {.type = OT_RECTANGLE, .name = "Plane Mesh"},
+				.data = gc->createRectangleMesh(nbl::core::vector2df_SIMD(10, 10)),
+				.material = defaultMaterial,
+				.transform = planeTransform,
 			},
-			.transform = getTranslationMatrix(-5.0f, 1.0f, 0),
-		  },
-		  ReferenceObjectCpu {
-			.meta = {.type = OT_CUBE, .name = "Transparent Cube Mesh"},
-			.data = gc->createCubeMesh(nbl::core::vector3df(1.5, 1.5, 1.5)),
-			.material = Material{
-			  .ambient = {0.1, 0.2, 0.1},
-			  .diffuse = {0.2, 0.8, 0.2},
-			  .specular = {0.8, 0.8, 0.8},
-			  .shininess = 1.0f,
-			  .alpha = 0.2,
+			ReferenceObjectCpu {
+				.meta = {.type = OT_CUBE, .name = "Cube Mesh"},
+				.data = gc->createCubeMesh(nbl::core::vector3df(1, 1, 1)),
+				.material = defaultMaterial,
+				.transform = getTranslationMatrix(0, 0.5f, 0),
+			},
+			ReferenceObjectCpu {
+				.meta = {.type = OT_CUBE, .name = "Cube Mesh 2"},
+				.data = gc->createCubeMesh(nbl::core::vector3df(1.5, 1.5, 1.5)),
+				.material = Material{
+					.ambient = {0.1, 0.1, 0.2},
+					.diffuse = {0.2, 0.2, 0.8},
+					.specular = {0.8, 0.8, 0.8},
+					.shininess = 1.0f,
+				},
+				.transform = getTranslationMatrix(-5.0f, 1.0f, 0),
+			},
+			ReferenceObjectCpu {
+				.meta = {.type = OT_CUBE, .name = "Transparent Cube Mesh"},
+				.data = gc->createCubeMesh(nbl::core::vector3df(1.5, 1.5, 1.5)),
+				.material = Material{
+					.ambient = {0.1, 0.2, 0.1},
+					.diffuse = {0.2, 0.8, 0.2},
+					.specular = {0.8, 0.8, 0.8},
+					.shininess = 1.0f,
+					.alpha = 0.2,
+				},
+				.transform = getTranslationMatrix(5.0f, 1.0f, 0),
 			},
-			.transform = getTranslationMatrix(5.0f, 1.0f, 0),
-		  },
 		};
 
 		struct CPUTriBufferBindings
@@ -1313,14 +1313,14 @@ class RaytracingPipelineApp final : public examples::SimpleWindowedApplication,
 		{
 			const auto middle_i = NumberOfProceduralGeometries / 2.0;
 			SProceduralGeomInfo sphere = {
-			  .material = hlsl::_static_cast<MaterialPacked>(Material{
-				.ambient = {0.1, 0.05 * i, 0.1},
-				.diffuse = {0.3, 0.2 * i, 0.3},
-				.specular = {0.8, 0.8, 0.8},
-				.shininess = 1.0f,
-			  }),
-			  .center = float32_t3((i - middle_i) * 4.0, 2, 5.0),
-			  .radius = 1,
+					.material = hlsl::_static_cast<MaterialPacked>(Material{
+					.ambient = {0.1, 0.05 * i, 0.1},
+					.diffuse = {0.3, 0.2 * i, 0.3},
+					.specular = {0.8, 0.8, 0.8},
+					.shininess = 1.0f,
+				}),
+				.center = float32_t3((i - middle_i) * 4.0, 2, 5.0),
+				.radius = 1,
 			};
 
 			proceduralGeoms.push_back(sphere);
@@ -1332,11 +1332,333 @@ class RaytracingPipelineApp final : public examples::SimpleWindowedApplication,
 			};
 		}
 
+		{
+			IGPUBuffer::SCreationParams params;
+			params.usage = IGPUBuffer::EUF_STORAGE_BUFFER_BIT | IGPUBuffer::EUF_TRANSFER_DST_BIT | IGPUBuffer::EUF_INLINE_UPDATE_VIA_CMDBUF | IGPUBuffer::EUF_SHADER_DEVICE_ADDRESS_BIT;
+			params.size = proceduralGeoms.size() * sizeof(SProceduralGeomInfo);
+			m_utils->createFilledDeviceLocalBufferOnDedMem(SIntendedSubmitInfo{ .queue = queue }, std::move(params), proceduralGeoms.data()).move_into(m_proceduralGeomInfoBuffer);
+		}
+
 		// get ICPUBuffers into ICPUBLAS
+		// TODO use one BLAS and multiple triangles/aabbs in one
+		const auto blasCount = std::size(cpuObjects) + 1;
+		const auto proceduralBlasIdx = std::size(cpuObjects);
+
+		std::array<smart_refctd_ptr<ICPUBottomLevelAccelerationStructure>, std::size(cpuObjects)+1u> cpuBlas;
+		for (uint32_t i = 0; i < blasCount; i++)
+		{
+			auto& blas = cpuBlas[i];
+			blas = make_smart_refctd_ptr<ICPUBottomLevelAccelerationStructure>();
+
+			if (i == proceduralBlasIdx)
+			{
+				auto aabbs = make_refctd_dynamic_array<smart_refctd_dynamic_array<ICPUBottomLevelAccelerationStructure::AABBs<ICPUBuffer>>>(1u);
+				auto primitiveCounts = make_refctd_dynamic_array<smart_refctd_dynamic_array<uint32_t>>(1u);
+
+				auto& aabb = aabbs->front();
+				auto& primCount = primitiveCounts->front();
+				
+				primCount = NumberOfProceduralGeometries;
+				aabb.data = { .offset = 0, .buffer = cpuProcBuffer };
+				aabb.stride = sizeof(IGPUBottomLevelAccelerationStructure::AABB_t);
+				aabb.geometryFlags = IGPUBottomLevelAccelerationStructure::GEOMETRY_FLAGS::OPAQUE_BIT; // only allow opaque for now
+
+				blas->setGeometries(std::move(aabbs), std::move(primitiveCounts));
+			}
+			else
+			{
+				auto triangles = make_refctd_dynamic_array<smart_refctd_dynamic_array<ICPUBottomLevelAccelerationStructure::Triangles<ICPUBuffer>>>(1u);
+				auto primitiveCounts = make_refctd_dynamic_array<smart_refctd_dynamic_array<uint32_t>>(1u);
+
+				auto& tri = triangles->front();
+				auto& primCount = primitiveCounts->front();
+				const auto& geom = cpuObjects[i];
+				const auto& cpuBuf = cpuTriBuffers[i];
+
+				const bool useIndex = geom.data.indexType != EIT_UNKNOWN;
+				const uint32_t vertexStride = geom.data.inputParams.bindings[0].stride;
+				const uint32_t numVertices = cpuBuf.vertex.buffer->getSize() / vertexStride;
+
+				if (useIndex)
+					primCount = geom.data.indexCount / 3;
+				else
+					primCount = numVertices / 3;
+
+				tri.vertexData[0] = cpuBuf.vertex;
+				tri.indexData = useIndex ? cpuBuf.index : cpuBuf.vertex;
+				tri.maxVertex = numVertices - 1;
+				tri.vertexStride = vertexStride;
+				tri.vertexFormat = EF_R32G32B32_SFLOAT;
+				tri.indexType = geom.data.indexType;
+				tri.geometryFlags = geom.material.isTransparent() ?
+					IGPUBottomLevelAccelerationStructure::GEOMETRY_FLAGS::NO_DUPLICATE_ANY_HIT_INVOCATION_BIT :
+					IGPUBottomLevelAccelerationStructure::GEOMETRY_FLAGS::OPAQUE_BIT;
+
+				blas->setGeometries(std::move(triangles), std::move(primitiveCounts));
+			}
+
+			auto blasFlags = bitflag(IGPUBottomLevelAccelerationStructure::BUILD_FLAGS::PREFER_FAST_TRACE_BIT) | IGPUBottomLevelAccelerationStructure::BUILD_FLAGS::ALLOW_COMPACTION_BIT;
+			if (i == proceduralBlasIdx)
+				blasFlags |= IGPUBottomLevelAccelerationStructure::BUILD_FLAGS::GEOMETRY_TYPE_IS_AABB_BIT;
+
+			blas->setBuildFlags(blasFlags);
+			blas->setContentHash(blas->computeContentHash());
+		}
+
+		auto geomInfoBuffer = ICPUBuffer::create({ std::size(cpuObjects) * sizeof(STriangleGeomInfo) });
+		STriangleGeomInfo* geomInfos = reinterpret_cast<STriangleGeomInfo*>(geomInfoBuffer->getPointer());
 
 		// get ICPUBLAS into ICPUTLAS
+		auto geomInstances = make_refctd_dynamic_array<smart_refctd_dynamic_array<ICPUTopLevelAccelerationStructure::PolymorphicInstance>>(blasCount);
+		{
+			uint32_t i = 0;
+			for (auto instance = geomInstances->begin(); instance != geomInstances->end(); instance++, i++)
+			{
+				const auto isProceduralInstance = i == proceduralBlasIdx;
+				ICPUTopLevelAccelerationStructure::StaticInstance inst;
+				inst.base.blas = cpuBlas[i];
+				inst.base.flags = static_cast<uint32_t>(IGPUTopLevelAccelerationStructure::INSTANCE_FLAGS::TRIANGLE_FACING_CULL_DISABLE_BIT);
+				inst.base.instanceCustomIndex = i;
+				inst.base.instanceShaderBindingTableRecordOffset = isProceduralInstance ? 2 : 0;;
+				inst.base.mask = 0xFF;
+				inst.transform = isProceduralInstance ? matrix3x4SIMD() : cpuObjects[i].transform;
+
+				instance->instance = inst;
+			}
+		}
+
+		auto cpuTlas = make_smart_refctd_ptr<ICPUTopLevelAccelerationStructure>();
+		cpuTlas->setInstances(std::move(geomInstances));
+		cpuTlas->setBuildFlags(IGPUTopLevelAccelerationStructure::BUILD_FLAGS::PREFER_FAST_TRACE_BIT);
+
+//#define TEST_REBAR_FALLBACK
+		// convert with asset converter
+		smart_refctd_ptr<CAssetConverter> converter = CAssetConverter::create({ .device = m_device.get(), .optimizer = {} });
+		struct MyInputs : CAssetConverter::SInputs
+		{
+#ifndef TEST_REBAR_FALLBACK
+			inline uint32_t constrainMemoryTypeBits(const size_t groupCopyID, const IAsset* canonicalAsset, const blake3_hash_t& contentHash, const IDeviceMemoryBacked* memoryBacked) const override
+			{
+				assert(memoryBacked);
+				return memoryBacked->getObjectType() != IDeviceMemoryBacked::EOT_BUFFER ? (~0u) : rebarMemoryTypes;
+			}
+#endif
+			uint32_t rebarMemoryTypes;
+		} inputs = {};
+		inputs.logger = m_logger.get();
+		inputs.rebarMemoryTypes = m_physicalDevice->getDirectVRAMAccessMemoryTypeBits();
+#ifndef TEST_REBAR_FALLBACK
+		struct MyAllocator final : public IDeviceMemoryAllocator
+		{
+			ILogicalDevice* getDeviceForAllocations() const override { return device; }
+
+			SAllocation allocate(const SAllocateInfo& info) override
+			{
+				auto retval = device->allocate(info);
+				// map what is mappable by default so ReBAR checks succeed
+				if (retval.isValid() && retval.memory->isMappable())
+					retval.memory->map({ .offset = 0,.length = info.size });
+				return retval;
+			}
+
+			ILogicalDevice* device;
+		} myalloc;
+		myalloc.device = m_device.get();
+		inputs.allocator = &myalloc;
+#endif
+
+		std::array<ICPUTopLevelAccelerationStructure*, 1u> tmpTlas;
+		std::array<ICPUBuffer*, 2 * std::size(cpuObjects) + 1u> tmpBuffers;
+		{
+			tmpTlas[0] = cpuTlas.get();
+			for (uint32_t i = 0; i < cpuObjects.size(); i++)
+			{
+				tmpBuffers[2 * i + 0] = cpuTriBuffers[i].vertex.buffer.get();
+				tmpBuffers[2 * i + 1] = cpuTriBuffers[i].index.buffer.get();
+			}
+			tmpBuffers[2 * proceduralBlasIdx] = cpuProcBuffer.get();
+
+			std::get<CAssetConverter::SInputs::asset_span_t<ICPUTopLevelAccelerationStructure>>(inputs.assets) = tmpTlas;
+			std::get<CAssetConverter::SInputs::asset_span_t<ICPUBuffer>>(inputs.assets) = tmpBuffers;
+		}
+
+		auto reservation = converter->reserve(inputs);
+		{
+			auto prepass = [&]<typename asset_type_t>(const auto & references) -> bool
+			{
+				auto objects = reservation.getGPUObjects<asset_type_t>();
+				uint32_t counter = {};
+				for (auto& object : objects)
+				{
+					auto gpu = object.value;
+					auto* reference = references[counter];
+
+					if (reference)
+					{
+						if (!gpu)
+						{
+							m_logger->log("Failed to convert a CPU object to GPU!", ILogger::ELL_ERROR);
+							return false;
+						}
+					}
+					counter++;
+				}
+				return true;
+			};
+
+			prepass.template operator() < ICPUTopLevelAccelerationStructure > (tmpTlas);
+			prepass.template operator() < ICPUBuffer > (tmpBuffers);
+		}
+
+		constexpr auto XferBufferCount = 2;
+		std::array<smart_refctd_ptr<IGPUCommandBuffer>, XferBufferCount> xferBufs = {};
+		std::array<IQueue::SSubmitInfo::SCommandBufferInfo, XferBufferCount> xferBufInfos = {};
+		{
+			auto pool = m_device->createCommandPool(getTransferUpQueue()->getFamilyIndex(), IGPUCommandPool::CREATE_FLAGS::RESET_COMMAND_BUFFER_BIT | IGPUCommandPool::CREATE_FLAGS::TRANSIENT_BIT);
+			pool->createCommandBuffers(IGPUCommandPool::BUFFER_LEVEL::PRIMARY, xferBufs);
+			xferBufs.front()->begin(IGPUCommandBuffer::USAGE::ONE_TIME_SUBMIT_BIT);
+			for (auto i = 0; i < XferBufferCount; i++)
+				xferBufInfos[i].cmdbuf = xferBufs[i].get();
+		}
+		auto xferSema = m_device->createSemaphore(0u);
+		SIntendedSubmitInfo transfer = {};
+		transfer.queue = getTransferUpQueue();
+		transfer.scratchCommandBuffers = xferBufInfos;
+		transfer.scratchSemaphore = {
+			.semaphore = xferSema.get(),
+			.value = 0u,
+			.stageMask = PIPELINE_STAGE_FLAGS::ALL_TRANSFER_BITS
+		};
+
+		constexpr auto CompBufferCount = 2;
+		std::array<smart_refctd_ptr<IGPUCommandBuffer>, CompBufferCount> compBufs = {};
+		std::array<IQueue::SSubmitInfo::SCommandBufferInfo, CompBufferCount> compBufInfos = {};
+		{
+			auto pool = m_device->createCommandPool(getComputeQueue()->getFamilyIndex(), IGPUCommandPool::CREATE_FLAGS::RESET_COMMAND_BUFFER_BIT | IGPUCommandPool::CREATE_FLAGS::TRANSIENT_BIT);
+			pool->createCommandBuffers(IGPUCommandPool::BUFFER_LEVEL::PRIMARY, compBufs);
+			compBufs.front()->begin(IGPUCommandBuffer::USAGE::ONE_TIME_SUBMIT_BIT);
+			for (auto i = 0; i < CompBufferCount; i++)
+				compBufInfos[i].cmdbuf = compBufs[i].get();
+		}
+		auto compSema = m_device->createSemaphore(0u);
+		SIntendedSubmitInfo compute = {};
+		compute.queue = getComputeQueue();
+		compute.scratchCommandBuffers = compBufInfos;
+		compute.scratchSemaphore = {
+			.semaphore = compSema.get(),
+			.value = 0u,
+			.stageMask = PIPELINE_STAGE_FLAGS::ACCELERATION_STRUCTURE_BUILD_BIT | PIPELINE_STAGE_FLAGS::ACCELERATION_STRUCTURE_COPY_BIT
+		};
+		// convert
+		{
+			smart_refctd_ptr<CAssetConverter::SConvertParams::scratch_for_device_AS_build_t> scratchAlloc;
+			{
+				constexpr auto MaxAlignment = 256;
+				constexpr auto MinAllocationSize = 1024;
+				const auto scratchSize = core::alignUp(reservation.getMinASBuildScratchSize(false), MaxAlignment);
+
+
+				IGPUBuffer::SCreationParams creationParams = {};
+				creationParams.size = scratchSize;
+				creationParams.usage = IGPUBuffer::EUF_ACCELERATION_STRUCTURE_BUILD_INPUT_READ_ONLY_BIT | IGPUBuffer::EUF_SHADER_DEVICE_ADDRESS_BIT | IGPUBuffer::EUF_STORAGE_BUFFER_BIT;
+#ifdef TEST_REBAR_FALLBACK
+				creationParams.usage |= IGPUBuffer::EUF_TRANSFER_DST_BIT;
+				core::unordered_set<uint32_t> sharingSet = { compute.queue->getFamilyIndex(),transfer.queue->getFamilyIndex() };
+				core::vector<uint32_t> sharingIndices(sharingSet.begin(), sharingSet.end());
+				if (sharingIndices.size() > 1)
+					creationParams.queueFamilyIndexCount = sharingIndices.size();
+				creationParams.queueFamilyIndices = sharingIndices.data();
+#endif
+				auto scratchBuffer = m_device->createBuffer(std::move(creationParams));
+
+				auto reqs = scratchBuffer->getMemoryReqs();
+#ifndef TEST_REBAR_FALLBACK
+				reqs.memoryTypeBits &= m_physicalDevice->getDirectVRAMAccessMemoryTypeBits();
+#endif
+				auto allocation = m_device->allocate(reqs, scratchBuffer.get(), IDeviceMemoryAllocation::EMAF_DEVICE_ADDRESS_BIT);
+#ifndef TEST_REBAR_FALLBACK
+				allocation.memory->map({ .offset = 0,.length = reqs.size });
+#endif
+
+				scratchAlloc = make_smart_refctd_ptr<CAssetConverter::SConvertParams::scratch_for_device_AS_build_t>(
+					SBufferRange<video::IGPUBuffer>{0ull, scratchSize, std::move(scratchBuffer)},
+					core::allocator<uint8_t>(), MaxAlignment, MinAllocationSize
+				);
+			}
+
+			struct MyParams final : CAssetConverter::SConvertParams
+			{
+				inline uint32_t getFinalOwnerQueueFamily(const IGPUBuffer* buffer, const core::blake3_hash_t& createdFrom) override
+				{
+					return finalUser;
+				}
+				inline uint32_t getFinalOwnerQueueFamily(const IGPUAccelerationStructure* image, const core::blake3_hash_t& createdFrom) override
+				{
+					return finalUser;
+				}
+
+				uint8_t finalUser;
+			} params = {};
+#undef TEST_REBAR_FALLBACK
+			params.utilities = m_utils.get();
+			params.transfer = &transfer;
+			params.compute = &compute;
+			params.scratchForDeviceASBuild = scratchAlloc.get();
+			params.finalUser = queue->getFamilyIndex();
+
+			auto future = reservation.convert(params);
+			if (future.copy() != IQueue::RESULT::SUCCESS)
+			{
+				m_logger->log("Failed to await submission feature!", ILogger::ELL_ERROR);
+				return false;
+			}
+
+			// assign gpu objects to output
+			auto&& tlases = reservation.getGPUObjects<ICPUTopLevelAccelerationStructure>();
+			m_gpuTlas = tlases[0].value;
+			auto&& buffers = reservation.getGPUObjects<ICPUBuffer>();
+			for (uint32_t i = 0; i < cpuObjects.size(); i++)
+			{
+				auto& cpuObject = cpuObjects[i];
+
+				m_gpuTriangleGeometries.push_back(ReferenceObjectGpu{
+				  .meta = cpuObject.meta,
+				  .bindings = {
+					.vertex = {.offset = 0, .buffer = buffers[2 * i + 0].value },
+					.index = {.offset = 0, .buffer = buffers[2 * i + 1].value },
+				  },
+				  .vertexStride = cpuObject.data.inputParams.bindings[0].stride,
+				  .indexType = cpuObject.data.indexType,
+				  .indexCount = cpuObject.data.indexCount,
+				  .material = hlsl::_static_cast<MaterialPacked>(cpuObject.material),
+				  .transform = cpuObject.transform,
+					});
+			}
+			m_proceduralAabbBuffer = buffers[2 * proceduralBlasIdx].value;
+
+			for (uint32_t i = 0; i < m_gpuTriangleGeometries.size(); i++)
+			{
+				const auto& gpuObject = m_gpuTriangleGeometries[i];
+				const uint64_t vertexBufferAddress = gpuObject.bindings.vertex.buffer->getDeviceAddress();
+				geomInfos[i] = {
+				  .material = gpuObject.material,
+				  .vertexBufferAddress = vertexBufferAddress,
+				  .indexBufferAddress = gpuObject.useIndex() ? gpuObject.bindings.index.buffer->getDeviceAddress() : vertexBufferAddress,
+				  .vertexStride = gpuObject.vertexStride,
+				  .objType = gpuObject.meta.type,
+				  .indexType = gpuObject.indexType,
+				  .smoothNormals = s_smoothNormals[gpuObject.meta.type],
+				};
+			}
+		}
+
+		{
+			IGPUBuffer::SCreationParams params;
+			params.usage = IGPUBuffer::EUF_STORAGE_BUFFER_BIT | IGPUBuffer::EUF_TRANSFER_DST_BIT | IGPUBuffer::EUF_INLINE_UPDATE_VIA_CMDBUF | IGPUBuffer::EUF_SHADER_DEVICE_ADDRESS_BIT;
+			params.size = geomInfoBuffer->getSize();
+			m_utils->createFilledDeviceLocalBufferOnDedMem(SIntendedSubmitInfo{ .queue = queue }, std::move(params), geomInfos).move_into(m_triangleGeomInfoBuffer);
+		}
 
-		// reserve, convert
 		return true;
 	}
 #else

From 086c21e3c5237149cc82289d0920f61940eb9d00 Mon Sep 17 00:00:00 2001
From: keptsecret <sorchon@gmail.com>
Date: Tue, 20 May 2025 11:02:23 +0700
Subject: [PATCH 265/529] use bda in unit test

---
 .../app_resources/shaderCommon.hlsl           | 25 ++++---
 .../app_resources/testWorkgroup.comp.hlsl     | 13 ++--
 .../app_resources/workgroupCommon.hlsl        | 11 ++--
 23_Arithmetic2UnitTest/main.cpp               | 66 ++++++++-----------
 4 files changed, 61 insertions(+), 54 deletions(-)

diff --git a/23_Arithmetic2UnitTest/app_resources/shaderCommon.hlsl b/23_Arithmetic2UnitTest/app_resources/shaderCommon.hlsl
index 376f69579..45a1f8097 100644
--- a/23_Arithmetic2UnitTest/app_resources/shaderCommon.hlsl
+++ b/23_Arithmetic2UnitTest/app_resources/shaderCommon.hlsl
@@ -16,10 +16,13 @@ uint32_t3 nbl::hlsl::glsl::gl_WorkGroupSize() {return uint32_t3(WORKGROUP_SIZE,1
 
 typedef vector<uint32_t, ITEMS_PER_INVOCATION> type_t;
 
-// unfortunately DXC chokes on descriptors as static members
-// https://github.com/microsoft/DirectXShaderCompiler/issues/5940
-[[vk::binding(0, 0)]] StructuredBuffer<type_t> inputValue;
-[[vk::binding(1, 0)]] RWByteAddressBuffer output[8];
+struct PushConstantData
+{
+    uint64_t inputBufAddress;
+    uint64_t outputAddressBufAddress;
+};
+
+[[vk::push_constant]] PushConstantData pc;
 
 // because subgroups don't match `gl_LocalInvocationIndex` snake curve addressing, we also can't load inputs that way
 uint32_t globalIndex();
@@ -41,19 +44,25 @@ static void subtest(NBL_CONST_REF_ARG(type_t) sourceVal)
     using config_t = nbl::hlsl::subgroup2::Configuration<SUBGROUP_SIZE_LOG2>;
     using params_t = nbl::hlsl::subgroup2::ArithmeticParams<config_t, typename binop<T>::base_t, N, nbl::hlsl::jit::device_capabilities>;
 
+    const uint64_t outputBufAddr = vk::RawBufferLoad<uint64_t>(pc.outputAddressBufAddress + binop<T>::BindingIndex * sizeof(uint64_t), sizeof(uint64_t));
+
     if (globalIndex()==0u)
-        output[binop<T>::BindingIndex].template Store<uint32_t>(0,nbl::hlsl::glsl::gl_SubgroupSize());
-        
+        vk::RawBufferStore<uint32_t>(outputBufAddr, nbl::hlsl::glsl::gl_SubgroupSize());
+
     operation_t<params_t> func;
+    type_t val = func(sourceVal);
     if (canStore())
-        output[binop<T>::BindingIndex].template Store<type_t>(sizeof(uint32_t)+sizeof(type_t)*globalIndex(),func(sourceVal));
+        [unroll]
+        for (uint32_t i = 0; i < N; i++)
+            vk::RawBufferStore<uint32_t>(outputBufAddr+sizeof(uint32_t)+sizeof(type_t)*globalIndex()+i*sizeof(uint32_t), val[i]);
+        // vk::RawBufferStore<dtype_t>(outputBufAddr + sizeof(uint32_t) + sizeof(dtype_t) * globalIndex(), value, sizeof(uint32_t)); TODO why won't this work???
 }
 
 
 type_t test()
 {
     const uint32_t idx = globalIndex();
-    type_t sourceVal = inputValue[idx];
+    type_t sourceVal = vk::RawBufferLoad<type_t>(pc.inputBufAddress + idx * sizeof(type_t));
 
     subtest<bit_and, uint32_t, ITEMS_PER_INVOCATION>(sourceVal);
     subtest<bit_xor, uint32_t, ITEMS_PER_INVOCATION>(sourceVal);
diff --git a/23_Arithmetic2UnitTest/app_resources/testWorkgroup.comp.hlsl b/23_Arithmetic2UnitTest/app_resources/testWorkgroup.comp.hlsl
index 3aafc0aa7..9debd184d 100644
--- a/23_Arithmetic2UnitTest/app_resources/testWorkgroup.comp.hlsl
+++ b/23_Arithmetic2UnitTest/app_resources/testWorkgroup.comp.hlsl
@@ -12,13 +12,17 @@ struct DataProxy
     void get(const uint32_t ix, NBL_REF_ARG(dtype_t) value)
     {
         const uint32_t workgroupOffset = nbl::hlsl::glsl::gl_WorkGroupID().x * Config::VirtualWorkgroupSize;
-        value = inputValue[workgroupOffset + ix];
+        value = vk::RawBufferLoad<dtype_t>(pc.inputBufAddress + (workgroupOffset + ix) * sizeof(dtype_t));
     }
     template<typename AccessType>
     void set(const uint32_t ix, const dtype_t value)
     {
         const uint32_t workgroupOffset = nbl::hlsl::glsl::gl_WorkGroupID().x * Config::VirtualWorkgroupSize;
-        output[Binop::BindingIndex].template Store<type_t>(sizeof(uint32_t) + sizeof(type_t) * (workgroupOffset+ix), value);
+        uint64_t outputBufAddr = vk::RawBufferLoad<uint64_t>(pc.outputAddressBufAddress + Binop::BindingIndex * sizeof(uint64_t));
+        [unroll]
+        for (uint32_t i = 0; i < Config::ItemsPerInvocation_0; i++)
+            vk::RawBufferStore<uint32_t>(outputBufAddr+sizeof(uint32_t)+sizeof(dtype_t)*(workgroupOffset+ix)+i*sizeof(uint32_t), value[i]);
+        // vk::RawBufferStore<dtype_t>(outputBufAddr + sizeof(uint32_t) + sizeof(dtype_t) * (workgroupOffset+ix), value, sizeof(uint32_t)); TODO why won't this work???
     }
 
     void workgroupExecutionAndMemoryBarrier()
@@ -49,8 +53,9 @@ struct operation_t
 template<template<class> class binop, typename T, uint32_t N>
 static void subtest(NBL_CONST_REF_ARG(type_t) sourceVal)
 {
+    uint64_t outputBufAddr = vk::RawBufferLoad<uint64_t>(pc.outputAddressBufAddress + binop<T>::BindingIndex * sizeof(uint64_t));
     if (globalIndex()==0u)
-        output[binop<T>::BindingIndex].template Store<uint32_t>(0,nbl::hlsl::glsl::gl_SubgroupSize());
+        vk::RawBufferStore<uint32_t>(outputBufAddr, nbl::hlsl::glsl::gl_SubgroupSize());
 
     operation_t<binop<T>,nbl::hlsl::jit::device_capabilities> func;
     func(); // store is done with data accessor now
@@ -59,7 +64,7 @@ static void subtest(NBL_CONST_REF_ARG(type_t) sourceVal)
 
 type_t test()
 {
-    const type_t sourceVal = inputValue[globalIndex()];
+    type_t sourceVal = vk::RawBufferLoad<type_t>(pc.inputBufAddress + globalIndex() * sizeof(type_t));
 
     subtest<bit_and, uint32_t, ITEMS_PER_INVOCATION>(sourceVal);
     subtest<bit_xor, uint32_t, ITEMS_PER_INVOCATION>(sourceVal);
diff --git a/23_Arithmetic2UnitTest/app_resources/workgroupCommon.hlsl b/23_Arithmetic2UnitTest/app_resources/workgroupCommon.hlsl
index b0ccbf295..c02d86969 100644
--- a/23_Arithmetic2UnitTest/app_resources/workgroupCommon.hlsl
+++ b/23_Arithmetic2UnitTest/app_resources/workgroupCommon.hlsl
@@ -25,10 +25,13 @@ using config_t = nbl::hlsl::workgroup2::ArithmeticConfiguration<WORKGROUP_SIZE_L
 
 typedef vector<uint32_t, config_t::ItemsPerInvocation_0> type_t;
 
-// unfortunately DXC chokes on descriptors as static members
-// https://github.com/microsoft/DirectXShaderCompiler/issues/5940
-[[vk::binding(0, 0)]] StructuredBuffer<type_t> inputValue;
-[[vk::binding(1, 0)]] RWByteAddressBuffer output[8];
+struct PushConstantData
+{
+    uint64_t inputBufAddress;
+    uint64_t outputAddressBufAddress;
+};
+
+[[vk::push_constant]] PushConstantData pc;
 
 // because subgroups don't match `gl_LocalInvocationIndex` snake curve addressing, we also can't load inputs that way
 uint32_t globalIndex();
diff --git a/23_Arithmetic2UnitTest/main.cpp b/23_Arithmetic2UnitTest/main.cpp
index e7dfcefa1..282473d12 100644
--- a/23_Arithmetic2UnitTest/main.cpp
+++ b/23_Arithmetic2UnitTest/main.cpp
@@ -45,6 +45,12 @@ struct emulatedScanExclusive
 	static inline constexpr const char* name = "exclusive_scan";
 };
 
+struct PushConstantData
+{
+	uint64_t inputBufAddress;
+	uint64_t outputAddressBufAddress;
+};
+
 class Workgroup2ScanTestApp final : public application_templates::BasicMultiQueueApplication, public application_templates::MonoAssetManagerAndBuiltinResourceApplication
 {
 	using device_base_t = application_templates::BasicMultiQueueApplication;
@@ -76,7 +82,7 @@ class Workgroup2ScanTestApp final : public application_templates::BasicMultiQueu
 
 			IGPUBuffer::SCreationParams inputDataBufferCreationParams = {};
 			inputDataBufferCreationParams.size = sizeof(Output<>::data[0]) * elementCount;
-			inputDataBufferCreationParams.usage = IGPUBuffer::EUF_STORAGE_BUFFER_BIT | IGPUBuffer::EUF_TRANSFER_DST_BIT;
+			inputDataBufferCreationParams.usage = IGPUBuffer::EUF_STORAGE_BUFFER_BIT | IGPUBuffer::EUF_TRANSFER_DST_BIT | IGPUBuffer::EUF_SHADER_DEVICE_ADDRESS_BIT;
 			m_utils->createFilledDeviceLocalBufferOnDedMem(
 				SIntendedSubmitInfo{.queue=getTransferUpQueue()},
 				std::move(inputDataBufferCreationParams),
@@ -89,53 +95,37 @@ class Workgroup2ScanTestApp final : public application_templates::BasicMultiQueu
 		{
 			IGPUBuffer::SCreationParams params = {};
 			params.size = sizeof(uint32_t) + gpuinputDataBuffer->getSize();
-			params.usage = bitflag(IGPUBuffer::EUF_STORAGE_BUFFER_BIT) | IGPUBuffer::EUF_TRANSFER_SRC_BIT;
+			params.usage = bitflag(IGPUBuffer::EUF_STORAGE_BUFFER_BIT) | IGPUBuffer::EUF_TRANSFER_SRC_BIT | IGPUBuffer::EUF_SHADER_DEVICE_ADDRESS_BIT;
 
 			outputBuffers[i] = m_device->createBuffer(std::move(params));
 			auto mreq = outputBuffers[i]->getMemoryReqs();
 			mreq.memoryTypeBits &= m_physicalDevice->getDeviceLocalMemoryTypeBits();
 			assert(mreq.memoryTypeBits);
 
-			auto bufferMem = m_device->allocate(mreq, outputBuffers[i].get());
+			auto bufferMem = m_device->allocate(mreq, outputBuffers[i].get(), IDeviceMemoryAllocation::EMAF_DEVICE_ADDRESS_BIT);
 			assert(bufferMem.isValid());
 		}
 
-		// create Descriptor Set and Pipeline Layout
+		// create buffer to store BDA of output buffers
+		smart_refctd_ptr<IGPUBuffer> gpuOutputAddressesBuffer;
 		{
-			// create Descriptor Set Layout
-			smart_refctd_ptr<IGPUDescriptorSetLayout> dsLayout;
-			{
-				IGPUDescriptorSetLayout::SBinding binding[2];
-				for (uint32_t i = 0u; i < 2; i++)
-					binding[i] = {{},i,IDescriptor::E_TYPE::ET_STORAGE_BUFFER,IGPUDescriptorSetLayout::SBinding::E_CREATE_FLAGS::ECF_NONE,IShader::E_SHADER_STAGE::ESS_COMPUTE,1u,nullptr };
-				binding[1].count = OutputBufferCount;
-				dsLayout = m_device->createDescriptorSetLayout(binding);
-			}
-
-			// set and transient pool
-			auto descPool = m_device->createDescriptorPoolForDSLayouts(IDescriptorPool::ECF_NONE,{&dsLayout.get(),1});
-			descriptorSet = descPool->createDescriptorSet(smart_refctd_ptr(dsLayout));
-			{
-				IGPUDescriptorSet::SDescriptorInfo infos[1+OutputBufferCount];
-				infos[0].desc = gpuinputDataBuffer;
-				infos[0].info.buffer = { 0u,gpuinputDataBuffer->getSize() };
-				for (uint32_t i = 1u; i <= OutputBufferCount; i++)
-				{
-					auto buff = outputBuffers[i - 1];
-					infos[i].info.buffer = { 0u,buff->getSize() };
-					infos[i].desc = std::move(buff); // save an atomic in the refcount
-
-				}
-
-				IGPUDescriptorSet::SWriteDescriptorSet writes[2];
-				for (uint32_t i=0u; i<2; i++)
-					writes[i] = {descriptorSet.get(),i,0u,1u,infos+i};
-				writes[1].count = OutputBufferCount;
+			std::array<uint64_t, OutputBufferCount> outputAddresses;
+			for (uint32_t i = 0; i < OutputBufferCount; i++)
+				outputAddresses[i] = outputBuffers[i]->getDeviceAddress();
+
+			IGPUBuffer::SCreationParams params;
+			params.usage = IGPUBuffer::EUF_STORAGE_BUFFER_BIT | IGPUBuffer::EUF_TRANSFER_DST_BIT | IGPUBuffer::EUF_INLINE_UPDATE_VIA_CMDBUF | IGPUBuffer::EUF_SHADER_DEVICE_ADDRESS_BIT;
+			params.size = OutputBufferCount * sizeof(uint64_t);
+			m_utils->createFilledDeviceLocalBufferOnDedMem(SIntendedSubmitInfo{ .queue = getTransferUpQueue() }, std::move(params), outputAddresses.data()).move_into(gpuOutputAddressesBuffer);
+		}
+		pc.inputBufAddress = gpuinputDataBuffer->getDeviceAddress();
+		pc.outputAddressBufAddress = gpuOutputAddressesBuffer->getDeviceAddress();
 
-				m_device->updateDescriptorSets(2, writes, 0u, nullptr);
-			}
+		// create Pipeline Layout
+		{
+			SPushConstantRange pcRange = { .stageFlags = IShader::E_SHADER_STAGE::ESS_COMPUTE, .offset = 0,.size = sizeof(PushConstantData) };
 
-			pipelineLayout = m_device->createPipelineLayout({},std::move(dsLayout));
+			pipelineLayout = m_device->createPipelineLayout({&pcRange, 1});
 		}
 
 		// load shader source from file
@@ -333,7 +323,7 @@ class Workgroup2ScanTestApp final : public application_templates::BasicMultiQueu
 		}	
 		cmdbuf->begin(IGPUCommandBuffer::USAGE::NONE);
 		cmdbuf->bindComputePipeline(pipeline.get());
-		cmdbuf->bindDescriptorSets(EPBP_COMPUTE, pipeline->getLayout(), 0u, 1u, &descriptorSet.get());
+		cmdbuf->pushConstants(pipelineLayout.get(), IShader::E_SHADER_STAGE::ESS_COMPUTE, 0, sizeof(PushConstantData), &pc);
 		cmdbuf->dispatch(workgroupCount, 1, 1);
 		{
 			IGPUCommandBuffer::SPipelineBarrierDependencyInfo::buffer_barrier_t memoryBarrier[OutputBufferCount];
@@ -467,8 +457,8 @@ class Workgroup2ScanTestApp final : public application_templates::BasicMultiQueu
 	uint32_t* inputData = nullptr;
 	constexpr static inline uint32_t OutputBufferCount = 8u;
 	smart_refctd_ptr<IGPUBuffer> outputBuffers[OutputBufferCount];
-	smart_refctd_ptr<IGPUDescriptorSet> descriptorSet;
 	smart_refctd_ptr<IGPUPipelineLayout> pipelineLayout;
+	PushConstantData pc;
 
 	smart_refctd_ptr<ISemaphore> sema;
 	uint64_t timelineValue = 0;

From f4af3edc1cd8d152f6c67bd15577b2595cb2a43f Mon Sep 17 00:00:00 2001
From: keptsecret <sorchon@gmail.com>
Date: Tue, 20 May 2025 12:05:15 +0700
Subject: [PATCH 266/529] benchmarks use bda

---
 .../app_resources/benchmarkSubgroup.comp.hlsl |  8 ++-
 .../benchmarkWorkgroup.comp.hlsl              |  6 +-
 .../app_resources/shaderCommon.hlsl           | 22 ++++---
 .../app_resources/workgroupCommon.hlsl        | 11 ++--
 29_Arithmetic2Bench/main.cpp                  | 59 ++++++++++---------
 5 files changed, 63 insertions(+), 43 deletions(-)

diff --git a/29_Arithmetic2Bench/app_resources/benchmarkSubgroup.comp.hlsl b/29_Arithmetic2Bench/app_resources/benchmarkSubgroup.comp.hlsl
index 2f575d39a..e21d67fcb 100644
--- a/29_Arithmetic2Bench/app_resources/benchmarkSubgroup.comp.hlsl
+++ b/29_Arithmetic2Bench/app_resources/benchmarkSubgroup.comp.hlsl
@@ -25,18 +25,22 @@ static void subbench(NBL_CONST_REF_ARG(type_t) sourceVal)
     using params_t = nbl::hlsl::subgroup2::ArithmeticParams<config_t, typename binop<T>::base_t, N, nbl::hlsl::jit::device_capabilities>;
     type_t value = sourceVal;
 
+    const uint64_t outputBufAddr = vk::RawBufferLoad<uint64_t>(pc.outputAddressBufAddress + binop<T>::BindingIndex * sizeof(uint64_t), sizeof(uint64_t));
+
     operation_t<params_t> func;
     // [unroll]
     for (uint32_t i = 0; i < NUM_LOOPS; i++)
         value = func(value);
 
-    output[binop<T>::BindingIndex].template Store<type_t>(sizeof(uint32_t) + sizeof(type_t) * globalIndex(), value);
+    [unroll]
+    for (uint32_t i = 0; i < N; i++)
+        vk::RawBufferStore<uint32_t>(outputBufAddr+sizeof(uint32_t)+sizeof(type_t)*globalIndex()+i*sizeof(uint32_t), val[i]);
 }
 
 void benchmark()
 {
     const uint32_t idx = globalIndex();
-    type_t sourceVal = inputValue[idx];
+    type_t sourceVal = vk::RawBufferLoad<type_t>(pc.inputBufAddress + idx * sizeof(type_t));
 
     subbench<bit_and, uint32_t, ITEMS_PER_INVOCATION>(sourceVal);
     subbench<bit_xor, uint32_t, ITEMS_PER_INVOCATION>(sourceVal);
diff --git a/29_Arithmetic2Bench/app_resources/benchmarkWorkgroup.comp.hlsl b/29_Arithmetic2Bench/app_resources/benchmarkWorkgroup.comp.hlsl
index 6e32bedbd..0194b2f75 100644
--- a/29_Arithmetic2Bench/app_resources/benchmarkWorkgroup.comp.hlsl
+++ b/29_Arithmetic2Bench/app_resources/benchmarkWorkgroup.comp.hlsl
@@ -55,8 +55,10 @@ struct operation_t
 template<template<class> class binop, typename T, uint32_t N>
 static void subbench(NBL_CONST_REF_ARG(type_t) sourceVal)
 {
+    const uint64_t outputBufAddr = vk::RawBufferLoad<uint64_t>(pc.outputAddressBufAddress + binop<T>::BindingIndex * sizeof(uint64_t), sizeof(uint64_t));
+
     if (globalIndex()==0u)
-        output[binop<T>::BindingIndex].template Store<uint32_t>(0,nbl::hlsl::glsl::gl_SubgroupSize());
+        vk::RawBufferStore<uint32_t>(outputBufAddr, nbl::hlsl::glsl::gl_SubgroupSize());
 
     operation_t<binop<T>,nbl::hlsl::jit::device_capabilities> func;
     // TODO separate out store/load from DataProxy? so we don't do too many RW in benchmark
@@ -67,7 +69,7 @@ static void subbench(NBL_CONST_REF_ARG(type_t) sourceVal)
 
 type_t benchmark()
 {
-    const type_t sourceVal = inputValue[globalIndex()];
+    const type_t sourceVal = vk::RawBufferLoad<type_t>(pc.inputBufAddress + globalIndex() * sizeof(type_t));
 
     subbench<bit_and, uint32_t, ITEMS_PER_INVOCATION>(sourceVal);
     subbench<bit_xor, uint32_t, ITEMS_PER_INVOCATION>(sourceVal);
diff --git a/29_Arithmetic2Bench/app_resources/shaderCommon.hlsl b/29_Arithmetic2Bench/app_resources/shaderCommon.hlsl
index 376f69579..ae0f61f33 100644
--- a/29_Arithmetic2Bench/app_resources/shaderCommon.hlsl
+++ b/29_Arithmetic2Bench/app_resources/shaderCommon.hlsl
@@ -16,10 +16,13 @@ uint32_t3 nbl::hlsl::glsl::gl_WorkGroupSize() {return uint32_t3(WORKGROUP_SIZE,1
 
 typedef vector<uint32_t, ITEMS_PER_INVOCATION> type_t;
 
-// unfortunately DXC chokes on descriptors as static members
-// https://github.com/microsoft/DirectXShaderCompiler/issues/5940
-[[vk::binding(0, 0)]] StructuredBuffer<type_t> inputValue;
-[[vk::binding(1, 0)]] RWByteAddressBuffer output[8];
+struct PushConstantData
+{
+    uint64_t inputBufAddress;
+    uint64_t outputAddressBufAddress;
+};
+
+[[vk::push_constant]] PushConstantData pc;
 
 // because subgroups don't match `gl_LocalInvocationIndex` snake curve addressing, we also can't load inputs that way
 uint32_t globalIndex();
@@ -41,19 +44,24 @@ static void subtest(NBL_CONST_REF_ARG(type_t) sourceVal)
     using config_t = nbl::hlsl::subgroup2::Configuration<SUBGROUP_SIZE_LOG2>;
     using params_t = nbl::hlsl::subgroup2::ArithmeticParams<config_t, typename binop<T>::base_t, N, nbl::hlsl::jit::device_capabilities>;
 
+    const uint64_t outputBufAddr = vk::RawBufferLoad<uint64_t>(pc.outputAddressBufAddress + binop<T>::BindingIndex * sizeof(uint64_t), sizeof(uint64_t));
+
     if (globalIndex()==0u)
-        output[binop<T>::BindingIndex].template Store<uint32_t>(0,nbl::hlsl::glsl::gl_SubgroupSize());
+        vk::RawBufferStore<uint32_t>(outputBufAddr, nbl::hlsl::glsl::gl_SubgroupSize());
         
     operation_t<params_t> func;
     if (canStore())
-        output[binop<T>::BindingIndex].template Store<type_t>(sizeof(uint32_t)+sizeof(type_t)*globalIndex(),func(sourceVal));
+        [unroll]
+        for (uint32_t i = 0; i < N; i++)
+            vk::RawBufferStore<uint32_t>(outputBufAddr+sizeof(uint32_t)+sizeof(type_t)*globalIndex()+i*sizeof(uint32_t), val[i]);
+        // vk::RawBufferStore<dtype_t>(outputBufAddr + sizeof(uint32_t) + sizeof(dtype_t) * globalIndex(), value, sizeof(uint32_t)); TODO why won't this work???
 }
 
 
 type_t test()
 {
     const uint32_t idx = globalIndex();
-    type_t sourceVal = inputValue[idx];
+    type_t sourceVal = vk::RawBufferLoad<type_t>(pc.inputBufAddress + idx * sizeof(type_t));
 
     subtest<bit_and, uint32_t, ITEMS_PER_INVOCATION>(sourceVal);
     subtest<bit_xor, uint32_t, ITEMS_PER_INVOCATION>(sourceVal);
diff --git a/29_Arithmetic2Bench/app_resources/workgroupCommon.hlsl b/29_Arithmetic2Bench/app_resources/workgroupCommon.hlsl
index b0ccbf295..c02d86969 100644
--- a/29_Arithmetic2Bench/app_resources/workgroupCommon.hlsl
+++ b/29_Arithmetic2Bench/app_resources/workgroupCommon.hlsl
@@ -25,10 +25,13 @@ using config_t = nbl::hlsl::workgroup2::ArithmeticConfiguration<WORKGROUP_SIZE_L
 
 typedef vector<uint32_t, config_t::ItemsPerInvocation_0> type_t;
 
-// unfortunately DXC chokes on descriptors as static members
-// https://github.com/microsoft/DirectXShaderCompiler/issues/5940
-[[vk::binding(0, 0)]] StructuredBuffer<type_t> inputValue;
-[[vk::binding(1, 0)]] RWByteAddressBuffer output[8];
+struct PushConstantData
+{
+    uint64_t inputBufAddress;
+    uint64_t outputAddressBufAddress;
+};
+
+[[vk::push_constant]] PushConstantData pc;
 
 // because subgroups don't match `gl_LocalInvocationIndex` snake curve addressing, we also can't load inputs that way
 uint32_t globalIndex();
diff --git a/29_Arithmetic2Bench/main.cpp b/29_Arithmetic2Bench/main.cpp
index bf20d5faa..0772997dc 100644
--- a/29_Arithmetic2Bench/main.cpp
+++ b/29_Arithmetic2Bench/main.cpp
@@ -47,6 +47,12 @@ struct emulatedScanExclusive
 	static inline constexpr const char* name = "exclusive_scan";
 };
 
+struct PushConstantData
+{
+	uint64_t inputBufAddress;
+	uint64_t outputAddressBufAddress;
+};
+
 // NOTE added swapchain + drawing frames to be able to profile with Nsight, which still doesn't support profiling headless compute shaders
 class ArithmeticBenchApp final : public examples::SimpleWindowedApplication, public application_templates::MonoAssetManagerAndBuiltinResourceApplication
 {
@@ -130,7 +136,6 @@ class ArithmeticBenchApp final : public examples::SimpleWindowedApplication, pub
 		const uint32_t elementCount = Output<>::ScanElementCount;
 		// populate our random data buffer on the CPU and create a GPU copy
 		inputData = new uint32_t[elementCount];
-		smart_refctd_ptr<IGPUBuffer> gpuinputDataBuffer;
 		{
 			std::mt19937 randGenerator(0xdeadbeefu);
 			for (uint32_t i = 0u; i < elementCount; i++)
@@ -138,7 +143,7 @@ class ArithmeticBenchApp final : public examples::SimpleWindowedApplication, pub
 
 			IGPUBuffer::SCreationParams inputDataBufferCreationParams = {};
 			inputDataBufferCreationParams.size = sizeof(Output<>::data[0]) * elementCount;
-			inputDataBufferCreationParams.usage = IGPUBuffer::EUF_STORAGE_BUFFER_BIT | IGPUBuffer::EUF_TRANSFER_DST_BIT;
+			inputDataBufferCreationParams.usage = IGPUBuffer::EUF_STORAGE_BUFFER_BIT | IGPUBuffer::EUF_TRANSFER_DST_BIT | IGPUBuffer::EUF_SHADER_DEVICE_ADDRESS_BIT;
 			m_utils->createFilledDeviceLocalBufferOnDedMem(
 				SIntendedSubmitInfo{.queue=getTransferUpQueue()},
 				std::move(inputDataBufferCreationParams),
@@ -151,17 +156,31 @@ class ArithmeticBenchApp final : public examples::SimpleWindowedApplication, pub
 		{
 			IGPUBuffer::SCreationParams params = {};
 			params.size = sizeof(uint32_t) + gpuinputDataBuffer->getSize();
-			params.usage = bitflag(IGPUBuffer::EUF_STORAGE_BUFFER_BIT) | IGPUBuffer::EUF_TRANSFER_SRC_BIT;
+			params.usage = bitflag(IGPUBuffer::EUF_STORAGE_BUFFER_BIT) | IGPUBuffer::EUF_TRANSFER_SRC_BIT | IGPUBuffer::EUF_SHADER_DEVICE_ADDRESS_BIT;
 
 			outputBuffers[i] = m_device->createBuffer(std::move(params));
 			auto mreq = outputBuffers[i]->getMemoryReqs();
 			mreq.memoryTypeBits &= m_physicalDevice->getDeviceLocalMemoryTypeBits();
 			assert(mreq.memoryTypeBits);
 
-			auto bufferMem = m_device->allocate(mreq, outputBuffers[i].get());
+			auto bufferMem = m_device->allocate(mreq, outputBuffers[i].get(), IDeviceMemoryAllocation::EMAF_DEVICE_ADDRESS_BIT);
 			assert(bufferMem.isValid());
 		}
 
+		// create buffer to store BDA of output buffers
+		{
+			std::array<uint64_t, OutputBufferCount> outputAddresses;
+			for (uint32_t i = 0; i < OutputBufferCount; i++)
+				outputAddresses[i] = outputBuffers[i]->getDeviceAddress();
+
+			IGPUBuffer::SCreationParams params;
+			params.usage = IGPUBuffer::EUF_STORAGE_BUFFER_BIT | IGPUBuffer::EUF_TRANSFER_DST_BIT | IGPUBuffer::EUF_INLINE_UPDATE_VIA_CMDBUF | IGPUBuffer::EUF_SHADER_DEVICE_ADDRESS_BIT;
+			params.size = OutputBufferCount * sizeof(uint64_t);
+			m_utils->createFilledDeviceLocalBufferOnDedMem(SIntendedSubmitInfo{ .queue = getTransferUpQueue() }, std::move(params), outputAddresses.data()).move_into(gpuOutputAddressesBuffer);
+		}
+		pc.inputBufAddress = gpuinputDataBuffer->getDeviceAddress();
+		pc.outputAddressBufAddress = gpuOutputAddressesBuffer->getDeviceAddress();
+
 		// create dummy image
 		dummyImg = m_device->createImage({
 				{
@@ -194,36 +213,16 @@ class ArithmeticBenchApp final : public examples::SimpleWindowedApplication, pub
 			// set and transient pool
 			smart_refctd_ptr<IGPUDescriptorSetLayout> benchLayout;
 			{
-				IGPUDescriptorSetLayout::SBinding binding[3];
-				for (uint32_t i = 0u; i < 2; i++)
-					binding[i] = { {},i,IDescriptor::E_TYPE::ET_STORAGE_BUFFER,IGPUDescriptorSetLayout::SBinding::E_CREATE_FLAGS::ECF_NONE,IShader::E_SHADER_STAGE::ESS_COMPUTE,1u,nullptr };
-				binding[1].count = OutputBufferCount;
-				binding[2] = { {},2,IDescriptor::E_TYPE::ET_STORAGE_IMAGE,IGPUDescriptorSetLayout::SBinding::E_CREATE_FLAGS::ECF_UPDATE_AFTER_BIND_BIT,IShader::E_SHADER_STAGE::ESS_COMPUTE,1u,nullptr };
+				IGPUDescriptorSetLayout::SBinding binding[1];
+				binding[0] = { {},2,IDescriptor::E_TYPE::ET_STORAGE_IMAGE,IGPUDescriptorSetLayout::SBinding::E_CREATE_FLAGS::ECF_UPDATE_AFTER_BIND_BIT,IShader::E_SHADER_STAGE::ESS_COMPUTE,1u,nullptr };
 				benchLayout = m_device->createDescriptorSetLayout(binding);
 			}
 
 			benchPool = m_device->createDescriptorPoolForDSLayouts(IDescriptorPool::ECF_UPDATE_AFTER_BIND_BIT, { &benchLayout.get(),1 });
 			benchDs = benchPool->createDescriptorSet(smart_refctd_ptr(benchLayout));
-			{
-				IGPUDescriptorSet::SDescriptorInfo infos[1 + OutputBufferCount];
-				infos[0].desc = gpuinputDataBuffer;
-				infos[0].info.buffer = { 0u,gpuinputDataBuffer->getSize() };
-				for (uint32_t i = 1u; i <= OutputBufferCount; i++)
-				{
-					auto buff = outputBuffers[i - 1];
-					infos[i].info.buffer = { 0u,buff->getSize() };
-					infos[i].desc = std::move(buff); // save an atomic in the refcount
-				}
-				// write swapchain image descriptor in loop
 
-				IGPUDescriptorSet::SWriteDescriptorSet writes[2];
-				for (uint32_t i = 0u; i < 2; i++)
-					writes[i] = { benchDs.get(),i,0u,1u,infos + i };
-				writes[1].count = OutputBufferCount;
-
-				m_device->updateDescriptorSets(2, writes, 0u, nullptr);
-			}
-			benchPplnLayout = m_device->createPipelineLayout({}, std::move(benchLayout));
+			SPushConstantRange pcRange = { .stageFlags = IShader::E_SHADER_STAGE::ESS_COMPUTE, .offset = 0,.size = sizeof(PushConstantData) };
+			benchPplnLayout = m_device->createPipelineLayout({ &pcRange, 1 }, std::move(benchLayout));
 		}
 
 		// load shader source from file
@@ -370,6 +369,7 @@ class ArithmeticBenchApp final : public examples::SimpleWindowedApplication, pub
 		const auto SubgroupSizeLog2 = hlsl::findMSB(MinSubgroupSize);
 
 		cmdbuf->bindDescriptorSets(EPBP_COMPUTE, benchSets[0].pipeline->getLayout(), 0u, 1u, &benchDs.get());
+		cmdbuf->pushConstants(benchSets[0].pipeline->getLayout(), IShader::E_SHADER_STAGE::ESS_COMPUTE, 0, sizeof(PushConstantData), &pc);
 
 		for (uint32_t i = 0; i < benchSets.size(); i++)
 			runBenchmark<DoWorkgroupBenchmarks>(cmdbuf, benchSets[i], elementCount, SubgroupSizeLog2);
@@ -722,8 +722,11 @@ class ArithmeticBenchApp final : public examples::SimpleWindowedApplication, pub
 	smart_refctd_ptr<IGPUDescriptorSet> benchDs;
 
 	uint32_t* inputData = nullptr;
+	smart_refctd_ptr<IGPUBuffer> gpuinputDataBuffer;
 	constexpr static inline uint32_t OutputBufferCount = 8u;
 	smart_refctd_ptr<IGPUBuffer> outputBuffers[OutputBufferCount];
+	smart_refctd_ptr<IGPUBuffer> gpuOutputAddressesBuffer;
+	PushConstantData pc;
 
 	smart_refctd_ptr<ISemaphore> sema;
 	uint64_t timelineValue = 0;

From e2fef3a8627437c307083651d67b40819d71d7d8 Mon Sep 17 00:00:00 2001
From: Erfan Ahmadi <ahmadierfan99@gmail.com>
Date: Tue, 20 May 2025 12:08:04 +0400
Subject: [PATCH 267/529] start Georeferenced Images work and refactoring
 similar functionality with Static Images

---
 62_CAD/DrawResourcesFiller.cpp | 474 +++++++++++++++++++++------------
 62_CAD/DrawResourcesFiller.h   |  49 ++++
 62_CAD/Images.h                |  24 +-
 3 files changed, 378 insertions(+), 169 deletions(-)

diff --git a/62_CAD/DrawResourcesFiller.cpp b/62_CAD/DrawResourcesFiller.cpp
index f50e8f317..b0b3306ae 100644
--- a/62_CAD/DrawResourcesFiller.cpp
+++ b/62_CAD/DrawResourcesFiller.cpp
@@ -362,65 +362,22 @@ void DrawResourcesFiller::drawFontGlyph(
 
 uint32_t DrawResourcesFiller::addStaticImage2D(image_id imageID, const core::smart_refctd_ptr<ICPUImage>& cpuImage, SIntendedSubmitInfo& intendedNextSubmit)
 {
-	/*
-	 * The `suballocatedDescriptorSet` manages indices (slots) into a array of textures binding.
-	 * This callback is invoked on eviction, and must:
-	 *   - Ensure safe deallocation of the slot.
-	 *   - Submit any pending draw calls if the evicted image was scheduled to be used in the upcoming submission.
-	 */
-	auto evictionCallback = [&](image_id imageID, const ImageReference& evicted)
-	{
-		// Later used to release the image's memory range.
-		core::smart_refctd_ptr<ImageCleanup> cleanupObject = core::make_smart_refctd_ptr<ImageCleanup>();
-		cleanupObject->imagesMemorySuballocator = imagesMemorySubAllocator;
-		cleanupObject->addr = evicted.allocationOffset;
-		cleanupObject->size = evicted.allocationSize;
-
-		const bool imageUsedForNextIntendedSubmit = (evicted.lastUsedFrameIndex == currentFrameIndex);
-		
-		// NOTE: `deallocationWaitInfo` is crucial for both paths, we need to make sure we'll write to a descriptor arrayIndex when it's 100% done with previous usages.
-		if (imageUsedForNextIntendedSubmit)
-		{
-			// The evicted image is scheduled for use in the upcoming submit.
-			// To avoid rendering artifacts, we must flush the current draw queue now.
-			// After submission, we reset state so that data referencing the evicted slot can be re-uploaded.
-			submitDraws(intendedNextSubmit);
-			reset(); // resets everything, things referenced through mainObj and other shit will be pushed again through acquireXXX_SubmitIfNeeded
-			
-			// Prepare wait info to defer index deallocation until the GPU has finished using the resource.
-			// we wait on the signal semaphore for the submit we just did above.
-			ISemaphore::SWaitInfo deallocationWaitInfo = { .semaphore = intendedNextSubmit.scratchSemaphore.semaphore, .value = intendedNextSubmit.scratchSemaphore.value };
-			suballocatedDescriptorSet->multi_deallocate(imagesArrayBinding, 1u, &evicted.index, deallocationWaitInfo, &cleanupObject.get());
-		} 
-		else
-		{
-			// The image is not used in the current frame, so we can deallocate without submitting any draws.
-			// Still wait on the semaphore to ensure past GPU usage is complete.
-			// TODO: We don't know which semaphore value the frame with `evicted.lastUsedFrameIndex` index was submitted with, so we wait for the worst case value which is the immediate prev submit.
-			ISemaphore::SWaitInfo deallocationWaitInfo = { .semaphore = intendedNextSubmit.scratchSemaphore.semaphore, .value = intendedNextSubmit.scratchSemaphore.value };
-			suballocatedDescriptorSet->multi_deallocate(imagesArrayBinding, 1u, &evicted.index, deallocationWaitInfo, &cleanupObject.get());
-		}
-
-		// erase imageID from our state map
-		// kindof mirrors the state of the LRUCache
-		staticImagesState.erase(imageID);
-	};
-
 	// Try inserting or updating the image usage in the cache.
 	// If the image is already present, updates its semaphore value.
-	ImageReference* inserted = imagesUsageCache->insert(imageID, intendedNextSubmit.getFutureScratchSemaphore().value, evictionCallback);
+	auto evictCallback = [&](image_id imageID, const ImageReference& evicted) { evictImage_SubmitIfNeeded(imageID, evicted, intendedNextSubmit); };
+	ImageReference* inserted = imagesUsageCache->insert(imageID, intendedNextSubmit.getFutureScratchSemaphore().value, evictCallback);
 	inserted->lastUsedFrameIndex = currentFrameIndex; // in case there was an eviction + auto-submit, we need to update AGAIN
 
 	// if inserted->index was not InvalidTextureIndex then it means we had a cache hit and updated the value of our sema
 	// in which case we don't queue anything for upload, and return the idx
-	if (inserted->index == InvalidTextureIndex)
+	if (inserted->arrayIndex == InvalidTextureIndex)
 	{
 		// This is a new image (cache miss). Allocate a descriptor index for it.
-		inserted->index = video::SubAllocatedDescriptorSet::AddressAllocator::invalid_address;
+		inserted->arrayIndex = video::SubAllocatedDescriptorSet::AddressAllocator::invalid_address;
 		// Blocking allocation attempt; if the descriptor pool is exhausted, this may stall.
-		suballocatedDescriptorSet->multi_allocate(std::chrono::time_point<std::chrono::steady_clock>::max(), imagesArrayBinding, 1u, &inserted->index); // if the prev submit causes DEVICE_LOST then we'll get a deadlock here since we're using max timepoint
+		suballocatedDescriptorSet->multi_allocate(std::chrono::time_point<std::chrono::steady_clock>::max(), imagesArrayBinding, 1u, &inserted->arrayIndex); // if the prev submit causes DEVICE_LOST then we'll get a deadlock here since we're using max timepoint
 
-		if (inserted->index != video::SubAllocatedDescriptorSet::AddressAllocator::invalid_address)
+		if (inserted->arrayIndex != video::SubAllocatedDescriptorSet::AddressAllocator::invalid_address)
 		{
 			auto* device = m_utilities->getLogicalDevice();
 			auto* physDev = m_utilities->getLogicalDevice()->getPhysicalDevice();
@@ -439,116 +396,16 @@ uint32_t DrawResourcesFiller::addStaticImage2D(image_id imageID, const core::sma
 
 			// Attempt to create a GPU image and image view for this texture.
 			core::smart_refctd_ptr<IGPUImageView> gpuImageView = nullptr;
+			ImageAllocateResults allocResults = tryCreateAndAllocateImage_SubmitIfNeeded(imageParams, intendedNextSubmit, std::to_string(imageID));
 
-			// Attempt to create a GPU image and corresponding image view for this texture.
-			// If creation or memory allocation fails (likely due to VRAM exhaustion),
-			// we'll evict another texture from the LRU cache and retry until successful, or until only the currently-inserted image remains.
-			while (imagesUsageCache->size() > 0u)
-			{
-				// Try creating the image and allocating memory for it:
-				auto gpuImage = device->createImage(std::move(imageParams));
-				
-				if (gpuImage)
-				{
-					nbl::video::IDeviceMemoryBacked::SDeviceMemoryRequirements gpuImageMemoryRequirements = gpuImage->getMemoryReqs();
-					uint32_t actualAlignment = 1u << gpuImageMemoryRequirements.alignmentLog2;
-					const bool imageMemoryRequirementsMatch = 
-						(physDev->getDeviceLocalMemoryTypeBits() & gpuImageMemoryRequirements.memoryTypeBits) != 0 && // should have device local memory compatible
-						(gpuImageMemoryRequirements.requiresDedicatedAllocation == false) && // should not require dedicated allocation
-						((ImagesMemorySubAllocator::MaxMemoryAlignment % actualAlignment) == 0u); // should be consistent with our suballocator's max alignment
-
-					if (imageMemoryRequirementsMatch)
-					{
-						inserted->allocationOffset = imagesMemorySubAllocator->allocate(gpuImageMemoryRequirements.size, 1u << gpuImageMemoryRequirements.alignmentLog2);
-						const bool allocationFromImagesMemoryArenaSuccessfull = inserted->allocationOffset != ImagesMemorySubAllocator::InvalidAddress;
-						if (allocationFromImagesMemoryArenaSuccessfull)
-						{
-							inserted->allocationSize = gpuImageMemoryRequirements.size;
-							nbl::video::ILogicalDevice::SBindImageMemoryInfo bindImageMemoryInfo =
-							{
-								.image = gpuImage.get(),
-								.binding = {.memory = imagesMemoryArena.memory.get(), .offset = imagesMemoryArena.offset + inserted->allocationOffset }
-							};
-							const bool boundToMemorySuccessfully = device->bindImageMemory({ &bindImageMemoryInfo, 1u });
-							if (boundToMemorySuccessfully)
-							{
-								gpuImage->setObjectDebugName((std::to_string(imageID) + " Static Image 2D").c_str());
-								IGPUImageView::SCreationParams viewParams = {
-									.image = gpuImage,
-									.viewType = IGPUImageView::ET_2D,
-									.format = gpuImage->getCreationParameters().format
-								};
-								gpuImageView = device->createImageView(std::move(viewParams));
-								if (gpuImageView)
-								{
-									// SUCCESS!
-									gpuImageView->setObjectDebugName((std::to_string(imageID) + " Static Image View 2D").c_str());
-								}
-								else
-								{
-									// irrecoverable error if simple image creation fails.
-									// TODO[LOG]: that's rare, image view creation failed.
-									_NBL_DEBUG_BREAK_IF(true);
-								}
-
-								// succcessful with everything, just break and get out of this retry loop
-								break;
-							}
-							else
-							{
-								// irrecoverable error if simple bindImageMemory fails.
-								// TODO: LOG
-								_NBL_DEBUG_BREAK_IF(true);
-								break;
-							}
-						}
-						else
-						{
-							// printf(std::format("Allocation Failed, Trying again, ImageID={} Size={} \n", imageID, gpuImageMemoryRequirements.size).c_str());
-							// recoverable error when allocation fails, we don't log anything, next code will try evicting other images and retry
-						}
-					}
-					else
-					{
-						// irrecoverable error if memory requirements of the image don't match our preallocated devicememory
-						// TODO: LOG
-						_NBL_DEBUG_BREAK_IF(true);
-						break;
-					}
-				}
-				else
-				{
-					// irrecoverable error if simple image creation fails.
-					// TODO: LOG
-					_NBL_DEBUG_BREAK_IF(true);
-					break;
-				}
-
-				// Getting here means we failed creating or allocating the image, evict and retry.
-				if (imagesUsageCache->size() == 1u)
-					{
-						// Nothing else to evict; give up.
-						// We probably have evicted almost every other texture except the one we just allocated an index for
-						_NBL_DEBUG_BREAK_IF(true);
-						break;
-					}
-
-				assert(imagesUsageCache->size() > 1u);
-
-				const image_id evictionCandidate = imagesUsageCache->select_eviction_candidate();
-				ImageReference* imageRef = imagesUsageCache->peek(evictionCandidate);
-				if (imageRef)
-					evictionCallback(evictionCandidate, *imageRef);
-				imagesUsageCache->erase(evictionCandidate);
-				while (suballocatedDescriptorSet->cull_frees()) {}; // to make sure deallocation requests in eviction callback are blocked for.
-
-				// we don't hold any references to the GPUImageView or GPUImage so descriptor binding will be the last reference
-				// hopefully by here the suballocated descriptor set freed some VRAM by dropping the image last ref and it's dedicated allocation.
-			}
-
-			if (gpuImageView)
+			if (allocResults.isValid())
 			{
+				inserted->imageType = ImageType::STATIC;
+				inserted->gpuResident = false;
 				inserted->lastUsedFrameIndex = currentFrameIndex; // there was an eviction + auto-submit, we need to update AGAIN
+				inserted->allocationOffset = allocResults.allocationOffset;
+				inserted->allocationSize = allocResults.allocationSize;
+				inserted->gpuImageView = allocResults.gpuImageView;
 
 				StaticImageState newState =
 				{
@@ -556,15 +413,14 @@ uint32_t DrawResourcesFiller::addStaticImage2D(image_id imageID, const core::sma
 					.gpuImageView = gpuImageView,
 					.allocationOffset = inserted->allocationOffset,
 					.allocationSize = inserted->allocationSize,
-					.arrayIndex = inserted->index,
+					.arrayIndex = inserted->arrayIndex,
 					.gpuResident = false,
 				};
-				// printf(std::format("Everything success, ImageID={} ArrayIndex={} \n", imageID, inserted->index).c_str());
 				staticImagesState.emplace(imageID, newState);
 			}
 			else
 			{
-				// All attempts to create the GPU image and its corresponding view have failed.
+				// All attempts to try create the GPU image and its corresponding view have failed.
 				// Most likely cause: insufficient GPU memory or unsupported image parameters.
 				// TODO: Log a warning or error here � `addStaticImage2D` failed, likely due to low VRAM.
 				_NBL_DEBUG_BREAK_IF(true);
@@ -577,26 +433,155 @@ uint32_t DrawResourcesFiller::addStaticImage2D(image_id imageID, const core::sma
 					imagesMemorySubAllocator->deallocate(inserted->allocationOffset, inserted->allocationSize);
 				}
 
-				if (inserted->index != InvalidTextureIndex)
+				if (inserted->arrayIndex != InvalidTextureIndex)
 				{
 					// We previously allocated a descriptor index, but failed to create a usable GPU image.
 					// It's crucial to deallocate this index to avoid leaks and preserve descriptor pool space.
 					// No semaphore wait needed here, as the GPU never got to use this slot.
-					suballocatedDescriptorSet->multi_deallocate(imagesArrayBinding, 1u, &inserted->index, {});
-					inserted->index = InvalidTextureIndex;
+					suballocatedDescriptorSet->multi_deallocate(imagesArrayBinding, 1u, &inserted->arrayIndex, {});
+					inserted->arrayIndex = InvalidTextureIndex;
 				}
 			}
 		}
 		else
 		{
 			// TODO: log here, index allocation failed.
-			inserted->index = InvalidTextureIndex;
+			inserted->arrayIndex = InvalidTextureIndex;
 		}
 	}
 	
-	assert(inserted->index != InvalidTextureIndex); // shouldn't happen, because we're using LRU cache, so worst case eviction will happen + multi-deallocate and next next multi_allocate should definitely succeed
+	assert(inserted->arrayIndex != InvalidTextureIndex); // shouldn't happen, because we're using LRU cache, so worst case eviction will happen + multi-deallocate and next next multi_allocate should definitely succeed
 
-	return inserted->index;
+	return inserted->arrayIndex;
+}
+
+uint32_t DrawResourcesFiller::retrieveGeoreferencedImage_AllocateIfNeeded(image_id imageID, const GeoreferencedImageParams& params, SIntendedSubmitInfo& intendedNextSubmit)
+{
+	auto* device = m_utilities->getLogicalDevice();
+	auto* physDev = m_utilities->getLogicalDevice()->getPhysicalDevice();
+
+	// Try inserting or updating the image usage in the cache.
+	// If the image is already present, updates its semaphore value.
+	auto evictCallback = [&](image_id imageID, const ImageReference& evicted) { evictImage_SubmitIfNeeded(imageID, evicted, intendedNextSubmit); };
+	ImageReference* inserted = imagesUsageCache->insert(imageID, intendedNextSubmit.getFutureScratchSemaphore().value, evictCallback);
+	inserted->lastUsedFrameIndex = currentFrameIndex; // in case there was an eviction + auto-submit, we need to update AGAIN
+
+	// TODO: Function call that gets you image creaation params based on georeferencedImageParams (extents and mips and whatever), it will also get you the GEOREFERENED TYPE
+	IGPUImage::SCreationParams imageCreationParams = {};
+	ImageType georeferenceImageType = ImageType::GEOREFERENCED_FULL_RESOLUTION;
+
+	assert(georeferenceImageType != ImageType::STATIC);
+
+	// imageParams = cpuImage->getCreationParameters();
+	imageCreationParams.usage |= IGPUImage::EUF_TRANSFER_DST_BIT|IGPUImage::EUF_SAMPLED_BIT;
+	// promote format because RGB8 and friends don't actually exist in HW
+	{
+		const IPhysicalDevice::SImageFormatPromotionRequest request = {
+			.originalFormat = imageCreationParams.format,
+			.usages = IPhysicalDevice::SFormatImageUsages::SUsage(imageCreationParams.usage)
+		};
+		imageCreationParams.format = physDev->promoteImageFormat(request,imageCreationParams.tiling);
+	}
+
+	// if inserted->index was not InvalidTextureIndex then it means we had a cache hit and updated the value of our sema
+	// in which case we don't queue anything for upload, and return the idx
+	if (inserted->arrayIndex == InvalidTextureIndex)
+	{
+		// This is a new image (cache miss). Allocate a descriptor index for it.
+		inserted->arrayIndex = video::SubAllocatedDescriptorSet::AddressAllocator::invalid_address;
+		// Blocking allocation attempt; if the descriptor pool is exhausted, this may stall.
+		suballocatedDescriptorSet->multi_allocate(std::chrono::time_point<std::chrono::steady_clock>::max(), imagesArrayBinding, 1u, &inserted->arrayIndex); // if the prev submit causes DEVICE_LOST then we'll get a deadlock here since we're using max timepoint
+
+		if (inserted->arrayIndex != video::SubAllocatedDescriptorSet::AddressAllocator::invalid_address)
+		{
+			// Attempt to create a GPU image and image view for this texture.
+			core::smart_refctd_ptr<IGPUImageView> gpuImageView = nullptr;
+			ImageAllocateResults allocResults = tryCreateAndAllocateImage_SubmitIfNeeded(imageCreationParams, intendedNextSubmit, std::to_string(imageID));
+
+			if (allocResults.isValid())
+			{
+				inserted->imageType = georeferenceImageType;
+				inserted->gpuResident = false;
+				inserted->lastUsedFrameIndex = currentFrameIndex; // there was an eviction + auto-submit, we need to update AGAIN
+				inserted->allocationOffset = allocResults.allocationOffset;
+				inserted->allocationSize = allocResults.allocationSize;
+				inserted->gpuImageView = allocResults.gpuImageView;
+
+				// TODO: queue update of the set with the gpu image view.
+			}
+			else
+			{
+				// All attempts to try create the GPU image and its corresponding view have failed.
+				// Most likely cause: insufficient GPU memory or unsupported image parameters.
+				// TODO: Log a warning or error here � `addStaticImage2D` failed, likely due to low VRAM.
+				_NBL_DEBUG_BREAK_IF(true);
+
+				if (inserted->allocationOffset != ImagesMemorySubAllocator::InvalidAddress)
+				{
+					// We previously successfully create and allocated memory for the Image
+					// but failed to bind and create image view
+					// It's crucial to deallocate the offset+size form our images memory suballocator
+					imagesMemorySubAllocator->deallocate(inserted->allocationOffset, inserted->allocationSize);
+				}
+
+				if (inserted->arrayIndex != InvalidTextureIndex)
+				{
+					// We previously allocated a descriptor index, but failed to create a usable GPU image.
+					// It's crucial to deallocate this index to avoid leaks and preserve descriptor pool space.
+					// No semaphore wait needed here, as the GPU never got to use this slot.
+					suballocatedDescriptorSet->multi_deallocate(imagesArrayBinding, 1u, &inserted->arrayIndex, {});
+					inserted->arrayIndex = InvalidTextureIndex;
+				}
+			}
+		}
+		else
+		{
+			// TODO: log here, index allocation failed.
+			inserted->arrayIndex = InvalidTextureIndex;
+		}
+	}
+	else
+	{
+		// found in cache, but does it require resize? recreation?
+		if (inserted->gpuImageView)
+		{
+			auto imgViewParams = inserted->gpuImageView->getCreationParameters();
+			if (imgViewParams.image)
+			{
+				const auto cachedParams = static_cast<asset::IImage::SCreationParams>(imgViewParams.image->getCreationParameters());
+				const auto cachedImageType = inserted->imageType;
+				// image type and creation params (most importantly extent and format) should match, otherwise we evict, recreate and re-pus
+				const auto currentParams = static_cast<asset::IImage::SCreationParams>(imageCreationParams);
+				const bool needsRecreation = cachedImageType != georeferenceImageType || cachedParams != currentParams;
+				if (needsRecreation)
+				{
+					// We need to evict the image.
+					// Find erase the id from the cache, call evictCallback
+					//	wait for the image usage sempahore to finish (later we reallocate and reindex to avoid this)
+					//	try recreating the image (the same try process)
+					//	get the index hopefully from the creation
+				}
+			}
+			else
+			{
+				// TODO[LOG]
+			}
+		}
+		else
+		{
+			// TODO[LOG]
+		}
+	}
+	
+	assert(inserted->arrayIndex != InvalidTextureIndex); // shouldn't happen, because we're using LRU cache, so worst case eviction will happen + multi-deallocate and next next multi_allocate should definitely succeed
+
+	return inserted->arrayIndex;
+	// update frame idx 
+	// if found:
+	// check if needs recreation/resize, if it does, recreate
+	// if not, return set index
+	// if not found
+	// do the recreation process: TRY {create image, allocate and bind memory, create image view}, success --> queue for descriptor set update 
 }
 
 // TODO[Przemek]: similar to other drawXXX and drawXXX_internal functions that create mainobjects, drawObjects and push additional info in geometry buffer, input to function would be a GridDTMInfo
@@ -1786,12 +1771,169 @@ uint32_t DrawResourcesFiller::getImageIndexFromID(image_id imageID, const SInten
 	ImageReference* imageRef = imagesUsageCache->get(imageID);
 	if (imageRef)
 	{
-		textureIdx = imageRef->index;
+		textureIdx = imageRef->arrayIndex;
 		imageRef->lastUsedFrameIndex = currentFrameIndex; // update this because the texture will get used on the next frane
 	}
 	return textureIdx;
 }
 
+void DrawResourcesFiller::evictImage_SubmitIfNeeded(image_id imageID, const ImageReference& evicted, SIntendedSubmitInfo& intendedNextSubmit)
+{
+	// Later used to release the image's memory range.
+	core::smart_refctd_ptr<ImageCleanup> cleanupObject = core::make_smart_refctd_ptr<ImageCleanup>();
+	cleanupObject->imagesMemorySuballocator = imagesMemorySubAllocator;
+	cleanupObject->addr = evicted.allocationOffset;
+	cleanupObject->size = evicted.allocationSize;
+
+	const bool imageUsedForNextIntendedSubmit = (evicted.lastUsedFrameIndex == currentFrameIndex);
+
+	// NOTE: `deallocationWaitInfo` is crucial for both paths, we need to make sure we'll write to a descriptor arrayIndex when it's 100% done with previous usages.
+	if (imageUsedForNextIntendedSubmit)
+	{
+		// The evicted image is scheduled for use in the upcoming submit.
+		// To avoid rendering artifacts, we must flush the current draw queue now.
+		// After submission, we reset state so that data referencing the evicted slot can be re-uploaded.
+		submitDraws(intendedNextSubmit);
+		reset(); // resets everything, things referenced through mainObj and other shit will be pushed again through acquireXXX_SubmitIfNeeded
+
+		// Prepare wait info to defer index deallocation until the GPU has finished using the resource.
+		// we wait on the signal semaphore for the submit we just did above.
+		ISemaphore::SWaitInfo deallocationWaitInfo = { .semaphore = intendedNextSubmit.scratchSemaphore.semaphore, .value = intendedNextSubmit.scratchSemaphore.value };
+		suballocatedDescriptorSet->multi_deallocate(imagesArrayBinding, 1u, &evicted.arrayIndex, deallocationWaitInfo, &cleanupObject.get());
+	}
+	else
+	{
+		// The image is not used in the current frame, so we can deallocate without submitting any draws.
+		// Still wait on the semaphore to ensure past GPU usage is complete.
+		// TODO: We don't know which semaphore value the frame with `evicted.lastUsedFrameIndex` index was submitted with, so we wait for the worst case value which is the immediate prev submit.
+		ISemaphore::SWaitInfo deallocationWaitInfo = { .semaphore = intendedNextSubmit.scratchSemaphore.semaphore, .value = intendedNextSubmit.scratchSemaphore.value };
+		suballocatedDescriptorSet->multi_deallocate(imagesArrayBinding, 1u, &evicted.arrayIndex, deallocationWaitInfo, &cleanupObject.get());
+	}
+
+	// erase imageID from our state map
+	// kindof mirrors the state of the LRUCache for static images
+	if (evicted.imageType == ImageType::STATIC)
+		staticImagesState.erase(imageID);
+}
+
+DrawResourcesFiller::ImageAllocateResults  DrawResourcesFiller::tryCreateAndAllocateImage_SubmitIfNeeded(const nbl::asset::IImage::SCreationParams& imageParams, nbl::video::SIntendedSubmitInfo& intendedNextSubmit, std::string imageDebugName)
+{
+	ImageAllocateResults ret = {};
+
+	auto* device = m_utilities->getLogicalDevice();
+	auto* physDev = m_utilities->getLogicalDevice()->getPhysicalDevice();
+
+	// Attempt to create a GPU image and corresponding image view for this texture.
+	// If creation or memory allocation fails (likely due to VRAM exhaustion),
+	// we'll evict another texture from the LRU cache and retry until successful, or until only the currently-inserted image remains.
+	while (imagesUsageCache->size() > 0u)
+	{
+		// Try creating the image and allocating memory for it:
+		nbl::video::IGPUImage::SCreationParams params = {};
+		params = imageParams;
+		auto gpuImage = device->createImage(std::move(params));
+
+		if (gpuImage)
+		{
+			nbl::video::IDeviceMemoryBacked::SDeviceMemoryRequirements gpuImageMemoryRequirements = gpuImage->getMemoryReqs();
+			uint32_t actualAlignment = 1u << gpuImageMemoryRequirements.alignmentLog2;
+			const bool imageMemoryRequirementsMatch =
+				(physDev->getDeviceLocalMemoryTypeBits() & gpuImageMemoryRequirements.memoryTypeBits) != 0 && // should have device local memory compatible
+				(gpuImageMemoryRequirements.requiresDedicatedAllocation == false) && // should not require dedicated allocation
+				((ImagesMemorySubAllocator::MaxMemoryAlignment % actualAlignment) == 0u); // should be consistent with our suballocator's max alignment
+
+			if (imageMemoryRequirementsMatch)
+			{
+				ret.allocationOffset = imagesMemorySubAllocator->allocate(gpuImageMemoryRequirements.size, 1u << gpuImageMemoryRequirements.alignmentLog2);
+				const bool allocationFromImagesMemoryArenaSuccessfull = ret.allocationOffset != ImagesMemorySubAllocator::InvalidAddress;
+				if (allocationFromImagesMemoryArenaSuccessfull)
+				{
+					ret.allocationSize = gpuImageMemoryRequirements.size;
+					nbl::video::ILogicalDevice::SBindImageMemoryInfo bindImageMemoryInfo =
+					{
+						.image = gpuImage.get(),
+						.binding = { .memory = imagesMemoryArena.memory.get(), .offset = imagesMemoryArena.offset + ret.allocationOffset }
+					};
+					const bool boundToMemorySuccessfully = device->bindImageMemory({ &bindImageMemoryInfo, 1u });
+					if (boundToMemorySuccessfully)
+					{
+						gpuImage->setObjectDebugName(imageDebugName.c_str());
+						IGPUImageView::SCreationParams viewParams = {
+							.image = gpuImage,
+							.viewType = IGPUImageView::ET_2D,
+							.format = gpuImage->getCreationParameters().format
+						};
+						ret.gpuImageView = device->createImageView(std::move(viewParams));
+						if (ret.gpuImageView)
+						{
+							// SUCCESS!
+							ret.gpuImageView->setObjectDebugName((imageDebugName + " View").c_str());
+						}
+						else
+						{
+							// irrecoverable error if simple image creation fails.
+							// TODO[LOG]: that's rare, image view creation failed.
+							_NBL_DEBUG_BREAK_IF(true);
+						}
+
+						// succcessful with everything, just break and get out of this retry loop
+						break;
+					}
+					else
+					{
+						// irrecoverable error if simple bindImageMemory fails.
+						// TODO: LOG
+						_NBL_DEBUG_BREAK_IF(true);
+						break;
+					}
+				}
+				else
+				{
+					// printf(std::format("Allocation Failed, Trying again, ImageID={} Size={} \n", imageID, gpuImageMemoryRequirements.size).c_str());
+					// recoverable error when allocation fails, we don't log anything, next code will try evicting other images and retry
+				}
+			}
+			else
+			{
+				// irrecoverable error if memory requirements of the image don't match our preallocated devicememory
+				// TODO: LOG
+				_NBL_DEBUG_BREAK_IF(true);
+				break;
+			}
+		}
+		else
+		{
+			// irrecoverable error if simple image creation fails.
+			// TODO: LOG
+			_NBL_DEBUG_BREAK_IF(true);
+			break;
+		}
+
+		// Getting here means we failed creating or allocating the image, evict and retry.
+		if (imagesUsageCache->size() == 1u)
+		{
+			// Nothing else to evict; give up.
+			// We probably have evicted almost every other texture except the one we just allocated an index for
+			_NBL_DEBUG_BREAK_IF(true);
+			break;
+		}
+
+		assert(imagesUsageCache->size() > 1u);
+
+		const image_id evictionCandidate = imagesUsageCache->select_eviction_candidate();
+		ImageReference* imageRef = imagesUsageCache->peek(evictionCandidate);
+		if (imageRef)
+			evictImage_SubmitIfNeeded(evictionCandidate, *imageRef, intendedNextSubmit);
+		imagesUsageCache->erase(evictionCandidate);
+		while (suballocatedDescriptorSet->cull_frees()) {}; // to make sure deallocation requests in eviction callback are blocked for.
+
+		// we don't hold any references to the GPUImageView or GPUImage so descriptor binding will be the last reference
+		// hopefully by here the suballocated descriptor set freed some VRAM by dropping the image last ref and it's dedicated allocation.
+	}
+
+	return ret;
+}
+
 void DrawResourcesFiller::setGlyphMSDFTextureFunction(const GetGlyphMSDFTextureFunc& func)
 {
 	getGlyphMSDF = func;
diff --git a/62_CAD/DrawResourcesFiller.h b/62_CAD/DrawResourcesFiller.h
index 5501e4c84..f805c0a82 100644
--- a/62_CAD/DrawResourcesFiller.h
+++ b/62_CAD/DrawResourcesFiller.h
@@ -245,6 +245,8 @@ struct DrawResourcesFiller
 	*/
 	uint32_t addStaticImage2D(image_id imageID, const core::smart_refctd_ptr<ICPUImage>& cpuImage, SIntendedSubmitInfo& intendedNextSubmit);
 
+	uint32_t retrieveGeoreferencedImage_AllocateIfNeeded(image_id imageID, const GeoreferencedImageParams& params, SIntendedSubmitInfo& intendedNextSubmit);
+
 	// This function must be called immediately after `addStaticImage` for the same imageID.
 	void addImageObject(image_id imageID, const OrientedBoundingBox2D& obb, SIntendedSubmitInfo& intendedNextSubmit);
 	
@@ -465,6 +467,53 @@ struct DrawResourcesFiller
 	
 	uint32_t getImageIndexFromID(image_id imageID, const SIntendedSubmitInfo& intendedNextSubmit);
 
+	/**
+	 * @brief Evicts a GPU image and deallocates its associated descriptor and memory, flushing draws if needed.
+	 *
+	 * This function is called when an image must be removed from GPU memory (typically due to VRAM pressure).
+	 * If the evicted image is scheduled to be used in the next draw submission, a flush is performed to avoid
+	 * use-after-free issues. Otherwise, it proceeds with deallocation immediately.
+	 *
+	 * It prepares a cleanup object that ensures the memory range used by the image will be returned to the suballocator
+	 * only after the GPU has finished using it, guarded by a semaphore wait.
+	 *
+	 * @param imageID The unique ID of the image being evicted.
+	 * @param evicted A reference to the evicted image, containing metadata such as allocation offset, size, usage frame, etc.
+	 * @param intendedNextSubmit Reference to the intended submit information. Used for synchronizing draw submission and safe deallocation.
+	 *
+	 * @warning Deallocation may use a conservative semaphore wait value if exact usage information is unavailable. [future todo: fix] 
+	 */
+	void evictImage_SubmitIfNeeded(image_id imageID, const ImageReference& evicted, SIntendedSubmitInfo& intendedNextSubmit);
+	
+	struct ImageAllocateResults
+	{
+		nbl::core::smart_refctd_ptr<nbl::video::IGPUImageView> gpuImageView = nullptr;
+		uint64_t allocationOffset = ImagesMemorySubAllocator::InvalidAddress;
+		uint64_t allocationSize = 0ull;
+		bool isValid() const { return (gpuImageView && (allocationOffset != ImagesMemorySubAllocator::InvalidAddress)); }
+	};
+
+	/**
+	 * @brief Attempts to create and allocate a GPU image and its view, with fallback eviction on failure.
+	 *
+	 * This function tries to create a GPU image using the specified creation parameters, allocate memory
+	 * from the shared image memory arena, bind it to device-local memory, and create an associated image view.
+	 * If memory allocation fails (e.g. due to VRAM exhaustion), the function will evict textures from the internal
+	 * LRU cache and retry the operation until successful, or until only the currently-inserted image remains.
+	 *
+	 * This is primarily used by the draw resource filler to manage GPU image memory for streamed or cached images.
+	 *
+	 * @param imageParams Creation parameters for the image. Should match `nbl::asset::IImage::SCreationParams`.
+	 * @param intendedNextSubmit Reference to the current intended submit info. Used for synchronizing evictions.
+	 * @param imageDebugName Debug name assigned to the image and its view for easier profiling/debugging.
+	 *
+	 * @return ImageAllocateResults A struct containing:
+	 * - `allocationOffset`: Offset into the memory arena (or InvalidAddress on failure).
+	 * - `allocationSize`: Size of the allocated memory region.
+	 * - `gpuImageView`: The created GPU image view (nullptr if creation failed).
+	 */
+	ImageAllocateResults tryCreateAndAllocateImage_SubmitIfNeeded(const nbl::asset::IImage::SCreationParams& imageParams, nbl::video::SIntendedSubmitInfo& intendedNextSubmit, std::string debugName = "UnnamedNablaImage");
+
 	void resetMainObjects()
 	{
 		resourcesCollection.mainObjects.vector.clear();
diff --git a/62_CAD/Images.h b/62_CAD/Images.h
index d8c6cf864..d93c47d3c 100644
--- a/62_CAD/Images.h
+++ b/62_CAD/Images.h
@@ -6,6 +6,20 @@ using namespace nbl::asset;
 
 using image_id = uint64_t; // Could later be templated or replaced with a stronger type or hash key.
 
+enum class ImageType : uint8_t
+{
+    STATIC = 0,                        // Regular non-georeferenced image, fully loaded once
+    GEOREFERENCED_STREAMED,            // Streamed image, resolution depends on camera/view
+    GEOREFERENCED_FULL_RESOLUTION      // For smaller georeferenced images, entire image is eventually loaded and not streamed or view-dependant
+};
+
+struct GeoreferencedImageParams
+{
+	uint32_t2 imageExtents;
+	uint32_t2 viewportExtents;
+	asset::E_FORMAT format;
+};
+
 /**
  * @class ImagesMemorySubAllocator
  * @brief A memory sub-allocator designed for managing sub-allocations within a pre-allocated GPU memory arena for images.
@@ -108,13 +122,17 @@ struct StaticImageState
 struct ImageReference
 {
 	static constexpr uint32_t InvalidTextureIndex = nbl::hlsl::numeric_limits<uint32_t>::max;
-	uint32_t index = InvalidTextureIndex; // index in our array of textures binding
+	
+	uint32_t arrayIndex = InvalidTextureIndex; // index in our array of textures binding
+	ImageType imageType;
+	bool gpuResident = false;
 	uint64_t lastUsedFrameIndex = 0ull; // last used semaphore value on this image
 	uint64_t allocationOffset = ImagesMemorySubAllocator::InvalidAddress;
 	uint64_t allocationSize = 0ull;
+	core::smart_refctd_ptr<IGPUImageView> gpuImageView = nullptr;
 
 	ImageReference() 
-		: index(InvalidTextureIndex)
+		: arrayIndex(InvalidTextureIndex)
 		, lastUsedFrameIndex(0ull)
 		, allocationOffset(ImagesMemorySubAllocator::InvalidAddress)
 		, allocationSize(0ull)
@@ -122,7 +140,7 @@ struct ImageReference
 	
 	// In LRU Cache `insert` function, in case of cache miss, we need to construct the refereence with semaphore value
 	ImageReference(uint64_t currentFrameIndex) 
-		: index(InvalidTextureIndex)
+		: arrayIndex(InvalidTextureIndex)
 		, lastUsedFrameIndex(currentFrameIndex)
 		, allocationOffset(ImagesMemorySubAllocator::InvalidAddress)
 		, allocationSize(0ull)

From a7143525763141d4f3ec05511fd15bae92c5e60c Mon Sep 17 00:00:00 2001
From: Erfan Ahmadi <ahmadierfan99@gmail.com>
Date: Tue, 20 May 2025 12:12:35 +0400
Subject: [PATCH 268/529] small fix

---
 62_CAD/DrawResourcesFiller.cpp | 3 +--
 1 file changed, 1 insertion(+), 2 deletions(-)

diff --git a/62_CAD/DrawResourcesFiller.cpp b/62_CAD/DrawResourcesFiller.cpp
index b0b3306ae..c33356cc2 100644
--- a/62_CAD/DrawResourcesFiller.cpp
+++ b/62_CAD/DrawResourcesFiller.cpp
@@ -395,7 +395,6 @@ uint32_t DrawResourcesFiller::addStaticImage2D(image_id imageID, const core::sma
 			}
 
 			// Attempt to create a GPU image and image view for this texture.
-			core::smart_refctd_ptr<IGPUImageView> gpuImageView = nullptr;
 			ImageAllocateResults allocResults = tryCreateAndAllocateImage_SubmitIfNeeded(imageParams, intendedNextSubmit, std::to_string(imageID));
 
 			if (allocResults.isValid())
@@ -410,7 +409,7 @@ uint32_t DrawResourcesFiller::addStaticImage2D(image_id imageID, const core::sma
 				StaticImageState newState =
 				{
 					.cpuImage = cpuImage,
-					.gpuImageView = gpuImageView,
+					.gpuImageView = allocResults->gpuImageView,
 					.allocationOffset = inserted->allocationOffset,
 					.allocationSize = inserted->allocationSize,
 					.arrayIndex = inserted->arrayIndex,

From 596751c0b9c212393e94bde8c084aaf1f7f81b34 Mon Sep 17 00:00:00 2001
From: Erfan Ahmadi <ahmadierfan99@gmail.com>
Date: Tue, 20 May 2025 12:13:04 +0400
Subject: [PATCH 269/529] small fix2

---
 62_CAD/DrawResourcesFiller.cpp | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/62_CAD/DrawResourcesFiller.cpp b/62_CAD/DrawResourcesFiller.cpp
index c33356cc2..b3ac66ce6 100644
--- a/62_CAD/DrawResourcesFiller.cpp
+++ b/62_CAD/DrawResourcesFiller.cpp
@@ -409,9 +409,9 @@ uint32_t DrawResourcesFiller::addStaticImage2D(image_id imageID, const core::sma
 				StaticImageState newState =
 				{
 					.cpuImage = cpuImage,
-					.gpuImageView = allocResults->gpuImageView,
-					.allocationOffset = inserted->allocationOffset,
-					.allocationSize = inserted->allocationSize,
+					.gpuImageView = allocResults.gpuImageView,
+					.allocationOffset = allocResults.allocationOffset,
+					.allocationSize = allocResults.allocationSize,
 					.arrayIndex = inserted->arrayIndex,
 					.gpuResident = false,
 				};

From a394f2216ffa8a843350570ca4f4dafe66b27700 Mon Sep 17 00:00:00 2001
From: keptsecret <sorchon@gmail.com>
Date: Tue, 20 May 2025 15:47:40 +0700
Subject: [PATCH 270/529] use data accessor with preload data in reg

---
 .../app_resources/testWorkgroup.comp.hlsl     | 53 ++++++++++++++++++-
 1 file changed, 51 insertions(+), 2 deletions(-)

diff --git a/23_Arithmetic2UnitTest/app_resources/testWorkgroup.comp.hlsl b/23_Arithmetic2UnitTest/app_resources/testWorkgroup.comp.hlsl
index 9debd184d..047572cde 100644
--- a/23_Arithmetic2UnitTest/app_resources/testWorkgroup.comp.hlsl
+++ b/23_Arithmetic2UnitTest/app_resources/testWorkgroup.comp.hlsl
@@ -32,6 +32,53 @@ struct DataProxy
     }
 };
 
+template<class Config, class Binop>
+struct PreloadedDataProxy
+{
+    using dtype_t = vector<uint32_t, Config::ItemsPerInvocation_0>;
+    static_assert(nbl::hlsl::is_same_v<dtype_t, type_t>);
+
+    NBL_CONSTEXPR_STATIC_INLINE uint32_t PreloadedDataCount = Config::VirtualWorkgroupSize / Config::WorkgroupSize;
+
+    template<typename AccessType>
+    void get(const uint32_t ix, NBL_REF_ARG(dtype_t) value)
+    {
+        value = preloaded[(ix-nbl::hlsl::workgroup::SubgroupContiguousIndex())>>Config::WorkgroupSizeLog2];
+    }
+    template<typename AccessType>
+    void set(const uint32_t ix, const dtype_t value)
+    {
+        preloaded[(ix-nbl::hlsl::workgroup::SubgroupContiguousIndex())>>Config::WorkgroupSizeLog2] = value;
+    }
+
+    void preload()
+    {
+        const uint32_t workgroupOffset = nbl::hlsl::glsl::gl_WorkGroupID().x * Config::VirtualWorkgroupSize;
+        [unroll]
+        for (uint32_t idx = 0; idx < PreloadedDataCount; idx++)
+            preloaded[idx] = vk::RawBufferLoad<dtype_t>(pc.inputBufAddress + (workgroupOffset + idx * Config::WorkgroupSize + nbl::hlsl::workgroup::SubgroupContiguousIndex()) * sizeof(dtype_t));
+    }
+    void unload()
+    {
+        const uint32_t workgroupOffset = nbl::hlsl::glsl::gl_WorkGroupID().x * Config::VirtualWorkgroupSize;
+        uint64_t outputBufAddr = vk::RawBufferLoad<uint64_t>(pc.outputAddressBufAddress + Binop::BindingIndex * sizeof(uint64_t));
+        [unroll]
+        for (uint32_t idx = 0; idx < PreloadedDataCount; idx++)
+            [unroll]
+            for (uint32_t i = 0; i < Config::ItemsPerInvocation_0; i++)
+                vk::RawBufferStore<uint32_t>(outputBufAddr+sizeof(uint32_t)+sizeof(dtype_t)*(workgroupOffset + idx * Config::WorkgroupSize + nbl::hlsl::workgroup::SubgroupContiguousIndex())+i*sizeof(uint32_t), preloaded[idx][i]);
+            // vk::RawBufferStore<dtype_t>(outputBufAddr + sizeof(uint32_t) + sizeof(dtype_t) * (workgroupOffset + idx * Config::WorkgroupSize + nbl::hlsl::workgroup::SubgroupContiguousIndex()), preloaded[idx], sizeof(uint32_t)); TODO why won't this work???
+    }
+
+    void workgroupExecutionAndMemoryBarrier()
+    {
+        nbl::hlsl::glsl::barrier();
+        //nbl::hlsl::glsl::memoryBarrierShared(); implied by the above
+    }
+
+    dtype_t preloaded[PreloadedDataCount];
+};
+
 static ScratchProxy arithmeticAccessor;
 
 template<class Binop, class device_capabilities>
@@ -42,10 +89,12 @@ struct operation_t
 
     void operator()()
     {
-        DataProxy<config_t,Binop> dataAccessor;
-        nbl::hlsl::OPERATION<config_t,binop_base_t,device_capabilities>::template __call<DataProxy<config_t,Binop>, ScratchProxy>(dataAccessor,arithmeticAccessor);
+        PreloadedDataProxy<config_t,Binop> dataAccessor;
+        dataAccessor.preload();
+        nbl::hlsl::OPERATION<config_t,binop_base_t,device_capabilities>::template __call<PreloadedDataProxy<config_t,Binop>, ScratchProxy>(dataAccessor,arithmeticAccessor);
         // we barrier before because we alias the accessors for Binop
         arithmeticAccessor.workgroupExecutionAndMemoryBarrier();
+        dataAccessor.unload();
     }
 };
 

From 44c34a8a65866fb6304c12032efd08e2338c7116 Mon Sep 17 00:00:00 2001
From: keptsecret <sorchon@gmail.com>
Date: Tue, 20 May 2025 16:32:53 +0700
Subject: [PATCH 271/529] use store with data type because it works now

---
 23_Arithmetic2UnitTest/app_resources/shaderCommon.hlsl |  5 +----
 .../app_resources/testWorkgroup.comp.hlsl              | 10 ++--------
 2 files changed, 3 insertions(+), 12 deletions(-)

diff --git a/23_Arithmetic2UnitTest/app_resources/shaderCommon.hlsl b/23_Arithmetic2UnitTest/app_resources/shaderCommon.hlsl
index 45a1f8097..05dcfb469 100644
--- a/23_Arithmetic2UnitTest/app_resources/shaderCommon.hlsl
+++ b/23_Arithmetic2UnitTest/app_resources/shaderCommon.hlsl
@@ -52,10 +52,7 @@ static void subtest(NBL_CONST_REF_ARG(type_t) sourceVal)
     operation_t<params_t> func;
     type_t val = func(sourceVal);
     if (canStore())
-        [unroll]
-        for (uint32_t i = 0; i < N; i++)
-            vk::RawBufferStore<uint32_t>(outputBufAddr+sizeof(uint32_t)+sizeof(type_t)*globalIndex()+i*sizeof(uint32_t), val[i]);
-        // vk::RawBufferStore<dtype_t>(outputBufAddr + sizeof(uint32_t) + sizeof(dtype_t) * globalIndex(), value, sizeof(uint32_t)); TODO why won't this work???
+        vk::RawBufferStore<type_t>(outputBufAddr + sizeof(uint32_t) + sizeof(type_t) * globalIndex(), val, sizeof(uint32_t));
 }
 
 
diff --git a/23_Arithmetic2UnitTest/app_resources/testWorkgroup.comp.hlsl b/23_Arithmetic2UnitTest/app_resources/testWorkgroup.comp.hlsl
index 047572cde..38b6714bd 100644
--- a/23_Arithmetic2UnitTest/app_resources/testWorkgroup.comp.hlsl
+++ b/23_Arithmetic2UnitTest/app_resources/testWorkgroup.comp.hlsl
@@ -19,10 +19,7 @@ struct DataProxy
     {
         const uint32_t workgroupOffset = nbl::hlsl::glsl::gl_WorkGroupID().x * Config::VirtualWorkgroupSize;
         uint64_t outputBufAddr = vk::RawBufferLoad<uint64_t>(pc.outputAddressBufAddress + Binop::BindingIndex * sizeof(uint64_t));
-        [unroll]
-        for (uint32_t i = 0; i < Config::ItemsPerInvocation_0; i++)
-            vk::RawBufferStore<uint32_t>(outputBufAddr+sizeof(uint32_t)+sizeof(dtype_t)*(workgroupOffset+ix)+i*sizeof(uint32_t), value[i]);
-        // vk::RawBufferStore<dtype_t>(outputBufAddr + sizeof(uint32_t) + sizeof(dtype_t) * (workgroupOffset+ix), value, sizeof(uint32_t)); TODO why won't this work???
+        vk::RawBufferStore<dtype_t>(outputBufAddr + sizeof(uint32_t) + sizeof(dtype_t) * (workgroupOffset+ix), value, sizeof(uint32_t));
     }
 
     void workgroupExecutionAndMemoryBarrier()
@@ -64,10 +61,7 @@ struct PreloadedDataProxy
         uint64_t outputBufAddr = vk::RawBufferLoad<uint64_t>(pc.outputAddressBufAddress + Binop::BindingIndex * sizeof(uint64_t));
         [unroll]
         for (uint32_t idx = 0; idx < PreloadedDataCount; idx++)
-            [unroll]
-            for (uint32_t i = 0; i < Config::ItemsPerInvocation_0; i++)
-                vk::RawBufferStore<uint32_t>(outputBufAddr+sizeof(uint32_t)+sizeof(dtype_t)*(workgroupOffset + idx * Config::WorkgroupSize + nbl::hlsl::workgroup::SubgroupContiguousIndex())+i*sizeof(uint32_t), preloaded[idx][i]);
-            // vk::RawBufferStore<dtype_t>(outputBufAddr + sizeof(uint32_t) + sizeof(dtype_t) * (workgroupOffset + idx * Config::WorkgroupSize + nbl::hlsl::workgroup::SubgroupContiguousIndex()), preloaded[idx], sizeof(uint32_t)); TODO why won't this work???
+            vk::RawBufferStore<dtype_t>(outputBufAddr + sizeof(uint32_t) + sizeof(dtype_t) * (workgroupOffset + idx * Config::WorkgroupSize + nbl::hlsl::workgroup::SubgroupContiguousIndex()), preloaded[idx], sizeof(uint32_t));
     }
 
     void workgroupExecutionAndMemoryBarrier()

From 2cbc2b068c7893e9efff9a90c4cd241506b15ed1 Mon Sep 17 00:00:00 2001
From: Erfan Ahmadi <ahmadierfan99@gmail.com>
Date: Tue, 20 May 2025 15:11:57 +0400
Subject: [PATCH 272/529] [Untested] more work on Georeferenced images creation
 and recreating/resizing

---
 62_CAD/DrawResourcesFiller.cpp | 210 +++++++++++++++++++--------------
 62_CAD/DrawResourcesFiller.h   |  13 ++
 62_CAD/Images.h                |  19 +--
 3 files changed, 147 insertions(+), 95 deletions(-)

diff --git a/62_CAD/DrawResourcesFiller.cpp b/62_CAD/DrawResourcesFiller.cpp
index b3ac66ce6..9d638a920 100644
--- a/62_CAD/DrawResourcesFiller.cpp
+++ b/62_CAD/DrawResourcesFiller.cpp
@@ -365,19 +365,19 @@ uint32_t DrawResourcesFiller::addStaticImage2D(image_id imageID, const core::sma
 	// Try inserting or updating the image usage in the cache.
 	// If the image is already present, updates its semaphore value.
 	auto evictCallback = [&](image_id imageID, const ImageReference& evicted) { evictImage_SubmitIfNeeded(imageID, evicted, intendedNextSubmit); };
-	ImageReference* inserted = imagesUsageCache->insert(imageID, intendedNextSubmit.getFutureScratchSemaphore().value, evictCallback);
-	inserted->lastUsedFrameIndex = currentFrameIndex; // in case there was an eviction + auto-submit, we need to update AGAIN
+	ImageReference* cachedImageReference = imagesUsageCache->insert(imageID, intendedNextSubmit.getFutureScratchSemaphore().value, evictCallback);
+	cachedImageReference->lastUsedFrameIndex = currentFrameIndex; // in case there was an eviction + auto-submit, we need to update AGAIN
 
-	// if inserted->index was not InvalidTextureIndex then it means we had a cache hit and updated the value of our sema
+	// if cachedImageReference->index was not InvalidTextureIndex then it means we had a cache hit and updated the value of our sema
 	// in which case we don't queue anything for upload, and return the idx
-	if (inserted->arrayIndex == InvalidTextureIndex)
+	if (cachedImageReference->arrayIndex == InvalidTextureIndex)
 	{
 		// This is a new image (cache miss). Allocate a descriptor index for it.
-		inserted->arrayIndex = video::SubAllocatedDescriptorSet::AddressAllocator::invalid_address;
+		cachedImageReference->arrayIndex = video::SubAllocatedDescriptorSet::AddressAllocator::invalid_address;
 		// Blocking allocation attempt; if the descriptor pool is exhausted, this may stall.
-		suballocatedDescriptorSet->multi_allocate(std::chrono::time_point<std::chrono::steady_clock>::max(), imagesArrayBinding, 1u, &inserted->arrayIndex); // if the prev submit causes DEVICE_LOST then we'll get a deadlock here since we're using max timepoint
+		suballocatedDescriptorSet->multi_allocate(std::chrono::time_point<std::chrono::steady_clock>::max(), imagesArrayBinding, 1u, &cachedImageReference->arrayIndex); // if the prev submit causes DEVICE_LOST then we'll get a deadlock here since we're using max timepoint
 
-		if (inserted->arrayIndex != video::SubAllocatedDescriptorSet::AddressAllocator::invalid_address)
+		if (cachedImageReference->arrayIndex != video::SubAllocatedDescriptorSet::AddressAllocator::invalid_address)
 		{
 			auto* device = m_utilities->getLogicalDevice();
 			auto* physDev = m_utilities->getLogicalDevice()->getPhysicalDevice();
@@ -399,12 +399,12 @@ uint32_t DrawResourcesFiller::addStaticImage2D(image_id imageID, const core::sma
 
 			if (allocResults.isValid())
 			{
-				inserted->imageType = ImageType::STATIC;
-				inserted->gpuResident = false;
-				inserted->lastUsedFrameIndex = currentFrameIndex; // there was an eviction + auto-submit, we need to update AGAIN
-				inserted->allocationOffset = allocResults.allocationOffset;
-				inserted->allocationSize = allocResults.allocationSize;
-				inserted->gpuImageView = allocResults.gpuImageView;
+				cachedImageReference->imageType = ImageType::STATIC;
+				cachedImageReference->gpuResident = false;
+				cachedImageReference->lastUsedFrameIndex = currentFrameIndex; // there was an eviction + auto-submit, we need to update AGAIN
+				cachedImageReference->allocationOffset = allocResults.allocationOffset;
+				cachedImageReference->allocationSize = allocResults.allocationSize;
+				cachedImageReference->gpuImageView = allocResults.gpuImageView;
 
 				StaticImageState newState =
 				{
@@ -412,7 +412,7 @@ uint32_t DrawResourcesFiller::addStaticImage2D(image_id imageID, const core::sma
 					.gpuImageView = allocResults.gpuImageView,
 					.allocationOffset = allocResults.allocationOffset,
 					.allocationSize = allocResults.allocationSize,
-					.arrayIndex = inserted->arrayIndex,
+					.arrayIndex = cachedImageReference->arrayIndex,
 					.gpuResident = false,
 				};
 				staticImagesState.emplace(imageID, newState);
@@ -424,34 +424,37 @@ uint32_t DrawResourcesFiller::addStaticImage2D(image_id imageID, const core::sma
 				// TODO: Log a warning or error here � `addStaticImage2D` failed, likely due to low VRAM.
 				_NBL_DEBUG_BREAK_IF(true);
 
-				if (inserted->allocationOffset != ImagesMemorySubAllocator::InvalidAddress)
+				if (cachedImageReference->allocationOffset != ImagesMemorySubAllocator::InvalidAddress)
 				{
 					// We previously successfully create and allocated memory for the Image
 					// but failed to bind and create image view
 					// It's crucial to deallocate the offset+size form our images memory suballocator
-					imagesMemorySubAllocator->deallocate(inserted->allocationOffset, inserted->allocationSize);
+					imagesMemorySubAllocator->deallocate(cachedImageReference->allocationOffset, cachedImageReference->allocationSize);
 				}
 
-				if (inserted->arrayIndex != InvalidTextureIndex)
+				if (cachedImageReference->arrayIndex != InvalidTextureIndex)
 				{
 					// We previously allocated a descriptor index, but failed to create a usable GPU image.
 					// It's crucial to deallocate this index to avoid leaks and preserve descriptor pool space.
 					// No semaphore wait needed here, as the GPU never got to use this slot.
-					suballocatedDescriptorSet->multi_deallocate(imagesArrayBinding, 1u, &inserted->arrayIndex, {});
-					inserted->arrayIndex = InvalidTextureIndex;
+					suballocatedDescriptorSet->multi_deallocate(imagesArrayBinding, 1u, &cachedImageReference->arrayIndex, {});
+					cachedImageReference->arrayIndex = InvalidTextureIndex;
 				}
 			}
 		}
 		else
 		{
 			// TODO: log here, index allocation failed.
-			inserted->arrayIndex = InvalidTextureIndex;
+			cachedImageReference->arrayIndex = InvalidTextureIndex;
 		}
 	}
 	
-	assert(inserted->arrayIndex != InvalidTextureIndex); // shouldn't happen, because we're using LRU cache, so worst case eviction will happen + multi-deallocate and next next multi_allocate should definitely succeed
+	assert(cachedImageReference->arrayIndex != InvalidTextureIndex); // shouldn't happen, because we're using LRU cache, so worst case eviction will happen + multi-deallocate and next next multi_allocate should definitely succeed
+	
+	// cached or just inserted, we update the lastUsedFrameIndex
+	cachedImageReference->lastUsedFrameIndex = currentFrameIndex;
 
-	return inserted->arrayIndex;
+	return cachedImageReference->arrayIndex;
 }
 
 uint32_t DrawResourcesFiller::retrieveGeoreferencedImage_AllocateIfNeeded(image_id imageID, const GeoreferencedImageParams& params, SIntendedSubmitInfo& intendedNextSubmit)
@@ -462,12 +465,12 @@ uint32_t DrawResourcesFiller::retrieveGeoreferencedImage_AllocateIfNeeded(image_
 	// Try inserting or updating the image usage in the cache.
 	// If the image is already present, updates its semaphore value.
 	auto evictCallback = [&](image_id imageID, const ImageReference& evicted) { evictImage_SubmitIfNeeded(imageID, evicted, intendedNextSubmit); };
-	ImageReference* inserted = imagesUsageCache->insert(imageID, intendedNextSubmit.getFutureScratchSemaphore().value, evictCallback);
-	inserted->lastUsedFrameIndex = currentFrameIndex; // in case there was an eviction + auto-submit, we need to update AGAIN
+	ImageReference* cachedImageReference = imagesUsageCache->insert(imageID, intendedNextSubmit.getFutureScratchSemaphore().value, evictCallback);
 
 	// TODO: Function call that gets you image creaation params based on georeferencedImageParams (extents and mips and whatever), it will also get you the GEOREFERENED TYPE
 	IGPUImage::SCreationParams imageCreationParams = {};
-	ImageType georeferenceImageType = ImageType::GEOREFERENCED_FULL_RESOLUTION;
+	ImageType georeferenceImageType;
+	determineGeoreferencedImageCreationParams(imageCreationParams, georeferenceImageType, params);
 
 	assert(georeferenceImageType != ImageType::STATIC);
 
@@ -481,30 +484,65 @@ uint32_t DrawResourcesFiller::retrieveGeoreferencedImage_AllocateIfNeeded(image_
 		};
 		imageCreationParams.format = physDev->promoteImageFormat(request,imageCreationParams.tiling);
 	}
+	
+	// if cachedImageReference->index was not InvalidTextureIndex then it means we had a cache hit and updated the value of our sema
+	// But we need to check if the cached image needs resizing/recreation.
+	if (cachedImageReference->arrayIndex != InvalidTextureIndex)
+	{
+		// found in cache, but does it require resize? recreation?
+		if (cachedImageReference->gpuImageView)
+		{
+			auto imgViewParams = cachedImageReference->gpuImageView->getCreationParameters();
+			if (imgViewParams.image)
+			{
+				const auto cachedParams = static_cast<asset::IImage::SCreationParams>(imgViewParams.image->getCreationParameters());
+				const auto cachedImageType = cachedImageReference->imageType;
+				// image type and creation params (most importantly extent and format) should match, otherwise we evict, recreate and re-pus
+				const auto currentParams = static_cast<asset::IImage::SCreationParams>(imageCreationParams);
+				const bool needsRecreation = cachedImageType != georeferenceImageType || cachedParams != currentParams;
+				if (needsRecreation)
+				{
+					// call the eviction callbacl so the currently cached imageID gets eventually deallocated from memory arena.
+					evictCallback(imageID, *cachedImageReference);
+					
+					// instead of erasing and inserting the imageID into the cache, we just reset it, so the next block of code goes into array index allocation + creating our new image
+					*cachedImageReference = ImageReference(currentFrameIndex);
+					// imagesUsageCache->erase(imageID);
+					// cachedImageReference = imagesUsageCache->insert(imageID, intendedNextSubmit.getFutureScratchSemaphore().value, evictCallback);
+				}
+			}
+			else
+			{
+				// TODO[LOG]
+			}
+		}
+		else
+		{
+			// TODO[LOG]
+		}
+	}
 
-	// if inserted->index was not InvalidTextureIndex then it means we had a cache hit and updated the value of our sema
 	// in which case we don't queue anything for upload, and return the idx
-	if (inserted->arrayIndex == InvalidTextureIndex)
+	if (cachedImageReference->arrayIndex == InvalidTextureIndex)
 	{
 		// This is a new image (cache miss). Allocate a descriptor index for it.
-		inserted->arrayIndex = video::SubAllocatedDescriptorSet::AddressAllocator::invalid_address;
+		cachedImageReference->arrayIndex = video::SubAllocatedDescriptorSet::AddressAllocator::invalid_address;
 		// Blocking allocation attempt; if the descriptor pool is exhausted, this may stall.
-		suballocatedDescriptorSet->multi_allocate(std::chrono::time_point<std::chrono::steady_clock>::max(), imagesArrayBinding, 1u, &inserted->arrayIndex); // if the prev submit causes DEVICE_LOST then we'll get a deadlock here since we're using max timepoint
+		suballocatedDescriptorSet->multi_allocate(std::chrono::time_point<std::chrono::steady_clock>::max(), imagesArrayBinding, 1u, &cachedImageReference->arrayIndex); // if the prev submit causes DEVICE_LOST then we'll get a deadlock here since we're using max timepoint
 
-		if (inserted->arrayIndex != video::SubAllocatedDescriptorSet::AddressAllocator::invalid_address)
+		if (cachedImageReference->arrayIndex != video::SubAllocatedDescriptorSet::AddressAllocator::invalid_address)
 		{
 			// Attempt to create a GPU image and image view for this texture.
-			core::smart_refctd_ptr<IGPUImageView> gpuImageView = nullptr;
 			ImageAllocateResults allocResults = tryCreateAndAllocateImage_SubmitIfNeeded(imageCreationParams, intendedNextSubmit, std::to_string(imageID));
 
 			if (allocResults.isValid())
 			{
-				inserted->imageType = georeferenceImageType;
-				inserted->gpuResident = false;
-				inserted->lastUsedFrameIndex = currentFrameIndex; // there was an eviction + auto-submit, we need to update AGAIN
-				inserted->allocationOffset = allocResults.allocationOffset;
-				inserted->allocationSize = allocResults.allocationSize;
-				inserted->gpuImageView = allocResults.gpuImageView;
+				cachedImageReference->imageType = georeferenceImageType;
+				cachedImageReference->gpuResident = false;
+				cachedImageReference->lastUsedFrameIndex = currentFrameIndex; // there was an eviction + auto-submit, we need to update AGAIN
+				cachedImageReference->allocationOffset = allocResults.allocationOffset;
+				cachedImageReference->allocationSize = allocResults.allocationSize;
+				cachedImageReference->gpuImageView = allocResults.gpuImageView;
 
 				// TODO: queue update of the set with the gpu image view.
 			}
@@ -515,72 +553,37 @@ uint32_t DrawResourcesFiller::retrieveGeoreferencedImage_AllocateIfNeeded(image_
 				// TODO: Log a warning or error here � `addStaticImage2D` failed, likely due to low VRAM.
 				_NBL_DEBUG_BREAK_IF(true);
 
-				if (inserted->allocationOffset != ImagesMemorySubAllocator::InvalidAddress)
+				if (cachedImageReference->allocationOffset != ImagesMemorySubAllocator::InvalidAddress)
 				{
 					// We previously successfully create and allocated memory for the Image
 					// but failed to bind and create image view
 					// It's crucial to deallocate the offset+size form our images memory suballocator
-					imagesMemorySubAllocator->deallocate(inserted->allocationOffset, inserted->allocationSize);
+					imagesMemorySubAllocator->deallocate(cachedImageReference->allocationOffset, cachedImageReference->allocationSize);
 				}
 
-				if (inserted->arrayIndex != InvalidTextureIndex)
+				if (cachedImageReference->arrayIndex != InvalidTextureIndex)
 				{
 					// We previously allocated a descriptor index, but failed to create a usable GPU image.
 					// It's crucial to deallocate this index to avoid leaks and preserve descriptor pool space.
 					// No semaphore wait needed here, as the GPU never got to use this slot.
-					suballocatedDescriptorSet->multi_deallocate(imagesArrayBinding, 1u, &inserted->arrayIndex, {});
-					inserted->arrayIndex = InvalidTextureIndex;
+					suballocatedDescriptorSet->multi_deallocate(imagesArrayBinding, 1u, &cachedImageReference->arrayIndex, {});
+					cachedImageReference->arrayIndex = InvalidTextureIndex;
 				}
 			}
 		}
 		else
 		{
 			// TODO: log here, index allocation failed.
-			inserted->arrayIndex = InvalidTextureIndex;
-		}
-	}
-	else
-	{
-		// found in cache, but does it require resize? recreation?
-		if (inserted->gpuImageView)
-		{
-			auto imgViewParams = inserted->gpuImageView->getCreationParameters();
-			if (imgViewParams.image)
-			{
-				const auto cachedParams = static_cast<asset::IImage::SCreationParams>(imgViewParams.image->getCreationParameters());
-				const auto cachedImageType = inserted->imageType;
-				// image type and creation params (most importantly extent and format) should match, otherwise we evict, recreate and re-pus
-				const auto currentParams = static_cast<asset::IImage::SCreationParams>(imageCreationParams);
-				const bool needsRecreation = cachedImageType != georeferenceImageType || cachedParams != currentParams;
-				if (needsRecreation)
-				{
-					// We need to evict the image.
-					// Find erase the id from the cache, call evictCallback
-					//	wait for the image usage sempahore to finish (later we reallocate and reindex to avoid this)
-					//	try recreating the image (the same try process)
-					//	get the index hopefully from the creation
-				}
-			}
-			else
-			{
-				// TODO[LOG]
-			}
-		}
-		else
-		{
-			// TODO[LOG]
+			cachedImageReference->arrayIndex = InvalidTextureIndex;
 		}
 	}
+
+	assert(cachedImageReference->arrayIndex != InvalidTextureIndex); // shouldn't happen, because we're using LRU cache, so worst case eviction will happen + multi-deallocate and next next multi_allocate should definitely succeed
 	
-	assert(inserted->arrayIndex != InvalidTextureIndex); // shouldn't happen, because we're using LRU cache, so worst case eviction will happen + multi-deallocate and next next multi_allocate should definitely succeed
+	// cached or just inserted, we update the lastUsedFrameIndex
+	cachedImageReference->lastUsedFrameIndex = currentFrameIndex;
 
-	return inserted->arrayIndex;
-	// update frame idx 
-	// if found:
-	// check if needs recreation/resize, if it does, recreate
-	// if not, return set index
-	// if not found
-	// do the recreation process: TRY {create image, allocate and bind memory, create image view}, success --> queue for descriptor set update 
+	return cachedImageReference->arrayIndex;
 }
 
 // TODO[Przemek]: similar to other drawXXX and drawXXX_internal functions that create mainobjects, drawObjects and push additional info in geometry buffer, input to function would be a GridDTMInfo
@@ -1778,6 +1781,11 @@ uint32_t DrawResourcesFiller::getImageIndexFromID(image_id imageID, const SInten
 
 void DrawResourcesFiller::evictImage_SubmitIfNeeded(image_id imageID, const ImageReference& evicted, SIntendedSubmitInfo& intendedNextSubmit)
 {
+	if (evicted.arrayIndex == InvalidTextureIndex)
+	{
+		_NBL_DEBUG_BREAK_IF(true); // shouldn't happen under normal circumstances, TODO: LOG warning
+		return;
+	}
 	// Later used to release the image's memory range.
 	core::smart_refctd_ptr<ImageCleanup> cleanupObject = core::make_smart_refctd_ptr<ImageCleanup>();
 	cleanupObject->imagesMemorySuballocator = imagesMemorySubAllocator;
@@ -1804,7 +1812,7 @@ void DrawResourcesFiller::evictImage_SubmitIfNeeded(image_id imageID, const Imag
 	{
 		// The image is not used in the current frame, so we can deallocate without submitting any draws.
 		// Still wait on the semaphore to ensure past GPU usage is complete.
-		// TODO: We don't know which semaphore value the frame with `evicted.lastUsedFrameIndex` index was submitted with, so we wait for the worst case value which is the immediate prev submit.
+		// TODO: We don't know which semaphore value the frame with `evicted.lastUsedFrameIndex` index was submitted with, so we wait for the worst case value conservatively, which is the immediate prev submit.
 		ISemaphore::SWaitInfo deallocationWaitInfo = { .semaphore = intendedNextSubmit.scratchSemaphore.semaphore, .value = intendedNextSubmit.scratchSemaphore.value };
 		suballocatedDescriptorSet->multi_deallocate(imagesArrayBinding, 1u, &evicted.arrayIndex, deallocationWaitInfo, &cleanupObject.get());
 	}
@@ -1824,7 +1832,7 @@ DrawResourcesFiller::ImageAllocateResults  DrawResourcesFiller::tryCreateAndAllo
 
 	// Attempt to create a GPU image and corresponding image view for this texture.
 	// If creation or memory allocation fails (likely due to VRAM exhaustion),
-	// we'll evict another texture from the LRU cache and retry until successful, or until only the currently-inserted image remains.
+	// we'll evict another texture from the LRU cache and retry until successful, or until only the currently-cachedImageReference image remains.
 	while (imagesUsageCache->size() > 0u)
 	{
 		// Try creating the image and allocating memory for it:
@@ -1933,6 +1941,36 @@ DrawResourcesFiller::ImageAllocateResults  DrawResourcesFiller::tryCreateAndAllo
 	return ret;
 }
 
+void DrawResourcesFiller::determineGeoreferencedImageCreationParams(nbl::asset::IImage::SCreationParams& outImageParams, ImageType& outImageType, const GeoreferencedImageParams& georeferencedImageParams)
+{
+	// Decide whether the image can reside fully into memory rather than get streamed.
+	// TODO: Improve logic, currently just a simple check to see if the full-screen image has more pixels that viewport or not
+	const bool betterToResideFullyInMem = georeferencedImageParams.imageExtents.x * georeferencedImageParams.imageExtents.y <= georeferencedImageParams.viewportExtents.x * georeferencedImageParams.viewportExtents.y;
+
+	if (betterToResideFullyInMem)
+		outImageType = ImageType::GEOREFERENCED_FULL_RESOLUTION;
+	else
+		outImageType = ImageType::GEOREFERENCED_STREAMED;
+
+	outImageParams.type = asset::IImage::ET_2D;
+	outImageParams.samples = asset::IImage::ESCF_1_BIT;
+	outImageParams.format = georeferencedImageParams.format;
+
+	if (outImageType == ImageType::GEOREFERENCED_FULL_RESOLUTION)
+	{
+		outImageParams.extent = { georeferencedImageParams.imageExtents.x, georeferencedImageParams.imageExtents.y, 1u };
+	}
+	else
+	{
+		// TODO: Better Logic, area around the view, etc...
+		outImageParams.extent = { georeferencedImageParams.viewportExtents.x, georeferencedImageParams.viewportExtents.y, 1u };
+	}
+
+
+	outImageParams.mipLevels = 1u; // TODO: Later do mipmapping
+	outImageParams.arrayLayers = 1u;
+}
+
 void DrawResourcesFiller::setGlyphMSDFTextureFunction(const GetGlyphMSDFTextureFunc& func)
 {
 	getGlyphMSDF = func;
@@ -2023,7 +2061,7 @@ uint32_t DrawResourcesFiller::addMSDFTexture(const MSDFInputInfo& msdfInput, cor
 	
 	inserted->lastUsedFrameIndex = currentFrameIndex; // in case there was an eviction + auto-submit, we need to update AGAIN
 
-	// if inserted->alloc_idx was not InvalidTextureIndex then it means we had a cache hit and updated the value of our sema, in which case we don't queue anything for upload, and return the idx
+	// if cachedImageReference->alloc_idx was not InvalidTextureIndex then it means we had a cache hit and updated the value of our sema, in which case we don't queue anything for upload, and return the idx
 	if (inserted->alloc_idx == InvalidTextureIndex)
 	{
 		// New insertion == cache miss happened and insertion was successfull
diff --git a/62_CAD/DrawResourcesFiller.h b/62_CAD/DrawResourcesFiller.h
index f805c0a82..ae071654a 100644
--- a/62_CAD/DrawResourcesFiller.h
+++ b/62_CAD/DrawResourcesFiller.h
@@ -514,6 +514,19 @@ struct DrawResourcesFiller
 	 */
 	ImageAllocateResults tryCreateAndAllocateImage_SubmitIfNeeded(const nbl::asset::IImage::SCreationParams& imageParams, nbl::video::SIntendedSubmitInfo& intendedNextSubmit, std::string debugName = "UnnamedNablaImage");
 
+	/**
+	 * @brief Determines creation parameters for a georeferenced image based on heuristics.
+	 *
+	 * This function decides whether a georeferenced image should be treated as a fully resident GPU texture
+	 * or as a streamable image based on the relationship between its total resolution and the viewport size.
+	 * It then fills out the appropriate Nabla image creation parameters.
+	 *
+	 * @param[out] outImageParams Structure to be filled with image creation parameters (format, size, etc.).
+	 * @param[out] outImageType Indicates whether the image should be fully resident or streamed.
+	 * @param[in] georeferencedImageParams Parameters describing the full image extents, viewport extents, and format.
+	 */
+	void determineGeoreferencedImageCreationParams(nbl::asset::IImage::SCreationParams& outImageParams, ImageType& outImageType, const GeoreferencedImageParams& georeferencedImageParams);
+
 	void resetMainObjects()
 	{
 		resourcesCollection.mainObjects.vector.clear();
diff --git a/62_CAD/Images.h b/62_CAD/Images.h
index d93c47d3c..fe3e8bde9 100644
--- a/62_CAD/Images.h
+++ b/62_CAD/Images.h
@@ -8,7 +8,8 @@ using image_id = uint64_t; // Could later be templated or replaced with a strong
 
 enum class ImageType : uint8_t
 {
-    STATIC = 0,                        // Regular non-georeferenced image, fully loaded once
+	INVALID = 0,
+    STATIC,                        // Regular non-georeferenced image, fully loaded once
     GEOREFERENCED_STREAMED,            // Streamed image, resolution depends on camera/view
     GEOREFERENCED_FULL_RESOLUTION      // For smaller georeferenced images, entire image is eventually loaded and not streamed or view-dependant
 };
@@ -124,26 +125,26 @@ struct ImageReference
 	static constexpr uint32_t InvalidTextureIndex = nbl::hlsl::numeric_limits<uint32_t>::max;
 	
 	uint32_t arrayIndex = InvalidTextureIndex; // index in our array of textures binding
-	ImageType imageType;
+	ImageType imageType = ImageType::INVALID;
 	bool gpuResident = false;
 	uint64_t lastUsedFrameIndex = 0ull; // last used semaphore value on this image
 	uint64_t allocationOffset = ImagesMemorySubAllocator::InvalidAddress;
 	uint64_t allocationSize = 0ull;
 	core::smart_refctd_ptr<IGPUImageView> gpuImageView = nullptr;
-
-	ImageReference() 
-		: arrayIndex(InvalidTextureIndex)
-		, lastUsedFrameIndex(0ull)
-		, allocationOffset(ImagesMemorySubAllocator::InvalidAddress)
-		, allocationSize(0ull)
-	{}
 	
 	// In LRU Cache `insert` function, in case of cache miss, we need to construct the refereence with semaphore value
 	ImageReference(uint64_t currentFrameIndex) 
 		: arrayIndex(InvalidTextureIndex)
+		, imageType(ImageType::INVALID)
+		, gpuResident(false)
 		, lastUsedFrameIndex(currentFrameIndex)
 		, allocationOffset(ImagesMemorySubAllocator::InvalidAddress)
 		, allocationSize(0ull)
+		, gpuImageView(nullptr)
+	{}
+	
+	ImageReference() 
+		: ImageReference(0ull)
 	{}
 
 	// In LRU Cache `insert` function, in case of cache hit, we need to assign semaphore value without changing `index`

From 52f5485d78f88c721ca4a971349d512664355ef7 Mon Sep 17 00:00:00 2001
From: devsh <devsh@devsh.eu>
Date: Tue, 20 May 2025 14:25:10 +0200
Subject: [PATCH 273/529] make old code compile with new API and work with
 renderdoc

---
 67_RayQueryGeometry/main.cpp | 12 ++++++------
 1 file changed, 6 insertions(+), 6 deletions(-)

diff --git a/67_RayQueryGeometry/main.cpp b/67_RayQueryGeometry/main.cpp
index 0d7494e9c..2b5145913 100644
--- a/67_RayQueryGeometry/main.cpp
+++ b/67_RayQueryGeometry/main.cpp
@@ -4,7 +4,7 @@
 
 #include "common.hpp"
 
-#define TEST_ASSET_CONV_AS
+//#define TEST_ASSET_CONV_AS
 
 class RayQueryGeometryApp final : public examples::SimpleWindowedApplication, public application_templates::MonoAssetManagerAndBuiltinResourceApplication
 {
@@ -722,7 +722,7 @@ class RayQueryGeometryApp final : public examples::SimpleWindowedApplication, pu
 			cpuTlas->setInstances(std::move(geomInstances));
 			cpuTlas->setBuildFlags(IGPUTopLevelAccelerationStructure::BUILD_FLAGS::PREFER_FAST_TRACE_BIT);
 
-//#define TEST_REBAR_FALLBACK
+#define TEST_REBAR_FALLBACK
 			// convert with asset converter
 			smart_refctd_ptr<CAssetConverter> converter = CAssetConverter::create({ .device = m_device.get(), .optimizer = {} });
 			struct MyInputs : CAssetConverter::SInputs
@@ -927,7 +927,7 @@ class RayQueryGeometryApp final : public examples::SimpleWindowedApplication, pu
 				m_utils->createFilledDeviceLocalBufferOnDedMem(SIntendedSubmitInfo{ .queue = queue }, std::move(params), geomInfos).move_into(geometryInfoBuffer);
 			}
 
-			return true;
+			return bool(gpuTlas);
 		}
 #else
 		bool createGeometries(video::CThreadSafeQueueAdapter* queue, const IGeometryCreator* gc)
@@ -1122,7 +1122,7 @@ class RayQueryGeometryApp final : public examples::SimpleWindowedApplication, pu
 			{
 				IGPUBottomLevelAccelerationStructure::DeviceBuildInfo blasBuildInfos[OT_COUNT];
 				uint32_t primitiveCounts[OT_COUNT];
-				IGPUBottomLevelAccelerationStructure::Triangles<const IGPUBuffer> triangles[OT_COUNT];
+				IGPUBottomLevelAccelerationStructure::Triangles<IGPUBuffer> triangles[OT_COUNT];
 				uint32_t scratchSizes[OT_COUNT];
 
 				for (uint32_t i = 0; i < objectsGpu.size(); i++)
@@ -1159,7 +1159,7 @@ class RayQueryGeometryApp final : public examples::SimpleWindowedApplication, pu
 					{
 						const auto* trianglesData = triangles;
 						const uint32_t maxPrimCount[1] = { primitiveCounts[i] };
-						buildSizes = m_device->getAccelerationStructureBuildSizes(blasFlags, false, std::span{trianglesData,1}, maxPrimCount);
+						buildSizes = m_device->getAccelerationStructureBuildSizes(false,blasFlags, false, std::span{trianglesData,1}, maxPrimCount);
 						if (!buildSizes)
 							return logFail("Failed to get BLAS build sizes");
 					}
@@ -1252,7 +1252,7 @@ class RayQueryGeometryApp final : public examples::SimpleWindowedApplication, pu
 			// compact blas
 			{
 				std::array<size_t, OT_COUNT> asSizes{ 0 };
-				if (!m_device->getQueryPoolResults(queryPool.get(), 0, objectsGpu.size(), asSizes.data(), sizeof(size_t), IQueryPool::WAIT_BIT))
+				if (!m_device->getQueryPoolResults(queryPool.get(), 0, objectsGpu.size(), asSizes.data(), sizeof(size_t), bitflag(IQueryPool::RESULTS_FLAGS::WAIT_BIT)|IQueryPool::_64_BIT))
 					return logFail("Could not get query pool results for AS sizes");
 
 				std::array<smart_refctd_ptr<IGPUBottomLevelAccelerationStructure>, OT_COUNT> cleanupBlas;

From 010a9e549619a3e6426474a2dd1625c43654d669 Mon Sep 17 00:00:00 2001
From: devsh <devsh@devsh.eu>
Date: Tue, 20 May 2025 16:14:15 +0200
Subject: [PATCH 274/529] add missing ownership acquire and clean up the code a
 bit

---
 67_RayQueryGeometry/main.cpp | 136 +++++++++++++++++++----------------
 1 file changed, 75 insertions(+), 61 deletions(-)

diff --git a/67_RayQueryGeometry/main.cpp b/67_RayQueryGeometry/main.cpp
index 2b5145913..1248a1bf3 100644
--- a/67_RayQueryGeometry/main.cpp
+++ b/67_RayQueryGeometry/main.cpp
@@ -4,7 +4,7 @@
 
 #include "common.hpp"
 
-//#define TEST_ASSET_CONV_AS
+#define TEST_ASSET_CONV_AS
 
 class RayQueryGeometryApp final : public examples::SimpleWindowedApplication, public application_templates::MonoAssetManagerAndBuiltinResourceApplication
 {
@@ -619,37 +619,6 @@ class RayQueryGeometryApp final : public examples::SimpleWindowedApplication, pu
 			const uint32_t byteOffsets[OT_COUNT] = { 18, 24, 24, 20, 20, 24, 16, 12 };	// based on normals data position
 			const uint32_t smoothNormals[OT_COUNT] = { 0, 1, 1, 0, 0, 1, 1, 1 };
 
-			struct CPUBufferBindings
-			{
-				nbl::asset::SBufferBinding<ICPUBuffer> vertex, index;
-			};
-			std::array<CPUBufferBindings, OT_COUNT> cpuBuffers;
-
-			for (uint32_t i = 0; i < cpuBuffers.size(); i++)
-			{
-				const auto& geom = objectsCpu[i];
-				auto& cpuObj = cpuBuffers[i];
-				const bool useIndex = geom.data.indexType != EIT_UNKNOWN;
-
-				auto vBuffer = smart_refctd_ptr(geom.data.bindings[0].buffer); // no offset
-				auto vUsage = bitflag(IGPUBuffer::EUF_STORAGE_BUFFER_BIT) | IGPUBuffer::EUF_SHADER_DEVICE_ADDRESS_BIT;
-
-				auto iBuffer = smart_refctd_ptr(geom.data.indexBuffer.buffer); // no offset
-				auto iUsage = bitflag(IGPUBuffer::EUF_STORAGE_BUFFER_BIT) | IGPUBuffer::EUF_SHADER_DEVICE_ADDRESS_BIT;
-
-				vBuffer->addUsageFlags(vUsage);
-				vBuffer->setContentHash(vBuffer->computeContentHash());
-				cpuObj.vertex = { .offset = 0, .buffer = vBuffer };
-
-				if (useIndex)
-					if (iBuffer)
-					{
-						iBuffer->addUsageFlags(iUsage);
-						iBuffer->setContentHash(iBuffer->computeContentHash());
-					}
-				cpuObj.index = { .offset = 0, .buffer = iBuffer };
-			}
-
 			// get ICPUBuffers into ICPUBottomLevelAccelerationStructures
 			std::array<smart_refctd_ptr<ICPUBottomLevelAccelerationStructure>, OT_COUNT> cpuBlas;
 			for (uint32_t i = 0; i < cpuBlas.size(); i++)
@@ -660,11 +629,10 @@ class RayQueryGeometryApp final : public examples::SimpleWindowedApplication, pu
 				auto& tri = triangles->front();
 				auto& primCount = primitiveCounts->front();
 				const auto& geom = objectsCpu[i];
-				const auto& cpuBuf = cpuBuffers[i];
 
 				const bool useIndex = geom.data.indexType != EIT_UNKNOWN;
 				const uint32_t vertexStride = geom.data.inputParams.bindings[0].stride;
-				const uint32_t numVertices = cpuBuf.vertex.buffer->getSize() / vertexStride;
+				const uint32_t numVertices = (geom.data.bindings[0].buffer->getSize()-geom.data.bindings[0].offset) / vertexStride;
 
 				if (useIndex)
 					primCount = geom.data.indexCount / 3;
@@ -675,11 +643,16 @@ class RayQueryGeometryApp final : public examples::SimpleWindowedApplication, pu
 				geomInfos[i].vertexStride = vertexStride;
 				geomInfos[i].smoothNormals = smoothNormals[i];
 
-				tri.vertexData[0] = cpuBuf.vertex;
-				tri.indexData = useIndex ? cpuBuf.index : cpuBuf.vertex;
+				geom.data.bindings[0].buffer->setContentHash(geom.data.bindings[0].buffer->computeContentHash());
+				tri.vertexData[0] = geom.data.bindings[0];
+				if (useIndex)
+				{
+					geom.data.indexBuffer.buffer->setContentHash(geom.data.indexBuffer.buffer->computeContentHash());
+					tri.indexData = geom.data.indexBuffer;
+				}
 				tri.maxVertex = numVertices - 1;
 				tri.vertexStride = vertexStride;
-				tri.vertexFormat = EF_R32G32B32_SFLOAT;
+				tri.vertexFormat = static_cast<E_FORMAT>(geom.data.inputParams.attributes[0].format);
 				tri.indexType = geom.data.indexType;
 				tri.geometryFlags = IGPUBottomLevelAccelerationStructure::GEOMETRY_FLAGS::OPAQUE_BIT;
 
@@ -758,46 +731,36 @@ class RayQueryGeometryApp final : public examples::SimpleWindowedApplication, pu
 			inputs.allocator = &myalloc;
 #endif
 
-			std::array<ICPUTopLevelAccelerationStructure*, 1u> tmpTlas;
-			std::array<ICPUBuffer*, OT_COUNT * 2u> tmpBuffers;
+			std::array<const ICPUBuffer*, OT_COUNT * 2u> tmpBuffers;
 			{
-				tmpTlas[0] = cpuTlas.get();
 				for (uint32_t i = 0; i < objectsCpu.size(); i++)
 				{
-					tmpBuffers[2 * i + 0] = cpuBuffers[i].vertex.buffer.get();
-					tmpBuffers[2 * i + 1] = cpuBuffers[i].index.buffer.get();
+					tmpBuffers[2 * i + 0] = cpuBlas[i]->getTriangleGeometries().front().vertexData[0].buffer.get();
+					tmpBuffers[2 * i + 1] = cpuBlas[i]->getTriangleGeometries().front().indexData.buffer.get();
 				}
 
-				std::get<CAssetConverter::SInputs::asset_span_t<ICPUTopLevelAccelerationStructure>>(inputs.assets) = tmpTlas;
+				std::get<CAssetConverter::SInputs::asset_span_t<ICPUTopLevelAccelerationStructure>>(inputs.assets) = {&cpuTlas.get(),1};
+				std::get<CAssetConverter::SInputs::asset_span_t<ICPUBottomLevelAccelerationStructure>>(inputs.assets) = {&cpuBlas.data()->get(),cpuBlas.size()};
 				std::get<CAssetConverter::SInputs::asset_span_t<ICPUBuffer>>(inputs.assets) = tmpBuffers;
 			}
 
 			auto reservation = converter->reserve(inputs);
 			{
-				auto prepass = [&]<typename asset_type_t>(const auto & references) -> bool
+				auto prepass = [&]<typename asset_type_t>() -> bool
 				{
 					auto objects = reservation.getGPUObjects<asset_type_t>();
-					uint32_t counter = {};
 					for (auto& object : objects)
+					if (!object.value)
 					{
-						auto gpu = object.value;
-						auto* reference = references[counter];
-
-						if (reference)
-						{
-							if (!gpu)
-							{
-								m_logger->log("Failed to convert a CPU object to GPU!", ILogger::ELL_ERROR);
-								return false;
-							}
-						}
-						counter++;
+						m_logger->log("Failed to convert a CPU object to GPU!", ILogger::ELL_ERROR);
+						return false;
 					}
 					return true;
 				};
 
-				prepass.template operator() < ICPUTopLevelAccelerationStructure > (tmpTlas);
-				prepass.template operator() < ICPUBuffer > (tmpBuffers);
+				prepass.template operator()<ICPUBuffer>();
+				prepass.template operator()<ICPUBottomLevelAccelerationStructure>();
+				prepass.template operator()<ICPUTopLevelAccelerationStructure>();
 			}
 
 
@@ -812,6 +775,7 @@ class RayQueryGeometryApp final : public examples::SimpleWindowedApplication, pu
 					xferBufInfos[i].cmdbuf = xferBufs[i].get();
 			}
 			auto xferSema = m_device->createSemaphore(0u);
+			xferSema->setObjectDebugName("Transfer Semaphore");
 			SIntendedSubmitInfo transfer = {};
 			transfer.queue = getTransferUpQueue();
 			transfer.scratchCommandBuffers = xferBufInfos;
@@ -832,6 +796,7 @@ class RayQueryGeometryApp final : public examples::SimpleWindowedApplication, pu
 					compBufInfos[i].cmdbuf = compBufs[i].get();
 			}
 			auto compSema = m_device->createSemaphore(0u);
+			compSema->setObjectDebugName("Compute Semaphore");
 			SIntendedSubmitInfo compute = {};
 			compute.queue = getComputeQueue();
 			compute.scratchCommandBuffers = compBufInfos;
@@ -841,6 +806,7 @@ class RayQueryGeometryApp final : public examples::SimpleWindowedApplication, pu
 				.stageMask = PIPELINE_STAGE_FLAGS::ACCELERATION_STRUCTURE_BUILD_BIT|PIPELINE_STAGE_FLAGS::ACCELERATION_STRUCTURE_COPY_BIT
 			};
 			// convert
+			auto gQueue = getGraphicsQueue();
 			{
 				smart_refctd_ptr<CAssetConverter::SConvertParams::scratch_for_device_AS_build_t> scratchAlloc;
 				{
@@ -895,7 +861,7 @@ class RayQueryGeometryApp final : public examples::SimpleWindowedApplication, pu
 				params.transfer = &transfer;
 				params.compute = &compute;
 				params.scratchForDeviceASBuild = scratchAlloc.get();
-				params.finalUser = queue->getFamilyIndex();
+				params.finalUser = gQueue->getFamilyIndex();
 
 				auto future = reservation.convert(params);
 				if (future.copy() != IQueue::RESULT::SUCCESS)
@@ -920,11 +886,59 @@ class RayQueryGeometryApp final : public examples::SimpleWindowedApplication, pu
 				}
 			}
 
+			//
 			{
 				IGPUBuffer::SCreationParams params;
 				params.usage = IGPUBuffer::EUF_STORAGE_BUFFER_BIT | IGPUBuffer::EUF_TRANSFER_DST_BIT | IGPUBuffer::EUF_INLINE_UPDATE_VIA_CMDBUF | IGPUBuffer::EUF_SHADER_DEVICE_ADDRESS_BIT;
 				params.size = OT_COUNT * sizeof(SGeomInfo);
-				m_utils->createFilledDeviceLocalBufferOnDedMem(SIntendedSubmitInfo{ .queue = queue }, std::move(params), geomInfos).move_into(geometryInfoBuffer);
+				m_utils->createFilledDeviceLocalBufferOnDedMem(SIntendedSubmitInfo{ .queue = gQueue }, std::move(params), geomInfos).move_into(geometryInfoBuffer);
+			}
+
+			// acquire ownership
+			if (const auto gQFI=gQueue->getFamilyIndex(), otherQueueFamilyIndex=queue->getFamilyIndex(); gQFI!=otherQueueFamilyIndex)
+			{
+				smart_refctd_ptr<IGPUCommandBuffer> cmdbuf;
+				m_device->createCommandPool(gQFI,IGPUCommandPool::CREATE_FLAGS::TRANSIENT_BIT)->createCommandBuffers(IGPUCommandPool::BUFFER_LEVEL::PRIMARY,{&cmdbuf,1});
+				cmdbuf->begin(IGPUCommandBuffer::USAGE::ONE_TIME_SUBMIT_BIT);
+				core::vector<IGPUCommandBuffer::SBufferMemoryBarrier<IGPUCommandBuffer::SOwnershipTransferBarrier>> bufBarriers;
+				auto acquireBufferRange = [&bufBarriers,otherQueueFamilyIndex](const SBufferRange<IGPUBuffer>& bufferRange)
+				{
+					bufBarriers.push_back({
+						.barrier = {
+							.dep = {
+								.srcStageMask = PIPELINE_STAGE_FLAGS::NONE,
+								.srcAccessMask = ACCESS_FLAGS::NONE,
+								.dstStageMask = PIPELINE_STAGE_FLAGS::COMPUTE_SHADER_BIT,
+								.dstAccessMask = ACCESS_FLAGS::ACCELERATION_STRUCTURE_READ_BIT|ACCESS_FLAGS::STORAGE_READ_BIT
+							},
+							.ownershipOp = IGPUCommandBuffer::SOwnershipTransferBarrier::OWNERSHIP_OP::ACQUIRE,
+							.otherQueueFamilyIndex = otherQueueFamilyIndex
+						},
+						.range = bufferRange
+					});
+				};
+				for (auto buffer : reservation.getGPUObjects<ICPUBuffer>())
+				{
+					const auto& buff = buffer.value;
+					acquireBufferRange({.offset=0,.size=buff->getSize(),.buffer=buff});
+				}
+				auto acquireAS = [&acquireBufferRange](const IGPUAccelerationStructure* as)
+				{
+					acquireBufferRange(as->getCreationParams().bufferRange);
+				};
+				for (auto blas : reservation.getGPUObjects<ICPUBottomLevelAccelerationStructure>())
+					acquireAS(blas.value.get());
+				acquireAS(gpuTlas.get());
+				cmdbuf->pipelineBarrier(asset::E_DEPENDENCY_FLAGS::EDF_NONE,{.memBarriers={},.bufBarriers=bufBarriers});
+				cmdbuf->end();
+				const IQueue::SSubmitInfo::SCommandBufferInfo cmdbufInfo = {
+					.cmdbuf = cmdbuf.get()
+				};
+				const IQueue::SSubmitInfo info = {
+					.waitSemaphores = {}, // we already waited with the host on the AS build
+					.commandBuffers = {&cmdbufInfo,1}
+				};
+				gQueue->submit({&info,1});
 			}
 
 			return bool(gpuTlas);

From f1fb1b525bbbd8415bba0978289b82c5ee788814 Mon Sep 17 00:00:00 2001
From: Fletterio <fj.letterio@gmail.com>
Date: Tue, 20 May 2025 22:14:06 -0300
Subject: [PATCH 275/529] Adds Cache iteration test

---
 21_LRUCacheUnitTest/main.cpp | 27 +++++++++++++++++++++++++--
 1 file changed, 25 insertions(+), 2 deletions(-)

diff --git a/21_LRUCacheUnitTest/main.cpp b/21_LRUCacheUnitTest/main.cpp
index 1c63fc744..1e7830b16 100644
--- a/21_LRUCacheUnitTest/main.cpp
+++ b/21_LRUCacheUnitTest/main.cpp
@@ -5,6 +5,7 @@
 
 // I've moved out a tiny part of this example into a shared header for reuse, please open and read it.
 #include "nbl/application_templates/MonoSystemMonoLoggerApplication.hpp"
+#include <ranges>
 
 using namespace nbl;
 using namespace core;
@@ -180,6 +181,28 @@ class LRUCacheTestApp final : public nbl::application_templates::MonoSystemMonoL
 			cache3.insert(1, "bar");
 			cache3.clear();
 
+			// Cache iterator test
+			constexpr uint32_t cache4Size = 10;
+			ResizableLRUCache<uint32_t, uint32_t> cache4(cache4Size);
+			for (auto i = 0u; i < cache4Size; i++)
+			{
+				cache4.insert(i, i);
+			}
+			// Default iterator is MRU -> LRU
+			uint32_t counter = cache4Size - 1;
+			for (auto& pair : cache4)
+			{
+				assert(pair.first == counter && pair.second == counter);
+				counter--;
+			}
+			// Reverse LRU -> MRU traversal
+			counter = 0u;
+			for (auto it = cache4.crbegin(); it != cache4.crend(); it++)
+			{
+				assert(it->first == counter && it->second == counter);
+				counter++;
+			}
+
 			// Besides the disposal function that gets called when evicting, we need to check that the Cache properly destroys all resident `Key,Value` pairs when destroyed
 			struct Foo
 			{
@@ -208,9 +231,9 @@ class LRUCacheTestApp final : public nbl::application_templates::MonoSystemMonoL
 
 			int destroyCounter = 0;
 			{
-				ResizableLRUCache<int, Foo> cache4(10u);
+				ResizableLRUCache<int, Foo> cache5(10u);
 				for (int i = 0; i < 10; i++)
-					cache4.insert(i, Foo(&destroyCounter));
+					cache5.insert(i, Foo(&destroyCounter));
 				int x = 0;
 			}
 			

From 3f40b925c97ece98c72527d5c85d2593471f26c9 Mon Sep 17 00:00:00 2001
From: Erfan Ahmadi <ahmadierfan99@gmail.com>
Date: Wed, 21 May 2025 07:52:41 +0400
Subject: [PATCH 276/529] small comment

---
 62_CAD/DrawResourcesFiller.cpp | 1 +
 1 file changed, 1 insertion(+)

diff --git a/62_CAD/DrawResourcesFiller.cpp b/62_CAD/DrawResourcesFiller.cpp
index 9d638a920..510faf6d2 100644
--- a/62_CAD/DrawResourcesFiller.cpp
+++ b/62_CAD/DrawResourcesFiller.cpp
@@ -1945,6 +1945,7 @@ void DrawResourcesFiller::determineGeoreferencedImageCreationParams(nbl::asset::
 {
 	// Decide whether the image can reside fully into memory rather than get streamed.
 	// TODO: Improve logic, currently just a simple check to see if the full-screen image has more pixels that viewport or not
+	// TODO: add criterial that the size of the full-res image shouldn't  consume more than 30% of the total memory arena for images (if we allowed larger than viewport extents)
 	const bool betterToResideFullyInMem = georeferencedImageParams.imageExtents.x * georeferencedImageParams.imageExtents.y <= georeferencedImageParams.viewportExtents.x * georeferencedImageParams.viewportExtents.y;
 
 	if (betterToResideFullyInMem)

From 0ccd26fc93d22587219b12291f855929949cef74 Mon Sep 17 00:00:00 2001
From: keptsecret <sorchon@gmail.com>
Date: Wed, 21 May 2025 15:01:30 +0700
Subject: [PATCH 277/529] save reduction returns to storage

---
 .../app_resources/testWorkgroup.comp.hlsl     | 20 ++++++++++++++++++-
 23_Arithmetic2UnitTest/main.cpp               | 12 ++++++-----
 2 files changed, 26 insertions(+), 6 deletions(-)

diff --git a/23_Arithmetic2UnitTest/app_resources/testWorkgroup.comp.hlsl b/23_Arithmetic2UnitTest/app_resources/testWorkgroup.comp.hlsl
index 38b6714bd..58e293ba3 100644
--- a/23_Arithmetic2UnitTest/app_resources/testWorkgroup.comp.hlsl
+++ b/23_Arithmetic2UnitTest/app_resources/testWorkgroup.comp.hlsl
@@ -81,6 +81,23 @@ struct operation_t
     using binop_base_t = typename Binop::base_t;
     using otype_t = typename Binop::type_t;
 
+    // workgroup reduction returns the value of the reduction
+    // workgroup scans do no return anything, but use the data accessor to do the storing directly
+#if IS_REDUCTION
+    void operator()()
+    {
+        PreloadedDataProxy<config_t,Binop> dataAccessor;
+        dataAccessor.preload();
+        otype_t value = nbl::hlsl::OPERATION<config_t,binop_base_t,device_capabilities>::template __call<PreloadedDataProxy<config_t,Binop>, ScratchProxy>(dataAccessor,arithmeticAccessor);
+        // we barrier before because we alias the accessors for Binop
+        arithmeticAccessor.workgroupExecutionAndMemoryBarrier();
+
+        [unroll]
+        for (uint32_t i = 0; i < PreloadedDataProxy<config_t,Binop>::PreloadedDataCount; i++)
+            dataAccessor.preloaded[i] = value;
+        dataAccessor.unload();
+    }
+#else
     void operator()()
     {
         PreloadedDataProxy<config_t,Binop> dataAccessor;
@@ -90,6 +107,7 @@ struct operation_t
         arithmeticAccessor.workgroupExecutionAndMemoryBarrier();
         dataAccessor.unload();
     }
+#endif
 };
 
 
@@ -101,7 +119,7 @@ static void subtest(NBL_CONST_REF_ARG(type_t) sourceVal)
         vk::RawBufferStore<uint32_t>(outputBufAddr, nbl::hlsl::glsl::gl_SubgroupSize());
 
     operation_t<binop<T>,nbl::hlsl::jit::device_capabilities> func;
-    func(); // store is done with data accessor now
+    func();
 }
 
 
diff --git a/23_Arithmetic2UnitTest/main.cpp b/23_Arithmetic2UnitTest/main.cpp
index 282473d12..2edd34439 100644
--- a/23_Arithmetic2UnitTest/main.cpp
+++ b/23_Arithmetic2UnitTest/main.cpp
@@ -271,22 +271,24 @@ class Workgroup2ScanTestApp final : public application_templates::BasicMultiQueu
 		smart_refctd_ptr<ICPUShader> overriddenUnspecialized;
 		if constexpr (WorkgroupTest)
 		{
-			const std::string definitions[5] = {
+			const std::string definitions[6] = {
 				"workgroup2::" + arith_name,
 				std::to_string(workgroupSizeLog2),
 				std::to_string(itemsPerWG),
 				std::to_string(itemsPerInvoc),
-				std::to_string(subgroupSizeLog2)
+				std::to_string(subgroupSizeLog2),
+				std::to_string(arith_name=="reduction")
 			};
 
-			const IShaderCompiler::SMacroDefinition defines[5] = {
+			const IShaderCompiler::SMacroDefinition defines[6] = {
 				{ "OPERATION", definitions[0] },
 				{ "WORKGROUP_SIZE_LOG2", definitions[1] },
 				{ "ITEMS_PER_WG", definitions[2] },
 				{ "ITEMS_PER_INVOCATION", definitions[3] },
-				{ "SUBGROUP_SIZE_LOG2", definitions[4] }
+				{ "SUBGROUP_SIZE_LOG2", definitions[4] },
+				{ "IS_REDUCTION", definitions[5] }
 			};
-			options.preprocessorOptions.extraDefines = { defines, defines + 5 };
+			options.preprocessorOptions.extraDefines = { defines, defines + 6 };
 
 			overriddenUnspecialized = compiler->compileToSPIRV((const char*)source->getContent()->getPointer(), options);
 		}

From fccbcb2894941d3743021dee874dbe545ea317b5 Mon Sep 17 00:00:00 2001
From: Erfan Ahmadi <ahmadierfan99@gmail.com>
Date: Wed, 21 May 2025 16:40:35 +0400
Subject: [PATCH 278/529] addGeoreferencedImage, and image cache heavy
 refactors (Image States)

---
 62_CAD/DrawResourcesFiller.cpp                | 432 ++++++++++--------
 62_CAD/DrawResourcesFiller.h                  |  54 ++-
 62_CAD/Images.h                               | 103 ++---
 62_CAD/main.cpp                               |  11 +-
 62_CAD/shaders/globals.hlsl                   |  20 +-
 .../main_pipeline/fragment_shader.hlsl        |  17 +-
 .../shaders/main_pipeline/vertex_shader.hlsl  |  23 +-
 7 files changed, 396 insertions(+), 264 deletions(-)

diff --git a/62_CAD/DrawResourcesFiller.cpp b/62_CAD/DrawResourcesFiller.cpp
index 510faf6d2..425834a99 100644
--- a/62_CAD/DrawResourcesFiller.cpp
+++ b/62_CAD/DrawResourcesFiller.cpp
@@ -7,7 +7,7 @@ DrawResourcesFiller::DrawResourcesFiller(smart_refctd_ptr<IUtilities>&& utils, I
 	m_utilities(utils),
 	m_copyQueue(copyQueue)
 {
-	imagesUsageCache = std::unique_ptr<ImagesUsageCache>(new ImagesUsageCache(ImagesBindingArraySize));
+	imagesCache = std::unique_ptr<ImagesCache>(new ImagesCache(ImagesBindingArraySize));
 }
 
 // function is called when buffer is filled and we should submit draws and clear the buffers and continue filling
@@ -63,7 +63,7 @@ void DrawResourcesFiller::allocateResourcesBuffer(ILogicalDevice* logicalDevice,
 		IDeviceMemoryAllocator::SAllocateInfo allocationInfo =
 		{
 			// TODO: Get from user side.
-			.size = 170 * 1024 * 1024, // 70 MB
+			.size = 270 * 1024 * 1024, // 70 MB
 			.flags = IDeviceMemoryAllocation::E_MEMORY_ALLOCATE_FLAGS::EMAF_NONE,
 			.memoryTypeIndex = memoryTypeIdx,
 			.dedication = nullptr,
@@ -360,24 +360,24 @@ void DrawResourcesFiller::drawFontGlyph(
 	}
 }
 
-uint32_t DrawResourcesFiller::addStaticImage2D(image_id imageID, const core::smart_refctd_ptr<ICPUImage>& cpuImage, SIntendedSubmitInfo& intendedNextSubmit)
+bool DrawResourcesFiller::ensureStaticImageAvailability(image_id imageID, const core::smart_refctd_ptr<ICPUImage>& cpuImage, SIntendedSubmitInfo& intendedNextSubmit)
 {
 	// Try inserting or updating the image usage in the cache.
 	// If the image is already present, updates its semaphore value.
-	auto evictCallback = [&](image_id imageID, const ImageReference& evicted) { evictImage_SubmitIfNeeded(imageID, evicted, intendedNextSubmit); };
-	ImageReference* cachedImageReference = imagesUsageCache->insert(imageID, intendedNextSubmit.getFutureScratchSemaphore().value, evictCallback);
-	cachedImageReference->lastUsedFrameIndex = currentFrameIndex; // in case there was an eviction + auto-submit, we need to update AGAIN
+	auto evictCallback = [&](image_id imageID, const CachedImageRecord& evicted) { evictImage_SubmitIfNeeded(imageID, evicted, intendedNextSubmit); };
+	CachedImageRecord* cachedImageRecord = imagesCache->insert(imageID, intendedNextSubmit.getFutureScratchSemaphore().value, evictCallback);
+	cachedImageRecord->lastUsedFrameIndex = currentFrameIndex; // in case there was an eviction + auto-submit, we need to update AGAIN
 
-	// if cachedImageReference->index was not InvalidTextureIndex then it means we had a cache hit and updated the value of our sema
+	// if cachedImageRecord->index was not InvalidTextureIndex then it means we had a cache hit and updated the value of our sema
 	// in which case we don't queue anything for upload, and return the idx
-	if (cachedImageReference->arrayIndex == InvalidTextureIndex)
+	if (cachedImageRecord->arrayIndex == InvalidTextureIndex)
 	{
 		// This is a new image (cache miss). Allocate a descriptor index for it.
-		cachedImageReference->arrayIndex = video::SubAllocatedDescriptorSet::AddressAllocator::invalid_address;
+		cachedImageRecord->arrayIndex = video::SubAllocatedDescriptorSet::AddressAllocator::invalid_address;
 		// Blocking allocation attempt; if the descriptor pool is exhausted, this may stall.
-		suballocatedDescriptorSet->multi_allocate(std::chrono::time_point<std::chrono::steady_clock>::max(), imagesArrayBinding, 1u, &cachedImageReference->arrayIndex); // if the prev submit causes DEVICE_LOST then we'll get a deadlock here since we're using max timepoint
+		suballocatedDescriptorSet->multi_allocate(std::chrono::time_point<std::chrono::steady_clock>::max(), imagesArrayBinding, 1u, &cachedImageRecord->arrayIndex); // if the prev submit causes DEVICE_LOST then we'll get a deadlock here since we're using max timepoint
 
-		if (cachedImageReference->arrayIndex != video::SubAllocatedDescriptorSet::AddressAllocator::invalid_address)
+		if (cachedImageRecord->arrayIndex != video::SubAllocatedDescriptorSet::AddressAllocator::invalid_address)
 		{
 			auto* device = m_utilities->getLogicalDevice();
 			auto* physDev = m_utilities->getLogicalDevice()->getPhysicalDevice();
@@ -399,23 +399,13 @@ uint32_t DrawResourcesFiller::addStaticImage2D(image_id imageID, const core::sma
 
 			if (allocResults.isValid())
 			{
-				cachedImageReference->imageType = ImageType::STATIC;
-				cachedImageReference->gpuResident = false;
-				cachedImageReference->lastUsedFrameIndex = currentFrameIndex; // there was an eviction + auto-submit, we need to update AGAIN
-				cachedImageReference->allocationOffset = allocResults.allocationOffset;
-				cachedImageReference->allocationSize = allocResults.allocationSize;
-				cachedImageReference->gpuImageView = allocResults.gpuImageView;
-
-				StaticImageState newState =
-				{
-					.cpuImage = cpuImage,
-					.gpuImageView = allocResults.gpuImageView,
-					.allocationOffset = allocResults.allocationOffset,
-					.allocationSize = allocResults.allocationSize,
-					.arrayIndex = cachedImageReference->arrayIndex,
-					.gpuResident = false,
-				};
-				staticImagesState.emplace(imageID, newState);
+				cachedImageRecord->type = ImageType::STATIC;
+				cachedImageRecord->state = ImageState::CREATED_AND_MEMORY_BOUND;
+				cachedImageRecord->lastUsedFrameIndex = currentFrameIndex; // there was an eviction + auto-submit, we need to update AGAIN
+				cachedImageRecord->allocationOffset = allocResults.allocationOffset;
+				cachedImageRecord->allocationSize = allocResults.allocationSize;
+				cachedImageRecord->gpuImageView = allocResults.gpuImageView;
+				cachedImageRecord->staticCPUImage = cpuImage;
 			}
 			else
 			{
@@ -424,48 +414,48 @@ uint32_t DrawResourcesFiller::addStaticImage2D(image_id imageID, const core::sma
 				// TODO: Log a warning or error here � `addStaticImage2D` failed, likely due to low VRAM.
 				_NBL_DEBUG_BREAK_IF(true);
 
-				if (cachedImageReference->allocationOffset != ImagesMemorySubAllocator::InvalidAddress)
+				if (cachedImageRecord->allocationOffset != ImagesMemorySubAllocator::InvalidAddress)
 				{
 					// We previously successfully create and allocated memory for the Image
 					// but failed to bind and create image view
 					// It's crucial to deallocate the offset+size form our images memory suballocator
-					imagesMemorySubAllocator->deallocate(cachedImageReference->allocationOffset, cachedImageReference->allocationSize);
+					imagesMemorySubAllocator->deallocate(cachedImageRecord->allocationOffset, cachedImageRecord->allocationSize);
 				}
 
-				if (cachedImageReference->arrayIndex != InvalidTextureIndex)
+				if (cachedImageRecord->arrayIndex != InvalidTextureIndex)
 				{
 					// We previously allocated a descriptor index, but failed to create a usable GPU image.
 					// It's crucial to deallocate this index to avoid leaks and preserve descriptor pool space.
 					// No semaphore wait needed here, as the GPU never got to use this slot.
-					suballocatedDescriptorSet->multi_deallocate(imagesArrayBinding, 1u, &cachedImageReference->arrayIndex, {});
-					cachedImageReference->arrayIndex = InvalidTextureIndex;
+					suballocatedDescriptorSet->multi_deallocate(imagesArrayBinding, 1u, &cachedImageRecord->arrayIndex, {});
+					cachedImageRecord->arrayIndex = InvalidTextureIndex;
 				}
 			}
 		}
 		else
 		{
 			// TODO: log here, index allocation failed.
-			cachedImageReference->arrayIndex = InvalidTextureIndex;
+			cachedImageRecord->arrayIndex = InvalidTextureIndex;
 		}
 	}
 	
-	assert(cachedImageReference->arrayIndex != InvalidTextureIndex); // shouldn't happen, because we're using LRU cache, so worst case eviction will happen + multi-deallocate and next next multi_allocate should definitely succeed
 	
 	// cached or just inserted, we update the lastUsedFrameIndex
-	cachedImageReference->lastUsedFrameIndex = currentFrameIndex;
+	cachedImageRecord->lastUsedFrameIndex = currentFrameIndex;
 
-	return cachedImageReference->arrayIndex;
+	assert(cachedImageRecord->arrayIndex != InvalidTextureIndex); // shouldn't happen, because we're using LRU cache, so worst case eviction will happen + multi-deallocate and next next multi_allocate should definitely succeed
+	return cachedImageRecord->arrayIndex != InvalidTextureIndex;
 }
 
-uint32_t DrawResourcesFiller::retrieveGeoreferencedImage_AllocateIfNeeded(image_id imageID, const GeoreferencedImageParams& params, SIntendedSubmitInfo& intendedNextSubmit)
+bool DrawResourcesFiller::ensureGeoreferencedImageAvailability_AllocateIfNeeded(image_id imageID, const GeoreferencedImageParams& params, SIntendedSubmitInfo& intendedNextSubmit)
 {
 	auto* device = m_utilities->getLogicalDevice();
 	auto* physDev = m_utilities->getLogicalDevice()->getPhysicalDevice();
 
 	// Try inserting or updating the image usage in the cache.
 	// If the image is already present, updates its semaphore value.
-	auto evictCallback = [&](image_id imageID, const ImageReference& evicted) { evictImage_SubmitIfNeeded(imageID, evicted, intendedNextSubmit); };
-	ImageReference* cachedImageReference = imagesUsageCache->insert(imageID, intendedNextSubmit.getFutureScratchSemaphore().value, evictCallback);
+	auto evictCallback = [&](image_id imageID, const CachedImageRecord& evicted) { evictImage_SubmitIfNeeded(imageID, evicted, intendedNextSubmit); };
+	CachedImageRecord* cachedImageRecord = imagesCache->insert(imageID, intendedNextSubmit.getFutureScratchSemaphore().value, evictCallback);
 
 	// TODO: Function call that gets you image creaation params based on georeferencedImageParams (extents and mips and whatever), it will also get you the GEOREFERENED TYPE
 	IGPUImage::SCreationParams imageCreationParams = {};
@@ -485,30 +475,30 @@ uint32_t DrawResourcesFiller::retrieveGeoreferencedImage_AllocateIfNeeded(image_
 		imageCreationParams.format = physDev->promoteImageFormat(request,imageCreationParams.tiling);
 	}
 	
-	// if cachedImageReference->index was not InvalidTextureIndex then it means we had a cache hit and updated the value of our sema
+	// if cachedImageRecord->index was not InvalidTextureIndex then it means we had a cache hit and updated the value of our sema
 	// But we need to check if the cached image needs resizing/recreation.
-	if (cachedImageReference->arrayIndex != InvalidTextureIndex)
+	if (cachedImageRecord->arrayIndex != InvalidTextureIndex)
 	{
 		// found in cache, but does it require resize? recreation?
-		if (cachedImageReference->gpuImageView)
+		if (cachedImageRecord->gpuImageView)
 		{
-			auto imgViewParams = cachedImageReference->gpuImageView->getCreationParameters();
+			auto imgViewParams = cachedImageRecord->gpuImageView->getCreationParameters();
 			if (imgViewParams.image)
 			{
 				const auto cachedParams = static_cast<asset::IImage::SCreationParams>(imgViewParams.image->getCreationParameters());
-				const auto cachedImageType = cachedImageReference->imageType;
+				const auto cachedImageType = cachedImageRecord->type;
 				// image type and creation params (most importantly extent and format) should match, otherwise we evict, recreate and re-pus
 				const auto currentParams = static_cast<asset::IImage::SCreationParams>(imageCreationParams);
 				const bool needsRecreation = cachedImageType != georeferenceImageType || cachedParams != currentParams;
 				if (needsRecreation)
 				{
 					// call the eviction callbacl so the currently cached imageID gets eventually deallocated from memory arena.
-					evictCallback(imageID, *cachedImageReference);
+					evictCallback(imageID, *cachedImageRecord);
 					
 					// instead of erasing and inserting the imageID into the cache, we just reset it, so the next block of code goes into array index allocation + creating our new image
-					*cachedImageReference = ImageReference(currentFrameIndex);
+					*cachedImageRecord = CachedImageRecord(currentFrameIndex);
 					// imagesUsageCache->erase(imageID);
-					// cachedImageReference = imagesUsageCache->insert(imageID, intendedNextSubmit.getFutureScratchSemaphore().value, evictCallback);
+					// cachedImageRecord = imagesUsageCache->insert(imageID, intendedNextSubmit.getFutureScratchSemaphore().value, evictCallback);
 				}
 			}
 			else
@@ -523,28 +513,27 @@ uint32_t DrawResourcesFiller::retrieveGeoreferencedImage_AllocateIfNeeded(image_
 	}
 
 	// in which case we don't queue anything for upload, and return the idx
-	if (cachedImageReference->arrayIndex == InvalidTextureIndex)
+	if (cachedImageRecord->arrayIndex == InvalidTextureIndex)
 	{
 		// This is a new image (cache miss). Allocate a descriptor index for it.
-		cachedImageReference->arrayIndex = video::SubAllocatedDescriptorSet::AddressAllocator::invalid_address;
+		cachedImageRecord->arrayIndex = video::SubAllocatedDescriptorSet::AddressAllocator::invalid_address;
 		// Blocking allocation attempt; if the descriptor pool is exhausted, this may stall.
-		suballocatedDescriptorSet->multi_allocate(std::chrono::time_point<std::chrono::steady_clock>::max(), imagesArrayBinding, 1u, &cachedImageReference->arrayIndex); // if the prev submit causes DEVICE_LOST then we'll get a deadlock here since we're using max timepoint
+		suballocatedDescriptorSet->multi_allocate(std::chrono::time_point<std::chrono::steady_clock>::max(), imagesArrayBinding, 1u, &cachedImageRecord->arrayIndex); // if the prev submit causes DEVICE_LOST then we'll get a deadlock here since we're using max timepoint
 
-		if (cachedImageReference->arrayIndex != video::SubAllocatedDescriptorSet::AddressAllocator::invalid_address)
+		if (cachedImageRecord->arrayIndex != video::SubAllocatedDescriptorSet::AddressAllocator::invalid_address)
 		{
 			// Attempt to create a GPU image and image view for this texture.
 			ImageAllocateResults allocResults = tryCreateAndAllocateImage_SubmitIfNeeded(imageCreationParams, intendedNextSubmit, std::to_string(imageID));
 
 			if (allocResults.isValid())
 			{
-				cachedImageReference->imageType = georeferenceImageType;
-				cachedImageReference->gpuResident = false;
-				cachedImageReference->lastUsedFrameIndex = currentFrameIndex; // there was an eviction + auto-submit, we need to update AGAIN
-				cachedImageReference->allocationOffset = allocResults.allocationOffset;
-				cachedImageReference->allocationSize = allocResults.allocationSize;
-				cachedImageReference->gpuImageView = allocResults.gpuImageView;
-
-				// TODO: queue update of the set with the gpu image view.
+				cachedImageRecord->type = georeferenceImageType;
+				cachedImageRecord->state = ImageState::CREATED_AND_MEMORY_BOUND;
+				cachedImageRecord->lastUsedFrameIndex = currentFrameIndex; // there was an eviction + auto-submit, we need to update AGAIN
+				cachedImageRecord->allocationOffset = allocResults.allocationOffset;
+				cachedImageRecord->allocationSize = allocResults.allocationSize;
+				cachedImageRecord->gpuImageView = allocResults.gpuImageView;
+				cachedImageRecord->staticCPUImage = nullptr;
 			}
 			else
 			{
@@ -553,37 +542,37 @@ uint32_t DrawResourcesFiller::retrieveGeoreferencedImage_AllocateIfNeeded(image_
 				// TODO: Log a warning or error here � `addStaticImage2D` failed, likely due to low VRAM.
 				_NBL_DEBUG_BREAK_IF(true);
 
-				if (cachedImageReference->allocationOffset != ImagesMemorySubAllocator::InvalidAddress)
+				if (cachedImageRecord->allocationOffset != ImagesMemorySubAllocator::InvalidAddress)
 				{
 					// We previously successfully create and allocated memory for the Image
 					// but failed to bind and create image view
 					// It's crucial to deallocate the offset+size form our images memory suballocator
-					imagesMemorySubAllocator->deallocate(cachedImageReference->allocationOffset, cachedImageReference->allocationSize);
+					imagesMemorySubAllocator->deallocate(cachedImageRecord->allocationOffset, cachedImageRecord->allocationSize);
 				}
 
-				if (cachedImageReference->arrayIndex != InvalidTextureIndex)
+				if (cachedImageRecord->arrayIndex != InvalidTextureIndex)
 				{
 					// We previously allocated a descriptor index, but failed to create a usable GPU image.
 					// It's crucial to deallocate this index to avoid leaks and preserve descriptor pool space.
 					// No semaphore wait needed here, as the GPU never got to use this slot.
-					suballocatedDescriptorSet->multi_deallocate(imagesArrayBinding, 1u, &cachedImageReference->arrayIndex, {});
-					cachedImageReference->arrayIndex = InvalidTextureIndex;
+					suballocatedDescriptorSet->multi_deallocate(imagesArrayBinding, 1u, &cachedImageRecord->arrayIndex, {});
+					cachedImageRecord->arrayIndex = InvalidTextureIndex;
 				}
 			}
 		}
 		else
 		{
 			// TODO: log here, index allocation failed.
-			cachedImageReference->arrayIndex = InvalidTextureIndex;
+			cachedImageRecord->arrayIndex = InvalidTextureIndex;
 		}
 	}
 
-	assert(cachedImageReference->arrayIndex != InvalidTextureIndex); // shouldn't happen, because we're using LRU cache, so worst case eviction will happen + multi-deallocate and next next multi_allocate should definitely succeed
 	
 	// cached or just inserted, we update the lastUsedFrameIndex
-	cachedImageReference->lastUsedFrameIndex = currentFrameIndex;
+	cachedImageRecord->lastUsedFrameIndex = currentFrameIndex;
 
-	return cachedImageReference->arrayIndex;
+	assert(cachedImageRecord->arrayIndex != InvalidTextureIndex); // shouldn't happen, because we're using LRU cache, so worst case eviction will happen + multi-deallocate and next next multi_allocate should definitely succeed
+	return (cachedImageRecord->arrayIndex != InvalidTextureIndex);
 }
 
 // TODO[Przemek]: similar to other drawXXX and drawXXX_internal functions that create mainobjects, drawObjects and push additional info in geometry buffer, input to function would be a GridDTMInfo
@@ -612,7 +601,7 @@ void DrawResourcesFiller::drawGridDTM(
 
 void DrawResourcesFiller::addImageObject(image_id imageID, const OrientedBoundingBox2D& obb, SIntendedSubmitInfo& intendedNextSubmit)
 {
-	beginMainObject(MainObjectType::IMAGE);
+	beginMainObject(MainObjectType::STATIC_IMAGE);
 
 	uint32_t mainObjIdx = acquireActiveMainObjectIndex_SubmitIfNeeded(intendedNextSubmit);
 
@@ -632,12 +621,34 @@ void DrawResourcesFiller::addImageObject(image_id imageID, const OrientedBoundin
 	endMainObject();
 }
 
+void DrawResourcesFiller::addGeoreferencedImage(image_id imageID, const GeoreferencedImageParams& params, SIntendedSubmitInfo& intendedNextSubmit)
+{
+	beginMainObject(MainObjectType::STATIC_IMAGE);
+
+	uint32_t mainObjIdx = acquireActiveMainObjectIndex_SubmitIfNeeded(intendedNextSubmit);
+
+	GeoreferencedImageInfo info = {};
+	info.topLeft = params.worldspaceOBB.topLeft;
+	info.dirU = params.worldspaceOBB.dirU;
+	info.aspectRatio = params.worldspaceOBB.aspectRatio;
+	info.textureID = getImageIndexFromID(imageID, intendedNextSubmit); // for this to be valid and safe, this function needs to be called immediately after `addStaticImage` function to make sure image is in memory
+	if (!addGeoreferencedImageInfo_Internal(info, mainObjIdx))
+	{
+		// single image object couldn't fit into memory to push to gpu, so we submit rendering current objects and reset geometry buffer and draw objects
+		submitCurrentDrawObjectsAndReset(intendedNextSubmit, mainObjIdx);
+		bool success = addGeoreferencedImageInfo_Internal(info, mainObjIdx);
+		assert(success); // this should always be true, otherwise it's either bug in code or not enough memory allocated to hold a single GeoreferencedImageInfo 
+	}
+
+	endMainObject();
+}
+
 bool DrawResourcesFiller::pushAllUploads(SIntendedSubmitInfo& intendedNextSubmit)
 {
 	if (!intendedNextSubmit.valid())
 	{
 		// It is a caching submit without command buffer, just for the purpose of accumulation of staging resources
-		// In that case we don't push any uploads (i.e. we don't record any stagedStaticImage commmand in active command buffer, because there is no active command buffer)
+		// In that case we don't push any uploads (i.e. we don't record any imageRecord commmand in active command buffer, because there is no active command buffer)
 		return false;
 	}
 
@@ -648,36 +659,45 @@ bool DrawResourcesFiller::pushAllUploads(SIntendedSubmitInfo& intendedNextSubmit
 		success &= pushBufferUploads(intendedNextSubmit, currentReplayCache->resourcesCollection);
 		success &= pushMSDFImagesUploads(intendedNextSubmit, currentReplayCache->msdfImagesState);
 
-		// Push Static Images Uploads from replay cache, only those who are not gpu resident
+		// Push Static Images Uploads from replay cache, all the work below is necessary to detect whether our image to replay is already in the cache in the exact form OR we need to create new image + bind memory and set array index
 		auto* device = m_utilities->getLogicalDevice();
-		std::vector<StaticImageCopy> staticImageCopies;
-		for (auto& [id, replayImageState] : currentReplayCache->staticImagesState)
+		bool replayCacheFullyCovered = true;
+		for (auto& [imageID, toReplayRecord] : *currentReplayCache->imagesCache)
 		{
-			auto it = staticImagesState.find(id);
+			// TODO: remove temoprary const_cast workaround.
+			CachedImageRecord& toReplayImageRecord_nonConst = const_cast<CachedImageRecord&>(toReplayRecord);
+
+			if (toReplayRecord.type != ImageType::STATIC) // non-static images (Georeferenced) won't be replayed like this
+				continue;
+
+			auto* cachedRecord = imagesCache->peek(imageID);
 			bool alreadyResident = false;
 
 			// compare with existing state, and check whether image id is already resident.
-			if (it != staticImagesState.end())
+			if (cachedRecord != nullptr)
 			{
-				const StaticImageState& existingState = it->second;
-
 				const bool allocationMatches =
-					existingState.allocationOffset == replayImageState.allocationOffset &&
-					existingState.allocationSize == replayImageState.allocationSize;
+					cachedRecord->allocationOffset == toReplayRecord.allocationOffset &&
+					cachedRecord->allocationSize == toReplayRecord.allocationSize;
 
-				const bool arrayIndexMatches = existingState.arrayIndex == replayImageState.arrayIndex;
+				const bool arrayIndexMatches = cachedRecord->arrayIndex == toReplayRecord.arrayIndex;
 
-				alreadyResident = allocationMatches && arrayIndexMatches && existingState.gpuResident;
+				alreadyResident = allocationMatches && arrayIndexMatches && cachedRecord->state == ImageState::GPU_RESIDENT_WITH_VALID_STATIC_DATA;
 			}
 
-			// if already resident, we don't need to do anything
+			// if already resident, just update the state to the cached state (to make sure it doesn't get issued for upload again) and move on.
 			if (alreadyResident)
+			{
+				toReplayImageRecord_nonConst.state = cachedRecord->state; // update the toReplayImageRecords's state, to completely match the currently resident state
 				continue;
+			}
+
+			replayCacheFullyCovered = false;
 
 			bool successCreateNewImage = false;
 
 			// Not already resident, we need to recreate the image and bind the image memory to correct location again, and update the descriptor set and push the uploads
-			auto existingGPUImageViewParams = replayImageState.gpuImageView->getCreationParameters();
+			auto existingGPUImageViewParams = toReplayRecord.gpuImageView->getCreationParameters();
 			IGPUImage::SCreationParams imageParams = {};
 			imageParams = existingGPUImageViewParams.image->getCreationParameters();
 
@@ -687,13 +707,13 @@ bool DrawResourcesFiller::pushAllUploads(SIntendedSubmitInfo& intendedNextSubmit
 				nbl::video::ILogicalDevice::SBindImageMemoryInfo bindImageMemoryInfo =
 				{
 					.image = newGPUImage.get(),
-					.binding = {.memory = imagesMemoryArena.memory.get(), .offset = imagesMemoryArena.offset + replayImageState.allocationOffset }
+					.binding = {.memory = imagesMemoryArena.memory.get(), .offset = imagesMemoryArena.offset + toReplayRecord.allocationOffset }
 				};
 
 				const bool boundToMemorySuccessfully = device->bindImageMemory({ &bindImageMemoryInfo, 1u });
 				if (boundToMemorySuccessfully)
 				{
-					newGPUImage->setObjectDebugName((std::to_string(id) + " Static Image 2D").c_str());
+					newGPUImage->setObjectDebugName((std::to_string(imageID) + " Static Image 2D").c_str());
 					IGPUImageView::SCreationParams viewParams = existingGPUImageViewParams;
 					viewParams.image = newGPUImage;
 
@@ -701,14 +721,9 @@ bool DrawResourcesFiller::pushAllUploads(SIntendedSubmitInfo& intendedNextSubmit
 					if (newGPUImageView)
 					{
 						successCreateNewImage = true;
-						
-						staticImageCopies.push_back(StaticImageCopy {
-							.cpuImage = replayImageState.cpuImage,
-							.gpuImageView = newGPUImageView,
-							.arrayIndex = replayImageState.arrayIndex
-							});
-
-						newGPUImageView->setObjectDebugName((std::to_string(id) + " Static Image View 2D").c_str());
+						toReplayImageRecord_nonConst.gpuImageView = newGPUImageView;
+						toReplayImageRecord_nonConst.state = ImageState::CREATED_AND_MEMORY_BOUND;
+						newGPUImageView->setObjectDebugName((std::to_string(imageID) + " Static Image View 2D").c_str());
 					}
 
 				}
@@ -721,12 +736,15 @@ bool DrawResourcesFiller::pushAllUploads(SIntendedSubmitInfo& intendedNextSubmit
 				success = false;
 			}
 		}
-
-		bool replayStaticUploadSuccess = true;
 		
-		if (staticImageCopies.size() > 0u)
+		// Our actual `imageCache` (which represents GPU state) didn't cover the replayCache fully, so new images had to be created, bound to memory. and they need to be written into their respective descriptor array indices again.
+		imagesCache->clear();
+		for (auto it = currentReplayCache->imagesCache->crbegin(); it != currentReplayCache->imagesCache->crend(); it++)
+			imagesCache->base_t::insert(it->first, it->second);
+
+		if (!replayCacheFullyCovered)
 		{
-			// We need to block for previous submit in order to safely, rebind image's memory and update the descriptor set array index.
+			// We need to block for previous submit in order to safely update the descriptor set array index next.
 			// 
 			// [FUTURE_CONSIDERATION]: To avoid stalling the CPU when replaying caches that overflow GPU memory,
 			// we could recreate the image and image view, binding them to entirely new memory locations.
@@ -734,38 +752,18 @@ bool DrawResourcesFiller::pushAllUploads(SIntendedSubmitInfo& intendedNextSubmit
 			// Note: This isn't a problem if the replayed scene fits in memory and doesn't require overflow submissions due to image memory exhaustion.
 			nbl::video::ISemaphore::SWaitInfo waitInfo = { .semaphore = intendedNextSubmit.scratchSemaphore.semaphore, .value = intendedNextSubmit.scratchSemaphore.value };
 			device->blockForSemaphores({ &waitInfo, 1u });
-			replayStaticUploadSuccess = pushStaticImagesUploads_Internal(intendedNextSubmit, staticImageCopies);
-		}
-
-		if (replayStaticUploadSuccess)
-		{
-			staticImagesState = currentReplayCache->staticImagesState;
-			for (auto& [_, state] : staticImagesState)
-				state.gpuResident = true;
 		}
 
-		success &= replayStaticUploadSuccess;
+		success &= bindImagesToArrayIndices(*imagesCache);
+		success &= pushStaticImagesUploads(intendedNextSubmit, *imagesCache);
 	}
 	else
 	{
 		flushDrawObjects();
 		success &= pushBufferUploads(intendedNextSubmit, resourcesCollection);
 		success &= pushMSDFImagesUploads(intendedNextSubmit, msdfImagesState);
-
-		// Push Static Images Uploads, only those who are not gpu resident
-		std::vector<StaticImageCopy> staticImageCopies;
-		for (auto& [id, staticImageState] : staticImagesState)
-		{
-			if (!staticImageState.gpuResident)
-				staticImageCopies.push_back(StaticImageCopy{ .cpuImage = staticImageState.cpuImage, .gpuImageView = staticImageState.gpuImageView, .arrayIndex = staticImageState.arrayIndex });
-		}
-		const bool staticImagesUploadSuccess = pushStaticImagesUploads_Internal(intendedNextSubmit, staticImageCopies);
-		if (staticImagesUploadSuccess)
-		{
-			for (auto& [id, staticImageState] : staticImagesState)
-				staticImageState.gpuResident = true;
-		}
-		success &= staticImagesUploadSuccess;
+		success &= bindImagesToArrayIndices(*imagesCache);
+		success &= pushStaticImagesUploads(intendedNextSubmit, *imagesCache);
 	}
 	return success;
 }
@@ -860,7 +858,12 @@ std::unique_ptr<DrawResourcesFiller::ReplayCache> DrawResourcesFiller::createRep
 		stagedMSDF.uploadedToGPU = false; // to trigger upload for all msdf functions again.
 	ret->drawCallsData = drawCalls;
 	ret->activeMainObjectIndex = activeMainObjectIndex;
-	ret->staticImagesState = staticImagesState; // copy state of static images
+	ret->imagesCache = std::unique_ptr<ImagesCache>(new ImagesCache(imagesCache->size()));
+	// It should be copyable, here is a temporary hack:
+	for (auto it = imagesCache->crbegin(); it != imagesCache->crend(); it++)
+	{
+		ret->imagesCache->base_t::insert(it->first, it->second);
+	}
 	return ret;
 }
 
@@ -940,7 +943,7 @@ bool DrawResourcesFiller::pushMSDFImagesUploads(SIntendedSubmitInfo& intendedNex
 
 		auto msdfImage = msdfTextureArray->getCreationParameters().image;
 
-		// preparing msdfs for stagedStaticImage
+		// preparing msdfs for imageRecord
 		using image_barrier_t = IGPUCommandBuffer::SPipelineBarrierDependencyInfo::image_barrier_t;
 		image_barrier_t beforeTransferImageBarrier[] =
 		{
@@ -1056,51 +1059,79 @@ bool DrawResourcesFiller::pushMSDFImagesUploads(SIntendedSubmitInfo& intendedNex
 	}
 }
 
-bool DrawResourcesFiller::pushStaticImagesUploads_Internal(SIntendedSubmitInfo& intendedNextSubmit, std::span<StaticImageCopy> staticImagesCopy)
+bool DrawResourcesFiller::bindImagesToArrayIndices(ImagesCache& imagesCache)
+{
+	bool success = true;
+	
+	auto* device = m_utilities->getLogicalDevice();
+	auto* descriptorSet = suballocatedDescriptorSet->getDescriptorSet();
+
+	// DescriptorSet Updates
+	std::vector<video::IGPUDescriptorSet::SDescriptorInfo> descriptorInfos;
+	std::vector<IGPUDescriptorSet::SWriteDescriptorSet> descriptorWrites;
+	descriptorInfos.resize(imagesCache.size());
+	descriptorWrites.resize(imagesCache.size());
+
+	uint32_t descriptorWriteCount = 0u;
+	for (auto& [id, record] : imagesCache)
+	{
+		if (record.state >= ImageState::BOUND_TO_DESCRIPTOR_SET || !record.gpuImageView)
+			continue;
+
+		// Bind gpu image view to descriptor set
+		video::IGPUDescriptorSet::SDescriptorInfo descriptorInfo = {};
+		descriptorInfo.info.image.imageLayout = IImage::LAYOUT::READ_ONLY_OPTIMAL;
+		descriptorInfo.desc = record.gpuImageView;
+		descriptorInfos[descriptorWriteCount] = descriptorInfo;
+
+		// consider batching contiguous writes, if descriptor set updating was a hotspot
+		IGPUDescriptorSet::SWriteDescriptorSet descriptorWrite = {};
+		descriptorWrite.dstSet = descriptorSet;
+		descriptorWrite.binding = imagesArrayBinding;
+		descriptorWrite.arrayElement = record.arrayIndex;
+		descriptorWrite.count = 1u;
+		descriptorWrite.info = &descriptorInfos[descriptorWriteCount];
+		descriptorWrites[descriptorWriteCount] = descriptorWrite;
+
+		const_cast<CachedImageRecord&>(record).state = ImageState::BOUND_TO_DESCRIPTOR_SET;
+		descriptorWriteCount++;
+	}
+
+	if (descriptorWriteCount > 0u)
+		success &= device->updateDescriptorSets(descriptorWriteCount, descriptorWrites.data(), 0u, nullptr);
+	return success;
+}
+
+bool DrawResourcesFiller::pushStaticImagesUploads(SIntendedSubmitInfo& intendedNextSubmit, ImagesCache& imagesCache)
 {
 	bool success = true;
 
-	if (staticImagesCopy.size() > 0ull)
+	// Push Static Images Uploads, only those who are not gpu resident
+	// TODO: remove this vector and check state in each for loop below?
+	std::vector<CachedImageRecord*> nonResidentImageRecords;
+	for (auto& [id, record] : imagesCache)
+	{
+		if (record.staticCPUImage && record.type == ImageType::STATIC && record.state < ImageState::GPU_RESIDENT_WITH_VALID_STATIC_DATA)
+			nonResidentImageRecords.push_back(const_cast<CachedImageRecord*>(&record)); // TODO: remove const_cast
+	}
+
+	if (nonResidentImageRecords.size() > 0ull)
 	{
 		auto* device = m_utilities->getLogicalDevice();
-		auto* physDev = m_utilities->getLogicalDevice()->getPhysicalDevice();
-		auto* descriptorSet = suballocatedDescriptorSet->getDescriptorSet();
 		auto* cmdBuffInfo = intendedNextSubmit.getCommandBufferForRecording();
 	
 		if (cmdBuffInfo)
 		{
 			IGPUCommandBuffer* commandBuffer = cmdBuffInfo->cmdbuf;
 
-			// DescriptorSet Updates
-			std::vector<video::IGPUDescriptorSet::SDescriptorInfo> descriptorInfos;
-			std::vector<IGPUDescriptorSet::SWriteDescriptorSet> descriptorWrites;
-			descriptorInfos.resize(staticImagesCopy.size());
-			descriptorWrites.resize(staticImagesCopy.size());
-			for (uint32_t i = 0u; i < staticImagesCopy.size(); ++i)
-			{
-				auto& stagedStaticImage = staticImagesCopy[i];
-				// Bind gpu image view to descriptor set
-				descriptorInfos[i].info.image.imageLayout = IImage::LAYOUT::READ_ONLY_OPTIMAL;
-				descriptorInfos[i].desc = stagedStaticImage.gpuImageView;
-
-				// consider batching contiguous writes, if descriptor set updating was a hotspot
-				descriptorWrites[i].dstSet = descriptorSet;
-				descriptorWrites[i].binding = imagesArrayBinding;
-				descriptorWrites[i].arrayElement = stagedStaticImage.arrayIndex;
-				descriptorWrites[i].count = 1u;
-				descriptorWrites[i].info = &descriptorInfos[i];
-			}
-
-			success &= device->updateDescriptorSets(descriptorWrites.size(), descriptorWrites.data(), 0u, nullptr);
-
 			std::vector<IGPUCommandBuffer::SPipelineBarrierDependencyInfo::image_barrier_t> beforeCopyImageBarriers;
-			beforeCopyImageBarriers.resize(staticImagesCopy.size());
+			beforeCopyImageBarriers.resize(nonResidentImageRecords.size());
 
-			// Pipeline Barriers before stagedStaticImage
-			for (uint32_t i = 0u; i < staticImagesCopy.size(); ++i)
+			// Pipeline Barriers before imageRecord
+			for (uint32_t i = 0u; i < nonResidentImageRecords.size(); ++i)
 			{
-				auto& stagedStaticImage = staticImagesCopy[i];
-				const auto& gpuImg = stagedStaticImage.gpuImageView->getCreationParameters().image;
+				auto& imageRecord = *nonResidentImageRecords[i];
+				const auto& gpuImg = imageRecord.gpuImageView->getCreationParameters().image;
 				beforeCopyImageBarriers[i] =
 				{
 					.barrier = {
@@ -1126,27 +1157,34 @@ bool DrawResourcesFiller::pushStaticImagesUploads_Internal(SIntendedSubmitInfo&
 			}
 			success &= commandBuffer->pipelineBarrier(E_DEPENDENCY_FLAGS::EDF_NONE, { .imgBarriers = beforeCopyImageBarriers });
 
-			for (uint32_t i = 0u; i < staticImagesCopy.size(); ++i)
+			for (uint32_t i = 0u; i < nonResidentImageRecords.size(); ++i)
 			{
-				auto& stagedStaticImage = staticImagesCopy[i];
-				auto& gpuImg = stagedStaticImage.gpuImageView->getCreationParameters().image;
+				auto& imageRecord = *nonResidentImageRecords[i];
+				auto& gpuImg = imageRecord.gpuImageView->getCreationParameters().image;
 				success &= m_utilities->updateImageViaStagingBuffer(
 					intendedNextSubmit,
-					stagedStaticImage.cpuImage->getBuffer()->getPointer(), stagedStaticImage.cpuImage->getCreationParameters().format,
+					imageRecord.staticCPUImage->getBuffer()->getPointer(), imageRecord.staticCPUImage->getCreationParameters().format,
 					gpuImg.get(), IImage::LAYOUT::TRANSFER_DST_OPTIMAL,
-					stagedStaticImage.cpuImage->getRegions());
+					imageRecord.staticCPUImage->getRegions());
+
+				if (success)
+					imageRecord.state = ImageState::GPU_RESIDENT_WITH_VALID_STATIC_DATA;
+				else
+				{
+					// TODO: LOG
+				}
 			}
 
 			commandBuffer = intendedNextSubmit.getCommandBufferForRecording()->cmdbuf; // overflow-submit in utilities calls might've cause current recording command buffer to change
 
 			std::vector<IGPUCommandBuffer::SPipelineBarrierDependencyInfo::image_barrier_t> afterCopyImageBarriers;
-			afterCopyImageBarriers.resize(staticImagesCopy.size());
+			afterCopyImageBarriers.resize(nonResidentImageRecords.size());
 
-			// Pipeline Barriers before stagedStaticImage
-			for (uint32_t i = 0u; i < staticImagesCopy.size(); ++i)
+			// Pipeline Barriers before imageRecord
+			for (uint32_t i = 0u; i < nonResidentImageRecords.size(); ++i)
 			{
-				auto& stagedStaticImage = staticImagesCopy[i];
-				const auto& gpuImg = stagedStaticImage.gpuImageView->getCreationParameters().image;
+				auto& imageRecord = *nonResidentImageRecords[i];
+				const auto& gpuImg = imageRecord.gpuImageView->getCreationParameters().image;
 				afterCopyImageBarriers[i] =
 				{
 					.barrier = {
@@ -1760,7 +1798,44 @@ bool DrawResourcesFiller::addImageObject_Internal(const ImageObjectInfo& imageOb
 	DrawObject* drawObjectsToBeFilled = resourcesCollection.drawObjects.increaseCountAndGetPtr(1u);
 	DrawObject drawObj = {};
 	drawObj.mainObjIndex = mainObjIdx;
-	drawObj.type_subsectionIdx = uint32_t(static_cast<uint16_t>(ObjectType::IMAGE) | (0 << 16)); // TODO: use custom pack/unpack function
+	drawObj.type_subsectionIdx = uint32_t(static_cast<uint16_t>(ObjectType::STATIC_IMAGE) | (0 << 16)); // TODO: use custom pack/unpack function
+	drawObj.geometryAddress = geometryBufferOffset;
+	drawObjectsToBeFilled[0u] = drawObj;
+
+	return true;
+}
+
+bool DrawResourcesFiller::addGeoreferencedImageInfo_Internal(const GeoreferencedImageInfo& georeferencedImageInfo, uint32_t mainObjIdx)
+{
+	const size_t remainingResourcesSize = calculateRemainingResourcesSize();
+
+	const uint32_t uploadableObjects = (remainingResourcesSize) / (sizeof(GeoreferencedImageInfo) + sizeof(DrawObject) + sizeof(uint32_t) * 6u);
+	// TODO[ERFAN]: later take into account: our maximum indexable vertex 
+
+	if (uploadableObjects <= 0u)
+		return false;
+
+	// Add Geometry
+	size_t geometryBufferOffset = resourcesCollection.geometryInfo.increaseSizeAndGetOffset(sizeof(GeoreferencedImageInfo), alignof(GeoreferencedImageInfo));
+	void* dst = resourcesCollection.geometryInfo.data() + geometryBufferOffset;
+	memcpy(dst, &georeferencedImageInfo, sizeof(GeoreferencedImageInfo));
+
+	// Push Indices, remove later when compute fills this
+	uint32_t* indexBufferToBeFilled = resourcesCollection.indexBuffer.increaseCountAndGetPtr(6u * 1u);
+	const uint32_t startObj = resourcesCollection.drawObjects.getCount();
+	uint32_t i = 0u;
+	indexBufferToBeFilled[i * 6] = (startObj + i) * 4u + 1u;
+	indexBufferToBeFilled[i * 6 + 1u] = (startObj + i) * 4u + 0u;
+	indexBufferToBeFilled[i * 6 + 2u] = (startObj + i) * 4u + 2u;
+	indexBufferToBeFilled[i * 6 + 3u] = (startObj + i) * 4u + 1u;
+	indexBufferToBeFilled[i * 6 + 4u] = (startObj + i) * 4u + 2u;
+	indexBufferToBeFilled[i * 6 + 5u] = (startObj + i) * 4u + 3u;
+
+	// Add DrawObjs
+	DrawObject* drawObjectsToBeFilled = resourcesCollection.drawObjects.increaseCountAndGetPtr(1u);
+	DrawObject drawObj = {};
+	drawObj.mainObjIndex = mainObjIdx;
+	drawObj.type_subsectionIdx = uint32_t(static_cast<uint16_t>(ObjectType::STREAMED_IMAGE) | (0 << 16)); // TODO: use custom pack/unpack function
 	drawObj.geometryAddress = geometryBufferOffset;
 	drawObjectsToBeFilled[0u] = drawObj;
 
@@ -1770,7 +1845,7 @@ bool DrawResourcesFiller::addImageObject_Internal(const ImageObjectInfo& imageOb
 uint32_t DrawResourcesFiller::getImageIndexFromID(image_id imageID, const SIntendedSubmitInfo& intendedNextSubmit)
 {
 	uint32_t textureIdx = InvalidTextureIndex;
-	ImageReference* imageRef = imagesUsageCache->get(imageID);
+	CachedImageRecord* imageRef = imagesCache->get(imageID);
 	if (imageRef)
 	{
 		textureIdx = imageRef->arrayIndex;
@@ -1779,7 +1854,7 @@ uint32_t DrawResourcesFiller::getImageIndexFromID(image_id imageID, const SInten
 	return textureIdx;
 }
 
-void DrawResourcesFiller::evictImage_SubmitIfNeeded(image_id imageID, const ImageReference& evicted, SIntendedSubmitInfo& intendedNextSubmit)
+void DrawResourcesFiller::evictImage_SubmitIfNeeded(image_id imageID, const CachedImageRecord& evicted, SIntendedSubmitInfo& intendedNextSubmit)
 {
 	if (evicted.arrayIndex == InvalidTextureIndex)
 	{
@@ -1816,11 +1891,6 @@ void DrawResourcesFiller::evictImage_SubmitIfNeeded(image_id imageID, const Imag
 		ISemaphore::SWaitInfo deallocationWaitInfo = { .semaphore = intendedNextSubmit.scratchSemaphore.semaphore, .value = intendedNextSubmit.scratchSemaphore.value };
 		suballocatedDescriptorSet->multi_deallocate(imagesArrayBinding, 1u, &evicted.arrayIndex, deallocationWaitInfo, &cleanupObject.get());
 	}
-
-	// erase imageID from our state map
-	// kindof mirrors the state of the LRUCache for static images
-	if (evicted.imageType == ImageType::STATIC)
-		staticImagesState.erase(imageID);
 }
 
 DrawResourcesFiller::ImageAllocateResults  DrawResourcesFiller::tryCreateAndAllocateImage_SubmitIfNeeded(const nbl::asset::IImage::SCreationParams& imageParams, nbl::video::SIntendedSubmitInfo& intendedNextSubmit, std::string imageDebugName)
@@ -1832,8 +1902,8 @@ DrawResourcesFiller::ImageAllocateResults  DrawResourcesFiller::tryCreateAndAllo
 
 	// Attempt to create a GPU image and corresponding image view for this texture.
 	// If creation or memory allocation fails (likely due to VRAM exhaustion),
-	// we'll evict another texture from the LRU cache and retry until successful, or until only the currently-cachedImageReference image remains.
-	while (imagesUsageCache->size() > 0u)
+	// we'll evict another texture from the LRU cache and retry until successful, or until only the currently-cachedImageRecord image remains.
+	while (imagesCache->size() > 0u)
 	{
 		// Try creating the image and allocating memory for it:
 		nbl::video::IGPUImage::SCreationParams params = {};
@@ -1917,7 +1987,7 @@ DrawResourcesFiller::ImageAllocateResults  DrawResourcesFiller::tryCreateAndAllo
 		}
 
 		// Getting here means we failed creating or allocating the image, evict and retry.
-		if (imagesUsageCache->size() == 1u)
+		if (imagesCache->size() == 1u)
 		{
 			// Nothing else to evict; give up.
 			// We probably have evicted almost every other texture except the one we just allocated an index for
@@ -1925,13 +1995,13 @@ DrawResourcesFiller::ImageAllocateResults  DrawResourcesFiller::tryCreateAndAllo
 			break;
 		}
 
-		assert(imagesUsageCache->size() > 1u);
+		assert(imagesCache->size() > 1u);
 
-		const image_id evictionCandidate = imagesUsageCache->select_eviction_candidate();
-		ImageReference* imageRef = imagesUsageCache->peek(evictionCandidate);
+		const image_id evictionCandidate = imagesCache->select_eviction_candidate();
+		CachedImageRecord* imageRef = imagesCache->peek(evictionCandidate);
 		if (imageRef)
 			evictImage_SubmitIfNeeded(evictionCandidate, *imageRef, intendedNextSubmit);
-		imagesUsageCache->erase(evictionCandidate);
+		imagesCache->erase(evictionCandidate);
 		while (suballocatedDescriptorSet->cull_frees()) {}; // to make sure deallocation requests in eviction callback are blocked for.
 
 		// we don't hold any references to the GPUImageView or GPUImage so descriptor binding will be the last reference
@@ -2062,7 +2132,7 @@ uint32_t DrawResourcesFiller::addMSDFTexture(const MSDFInputInfo& msdfInput, cor
 	
 	inserted->lastUsedFrameIndex = currentFrameIndex; // in case there was an eviction + auto-submit, we need to update AGAIN
 
-	// if cachedImageReference->alloc_idx was not InvalidTextureIndex then it means we had a cache hit and updated the value of our sema, in which case we don't queue anything for upload, and return the idx
+	// if cachedImageRecord->alloc_idx was not InvalidTextureIndex then it means we had a cache hit and updated the value of our sema, in which case we don't queue anything for upload, and return the idx
 	if (inserted->alloc_idx == InvalidTextureIndex)
 	{
 		// New insertion == cache miss happened and insertion was successfull
diff --git a/62_CAD/DrawResourcesFiller.h b/62_CAD/DrawResourcesFiller.h
index ae071654a..4faa3fecc 100644
--- a/62_CAD/DrawResourcesFiller.h
+++ b/62_CAD/DrawResourcesFiller.h
@@ -221,7 +221,7 @@ struct DrawResourcesFiller
 	 * This function ensures that a given image is available as a GPU-resident texture for future draw submissions.
 	 * It uses an LRU cache to manage descriptor set slots and evicts old images if necessary to make room for new ones.
 	 *
-	 * If the image is already cached and its slot is valid, it returns the slot index directly.
+	 * If the image is already cached and its slot is valid, it returns true;
 	 * Otherwise, it performs the following:
 	 *   - Allocates a new descriptor set slot.
 	 *   - Promotes the image format to be GPU-compatible.
@@ -233,23 +233,43 @@ struct DrawResourcesFiller
 	 * @param cpuImage             The CPU-side image resource to (possibly) upload.
 	 * @param intendedNextSubmit   Struct representing the upcoming submission, including a semaphore for safe scheduling.
 	 *
-	 * @return The index (slot) into the descriptor set array where the image is or will be bound.
-	 *         Returns `InvalidTextureIndex` only if all fallback and eviction attempts failed.
-	 *
 	 * @note This function ensures that the descriptor slot is not reused while the GPU may still be reading from it.
 	 *       If an eviction is required and the evicted image is scheduled to be used in the next submit, it triggers
 	 *       a flush of pending draws to preserve correctness.
 	 *
 	 * @note The function uses the `imagesUsageCache` LRU cache to track usage and validity of texture slots.
 	 *       If an insertion leads to an eviction, a callback ensures proper deallocation and synchronization.
+	 * @return true if the image was successfully cached and is ready for use; false if allocation failed.
 	*/
-	uint32_t addStaticImage2D(image_id imageID, const core::smart_refctd_ptr<ICPUImage>& cpuImage, SIntendedSubmitInfo& intendedNextSubmit);
+	bool ensureStaticImageAvailability(image_id imageID, const core::smart_refctd_ptr<ICPUImage>& cpuImage, SIntendedSubmitInfo& intendedNextSubmit);
 
-	uint32_t retrieveGeoreferencedImage_AllocateIfNeeded(image_id imageID, const GeoreferencedImageParams& params, SIntendedSubmitInfo& intendedNextSubmit);
+	/**
+	 * @brief Ensures a GPU-resident georeferenced image exists in the cache, allocating resources if necessary.
+	 * 
+	 * If the specified image ID is not already present in the cache, or if the cached version is incompatible
+	 * with the requested parameters (e.g. extent, format, or type), this function allocates GPU memory,
+	 * creates the image and its view, to be bound to a descriptor binding in the future.
+	 * 
+	 * If the image already exists and matches the requested parameters, its usage metadata is updated.
+	 * In either case, the cache is updated to reflect usage in the current frame.
+	 * 
+	 * This function also handles automatic eviction of old images via an LRU policy when space is limited.
+	 * 
+	 * @param imageID                Unique identifier of the image to add or reuse.
+	 * @param params                 Georeferenced Image Params
+	 * @param intendedNextSubmit     Submit info object used to track resources pending GPU submission.
+	 * 
+	 * @return true if the image was successfully cached and is ready for use; false if allocation failed.
+	 * [TODO]: should be internal protected member function.
+	 */
+	bool ensureGeoreferencedImageAvailability_AllocateIfNeeded(image_id imageID, const GeoreferencedImageParams& params, SIntendedSubmitInfo& intendedNextSubmit);
 
 	// This function must be called immediately after `addStaticImage` for the same imageID.
 	void addImageObject(image_id imageID, const OrientedBoundingBox2D& obb, SIntendedSubmitInfo& intendedNextSubmit);
 	
+	// This function must be called immediately after `addStaticImage` for the same imageID.
+	void addGeoreferencedImage(image_id imageID, const GeoreferencedImageParams& params, SIntendedSubmitInfo& intendedNextSubmit);
+
 	/// @brief call this function before submitting to ensure all buffer and textures resourcesCollection requested via drawing calls are copied to GPU
 	/// records copy command into intendedNextSubmit's active command buffer and might possibly submits if fails allocation on staging upload memory.
 	bool pushAllUploads(SIntendedSubmitInfo& intendedNextSubmit);
@@ -357,7 +377,7 @@ struct DrawResourcesFiller
 		std::vector<DrawCallData> drawCallsData;
 		ResourcesCollection resourcesCollection;
 		std::vector<MSDFImageState> msdfImagesState;
-		std::unordered_map<image_id, StaticImageState> staticImagesState;
+		std::unique_ptr<ImagesCache> imagesCache;
 		uint32_t activeMainObjectIndex = InvalidMainObjectIdx;
 		// TODO: non msdf general CPU Images
 		// TODO: Get total memory consumption for logging?
@@ -394,8 +414,11 @@ struct DrawResourcesFiller
 	/// @brief Records GPU copy commands for all staged msdf images into the active command buffer.
 	bool pushMSDFImagesUploads(SIntendedSubmitInfo& intendedNextSubmit, std::vector<MSDFImageState>& msdfImagesState);
 
-	/// @brief Records GPU copy commands for all staged msdf images into the active command buffer.
-	bool pushStaticImagesUploads_Internal(SIntendedSubmitInfo& intendedNextSubmit, std::span<StaticImageCopy> staticImagesCopy);
+	/// @brief binds cached images into their correct descriptor set slot if not already resident.
+	bool bindImagesToArrayIndices(ImagesCache& imagesCache);
+
+	/// @brief Records GPU copy commands for all staged images into the active command buffer, and binds them into correct descriptor set slot.
+	bool pushStaticImagesUploads(SIntendedSubmitInfo& intendedNextSubmit, ImagesCache& imagesCache);
 
 	const size_t calculateRemainingResourcesSize() const;
 
@@ -462,9 +485,12 @@ struct DrawResourcesFiller
 	
 	/// Attempts to upload a single GridDTMInfo considering resource limitations
 	bool addGridDTM_Internal(const GridDTMInfo& gridDTMInfo, uint32_t mainObjIdx);
-	/// Attempts to upload a single image object considering resource limitations (not accounting for the resource image added using addStaticImage2D function)
+	/// Attempts to upload a single image object considering resource limitations (not accounting for the resource image added using ensureStaticImageAvailability function)
 	bool addImageObject_Internal(const ImageObjectInfo& imageObjectInfo, uint32_t mainObjIdx);;
 	
+	/// Attempts to upload a georeferenced image info considering resource limitations (not accounting for the resource image added using ensureStaticImageAvailability function)
+	bool addGeoreferencedImageInfo_Internal(const GeoreferencedImageInfo& georeferencedImageInfo, uint32_t mainObjIdx);;
+	
 	uint32_t getImageIndexFromID(image_id imageID, const SIntendedSubmitInfo& intendedNextSubmit);
 
 	/**
@@ -483,7 +509,7 @@ struct DrawResourcesFiller
 	 *
 	 * @warning Deallocation may use a conservative semaphore wait value if exact usage information is unavailable. [future todo: fix] 
 	 */
-	void evictImage_SubmitIfNeeded(image_id imageID, const ImageReference& evicted, SIntendedSubmitInfo& intendedNextSubmit);
+	void evictImage_SubmitIfNeeded(image_id imageID, const CachedImageRecord& evicted, SIntendedSubmitInfo& intendedNextSubmit);
 	
 	struct ImageAllocateResults
 	{
@@ -707,12 +733,8 @@ struct DrawResourcesFiller
 	bool m_hasInitializedMSDFTextureArrays = false;
 	
 	// Images:
-	std::unique_ptr<ImagesUsageCache> imagesUsageCache;
+	std::unique_ptr<ImagesCache> imagesCache;
 	smart_refctd_ptr<SubAllocatedDescriptorSet> suballocatedDescriptorSet;
 	uint32_t imagesArrayBinding = 0u;
-	
-	// TODO: consider removing this and just using the `imagesUsageCache` and `ImageReference` when `core::ResizableLRUCache` is copyable and iterable
-	// Current state of the static images, used in `pushStaticImagesUploads` to make StaticImages `gpuResident` and bind them to correct array index
-	std::unordered_map<image_id, StaticImageState> staticImagesState;
 };
 
diff --git a/62_CAD/Images.h b/62_CAD/Images.h
index fe3e8bde9..d525a68f6 100644
--- a/62_CAD/Images.h
+++ b/62_CAD/Images.h
@@ -6,19 +6,29 @@ using namespace nbl::asset;
 
 using image_id = uint64_t; // Could later be templated or replaced with a stronger type or hash key.
 
+enum class ImageState : uint8_t
+{
+	INVALID = 0,
+	CREATED_AND_MEMORY_BOUND,             // GPU image created, not bound to descriptor set yet
+	BOUND_TO_DESCRIPTOR_SET,              // Bound to descriptor set, GPU resident, but may contain uninitialized or partial data
+	GPU_RESIDENT_WITH_VALID_STATIC_DATA,  // When data for static images gets issued for upload successfully
+};
+
 enum class ImageType : uint8_t
 {
 	INVALID = 0,
-    STATIC,                        // Regular non-georeferenced image, fully loaded once
-    GEOREFERENCED_STREAMED,            // Streamed image, resolution depends on camera/view
-    GEOREFERENCED_FULL_RESOLUTION      // For smaller georeferenced images, entire image is eventually loaded and not streamed or view-dependant
+	STATIC,                        // Regular non-georeferenced image, fully loaded once
+	GEOREFERENCED_STREAMED,            // Streamed image, resolution depends on camera/view
+	GEOREFERENCED_FULL_RESOLUTION      // For smaller georeferenced images, entire image is eventually loaded and not streamed or view-dependant
 };
 
 struct GeoreferencedImageParams
 {
-	uint32_t2 imageExtents;
-	uint32_t2 viewportExtents;
-	asset::E_FORMAT format;
+	OrientedBoundingBox2D worldspaceOBB = {};
+	uint32_t2 imageExtents = {};
+	uint32_t2 viewportExtents = {};
+	asset::E_FORMAT format = {};
+	// TODO: Need to add other stuff later.
 };
 
 /**
@@ -99,106 +109,89 @@ struct ImageCleanup : public core::IReferenceCounted
 
 };
 
-struct StaticImageCopy
-{
-	core::smart_refctd_ptr<ICPUImage> cpuImage;
-	core::smart_refctd_ptr<IGPUImageView> gpuImageView;
-	uint32_t arrayIndex;
-};
-
-// TODO: consider just using the ImagesUsageCache to store this StaticImagesState, i.e. merge this struct with the ImageReference
-//		it will be possible after LRUCache improvements and copyability
-//		for now this will be a mirror of the LRUCache but in an unordered_map
-struct StaticImageState
-{
-	core::smart_refctd_ptr<ICPUImage> cpuImage = nullptr;
-	core::smart_refctd_ptr<IGPUImageView> gpuImageView = nullptr;
-	uint64_t allocationOffset = ImagesMemorySubAllocator::InvalidAddress;
-	uint64_t allocationSize = 0u;
-	uint32_t arrayIndex = ~0u; // in texture array descriptor 
-	bool gpuResident = false;
-};
-
-
-struct ImageReference
+struct CachedImageRecord
 {
 	static constexpr uint32_t InvalidTextureIndex = nbl::hlsl::numeric_limits<uint32_t>::max;
 	
 	uint32_t arrayIndex = InvalidTextureIndex; // index in our array of textures binding
-	ImageType imageType = ImageType::INVALID;
-	bool gpuResident = false;
+	ImageType type = ImageType::INVALID;
+	ImageState state = ImageState::INVALID;
 	uint64_t lastUsedFrameIndex = 0ull; // last used semaphore value on this image
 	uint64_t allocationOffset = ImagesMemorySubAllocator::InvalidAddress;
 	uint64_t allocationSize = 0ull;
 	core::smart_refctd_ptr<IGPUImageView> gpuImageView = nullptr;
+	core::smart_refctd_ptr<ICPUImage> staticCPUImage = nullptr; // cached cpu image for uploading to gpuImageView when needed.
 	
 	// In LRU Cache `insert` function, in case of cache miss, we need to construct the refereence with semaphore value
-	ImageReference(uint64_t currentFrameIndex) 
+	CachedImageRecord(uint64_t currentFrameIndex) 
 		: arrayIndex(InvalidTextureIndex)
-		, imageType(ImageType::INVALID)
-		, gpuResident(false)
+		, type(ImageType::INVALID)
+		, state(ImageState::INVALID)
 		, lastUsedFrameIndex(currentFrameIndex)
 		, allocationOffset(ImagesMemorySubAllocator::InvalidAddress)
 		, allocationSize(0ull)
 		, gpuImageView(nullptr)
+		, staticCPUImage(nullptr)
 	{}
 	
-	ImageReference() 
-		: ImageReference(0ull)
+	CachedImageRecord() 
+		: CachedImageRecord(0ull)
 	{}
 
 	// In LRU Cache `insert` function, in case of cache hit, we need to assign semaphore value without changing `index`
-	inline ImageReference& operator=(uint64_t currentFrameIndex) { lastUsedFrameIndex = currentFrameIndex; return *this;  }
+	inline CachedImageRecord& operator=(uint64_t currentFrameIndex) { lastUsedFrameIndex = currentFrameIndex; return *this;  }
 };
 
 // A resource-aware image cache with an LRU eviction policy.
-// This cache tracks image usage by ID and provides hooks for eviction logic, such as releasing descriptor slots and deallocating GPU memory.
+// This cache tracks image usage by ID and provides hooks for eviction logic (such as releasing descriptor slots and deallocating GPU memory done by user of this class)
 // Currently, eviction is purely LRU-based. In the future, eviction decisions may incorporate additional factors:
 //   - memory usage per image.
 //   - lastUsedFrameIndex.
-// This class does not own GPU resources directly, but helps coordinate their lifetimes in sync with GPU usage via eviction callbacks.
-class ImagesUsageCache
+// This class helps coordinate images' lifetimes in sync with GPU usage via eviction callbacks.
+class ImagesCache : public core::ResizableLRUCache<image_id, CachedImageRecord>
 {
 public:
-	ImagesUsageCache(size_t capacity) 
-		: lruCache(ImagesLRUCache(capacity))
+	using base_t = core::ResizableLRUCache<image_id, CachedImageRecord>;
+	
+	ImagesCache(size_t capacity) 
+		: base_t(capacity)
 	{}
 
 	// Attempts to insert a new image into the cache.
 	// If the cache is full, invokes the provided `evictCallback` to evict an image.
 	// Returns a pointer to the inserted or existing ImageReference.
-	template<std::invocable<image_id, const ImageReference&> EvictionCallback>
-	inline ImageReference* insert(image_id imageID, uint64_t lastUsedSema, EvictionCallback&& evictCallback)
+	template<std::invocable<image_id, const CachedImageRecord&> EvictionCallback>
+	inline CachedImageRecord* insert(image_id imageID, uint64_t lastUsedSema, EvictionCallback&& evictCallback)
 	{
-		auto lruEvictionCallback = [&](const ImageReference& evicted)
+		auto lruEvictionCallback = [&](const CachedImageRecord& evicted)
 			{
-				const image_id* evictingKey = lruCache.get_least_recently_used();
+				const image_id* evictingKey = base_t::get_least_recently_used();
 				assert(evictingKey != nullptr);
 				if (evictingKey)
 					evictCallback(*evictingKey, evicted);
 			};
-		return lruCache.insert(imageID, lastUsedSema, lruEvictionCallback);
+		return base_t::insert(imageID, lastUsedSema, lruEvictionCallback);
 	}
 	
 	// Retrieves the image associated with `imageID`, updating its LRU position.
-	inline ImageReference* get(image_id imageID)
+	inline CachedImageRecord* get(image_id imageID)
 	{
-		return lruCache.get(imageID);
+		return base_t::get(imageID);
 	}
 	
 	// Retrieves the ImageReference without updating LRU order.
-	inline ImageReference* peek(image_id imageID)
+	inline CachedImageRecord* peek(image_id imageID)
 	{
-		return lruCache.peek(imageID);
+		return base_t::peek(imageID);
 	}
 
-	inline size_t size() const { return lruCache.size(); }
+	inline size_t size() const { return base_t::size(); }
 	
 	// Selects an eviction candidate based on LRU policy.
 	// In the future, this could factor in memory pressure or semaphore sync requirements.
 	inline image_id select_eviction_candidate() 
 	{
-		const image_id* lru = lruCache.get_least_recently_used();
+		const image_id* lru = base_t::get_least_recently_used();
 		if (lru)
 			return *lru;
 		else
@@ -212,10 +205,6 @@ class ImagesUsageCache
 	// Removes a specific image from the cache (manual eviction).
 	inline void erase(image_id imageID)
 	{
-		lruCache.erase(imageID);
+		base_t::erase(imageID);
 	}
-
-private:
-	using ImagesLRUCache = core::ResizableLRUCache<image_id, ImageReference>;
-	ImagesLRUCache lruCache; // TODO: for now, work with simple lru cache, later on consider resource usage along with lastUsedSema value
 };
diff --git a/62_CAD/main.cpp b/62_CAD/main.cpp
index 1394bf719..c59669fa6 100644
--- a/62_CAD/main.cpp
+++ b/62_CAD/main.cpp
@@ -45,7 +45,7 @@ static constexpr bool DebugModeWireframe = false;
 static constexpr bool DebugRotatingViewProj = false;
 static constexpr bool FragmentShaderPixelInterlock = true;
 static constexpr bool LargeGeoTextureStreaming = true;
-static constexpr bool CacheAndReplay = false; // caches first frame resources (buffers and images) from DrawResourcesFiller  and replays in future frames, skiping CPU Logic
+static constexpr bool CacheAndReplay = true; // caches first frame resources (buffers and images) from DrawResourcesFiller  and replays in future frames, skiping CPU Logic
 
 enum class ExampleMode
 {
@@ -2906,10 +2906,17 @@ class ComputerAidedDesign final : public examples::SimpleWindowedApplication, pu
 			{
 				uint64_t imageID = i * 69ull; // it can be hash or something of the file path the image was loaded from
 				//printf(std::format("\n Image {} \n", i).c_str());
-				drawResourcesFiller.addStaticImage2D(imageID, sampleImages[i], intendedNextSubmit);
+				drawResourcesFiller.ensureStaticImageAvailability(imageID, sampleImages[i], intendedNextSubmit);
 				drawResourcesFiller.addImageObject(imageID, { .topLeft = { 0.0 + (i) * 3.0, 0.0 }, .dirU = { 3.0 , 0.0 }, .aspectRatio = 1.0 }, intendedNextSubmit);
 				//printf("\n");
 			}
+
+			GeoreferencedImageParams geoRefParams = {};
+			geoRefParams.format = asset::EF_R8G8B8A8_SRGB;
+			geoRefParams.imageExtents = uint32_t2 (2048, 2048);
+			geoRefParams.viewportExtents = (m_realFrameIx <= 5u) ? uint32_t2(1280, 720) : uint32_t2(3840, 2160); // to test trigerring resize/recreation
+			// drawResourcesFiller.ensureGeoreferencedImageAvailability_AllocateIfNeeded(6996, geoRefParams, intendedNextSubmit);
+			
 			LineStyleInfo lineStyle = 
 			{
 				.color = float32_t4(1.0f, 0.1f, 0.1f, 0.9f),
diff --git a/62_CAD/shaders/globals.hlsl b/62_CAD/shaders/globals.hlsl
index 0280b5881..72ab980c4 100644
--- a/62_CAD/shaders/globals.hlsl
+++ b/62_CAD/shaders/globals.hlsl
@@ -118,9 +118,10 @@ enum class MainObjectType : uint32_t
     POLYLINE,
     HATCH,
     TEXT,
-    IMAGE,
+    STATIC_IMAGE,
     DTM,
-    GRID_DTM
+    GRID_DTM,
+    STREAMED_IMAGE,
 };
 
 enum class ObjectType : uint32_t
@@ -130,9 +131,10 @@ enum class ObjectType : uint32_t
     CURVE_BOX = 2u,
     POLYLINE_CONNECTOR = 3u,
     FONT_GLYPH = 4u,
-    IMAGE = 5u,
+    STATIC_IMAGE = 5u,
     TRIANGLE_MESH = 6u,
-    GRID_DTM = 7u
+    GRID_DTM = 7u,
+    STREAMED_IMAGE = 8u,
 };
 
 enum class MajorAxis : uint32_t
@@ -240,6 +242,16 @@ struct ImageObjectInfo
     uint32_t textureID; // 4 bytes (32)
 };
 
+// Goes into geometry buffer, needs to be aligned by 8
+// Currently a simple OBB like ImageObject, but later will be fullscreen with additional info about UV offset for toroidal(mirror) addressing
+struct GeoreferencedImageInfo
+{
+    pfloat64_t2 topLeft; // 2 * 8 = 16 bytes (16)
+    float32_t2 dirU; // 2 * 4 = 8 bytes (24)
+    float32_t aspectRatio; // 4 bytes (28)
+    uint32_t textureID; // 4 bytes (32)
+};
+
 // Goes into geometry buffer, needs to be aligned by 8
 struct GridDTMInfo
 {
diff --git a/62_CAD/shaders/main_pipeline/fragment_shader.hlsl b/62_CAD/shaders/main_pipeline/fragment_shader.hlsl
index 766225acd..69fab0a4c 100644
--- a/62_CAD/shaders/main_pipeline/fragment_shader.hlsl
+++ b/62_CAD/shaders/main_pipeline/fragment_shader.hlsl
@@ -375,7 +375,7 @@ float4 fragMain(PSInput input) : SV_TARGET
                 localAlpha = 1.0f - smoothstep(-globals.antiAliasingFactor / 2.0f + bolden, globals.antiAliasingFactor / 2.0f + bolden, msdf);
             }
         }
-        else if (objType == ObjectType::IMAGE) 
+        else if (objType == ObjectType::STATIC_IMAGE) 
         {
             const float2 uv = input.getImageUV();
             const uint32_t textureId = input.getImageTextureId();
@@ -413,14 +413,25 @@ float4 fragMain(PSInput input) : SV_TARGET
             printf("uv = %f, %f", uv.x, uv.y);
 
         }
-        
+        else if (objType == ObjectType::STREAMED_IMAGE) 
+        {
+            const float2 uv = input.getImageUV();
+            const uint32_t textureId = input.getImageTextureId();
+
+            if (textureId != InvalidTextureIndex)
+            {
+                float4 colorSample = textures[NonUniformResourceIndex(textureId)].Sample(textureSampler, float2(uv.x, uv.y));
+                textureColor = colorSample.rgb;
+                localAlpha = colorSample.a;
+            }
+        }
 
         uint2 fragCoord = uint2(input.position.xy);
         
         if (localAlpha <= 0)
             discard;
         
-        const bool colorFromTexture = objType == ObjectType::IMAGE || objType == ObjectType::GRID_DTM;
+        const bool colorFromTexture = objType == ObjectType::STREAMED_IMAGE || objType == ObjectType::STATIC_IMAGE || objType == ObjectType::GRID_DTM;
 
         return calculateFinalColor<DeviceConfigCaps::fragmentShaderPixelInterlock>(fragCoord, localAlpha, currentMainObjectIdx, textureColor, colorFromTexture);
     }
diff --git a/62_CAD/shaders/main_pipeline/vertex_shader.hlsl b/62_CAD/shaders/main_pipeline/vertex_shader.hlsl
index e92a8d33b..b225b64a4 100644
--- a/62_CAD/shaders/main_pipeline/vertex_shader.hlsl
+++ b/62_CAD/shaders/main_pipeline/vertex_shader.hlsl
@@ -624,7 +624,7 @@ PSInput main(uint vertexID : SV_VertexID)
             outV.setFontGlyphUV(uv);
             outV.setFontGlyphTextureId(textureID);
         }
-        else if (objType == ObjectType::IMAGE)
+        else if (objType == ObjectType::STATIC_IMAGE)
         {
             pfloat64_t2 topLeft = vk::RawBufferLoad<pfloat64_t2>(globals.pointers.geometryBuffer + drawObj.geometryAddress, 8u);
             float32_t2 dirU = vk::RawBufferLoad<float32_t2>(globals.pointers.geometryBuffer + drawObj.geometryAddress + sizeof(pfloat64_t2), 4u);
@@ -670,6 +670,27 @@ PSInput main(uint vertexID : SV_VertexID)
             outV.setGridDTMScreenSpacePosition(transformPointScreenSpace(clipProjectionData.projectionToNDC, globals.resolution, vtxPos));
             outV.setImageUV(corner);
         }
+        else if (objType == ObjectType::STREAMED_IMAGE)
+        {
+            pfloat64_t2 topLeft = vk::RawBufferLoad<pfloat64_t2>(globals.pointers.geometryBuffer + drawObj.geometryAddress, 8u);
+            float32_t2 dirU = vk::RawBufferLoad<float32_t2>(globals.pointers.geometryBuffer + drawObj.geometryAddress + sizeof(pfloat64_t2), 4u);
+            float32_t aspectRatio = vk::RawBufferLoad<float32_t>(globals.pointers.geometryBuffer + drawObj.geometryAddress + sizeof(pfloat64_t2) + sizeof(float2), 4u);
+            uint32_t textureID = vk::RawBufferLoad<uint32_t>(globals.pointers.geometryBuffer + drawObj.geometryAddress + sizeof(pfloat64_t2) + sizeof(float2) + sizeof(float), 4u);
+
+            const float32_t2 dirV = float32_t2(dirU.y, -dirU.x) * aspectRatio;
+            const float2 ndcTopLeft = _static_cast<float2>(transformPointNdc(clipProjectionData.projectionToNDC, topLeft));
+            const float2 ndcDirU = _static_cast<float2>(transformVectorNdc(clipProjectionData.projectionToNDC, _static_cast<pfloat64_t2>(dirU)));
+            const float2 ndcDirV = _static_cast<float2>(transformVectorNdc(clipProjectionData.projectionToNDC, _static_cast<pfloat64_t2>(dirV)));
+
+            float2 corner = float2(bool2(vertexIdx & 0x1u, vertexIdx >> 1));
+            float2 uv = corner; // non-dilated
+        
+            float2 ndcCorner = ndcTopLeft + corner.x * ndcDirU + corner.y * ndcDirV;
+        
+            outV.position = float4(ndcCorner, 0.f, 1.f);
+            outV.setImageUV(uv);
+            outV.setImageTextureId(textureID);
+        }
 
     // Make the cage fullscreen for testing: 
 #if 0

From f71b1d3117a2434c670bc857286a028e6c27b33d Mon Sep 17 00:00:00 2001
From: devsh <devsh@devsh.eu>
Date: Wed, 21 May 2025 17:10:19 +0200
Subject: [PATCH 279/529] do the ownership acquires properly

---
 67_RayQueryGeometry/main.cpp | 21 ++++++++++++++-------
 1 file changed, 14 insertions(+), 7 deletions(-)

diff --git a/67_RayQueryGeometry/main.cpp b/67_RayQueryGeometry/main.cpp
index 1248a1bf3..6ddefdcf2 100644
--- a/67_RayQueryGeometry/main.cpp
+++ b/67_RayQueryGeometry/main.cpp
@@ -871,9 +871,8 @@ class RayQueryGeometryApp final : public examples::SimpleWindowedApplication, pu
 				}
 
 				// assign gpu objects to output
-				auto&& tlases = reservation.getGPUObjects<ICPUTopLevelAccelerationStructure>();
-				gpuTlas = tlases[0].value;
 				auto&& buffers = reservation.getGPUObjects<ICPUBuffer>();
+				gpuTlas = reservation.getGPUObjects<ICPUTopLevelAccelerationStructure>().front().value;
 				for (uint32_t i = 0; i < objectsCpu.size(); i++)
 				{
 					auto vBuffer = buffers[2 * i + 0].value;
@@ -917,16 +916,17 @@ class RayQueryGeometryApp final : public examples::SimpleWindowedApplication, pu
 						.range = bufferRange
 					});
 				};
-				for (auto buffer : reservation.getGPUObjects<ICPUBuffer>())
+				for (const auto& buffer : reservation.getGPUObjects<ICPUBuffer>())
 				{
 					const auto& buff = buffer.value;
-					acquireBufferRange({.offset=0,.size=buff->getSize(),.buffer=buff});
+					if (buff)
+						acquireBufferRange({.offset=0,.size=buff->getSize(),.buffer=buff});
 				}
 				auto acquireAS = [&acquireBufferRange](const IGPUAccelerationStructure* as)
 				{
 					acquireBufferRange(as->getCreationParams().bufferRange);
 				};
-				for (auto blas : reservation.getGPUObjects<ICPUBottomLevelAccelerationStructure>())
+				for (const auto& blas : reservation.getGPUObjects<ICPUBottomLevelAccelerationStructure>())
 					acquireAS(blas.value.get());
 				acquireAS(gpuTlas.get());
 				cmdbuf->pipelineBarrier(asset::E_DEPENDENCY_FLAGS::EDF_NONE,{.memBarriers={},.bufBarriers=bufBarriers});
@@ -934,11 +934,18 @@ class RayQueryGeometryApp final : public examples::SimpleWindowedApplication, pu
 				const IQueue::SSubmitInfo::SCommandBufferInfo cmdbufInfo = {
 					.cmdbuf = cmdbuf.get()
 				};
+				const IQueue::SSubmitInfo::SSemaphoreInfo signal = {
+					.semaphore = compute.scratchSemaphore.semaphore,
+					.value = compute.getFutureScratchSemaphore().value,
+					.stageMask = asset::PIPELINE_STAGE_FLAGS::ALL_COMMANDS_BITS
+				};
 				const IQueue::SSubmitInfo info = {
 					.waitSemaphores = {}, // we already waited with the host on the AS build
-					.commandBuffers = {&cmdbufInfo,1}
+					.commandBuffers = {&cmdbufInfo,1},
+					.signalSemaphores = {&signal,1}
 				};
-				gQueue->submit({&info,1});
+				if (const auto retval=gQueue->submit({&info,1}); retval!=IQueue::RESULT::SUCCESS)
+					m_logger->log("Failed to transfer ownership with code %d!",system::ILogger::ELL_ERROR,retval);
 			}
 
 			return bool(gpuTlas);

From f5302ec98b37e5473a0563862f6afe2d12ec43d0 Mon Sep 17 00:00:00 2001
From: devsh <devsh@devsh.eu>
Date: Thu, 22 May 2025 00:13:12 +0200
Subject: [PATCH 280/529] Do the QFOT Acquires properly, fix a bug due to
 missing BDA on sideband smooth normal info.

Also fix alignment issues in BDA raw load
---
 67_RayQueryGeometry/app_resources/common.hlsl |   1 +
 67_RayQueryGeometry/main.cpp                  | 119 +++++++++++-------
 2 files changed, 72 insertions(+), 48 deletions(-)

diff --git a/67_RayQueryGeometry/app_resources/common.hlsl b/67_RayQueryGeometry/app_resources/common.hlsl
index ecc811e3f..e39e7192b 100644
--- a/67_RayQueryGeometry/app_resources/common.hlsl
+++ b/67_RayQueryGeometry/app_resources/common.hlsl
@@ -14,6 +14,7 @@ struct SGeomInfo
     uint32_t vertexStride : 29;
     uint32_t indexType : 2; // 16 bit, 32 bit or none
     uint32_t smoothNormals : 1;	// flat for cube, rectangle, disk
+    uint32_t padding;
 };
 
 struct SPushConstants
diff --git a/67_RayQueryGeometry/main.cpp b/67_RayQueryGeometry/main.cpp
index 6ddefdcf2..9d002c1f0 100644
--- a/67_RayQueryGeometry/main.cpp
+++ b/67_RayQueryGeometry/main.cpp
@@ -732,16 +732,21 @@ class RayQueryGeometryApp final : public examples::SimpleWindowedApplication, pu
 #endif
 
 			std::array<const ICPUBuffer*, OT_COUNT * 2u> tmpBuffers;
+			std::array<CAssetConverter::patch_t<ICPUBuffer>, OT_COUNT * 2u> tmpBufferPatches;
 			{
 				for (uint32_t i = 0; i < objectsCpu.size(); i++)
 				{
 					tmpBuffers[2 * i + 0] = cpuBlas[i]->getTriangleGeometries().front().vertexData[0].buffer.get();
 					tmpBuffers[2 * i + 1] = cpuBlas[i]->getTriangleGeometries().front().indexData.buffer.get();
 				}
+				// make sure all buffers are BDA-readable
+				for (auto& patch : tmpBufferPatches)
+					patch.usage |= asset::IBuffer::E_USAGE_FLAGS::EUF_SHADER_DEVICE_ADDRESS_BIT;
 
 				std::get<CAssetConverter::SInputs::asset_span_t<ICPUTopLevelAccelerationStructure>>(inputs.assets) = {&cpuTlas.get(),1};
 				std::get<CAssetConverter::SInputs::asset_span_t<ICPUBottomLevelAccelerationStructure>>(inputs.assets) = {&cpuBlas.data()->get(),cpuBlas.size()};
 				std::get<CAssetConverter::SInputs::asset_span_t<ICPUBuffer>>(inputs.assets) = tmpBuffers;
+				std::get<CAssetConverter::SInputs::patch_span_t<ICPUBuffer>>(inputs.patches) = tmpBufferPatches;
 			}
 
 			auto reservation = converter->reserve(inputs);
@@ -806,6 +811,7 @@ class RayQueryGeometryApp final : public examples::SimpleWindowedApplication, pu
 				.stageMask = PIPELINE_STAGE_FLAGS::ACCELERATION_STRUCTURE_BUILD_BIT|PIPELINE_STAGE_FLAGS::ACCELERATION_STRUCTURE_COPY_BIT
 			};
 			// convert
+			m_api->startCapture();
 			auto gQueue = getGraphicsQueue();
 			{
 				smart_refctd_ptr<CAssetConverter::SConvertParams::scratch_for_device_AS_build_t> scratchAlloc;
@@ -881,7 +887,7 @@ class RayQueryGeometryApp final : public examples::SimpleWindowedApplication, pu
 					const bool useIndex = geom.data.indexType != EIT_UNKNOWN;
 
 					geomInfos[i].vertexBufferAddress = vBuffer->getDeviceAddress() + byteOffsets[i];
-					geomInfos[i].indexBufferAddress = useIndex ? iBuffer->getDeviceAddress() : geomInfos[i].vertexBufferAddress;
+					geomInfos[i].indexBufferAddress = useIndex ? iBuffer->getDeviceAddress():0x0ull;
 				}
 			}
 
@@ -894,60 +900,77 @@ class RayQueryGeometryApp final : public examples::SimpleWindowedApplication, pu
 			}
 
 			// acquire ownership
-			if (const auto gQFI=gQueue->getFamilyIndex(), otherQueueFamilyIndex=queue->getFamilyIndex(); gQFI!=otherQueueFamilyIndex)
 			{
 				smart_refctd_ptr<IGPUCommandBuffer> cmdbuf;
-				m_device->createCommandPool(gQFI,IGPUCommandPool::CREATE_FLAGS::TRANSIENT_BIT)->createCommandBuffers(IGPUCommandPool::BUFFER_LEVEL::PRIMARY,{&cmdbuf,1});
-				cmdbuf->begin(IGPUCommandBuffer::USAGE::ONE_TIME_SUBMIT_BIT);
-				core::vector<IGPUCommandBuffer::SBufferMemoryBarrier<IGPUCommandBuffer::SOwnershipTransferBarrier>> bufBarriers;
-				auto acquireBufferRange = [&bufBarriers,otherQueueFamilyIndex](const SBufferRange<IGPUBuffer>& bufferRange)
-				{
-					bufBarriers.push_back({
-						.barrier = {
-							.dep = {
-								.srcStageMask = PIPELINE_STAGE_FLAGS::NONE,
-								.srcAccessMask = ACCESS_FLAGS::NONE,
-								.dstStageMask = PIPELINE_STAGE_FLAGS::COMPUTE_SHADER_BIT,
-								.dstAccessMask = ACCESS_FLAGS::ACCELERATION_STRUCTURE_READ_BIT|ACCESS_FLAGS::STORAGE_READ_BIT
-							},
-							.ownershipOp = IGPUCommandBuffer::SOwnershipTransferBarrier::OWNERSHIP_OP::ACQUIRE,
-							.otherQueueFamilyIndex = otherQueueFamilyIndex
-						},
-						.range = bufferRange
-					});
-				};
-				for (const auto& buffer : reservation.getGPUObjects<ICPUBuffer>())
 				{
-					const auto& buff = buffer.value;
-					if (buff)
-						acquireBufferRange({.offset=0,.size=buff->getSize(),.buffer=buff});
+					const auto gQFI = gQueue->getFamilyIndex();
+					m_device->createCommandPool(gQFI,IGPUCommandPool::CREATE_FLAGS::TRANSIENT_BIT)->createCommandBuffers(IGPUCommandPool::BUFFER_LEVEL::PRIMARY,{&cmdbuf,1});
+					cmdbuf->begin(IGPUCommandBuffer::USAGE::ONE_TIME_SUBMIT_BIT);
+					{
+						core::vector<IGPUCommandBuffer::SBufferMemoryBarrier<IGPUCommandBuffer::SOwnershipTransferBarrier>> bufBarriers;
+						auto acquireBufferRange = [&bufBarriers](const uint8_t otherQueueFamilyIndex, const SBufferRange<IGPUBuffer>& bufferRange)
+						{
+							bufBarriers.push_back({
+								.barrier = {
+									.dep = {
+										.srcStageMask = PIPELINE_STAGE_FLAGS::NONE,
+										.srcAccessMask = ACCESS_FLAGS::NONE,
+										.dstStageMask = PIPELINE_STAGE_FLAGS::COMPUTE_SHADER_BIT,
+										// we don't care what exactly, uncomplex our code
+										.dstAccessMask = ACCESS_FLAGS::SHADER_READ_BITS
+									},
+									.ownershipOp = IGPUCommandBuffer::SOwnershipTransferBarrier::OWNERSHIP_OP::ACQUIRE,
+									.otherQueueFamilyIndex = otherQueueFamilyIndex
+								},
+								.range = bufferRange
+							});
+						};
+						if (const auto otherQueueFamilyIndex=transfer.queue->getFamilyIndex(); gQFI!=otherQueueFamilyIndex)
+						for (const auto& buffer : reservation.getGPUObjects<ICPUBuffer>())
+						{
+							const auto& buff = buffer.value;
+							if (buff)
+								acquireBufferRange(otherQueueFamilyIndex,{.offset=0,.size=buff->getSize(),.buffer=buff});
+						}
+						if (const auto otherQueueFamilyIndex=compute.queue->getFamilyIndex(); gQFI!=otherQueueFamilyIndex)
+						{
+							auto acquireAS = [&acquireBufferRange,otherQueueFamilyIndex](const IGPUAccelerationStructure* as)
+							{
+								acquireBufferRange(otherQueueFamilyIndex,as->getCreationParams().bufferRange);
+							};
+							for (const auto& blas : reservation.getGPUObjects<ICPUBottomLevelAccelerationStructure>())
+								acquireAS(blas.value.get());
+							acquireAS(gpuTlas.get());
+						}
+						if (!bufBarriers.empty())
+							cmdbuf->pipelineBarrier(asset::E_DEPENDENCY_FLAGS::EDF_NONE,{.memBarriers={},.bufBarriers=bufBarriers});
+					}
+					cmdbuf->end();
 				}
-				auto acquireAS = [&acquireBufferRange](const IGPUAccelerationStructure* as)
+				if (!cmdbuf->empty())
 				{
-					acquireBufferRange(as->getCreationParams().bufferRange);
-				};
-				for (const auto& blas : reservation.getGPUObjects<ICPUBottomLevelAccelerationStructure>())
-					acquireAS(blas.value.get());
-				acquireAS(gpuTlas.get());
-				cmdbuf->pipelineBarrier(asset::E_DEPENDENCY_FLAGS::EDF_NONE,{.memBarriers={},.bufBarriers=bufBarriers});
-				cmdbuf->end();
-				const IQueue::SSubmitInfo::SCommandBufferInfo cmdbufInfo = {
-					.cmdbuf = cmdbuf.get()
-				};
-				const IQueue::SSubmitInfo::SSemaphoreInfo signal = {
-					.semaphore = compute.scratchSemaphore.semaphore,
-					.value = compute.getFutureScratchSemaphore().value,
-					.stageMask = asset::PIPELINE_STAGE_FLAGS::ALL_COMMANDS_BITS
-				};
-				const IQueue::SSubmitInfo info = {
-					.waitSemaphores = {}, // we already waited with the host on the AS build
-					.commandBuffers = {&cmdbufInfo,1},
-					.signalSemaphores = {&signal,1}
-				};
-				if (const auto retval=gQueue->submit({&info,1}); retval!=IQueue::RESULT::SUCCESS)
-					m_logger->log("Failed to transfer ownership with code %d!",system::ILogger::ELL_ERROR,retval);
+					const IQueue::SSubmitInfo::SCommandBufferInfo cmdbufInfo = {
+						.cmdbuf = cmdbuf.get()
+					};
+					const IQueue::SSubmitInfo::SSemaphoreInfo signal = {
+						.semaphore = compute.scratchSemaphore.semaphore,
+						.value = compute.getFutureScratchSemaphore().value,
+						.stageMask = asset::PIPELINE_STAGE_FLAGS::ALL_COMMANDS_BITS
+					};
+					auto wait = signal;
+					wait.value--;
+					const IQueue::SSubmitInfo info = {
+						.waitSemaphores = {&wait,1}, // we already waited with the host on the AS build
+						.commandBuffers = {&cmdbufInfo,1},
+						.signalSemaphores = {&signal,1}
+					};
+					if (const auto retval=gQueue->submit({&info,1}); retval!=IQueue::RESULT::SUCCESS)
+						m_logger->log("Failed to transfer ownership with code %d!",system::ILogger::ELL_ERROR,retval);
+				}
 			}
 
+			m_api->endCapture();
+
 			return bool(gpuTlas);
 		}
 #else

From 0bd7fbad07365c205c7b0014d3a5c713937fbef6 Mon Sep 17 00:00:00 2001
From: devsh <devsh@devsh.eu>
Date: Thu, 22 May 2025 00:17:25 +0200
Subject: [PATCH 281/529] ReBAR AS-conversion codepath tested and enabled by
 default now

---
 67_RayQueryGeometry/main.cpp | 6 ++++--
 1 file changed, 4 insertions(+), 2 deletions(-)

diff --git a/67_RayQueryGeometry/main.cpp b/67_RayQueryGeometry/main.cpp
index 9d002c1f0..7a6abd1af 100644
--- a/67_RayQueryGeometry/main.cpp
+++ b/67_RayQueryGeometry/main.cpp
@@ -695,7 +695,7 @@ class RayQueryGeometryApp final : public examples::SimpleWindowedApplication, pu
 			cpuTlas->setInstances(std::move(geomInstances));
 			cpuTlas->setBuildFlags(IGPUTopLevelAccelerationStructure::BUILD_FLAGS::PREFER_FAST_TRACE_BIT);
 
-#define TEST_REBAR_FALLBACK
+//#define TEST_REBAR_FALLBACK
 			// convert with asset converter
 			smart_refctd_ptr<CAssetConverter> converter = CAssetConverter::create({ .device = m_device.get(), .optimizer = {} });
 			struct MyInputs : CAssetConverter::SInputs
@@ -862,7 +862,6 @@ class RayQueryGeometryApp final : public examples::SimpleWindowedApplication, pu
 
 					uint8_t finalUser;
 				} params = {};
-#undef TEST_REBAR_FALLBACK
 				params.utilities = m_utils.get();
 				params.transfer = &transfer;
 				params.compute = &compute;
@@ -925,6 +924,7 @@ class RayQueryGeometryApp final : public examples::SimpleWindowedApplication, pu
 								.range = bufferRange
 							});
 						};
+#ifndef TEST_REBAR_FALLBACK
 						if (const auto otherQueueFamilyIndex=transfer.queue->getFamilyIndex(); gQFI!=otherQueueFamilyIndex)
 						for (const auto& buffer : reservation.getGPUObjects<ICPUBuffer>())
 						{
@@ -932,6 +932,7 @@ class RayQueryGeometryApp final : public examples::SimpleWindowedApplication, pu
 							if (buff)
 								acquireBufferRange(otherQueueFamilyIndex,{.offset=0,.size=buff->getSize(),.buffer=buff});
 						}
+#endif
 						if (const auto otherQueueFamilyIndex=compute.queue->getFamilyIndex(); gQFI!=otherQueueFamilyIndex)
 						{
 							auto acquireAS = [&acquireBufferRange,otherQueueFamilyIndex](const IGPUAccelerationStructure* as)
@@ -968,6 +969,7 @@ class RayQueryGeometryApp final : public examples::SimpleWindowedApplication, pu
 						m_logger->log("Failed to transfer ownership with code %d!",system::ILogger::ELL_ERROR,retval);
 				}
 			}
+#undef TEST_REBAR_FALLBACK
 
 			m_api->endCapture();
 

From ff962daed731df536a909ee8d12d2f8d3579431b Mon Sep 17 00:00:00 2001
From: devsh <devsh@devsh.eu>
Date: Thu, 22 May 2025 01:04:12 +0200
Subject: [PATCH 282/529] typo and no point announcing ass-conv failures if the
 ass-conv will do it for us

---
 67_RayQueryGeometry/main.cpp | 22 ++--------------------
 1 file changed, 2 insertions(+), 20 deletions(-)

diff --git a/67_RayQueryGeometry/main.cpp b/67_RayQueryGeometry/main.cpp
index 7a6abd1af..8690f55bc 100644
--- a/67_RayQueryGeometry/main.cpp
+++ b/67_RayQueryGeometry/main.cpp
@@ -750,24 +750,6 @@ class RayQueryGeometryApp final : public examples::SimpleWindowedApplication, pu
 			}
 
 			auto reservation = converter->reserve(inputs);
-			{
-				auto prepass = [&]<typename asset_type_t>() -> bool
-				{
-					auto objects = reservation.getGPUObjects<asset_type_t>();
-					for (auto& object : objects)
-					if (!object.value)
-					{
-						m_logger->log("Failed to convert a CPU object to GPU!", ILogger::ELL_ERROR);
-						return false;
-					}
-					return true;
-				};
-
-				prepass.template operator()<ICPUBuffer>();
-				prepass.template operator()<ICPUBottomLevelAccelerationStructure>();
-				prepass.template operator()<ICPUTopLevelAccelerationStructure>();
-			}
-
 
 			constexpr auto XferBufferCount = 2;
 			std::array<smart_refctd_ptr<IGPUCommandBuffer>,XferBufferCount> xferBufs = {};
@@ -893,7 +875,7 @@ class RayQueryGeometryApp final : public examples::SimpleWindowedApplication, pu
 			//
 			{
 				IGPUBuffer::SCreationParams params;
-				params.usage = IGPUBuffer::EUF_STORAGE_BUFFER_BIT | IGPUBuffer::EUF_TRANSFER_DST_BIT | IGPUBuffer::EUF_INLINE_UPDATE_VIA_CMDBUF | IGPUBuffer::EUF_SHADER_DEVICE_ADDRESS_BIT;
+				params.usage = IGPUBuffer::EUF_STORAGE_BUFFER_BIT | IGPUBuffer::EUF_TRANSFER_DST_BIT | IGPUBuffer::EUF_SHADER_DEVICE_ADDRESS_BIT;
 				params.size = OT_COUNT * sizeof(SGeomInfo);
 				m_utils->createFilledDeviceLocalBufferOnDedMem(SIntendedSubmitInfo{ .queue = gQueue }, std::move(params), geomInfos).move_into(geometryInfoBuffer);
 			}
@@ -924,7 +906,7 @@ class RayQueryGeometryApp final : public examples::SimpleWindowedApplication, pu
 								.range = bufferRange
 							});
 						};
-#ifndef TEST_REBAR_FALLBACK
+#ifdef TEST_REBAR_FALLBACK
 						if (const auto otherQueueFamilyIndex=transfer.queue->getFamilyIndex(); gQFI!=otherQueueFamilyIndex)
 						for (const auto& buffer : reservation.getGPUObjects<ICPUBuffer>())
 						{

From 09f0d32cf936dd5d200d39cd7eddc3f610766e0b Mon Sep 17 00:00:00 2001
From: devsh <devsh@devsh.eu>
Date: Thu, 22 May 2025 01:47:53 +0200
Subject: [PATCH 283/529] stuff was crashing cause IGPUBuffers backing BDA were
 going out of scope

---
 .../app_resources/render.comp.hlsl            |   4 +-
 67_RayQueryGeometry/main.cpp                  | 584 +-----------------
 2 files changed, 27 insertions(+), 561 deletions(-)

diff --git a/67_RayQueryGeometry/app_resources/render.comp.hlsl b/67_RayQueryGeometry/app_resources/render.comp.hlsl
index e3d78f385..aa4524124 100644
--- a/67_RayQueryGeometry/app_resources/render.comp.hlsl
+++ b/67_RayQueryGeometry/app_resources/render.comp.hlsl
@@ -125,8 +125,8 @@ void main(uint32_t3 threadID : SV_DispatchThreadID)
         const int primID = spirv::rayQueryGetIntersectionPrimitiveIndexKHR(query, true);
 
         // TODO: candidate for `bda::__ptr<SGeomInfo>`
-        const SGeomInfo geom = vk::RawBufferLoad<SGeomInfo>(pc.geometryInfoBuffer + instID * sizeof(SGeomInfo));
-        
+        const SGeomInfo geom = vk::RawBufferLoad<SGeomInfo>(pc.geometryInfoBuffer + instID * sizeof(SGeomInfo),8);
+
         float3 normals;
         if (jit::device_capabilities::rayTracingPositionFetch)
         {
diff --git a/67_RayQueryGeometry/main.cpp b/67_RayQueryGeometry/main.cpp
index 8690f55bc..1faeaf196 100644
--- a/67_RayQueryGeometry/main.cpp
+++ b/67_RayQueryGeometry/main.cpp
@@ -1,11 +1,8 @@
 // Copyright (C) 2018-2024 - DevSH Graphics Programming Sp. z O.O.
 // This file is part of the "Nabla Engine".
 // For conditions of distribution and use, see copyright notice in nabla.h
-
 #include "common.hpp"
 
-#define TEST_ASSET_CONV_AS
-
 class RayQueryGeometryApp final : public examples::SimpleWindowedApplication, public application_templates::MonoAssetManagerAndBuiltinResourceApplication
 {
 		using device_base_t = examples::SimpleWindowedApplication;
@@ -128,14 +125,6 @@ class RayQueryGeometryApp final : public examples::SimpleWindowedApplication, pu
 
 			auto cQueue = getComputeQueue();
 
-#ifdef TEST_ASSET_CONV_AS
-			if (!createAccelerationStructuresFromGeometry(cQueue, geometryCreator))
-				return logFail("Could not create acceleration structures from provided geometry creator");
-#else
-			// create geometry objects
-			if (!createGeometries(gQueue, geometryCreator))
-				return logFail("Could not create geometries from geometry creator");
-
 			// create blas/tlas
 //#define TRY_BUILD_FOR_NGFX // Validation errors on the fake Acquire-Presents, TODO fix
 #ifdef TRY_BUILD_FOR_NGFX
@@ -148,12 +137,11 @@ class RayQueryGeometryApp final : public examples::SimpleWindowedApplication, pu
 					std::this_thread::yield();
 			}
 			// Nsight is special and can't capture anything not on the queue that performs the swapchain acquire/release
-			if (!createAccelerationStructures(gQueue))
+			if (!createAccelerationStructuresFromGeometry(gQueue,geometryCreator))
 #else
-			if (!createAccelerationStructures(cQueue))
+			if (!createAccelerationStructuresFromGeometry(cQueue,geometryCreator))
 #endif
 				return logFail("Could not create acceleration structures");
-#endif	// TEST_ASSET_CONV_AS
 
 			// create pipelines
 			{
@@ -197,10 +185,10 @@ class RayQueryGeometryApp final : public examples::SimpleWindowedApplication, pu
 				auto descriptorSetLayout = m_device->createDescriptorSetLayout(bindings);
 
 				const std::array<IGPUDescriptorSetLayout*, ICPUPipelineLayout::DESCRIPTOR_SET_COUNT> dsLayoutPtrs = { descriptorSetLayout.get() };
-				renderPool = m_device->createDescriptorPoolForDSLayouts(IDescriptorPool::ECF_UPDATE_AFTER_BIND_BIT, std::span(dsLayoutPtrs.begin(), dsLayoutPtrs.end()));
-				if (!renderPool)
+				auto pool = m_device->createDescriptorPoolForDSLayouts(IDescriptorPool::ECF_UPDATE_AFTER_BIND_BIT, std::span(dsLayoutPtrs.begin(), dsLayoutPtrs.end()));
+				if (!pool)
 					return logFail("Could not create descriptor pool");
-				renderDs = renderPool->createDescriptorSet(descriptorSetLayout);
+				renderDs = pool->createDescriptorSet(descriptorSetLayout);
 				if (!renderDs)
 					return logFail("Could not create descriptor set");
 
@@ -288,7 +276,6 @@ class RayQueryGeometryApp final : public examples::SimpleWindowedApplication, pu
 			static bool first = true;
 			if (first)
 			{
-				m_api->startCapture();
 				first = false;
 			}
 
@@ -527,79 +514,6 @@ class RayQueryGeometryApp final : public examples::SimpleWindowedApplication, pu
 			return (dim + size - 1) / size;
 		}
 
-		smart_refctd_ptr<IGPUBuffer> createBuffer(IGPUBuffer::SCreationParams& params)
-		{
-			smart_refctd_ptr<IGPUBuffer> buffer;
-			buffer = m_device->createBuffer(std::move(params));
-			auto bufReqs = buffer->getMemoryReqs();
-			bufReqs.memoryTypeBits &= m_physicalDevice->getDeviceLocalMemoryTypeBits();
-			m_device->allocate(bufReqs, buffer.get(), IDeviceMemoryAllocation::EMAF_DEVICE_ADDRESS_BIT);
-
-			return buffer;
-		}
-
-#ifndef TEST_ASSET_CONV_AS
-		smart_refctd_ptr<IGPUCommandBuffer> getSingleUseCommandBufferAndBegin(smart_refctd_ptr<IGPUCommandPool> pool)
-		{
-			smart_refctd_ptr<IGPUCommandBuffer> cmdbuf;
-			if (!pool->createCommandBuffers(IGPUCommandPool::BUFFER_LEVEL::PRIMARY, 1u, &cmdbuf))
-				return nullptr;
-
-			cmdbuf->reset(IGPUCommandBuffer::RESET_FLAGS::RELEASE_RESOURCES_BIT);
-			cmdbuf->begin(IGPUCommandBuffer::USAGE::ONE_TIME_SUBMIT_BIT);
-
-			return cmdbuf;
-		}
-
-		void cmdbufSubmitAndWait(smart_refctd_ptr<IGPUCommandBuffer> cmdbuf, CThreadSafeQueueAdapter* queue, uint64_t startValue)
-		{
-			cmdbuf->end();
-
-			uint64_t finishedValue = startValue + 1;
-
-			// submit builds
-			{
-				auto completed = m_device->createSemaphore(startValue);
-
-				std::array<IQueue::SSubmitInfo::SSemaphoreInfo, 1u> signals;
-				{
-					auto& signal = signals.front();
-					signal.value = finishedValue;
-					signal.stageMask = bitflag(PIPELINE_STAGE_FLAGS::ALL_TRANSFER_BITS);
-					signal.semaphore = completed.get();
-				}
-
-				const IQueue::SSubmitInfo::SCommandBufferInfo commandBuffers[1] = { {
-					.cmdbuf = cmdbuf.get()
-				} };
-
-				const IQueue::SSubmitInfo infos[] =
-				{
-					{
-						.waitSemaphores = {},
-						.commandBuffers = commandBuffers,
-						.signalSemaphores = signals
-					}
-				};
-
-				if (queue->submit(infos) != IQueue::RESULT::SUCCESS)
-				{
-					m_logger->log("Failed to submit geometry transfer upload operations!", ILogger::ELL_ERROR);
-					return;
-				}
-
-				const ISemaphore::SWaitInfo info[] =
-				{ {
-					.semaphore = completed.get(),
-					.value = finishedValue
-				} };
-
-				m_device->blockForSemaphores(info);
-			}
-		}
-#endif
-
-#ifdef TEST_ASSET_CONV_AS
 		bool createAccelerationStructuresFromGeometry(video::CThreadSafeQueueAdapter* queue, const IGeometryCreator* gc)
 		{
 			// get geometries in ICPUBuffers
@@ -793,6 +707,18 @@ class RayQueryGeometryApp final : public examples::SimpleWindowedApplication, pu
 				.stageMask = PIPELINE_STAGE_FLAGS::ACCELERATION_STRUCTURE_BUILD_BIT|PIPELINE_STAGE_FLAGS::ACCELERATION_STRUCTURE_COPY_BIT
 			};
 			// convert
+#ifdef TRY_BUILD_FOR_NGFX // NSight is "debugger-challenged" it can't capture anything not happenning "during a frame", so we need to trick it
+			m_currentImageAcquire = m_surface->acquireNextImage();
+			{
+				const IQueue::SSubmitInfo::SSemaphoreInfo acquired[] = { {
+					.semaphore = m_currentImageAcquire.semaphore,
+					.value = m_currentImageAcquire.acquireCount,
+					.stageMask = PIPELINE_STAGE_FLAGS::ALL_COMMANDS_BITS
+				} };
+				m_surface->present(m_currentImageAcquire.imageIndex,acquired);
+			}
+			m_currentImageAcquire = m_surface->acquireNextImage();
+#endif
 			m_api->startCapture();
 			auto gQueue = getGraphicsQueue();
 			{
@@ -858,12 +784,14 @@ class RayQueryGeometryApp final : public examples::SimpleWindowedApplication, pu
 				}
 
 				// assign gpu objects to output
-				auto&& buffers = reservation.getGPUObjects<ICPUBuffer>();
 				gpuTlas = reservation.getGPUObjects<ICPUTopLevelAccelerationStructure>().front().value;
+				for (const auto& buffer : reservation.getGPUObjects<ICPUBuffer>())
+				if (buffer)
+					retainedBuffers.push_back(buffer.value);
 				for (uint32_t i = 0; i < objectsCpu.size(); i++)
 				{
-					auto vBuffer = buffers[2 * i + 0].value;
-					auto iBuffer = buffers[2 * i + 1].value;
+					auto vBuffer = retainedBuffers[2 * i + 0].get();
+					auto iBuffer = retainedBuffers[2 * i + 1].get();
 					const auto& geom = objectsCpu[i];
 					const bool useIndex = geom.data.indexType != EIT_UNKNOWN;
 
@@ -952,465 +880,7 @@ class RayQueryGeometryApp final : public examples::SimpleWindowedApplication, pu
 				}
 			}
 #undef TEST_REBAR_FALLBACK
-
-			m_api->endCapture();
-
-			return bool(gpuTlas);
-		}
-#else
-		bool createGeometries(video::CThreadSafeQueueAdapter* queue, const IGeometryCreator* gc)
-		{
-			auto pool = m_device->createCommandPool(queue->getFamilyIndex(), IGPUCommandPool::CREATE_FLAGS::RESET_COMMAND_BUFFER_BIT);
-			if (!pool)
-				return logFail("Couldn't create Command Pool for geometry creation!");
-
-			std::array<ReferenceObjectCpu, OT_COUNT> objectsCpu;
-			objectsCpu[OT_CUBE] = ReferenceObjectCpu{ .meta = {.type = OT_CUBE, .name = "Cube Mesh" }, .shadersType = GP_BASIC, .data = gc->createCubeMesh(nbl::core::vector3df(1.f, 1.f, 1.f)) };
-			objectsCpu[OT_SPHERE] = ReferenceObjectCpu{ .meta = {.type = OT_SPHERE, .name = "Sphere Mesh" }, .shadersType = GP_BASIC, .data = gc->createSphereMesh(2, 16, 16) };
-			objectsCpu[OT_CYLINDER] = ReferenceObjectCpu{ .meta = {.type = OT_CYLINDER, .name = "Cylinder Mesh" }, .shadersType = GP_BASIC, .data = gc->createCylinderMesh(2, 2, 20) };
-			objectsCpu[OT_RECTANGLE] = ReferenceObjectCpu{ .meta = {.type = OT_RECTANGLE, .name = "Rectangle Mesh" }, .shadersType = GP_BASIC, .data = gc->createRectangleMesh(nbl::core::vector2df_SIMD(1.5, 3)) };
-			objectsCpu[OT_DISK] = ReferenceObjectCpu{ .meta = {.type = OT_DISK, .name = "Disk Mesh" }, .shadersType = GP_BASIC, .data = gc->createDiskMesh(2, 30) };
-			objectsCpu[OT_ARROW] = ReferenceObjectCpu{ .meta = {.type = OT_ARROW, .name = "Arrow Mesh" }, .shadersType = GP_BASIC, .data = gc->createArrowMesh() };
-			objectsCpu[OT_CONE] = ReferenceObjectCpu{ .meta = {.type = OT_CONE, .name = "Cone Mesh" }, .shadersType = GP_CONE, .data = gc->createConeMesh(2, 3, 10) };
-			objectsCpu[OT_ICOSPHERE] = ReferenceObjectCpu{ .meta = {.type = OT_ICOSPHERE, .name = "Icosphere Mesh" }, .shadersType = GP_ICO, .data = gc->createIcoSphere(1, 3, true) };
-
-			struct ScratchVIBindings
-			{
-				nbl::asset::SBufferBinding<ICPUBuffer> vertex, index;
-			};
-			std::array<ScratchVIBindings, OT_COUNT> scratchBuffers;
-			//std::array<SGeomInfo, OT_COUNT> geomInfos;
-			auto geomInfoBuffer = ICPUBuffer::create({ OT_COUNT * sizeof(SGeomInfo) });
 			
-			SGeomInfo* geomInfos = reinterpret_cast<SGeomInfo*>(geomInfoBuffer->getPointer());
-			const uint32_t byteOffsets[OT_COUNT] = { 18, 24, 24, 20, 20, 24, 16, 12 };	// based on normals data position
-			const uint32_t smoothNormals[OT_COUNT] = { 0, 1, 1, 0, 0, 1, 1, 1 };
-
-			for (uint32_t i = 0; i < objectsCpu.size(); i++)
-			{
-				const auto& geom = objectsCpu[i];
-				auto& obj = objectsGpu[i];
-				auto& scratchObj = scratchBuffers[i];
-
-				obj.meta.name = geom.meta.name;
-				obj.meta.type = geom.meta.type;
-
-				obj.indexCount = geom.data.indexCount;
-				obj.indexType = geom.data.indexType;
-				obj.vertexStride = geom.data.inputParams.bindings[0].stride;
-
-				geomInfos[i].indexType = obj.indexType;
-				geomInfos[i].vertexStride = obj.vertexStride;
-				geomInfos[i].smoothNormals = smoothNormals[i];
-
-				auto vBuffer = smart_refctd_ptr(geom.data.bindings[0].buffer); // no offset
-				auto vUsage = bitflag(IGPUBuffer::EUF_STORAGE_BUFFER_BIT) | IGPUBuffer::EUF_TRANSFER_DST_BIT | IGPUBuffer::EUF_INLINE_UPDATE_VIA_CMDBUF | 
-					IGPUBuffer::EUF_ACCELERATION_STRUCTURE_BUILD_INPUT_READ_ONLY_BIT | IGPUBuffer::EUF_SHADER_DEVICE_ADDRESS_BIT;
-				obj.bindings.vertex.offset = 0u;
-
-				auto iBuffer = smart_refctd_ptr(geom.data.indexBuffer.buffer); // no offset
-				auto iUsage = bitflag(IGPUBuffer::EUF_STORAGE_BUFFER_BIT) | IGPUBuffer::EUF_TRANSFER_DST_BIT | IGPUBuffer::EUF_INLINE_UPDATE_VIA_CMDBUF |
-					IGPUBuffer::EUF_ACCELERATION_STRUCTURE_BUILD_INPUT_READ_ONLY_BIT | IGPUBuffer::EUF_SHADER_DEVICE_ADDRESS_BIT;
-				obj.bindings.index.offset = 0u;
-
-				vBuffer->addUsageFlags(vUsage);
-				vBuffer->setContentHash(vBuffer->computeContentHash());
-				scratchObj.vertex = { .offset = 0, .buffer = vBuffer };
-
-				if (geom.data.indexType != EIT_UNKNOWN)
-					if (iBuffer)
-					{
-						iBuffer->addUsageFlags(iUsage);
-						iBuffer->setContentHash(iBuffer->computeContentHash());
-					}
-				scratchObj.index = { .offset = 0, .buffer = iBuffer };
-			}
-
-			auto cmdbuf = getSingleUseCommandBufferAndBegin(pool);
-			cmdbuf->beginDebugMarker("Build geometry vertex and index buffers");
-
-			smart_refctd_ptr<CAssetConverter> converter = CAssetConverter::create({ .device = m_device.get(), .optimizer = {} });
-			CAssetConverter::SInputs inputs = {};
-			inputs.logger = m_logger.get();
-
-			std::array<ICPUBuffer*, OT_COUNT * 2u> tmpBuffers;
-			{
-				for (uint32_t i = 0; i < objectsCpu.size(); i++)
-				{
-					tmpBuffers[2 * i + 0] = scratchBuffers[i].vertex.buffer.get();
-					tmpBuffers[2 * i + 1] = scratchBuffers[i].index.buffer.get();
-				}
-
-				std::get<CAssetConverter::SInputs::asset_span_t<ICPUBuffer>>(inputs.assets) = tmpBuffers;
-			}
-
-			auto reservation = converter->reserve(inputs);
-			{
-				auto prepass = [&]<typename asset_type_t>(const auto & references) -> bool
-				{
-					auto objects = reservation.getGPUObjects<asset_type_t>();
-					uint32_t counter = {};
-					for (auto& object : objects)
-					{
-						auto gpu = object.value;
-						auto* reference = references[counter];
-
-						if (reference)
-						{
-							if (!gpu)
-							{
-								m_logger->log("Failed to convert a CPU object to GPU!", ILogger::ELL_ERROR);
-								return false;
-							}
-						}
-						counter++;
-					}
-					return true;
-				};
-
-				prepass.template operator() < ICPUBuffer > (tmpBuffers);
-			}
-
-			// not sure if need this (probably not, originally for transition img view)
-			auto semaphore = m_device->createSemaphore(0u);
-
-			std::array<IQueue::SSubmitInfo::SCommandBufferInfo, 1> cmdbufs = {};
-			cmdbufs.front().cmdbuf = cmdbuf.get();
-
-			SIntendedSubmitInfo transfer = {};
-			transfer.queue = queue;
-			transfer.scratchCommandBuffers = cmdbufs;
-			transfer.scratchSemaphore = {
-				.semaphore = semaphore.get(),
-				.value = 0u,
-				.stageMask = PIPELINE_STAGE_FLAGS::ALL_TRANSFER_BITS
-			};
-			// convert
-			{
-				CAssetConverter::SConvertParams params = {};
-				params.utilities = m_utils.get();
-				params.transfer = &transfer;
-
-				auto future = reservation.convert(params);
-				if (future.copy() != IQueue::RESULT::SUCCESS)
-				{
-					m_logger->log("Failed to await submission feature!", ILogger::ELL_ERROR);
-					return false;
-				}
-
-				// assign gpu objects to output
-				auto&& buffers = reservation.getGPUObjects<ICPUBuffer>();
-				for (uint32_t i = 0; i < objectsCpu.size(); i++)
-				{
-					auto& obj = objectsGpu[i];
-					obj.bindings.vertex = { .offset = 0, .buffer = buffers[2 * i + 0].value };
-					obj.bindings.index = { .offset = 0, .buffer = buffers[2 * i + 1].value };
-
-					geomInfos[i].vertexBufferAddress = obj.bindings.vertex.buffer->getDeviceAddress() + byteOffsets[i];
-					geomInfos[i].indexBufferAddress = obj.useIndex() ? obj.bindings.index.buffer->getDeviceAddress() : geomInfos[i].vertexBufferAddress;
-				}
-			}
-
-			{
-				IGPUBuffer::SCreationParams params;
-				params.usage = IGPUBuffer::EUF_STORAGE_BUFFER_BIT | IGPUBuffer::EUF_TRANSFER_DST_BIT | IGPUBuffer::EUF_INLINE_UPDATE_VIA_CMDBUF | IGPUBuffer::EUF_SHADER_DEVICE_ADDRESS_BIT;
-				params.size = OT_COUNT * sizeof(SGeomInfo);
-				m_utils->createFilledDeviceLocalBufferOnDedMem(SIntendedSubmitInfo{.queue = queue}, std::move(params), geomInfos).move_into(geometryInfoBuffer);
-			}
-
-			return true;
-		}
-
-		bool createAccelerationStructures(video::CThreadSafeQueueAdapter* queue)
-		{
-			IQueryPool::SCreationParams qParams{ .queryCount = OT_COUNT, .queryType = IQueryPool::ACCELERATION_STRUCTURE_COMPACTED_SIZE };
-			smart_refctd_ptr<IQueryPool> queryPool = m_device->createQueryPool(std::move(qParams));
-
-			auto pool = m_device->createCommandPool(queue->getFamilyIndex(), IGPUCommandPool::CREATE_FLAGS::RESET_COMMAND_BUFFER_BIT | IGPUCommandPool::CREATE_FLAGS::TRANSIENT_BIT);
-			if (!pool)
-				return logFail("Couldn't create Command Pool for blas/tlas creation!");
-			
-			m_api->startCapture();
-#ifdef TRY_BUILD_FOR_NGFX // NSight is "debugger-challenged" it can't capture anything not happenning "during a frame", so we need to trick it
-			m_currentImageAcquire = m_surface->acquireNextImage();
-			{
-				const IQueue::SSubmitInfo::SSemaphoreInfo acquired[] = { {
-					.semaphore = m_currentImageAcquire.semaphore,
-					.value = m_currentImageAcquire.acquireCount,
-					.stageMask = PIPELINE_STAGE_FLAGS::ALL_COMMANDS_BITS
-				} };
-				m_surface->present(m_currentImageAcquire.imageIndex,acquired);
-			}
-			m_currentImageAcquire = m_surface->acquireNextImage();
-#endif
-			size_t totalScratchSize = 0;
-			const auto scratchOffsetAlignment = m_device->getPhysicalDevice()->getLimits().minAccelerationStructureScratchOffsetAlignment;
-
-			// build bottom level ASes
-			{
-				IGPUBottomLevelAccelerationStructure::DeviceBuildInfo blasBuildInfos[OT_COUNT];
-				uint32_t primitiveCounts[OT_COUNT];
-				IGPUBottomLevelAccelerationStructure::Triangles<IGPUBuffer> triangles[OT_COUNT];
-				uint32_t scratchSizes[OT_COUNT];
-
-				for (uint32_t i = 0; i < objectsGpu.size(); i++)
-				{
-					const auto& obj = objectsGpu[i];
-
-					const uint32_t vertexStride = obj.vertexStride;
-					const uint32_t numVertices = obj.bindings.vertex.buffer->getSize() / vertexStride;
-					if (obj.useIndex())
-						primitiveCounts[i] = obj.indexCount / 3;
-					else
-						primitiveCounts[i] = numVertices / 3;
-
-					triangles[i].vertexData[0] = obj.bindings.vertex;
-					triangles[i].indexData = obj.useIndex() ? obj.bindings.index : obj.bindings.vertex;
-					triangles[i].maxVertex = numVertices - 1;
-					triangles[i].vertexStride = vertexStride;
-					triangles[i].vertexFormat = EF_R32G32B32_SFLOAT;
-					triangles[i].indexType = obj.indexType;
-					triangles[i].geometryFlags = IGPUBottomLevelAccelerationStructure::GEOMETRY_FLAGS::OPAQUE_BIT;
-
-					auto blasFlags = bitflag(IGPUBottomLevelAccelerationStructure::BUILD_FLAGS::PREFER_FAST_TRACE_BIT) | IGPUBottomLevelAccelerationStructure::BUILD_FLAGS::ALLOW_COMPACTION_BIT;
-					if (m_physicalDevice->getProperties().limits.rayTracingPositionFetch)
-						blasFlags |= IGPUBottomLevelAccelerationStructure::BUILD_FLAGS::ALLOW_DATA_ACCESS;
-
-					blasBuildInfos[i].buildFlags = blasFlags;
-					blasBuildInfos[i].geometryCount = 1;	// only 1 geometry object per blas
-					blasBuildInfos[i].srcAS = nullptr;
-					blasBuildInfos[i].dstAS = nullptr;
-					blasBuildInfos[i].triangles = &triangles[i];
-					blasBuildInfos[i].scratch = {};
-
-					ILogicalDevice::AccelerationStructureBuildSizes buildSizes;
-					{
-						const auto* trianglesData = triangles;
-						const uint32_t maxPrimCount[1] = { primitiveCounts[i] };
-						buildSizes = m_device->getAccelerationStructureBuildSizes(false,blasFlags, false, std::span{trianglesData,1}, maxPrimCount);
-						if (!buildSizes)
-							return logFail("Failed to get BLAS build sizes");
-					}
-
-					scratchSizes[i] = buildSizes.buildScratchSize;
-					totalScratchSize = core::alignUp(totalScratchSize, scratchOffsetAlignment);
-					totalScratchSize += buildSizes.buildScratchSize;
-
-					{
-						IGPUBuffer::SCreationParams params;
-						params.usage = bitflag(IGPUBuffer::EUF_SHADER_DEVICE_ADDRESS_BIT) | IGPUBuffer::EUF_ACCELERATION_STRUCTURE_STORAGE_BIT;
-						params.size = buildSizes.accelerationStructureSize;
-						smart_refctd_ptr<IGPUBuffer> asBuffer = createBuffer(params);
-
-						IGPUBottomLevelAccelerationStructure::SCreationParams blasParams;
-						blasParams.bufferRange.buffer = asBuffer;
-						blasParams.bufferRange.offset = 0u;
-						blasParams.bufferRange.size = buildSizes.accelerationStructureSize;
-						blasParams.flags = IGPUBottomLevelAccelerationStructure::SCreationParams::FLAGS::NONE;
-						gpuBlas[i] = m_device->createBottomLevelAccelerationStructure(std::move(blasParams));
-						if (!gpuBlas[i])
-							return logFail("Could not create BLAS");
-					}
-				}
-
-				auto cmdbufBlas = getSingleUseCommandBufferAndBegin(pool);
-				cmdbufBlas->beginDebugMarker("Build BLAS");
-
-				cmdbufBlas->resetQueryPool(queryPool.get(), 0, objectsGpu.size());
-
-				smart_refctd_ptr<IGPUBuffer> scratchBuffer;
-				{
-					IGPUBuffer::SCreationParams params;
-					params.usage = bitflag(IGPUBuffer::EUF_SHADER_DEVICE_ADDRESS_BIT) | IGPUBuffer::EUF_STORAGE_BUFFER_BIT;
-					params.size = totalScratchSize;
-					scratchBuffer = createBuffer(params);
-				}
-
-				uint32_t queryCount = 0;
-				IGPUBottomLevelAccelerationStructure::BuildRangeInfo buildRangeInfos[OT_COUNT];
-				IGPUBottomLevelAccelerationStructure::BuildRangeInfo* pRangeInfos[OT_COUNT];
-				for (uint32_t i = 0; i < objectsGpu.size(); i++)
-				{
-					blasBuildInfos[i].dstAS = gpuBlas[i].get();
-					blasBuildInfos[i].scratch.buffer = scratchBuffer;
-					if (i == 0)
-					{
-						blasBuildInfos[i].scratch.offset = 0u;
-					}
-					else
-					{
-						const auto unalignedOffset = blasBuildInfos[i - 1].scratch.offset + scratchSizes[i - 1];
-						blasBuildInfos[i].scratch.offset = core::alignUp(unalignedOffset, scratchOffsetAlignment);
-					}
-
-					buildRangeInfos[i].primitiveCount = primitiveCounts[i];
-					buildRangeInfos[i].primitiveByteOffset = 0u;
-					buildRangeInfos[i].firstVertex = 0u;
-					buildRangeInfos[i].transformByteOffset = 0u;
-
-					pRangeInfos[i] = &buildRangeInfos[i];
-				}
-
-				if (!cmdbufBlas->buildAccelerationStructures({ blasBuildInfos, OT_COUNT }, pRangeInfos))
-					return logFail("Failed to build BLAS");
-
-				{
-					SMemoryBarrier memBarrier;
-					memBarrier.srcStageMask = PIPELINE_STAGE_FLAGS::ACCELERATION_STRUCTURE_BUILD_BIT;
-					memBarrier.srcAccessMask = ACCESS_FLAGS::ACCELERATION_STRUCTURE_WRITE_BIT;
-					memBarrier.dstStageMask = PIPELINE_STAGE_FLAGS::ACCELERATION_STRUCTURE_BUILD_BIT;
-					memBarrier.dstAccessMask = ACCESS_FLAGS::ACCELERATION_STRUCTURE_READ_BIT;
-					cmdbufBlas->pipelineBarrier(E_DEPENDENCY_FLAGS::EDF_NONE, { .memBarriers = {&memBarrier, 1} });
-				}
-
-				const IGPUAccelerationStructure* ases[OT_COUNT];
-				for (uint32_t i = 0; i < objectsGpu.size(); i++)
-					ases[i] = gpuBlas[i].get();
-				if (!cmdbufBlas->writeAccelerationStructureProperties({ ases, OT_COUNT }, IQueryPool::ACCELERATION_STRUCTURE_COMPACTED_SIZE,
-					queryPool.get(), queryCount++))
-					return logFail("Failed to write acceleration structure properties!");
-
-				cmdbufBlas->endDebugMarker();
-				cmdbufSubmitAndWait(cmdbufBlas, queue, 39);
-			}
-
-			auto cmdbufCompact = getSingleUseCommandBufferAndBegin(pool);
-			cmdbufCompact->beginDebugMarker("Compact BLAS");
-
-			// compact blas
-			{
-				std::array<size_t, OT_COUNT> asSizes{ 0 };
-				if (!m_device->getQueryPoolResults(queryPool.get(), 0, objectsGpu.size(), asSizes.data(), sizeof(size_t), bitflag(IQueryPool::RESULTS_FLAGS::WAIT_BIT)|IQueryPool::_64_BIT))
-					return logFail("Could not get query pool results for AS sizes");
-
-				std::array<smart_refctd_ptr<IGPUBottomLevelAccelerationStructure>, OT_COUNT> cleanupBlas;
-				for (uint32_t i = 0; i < objectsGpu.size(); i++)
-				{
-					cleanupBlas[i] = gpuBlas[i];
-					{
-						IGPUBuffer::SCreationParams params;
-						params.usage = bitflag(IGPUBuffer::EUF_SHADER_DEVICE_ADDRESS_BIT) | IGPUBuffer::EUF_ACCELERATION_STRUCTURE_STORAGE_BIT;
-						params.size = asSizes[i];
-						smart_refctd_ptr<IGPUBuffer> asBuffer = createBuffer(params);
-
-						IGPUBottomLevelAccelerationStructure::SCreationParams blasParams;
-						blasParams.bufferRange.buffer = asBuffer;
-						blasParams.bufferRange.offset = 0u;
-						blasParams.bufferRange.size = asSizes[i];
-						blasParams.flags = IGPUBottomLevelAccelerationStructure::SCreationParams::FLAGS::NONE;
-						gpuBlas[i] = m_device->createBottomLevelAccelerationStructure(std::move(blasParams));
-						if (!gpuBlas[i])
-							return logFail("Could not create compacted BLAS");
-					}
-
-					IGPUBottomLevelAccelerationStructure::CopyInfo copyInfo;
-					copyInfo.src = cleanupBlas[i].get();
-					copyInfo.dst = gpuBlas[i].get();
-					copyInfo.mode = IGPUBottomLevelAccelerationStructure::COPY_MODE::COMPACT;
-					if (!cmdbufCompact->copyAccelerationStructure(copyInfo))
-						return logFail("Failed to copy AS to compact");
-				}
-			}
-
-			cmdbufCompact->endDebugMarker();
-			cmdbufSubmitAndWait(cmdbufCompact, queue, 40);
-
-			auto cmdbufTlas = getSingleUseCommandBufferAndBegin(pool);
-			cmdbufTlas->beginDebugMarker("Build TLAS");
-
-			// build top level AS
-			{
-				const uint32_t instancesCount = objectsGpu.size();
-				IGPUTopLevelAccelerationStructure::DeviceStaticInstance instances[OT_COUNT];
-				for (uint32_t i = 0; i < instancesCount; i++)
-				{
-					core::matrix3x4SIMD transform;
-					transform.setTranslation(nbl::core::vectorSIMDf(5.f * i, 0, 0, 0));
-					instances[i].base.blas.deviceAddress = gpuBlas[i]->getReferenceForDeviceOperations().deviceAddress;
-					instances[i].base.mask = 0xFF;
-					instances[i].base.instanceCustomIndex = i;
-					instances[i].base.instanceShaderBindingTableRecordOffset = 0;
-					instances[i].base.flags = static_cast<uint32_t>(IGPUTopLevelAccelerationStructure::INSTANCE_FLAGS::TRIANGLE_FACING_CULL_DISABLE_BIT);
-					instances[i].transform = transform;
-				}
-
-				{
-					size_t bufSize = instancesCount * sizeof(IGPUTopLevelAccelerationStructure::DeviceStaticInstance);
-					IGPUBuffer::SCreationParams params;
-					params.usage = bitflag(IGPUBuffer::EUF_ACCELERATION_STRUCTURE_BUILD_INPUT_READ_ONLY_BIT) | IGPUBuffer::EUF_STORAGE_BUFFER_BIT |
-						IGPUBuffer::EUF_INLINE_UPDATE_VIA_CMDBUF | IGPUBuffer::EUF_TRANSFER_DST_BIT | IGPUBuffer::EUF_SHADER_DEVICE_ADDRESS_BIT;
-					params.size = bufSize;
-					instancesBuffer = createBuffer(params);
-
-					SBufferRange<IGPUBuffer> range = { .offset = 0u, .size = bufSize, .buffer = instancesBuffer };
-					cmdbufTlas->updateBuffer(range, instances);
-				}
-
-				// make sure instances upload complete first
-				{
-					SMemoryBarrier memBarrier;
-					memBarrier.srcStageMask = PIPELINE_STAGE_FLAGS::ALL_TRANSFER_BITS;
-					memBarrier.srcAccessMask = ACCESS_FLAGS::TRANSFER_WRITE_BIT;
-					memBarrier.dstStageMask = PIPELINE_STAGE_FLAGS::ACCELERATION_STRUCTURE_BUILD_BIT;
-					memBarrier.dstAccessMask = ACCESS_FLAGS::ACCELERATION_STRUCTURE_WRITE_BIT;
-					cmdbufTlas->pipelineBarrier(E_DEPENDENCY_FLAGS::EDF_NONE, { .memBarriers = {&memBarrier, 1} });
-				}
-
-				auto tlasFlags = bitflag(IGPUTopLevelAccelerationStructure::BUILD_FLAGS::PREFER_FAST_TRACE_BIT);
-
-				IGPUTopLevelAccelerationStructure::DeviceBuildInfo tlasBuildInfo;
-				tlasBuildInfo.buildFlags = tlasFlags;
-				tlasBuildInfo.srcAS = nullptr;
-				tlasBuildInfo.dstAS = nullptr;
-				tlasBuildInfo.instanceData.buffer = instancesBuffer;
-				tlasBuildInfo.instanceData.offset = 0u;
-				tlasBuildInfo.scratch = {};
-
-				auto buildSizes = m_device->getAccelerationStructureBuildSizes(tlasFlags, false, instancesCount);
-				if (!buildSizes)
-					return logFail("Failed to get TLAS build sizes");
-
-				{
-					IGPUBuffer::SCreationParams params;
-					params.usage = bitflag(IGPUBuffer::EUF_SHADER_DEVICE_ADDRESS_BIT) | IGPUBuffer::EUF_ACCELERATION_STRUCTURE_STORAGE_BIT;
-					params.size = buildSizes.accelerationStructureSize;
-					smart_refctd_ptr<IGPUBuffer> asBuffer = createBuffer(params);
-
-					IGPUTopLevelAccelerationStructure::SCreationParams tlasParams;
-					tlasParams.bufferRange.buffer = asBuffer;
-					tlasParams.bufferRange.offset = 0u;
-					tlasParams.bufferRange.size = buildSizes.accelerationStructureSize;
-					tlasParams.flags = IGPUTopLevelAccelerationStructure::SCreationParams::FLAGS::NONE;
-					gpuTlas = m_device->createTopLevelAccelerationStructure(std::move(tlasParams));
-					if (!gpuTlas)
-						return logFail("Could not create TLAS");
-				}
-
-				smart_refctd_ptr<IGPUBuffer> scratchBuffer;
-				{
-					IGPUBuffer::SCreationParams params;
-					params.usage = bitflag(IGPUBuffer::EUF_SHADER_DEVICE_ADDRESS_BIT) | IGPUBuffer::EUF_STORAGE_BUFFER_BIT;
-					params.size = buildSizes.buildScratchSize;
-					scratchBuffer = createBuffer(params);
-				}
-
-				tlasBuildInfo.dstAS = gpuTlas.get();
-				tlasBuildInfo.scratch.buffer = scratchBuffer;
-				tlasBuildInfo.scratch.offset = 0u;
-
-				IGPUTopLevelAccelerationStructure::BuildRangeInfo buildRangeInfo[1u];
-				buildRangeInfo[0].instanceCount = instancesCount;
-				buildRangeInfo[0].instanceByteOffset = 0u;
-				IGPUTopLevelAccelerationStructure::BuildRangeInfo* pRangeInfos;
-				pRangeInfos = &buildRangeInfo[0];
-
-				if (!cmdbufTlas->buildAccelerationStructures({ &tlasBuildInfo, 1 }, pRangeInfos))
-					return logFail("Failed to build TLAS");
-			}
-
-			cmdbufTlas->endDebugMarker();
-			cmdbufSubmitAndWait(cmdbufTlas, queue, 45);
-
 #ifdef TRY_BUILD_FOR_NGFX
 			{
 				const IQueue::SSubmitInfo::SSemaphoreInfo acquired[] = { {
@@ -1423,9 +893,8 @@ class RayQueryGeometryApp final : public examples::SimpleWindowedApplication, pu
 #endif
 			m_api->endCapture();
 
-			return true;
+			return bool(gpuTlas);
 		}
-#endif // TEST_ASSET_CONV_AS
 
 
 		smart_refctd_ptr<IWindow> m_window;
@@ -1442,18 +911,15 @@ class RayQueryGeometryApp final : public examples::SimpleWindowedApplication, pu
 		Camera camera = Camera(core::vectorSIMDf(0, 0, 0), core::vectorSIMDf(0, 0, 0), core::matrix4SIMD());
 		video::CDumbPresentationOracle oracle;
 
-		std::array<ReferenceObjectGpu, OT_COUNT> objectsGpu;
-
-		std::array<smart_refctd_ptr<IGPUBottomLevelAccelerationStructure>, OT_COUNT> gpuBlas;
+		// TODO: maybe convert the descriptor set from ICPU as well?
 		smart_refctd_ptr<IGPUTopLevelAccelerationStructure> gpuTlas;
-		smart_refctd_ptr<IGPUBuffer> instancesBuffer;
 
 		smart_refctd_ptr<IGPUBuffer> geometryInfoBuffer;
+		core::vector<smart_refctd_ptr<IGPUBuffer>> retainedBuffers;
 		smart_refctd_ptr<IGPUImage> outHDRImage;
 
 		smart_refctd_ptr<IGPUComputePipeline> renderPipeline;
 		smart_refctd_ptr<IGPUDescriptorSet> renderDs;
-		smart_refctd_ptr<IDescriptorPool> renderPool;
 
 		uint16_t gcIndex = {};
 

From cac9ea184c1eae6cb4279f6d8603b4f97de59699 Mon Sep 17 00:00:00 2001
From: devsh <devsh@devsh.eu>
Date: Thu, 22 May 2025 01:48:23 +0200
Subject: [PATCH 284/529] typo

---
 67_RayQueryGeometry/main.cpp | 1 -
 1 file changed, 1 deletion(-)

diff --git a/67_RayQueryGeometry/main.cpp b/67_RayQueryGeometry/main.cpp
index 1faeaf196..e096c1b71 100644
--- a/67_RayQueryGeometry/main.cpp
+++ b/67_RayQueryGeometry/main.cpp
@@ -786,7 +786,6 @@ class RayQueryGeometryApp final : public examples::SimpleWindowedApplication, pu
 				// assign gpu objects to output
 				gpuTlas = reservation.getGPUObjects<ICPUTopLevelAccelerationStructure>().front().value;
 				for (const auto& buffer : reservation.getGPUObjects<ICPUBuffer>())
-				if (buffer)
 					retainedBuffers.push_back(buffer.value);
 				for (uint32_t i = 0; i < objectsCpu.size(); i++)
 				{

From 42ab873ba3d698da396368cb2f8b545107ce8f77 Mon Sep 17 00:00:00 2001
From: Erfan Ahmadi <ahmadierfan99@gmail.com>
Date: Thu, 22 May 2025 07:53:43 +0400
Subject: [PATCH 285/529] streamed image copy

---
 62_CAD/DrawResourcesFiller.cpp | 125 ++++++++++++++++++++++++++++++++-
 62_CAD/DrawResourcesFiller.h   |  11 ++-
 62_CAD/Images.h                |   6 ++
 3 files changed, 137 insertions(+), 5 deletions(-)

diff --git a/62_CAD/DrawResourcesFiller.cpp b/62_CAD/DrawResourcesFiller.cpp
index 425834a99..4b1172847 100644
--- a/62_CAD/DrawResourcesFiller.cpp
+++ b/62_CAD/DrawResourcesFiller.cpp
@@ -497,8 +497,8 @@ bool DrawResourcesFiller::ensureGeoreferencedImageAvailability_AllocateIfNeeded(
 					
 					// instead of erasing and inserting the imageID into the cache, we just reset it, so the next block of code goes into array index allocation + creating our new image
 					*cachedImageRecord = CachedImageRecord(currentFrameIndex);
-					// imagesUsageCache->erase(imageID);
-					// cachedImageRecord = imagesUsageCache->insert(imageID, intendedNextSubmit.getFutureScratchSemaphore().value, evictCallback);
+					// imagesCache->erase(imageID);
+					// cachedImageRecord = imagesCache->insert(imageID, intendedNextSubmit.getFutureScratchSemaphore().value, evictCallback);
 				}
 			}
 			else
@@ -756,6 +756,7 @@ bool DrawResourcesFiller::pushAllUploads(SIntendedSubmitInfo& intendedNextSubmit
 
 		success &= bindImagesToArrayIndices(*imagesCache);
 		success &= pushStaticImagesUploads(intendedNextSubmit, *imagesCache);
+		// Streamed uploads in cache&replay?!
 	}
 	else
 	{
@@ -764,6 +765,7 @@ bool DrawResourcesFiller::pushAllUploads(SIntendedSubmitInfo& intendedNextSubmit
 		success &= pushMSDFImagesUploads(intendedNextSubmit, msdfImagesState);
 		success &= bindImagesToArrayIndices(*imagesCache);
 		success &= pushStaticImagesUploads(intendedNextSubmit, *imagesCache);
+		success &= pushStreamedImagesUploads(intendedNextSubmit);
 	}
 	return success;
 }
@@ -1225,6 +1227,125 @@ bool DrawResourcesFiller::pushStaticImagesUploads(SIntendedSubmitInfo& intendedN
 	return success;
 }
 
+bool DrawResourcesFiller::pushStreamedImagesUploads(SIntendedSubmitInfo& intendedNextSubmit)
+{
+	bool success = true;
+
+	if (streamedImageCopies.size() > 0ull)
+	{
+		auto* device = m_utilities->getLogicalDevice();
+		auto* cmdBuffInfo = intendedNextSubmit.getCommandBufferForRecording();
+	
+		if (cmdBuffInfo)
+		{
+			IGPUCommandBuffer* commandBuffer = cmdBuffInfo->cmdbuf;
+
+			std::vector<IGPUCommandBuffer::SPipelineBarrierDependencyInfo::image_barrier_t> beforeCopyImageBarriers;
+			beforeCopyImageBarriers.reserve(streamedImageCopies.size());
+
+			// Pipeline Barriers before imageCopy
+			for (auto& [imageID, imageCopy] : streamedImageCopies)
+			{
+				auto* imageRecord = imagesCache->peek(imageID);
+				if (imageRecord == nullptr)
+					continue;
+
+				const auto& gpuImg = imageRecord->gpuImageView->getCreationParameters().image;
+
+				beforeCopyImageBarriers.push_back(
+					{
+						.barrier = {
+							.dep = {
+								.srcStageMask = PIPELINE_STAGE_FLAGS::NONE, // previous top of pipe -> top_of_pipe in first scope = none
+								.srcAccessMask = ACCESS_FLAGS::NONE,
+								.dstStageMask = PIPELINE_STAGE_FLAGS::COPY_BIT,
+								.dstAccessMask = ACCESS_FLAGS::TRANSFER_WRITE_BIT,
+							}
+							// .ownershipOp. No queueFam ownership transfer
+						},
+						.image = gpuImg.get(),
+						.subresourceRange = {
+							.aspectMask = IImage::E_ASPECT_FLAGS::EAF_COLOR_BIT,
+							.baseMipLevel = 0u,
+							.levelCount = ICPUImageView::remaining_mip_levels,
+							.baseArrayLayer = 0u,
+							.layerCount = ICPUImageView::remaining_array_layers
+						},
+						.oldLayout = IImage::LAYOUT::UNDEFINED,
+						.newLayout = IImage::LAYOUT::TRANSFER_DST_OPTIMAL,
+					});
+			}
+			success &= commandBuffer->pipelineBarrier(E_DEPENDENCY_FLAGS::EDF_NONE, { .imgBarriers = beforeCopyImageBarriers });
+			
+			for (auto& [imageID, imageCopy] : streamedImageCopies)
+			{
+				auto* imageRecord = imagesCache->peek(imageID);
+				if (imageRecord == nullptr)
+					continue;
+
+				const auto& gpuImg = imageRecord->gpuImageView->getCreationParameters().image;
+
+				success &= m_utilities->updateImageViaStagingBuffer(
+					intendedNextSubmit,
+					imageCopy.srcBuffer->getPointer(), gpuImg->getCreationParameters().format,
+					gpuImg.get(), IImage::LAYOUT::TRANSFER_DST_OPTIMAL,
+					{ &imageCopy.region, 1u });
+			}
+
+			commandBuffer = intendedNextSubmit.getCommandBufferForRecording()->cmdbuf; // overflow-submit in utilities calls might've cause current recording command buffer to change
+
+			std::vector<IGPUCommandBuffer::SPipelineBarrierDependencyInfo::image_barrier_t> afterCopyImageBarriers;
+			afterCopyImageBarriers.reserve(streamedImageCopies.size());
+
+			// Pipeline Barriers before imageCopy
+			for (auto& [imageID, imageCopy] : streamedImageCopies)
+			{
+				auto* imageRecord = imagesCache->peek(imageID);
+				if (imageRecord == nullptr)
+					continue;
+
+				const auto& gpuImg = imageRecord->gpuImageView->getCreationParameters().image;
+
+				afterCopyImageBarriers.push_back (
+					{
+						.barrier = {
+							.dep = {
+								.srcStageMask = PIPELINE_STAGE_FLAGS::COPY_BIT, // previous top of pipe -> top_of_pipe in first scope = none
+								.srcAccessMask = ACCESS_FLAGS::TRANSFER_WRITE_BIT,
+								.dstStageMask = PIPELINE_STAGE_FLAGS::FRAGMENT_SHADER_BIT,
+								.dstAccessMask = ACCESS_FLAGS::SHADER_READ_BITS,
+							}
+							// .ownershipOp. No queueFam ownership transfer
+						},
+						.image = gpuImg.get(),
+						.subresourceRange = {
+							.aspectMask = IImage::E_ASPECT_FLAGS::EAF_COLOR_BIT,
+							.baseMipLevel = 0u,
+							.levelCount = ICPUImageView::remaining_mip_levels,
+							.baseArrayLayer = 0u,
+							.layerCount = ICPUImageView::remaining_array_layers
+						},
+						.oldLayout = IImage::LAYOUT::TRANSFER_DST_OPTIMAL,
+						.newLayout = IImage::LAYOUT::READ_ONLY_OPTIMAL,
+					});
+			}
+			success &= commandBuffer->pipelineBarrier(E_DEPENDENCY_FLAGS::EDF_NONE, { .imgBarriers = afterCopyImageBarriers });
+		}
+		else
+		{
+			_NBL_DEBUG_BREAK_IF(true);
+			success = false;
+		}
+	}
+
+	if (!success)
+	{
+		// TODO: Log
+		_NBL_DEBUG_BREAK_IF(true);
+	}
+	return success;
+}
+
 const size_t DrawResourcesFiller::calculateRemainingResourcesSize() const
 {
 	assert(resourcesGPUBuffer->getSize() >= resourcesCollection.calculateTotalConsumption());
diff --git a/62_CAD/DrawResourcesFiller.h b/62_CAD/DrawResourcesFiller.h
index 4faa3fecc..6ece66de3 100644
--- a/62_CAD/DrawResourcesFiller.h
+++ b/62_CAD/DrawResourcesFiller.h
@@ -237,7 +237,7 @@ struct DrawResourcesFiller
 	 *       If an eviction is required and the evicted image is scheduled to be used in the next submit, it triggers
 	 *       a flush of pending draws to preserve correctness.
 	 *
-	 * @note The function uses the `imagesUsageCache` LRU cache to track usage and validity of texture slots.
+	 * @note The function uses the `imagesCache` LRU cache to track usage and validity of texture slots.
 	 *       If an insertion leads to an eviction, a callback ensures proper deallocation and synchronization.
 	 * @return true if the image was successfully cached and is ready for use; false if allocation failed.
 	*/
@@ -417,8 +417,11 @@ struct DrawResourcesFiller
 	/// @brief binds cached images into their correct descriptor set slot if not already resident.
 	bool bindImagesToArrayIndices(ImagesCache& imagesCache);
 
-	/// @brief Records GPU copy commands for all staged images into the active command buffer, and binds them into correct descriptor set slot.
+	/// @brief Records GPU copy commands for all staged images into the active command buffer.
 	bool pushStaticImagesUploads(SIntendedSubmitInfo& intendedNextSubmit, ImagesCache& imagesCache);
+	
+	/// @brief copies the queued up streamed copies.
+	bool pushStreamedImagesUploads(SIntendedSubmitInfo& intendedNextSubmit);
 
 	const size_t calculateRemainingResourcesSize() const;
 
@@ -550,7 +553,7 @@ struct DrawResourcesFiller
 	 * @param[out] outImageParams Structure to be filled with image creation parameters (format, size, etc.).
 	 * @param[out] outImageType Indicates whether the image should be fully resident or streamed.
 	 * @param[in] georeferencedImageParams Parameters describing the full image extents, viewport extents, and format.
-	 */
+	*/
 	void determineGeoreferencedImageCreationParams(nbl::asset::IImage::SCreationParams& outImageParams, ImageType& outImageType, const GeoreferencedImageParams& georeferencedImageParams);
 
 	void resetMainObjects()
@@ -736,5 +739,7 @@ struct DrawResourcesFiller
 	std::unique_ptr<ImagesCache> imagesCache;
 	smart_refctd_ptr<SubAllocatedDescriptorSet> suballocatedDescriptorSet;
 	uint32_t imagesArrayBinding = 0u;
+
+	std::unordered_map<image_id, StreamedImageCopy> streamedImageCopies;
 };
 
diff --git a/62_CAD/Images.h b/62_CAD/Images.h
index d525a68f6..e43c72fd2 100644
--- a/62_CAD/Images.h
+++ b/62_CAD/Images.h
@@ -208,3 +208,9 @@ class ImagesCache : public core::ResizableLRUCache<image_id, CachedImageRecord>
 		base_t::erase(imageID);
 	}
 };
+
+struct StreamedImageCopy
+{
+	core::smart_refctd_ptr<ICPUBuffer> srcBuffer; // Make it 'std::future' later?
+	asset::IImage::SBufferCopy region;
+};

From b3dd4099d394f76d867bc60e068dcb5280fa7f23 Mon Sep 17 00:00:00 2001
From: Erfan Ahmadi <ahmadierfan99@gmail.com>
Date: Thu, 22 May 2025 10:07:24 +0400
Subject: [PATCH 286/529] static images improvements, promoting from the get-go

---
 62_CAD/main.cpp | 204 ++++++++++++++++++++++++++++++++++++++++++------
 1 file changed, 179 insertions(+), 25 deletions(-)

diff --git a/62_CAD/main.cpp b/62_CAD/main.cpp
index c59669fa6..269b037be 100644
--- a/62_CAD/main.cpp
+++ b/62_CAD/main.cpp
@@ -277,6 +277,87 @@ class CSwapchainResources : public ISimpleManagedSurface::ISwapchainResources
 		std::array<core::smart_refctd_ptr<IGPUFramebuffer>,ISwapchain::MaxImages> m_framebuffers;
 };
 
+
+// TODO: Move this funcitons that help with creating a new promoted CPUImage
+template<unsigned int SRC_CHANNELS>
+struct PromotionComponentSwizzle
+{
+    template<typename InT, typename OutT>
+    void operator()(const InT* in, OutT* out) const
+    {
+        using in_t = std::conditional_t<std::is_void_v<InT>, uint64_t, InT>;
+        using out_t = std::conditional_t<std::is_void_v<OutT>, uint64_t, OutT>;
+
+        reinterpret_cast<out_t*>(out)[0u] = reinterpret_cast<const in_t*>(in)[0u];
+
+        if constexpr (SRC_CHANNELS > 1)
+            reinterpret_cast<out_t*>(out)[1u] = reinterpret_cast<const in_t*>(in)[1u];
+        else
+            reinterpret_cast<out_t*>(out)[1u] = static_cast<in_t>(0);
+
+        if constexpr (SRC_CHANNELS > 2)
+            reinterpret_cast<out_t*>(out)[2u] = reinterpret_cast<const in_t*>(in)[2u];
+        else
+            reinterpret_cast<out_t*>(out)[2u] = static_cast<in_t>(0);
+
+        if constexpr (SRC_CHANNELS > 3)
+            reinterpret_cast<out_t*>(out)[3u] = reinterpret_cast<const in_t*>(in)[3u];
+        else
+            reinterpret_cast<out_t*>(out)[3u] = static_cast<in_t>(1);
+    }
+};
+template<typename Filter>
+bool performCopyUsingImageFilter(
+    const core::smart_refctd_ptr<asset::ICPUImage>& inCPUImage,
+    const core::smart_refctd_ptr<asset::ICPUImage>& outCPUImage)
+{
+    Filter filter;
+
+	const uint32_t mipLevels = inCPUImage->getCreationParameters().mipLevels;
+	
+	for (uint32_t level = 0u; level < mipLevels; ++level)
+	{
+		const auto regions = inCPUImage->getRegions(level);
+
+		for (auto& region : regions)
+		{
+			typename Filter::state_type state = {};
+			state.extent = region.imageExtent;
+			state.layerCount = region.imageSubresource.layerCount;
+			state.inImage = inCPUImage.get();
+			state.outImage = outCPUImage.get();
+			state.inOffsetBaseLayer = core::vectorSIMDu32(region.imageOffset.x,  region.imageOffset.y, region.imageOffset.z, region.imageSubresource.baseArrayLayer);
+			state.outOffsetBaseLayer = core::vectorSIMDu32(0u);
+			state.inMipLevel = region.imageSubresource.mipLevel;
+			state.outMipLevel = region.imageSubresource.mipLevel;
+
+			if (!filter.execute(core::execution::par_unseq, &state))
+				return false;
+		}
+	}
+	return true;
+}
+
+bool performImageFormatPromotionCopy(const core::smart_refctd_ptr<asset::ICPUImage>& inCPUImage, const core::smart_refctd_ptr<asset::ICPUImage>& outCPUImage)
+{
+	asset::E_FORMAT srcImageFormat = inCPUImage->getCreationParameters().format;
+	asset::E_FORMAT dstImageFormat = outCPUImage->getCreationParameters().format;
+
+	// In = srcData, Out = stagingBuffer
+	if (srcImageFormat == dstImageFormat)
+		return false;
+
+    auto srcChannelCount = asset::getFormatChannelCount(srcImageFormat);
+    if (srcChannelCount == 1u)
+        return performCopyUsingImageFilter<asset::CSwizzleAndConvertImageFilter<asset::EF_UNKNOWN, asset::EF_UNKNOWN, PromotionComponentSwizzle<1u>>>(inCPUImage, outCPUImage);
+    else if (srcChannelCount == 2u)
+        return performCopyUsingImageFilter<asset::CSwizzleAndConvertImageFilter<asset::EF_UNKNOWN, asset::EF_UNKNOWN, PromotionComponentSwizzle<2u>>>(inCPUImage, outCPUImage);
+    else if (srcChannelCount == 3u)
+        return performCopyUsingImageFilter<asset::CSwizzleAndConvertImageFilter<asset::EF_UNKNOWN, asset::EF_UNKNOWN, PromotionComponentSwizzle<3u>>>(inCPUImage, outCPUImage);
+    else
+        return performCopyUsingImageFilter<asset::CSwizzleAndConvertImageFilter<asset::EF_UNKNOWN, asset::EF_UNKNOWN, PromotionComponentSwizzle<4u>>>(inCPUImage, outCPUImage);
+}
+
 class ComputerAidedDesign final : public examples::SimpleWindowedApplication, public application_templates::MonoAssetManagerAndBuiltinResourceApplication
 {
 	using device_base_t = examples::SimpleWindowedApplication;
@@ -388,22 +469,44 @@ class ComputerAidedDesign final : public examples::SimpleWindowedApplication, pu
 			}
 		}
 
-		IGPUSampler::SParams samplerParams = {};
-		samplerParams.TextureWrapU = IGPUSampler::E_TEXTURE_CLAMP::ETC_CLAMP_TO_BORDER;
-		samplerParams.TextureWrapV = IGPUSampler::E_TEXTURE_CLAMP::ETC_CLAMP_TO_BORDER;
-		samplerParams.TextureWrapW = IGPUSampler::E_TEXTURE_CLAMP::ETC_CLAMP_TO_BORDER;
-		samplerParams.BorderColor  = IGPUSampler::ETBC_FLOAT_OPAQUE_WHITE; // positive means outside shape
-		samplerParams.MinFilter		= IGPUSampler::ETF_LINEAR;
-		samplerParams.MaxFilter		= IGPUSampler::ETF_LINEAR;
-		samplerParams.MipmapMode	= IGPUSampler::ESMM_LINEAR;
-		samplerParams.AnisotropicFilter = 3;
-		samplerParams.CompareEnable = false;
-		samplerParams.CompareFunc = ECO_GREATER;
-		samplerParams.LodBias = 0.f;
-		samplerParams.MinLod = -1000.f;
-		samplerParams.MaxLod = 1000.f;
-		msdfTextureSampler = m_device->createSampler(samplerParams);
-	
+		// MSDF Image Sampler
+		{
+			IGPUSampler::SParams samplerParams = {};
+			samplerParams.TextureWrapU = IGPUSampler::E_TEXTURE_CLAMP::ETC_CLAMP_TO_BORDER;
+			samplerParams.TextureWrapV = IGPUSampler::E_TEXTURE_CLAMP::ETC_CLAMP_TO_BORDER;
+			samplerParams.TextureWrapW = IGPUSampler::E_TEXTURE_CLAMP::ETC_CLAMP_TO_BORDER;
+			samplerParams.BorderColor = IGPUSampler::ETBC_FLOAT_OPAQUE_WHITE; // positive means outside shape
+			samplerParams.MinFilter = IGPUSampler::ETF_LINEAR;
+			samplerParams.MaxFilter = IGPUSampler::ETF_LINEAR;
+			samplerParams.MipmapMode = IGPUSampler::ESMM_LINEAR;
+			samplerParams.AnisotropicFilter = 3;
+			samplerParams.CompareEnable = false;
+			samplerParams.CompareFunc = ECO_GREATER;
+			samplerParams.LodBias = 0.f;
+			samplerParams.MinLod = -1000.f;
+			samplerParams.MaxLod = 1000.f;
+			msdfImageSampler = m_device->createSampler(samplerParams);
+		}
+		
+		// Static Image Sampler
+		{
+			IGPUSampler::SParams samplerParams = {};
+			samplerParams.TextureWrapU = IGPUSampler::E_TEXTURE_CLAMP::ETC_MIRROR;
+			samplerParams.TextureWrapV = IGPUSampler::E_TEXTURE_CLAMP::ETC_MIRROR;
+			samplerParams.TextureWrapW = IGPUSampler::E_TEXTURE_CLAMP::ETC_MIRROR;
+			samplerParams.BorderColor = IGPUSampler::ETBC_FLOAT_TRANSPARENT_BLACK;
+			samplerParams.MinFilter = IGPUSampler::ETF_LINEAR;
+			samplerParams.MaxFilter = IGPUSampler::ETF_LINEAR;
+			samplerParams.MipmapMode = IGPUSampler::ESMM_LINEAR;
+			samplerParams.AnisotropicFilter = 3;
+			samplerParams.CompareEnable = false;
+			samplerParams.CompareFunc = ECO_GREATER;
+			samplerParams.LodBias = 0.f;
+			samplerParams.MinLod = -1000.f;
+			samplerParams.MaxLod = 1000.f;
+			staticImageSampler = m_device->createSampler(samplerParams);
+		}
+
 		// Initial Pipeline Transitions and Clearing of PseudoStencil and ColorStorage
 		// Recorded to Temporary CommandBuffer, Submitted to Graphics Queue, and Blocked on here
 		{
@@ -746,10 +849,10 @@ class ComputerAidedDesign final : public examples::SimpleWindowedApplication, pu
 				descriptorInfosSet0[0u].desc = m_globalsBuffer;
 
 				descriptorInfosSet0[1u].info.combinedImageSampler.imageLayout = IImage::LAYOUT::READ_ONLY_OPTIMAL;
-				descriptorInfosSet0[1u].info.combinedImageSampler.sampler = msdfTextureSampler;
+				descriptorInfosSet0[1u].info.combinedImageSampler.sampler = msdfImageSampler;
 				descriptorInfosSet0[1u].desc = drawResourcesFiller.getMSDFsTextureArray();
 				
-				descriptorInfosSet0[2u].desc = msdfTextureSampler; // TODO[Erfan]: different sampler and make immutable?
+				descriptorInfosSet0[2u].desc = staticImageSampler; // TODO[Erfan]: different sampler and make immutable?
 				
 				// This is bindless to we write to it later.
 				// descriptorInfosSet0[3u].info.image.imageLayout = IImage::LAYOUT::READ_ONLY_OPTIMAL;
@@ -1094,8 +1197,58 @@ class ComputerAidedDesign final : public examples::SimpleWindowedApplication, pu
 				m_logger->log("Failed to load ICPUImage or ICPUImageView got some other Asset Type, skipping!", ILogger::ELL_ERROR);
 			}
 
-			const auto cpuImage = cpuImgView->getCreationParameters().image;
-			sampleImages.push_back(cpuImage);
+
+			const auto loadedCPUImage = cpuImgView->getCreationParameters().image;
+			const auto loadedCPUImageCreationParams = loadedCPUImage->getCreationParameters();
+
+			// Promoting the image to a format GPU supports. (so that updateImageViaStagingBuffer doesn't have to handle that each frame if overflow-submit needs to happen)
+			auto promotedCPUImageCreationParams = loadedCPUImage->getCreationParameters();
+			
+			promotedCPUImageCreationParams.usage |= IGPUImage::EUF_TRANSFER_DST_BIT|IGPUImage::EUF_SAMPLED_BIT;
+			// promote format because RGB8 and friends don't actually exist in HW
+			{
+				const IPhysicalDevice::SImageFormatPromotionRequest request = {
+					.originalFormat = promotedCPUImageCreationParams.format,
+					.usages = IPhysicalDevice::SFormatImageUsages::SUsage(promotedCPUImageCreationParams.usage)
+				};
+				promotedCPUImageCreationParams.format = m_physicalDevice->promoteImageFormat(request,video::IGPUImage::TILING::OPTIMAL);
+			}
+
+			if (loadedCPUImageCreationParams.format != promotedCPUImageCreationParams.format)
+			{
+				smart_refctd_ptr<ICPUImage> promotedCPUImage = ICPUImage::create(promotedCPUImageCreationParams);
+				core::rational<uint32_t> bytesPerPixel = asset::getBytesPerPixel(promotedCPUImageCreationParams.format);
+
+				const auto extent = loadedCPUImageCreationParams.extent;
+				const uint32_t mipLevels = loadedCPUImageCreationParams.mipLevels;
+				const uint32_t arrayLayers = loadedCPUImageCreationParams.arrayLayers;
+				
+				// Only supporting 1 mip, it's just for test..
+				const size_t byteSize = (bytesPerPixel * extent.width * extent.height * extent.depth * arrayLayers).getIntegerApprox(); // TODO: consider mips
+				ICPUBuffer::SCreationParams bufferCreationParams = {};
+				bufferCreationParams.size = byteSize;
+				smart_refctd_ptr<ICPUBuffer> promotedCPUImageBuffer = ICPUBuffer::create(std::move(bufferCreationParams));
+				
+				auto newRegions = core::make_refctd_dynamic_array<core::smart_refctd_dynamic_array<ICPUImage::SBufferCopy>>(1u);
+				ICPUImage::SBufferCopy& region = newRegions->front();
+				region.imageSubresource.aspectMask = IImage::E_ASPECT_FLAGS::EAF_COLOR_BIT;
+				region.imageSubresource.mipLevel = 0u; // TODO
+				region.imageSubresource.baseArrayLayer = 0u;
+				region.imageSubresource.layerCount = arrayLayers;
+				region.bufferOffset = 0u;
+				region.bufferRowLength = 0u;
+				region.bufferImageHeight = 0u;
+				region.imageOffset = { 0u, 0u, 0u };
+				region.imageExtent = extent;
+				promotedCPUImage->setBufferAndRegions(std::move(promotedCPUImageBuffer), newRegions);
+
+				performImageFormatPromotionCopy(loadedCPUImage, promotedCPUImage);
+				sampleImages.push_back(promotedCPUImage);
+			}
+			else
+			{
+				sampleImages.push_back(loadedCPUImage);
+			}
 		}
 
 		return true;
@@ -2928,11 +3081,11 @@ class ComputerAidedDesign final : public examples::SimpleWindowedApplication, pu
 			{
 				std::vector<float64_t2> linePoints;
 				linePoints.push_back({ 0.0, 0.0 });
-				linePoints.push_back({ 100.0, 0.0 });
-				linePoints.push_back({ 100.0, -100.0 });
+				linePoints.push_back({ 1.0, 0.0 });
+				linePoints.push_back({ 1.0, -1.0 });
 				polyline.addLinePoints(linePoints);
 			}
-			// drawResourcesFiller.drawPolyline(polyline, lineStyle, intendedNextSubmit);
+			drawResourcesFiller.drawPolyline(polyline, lineStyle, intendedNextSubmit);
 		}
 		else if (mode == ExampleMode::CASE_8)
 		{
@@ -2985,7 +3138,7 @@ class ComputerAidedDesign final : public examples::SimpleWindowedApplication, pu
 				singleLineText->Draw(drawResourcesFiller, intendedNextSubmit, m_font.get(), float64_t2(0.0,-200.0), float32_t2(1.0, 1.0), rotation, float32_t4(1.0, 1.0, 1.0, 1.0), italicTiltAngle, 0.0f);
 				singleLineText->Draw(drawResourcesFiller, intendedNextSubmit, m_font.get(), float64_t2(0.0,-250.0), float32_t2(1.0, 1.0), rotation, float32_t4(1.0, 1.0, 1.0, 1.0), italicTiltAngle, 0.5f);
 				// singleLineText->Draw(drawResourcesFiller, intendedNextSubmit, float64_t2(0.0,-200.0), float32_t2(1.0, 1.0), nbl::core::PI<float>() * abs(cos(m_timeElapsed * 0.00005)));
-				// Smaller text to test mip maps
+				// Smaller text to test level maps
 				//singleLineText->Draw(drawResourcesFiller, intendedNextSubmit, float64_t2(0.0,-130.0), float32_t2(0.4, 0.4), rotation);
 				//singleLineText->Draw(drawResourcesFiller, intendedNextSubmit, float64_t2(0.0,-150.0), float32_t2(0.2, 0.2), rotation);
 			}
@@ -3482,7 +3635,8 @@ class ComputerAidedDesign final : public examples::SimpleWindowedApplication, pu
 	// pointer to one of the command buffer infos from above, this is the only command buffer used to record current submit in current frame, it will be updated by SIntendedSubmitInfo
 	IQueue::SSubmitInfo::SCommandBufferInfo const * m_currentRecordingCommandBufferInfo; // pointer can change, value cannot
 
-	smart_refctd_ptr<IGPUSampler>		msdfTextureSampler;
+	smart_refctd_ptr<IGPUSampler>		msdfImageSampler;
+	smart_refctd_ptr<IGPUSampler>		staticImageSampler;
 
 	smart_refctd_ptr<IGPUBuffer>		m_globalsBuffer;
 	smart_refctd_ptr<IGPUDescriptorSet>	descriptorSet0;

From 65fe2ab855133edd71a89c4d2cedcf07596c49f8 Mon Sep 17 00:00:00 2001
From: Erfan Ahmadi <ahmadierfan99@gmail.com>
Date: Thu, 22 May 2025 10:12:17 +0400
Subject: [PATCH 287/529] work on internal `streamedImageCopies`

---
 62_CAD/DrawResourcesFiller.cpp | 29 ++++++++++++++++++++---------
 62_CAD/DrawResourcesFiller.h   |  5 ++++-
 2 files changed, 24 insertions(+), 10 deletions(-)

diff --git a/62_CAD/DrawResourcesFiller.cpp b/62_CAD/DrawResourcesFiller.cpp
index 4b1172847..2449e8b05 100644
--- a/62_CAD/DrawResourcesFiller.cpp
+++ b/62_CAD/DrawResourcesFiller.cpp
@@ -63,7 +63,7 @@ void DrawResourcesFiller::allocateResourcesBuffer(ILogicalDevice* logicalDevice,
 		IDeviceMemoryAllocator::SAllocateInfo allocationInfo =
 		{
 			// TODO: Get from user side.
-			.size = 270 * 1024 * 1024, // 70 MB
+			.size = 65 * 1024 * 1024, // 70 MB
 			.flags = IDeviceMemoryAllocation::E_MEMORY_ALLOCATE_FLAGS::EMAF_NONE,
 			.memoryTypeIndex = memoryTypeIdx,
 			.dedication = nullptr,
@@ -575,6 +575,12 @@ bool DrawResourcesFiller::ensureGeoreferencedImageAvailability_AllocateIfNeeded(
 	return (cachedImageRecord->arrayIndex != InvalidTextureIndex);
 }
 
+bool DrawResourcesFiller::queueGeoreferencedImageCopy_Internal(image_id imageID, const StreamedImageCopy& imageCopy)
+{
+	auto& vec = streamedImageCopies[imageID];
+	vec.emplace_back(imageCopy);
+}
+
 // TODO[Przemek]: similar to other drawXXX and drawXXX_internal functions that create mainobjects, drawObjects and push additional info in geometry buffer, input to function would be a GridDTMInfo
 // We don't have an allocator or memory management for texture updates yet, see how `_test_addImageObject` is being temporarily used (Descriptor updates and pipeline barriers) to upload an image into gpu and update a descriptor slot (it will become more sophisticated but doesn't block you)
 void DrawResourcesFiller::drawGridDTM(
@@ -1244,7 +1250,7 @@ bool DrawResourcesFiller::pushStreamedImagesUploads(SIntendedSubmitInfo& intende
 			beforeCopyImageBarriers.reserve(streamedImageCopies.size());
 
 			// Pipeline Barriers before imageCopy
-			for (auto& [imageID, imageCopy] : streamedImageCopies)
+			for (auto& [imageID, imageCopies] : streamedImageCopies)
 			{
 				auto* imageRecord = imagesCache->peek(imageID);
 				if (imageRecord == nullptr)
@@ -1277,7 +1283,7 @@ bool DrawResourcesFiller::pushStreamedImagesUploads(SIntendedSubmitInfo& intende
 			}
 			success &= commandBuffer->pipelineBarrier(E_DEPENDENCY_FLAGS::EDF_NONE, { .imgBarriers = beforeCopyImageBarriers });
 			
-			for (auto& [imageID, imageCopy] : streamedImageCopies)
+			for (auto& [imageID, imageCopies] : streamedImageCopies)
 			{
 				auto* imageRecord = imagesCache->peek(imageID);
 				if (imageRecord == nullptr)
@@ -1285,11 +1291,14 @@ bool DrawResourcesFiller::pushStreamedImagesUploads(SIntendedSubmitInfo& intende
 
 				const auto& gpuImg = imageRecord->gpuImageView->getCreationParameters().image;
 
-				success &= m_utilities->updateImageViaStagingBuffer(
-					intendedNextSubmit,
-					imageCopy.srcBuffer->getPointer(), gpuImg->getCreationParameters().format,
-					gpuImg.get(), IImage::LAYOUT::TRANSFER_DST_OPTIMAL,
-					{ &imageCopy.region, 1u });
+				for (auto& imageCopy : imageCopies)
+				{
+					success &= m_utilities->updateImageViaStagingBuffer(
+						intendedNextSubmit,
+						imageCopy.srcBuffer->getPointer(), gpuImg->getCreationParameters().format,
+						gpuImg.get(), IImage::LAYOUT::TRANSFER_DST_OPTIMAL,
+						{ &imageCopy.region, 1u });
+				}
 			}
 
 			commandBuffer = intendedNextSubmit.getCommandBufferForRecording()->cmdbuf; // overflow-submit in utilities calls might've cause current recording command buffer to change
@@ -1298,7 +1307,7 @@ bool DrawResourcesFiller::pushStreamedImagesUploads(SIntendedSubmitInfo& intende
 			afterCopyImageBarriers.reserve(streamedImageCopies.size());
 
 			// Pipeline Barriers before imageCopy
-			for (auto& [imageID, imageCopy] : streamedImageCopies)
+			for (auto& [imageID, imageCopies] : streamedImageCopies)
 			{
 				auto* imageRecord = imagesCache->peek(imageID);
 				if (imageRecord == nullptr)
@@ -1330,6 +1339,8 @@ bool DrawResourcesFiller::pushStreamedImagesUploads(SIntendedSubmitInfo& intende
 					});
 			}
 			success &= commandBuffer->pipelineBarrier(E_DEPENDENCY_FLAGS::EDF_NONE, { .imgBarriers = afterCopyImageBarriers });
+
+			streamedImageCopies.clear();
 		}
 		else
 		{
diff --git a/62_CAD/DrawResourcesFiller.h b/62_CAD/DrawResourcesFiller.h
index 6ece66de3..520f984a7 100644
--- a/62_CAD/DrawResourcesFiller.h
+++ b/62_CAD/DrawResourcesFiller.h
@@ -264,6 +264,9 @@ struct DrawResourcesFiller
 	 */
 	bool ensureGeoreferencedImageAvailability_AllocateIfNeeded(image_id imageID, const GeoreferencedImageParams& params, SIntendedSubmitInfo& intendedNextSubmit);
 
+	// [TODO]: should be internal protected member function.
+	bool queueGeoreferencedImageCopy_Internal(image_id imageID, const StreamedImageCopy& imageCopy);
+
 	// This function must be called immediately after `addStaticImage` for the same imageID.
 	void addImageObject(image_id imageID, const OrientedBoundingBox2D& obb, SIntendedSubmitInfo& intendedNextSubmit);
 	
@@ -740,6 +743,6 @@ struct DrawResourcesFiller
 	smart_refctd_ptr<SubAllocatedDescriptorSet> suballocatedDescriptorSet;
 	uint32_t imagesArrayBinding = 0u;
 
-	std::unordered_map<image_id, StreamedImageCopy> streamedImageCopies;
+	std::unordered_map<image_id, std::vector<StreamedImageCopy>> streamedImageCopies;
 };
 

From 2a991a95c7eb891e616aa8e79a0b624a43217a86 Mon Sep 17 00:00:00 2001
From: keptsecret <sorchon@gmail.com>
Date: Thu, 22 May 2025 14:42:26 +0700
Subject: [PATCH 288/529] combined headers between subgroup, workgroup stuff,
 restored spirv cache test

---
 .../app_resources/shaderCommon.hlsl           | 43 -----------
 .../app_resources/testSubgroup.comp.hlsl      | 41 ++++++++++
 .../app_resources/testWorkgroup.comp.hlsl     | 41 +++++++++-
 .../app_resources/workgroupCommon.hlsl        | 74 -------------------
 23_Arithmetic2UnitTest/main.cpp               | 49 +++++++++++-
 .../app_resources/benchmarkSubgroup.comp.hlsl | 17 ++---
 .../benchmarkWorkgroup.comp.hlsl              | 47 ++++++++++--
 .../app_resources/shaderCommon.hlsl           | 48 ++----------
 .../app_resources/workgroupCommon.hlsl        | 74 -------------------
 29_Arithmetic2Bench/main.cpp                  |  1 -
 10 files changed, 181 insertions(+), 254 deletions(-)
 delete mode 100644 23_Arithmetic2UnitTest/app_resources/workgroupCommon.hlsl
 delete mode 100644 29_Arithmetic2Bench/app_resources/workgroupCommon.hlsl

diff --git a/23_Arithmetic2UnitTest/app_resources/shaderCommon.hlsl b/23_Arithmetic2UnitTest/app_resources/shaderCommon.hlsl
index 05dcfb469..31d59121b 100644
--- a/23_Arithmetic2UnitTest/app_resources/shaderCommon.hlsl
+++ b/23_Arithmetic2UnitTest/app_resources/shaderCommon.hlsl
@@ -1,10 +1,5 @@
 #include "common.hlsl"
 
-#include "nbl/builtin/hlsl/glsl_compat/core.hlsl"
-#include "nbl/builtin/hlsl/subgroup/basic.hlsl"
-#include "nbl/builtin/hlsl/subgroup/arithmetic_portability.hlsl"
-#include "nbl/builtin/hlsl/subgroup2/arithmetic_portability.hlsl"
-
 #include "nbl/builtin/hlsl/jit/device_capabilities.hlsl"
 
 // https://github.com/microsoft/DirectXShaderCompiler/issues/6144
@@ -14,8 +9,6 @@ uint32_t3 nbl::hlsl::glsl::gl_WorkGroupSize() {return uint32_t3(WORKGROUP_SIZE,1
 #error "Define ITEMS_PER_INVOCATION!"
 #endif
 
-typedef vector<uint32_t, ITEMS_PER_INVOCATION> type_t;
-
 struct PushConstantData
 {
     uint64_t inputBufAddress;
@@ -36,39 +29,3 @@ bool canStore();
 #ifndef SUBGROUP_SIZE_LOG2
 #error "Define SUBGROUP_SIZE_LOG2!"
 #endif
-template<template<class> class binop, typename T, uint32_t N>
-static void subtest(NBL_CONST_REF_ARG(type_t) sourceVal)
-{
-    // TODO static assert vector<T, N> == type_t
-    //using type_t = vector<T, N>;
-    using config_t = nbl::hlsl::subgroup2::Configuration<SUBGROUP_SIZE_LOG2>;
-    using params_t = nbl::hlsl::subgroup2::ArithmeticParams<config_t, typename binop<T>::base_t, N, nbl::hlsl::jit::device_capabilities>;
-
-    const uint64_t outputBufAddr = vk::RawBufferLoad<uint64_t>(pc.outputAddressBufAddress + binop<T>::BindingIndex * sizeof(uint64_t), sizeof(uint64_t));
-
-    if (globalIndex()==0u)
-        vk::RawBufferStore<uint32_t>(outputBufAddr, nbl::hlsl::glsl::gl_SubgroupSize());
-
-    operation_t<params_t> func;
-    type_t val = func(sourceVal);
-    if (canStore())
-        vk::RawBufferStore<type_t>(outputBufAddr + sizeof(uint32_t) + sizeof(type_t) * globalIndex(), val, sizeof(uint32_t));
-}
-
-
-type_t test()
-{
-    const uint32_t idx = globalIndex();
-    type_t sourceVal = vk::RawBufferLoad<type_t>(pc.inputBufAddress + idx * sizeof(type_t));
-
-    subtest<bit_and, uint32_t, ITEMS_PER_INVOCATION>(sourceVal);
-    subtest<bit_xor, uint32_t, ITEMS_PER_INVOCATION>(sourceVal);
-    subtest<bit_or, uint32_t, ITEMS_PER_INVOCATION>(sourceVal);
-    subtest<plus, uint32_t, ITEMS_PER_INVOCATION>(sourceVal);
-    subtest<multiplies, uint32_t, ITEMS_PER_INVOCATION>(sourceVal);
-    subtest<minimum, uint32_t, ITEMS_PER_INVOCATION>(sourceVal);
-    subtest<maximum, uint32_t, ITEMS_PER_INVOCATION>(sourceVal);
-    return sourceVal;
-}
-
-#include "nbl/builtin/hlsl/workgroup/basic.hlsl"
diff --git a/23_Arithmetic2UnitTest/app_resources/testSubgroup.comp.hlsl b/23_Arithmetic2UnitTest/app_resources/testSubgroup.comp.hlsl
index 2cc1ccb60..c5a030851 100644
--- a/23_Arithmetic2UnitTest/app_resources/testSubgroup.comp.hlsl
+++ b/23_Arithmetic2UnitTest/app_resources/testSubgroup.comp.hlsl
@@ -2,7 +2,48 @@
 
 #define operation_t nbl::hlsl::OPERATION
 
+#include "nbl/builtin/hlsl/glsl_compat/core.hlsl"
+#include "nbl/builtin/hlsl/glsl_compat/subgroup_basic.hlsl"
+#include "nbl/builtin/hlsl/subgroup2/arithmetic_portability.hlsl"
+
 #include "shaderCommon.hlsl"
+#include "nbl/builtin/hlsl/workgroup/basic.hlsl"
+
+typedef vector<uint32_t, ITEMS_PER_INVOCATION> type_t;
+
+template<template<class> class binop, typename T, uint32_t N>
+static void subtest(NBL_CONST_REF_ARG(type_t) sourceVal)
+{
+    // TODO static assert vector<T, N> == type_t
+    //using type_t = vector<T, N>;
+    using config_t = nbl::hlsl::subgroup2::Configuration<SUBGROUP_SIZE_LOG2>;
+    using params_t = nbl::hlsl::subgroup2::ArithmeticParams<config_t, typename binop<T>::base_t, N, nbl::hlsl::jit::device_capabilities>;
+
+    const uint64_t outputBufAddr = vk::RawBufferLoad<uint64_t>(pc.outputAddressBufAddress + binop<T>::BindingIndex * sizeof(uint64_t), sizeof(uint64_t));
+
+    if (globalIndex()==0u)
+        vk::RawBufferStore<uint32_t>(outputBufAddr, nbl::hlsl::glsl::gl_SubgroupSize());
+
+    operation_t<params_t> func;
+    type_t val = func(sourceVal);
+    if (canStore())
+        vk::RawBufferStore<type_t>(outputBufAddr + sizeof(uint32_t) + sizeof(type_t) * globalIndex(), val, sizeof(uint32_t));
+}
+
+type_t test()
+{
+    const uint32_t idx = globalIndex();
+    type_t sourceVal = vk::RawBufferLoad<type_t>(pc.inputBufAddress + idx * sizeof(type_t));
+
+    subtest<bit_and, uint32_t, ITEMS_PER_INVOCATION>(sourceVal);
+    subtest<bit_xor, uint32_t, ITEMS_PER_INVOCATION>(sourceVal);
+    subtest<bit_or, uint32_t, ITEMS_PER_INVOCATION>(sourceVal);
+    subtest<plus, uint32_t, ITEMS_PER_INVOCATION>(sourceVal);
+    subtest<multiplies, uint32_t, ITEMS_PER_INVOCATION>(sourceVal);
+    subtest<minimum, uint32_t, ITEMS_PER_INVOCATION>(sourceVal);
+    subtest<maximum, uint32_t, ITEMS_PER_INVOCATION>(sourceVal);
+    return sourceVal;
+}
 
 uint32_t globalIndex()
 {
diff --git a/23_Arithmetic2UnitTest/app_resources/testWorkgroup.comp.hlsl b/23_Arithmetic2UnitTest/app_resources/testWorkgroup.comp.hlsl
index 58e293ba3..51f556797 100644
--- a/23_Arithmetic2UnitTest/app_resources/testWorkgroup.comp.hlsl
+++ b/23_Arithmetic2UnitTest/app_resources/testWorkgroup.comp.hlsl
@@ -1,6 +1,45 @@
 #pragma shader_stage(compute)
 
-#include "workgroupCommon.hlsl"
+#include "nbl/builtin/hlsl/glsl_compat/core.hlsl"
+#include "nbl/builtin/hlsl/glsl_compat/subgroup_basic.hlsl"
+#include "nbl/builtin/hlsl/subgroup2/arithmetic_portability.hlsl"
+#include "nbl/builtin/hlsl/workgroup2/arithmetic.hlsl"
+
+static const uint32_t WORKGROUP_SIZE = 1u << WORKGROUP_SIZE_LOG2;
+
+#include "shaderCommon.hlsl"
+
+using config_t = nbl::hlsl::workgroup2::ArithmeticConfiguration<WORKGROUP_SIZE_LOG2, SUBGROUP_SIZE_LOG2, ITEMS_PER_INVOCATION>;
+
+typedef vector<uint32_t, config_t::ItemsPerInvocation_0> type_t;
+
+// final (level 1/2) scan needs to fit in one subgroup exactly
+groupshared uint32_t scratch[config_t::ElementCount];
+
+struct ScratchProxy
+{
+    template<typename AccessType>
+    void get(const uint32_t ix, NBL_REF_ARG(AccessType) value)
+    {
+        value = scratch[ix];
+    }
+    template<typename AccessType>
+    void set(const uint32_t ix, const AccessType value)
+    {
+        scratch[ix] = value;
+    }
+
+    uint32_t atomicOr(const uint32_t ix, const uint32_t value)
+    {
+        return nbl::hlsl::glsl::atomicOr(scratch[ix],value);
+    }
+
+    void workgroupExecutionAndMemoryBarrier()
+    {
+        nbl::hlsl::glsl::barrier();
+        //nbl::hlsl::glsl::memoryBarrierShared(); implied by the above
+    }
+};
 
 template<class Config, class Binop>
 struct DataProxy
diff --git a/23_Arithmetic2UnitTest/app_resources/workgroupCommon.hlsl b/23_Arithmetic2UnitTest/app_resources/workgroupCommon.hlsl
deleted file mode 100644
index c02d86969..000000000
--- a/23_Arithmetic2UnitTest/app_resources/workgroupCommon.hlsl
+++ /dev/null
@@ -1,74 +0,0 @@
-#include "nbl/builtin/hlsl/workgroup/scratch_size.hlsl"
-
-#include "nbl/builtin/hlsl/workgroup2/arithmetic.hlsl"
-
-#include "nbl/builtin/hlsl/workgroup/broadcast.hlsl"
-
-#include "nbl/builtin/hlsl/glsl_compat/core.hlsl"
-#include "nbl/builtin/hlsl/subgroup/basic.hlsl"
-#include "nbl/builtin/hlsl/subgroup2/arithmetic_portability.hlsl"
-
-#include "nbl/builtin/hlsl/jit/device_capabilities.hlsl"
-
-#include "common.hlsl"
-
-static const uint32_t WORKGROUP_SIZE = 1u << WORKGROUP_SIZE_LOG2;
-
-// https://github.com/microsoft/DirectXShaderCompiler/issues/6144
-uint32_t3 nbl::hlsl::glsl::gl_WorkGroupSize() {return uint32_t3(WORKGROUP_SIZE,1,1);}
-
-#ifndef ITEMS_PER_INVOCATION
-#error "Define ITEMS_PER_INVOCATION!"
-#endif
-
-using config_t = nbl::hlsl::workgroup2::ArithmeticConfiguration<WORKGROUP_SIZE_LOG2, SUBGROUP_SIZE_LOG2, ITEMS_PER_INVOCATION>;
-
-typedef vector<uint32_t, config_t::ItemsPerInvocation_0> type_t;
-
-struct PushConstantData
-{
-    uint64_t inputBufAddress;
-    uint64_t outputAddressBufAddress;
-};
-
-[[vk::push_constant]] PushConstantData pc;
-
-// because subgroups don't match `gl_LocalInvocationIndex` snake curve addressing, we also can't load inputs that way
-uint32_t globalIndex();
-// since we test ITEMS_PER_WG<WorkgroupSize we need this so workgroups don't overwrite each other's outputs
-bool canStore();
-
-#ifndef OPERATION
-#error "Define OPERATION!"
-#endif
-#ifndef SUBGROUP_SIZE_LOG2
-#error "Define SUBGROUP_SIZE_LOG2!"
-#endif
-
-// final (level 1/2) scan needs to fit in one subgroup exactly
-groupshared uint32_t scratch[config_t::ElementCount];
-
-struct ScratchProxy
-{
-    template<typename AccessType>
-    void get(const uint32_t ix, NBL_REF_ARG(AccessType) value)
-    {
-        value = scratch[ix];
-    }
-    template<typename AccessType>
-    void set(const uint32_t ix, const AccessType value)
-    {
-        scratch[ix] = value;
-    }
-
-    uint32_t atomicOr(const uint32_t ix, const uint32_t value)
-    {
-        return nbl::hlsl::glsl::atomicOr(scratch[ix],value);
-    }
-
-    void workgroupExecutionAndMemoryBarrier()
-    {
-        nbl::hlsl::glsl::barrier();
-        //nbl::hlsl::glsl::memoryBarrierShared(); implied by the above
-    }
-};
diff --git a/23_Arithmetic2UnitTest/main.cpp b/23_Arithmetic2UnitTest/main.cpp
index 2edd34439..2daa772ae 100644
--- a/23_Arithmetic2UnitTest/main.cpp
+++ b/23_Arithmetic2UnitTest/main.cpp
@@ -124,10 +124,42 @@ class Workgroup2ScanTestApp final : public application_templates::BasicMultiQueu
 		// create Pipeline Layout
 		{
 			SPushConstantRange pcRange = { .stageFlags = IShader::E_SHADER_STAGE::ESS_COMPUTE, .offset = 0,.size = sizeof(PushConstantData) };
-
 			pipelineLayout = m_device->createPipelineLayout({&pcRange, 1});
 		}
 
+		const auto spirv_isa_cache_path = localOutputCWD / "spirv_isa_cache.bin";
+		// enclose to make sure file goes out of scope and we can reopen it
+		{
+			smart_refctd_ptr<const IFile> spirv_isa_cache_input;
+			// try to load SPIR-V to ISA cache
+			{
+				ISystem::future_t<smart_refctd_ptr<IFile>> fileCreate;
+				m_system->createFile(fileCreate, spirv_isa_cache_path, IFile::ECF_READ | IFile::ECF_MAPPABLE | IFile::ECF_COHERENT);
+				if (auto lock = fileCreate.acquire())
+					spirv_isa_cache_input = *lock;
+			}
+			// create the cache
+			{
+				std::span<const uint8_t> spirv_isa_cache_data = {};
+				if (spirv_isa_cache_input)
+					spirv_isa_cache_data = { reinterpret_cast<const uint8_t*>(spirv_isa_cache_input->getMappedPointer()),spirv_isa_cache_input->getSize() };
+				else
+					m_logger->log("Failed to load SPIR-V 2 ISA cache!", ILogger::ELL_PERFORMANCE);
+				// Normally we'd deserialize a `ICPUPipelineCache` properly and pass that instead
+				m_spirv_isa_cache = m_device->createPipelineCache(spirv_isa_cache_data);
+			}
+		}
+		{
+			// TODO: rename `deleteDirectory` to just `delete`? and a `IFile::setSize()` ?
+			m_system->deleteDirectory(spirv_isa_cache_path);
+			ISystem::future_t<smart_refctd_ptr<IFile>> fileCreate;
+			m_system->createFile(fileCreate, spirv_isa_cache_path, IFile::ECF_WRITE);
+			// I can be relatively sure I'll succeed to acquire the future, the pointer to created file might be null though.
+			m_spirv_isa_cache_output = *fileCreate.acquire();
+			if (!m_spirv_isa_cache_output)
+				logFail("Failed to Create SPIR-V to ISA cache file.");
+		}
+
 		// load shader source from file
 		auto getShaderSource = [&](const char* filePath) -> auto
 		{
@@ -192,6 +224,17 @@ class Workgroup2ScanTestApp final : public application_templates::BasicMultiQueu
 					logTestOutcome(passed, itemsPerWG);
 				}
 				m_api->endCapture();
+
+				// save cache every now and then	
+				{
+					auto cpu = m_spirv_isa_cache->convertToCPUCache();
+					// Normally we'd beautifully JSON serialize the thing, allow multiple devices & drivers + metadata
+					auto bin = cpu->getEntries().begin()->second.bin;
+					IFile::success_t success;
+					m_spirv_isa_cache_output->write(success, bin->data(), 0ull, bin->size());
+					if (!success)
+						logFail("Could not write Create SPIR-V to ISA cache to disk!");
+				}
 			}
 		}
 
@@ -238,7 +281,7 @@ class Workgroup2ScanTestApp final : public application_templates::BasicMultiQueu
 			.requireFullSubgroups = true
 		};
 		core::smart_refctd_ptr<IGPUComputePipeline> pipeline;
-		if (!m_device->createComputePipelines(nullptr,{&params,1},&pipeline))
+		if (!m_device->createComputePipelines(m_spirv_isa_cache.get(),{&params,1},&pipeline))
 			return nullptr;
 		return pipeline;
 	}
@@ -455,6 +498,8 @@ class Workgroup2ScanTestApp final : public application_templates::BasicMultiQueu
 
 	IQueue* transferDownQueue;
 	IQueue* computeQueue;
+	smart_refctd_ptr<IGPUPipelineCache> m_spirv_isa_cache;
+	smart_refctd_ptr<IFile> m_spirv_isa_cache_output;
 
 	uint32_t* inputData = nullptr;
 	constexpr static inline uint32_t OutputBufferCount = 8u;
diff --git a/29_Arithmetic2Bench/app_resources/benchmarkSubgroup.comp.hlsl b/29_Arithmetic2Bench/app_resources/benchmarkSubgroup.comp.hlsl
index e21d67fcb..cb033a5bb 100644
--- a/29_Arithmetic2Bench/app_resources/benchmarkSubgroup.comp.hlsl
+++ b/29_Arithmetic2Bench/app_resources/benchmarkSubgroup.comp.hlsl
@@ -2,10 +2,14 @@
 
 #define operation_t nbl::hlsl::OPERATION
 
+#include "nbl/builtin/hlsl/glsl_compat/core.hlsl"
+#include "nbl/builtin/hlsl/glsl_compat/subgroup_basic.hlsl"
+#include "nbl/builtin/hlsl/subgroup2/arithmetic_portability.hlsl"
+
 #include "shaderCommon.hlsl"
+#include "nbl/builtin/hlsl/workgroup/basic.hlsl"
 
-// NOTE added dummy output image to be able to profile with Nsight, which still doesn't support profiling headless compute shaders
-[[vk::binding(2, 0)]] RWTexture2D<float32_t4> outImage; // dummy
+typedef vector<uint32_t, ITEMS_PER_INVOCATION> type_t;
 
 uint32_t globalIndex()
 {
@@ -14,10 +18,6 @@ uint32_t globalIndex()
 
 bool canStore() {return true;}
 
-#ifndef NUM_LOOPS
-#error "Define NUM_LOOPS!"
-#endif
-
 template<template<class> class binop, typename T, uint32_t N>
 static void subbench(NBL_CONST_REF_ARG(type_t) sourceVal)
 {
@@ -32,9 +32,8 @@ static void subbench(NBL_CONST_REF_ARG(type_t) sourceVal)
     for (uint32_t i = 0; i < NUM_LOOPS; i++)
         value = func(value);
 
-    [unroll]
-    for (uint32_t i = 0; i < N; i++)
-        vk::RawBufferStore<uint32_t>(outputBufAddr+sizeof(uint32_t)+sizeof(type_t)*globalIndex()+i*sizeof(uint32_t), val[i]);
+    if (canStore())
+        vk::RawBufferStore<type_t>(outputBufAddr + sizeof(uint32_t) + sizeof(type_t) * globalIndex(), value, sizeof(uint32_t));
 }
 
 void benchmark()
diff --git a/29_Arithmetic2Bench/app_resources/benchmarkWorkgroup.comp.hlsl b/29_Arithmetic2Bench/app_resources/benchmarkWorkgroup.comp.hlsl
index 0194b2f75..8815eb037 100644
--- a/29_Arithmetic2Bench/app_resources/benchmarkWorkgroup.comp.hlsl
+++ b/29_Arithmetic2Bench/app_resources/benchmarkWorkgroup.comp.hlsl
@@ -1,9 +1,46 @@
 #pragma shader_stage(compute)
 
-#include "workgroupCommon.hlsl"
+#include "nbl/builtin/hlsl/glsl_compat/core.hlsl"
+#include "nbl/builtin/hlsl/glsl_compat/subgroup_basic.hlsl"
+#include "nbl/builtin/hlsl/subgroup2/arithmetic_portability.hlsl"
+#include "nbl/builtin/hlsl/workgroup2/arithmetic.hlsl"
+
+static const uint32_t WORKGROUP_SIZE = 1u << WORKGROUP_SIZE_LOG2;
+
+#include "shaderCommon.hlsl"
+
+using config_t = nbl::hlsl::workgroup2::ArithmeticConfiguration<WORKGROUP_SIZE_LOG2, SUBGROUP_SIZE_LOG2, ITEMS_PER_INVOCATION>;
+
+typedef vector<uint32_t, config_t::ItemsPerInvocation_0> type_t;
+
+// final (level 1/2) scan needs to fit in one subgroup exactly
+groupshared uint32_t scratch[config_t::ElementCount];
+
+struct ScratchProxy
+{
+    template<typename AccessType>
+    void get(const uint32_t ix, NBL_REF_ARG(AccessType) value)
+    {
+        value = scratch[ix];
+    }
+    template<typename AccessType>
+    void set(const uint32_t ix, const AccessType value)
+    {
+        scratch[ix] = value;
+    }
+
+    uint32_t atomicOr(const uint32_t ix, const uint32_t value)
+    {
+        return nbl::hlsl::glsl::atomicOr(scratch[ix],value);
+    }
+
+    void workgroupExecutionAndMemoryBarrier()
+    {
+        nbl::hlsl::glsl::barrier();
+        //nbl::hlsl::glsl::memoryBarrierShared(); implied by the above
+    }
+};
 
-// NOTE added dummy output image to be able to profile with Nsight, which still doesn't support profiling headless compute shaders
-[[vk::binding(2, 0)]] RWTexture2D<float32_t4> outImage; // dummy
 
 template<class Config, class Binop>
 struct DataProxy
@@ -48,10 +85,6 @@ struct operation_t
     }
 };
 
-#ifndef NUM_LOOPS
-#error "Define NUM_LOOPS!"
-#endif
-
 template<template<class> class binop, typename T, uint32_t N>
 static void subbench(NBL_CONST_REF_ARG(type_t) sourceVal)
 {
diff --git a/29_Arithmetic2Bench/app_resources/shaderCommon.hlsl b/29_Arithmetic2Bench/app_resources/shaderCommon.hlsl
index ae0f61f33..a14986e0d 100644
--- a/29_Arithmetic2Bench/app_resources/shaderCommon.hlsl
+++ b/29_Arithmetic2Bench/app_resources/shaderCommon.hlsl
@@ -1,10 +1,5 @@
 #include "common.hlsl"
 
-#include "nbl/builtin/hlsl/glsl_compat/core.hlsl"
-#include "nbl/builtin/hlsl/subgroup/basic.hlsl"
-#include "nbl/builtin/hlsl/subgroup/arithmetic_portability.hlsl"
-#include "nbl/builtin/hlsl/subgroup2/arithmetic_portability.hlsl"
-
 #include "nbl/builtin/hlsl/jit/device_capabilities.hlsl"
 
 // https://github.com/microsoft/DirectXShaderCompiler/issues/6144
@@ -14,8 +9,6 @@ uint32_t3 nbl::hlsl::glsl::gl_WorkGroupSize() {return uint32_t3(WORKGROUP_SIZE,1
 #error "Define ITEMS_PER_INVOCATION!"
 #endif
 
-typedef vector<uint32_t, ITEMS_PER_INVOCATION> type_t;
-
 struct PushConstantData
 {
     uint64_t inputBufAddress;
@@ -36,41 +29,10 @@ bool canStore();
 #ifndef SUBGROUP_SIZE_LOG2
 #error "Define SUBGROUP_SIZE_LOG2!"
 #endif
-template<template<class> class binop, typename T, uint32_t N>
-static void subtest(NBL_CONST_REF_ARG(type_t) sourceVal)
-{
-    // TODO static assert vector<T, N> == type_t
-    //using type_t = vector<T, N>;
-    using config_t = nbl::hlsl::subgroup2::Configuration<SUBGROUP_SIZE_LOG2>;
-    using params_t = nbl::hlsl::subgroup2::ArithmeticParams<config_t, typename binop<T>::base_t, N, nbl::hlsl::jit::device_capabilities>;
-
-    const uint64_t outputBufAddr = vk::RawBufferLoad<uint64_t>(pc.outputAddressBufAddress + binop<T>::BindingIndex * sizeof(uint64_t), sizeof(uint64_t));
-
-    if (globalIndex()==0u)
-        vk::RawBufferStore<uint32_t>(outputBufAddr, nbl::hlsl::glsl::gl_SubgroupSize());
-        
-    operation_t<params_t> func;
-    if (canStore())
-        [unroll]
-        for (uint32_t i = 0; i < N; i++)
-            vk::RawBufferStore<uint32_t>(outputBufAddr+sizeof(uint32_t)+sizeof(type_t)*globalIndex()+i*sizeof(uint32_t), val[i]);
-        // vk::RawBufferStore<dtype_t>(outputBufAddr + sizeof(uint32_t) + sizeof(dtype_t) * globalIndex(), value, sizeof(uint32_t)); TODO why won't this work???
-}
 
+#ifndef NUM_LOOPS
+#error "Define NUM_LOOPS!"
+#endif
 
-type_t test()
-{
-    const uint32_t idx = globalIndex();
-    type_t sourceVal = vk::RawBufferLoad<type_t>(pc.inputBufAddress + idx * sizeof(type_t));
-
-    subtest<bit_and, uint32_t, ITEMS_PER_INVOCATION>(sourceVal);
-    subtest<bit_xor, uint32_t, ITEMS_PER_INVOCATION>(sourceVal);
-    subtest<bit_or, uint32_t, ITEMS_PER_INVOCATION>(sourceVal);
-    subtest<plus, uint32_t, ITEMS_PER_INVOCATION>(sourceVal);
-    subtest<multiplies, uint32_t, ITEMS_PER_INVOCATION>(sourceVal);
-    subtest<minimum, uint32_t, ITEMS_PER_INVOCATION>(sourceVal);
-    subtest<maximum, uint32_t, ITEMS_PER_INVOCATION>(sourceVal);
-    return sourceVal;
-}
-
-#include "nbl/builtin/hlsl/workgroup/basic.hlsl"
+// NOTE added dummy output image to be able to profile with Nsight, which still doesn't support profiling headless compute shaders
+[[vk::binding(2, 0)]] RWTexture2D<float32_t4> outImage; // dummy
diff --git a/29_Arithmetic2Bench/app_resources/workgroupCommon.hlsl b/29_Arithmetic2Bench/app_resources/workgroupCommon.hlsl
deleted file mode 100644
index c02d86969..000000000
--- a/29_Arithmetic2Bench/app_resources/workgroupCommon.hlsl
+++ /dev/null
@@ -1,74 +0,0 @@
-#include "nbl/builtin/hlsl/workgroup/scratch_size.hlsl"
-
-#include "nbl/builtin/hlsl/workgroup2/arithmetic.hlsl"
-
-#include "nbl/builtin/hlsl/workgroup/broadcast.hlsl"
-
-#include "nbl/builtin/hlsl/glsl_compat/core.hlsl"
-#include "nbl/builtin/hlsl/subgroup/basic.hlsl"
-#include "nbl/builtin/hlsl/subgroup2/arithmetic_portability.hlsl"
-
-#include "nbl/builtin/hlsl/jit/device_capabilities.hlsl"
-
-#include "common.hlsl"
-
-static const uint32_t WORKGROUP_SIZE = 1u << WORKGROUP_SIZE_LOG2;
-
-// https://github.com/microsoft/DirectXShaderCompiler/issues/6144
-uint32_t3 nbl::hlsl::glsl::gl_WorkGroupSize() {return uint32_t3(WORKGROUP_SIZE,1,1);}
-
-#ifndef ITEMS_PER_INVOCATION
-#error "Define ITEMS_PER_INVOCATION!"
-#endif
-
-using config_t = nbl::hlsl::workgroup2::ArithmeticConfiguration<WORKGROUP_SIZE_LOG2, SUBGROUP_SIZE_LOG2, ITEMS_PER_INVOCATION>;
-
-typedef vector<uint32_t, config_t::ItemsPerInvocation_0> type_t;
-
-struct PushConstantData
-{
-    uint64_t inputBufAddress;
-    uint64_t outputAddressBufAddress;
-};
-
-[[vk::push_constant]] PushConstantData pc;
-
-// because subgroups don't match `gl_LocalInvocationIndex` snake curve addressing, we also can't load inputs that way
-uint32_t globalIndex();
-// since we test ITEMS_PER_WG<WorkgroupSize we need this so workgroups don't overwrite each other's outputs
-bool canStore();
-
-#ifndef OPERATION
-#error "Define OPERATION!"
-#endif
-#ifndef SUBGROUP_SIZE_LOG2
-#error "Define SUBGROUP_SIZE_LOG2!"
-#endif
-
-// final (level 1/2) scan needs to fit in one subgroup exactly
-groupshared uint32_t scratch[config_t::ElementCount];
-
-struct ScratchProxy
-{
-    template<typename AccessType>
-    void get(const uint32_t ix, NBL_REF_ARG(AccessType) value)
-    {
-        value = scratch[ix];
-    }
-    template<typename AccessType>
-    void set(const uint32_t ix, const AccessType value)
-    {
-        scratch[ix] = value;
-    }
-
-    uint32_t atomicOr(const uint32_t ix, const uint32_t value)
-    {
-        return nbl::hlsl::glsl::atomicOr(scratch[ix],value);
-    }
-
-    void workgroupExecutionAndMemoryBarrier()
-    {
-        nbl::hlsl::glsl::barrier();
-        //nbl::hlsl::glsl::memoryBarrierShared(); implied by the above
-    }
-};
diff --git a/29_Arithmetic2Bench/main.cpp b/29_Arithmetic2Bench/main.cpp
index 0772997dc..5b8792040 100644
--- a/29_Arithmetic2Bench/main.cpp
+++ b/29_Arithmetic2Bench/main.cpp
@@ -708,7 +708,6 @@ class ArithmeticBenchApp final : public examples::SimpleWindowedApplication, pub
 	uint32_t numSubmits = 0;
 
 	/* PARAMETERS TO CHANGE FOR DIFFERENT BENCHMARKS */
-
 	constexpr static inline bool DoWorkgroupBenchmarks = true;
 	uint32_t ItemsPerInvocation = 4u;
 	constexpr static inline uint32_t NumLoops = 1000u;

From e4735a4e840e870803e30fa78325f89fcf01df60 Mon Sep 17 00:00:00 2001
From: keptsecret <sorchon@gmail.com>
Date: Thu, 22 May 2025 15:03:10 +0700
Subject: [PATCH 289/529] simplified test,benchmark function template params

---
 .../app_resources/testSubgroup.comp.hlsl      | 22 +++++++++----------
 .../app_resources/testWorkgroup.comp.hlsl     | 20 ++++++++---------
 .../app_resources/benchmarkSubgroup.comp.hlsl | 20 ++++++++---------
 .../benchmarkWorkgroup.comp.hlsl              | 21 +++++++++---------
 29_Arithmetic2Bench/main.cpp                  |  4 ++--
 5 files changed, 42 insertions(+), 45 deletions(-)

diff --git a/23_Arithmetic2UnitTest/app_resources/testSubgroup.comp.hlsl b/23_Arithmetic2UnitTest/app_resources/testSubgroup.comp.hlsl
index c5a030851..838f7adf9 100644
--- a/23_Arithmetic2UnitTest/app_resources/testSubgroup.comp.hlsl
+++ b/23_Arithmetic2UnitTest/app_resources/testSubgroup.comp.hlsl
@@ -11,15 +11,13 @@
 
 typedef vector<uint32_t, ITEMS_PER_INVOCATION> type_t;
 
-template<template<class> class binop, typename T, uint32_t N>
+template<class Binop, uint32_t N>
 static void subtest(NBL_CONST_REF_ARG(type_t) sourceVal)
 {
-    // TODO static assert vector<T, N> == type_t
-    //using type_t = vector<T, N>;
     using config_t = nbl::hlsl::subgroup2::Configuration<SUBGROUP_SIZE_LOG2>;
-    using params_t = nbl::hlsl::subgroup2::ArithmeticParams<config_t, typename binop<T>::base_t, N, nbl::hlsl::jit::device_capabilities>;
+    using params_t = nbl::hlsl::subgroup2::ArithmeticParams<config_t, typename Binop::base_t, N, nbl::hlsl::jit::device_capabilities>;
 
-    const uint64_t outputBufAddr = vk::RawBufferLoad<uint64_t>(pc.outputAddressBufAddress + binop<T>::BindingIndex * sizeof(uint64_t), sizeof(uint64_t));
+    const uint64_t outputBufAddr = vk::RawBufferLoad<uint64_t>(pc.outputAddressBufAddress + Binop::BindingIndex * sizeof(uint64_t), sizeof(uint64_t));
 
     if (globalIndex()==0u)
         vk::RawBufferStore<uint32_t>(outputBufAddr, nbl::hlsl::glsl::gl_SubgroupSize());
@@ -35,13 +33,13 @@ type_t test()
     const uint32_t idx = globalIndex();
     type_t sourceVal = vk::RawBufferLoad<type_t>(pc.inputBufAddress + idx * sizeof(type_t));
 
-    subtest<bit_and, uint32_t, ITEMS_PER_INVOCATION>(sourceVal);
-    subtest<bit_xor, uint32_t, ITEMS_PER_INVOCATION>(sourceVal);
-    subtest<bit_or, uint32_t, ITEMS_PER_INVOCATION>(sourceVal);
-    subtest<plus, uint32_t, ITEMS_PER_INVOCATION>(sourceVal);
-    subtest<multiplies, uint32_t, ITEMS_PER_INVOCATION>(sourceVal);
-    subtest<minimum, uint32_t, ITEMS_PER_INVOCATION>(sourceVal);
-    subtest<maximum, uint32_t, ITEMS_PER_INVOCATION>(sourceVal);
+    subtest<bit_and<uint32_t>, ITEMS_PER_INVOCATION>(sourceVal);
+    subtest<bit_xor<uint32_t>, ITEMS_PER_INVOCATION>(sourceVal);
+    subtest<bit_or<uint32_t>, ITEMS_PER_INVOCATION>(sourceVal);
+    subtest<plus<uint32_t>, ITEMS_PER_INVOCATION>(sourceVal);
+    subtest<multiplies<uint32_t>, ITEMS_PER_INVOCATION>(sourceVal);
+    subtest<minimum<uint32_t>, ITEMS_PER_INVOCATION>(sourceVal);
+    subtest<maximum<uint32_t>, ITEMS_PER_INVOCATION>(sourceVal);
     return sourceVal;
 }
 
diff --git a/23_Arithmetic2UnitTest/app_resources/testWorkgroup.comp.hlsl b/23_Arithmetic2UnitTest/app_resources/testWorkgroup.comp.hlsl
index 51f556797..e2256d2f1 100644
--- a/23_Arithmetic2UnitTest/app_resources/testWorkgroup.comp.hlsl
+++ b/23_Arithmetic2UnitTest/app_resources/testWorkgroup.comp.hlsl
@@ -150,14 +150,14 @@ struct operation_t
 };
 
 
-template<template<class> class binop, typename T, uint32_t N>
+template<class Binop>
 static void subtest(NBL_CONST_REF_ARG(type_t) sourceVal)
 {
-    uint64_t outputBufAddr = vk::RawBufferLoad<uint64_t>(pc.outputAddressBufAddress + binop<T>::BindingIndex * sizeof(uint64_t));
+    uint64_t outputBufAddr = vk::RawBufferLoad<uint64_t>(pc.outputAddressBufAddress + Binop::BindingIndex * sizeof(uint64_t));
     if (globalIndex()==0u)
         vk::RawBufferStore<uint32_t>(outputBufAddr, nbl::hlsl::glsl::gl_SubgroupSize());
 
-    operation_t<binop<T>,nbl::hlsl::jit::device_capabilities> func;
+    operation_t<Binop,nbl::hlsl::jit::device_capabilities> func;
     func();
 }
 
@@ -166,13 +166,13 @@ type_t test()
 {
     type_t sourceVal = vk::RawBufferLoad<type_t>(pc.inputBufAddress + globalIndex() * sizeof(type_t));
 
-    subtest<bit_and, uint32_t, ITEMS_PER_INVOCATION>(sourceVal);
-    subtest<bit_xor, uint32_t, ITEMS_PER_INVOCATION>(sourceVal);
-    subtest<bit_or, uint32_t, ITEMS_PER_INVOCATION>(sourceVal);
-    subtest<plus, uint32_t, ITEMS_PER_INVOCATION>(sourceVal);
-    subtest<multiplies, uint32_t, ITEMS_PER_INVOCATION>(sourceVal);
-    subtest<minimum, uint32_t, ITEMS_PER_INVOCATION>(sourceVal);
-    subtest<maximum, uint32_t, ITEMS_PER_INVOCATION>(sourceVal);
+    subtest<bit_and<uint32_t> >(sourceVal);
+    subtest<bit_xor<uint32_t> >(sourceVal);
+    subtest<bit_or<uint32_t> >(sourceVal);
+    subtest<plus<uint32_t> >(sourceVal);
+    subtest<multiplies<uint32_t> >(sourceVal);
+    subtest<minimum<uint32_t> >(sourceVal);
+    subtest<maximum<uint32_t> >(sourceVal);
     return sourceVal;
 }
 
diff --git a/29_Arithmetic2Bench/app_resources/benchmarkSubgroup.comp.hlsl b/29_Arithmetic2Bench/app_resources/benchmarkSubgroup.comp.hlsl
index cb033a5bb..113ec2bae 100644
--- a/29_Arithmetic2Bench/app_resources/benchmarkSubgroup.comp.hlsl
+++ b/29_Arithmetic2Bench/app_resources/benchmarkSubgroup.comp.hlsl
@@ -18,14 +18,14 @@ uint32_t globalIndex()
 
 bool canStore() {return true;}
 
-template<template<class> class binop, typename T, uint32_t N>
+template<class Binop, uint32_t N>
 static void subbench(NBL_CONST_REF_ARG(type_t) sourceVal)
 {
     using config_t = nbl::hlsl::subgroup2::Configuration<SUBGROUP_SIZE_LOG2>;
-    using params_t = nbl::hlsl::subgroup2::ArithmeticParams<config_t, typename binop<T>::base_t, N, nbl::hlsl::jit::device_capabilities>;
+    using params_t = nbl::hlsl::subgroup2::ArithmeticParams<config_t, typename Binop::base_t, N, nbl::hlsl::jit::device_capabilities>;
     type_t value = sourceVal;
 
-    const uint64_t outputBufAddr = vk::RawBufferLoad<uint64_t>(pc.outputAddressBufAddress + binop<T>::BindingIndex * sizeof(uint64_t), sizeof(uint64_t));
+    const uint64_t outputBufAddr = vk::RawBufferLoad<uint64_t>(pc.outputAddressBufAddress + Binop::BindingIndex * sizeof(uint64_t), sizeof(uint64_t));
 
     operation_t<params_t> func;
     // [unroll]
@@ -41,13 +41,13 @@ void benchmark()
     const uint32_t idx = globalIndex();
     type_t sourceVal = vk::RawBufferLoad<type_t>(pc.inputBufAddress + idx * sizeof(type_t));
 
-    subbench<bit_and, uint32_t, ITEMS_PER_INVOCATION>(sourceVal);
-    subbench<bit_xor, uint32_t, ITEMS_PER_INVOCATION>(sourceVal);
-    subbench<bit_or, uint32_t, ITEMS_PER_INVOCATION>(sourceVal);
-    subbench<plus, uint32_t, ITEMS_PER_INVOCATION>(sourceVal);
-    subbench<multiplies, uint32_t, ITEMS_PER_INVOCATION>(sourceVal);
-    subbench<minimum, uint32_t, ITEMS_PER_INVOCATION>(sourceVal);
-    subbench<maximum, uint32_t, ITEMS_PER_INVOCATION>(sourceVal);
+    subbench<bit_and<uint32_t>, ITEMS_PER_INVOCATION>(sourceVal);
+    subbench<bit_xor<uint32_t>, ITEMS_PER_INVOCATION>(sourceVal);
+    subbench<bit_or<uint32_t>, ITEMS_PER_INVOCATION>(sourceVal);
+    subbench<plus<uint32_t>, ITEMS_PER_INVOCATION>(sourceVal);
+    subbench<multiplies<uint32_t>, ITEMS_PER_INVOCATION>(sourceVal);
+    subbench<minimum<uint32_t>, ITEMS_PER_INVOCATION>(sourceVal);
+    subbench<maximum<uint32_t>, ITEMS_PER_INVOCATION>(sourceVal);
 }
 
 [numthreads(WORKGROUP_SIZE,1,1)]
diff --git a/29_Arithmetic2Bench/app_resources/benchmarkWorkgroup.comp.hlsl b/29_Arithmetic2Bench/app_resources/benchmarkWorkgroup.comp.hlsl
index 8815eb037..cdd5a9f4e 100644
--- a/29_Arithmetic2Bench/app_resources/benchmarkWorkgroup.comp.hlsl
+++ b/29_Arithmetic2Bench/app_resources/benchmarkWorkgroup.comp.hlsl
@@ -85,16 +85,15 @@ struct operation_t
     }
 };
 
-template<template<class> class binop, typename T, uint32_t N>
+template<class Binop>
 static void subbench(NBL_CONST_REF_ARG(type_t) sourceVal)
 {
-    const uint64_t outputBufAddr = vk::RawBufferLoad<uint64_t>(pc.outputAddressBufAddress + binop<T>::BindingIndex * sizeof(uint64_t), sizeof(uint64_t));
+    const uint64_t outputBufAddr = vk::RawBufferLoad<uint64_t>(pc.outputAddressBufAddress + Binop::BindingIndex * sizeof(uint64_t), sizeof(uint64_t));
 
     if (globalIndex()==0u)
         vk::RawBufferStore<uint32_t>(outputBufAddr, nbl::hlsl::glsl::gl_SubgroupSize());
 
-    operation_t<binop<T>,nbl::hlsl::jit::device_capabilities> func;
-    // TODO separate out store/load from DataProxy? so we don't do too many RW in benchmark
+    operation_t<Binop,nbl::hlsl::jit::device_capabilities> func;
     for (uint32_t i = 0; i < NUM_LOOPS; i++)
         func(); // store is done with data accessor now
 }
@@ -104,13 +103,13 @@ type_t benchmark()
 {
     const type_t sourceVal = vk::RawBufferLoad<type_t>(pc.inputBufAddress + globalIndex() * sizeof(type_t));
 
-    subbench<bit_and, uint32_t, ITEMS_PER_INVOCATION>(sourceVal);
-    subbench<bit_xor, uint32_t, ITEMS_PER_INVOCATION>(sourceVal);
-    subbench<bit_or, uint32_t, ITEMS_PER_INVOCATION>(sourceVal);
-    subbench<plus, uint32_t, ITEMS_PER_INVOCATION>(sourceVal);
-    subbench<multiplies, uint32_t, ITEMS_PER_INVOCATION>(sourceVal);
-    subbench<minimum, uint32_t, ITEMS_PER_INVOCATION>(sourceVal);
-    subbench<maximum, uint32_t, ITEMS_PER_INVOCATION>(sourceVal);
+    subbench<bit_and<uint32_t> >(sourceVal);
+    subbench<bit_xor<uint32_t> >(sourceVal);
+    subbench<bit_or<uint32_t> >(sourceVal);
+    subbench<plus<uint32_t> >(sourceVal);
+    subbench<multiplies<uint32_t> >(sourceVal);
+    subbench<minimum<uint32_t> >(sourceVal);
+    subbench<maximum<uint32_t> >(sourceVal);
     return sourceVal;
 }
 
diff --git a/29_Arithmetic2Bench/main.cpp b/29_Arithmetic2Bench/main.cpp
index 5b8792040..ce2b915b1 100644
--- a/29_Arithmetic2Bench/main.cpp
+++ b/29_Arithmetic2Bench/main.cpp
@@ -708,13 +708,13 @@ class ArithmeticBenchApp final : public examples::SimpleWindowedApplication, pub
 	uint32_t numSubmits = 0;
 
 	/* PARAMETERS TO CHANGE FOR DIFFERENT BENCHMARKS */
-	constexpr static inline bool DoWorkgroupBenchmarks = true;
+	constexpr static inline bool DoWorkgroupBenchmarks = false;
 	uint32_t ItemsPerInvocation = 4u;
 	constexpr static inline uint32_t NumLoops = 1000u;
 	constexpr static inline uint32_t NumBenchmarks = 6u;
 	constexpr static inline std::array<uint32_t, NumBenchmarks> workgroupSizes = { 32, 64, 128, 256, 512, 1024 };
 	template<class BinOp>
-	using ArithmeticOp = emulatedReduction<BinOp>;	// change this to test other arithmetic ops
+	using ArithmeticOp = emulatedScanInclusive<BinOp>;	// change this to test other arithmetic ops
 
 	std::array<BenchmarkSet, NumBenchmarks> benchSets;
 	smart_refctd_ptr<IDescriptorPool> benchPool;

From 13ae89f7d3fc666124486b5e18f13922995d3569 Mon Sep 17 00:00:00 2001
From: keptsecret <sorchon@gmail.com>
Date: Thu, 22 May 2025 15:04:00 +0700
Subject: [PATCH 290/529] revert test to default params

---
 29_Arithmetic2Bench/main.cpp | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/29_Arithmetic2Bench/main.cpp b/29_Arithmetic2Bench/main.cpp
index ce2b915b1..5b8792040 100644
--- a/29_Arithmetic2Bench/main.cpp
+++ b/29_Arithmetic2Bench/main.cpp
@@ -708,13 +708,13 @@ class ArithmeticBenchApp final : public examples::SimpleWindowedApplication, pub
 	uint32_t numSubmits = 0;
 
 	/* PARAMETERS TO CHANGE FOR DIFFERENT BENCHMARKS */
-	constexpr static inline bool DoWorkgroupBenchmarks = false;
+	constexpr static inline bool DoWorkgroupBenchmarks = true;
 	uint32_t ItemsPerInvocation = 4u;
 	constexpr static inline uint32_t NumLoops = 1000u;
 	constexpr static inline uint32_t NumBenchmarks = 6u;
 	constexpr static inline std::array<uint32_t, NumBenchmarks> workgroupSizes = { 32, 64, 128, 256, 512, 1024 };
 	template<class BinOp>
-	using ArithmeticOp = emulatedScanInclusive<BinOp>;	// change this to test other arithmetic ops
+	using ArithmeticOp = emulatedReduction<BinOp>;	// change this to test other arithmetic ops
 
 	std::array<BenchmarkSet, NumBenchmarks> benchSets;
 	smart_refctd_ptr<IDescriptorPool> benchPool;

From a8774db88d1d08d0a3fe9f2a30e7dc376120493a Mon Sep 17 00:00:00 2001
From: keptsecret <sorchon@gmail.com>
Date: Thu, 22 May 2025 17:02:32 +0700
Subject: [PATCH 291/529] use preloaded data in benchmark

---
 .../benchmarkWorkgroup.comp.hlsl              | 70 +++++++++++++++++--
 29_Arithmetic2Bench/main.cpp                  | 12 ++--
 2 files changed, 73 insertions(+), 9 deletions(-)

diff --git a/29_Arithmetic2Bench/app_resources/benchmarkWorkgroup.comp.hlsl b/29_Arithmetic2Bench/app_resources/benchmarkWorkgroup.comp.hlsl
index cdd5a9f4e..31284c520 100644
--- a/29_Arithmetic2Bench/app_resources/benchmarkWorkgroup.comp.hlsl
+++ b/29_Arithmetic2Bench/app_resources/benchmarkWorkgroup.comp.hlsl
@@ -68,6 +68,50 @@ struct DataProxy
     }
 };
 
+template<class Config, class Binop>
+struct PreloadedDataProxy
+{
+    using dtype_t = vector<uint32_t, Config::ItemsPerInvocation_0>;
+    static_assert(nbl::hlsl::is_same_v<dtype_t, type_t>);
+
+    NBL_CONSTEXPR_STATIC_INLINE uint32_t PreloadedDataCount = Config::VirtualWorkgroupSize / Config::WorkgroupSize;
+
+    template<typename AccessType>
+    void get(const uint32_t ix, NBL_REF_ARG(dtype_t) value)
+    {
+        value = preloaded[(ix-nbl::hlsl::workgroup::SubgroupContiguousIndex())>>Config::WorkgroupSizeLog2];
+    }
+    template<typename AccessType>
+    void set(const uint32_t ix, const dtype_t value)
+    {
+        preloaded[(ix-nbl::hlsl::workgroup::SubgroupContiguousIndex())>>Config::WorkgroupSizeLog2] = value;
+    }
+
+    void preload()
+    {
+        const uint32_t workgroupOffset = nbl::hlsl::glsl::gl_WorkGroupID().x * Config::VirtualWorkgroupSize;
+        [unroll]
+        for (uint32_t idx = 0; idx < PreloadedDataCount; idx++)
+            preloaded[idx] = vk::RawBufferLoad<dtype_t>(pc.inputBufAddress + (workgroupOffset + idx * Config::WorkgroupSize + nbl::hlsl::workgroup::SubgroupContiguousIndex()) * sizeof(dtype_t));
+    }
+    void unload()
+    {
+        const uint32_t workgroupOffset = nbl::hlsl::glsl::gl_WorkGroupID().x * Config::VirtualWorkgroupSize;
+        uint64_t outputBufAddr = vk::RawBufferLoad<uint64_t>(pc.outputAddressBufAddress + Binop::BindingIndex * sizeof(uint64_t));
+        [unroll]
+        for (uint32_t idx = 0; idx < PreloadedDataCount; idx++)
+            vk::RawBufferStore<dtype_t>(outputBufAddr + sizeof(uint32_t) + sizeof(dtype_t) * (workgroupOffset + idx * Config::WorkgroupSize + nbl::hlsl::workgroup::SubgroupContiguousIndex()), preloaded[idx], sizeof(uint32_t));
+    }
+
+    void workgroupExecutionAndMemoryBarrier()
+    {
+        nbl::hlsl::glsl::barrier();
+        //nbl::hlsl::glsl::memoryBarrierShared(); implied by the above
+    }
+
+    dtype_t preloaded[PreloadedDataCount];
+};
+
 static ScratchProxy arithmeticAccessor;
 
 template<class Binop, class device_capabilities>
@@ -76,13 +120,26 @@ struct operation_t
     using binop_base_t = typename Binop::base_t;
     using otype_t = typename Binop::type_t;
 
-    void operator()()
+#if IS_REDUCTION
+    void operator()(PreloadedDataProxy<config_t,Binop> dataAccessor)
     {
-        DataProxy<config_t,Binop> dataAccessor;
-        nbl::hlsl::OPERATION<config_t,binop_base_t,device_capabilities>::template __call<DataProxy<config_t,Binop>, ScratchProxy>(dataAccessor,arithmeticAccessor);
+        otype_t value = nbl::hlsl::OPERATION<config_t,binop_base_t,device_capabilities>::template __call<PreloadedDataProxy<config_t,Binop>, ScratchProxy>(dataAccessor,arithmeticAccessor);
         // we barrier before because we alias the accessors for Binop
         arithmeticAccessor.workgroupExecutionAndMemoryBarrier();
+
+        [unroll]
+        for (uint32_t i = 0; i < PreloadedDataProxy<config_t,Binop>::PreloadedDataCount; i++)
+            dataAccessor.preloaded[i] = value;
     }
+#else
+    void operator()(PreloadedDataProxy<config_t,Binop> dataAccessor)
+    {
+        nbl::hlsl::OPERATION<config_t,binop_base_t,device_capabilities>::template __call<PreloadedDataProxy<config_t,Binop>, ScratchProxy>(dataAccessor,arithmeticAccessor);
+        // we barrier before because we alias the accessors for Binop
+        arithmeticAccessor.workgroupExecutionAndMemoryBarrier();
+    }
+#endif
+
 };
 
 template<class Binop>
@@ -93,9 +150,14 @@ static void subbench(NBL_CONST_REF_ARG(type_t) sourceVal)
     if (globalIndex()==0u)
         vk::RawBufferStore<uint32_t>(outputBufAddr, nbl::hlsl::glsl::gl_SubgroupSize());
 
+    PreloadedDataProxy<config_t,Binop> dataAccessor;
+    dataAccessor.preload();
+
     operation_t<Binop,nbl::hlsl::jit::device_capabilities> func;
     for (uint32_t i = 0; i < NUM_LOOPS; i++)
-        func(); // store is done with data accessor now
+        func(dataAccessor);
+
+    dataAccessor.unload();
 }
 
 
diff --git a/29_Arithmetic2Bench/main.cpp b/29_Arithmetic2Bench/main.cpp
index 5b8792040..165427750 100644
--- a/29_Arithmetic2Bench/main.cpp
+++ b/29_Arithmetic2Bench/main.cpp
@@ -599,24 +599,26 @@ class ArithmeticBenchApp final : public examples::SimpleWindowedApplication, pub
 		if constexpr (WorkgroupBench)
 		{
 			const uint32_t workgroupSizeLog2 = hlsl::findMSB(workgroupSize);
-			const std::string definitions[6] = {
+			const std::string definitions[7] = {
 				"workgroup2::" + arith_name,
 				std::to_string(workgroupSizeLog2),
 				std::to_string(itemsPerWG),
 				std::to_string(itemsPerInvoc),
 				std::to_string(subgroupSizeLog2),
-				std::to_string(numLoops)
+				std::to_string(numLoops),
+				std::to_string(arith_name=="reduction")
 			};
 
-			const IShaderCompiler::SMacroDefinition defines[6] = {
+			const IShaderCompiler::SMacroDefinition defines[7] = {
 				{ "OPERATION", definitions[0] },
 				{ "WORKGROUP_SIZE_LOG2", definitions[1] },
 				{ "ITEMS_PER_WG", definitions[2] },
 				{ "ITEMS_PER_INVOCATION", definitions[3] },
 				{ "SUBGROUP_SIZE_LOG2", definitions[4] },
-				{ "NUM_LOOPS", definitions[5] }
+				{ "NUM_LOOPS", definitions[5] },
+				{ "IS_REDUCTION", definitions[6] }
 			};
-			options.preprocessorOptions.extraDefines = { defines, defines + 6 };
+			options.preprocessorOptions.extraDefines = { defines, defines + 7 };
 
 			overriddenUnspecialized = compiler->compileToSPIRV((const char*)source->getContent()->getPointer(), options);
 		}

From 8b729d5fae76ac0c63a0744b802ee5d206f7018d Mon Sep 17 00:00:00 2001
From: devsh <devsh@devsh.eu>
Date: Thu, 22 May 2025 14:26:10 +0200
Subject: [PATCH 292/529] test compaction of BLASes

---
 67_RayQueryGeometry/main.cpp | 7 ++++++-
 1 file changed, 6 insertions(+), 1 deletion(-)

diff --git a/67_RayQueryGeometry/main.cpp b/67_RayQueryGeometry/main.cpp
index e096c1b71..362126332 100644
--- a/67_RayQueryGeometry/main.cpp
+++ b/67_RayQueryGeometry/main.cpp
@@ -644,10 +644,14 @@ class RayQueryGeometryApp final : public examples::SimpleWindowedApplication, pu
 			myalloc.device = m_device.get();
 			inputs.allocator = &myalloc;
 #endif
-
+			
+			std::array<CAssetConverter::patch_t<ICPUBottomLevelAccelerationStructure>,OT_COUNT> tmpBLASPatches = {};
 			std::array<const ICPUBuffer*, OT_COUNT * 2u> tmpBuffers;
 			std::array<CAssetConverter::patch_t<ICPUBuffer>, OT_COUNT * 2u> tmpBufferPatches;
 			{
+				tmpBLASPatches.front().compactAfterBuild = true;
+				std::fill(tmpBLASPatches.begin(),tmpBLASPatches.end(),tmpBLASPatches.front());
+				//
 				for (uint32_t i = 0; i < objectsCpu.size(); i++)
 				{
 					tmpBuffers[2 * i + 0] = cpuBlas[i]->getTriangleGeometries().front().vertexData[0].buffer.get();
@@ -659,6 +663,7 @@ class RayQueryGeometryApp final : public examples::SimpleWindowedApplication, pu
 
 				std::get<CAssetConverter::SInputs::asset_span_t<ICPUTopLevelAccelerationStructure>>(inputs.assets) = {&cpuTlas.get(),1};
 				std::get<CAssetConverter::SInputs::asset_span_t<ICPUBottomLevelAccelerationStructure>>(inputs.assets) = {&cpuBlas.data()->get(),cpuBlas.size()};
+				std::get<CAssetConverter::SInputs::patch_span_t<ICPUBottomLevelAccelerationStructure>>(inputs.patches) = tmpBLASPatches;
 				std::get<CAssetConverter::SInputs::asset_span_t<ICPUBuffer>>(inputs.assets) = tmpBuffers;
 				std::get<CAssetConverter::SInputs::patch_span_t<ICPUBuffer>>(inputs.patches) = tmpBufferPatches;
 			}

From 435212210dbf9392143e3ec8204051013215bd86 Mon Sep 17 00:00:00 2001
From: devsh <devsh@devsh.eu>
Date: Thu, 22 May 2025 14:39:51 +0200
Subject: [PATCH 293/529] test TLAS compaction

---
 67_RayQueryGeometry/main.cpp | 3 +++
 1 file changed, 3 insertions(+)

diff --git a/67_RayQueryGeometry/main.cpp b/67_RayQueryGeometry/main.cpp
index 362126332..6fcf6b0d1 100644
--- a/67_RayQueryGeometry/main.cpp
+++ b/67_RayQueryGeometry/main.cpp
@@ -645,6 +645,8 @@ class RayQueryGeometryApp final : public examples::SimpleWindowedApplication, pu
 			inputs.allocator = &myalloc;
 #endif
 			
+			CAssetConverter::patch_t<ICPUTopLevelAccelerationStructure> blasPatch = {};
+			blasPatch.compactAfterBuild = true;
 			std::array<CAssetConverter::patch_t<ICPUBottomLevelAccelerationStructure>,OT_COUNT> tmpBLASPatches = {};
 			std::array<const ICPUBuffer*, OT_COUNT * 2u> tmpBuffers;
 			std::array<CAssetConverter::patch_t<ICPUBuffer>, OT_COUNT * 2u> tmpBufferPatches;
@@ -662,6 +664,7 @@ class RayQueryGeometryApp final : public examples::SimpleWindowedApplication, pu
 					patch.usage |= asset::IBuffer::E_USAGE_FLAGS::EUF_SHADER_DEVICE_ADDRESS_BIT;
 
 				std::get<CAssetConverter::SInputs::asset_span_t<ICPUTopLevelAccelerationStructure>>(inputs.assets) = {&cpuTlas.get(),1};
+				std::get<CAssetConverter::SInputs::patch_span_t<ICPUTopLevelAccelerationStructure>>(inputs.patches) = {&blasPatch,1};
 				std::get<CAssetConverter::SInputs::asset_span_t<ICPUBottomLevelAccelerationStructure>>(inputs.assets) = {&cpuBlas.data()->get(),cpuBlas.size()};
 				std::get<CAssetConverter::SInputs::patch_span_t<ICPUBottomLevelAccelerationStructure>>(inputs.patches) = tmpBLASPatches;
 				std::get<CAssetConverter::SInputs::asset_span_t<ICPUBuffer>>(inputs.assets) = tmpBuffers;

From 9d0d57d700178f6ce2786380f9b14b88bb373bc2 Mon Sep 17 00:00:00 2001
From: Erfan Ahmadi <ahmadierfan99@gmail.com>
Date: Fri, 23 May 2025 12:30:43 +0400
Subject: [PATCH 294/529] small fixes

---
 62_CAD/DrawResourcesFiller.cpp | 5 +++--
 62_CAD/Images.h                | 1 +
 2 files changed, 4 insertions(+), 2 deletions(-)

diff --git a/62_CAD/DrawResourcesFiller.cpp b/62_CAD/DrawResourcesFiller.cpp
index 2449e8b05..da0678c9b 100644
--- a/62_CAD/DrawResourcesFiller.cpp
+++ b/62_CAD/DrawResourcesFiller.cpp
@@ -579,6 +579,7 @@ bool DrawResourcesFiller::queueGeoreferencedImageCopy_Internal(image_id imageID,
 {
 	auto& vec = streamedImageCopies[imageID];
 	vec.emplace_back(imageCopy);
+	return true;
 }
 
 // TODO[Przemek]: similar to other drawXXX and drawXXX_internal functions that create mainobjects, drawObjects and push additional info in geometry buffer, input to function would be a GridDTMInfo
@@ -629,7 +630,7 @@ void DrawResourcesFiller::addImageObject(image_id imageID, const OrientedBoundin
 
 void DrawResourcesFiller::addGeoreferencedImage(image_id imageID, const GeoreferencedImageParams& params, SIntendedSubmitInfo& intendedNextSubmit)
 {
-	beginMainObject(MainObjectType::STATIC_IMAGE);
+	beginMainObject(MainObjectType::STREAMED_IMAGE);
 
 	uint32_t mainObjIdx = acquireActiveMainObjectIndex_SubmitIfNeeded(intendedNextSubmit);
 
@@ -1295,7 +1296,7 @@ bool DrawResourcesFiller::pushStreamedImagesUploads(SIntendedSubmitInfo& intende
 				{
 					success &= m_utilities->updateImageViaStagingBuffer(
 						intendedNextSubmit,
-						imageCopy.srcBuffer->getPointer(), gpuImg->getCreationParameters().format,
+						imageCopy.srcBuffer->getPointer(), imageCopy.srcFormat,
 						gpuImg.get(), IImage::LAYOUT::TRANSFER_DST_OPTIMAL,
 						{ &imageCopy.region, 1u });
 				}
diff --git a/62_CAD/Images.h b/62_CAD/Images.h
index e43c72fd2..73be7ed50 100644
--- a/62_CAD/Images.h
+++ b/62_CAD/Images.h
@@ -211,6 +211,7 @@ class ImagesCache : public core::ResizableLRUCache<image_id, CachedImageRecord>
 
 struct StreamedImageCopy
 {
+	asset::E_FORMAT srcFormat;
 	core::smart_refctd_ptr<ICPUBuffer> srcBuffer; // Make it 'std::future' later?
 	asset::IImage::SBufferCopy region;
 };

From 20fed8cc920f787e134323565ac0d8d30fcbfb99 Mon Sep 17 00:00:00 2001
From: devsh <devsh@devsh.eu>
Date: Fri, 23 May 2025 13:25:55 +0200
Subject: [PATCH 295/529] test Descriptor Set conversion with TLAS rewrites

---
 67_RayQueryGeometry/main.cpp | 102 ++++++++++++++++-------------------
 1 file changed, 45 insertions(+), 57 deletions(-)

diff --git a/67_RayQueryGeometry/main.cpp b/67_RayQueryGeometry/main.cpp
index 6fcf6b0d1..ce9eaee1f 100644
--- a/67_RayQueryGeometry/main.cpp
+++ b/67_RayQueryGeometry/main.cpp
@@ -126,6 +126,7 @@ class RayQueryGeometryApp final : public examples::SimpleWindowedApplication, pu
 			auto cQueue = getComputeQueue();
 
 			// create blas/tlas
+			renderDs = 
 //#define TRY_BUILD_FOR_NGFX // Validation errors on the fake Acquire-Presents, TODO fix
 #ifdef TRY_BUILD_FOR_NGFX
 			// Nsight is special and can't do debugger delay so you can debug your CPU stuff during a capture
@@ -137,11 +138,12 @@ class RayQueryGeometryApp final : public examples::SimpleWindowedApplication, pu
 					std::this_thread::yield();
 			}
 			// Nsight is special and can't capture anything not on the queue that performs the swapchain acquire/release
-			if (!createAccelerationStructuresFromGeometry(gQueue,geometryCreator))
+			createAccelerationStructureDS(gQueue,geometryCreator);
 #else
-			if (!createAccelerationStructuresFromGeometry(cQueue,geometryCreator))
+			createAccelerationStructureDS(cQueue,geometryCreator);
 #endif
-				return logFail("Could not create acceleration structures");
+			if (!renderDs)
+				return logFail("Could not create acceleration structures and descriptor set");
 
 			// create pipelines
 			{
@@ -165,35 +167,8 @@ class RayQueryGeometryApp final : public examples::SimpleWindowedApplication, pu
 				if (!shader)
 					return logFail("Failed to create shader!");
 
-				// descriptors
-				IGPUDescriptorSetLayout::SBinding bindings[] = {
-					{
-						.binding = 0,
-						.type = asset::IDescriptor::E_TYPE::ET_ACCELERATION_STRUCTURE,
-						.createFlags = IGPUDescriptorSetLayout::SBinding::E_CREATE_FLAGS::ECF_NONE,
-						.stageFlags = asset::IShader::E_SHADER_STAGE::ESS_COMPUTE,
-						.count = 1,
-					},
-					{
-						.binding = 1,
-						.type = asset::IDescriptor::E_TYPE::ET_STORAGE_IMAGE,
-						.createFlags = IGPUDescriptorSetLayout::SBinding::E_CREATE_FLAGS::ECF_NONE,
-						.stageFlags = asset::IShader::E_SHADER_STAGE::ESS_COMPUTE,
-						.count = 1,
-					}
-				};
-				auto descriptorSetLayout = m_device->createDescriptorSetLayout(bindings);
-
-				const std::array<IGPUDescriptorSetLayout*, ICPUPipelineLayout::DESCRIPTOR_SET_COUNT> dsLayoutPtrs = { descriptorSetLayout.get() };
-				auto pool = m_device->createDescriptorPoolForDSLayouts(IDescriptorPool::ECF_UPDATE_AFTER_BIND_BIT, std::span(dsLayoutPtrs.begin(), dsLayoutPtrs.end()));
-				if (!pool)
-					return logFail("Could not create descriptor pool");
-				renderDs = pool->createDescriptorSet(descriptorSetLayout);
-				if (!renderDs)
-					return logFail("Could not create descriptor set");
-
 				SPushConstantRange pcRange = { .stageFlags = IShader::E_SHADER_STAGE::ESS_COMPUTE, .offset = 0u, .size = sizeof(SPushConstants)};
-				auto pipelineLayout = m_device->createPipelineLayout({ &pcRange, 1 }, smart_refctd_ptr(descriptorSetLayout), nullptr, nullptr, nullptr);
+				auto pipelineLayout = m_device->createPipelineLayout({ &pcRange, 1 }, smart_refctd_ptr<const IGPUDescriptorSetLayout>(renderDs->getLayout()), nullptr, nullptr, nullptr);
 
 				IGPUComputePipeline::SCreationParams params = {};
 				params.layout = pipelineLayout.get();
@@ -203,23 +178,21 @@ class RayQueryGeometryApp final : public examples::SimpleWindowedApplication, pu
 			}
 
 			// write descriptors
-			IGPUDescriptorSet::SDescriptorInfo infos[2];
-			infos[0].desc = gpuTlas;
-			infos[1].desc = m_device->createImageView({
-				.flags = IGPUImageView::ECF_NONE,
-				.subUsages = IGPUImage::E_USAGE_FLAGS::EUF_STORAGE_BIT,
-				.image = outHDRImage,
-				.viewType = IGPUImageView::E_TYPE::ET_2D,
-				.format = asset::EF_R16G16B16A16_SFLOAT
-			});
-			if (!infos[1].desc)
-				return logFail("Failed to create image view");
-			infos[1].info.image.imageLayout = IImage::LAYOUT::GENERAL;
-			IGPUDescriptorSet::SWriteDescriptorSet writes[3] = {
-				{.dstSet = renderDs.get(), .binding = 0, .arrayElement = 0, .count = 1, .info = &infos[0]},
-				{.dstSet = renderDs.get(), .binding = 1, .arrayElement = 0, .count = 1, .info = &infos[1]}
-			};
-			m_device->updateDescriptorSets(std::span(writes, 2), {});
+			{
+				IGPUDescriptorSet::SDescriptorInfo info = {};
+				info.desc = m_device->createImageView({
+					.flags = IGPUImageView::ECF_NONE,
+					.subUsages = IGPUImage::E_USAGE_FLAGS::EUF_STORAGE_BIT,
+					.image = outHDRImage,
+					.viewType = IGPUImageView::E_TYPE::ET_2D,
+					.format = asset::EF_R16G16B16A16_SFLOAT
+				});
+				if (!info.desc)
+					return logFail("Failed to create image view");
+				info.info.image.imageLayout = IImage::LAYOUT::GENERAL;
+				const IGPUDescriptorSet::SWriteDescriptorSet write = {.dstSet=renderDs.get(), .binding=1, .arrayElement=0, .count=1, .info=&info};
+				m_device->updateDescriptorSets({&write,1}, {});
+			}
 
 			// camera
 			{
@@ -514,7 +487,7 @@ class RayQueryGeometryApp final : public examples::SimpleWindowedApplication, pu
 			return (dim + size - 1) / size;
 		}
 
-		bool createAccelerationStructuresFromGeometry(video::CThreadSafeQueueAdapter* queue, const IGeometryCreator* gc)
+		smart_refctd_ptr<IGPUDescriptorSet> createAccelerationStructureDS(video::CThreadSafeQueueAdapter* queue, const IGeometryCreator* gc)
 		{
 			// get geometries in ICPUBuffers
 			std::array<ReferenceObjectCpu, OT_COUNT> objectsCpu;
@@ -582,8 +555,6 @@ class RayQueryGeometryApp final : public examples::SimpleWindowedApplication, pu
 				blas->setContentHash(blas->computeContentHash());
 			}
 
-			// TODO: when does compact blas happen?
-
 			// get ICPUBottomLevelAccelerationStructure into ICPUTopLevelAccelerationStructure
 			auto geomInstances = make_refctd_dynamic_array<smart_refctd_dynamic_array<ICPUTopLevelAccelerationStructure::PolymorphicInstance>>(OT_COUNT);
 			{
@@ -608,6 +579,26 @@ class RayQueryGeometryApp final : public examples::SimpleWindowedApplication, pu
 			auto cpuTlas = make_smart_refctd_ptr<ICPUTopLevelAccelerationStructure>();
 			cpuTlas->setInstances(std::move(geomInstances));
 			cpuTlas->setBuildFlags(IGPUTopLevelAccelerationStructure::BUILD_FLAGS::PREFER_FAST_TRACE_BIT);
+			
+			// descriptor set and layout
+			ICPUDescriptorSetLayout::SBinding bindings[] = {
+				{
+					.binding = 0,
+					.type = asset::IDescriptor::E_TYPE::ET_ACCELERATION_STRUCTURE,
+					.createFlags = IDescriptorSetLayoutBase::SBindingBase::E_CREATE_FLAGS::ECF_NONE,
+					.stageFlags = asset::IShader::E_SHADER_STAGE::ESS_COMPUTE,
+					.count = 1,
+				},
+				{
+					.binding = 1,
+					.type = asset::IDescriptor::E_TYPE::ET_STORAGE_IMAGE,
+					.createFlags = IDescriptorSetLayoutBase::SBindingBase::E_CREATE_FLAGS::ECF_NONE,
+					.stageFlags = asset::IShader::E_SHADER_STAGE::ESS_COMPUTE,
+					.count = 1,
+				}
+			};
+			auto descriptorSet = core::make_smart_refctd_ptr<ICPUDescriptorSet>(core::make_smart_refctd_ptr<ICPUDescriptorSetLayout>(bindings));
+			descriptorSet->getDescriptorInfos(IDescriptorSetLayoutBase::CBindingRedirect::binding_number_t{0},IDescriptor::E_TYPE::ET_ACCELERATION_STRUCTURE).front().desc = cpuTlas;
 
 //#define TEST_REBAR_FALLBACK
 			// convert with asset converter
@@ -663,6 +654,7 @@ class RayQueryGeometryApp final : public examples::SimpleWindowedApplication, pu
 				for (auto& patch : tmpBufferPatches)
 					patch.usage |= asset::IBuffer::E_USAGE_FLAGS::EUF_SHADER_DEVICE_ADDRESS_BIT;
 
+				std::get<CAssetConverter::SInputs::asset_span_t<ICPUDescriptorSet>>(inputs.assets) = {&descriptorSet.get(),1};
 				std::get<CAssetConverter::SInputs::asset_span_t<ICPUTopLevelAccelerationStructure>>(inputs.assets) = {&cpuTlas.get(),1};
 				std::get<CAssetConverter::SInputs::patch_span_t<ICPUTopLevelAccelerationStructure>>(inputs.patches) = {&blasPatch,1};
 				std::get<CAssetConverter::SInputs::asset_span_t<ICPUBottomLevelAccelerationStructure>>(inputs.assets) = {&cpuBlas.data()->get(),cpuBlas.size()};
@@ -792,7 +784,6 @@ class RayQueryGeometryApp final : public examples::SimpleWindowedApplication, pu
 				}
 
 				// assign gpu objects to output
-				gpuTlas = reservation.getGPUObjects<ICPUTopLevelAccelerationStructure>().front().value;
 				for (const auto& buffer : reservation.getGPUObjects<ICPUBuffer>())
 					retainedBuffers.push_back(buffer.value);
 				for (uint32_t i = 0; i < objectsCpu.size(); i++)
@@ -858,7 +849,7 @@ class RayQueryGeometryApp final : public examples::SimpleWindowedApplication, pu
 							};
 							for (const auto& blas : reservation.getGPUObjects<ICPUBottomLevelAccelerationStructure>())
 								acquireAS(blas.value.get());
-							acquireAS(gpuTlas.get());
+							acquireAS(reservation.getGPUObjects<ICPUTopLevelAccelerationStructure>().front().value.get());
 						}
 						if (!bufBarriers.empty())
 							cmdbuf->pipelineBarrier(asset::E_DEPENDENCY_FLAGS::EDF_NONE,{.memBarriers={},.bufBarriers=bufBarriers});
@@ -900,7 +891,7 @@ class RayQueryGeometryApp final : public examples::SimpleWindowedApplication, pu
 #endif
 			m_api->endCapture();
 
-			return bool(gpuTlas);
+			return reservation.getGPUObjects<ICPUDescriptorSet>().front().value;
 		}
 
 
@@ -918,9 +909,6 @@ class RayQueryGeometryApp final : public examples::SimpleWindowedApplication, pu
 		Camera camera = Camera(core::vectorSIMDf(0, 0, 0), core::vectorSIMDf(0, 0, 0), core::matrix4SIMD());
 		video::CDumbPresentationOracle oracle;
 
-		// TODO: maybe convert the descriptor set from ICPU as well?
-		smart_refctd_ptr<IGPUTopLevelAccelerationStructure> gpuTlas;
-
 		smart_refctd_ptr<IGPUBuffer> geometryInfoBuffer;
 		core::vector<smart_refctd_ptr<IGPUBuffer>> retainedBuffers;
 		smart_refctd_ptr<IGPUImage> outHDRImage;

From 723506eed5c6e80d8197a345ec4af46eb8829e62 Mon Sep 17 00:00:00 2001
From: Przemek <minikers21@gmail.com>
Date: Fri, 23 May 2025 13:48:18 +0200
Subject: [PATCH 296/529] Implemented grid DTM rendering

---
 62_CAD/DrawResourcesFiller.cpp                |   3 +
 62_CAD/DrawResourcesFiller.h                  |   1 +
 62_CAD/main.cpp                               |   4 +-
 62_CAD/shaders/globals.hlsl                   |   6 +-
 62_CAD/shaders/main_pipeline/common.hlsl      |  14 +-
 62_CAD/shaders/main_pipeline/dtm.hlsl         |   3 +-
 .../main_pipeline/fragment_shader.hlsl        | 190 ++++++++++++++++--
 .../shaders/main_pipeline/vertex_shader.hlsl  |  14 +-
 8 files changed, 200 insertions(+), 35 deletions(-)

diff --git a/62_CAD/DrawResourcesFiller.cpp b/62_CAD/DrawResourcesFiller.cpp
index 105374493..86633a2be 100644
--- a/62_CAD/DrawResourcesFiller.cpp
+++ b/62_CAD/DrawResourcesFiller.cpp
@@ -587,6 +587,7 @@ void DrawResourcesFiller::drawGridDTM(
 	const float64_t2& topLeft,
 	float64_t height,
 	float64_t width,
+	float gridCellWidth,
 	const DTMSettingsInfo& dtmSettingsInfo,
 	SIntendedSubmitInfo& intendedNextSubmit)
 {
@@ -594,7 +595,9 @@ void DrawResourcesFiller::drawGridDTM(
 	gridDTMInfo.topLeft = topLeft;
 	gridDTMInfo.height = height;
 	gridDTMInfo.width = width;
+	gridDTMInfo.gridCellWidth = gridCellWidth;
 
+	setActiveDTMSettings(dtmSettingsInfo);
 	beginMainObject(MainObjectType::GRID_DTM);
 
 	uint32_t mainObjectIdx = acquireActiveMainObjectIndex_SubmitIfNeeded(intendedNextSubmit);
diff --git a/62_CAD/DrawResourcesFiller.h b/62_CAD/DrawResourcesFiller.h
index 801dc41c2..1862679af 100644
--- a/62_CAD/DrawResourcesFiller.h
+++ b/62_CAD/DrawResourcesFiller.h
@@ -207,6 +207,7 @@ struct DrawResourcesFiller
 	void drawGridDTM(const float64_t2& topLeft,
 		float64_t height,
 		float64_t width,
+		float gridCellWidth,
 		const DTMSettingsInfo& dtmSettingsInfo,
 		SIntendedSubmitInfo& intendedNextSubmit);
 	
diff --git a/62_CAD/main.cpp b/62_CAD/main.cpp
index 356ff23aa..78f6a5504 100644
--- a/62_CAD/main.cpp
+++ b/62_CAD/main.cpp
@@ -3364,7 +3364,7 @@ class ComputerAidedDesign final : public examples::SimpleWindowedApplication, pu
 		else if (mode == ExampleMode::CASE_11)
 		{
 			DTMSettingsInfo dtmInfo{};
-			//dtmInfo.mode |= E_DTM_MODE::OUTLINE;
+			dtmInfo.mode |= E_DTM_MODE::OUTLINE;
 			dtmInfo.mode |= E_DTM_MODE::HEIGHT_SHADING;
 			dtmInfo.mode |= E_DTM_MODE::CONTOUR;
 
@@ -3435,7 +3435,7 @@ class ComputerAidedDesign final : public examples::SimpleWindowedApplication, pu
 				}
 			}
 
-			drawResourcesFiller.drawGridDTM({ 0.0f, 200.0f }, 400.0f, 800.0f, dtmInfo, intendedNextSubmit);
+			drawResourcesFiller.drawGridDTM({ 0.0f, 200.0f }, 400.0f, 800.0f, 40.0f, dtmInfo, intendedNextSubmit);
 		}
 	}
 
diff --git a/62_CAD/shaders/globals.hlsl b/62_CAD/shaders/globals.hlsl
index 538387491..0a0801632 100644
--- a/62_CAD/shaders/globals.hlsl
+++ b/62_CAD/shaders/globals.hlsl
@@ -8,7 +8,7 @@
 #endif
 
 // TODO[Erfan]: Turn off in the future, but keep enabled to test
-#define NBL_FORCE_EMULATED_FLOAT_64
+// #define NBL_FORCE_EMULATED_FLOAT_64
 
 #include <nbl/builtin/hlsl/portable/float64_t.hlsl>
 #include <nbl/builtin/hlsl/portable/vector_t.hlsl>
@@ -247,9 +247,7 @@ struct GridDTMInfo
     pfloat64_t height; // 8 bytes (24)
     pfloat64_t width; // 8 bytes (32)
     uint32_t textureID; // 4 bytes (36)
-    uint32_t dtmInfoID; // 4 bytes (40)
-    float gridCellWidth; // 4 bytes (44)
-    float _padding; // 4 bytes (48)
+    float gridCellWidth; // 4 bytes (40)
 };
 
 static uint32_t packR11G11B10_UNORM(float32_t3 color)
diff --git a/62_CAD/shaders/main_pipeline/common.hlsl b/62_CAD/shaders/main_pipeline/common.hlsl
index ccc30b1b8..c0e44ab29 100644
--- a/62_CAD/shaders/main_pipeline/common.hlsl
+++ b/62_CAD/shaders/main_pipeline/common.hlsl
@@ -232,15 +232,17 @@ struct PSInput
 #endif
 
     /* GRID DTM */
-    uint getHeightMapTextureID() { return data1.x; }
-    uint getDTMSettingsID() { return data1.y; }
-    float getGridDTMScreenSpaceCellWidth() { return data2.x; }
+    uint getGridDTMHeightTextureID(uint textureID) { return data1.z; }
+    float getGridDTMScreenSpaceCellWidth() { return data3.x; }
     float2 getGridDTMScreenSpacePosition() { return interp_data5.zw; }
+    float2 getGridDTMScreenSpaceTopLeft() { return data2.xy; }
+    float2 getGridDTMScreenSpaceGridExtents() { return data2.zw; }
 
-    void setHeightMapTextureID(uint heightMapTextureID) { data1.x = heightMapTextureID; }
-    void setDTMSettingsID(uint dtmSettingsID) { data1.y = dtmSettingsID; }
-    void setGridDTMScreenSpaceCellWidth(float screenSpaceGridWidth) { data2.x = screenSpaceGridWidth; }
+    void setGridDTMHeightTextureID(uint textureID) { data1.z = textureID; }
+    void setGridDTMScreenSpaceCellWidth(float screenSpaceGridWidth) { data3.x = screenSpaceGridWidth; }
     void setGridDTMScreenSpacePosition(float2 screenSpacePosition) { interp_data5.zw = screenSpacePosition; }
+    void setGridDTMScreenSpaceTopLeft(float2 screenSpaceTopLeft) { data2.xy = screenSpaceTopLeft; }
+    void setGridDTMScreenSpaceGridExtents(float2 screenSpaceGridExtends) { data2.zw = screenSpaceGridExtends; }
 };
 
 // Set 0 - Scene Data and Globals, buffer bindings don't change the buffers only get updated
diff --git a/62_CAD/shaders/main_pipeline/dtm.hlsl b/62_CAD/shaders/main_pipeline/dtm.hlsl
index 63e1194e4..ee698e19c 100644
--- a/62_CAD/shaders/main_pipeline/dtm.hlsl
+++ b/62_CAD/shaders/main_pipeline/dtm.hlsl
@@ -308,7 +308,7 @@ float4 calculateDTMContourColor(in DTMContourSettings contourSettings, in float3
     return float4(0.0f, 0.0f, 0.0f, 0.0f);
 }
 
-float4 calculateDTMOutlineColor(in uint outlineLineStyleIdx, in float3 v[3], in float2 fragPos, in float3 baryCoord, in float height)
+float4 calculateDTMOutlineColor(in uint outlineLineStyleIdx, in float3 v[3], in float2 fragPos)
 {
     float4 outputColor;
 
@@ -362,7 +362,6 @@ float4 calculateDTMOutlineColor(in uint outlineLineStyleIdx, in float3 v[3], in
 
             minDistance = min(minDistance, distance);
         }
-
     }
 
     outputColor.a = 1.0f - smoothstep(-globals.antiAliasingFactor, globals.antiAliasingFactor, minDistance);
diff --git a/62_CAD/shaders/main_pipeline/fragment_shader.hlsl b/62_CAD/shaders/main_pipeline/fragment_shader.hlsl
index 766225acd..6f8edc7b0 100644
--- a/62_CAD/shaders/main_pipeline/fragment_shader.hlsl
+++ b/62_CAD/shaders/main_pipeline/fragment_shader.hlsl
@@ -117,6 +117,46 @@ float32_t4 calculateFinalColor<true>(const uint2 fragCoord, const float localAlp
     return color;
 }
 
+// TODO: move to other header
+float4 calculateGridDTMOutlineColor(in uint outlineLineStyleIdx, in nbl::hlsl::shapes::Line<float> outlineLineSegments[2], in float2 fragPos, in float phaseShift)
+{
+    LineStyle outlineStyle = loadLineStyle(outlineLineStyleIdx);
+    const float outlineThickness = (outlineStyle.screenSpaceLineWidth + outlineStyle.worldSpaceLineWidth * globals.screenToWorldRatio) * 0.5f;
+    const float stretch = 1.0f;
+
+    // find distance to outline
+    float minDistance = nbl::hlsl::numeric_limits<float>::max;
+    if (!outlineStyle.hasStipples() || stretch == InvalidStyleStretchValue)
+    {
+        for (int i = 0; i < 2; ++i)
+        {
+            float distance = nbl::hlsl::numeric_limits<float>::max;
+            distance = ClippedSignedDistance<nbl::hlsl::shapes::Line<float> >::sdf(outlineLineSegments[i], fragPos, outlineThickness, outlineStyle.isRoadStyleFlag);
+
+            minDistance = min(minDistance, distance);
+        }
+    }
+    else
+    {
+        for (int i = 0; i < 2; ++i)
+        {
+            float distance = nbl::hlsl::numeric_limits<float>::max;
+            nbl::hlsl::shapes::Line<float>::ArcLengthCalculator arcLenCalc = nbl::hlsl::shapes::Line<float>::ArcLengthCalculator::construct(outlineLineSegments[i]);
+            LineStyleClipper clipper = LineStyleClipper::construct(outlineStyle, outlineLineSegments[i], arcLenCalc, phaseShift, stretch, globals.worldToScreenRatio);
+            distance = ClippedSignedDistance<nbl::hlsl::shapes::Line<float>, LineStyleClipper>::sdf(outlineLineSegments[i], fragPos, outlineThickness, outlineStyle.isRoadStyleFlag, clipper);
+
+            minDistance = min(minDistance, distance);
+        }
+    }
+
+    float4 outputColor;
+    outputColor.a = 1.0f - smoothstep(-globals.antiAliasingFactor, globals.antiAliasingFactor, minDistance);
+    outputColor.a *= outlineStyle.color.a;
+    outputColor.rgb = outlineStyle.color.rgb;
+
+    return outputColor;
+}
+
 [[vk::spvexecutionmode(spv::ExecutionModePixelInterlockOrderedEXT)]]
 [shader("pixel")]
 float4 fragMain(PSInput input) : SV_TARGET
@@ -129,7 +169,7 @@ float4 fragMain(PSInput input) : SV_TARGET
     const MainObject mainObj = loadMainObject(currentMainObjectIdx);
     
     if (pc.isDTMRendering)
-    {   
+    {
         DTMSettings dtmSettings = loadDTMSettings(mainObj.dtmSettingsIdx);
 
         float3 v[3];
@@ -143,8 +183,8 @@ float4 fragMain(PSInput input) : SV_TARGET
 
         float4 dtmColor = float4(0.0f, 0.0f, 0.0f, 0.0f);
         
-        if (dtmSettings.drawOutlineEnabled())
-            dtmColor = dtm::blendUnder(dtmColor, dtm::calculateDTMOutlineColor(dtmSettings.outlineLineStyleIdx, v, input.position.xy, baryCoord, height));
+        if (dtmSettings.drawOutlineEnabled())                                                                                                    // TODO: do i need 'height' paramter here?
+            dtmColor = dtm::blendUnder(dtmColor, dtm::calculateDTMOutlineColor(dtmSettings.outlineLineStyleIdx, v, input.position.xy));
         if (dtmSettings.drawContourEnabled())
         {
             for(uint32_t i = 0; i < dtmSettings.contourSettingsCount; ++i) // TODO: should reverse the order with blendUnder
@@ -393,27 +433,147 @@ float4 fragMain(PSInput input) : SV_TARGET
             
             // Query dtm settings
             // use texture Gather to get 4 corners: https://learn.microsoft.com/en-us/windows/win32/direct3dhlsl/dx-graphics-hlsl-to-gather
-            // A. the outlines can be stippled, use phaseshift of the line such that they started from the grid's origin worldspace coordinate
-            // B. the contours are computed for triangles, use the same function as for dtms, choose between the two triangles based on local UV coords in current cell
-                // Make it so we can choose which diagonal to use to construct the triangle, it's either u=v or u=1-v
-            // C. Height shading same as contours (split into two triangles)
+            // DONE (but needs to be fixed): A. the outlines can be stippled, use phaseshift of the line such that they started from the grid's origin worldspace coordinate
+            // DONE: B. the contours are computed for triangles, use the same function as for dtms, choose between the two triangles based on local UV coords in current cell
+                // DONE: Make it so we can choose which diagonal to use to construct the triangle, it's either u=v or u=1-v
+            // DONE: C. Height shading same as contours (split into two triangles)
 
             // Heights can have invalid values (let's say NaN) if a cell corner has NaN value then no triangle (for contour and shading) and no outline should include that corner. (see DTM image in discord with gaps)
             
             // TODO: we need to emulate dilation and do sdf of neighbouring cells as well. because contours, outlines and shading can bleed into other cells for AA.
             // [NOTE] Do dilation as last step, when everything else works fine
 
-            textureColor = float4(1.0f, 1.0f, 1.0f, 1.0f);
-            float2 uv = input.getImageUV();
-            float scalar = uv.x * uv.x * 0.25f + uv.y * uv.y * 0.25f;
-            textureColor *= scalar;
-            localAlpha = 1.0f;
+            DTMSettings dtmSettings = loadDTMSettings(mainObj.dtmSettingsIdx);
+            float2 pos = input.getGridDTMScreenSpacePosition();
+
+            // grid consists of square cells and cells are divided into two triangles:
+            // depending on mode it is
+            // either:        or:
+            // v2a-------v1   v0-------v2b
+            // |  A     / |   | \     B  |
+            // |     /    |   |    \     |
+            // |  /  B    |   |   A   \  |
+            // v0-------v2b   v2a-------v1
+            // 
+
+            // TODO: probably needs to be a part of grid dtm settings struct
+            const bool diagonalFromTopLeftToBottomRight = true;
+
+            // calculate screen space coordinates of vertices of the current tiranlge within the grid
+            float3 v[3];
+            nbl::hlsl::shapes::Line<float> outlineLineSegments[2];
+            float outlinePhaseShift;
+            {
+                float2 topLeft = input.getGridDTMScreenSpaceTopLeft();
+                float2 gridExtents = input.getGridDTMScreenSpaceGridExtents();
+                float cellWidth = input.getGridDTMScreenSpaceCellWidth();
+                float2 uv = input.getImageUV();
+
+                float2 gridSpacePos = uv * gridExtents;
+
+                float2 cellCoords;
+                {
+                    float2 gridSpacePosDivGridCellWidth = gridSpacePos / cellWidth;
+                    cellCoords.x = uint32_t(gridSpacePosDivGridCellWidth.x);
+                    cellCoords.y = uint32_t(gridSpacePosDivGridCellWidth.y);
+                }
+
+                // TODO: do we want to calculate it in the vertex shader?
+                const float MaxCellCoordX = round(gridExtents.x / cellWidth);
+                const float MaxCellCoordY = round(gridExtents.y / cellWidth);
+
+                float2 insideCellCoord = gridSpacePos - float2(cellWidth, cellWidth) * cellCoords; // TODO: use fmod instead?
+
+                const float2 DistancesToTriangleALegs = diagonalFromTopLeftToBottomRight ? min(insideCellCoord.x, insideCellCoord.y) : min(insideCellCoord.x, cellWidth - insideCellCoord.y);
+                const float2 DistancesToTriangleBLegs = diagonalFromTopLeftToBottomRight ? min(cellWidth - insideCellCoord.x, cellWidth - insideCellCoord.y) : min(cellWidth - insideCellCoord.x, insideCellCoord.y);
+
+                float distanceToTriangleAExclusiveCorner = min(DistancesToTriangleALegs.x, DistancesToTriangleALegs.y);
+                float distanceToTriangleBExclusiveCorner = min(DistancesToTriangleBLegs.x, DistancesToTriangleBLegs.y);
+                
+                // my ASCII art above explains which triangle is A and which is B
+                const bool triangleA = distanceToTriangleAExclusiveCorner <= distanceToTriangleBExclusiveCorner;
+
+                float2 gridSpaceCellTopLeftCoords = cellCoords * cellWidth;
 
-            //return outputColor;
-            printf("uv = %f, %f", uv.x, uv.y);
+                if (diagonalFromTopLeftToBottomRight)
+                {
+                    v[0] = float3(gridSpaceCellTopLeftCoords.x, gridSpaceCellTopLeftCoords.y + cellWidth, 0.0f);
+                    v[1] = float3(gridSpaceCellTopLeftCoords.x + cellWidth, gridSpaceCellTopLeftCoords.y, 0.0f);
+                    v[2] = triangleA ? float3(gridSpaceCellTopLeftCoords.x, gridSpaceCellTopLeftCoords.y, 0.0f) : float3(gridSpaceCellTopLeftCoords.x + cellWidth, gridSpaceCellTopLeftCoords.y + cellWidth, 0.0f);
+                }
+                else
+                {
+                    v[0] = float3(gridSpaceCellTopLeftCoords.x, gridSpaceCellTopLeftCoords.y, 0.0f);
+                    v[1] = float3(gridSpaceCellTopLeftCoords.x + cellWidth, gridSpaceCellTopLeftCoords.y + cellWidth, 0.0f);
+                    v[2] = triangleA ? float3(gridSpaceCellTopLeftCoords.x, gridSpaceCellTopLeftCoords.y + cellWidth, 0.0f) : float3(gridSpaceCellTopLeftCoords.x + cellWidth, gridSpaceCellTopLeftCoords.y, 0.0f);
+                }
+                 
+                // TODO: remove when implementing height texture
+                [unroll]
+                for (uint i = 0; i < 3; ++i)
+                {
+                    v[i].z = -20.0f + 5.0f * (v[i].x + v[i].y) / cellWidth;
+
+                    //if (abs(round(v[i].z) - 20.0f) <= 0.1f)
+                    //    v[i].z = asfloat(0x7FC00000);
+
+                }
+
+                if (isnan(v[0].z) || isnan(v[1].z) || isnan(v[2].z))
+                {
+                    discard;
+                }
+
+                // move from grid space to screen space
+                [unroll]
+                for (int i = 0; i < 3; ++i)
+                    v[i].xy += topLeft;
+
+                if (triangleA)
+                {
+                    outlineLineSegments[0] = nbl::hlsl::shapes::Line<float>::construct(v[2].xy, v[0].xy);
+                    outlineLineSegments[1] = nbl::hlsl::shapes::Line<float>::construct(v[2].xy, v[1].xy);
+                }
+                else
+                {
+                    outlineLineSegments[0] = nbl::hlsl::shapes::Line<float>::construct(v[1].xy, v[2].xy);
+                    outlineLineSegments[1] = nbl::hlsl::shapes::Line<float>::construct(v[0].xy, v[2].xy);
+                }
+
+                // test diagonal draw
+                //outlineLineSegments[0] = nbl::hlsl::shapes::Line<float>::construct(v[0].xy, v[1].xy);
+                //outlineLineSegments[1] = nbl::hlsl::shapes::Line<float>::construct(v[0].xy, v[1].xy);
+
+
+                float distancesToVerticalCellSides = min(insideCellCoord.x, cellWidth - insideCellCoord.x);
+                float distancesToHorizontalCellSides = min(insideCellCoord.y, cellWidth - insideCellCoord.y);
+
+                float patternCellCoord = distancesToVerticalCellSides >= distancesToHorizontalCellSides ? cellCoords.x : cellCoords.y;
+
+                // TODO: calculate pattern length!!!
+                float patternLength = 30.0f;
+                outlinePhaseShift = (cellWidth * (1.0f / globals.screenToWorldRatio) * patternCellCoord) / patternLength;
+            }
+
+            const float3 baryCoord = dtm::calculateDTMTriangleBarycentrics(v[0], v[1], v[2], input.position.xy);
+            float height = baryCoord.x * v[0].z + baryCoord.y * v[1].z + baryCoord.z * v[2].z;
+            float2 heightDeriv = fwidth(height);
+
+            float4 dtmColor = float4(0.0f, 0.0f, 0.0f, 0.0f);
+            if (dtmSettings.drawOutlineEnabled())
+                dtmColor = dtm::blendUnder(dtmColor, calculateGridDTMOutlineColor(dtmSettings.outlineLineStyleIdx, outlineLineSegments, input.position.xy, outlinePhaseShift));
+            if (dtmSettings.drawContourEnabled())
+            {
+                for (uint32_t i = 0; i < dtmSettings.contourSettingsCount; ++i) // TODO: should reverse the order with blendUnder
+                    dtmColor = dtm::blendUnder(dtmColor, dtm::calculateDTMContourColor(dtmSettings.contourSettings[i], v, input.position.xy, height));
+            }
+            if (dtmSettings.drawHeightShadingEnabled())
+                dtmColor = dtm::blendUnder(dtmColor, dtm::calculateDTMHeightColor(dtmSettings.heightShadingSettings, v, heightDeriv, input.position.xy, height));
+
+            textureColor = dtmColor.rgb;
+            localAlpha = dtmColor.a;
 
         }
-        
 
         uint2 fragCoord = uint2(input.position.xy);
         
diff --git a/62_CAD/shaders/main_pipeline/vertex_shader.hlsl b/62_CAD/shaders/main_pipeline/vertex_shader.hlsl
index e92a8d33b..f676aa206 100644
--- a/62_CAD/shaders/main_pipeline/vertex_shader.hlsl
+++ b/62_CAD/shaders/main_pipeline/vertex_shader.hlsl
@@ -650,9 +650,8 @@ PSInput main(uint vertexID : SV_VertexID)
             pfloat64_t2 topLeft = vk::RawBufferLoad<pfloat64_t2>(globals.pointers.geometryBuffer + drawObj.geometryAddress, 8u);
             pfloat64_t height = vk::RawBufferLoad<pfloat64_t>(globals.pointers.geometryBuffer + drawObj.geometryAddress + sizeof(pfloat64_t2), 8u);
             pfloat64_t width = vk::RawBufferLoad<pfloat64_t>(globals.pointers.geometryBuffer + drawObj.geometryAddress + sizeof(pfloat64_t2) + sizeof(pfloat64_t), 8u);
-            uint32_t textureID = vk::RawBufferLoad<uint32_t>(globals.pointers.geometryBuffer + drawObj.geometryAddress + sizeof(pfloat64_t2) + 2 * sizeof(pfloat64_t), 8u);
-            uint32_t dtmSettingsID = vk::RawBufferLoad<uint32_t>(globals.pointers.geometryBuffer + drawObj.geometryAddress + sizeof(pfloat64_t2) + 2 * sizeof(pfloat64_t) + sizeof(uint32_t), 8u);
-            float gridCellWidth = vk::RawBufferLoad<float>(globals.pointers.geometryBuffer + drawObj.geometryAddress + sizeof(pfloat64_t2) + 2 * sizeof(pfloat64_t) + 2 * sizeof(uint32_t), 8u);
+            uint32_t dtmSettingsID = vk::RawBufferLoad<uint32_t>(globals.pointers.geometryBuffer + drawObj.geometryAddress + sizeof(pfloat64_t2) + 2 * sizeof(pfloat64_t), 8u);
+            float gridCellWidth = vk::RawBufferLoad<float>(globals.pointers.geometryBuffer + drawObj.geometryAddress + sizeof(pfloat64_t2) + 2 * sizeof(pfloat64_t) + sizeof(uint32_t), 8u);
 
             const float2 corner = float2(bool2(vertexIdx & 0x1u, vertexIdx >> 1));
             pfloat64_t2 vtxPos = topLeft;
@@ -664,10 +663,13 @@ PSInput main(uint vertexID : SV_VertexID)
             float2 ndcVtxPos = _static_cast<float2>(transformPointNdc(clipProjectionData.projectionToNDC, vtxPos));
             outV.position = float4(ndcVtxPos, 0.0f, 1.0f);
 
-            outV.setHeightMapTextureID(textureID);
-            outV.setDTMSettingsID(dtmSettingsID);
-            outV.setGridDTMScreenSpaceCellWidth(gridCellWidth); // TODO: is input world space?
+            outV.setGridDTMScreenSpaceCellWidth(gridCellWidth * globals.screenToWorldRatio);
             outV.setGridDTMScreenSpacePosition(transformPointScreenSpace(clipProjectionData.projectionToNDC, globals.resolution, vtxPos));
+            outV.setGridDTMScreenSpaceTopLeft(transformPointScreenSpace(clipProjectionData.projectionToNDC, globals.resolution, topLeft));
+            pfloat64_t2 gridExtents;
+            gridExtents.x = width;
+            gridExtents.y = height;
+            outV.setGridDTMScreenSpaceGridExtents(gridExtents * globals.screenToWorldRatio);
             outV.setImageUV(corner);
         }
 

From c2023dfbe83d19d3efab475b33a15e3dce2d1681 Mon Sep 17 00:00:00 2001
From: devsh <devsh@devsh.eu>
Date: Fri, 23 May 2025 14:32:16 +0200
Subject: [PATCH 297/529] name variables correctly

---
 67_RayQueryGeometry/main.cpp | 8 ++++----
 1 file changed, 4 insertions(+), 4 deletions(-)

diff --git a/67_RayQueryGeometry/main.cpp b/67_RayQueryGeometry/main.cpp
index ce9eaee1f..7371cf1ea 100644
--- a/67_RayQueryGeometry/main.cpp
+++ b/67_RayQueryGeometry/main.cpp
@@ -636,8 +636,8 @@ class RayQueryGeometryApp final : public examples::SimpleWindowedApplication, pu
 			inputs.allocator = &myalloc;
 #endif
 			
-			CAssetConverter::patch_t<ICPUTopLevelAccelerationStructure> blasPatch = {};
-			blasPatch.compactAfterBuild = true;
+			CAssetConverter::patch_t<ICPUTopLevelAccelerationStructure> tlasPatch = {};
+			tlasPatch.compactAfterBuild = true;
 			std::array<CAssetConverter::patch_t<ICPUBottomLevelAccelerationStructure>,OT_COUNT> tmpBLASPatches = {};
 			std::array<const ICPUBuffer*, OT_COUNT * 2u> tmpBuffers;
 			std::array<CAssetConverter::patch_t<ICPUBuffer>, OT_COUNT * 2u> tmpBufferPatches;
@@ -656,7 +656,7 @@ class RayQueryGeometryApp final : public examples::SimpleWindowedApplication, pu
 
 				std::get<CAssetConverter::SInputs::asset_span_t<ICPUDescriptorSet>>(inputs.assets) = {&descriptorSet.get(),1};
 				std::get<CAssetConverter::SInputs::asset_span_t<ICPUTopLevelAccelerationStructure>>(inputs.assets) = {&cpuTlas.get(),1};
-				std::get<CAssetConverter::SInputs::patch_span_t<ICPUTopLevelAccelerationStructure>>(inputs.patches) = {&blasPatch,1};
+				std::get<CAssetConverter::SInputs::patch_span_t<ICPUTopLevelAccelerationStructure>>(inputs.patches) = {&tlasPatch,1};
 				std::get<CAssetConverter::SInputs::asset_span_t<ICPUBottomLevelAccelerationStructure>>(inputs.assets) = {&cpuBlas.data()->get(),cpuBlas.size()};
 				std::get<CAssetConverter::SInputs::patch_span_t<ICPUBottomLevelAccelerationStructure>>(inputs.patches) = tmpBLASPatches;
 				std::get<CAssetConverter::SInputs::asset_span_t<ICPUBuffer>>(inputs.assets) = tmpBuffers;
@@ -780,7 +780,7 @@ class RayQueryGeometryApp final : public examples::SimpleWindowedApplication, pu
 				if (future.copy() != IQueue::RESULT::SUCCESS)
 				{
 					m_logger->log("Failed to await submission feature!", ILogger::ELL_ERROR);
-					return false;
+					return {};
 				}
 
 				// assign gpu objects to output

From f335b2822a3c213a5193f1bb70962e08065a5523 Mon Sep 17 00:00:00 2001
From: Przemek <minikers21@gmail.com>
Date: Fri, 23 May 2025 17:00:51 +0200
Subject: [PATCH 298/529] Improved phase shift calculation

---
 62_CAD/DrawResourcesFiller.cpp                |  6 ++
 62_CAD/main.cpp                               |  2 +-
 62_CAD/shaders/globals.hlsl                   |  2 +
 62_CAD/shaders/main_pipeline/common.hlsl      | 10 ++--
 62_CAD/shaders/main_pipeline/dtm.hlsl         | 39 +++++++++++++
 .../main_pipeline/fragment_shader.hlsl        | 56 ++-----------------
 .../shaders/main_pipeline/vertex_shader.hlsl  |  2 +
 7 files changed, 60 insertions(+), 57 deletions(-)

diff --git a/62_CAD/DrawResourcesFiller.cpp b/62_CAD/DrawResourcesFiller.cpp
index c94529656..483545b2c 100644
--- a/62_CAD/DrawResourcesFiller.cpp
+++ b/62_CAD/DrawResourcesFiller.cpp
@@ -598,6 +598,12 @@ void DrawResourcesFiller::drawGridDTM(
 	gridDTMInfo.width = width;
 	gridDTMInfo.gridCellWidth = gridCellWidth;
 
+	if (dtmSettingsInfo.mode & E_DTM_MODE::OUTLINE)
+	{
+		const bool isOutlineStippled = dtmSettingsInfo.outlineStyleInfo.stipplePatternSize > 0;
+		gridDTMInfo.outlineStipplePatternLengthReciprocal = isOutlineStippled ? dtmSettingsInfo.outlineStyleInfo.reciprocalStipplePatternLen : 0.0f;
+	}
+
 	setActiveDTMSettings(dtmSettingsInfo);
 	beginMainObject(MainObjectType::GRID_DTM);
 
diff --git a/62_CAD/main.cpp b/62_CAD/main.cpp
index 361ce54ee..5b13520ba 100644
--- a/62_CAD/main.cpp
+++ b/62_CAD/main.cpp
@@ -45,7 +45,7 @@ static constexpr bool DebugModeWireframe = false;
 static constexpr bool DebugRotatingViewProj = false;
 static constexpr bool FragmentShaderPixelInterlock = true;
 static constexpr bool LargeGeoTextureStreaming = true;
-static constexpr bool CacheAndReplay = true; // caches first frame resources (buffers and images) from DrawResourcesFiller  and replays in future frames, skiping CPU Logic
+static constexpr bool CacheAndReplay = false; // caches first frame resources (buffers and images) from DrawResourcesFiller  and replays in future frames, skiping CPU Logic
 
 enum class ExampleMode
 {
diff --git a/62_CAD/shaders/globals.hlsl b/62_CAD/shaders/globals.hlsl
index b6385a0bf..cd88773f1 100644
--- a/62_CAD/shaders/globals.hlsl
+++ b/62_CAD/shaders/globals.hlsl
@@ -260,6 +260,8 @@ struct GridDTMInfo
     pfloat64_t width; // 8 bytes (32)
     uint32_t textureID; // 4 bytes (36)
     float gridCellWidth; // 4 bytes (40)
+    float outlineStipplePatternLengthReciprocal; // 4 bytes (44)
+    float _padding; // 4 bytes (48)
 };
 
 static uint32_t packR11G11B10_UNORM(float32_t3 color)
diff --git a/62_CAD/shaders/main_pipeline/common.hlsl b/62_CAD/shaders/main_pipeline/common.hlsl
index c0e44ab29..e492fe4ec 100644
--- a/62_CAD/shaders/main_pipeline/common.hlsl
+++ b/62_CAD/shaders/main_pipeline/common.hlsl
@@ -233,16 +233,18 @@ struct PSInput
 
     /* GRID DTM */
     uint getGridDTMHeightTextureID(uint textureID) { return data1.z; }
-    float getGridDTMScreenSpaceCellWidth() { return data3.x; }
-    float2 getGridDTMScreenSpacePosition() { return interp_data5.zw; }
     float2 getGridDTMScreenSpaceTopLeft() { return data2.xy; }
     float2 getGridDTMScreenSpaceGridExtents() { return data2.zw; }
+    float getGridDTMScreenSpaceCellWidth() { return data3.x; }
+    float getGridDTMOutlineStipplePatternLengthReciprocal() { return data3.y; }
+    float2 getGridDTMScreenSpacePosition() { return interp_data5.zw; }
 
     void setGridDTMHeightTextureID(uint textureID) { data1.z = textureID; }
-    void setGridDTMScreenSpaceCellWidth(float screenSpaceGridWidth) { data3.x = screenSpaceGridWidth; }
-    void setGridDTMScreenSpacePosition(float2 screenSpacePosition) { interp_data5.zw = screenSpacePosition; }
     void setGridDTMScreenSpaceTopLeft(float2 screenSpaceTopLeft) { data2.xy = screenSpaceTopLeft; }
     void setGridDTMScreenSpaceGridExtents(float2 screenSpaceGridExtends) { data2.zw = screenSpaceGridExtends; }
+    void setGridDTMScreenSpaceCellWidth(float screenSpaceGridWidth) { data3.x = screenSpaceGridWidth; }
+    void setGridDTMOutlineStipplePatternLengthReciprocal(float outlineStipplePatternLength) { data3.y = outlineStipplePatternLength; }
+    void setGridDTMScreenSpacePosition(float2 screenSpacePosition) { interp_data5.zw = screenSpacePosition; }
 };
 
 // Set 0 - Scene Data and Globals, buffer bindings don't change the buffers only get updated
diff --git a/62_CAD/shaders/main_pipeline/dtm.hlsl b/62_CAD/shaders/main_pipeline/dtm.hlsl
index ee698e19c..839b5483e 100644
--- a/62_CAD/shaders/main_pipeline/dtm.hlsl
+++ b/62_CAD/shaders/main_pipeline/dtm.hlsl
@@ -371,6 +371,45 @@ float4 calculateDTMOutlineColor(in uint outlineLineStyleIdx, in float3 v[3], in
     return outputColor;
 }
 
+float4 calculateGridDTMOutlineColor(in uint outlineLineStyleIdx, in nbl::hlsl::shapes::Line<float> outlineLineSegments[2], in float2 fragPos, in float phaseShift)
+{
+    LineStyle outlineStyle = loadLineStyle(outlineLineStyleIdx);
+    const float outlineThickness = (outlineStyle.screenSpaceLineWidth + outlineStyle.worldSpaceLineWidth * globals.screenToWorldRatio) * 0.5f;
+    const float stretch = 1.0f;
+
+    // find distance to outline
+    float minDistance = nbl::hlsl::numeric_limits<float>::max;
+    if (!outlineStyle.hasStipples() || stretch == InvalidStyleStretchValue)
+    {
+        for (int i = 0; i < 2; ++i)
+        {
+            float distance = nbl::hlsl::numeric_limits<float>::max;
+            distance = ClippedSignedDistance<nbl::hlsl::shapes::Line<float> >::sdf(outlineLineSegments[i], fragPos, outlineThickness, outlineStyle.isRoadStyleFlag);
+
+            minDistance = min(minDistance, distance);
+        }
+    }
+    else
+    {
+        for (int i = 0; i < 2; ++i)
+        {
+            float distance = nbl::hlsl::numeric_limits<float>::max;
+            nbl::hlsl::shapes::Line<float>::ArcLengthCalculator arcLenCalc = nbl::hlsl::shapes::Line<float>::ArcLengthCalculator::construct(outlineLineSegments[i]);
+            LineStyleClipper clipper = LineStyleClipper::construct(outlineStyle, outlineLineSegments[i], arcLenCalc, phaseShift, stretch, globals.worldToScreenRatio);
+            distance = ClippedSignedDistance<nbl::hlsl::shapes::Line<float>, LineStyleClipper>::sdf(outlineLineSegments[i], fragPos, outlineThickness, outlineStyle.isRoadStyleFlag, clipper);
+
+            minDistance = min(minDistance, distance);
+        }
+    }
+
+    float4 outputColor;
+    outputColor.a = 1.0f - smoothstep(-globals.antiAliasingFactor, globals.antiAliasingFactor, minDistance);
+    outputColor.a *= outlineStyle.color.a;
+    outputColor.rgb = outlineStyle.color.rgb;
+
+    return outputColor;
+}
+
 float4 blendUnder(in float4 dstColor, in float4 srcColor)
 {
     dstColor.rgb = dstColor.rgb + (1 - dstColor.a) * srcColor.a * srcColor.rgb;
diff --git a/62_CAD/shaders/main_pipeline/fragment_shader.hlsl b/62_CAD/shaders/main_pipeline/fragment_shader.hlsl
index 6d3a20ddc..7f72c5d2e 100644
--- a/62_CAD/shaders/main_pipeline/fragment_shader.hlsl
+++ b/62_CAD/shaders/main_pipeline/fragment_shader.hlsl
@@ -117,46 +117,6 @@ float32_t4 calculateFinalColor<true>(const uint2 fragCoord, const float localAlp
     return color;
 }
 
-// TODO: move to other header
-float4 calculateGridDTMOutlineColor(in uint outlineLineStyleIdx, in nbl::hlsl::shapes::Line<float> outlineLineSegments[2], in float2 fragPos, in float phaseShift)
-{
-    LineStyle outlineStyle = loadLineStyle(outlineLineStyleIdx);
-    const float outlineThickness = (outlineStyle.screenSpaceLineWidth + outlineStyle.worldSpaceLineWidth * globals.screenToWorldRatio) * 0.5f;
-    const float stretch = 1.0f;
-
-    // find distance to outline
-    float minDistance = nbl::hlsl::numeric_limits<float>::max;
-    if (!outlineStyle.hasStipples() || stretch == InvalidStyleStretchValue)
-    {
-        for (int i = 0; i < 2; ++i)
-        {
-            float distance = nbl::hlsl::numeric_limits<float>::max;
-            distance = ClippedSignedDistance<nbl::hlsl::shapes::Line<float> >::sdf(outlineLineSegments[i], fragPos, outlineThickness, outlineStyle.isRoadStyleFlag);
-
-            minDistance = min(minDistance, distance);
-        }
-    }
-    else
-    {
-        for (int i = 0; i < 2; ++i)
-        {
-            float distance = nbl::hlsl::numeric_limits<float>::max;
-            nbl::hlsl::shapes::Line<float>::ArcLengthCalculator arcLenCalc = nbl::hlsl::shapes::Line<float>::ArcLengthCalculator::construct(outlineLineSegments[i]);
-            LineStyleClipper clipper = LineStyleClipper::construct(outlineStyle, outlineLineSegments[i], arcLenCalc, phaseShift, stretch, globals.worldToScreenRatio);
-            distance = ClippedSignedDistance<nbl::hlsl::shapes::Line<float>, LineStyleClipper>::sdf(outlineLineSegments[i], fragPos, outlineThickness, outlineStyle.isRoadStyleFlag, clipper);
-
-            minDistance = min(minDistance, distance);
-        }
-    }
-
-    float4 outputColor;
-    outputColor.a = 1.0f - smoothstep(-globals.antiAliasingFactor, globals.antiAliasingFactor, minDistance);
-    outputColor.a *= outlineStyle.color.a;
-    outputColor.rgb = outlineStyle.color.rgb;
-
-    return outputColor;
-}
-
 [[vk::spvexecutionmode(spv::ExecutionModePixelInterlockOrderedEXT)]]
 [shader("pixel")]
 float4 fragMain(PSInput input) : SV_TARGET
@@ -511,18 +471,10 @@ float4 fragMain(PSInput input) : SV_TARGET
                 // TODO: remove when implementing height texture
                 [unroll]
                 for (uint i = 0; i < 3; ++i)
-                {
                     v[i].z = -20.0f + 5.0f * (v[i].x + v[i].y) / cellWidth;
 
-                    //if (abs(round(v[i].z) - 20.0f) <= 0.1f)
-                    //    v[i].z = asfloat(0x7FC00000);
-
-                }
-
                 if (isnan(v[0].z) || isnan(v[1].z) || isnan(v[2].z))
-                {
                     discard;
-                }
 
                 // move from grid space to screen space
                 [unroll]
@@ -550,9 +502,9 @@ float4 fragMain(PSInput input) : SV_TARGET
 
                 float patternCellCoord = distancesToVerticalCellSides >= distancesToHorizontalCellSides ? cellCoords.x : cellCoords.y;
 
-                // TODO: calculate pattern length!!!
-                float patternLength = 30.0f;
-                outlinePhaseShift = (cellWidth * (1.0f / globals.screenToWorldRatio) * patternCellCoord) / patternLength;
+                float reciprocalPatternLength = input.getGridDTMOutlineStipplePatternLengthReciprocal();
+                if(reciprocalPatternLength > 0.0f)
+                    outlinePhaseShift = (cellWidth * (1.0f / globals.screenToWorldRatio) * patternCellCoord) * reciprocalPatternLength;
             }
 
             const float3 baryCoord = dtm::calculateDTMTriangleBarycentrics(v[0], v[1], v[2], input.position.xy);
@@ -561,7 +513,7 @@ float4 fragMain(PSInput input) : SV_TARGET
 
             float4 dtmColor = float4(0.0f, 0.0f, 0.0f, 0.0f);
             if (dtmSettings.drawOutlineEnabled())
-                dtmColor = dtm::blendUnder(dtmColor, calculateGridDTMOutlineColor(dtmSettings.outlineLineStyleIdx, outlineLineSegments, input.position.xy, outlinePhaseShift));
+                dtmColor = dtm::blendUnder(dtmColor, dtm::calculateGridDTMOutlineColor(dtmSettings.outlineLineStyleIdx, outlineLineSegments, input.position.xy, outlinePhaseShift));
             if (dtmSettings.drawContourEnabled())
             {
                 for (uint32_t i = 0; i < dtmSettings.contourSettingsCount; ++i) // TODO: should reverse the order with blendUnder
diff --git a/62_CAD/shaders/main_pipeline/vertex_shader.hlsl b/62_CAD/shaders/main_pipeline/vertex_shader.hlsl
index 97d438fb2..1074cc265 100644
--- a/62_CAD/shaders/main_pipeline/vertex_shader.hlsl
+++ b/62_CAD/shaders/main_pipeline/vertex_shader.hlsl
@@ -652,6 +652,7 @@ PSInput main(uint vertexID : SV_VertexID)
             pfloat64_t width = vk::RawBufferLoad<pfloat64_t>(globals.pointers.geometryBuffer + drawObj.geometryAddress + sizeof(pfloat64_t2) + sizeof(pfloat64_t), 8u);
             uint32_t dtmSettingsID = vk::RawBufferLoad<uint32_t>(globals.pointers.geometryBuffer + drawObj.geometryAddress + sizeof(pfloat64_t2) + 2 * sizeof(pfloat64_t), 8u);
             float gridCellWidth = vk::RawBufferLoad<float>(globals.pointers.geometryBuffer + drawObj.geometryAddress + sizeof(pfloat64_t2) + 2 * sizeof(pfloat64_t) + sizeof(uint32_t), 8u);
+            float reciprocalOutlineStipplePatternLength = vk::RawBufferLoad<float>(globals.pointers.geometryBuffer + drawObj.geometryAddress + sizeof(pfloat64_t2) + 2 * sizeof(pfloat64_t) + sizeof(uint32_t) + sizeof(float), 8u);
 
             const float2 corner = float2(bool2(vertexIdx & 0x1u, vertexIdx >> 1));
             pfloat64_t2 vtxPos = topLeft;
@@ -671,6 +672,7 @@ PSInput main(uint vertexID : SV_VertexID)
             gridExtents.y = height;
             outV.setGridDTMScreenSpaceGridExtents(gridExtents * globals.screenToWorldRatio);
             outV.setImageUV(corner);
+            outV.setGridDTMOutlineStipplePatternLengthReciprocal(reciprocalOutlineStipplePatternLength);
         }
         else if (objType == ObjectType::STREAMED_IMAGE)
         {

From 43f4dd616c0f0b05ba58393ec138c28ad72a3a07 Mon Sep 17 00:00:00 2001
From: Przemek <minikers21@gmail.com>
Date: Fri, 23 May 2025 18:31:43 +0200
Subject: [PATCH 299/529] Added a todo comment

---
 62_CAD/shaders/main_pipeline/fragment_shader.hlsl | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/62_CAD/shaders/main_pipeline/fragment_shader.hlsl b/62_CAD/shaders/main_pipeline/fragment_shader.hlsl
index 7f72c5d2e..36fa3abf4 100644
--- a/62_CAD/shaders/main_pipeline/fragment_shader.hlsl
+++ b/62_CAD/shaders/main_pipeline/fragment_shader.hlsl
@@ -398,7 +398,7 @@ float4 fragMain(PSInput input) : SV_TARGET
                 // DONE: Make it so we can choose which diagonal to use to construct the triangle, it's either u=v or u=1-v
             // DONE: C. Height shading same as contours (split into two triangles)
 
-            // Heights can have invalid values (let's say NaN) if a cell corner has NaN value then no triangle (for contour and shading) and no outline should include that corner. (see DTM image in discord with gaps)
+            // DONE (but needs to be tested after i implement texture height maps) Heights can have invalid values (let's say NaN) if a cell corner has NaN value then no triangle (for contour and shading) and no outline should include that corner. (see DTM image in discord with gaps)
             
             // TODO: we need to emulate dilation and do sdf of neighbouring cells as well. because contours, outlines and shading can bleed into other cells for AA.
             // [NOTE] Do dilation as last step, when everything else works fine

From 81a5c4356aa0c36a5d776db77a37bb0b346396c5 Mon Sep 17 00:00:00 2001
From: Fletterio <fj.letterio@gmail.com>
Date: Fri, 23 May 2025 23:22:44 -0300
Subject: [PATCH 300/529] Add test for cache copy

---
 21_LRUCacheUnitTest/main.cpp | 12 ++++++++++--
 1 file changed, 10 insertions(+), 2 deletions(-)

diff --git a/21_LRUCacheUnitTest/main.cpp b/21_LRUCacheUnitTest/main.cpp
index 1e7830b16..467c6d4e4 100644
--- a/21_LRUCacheUnitTest/main.cpp
+++ b/21_LRUCacheUnitTest/main.cpp
@@ -203,6 +203,16 @@ class LRUCacheTestApp final : public nbl::application_templates::MonoSystemMonoL
 				counter++;
 			}
 
+			// Cache copy test
+			ResizableLRUCache<uint32_t, uint32_t> cache4Copy(cache4);
+			for (auto it = cache4.cbegin(), itCopy = cache4Copy.cbegin(); it != cache4.cend(); it++, itCopy++)
+			{
+				assert(*it == *itCopy);
+				// Assert deep copy
+				assert(it.operator->() != itCopy.operator->());
+
+			}
+
 			// Besides the disposal function that gets called when evicting, we need to check that the Cache properly destroys all resident `Key,Value` pairs when destroyed
 			struct Foo
 			{
@@ -236,10 +246,8 @@ class LRUCacheTestApp final : public nbl::application_templates::MonoSystemMonoL
 					cache5.insert(i, Foo(&destroyCounter));
 				int x = 0;
 			}
-			
 			assert(destroyCounter == 10);
 
-
 			m_logger->log("all good");
 
 			m_textureLRUCache = std::unique_ptr<TextureLRUCache>(new TextureLRUCache(1024u));

From 6999e3674052eb3dcb6a29e9d651c862b4c4d215 Mon Sep 17 00:00:00 2001
From: Erfan Ahmadi <ahmadierfan99@gmail.com>
Date: Sat, 24 May 2025 18:47:18 +0400
Subject: [PATCH 301/529] ensure multiple static images availability

---
 62_CAD/DrawResourcesFiller.cpp | 20 +++++++++++++++++++-
 62_CAD/DrawResourcesFiller.h   | 25 +++++++++++++++++++++----
 62_CAD/main.cpp                |  2 +-
 3 files changed, 41 insertions(+), 6 deletions(-)

diff --git a/62_CAD/DrawResourcesFiller.cpp b/62_CAD/DrawResourcesFiller.cpp
index 483545b2c..7ece534cf 100644
--- a/62_CAD/DrawResourcesFiller.cpp
+++ b/62_CAD/DrawResourcesFiller.cpp
@@ -360,8 +360,11 @@ void DrawResourcesFiller::drawFontGlyph(
 	}
 }
 
-bool DrawResourcesFiller::ensureStaticImageAvailability(image_id imageID, const core::smart_refctd_ptr<ICPUImage>& cpuImage, SIntendedSubmitInfo& intendedNextSubmit)
+bool DrawResourcesFiller::ensureStaticImageAvailability(const StaticImageInfo& staticImage, SIntendedSubmitInfo& intendedNextSubmit)
 {
+	const auto& imageID = staticImage.imageID;
+	const auto& cpuImage = staticImage.cpuImage;
+	
 	// Try inserting or updating the image usage in the cache.
 	// If the image is already present, updates its semaphore value.
 	auto evictCallback = [&](image_id imageID, const CachedImageRecord& evicted) { evictImage_SubmitIfNeeded(imageID, evicted, intendedNextSubmit); };
@@ -447,6 +450,21 @@ bool DrawResourcesFiller::ensureStaticImageAvailability(image_id imageID, const
 	return cachedImageRecord->arrayIndex != InvalidTextureIndex;
 }
 
+bool DrawResourcesFiller::ensureMultipleStaticImagesAvailability(std::span<StaticImageInfo> staticImages, SIntendedSubmitInfo& intendedNextSubmit)
+{
+	for (auto& staticImage : staticImages)
+	{
+		if (!ensureStaticImageAvailability(staticImage, intendedNextSubmit))
+			return false; // failed ensuring a single staticImage is available, shouldn't happen unless the image is larger than the memory arena allocated for images.
+	}
+	for (auto& staticImage : staticImages)
+	{
+		if (imagesCache->peek(staticImage.imageID) == nullptr)
+			return false; // this means one of the images evicted another, most likely due to VRAM limitations not all images can be resident all at once.
+	}
+	return true;
+}
+
 bool DrawResourcesFiller::ensureGeoreferencedImageAvailability_AllocateIfNeeded(image_id imageID, const GeoreferencedImageParams& params, SIntendedSubmitInfo& intendedNextSubmit)
 {
 	auto* device = m_utilities->getLogicalDevice();
diff --git a/62_CAD/DrawResourcesFiller.h b/62_CAD/DrawResourcesFiller.h
index 0b7a8cf01..18c09f83e 100644
--- a/62_CAD/DrawResourcesFiller.h
+++ b/62_CAD/DrawResourcesFiller.h
@@ -216,6 +216,12 @@ struct DrawResourcesFiller
 		const DTMSettingsInfo& dtmSettingsInfo,
 		SIntendedSubmitInfo& intendedNextSubmit);
 	
+	struct StaticImageInfo
+	{
+		image_id imageID;
+		core::smart_refctd_ptr<ICPUImage> cpuImage;
+	};
+
 	/**
 	 * @brief Adds a static 2D image to the draw resource set for rendering.
 	 *
@@ -230,8 +236,7 @@ struct DrawResourcesFiller
 	 *   - Queues the image for uploading via staging in the next submit.
 	 *   - If memory is constrained, attempts to evict other images to free up space.
 	 *
-	 * @param imageID              Unique identifier for the image resource.
-	 * @param cpuImage             The CPU-side image resource to (possibly) upload.
+	 * @param staticImage              Unique identifier for the image resource plus the CPU-side image resource to (possibly) upload.
 	 * @param intendedNextSubmit   Struct representing the upcoming submission, including a semaphore for safe scheduling.
 	 *
 	 * @note This function ensures that the descriptor slot is not reused while the GPU may still be reading from it.
@@ -240,9 +245,21 @@ struct DrawResourcesFiller
 	 *
 	 * @note The function uses the `imagesCache` LRU cache to track usage and validity of texture slots.
 	 *       If an insertion leads to an eviction, a callback ensures proper deallocation and synchronization.
-	 * @return true if the image was successfully cached and is ready for use; false if allocation failed.
+	 * @return true if the image was successfully cached and is ready for use; false if allocation failed most likely due to the image being larger than the memory arena allocated for all images.
 	*/
-	bool ensureStaticImageAvailability(image_id imageID, const core::smart_refctd_ptr<ICPUImage>& cpuImage, SIntendedSubmitInfo& intendedNextSubmit);
+	bool ensureStaticImageAvailability(const StaticImageInfo& staticImage, SIntendedSubmitInfo& intendedNextSubmit);
+	
+	/**
+	 * @brief Adds multiple static 2D image to the draw resource set for rendering.
+	 * 
+	 * This function should theoratically succeed if the size of staticImages is less that max descriptor slots and more importantly if all of the images can fit in the images memory arena (using the GeneralPurposeAddressAllocatoe)
+	 *		There is a low chance that failure might be due to fragmentation of images memory allocator (GPAA), in which case clearing the cache and retrying MIGHT work.
+	 * 
+	 * @return true if all of them are successfully cache and available for rendering
+	 * @return false if the images couldn't be resident all at once. // TODO: maybe return something about which ones are available.
+	 */
+	bool ensureStaticImagesAvailability(std::span<StaticImageInfo> staticImages, SIntendedSubmitInfo& intendedNextSubmit);
+
 
 	/**
 	 * @brief Ensures a GPU-resident georeferenced image exists in the cache, allocating resources if necessary.
diff --git a/62_CAD/main.cpp b/62_CAD/main.cpp
index 5b13520ba..c5123473b 100644
--- a/62_CAD/main.cpp
+++ b/62_CAD/main.cpp
@@ -3059,7 +3059,7 @@ class ComputerAidedDesign final : public examples::SimpleWindowedApplication, pu
 			{
 				uint64_t imageID = i * 69ull; // it can be hash or something of the file path the image was loaded from
 				//printf(std::format("\n Image {} \n", i).c_str());
-				drawResourcesFiller.ensureStaticImageAvailability(imageID, sampleImages[i], intendedNextSubmit);
+				drawResourcesFiller.ensureStaticImageAvailability({ imageID, sampleImages[i] }, intendedNextSubmit);
 				drawResourcesFiller.addImageObject(imageID, { .topLeft = { 0.0 + (i) * 3.0, 0.0 }, .dirU = { 3.0 , 0.0 }, .aspectRatio = 1.0 }, intendedNextSubmit);
 				//printf("\n");
 			}

From 48864b79128cfbd8bfe2989287e726b9fe070063 Mon Sep 17 00:00:00 2001
From: Erfan Ahmadi <ahmadierfan99@gmail.com>
Date: Sat, 24 May 2025 18:48:03 +0400
Subject: [PATCH 302/529] forgot to push function rename

---
 62_CAD/DrawResourcesFiller.h | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/62_CAD/DrawResourcesFiller.h b/62_CAD/DrawResourcesFiller.h
index 18c09f83e..ba8cb80de 100644
--- a/62_CAD/DrawResourcesFiller.h
+++ b/62_CAD/DrawResourcesFiller.h
@@ -258,7 +258,7 @@ struct DrawResourcesFiller
 	 * @return true if all of them are successfully cache and available for rendering
 	 * @return false if the images couldn't be resident all at once. // TODO: maybe return something about which ones are available.
 	 */
-	bool ensureStaticImagesAvailability(std::span<StaticImageInfo> staticImages, SIntendedSubmitInfo& intendedNextSubmit);
+	bool ensureMultipleStaticImagesAvailability(std::span<StaticImageInfo> staticImages, SIntendedSubmitInfo& intendedNextSubmit);
 
 
 	/**

From 96cc801291537b119e053342cf42e29ec1c0fd26 Mon Sep 17 00:00:00 2001
From: Erfan Ahmadi <ahmadierfan99@gmail.com>
Date: Sat, 24 May 2025 18:52:47 +0400
Subject: [PATCH 303/529] documentation for
 `ensureMultipleStaticImagesAvailability`

---
 62_CAD/DrawResourcesFiller.cpp |  3 +++
 62_CAD/DrawResourcesFiller.h   | 29 ++++++++++++++++++++---------
 2 files changed, 23 insertions(+), 9 deletions(-)

diff --git a/62_CAD/DrawResourcesFiller.cpp b/62_CAD/DrawResourcesFiller.cpp
index 7ece534cf..9fa24e1a0 100644
--- a/62_CAD/DrawResourcesFiller.cpp
+++ b/62_CAD/DrawResourcesFiller.cpp
@@ -452,6 +452,9 @@ bool DrawResourcesFiller::ensureStaticImageAvailability(const StaticImageInfo& s
 
 bool DrawResourcesFiller::ensureMultipleStaticImagesAvailability(std::span<StaticImageInfo> staticImages, SIntendedSubmitInfo& intendedNextSubmit)
 {
+	if (staticImages.size() > ImagesBindingArraySize)
+		return false;
+
 	for (auto& staticImage : staticImages)
 	{
 		if (!ensureStaticImageAvailability(staticImage, intendedNextSubmit))
diff --git a/62_CAD/DrawResourcesFiller.h b/62_CAD/DrawResourcesFiller.h
index ba8cb80de..04bc08df3 100644
--- a/62_CAD/DrawResourcesFiller.h
+++ b/62_CAD/DrawResourcesFiller.h
@@ -246,21 +246,32 @@ struct DrawResourcesFiller
 	 * @note The function uses the `imagesCache` LRU cache to track usage and validity of texture slots.
 	 *       If an insertion leads to an eviction, a callback ensures proper deallocation and synchronization.
 	 * @return true if the image was successfully cached and is ready for use; false if allocation failed most likely due to the image being larger than the memory arena allocated for all images.
-	*/
+	 */
 	bool ensureStaticImageAvailability(const StaticImageInfo& staticImage, SIntendedSubmitInfo& intendedNextSubmit);
 	
 	/**
-	 * @brief Adds multiple static 2D image to the draw resource set for rendering.
-	 * 
-	 * This function should theoratically succeed if the size of staticImages is less that max descriptor slots and more importantly if all of the images can fit in the images memory arena (using the GeneralPurposeAddressAllocatoe)
-	 *		There is a low chance that failure might be due to fragmentation of images memory allocator (GPAA), in which case clearing the cache and retrying MIGHT work.
-	 * 
-	 * @return true if all of them are successfully cache and available for rendering
-	 * @return false if the images couldn't be resident all at once. // TODO: maybe return something about which ones are available.
+	 * @brief Ensures that multiple static 2D images are resident and ready for rendering.
+	 *
+	 * Attempts to make all provided static images GPU-resident by calling `ensureStaticImageAvailability`
+	 * for each. Afterward, it verifies that none of the newly ensured images have been evicted,
+	 * which could happen due to limited VRAM or memory fragmentation.
+	 *
+	 * This function is expected to succeed if:
+	 * - The number of images does not exceed `ImagesBindingArraySize`.
+	 * - Each image individually fits into the image memory arena.
+	 * - There is enough VRAM to hold all images simultaneously.
+	 *
+	 * @param staticImages A span of StaticImageInfo structures describing the images to be ensured.
+	 * @param intendedNextSubmit Struct representing the upcoming submission, including a semaphore for safe scheduling.
+	 *
+	 * @return true If all images were successfully made resident and none were evicted during the process.
+	 * @return false If:
+	 *   - The number of images exceeds the descriptor binding array size.
+	 *   - Any individual image could not be made resident (e.g., larger than the allocator can support).
+	 *   - Some images were evicted due to VRAM pressure or allocator fragmentation, in which case Clearing the image cache and retrying MIGHT be a success (TODO: handle internally)
 	 */
 	bool ensureMultipleStaticImagesAvailability(std::span<StaticImageInfo> staticImages, SIntendedSubmitInfo& intendedNextSubmit);
 
-
 	/**
 	 * @brief Ensures a GPU-resident georeferenced image exists in the cache, allocating resources if necessary.
 	 * 

From fae6490244a69223321488214a4606cd4c5044d1 Mon Sep 17 00:00:00 2001
From: devsh <devsh@devsh.eu>
Date: Sun, 25 May 2025 19:46:53 +0200
Subject: [PATCH 304/529] get old Acceleration Structure code workin in ex 71
 after API change

---
 71_RayTracingPipeline/main.cpp | 14 +++++++-------
 1 file changed, 7 insertions(+), 7 deletions(-)

diff --git a/71_RayTracingPipeline/main.cpp b/71_RayTracingPipeline/main.cpp
index faa392a46..54a37a3d3 100644
--- a/71_RayTracingPipeline/main.cpp
+++ b/71_RayTracingPipeline/main.cpp
@@ -6,7 +6,7 @@
 #include "nbl/ext/FullScreenTriangle/FullScreenTriangle.h"
 #include "nbl/builtin/hlsl/indirect_commands.hlsl"
 
-#define TEST_ASSET_CONV_AS
+//#define TEST_ASSET_CONV_AS
 
 class RaytracingPipelineApp final : public examples::SimpleWindowedApplication, public application_templates::MonoAssetManagerAndBuiltinResourceApplication
 {
@@ -1955,9 +1955,9 @@ class RaytracingPipelineApp final : public examples::SimpleWindowedApplication,
 		// build bottom level ASes
 		{
 			core::vector<uint32_t> primitiveCounts(blasCount);
-			core::vector<IGPUBottomLevelAccelerationStructure::Triangles<const IGPUBuffer>> triangles(m_gpuTriangleGeometries.size());
+			core::vector<IGPUBottomLevelAccelerationStructure::Triangles<IGPUBuffer>> triangles(m_gpuTriangleGeometries.size());
 			core::vector<uint32_t> scratchSizes(blasCount);
-			IGPUBottomLevelAccelerationStructure::AABBs<const IGPUBuffer> aabbs;
+			IGPUBottomLevelAccelerationStructure::AABBs<IGPUBuffer> aabbs;
 
 			auto blasFlags = bitflag(IGPUBottomLevelAccelerationStructure::BUILD_FLAGS::PREFER_FAST_TRACE_BIT) | IGPUBottomLevelAccelerationStructure::BUILD_FLAGS::ALLOW_COMPACTION_BIT;
 			if (m_physicalDevice->getProperties().limits.rayTracingPositionFetch)
@@ -2017,12 +2017,12 @@ class RaytracingPipelineApp final : public examples::SimpleWindowedApplication,
 					if (isProcedural)
 					{
 						const auto* aabbData = &aabbs;
-						buildSizes = m_device->getAccelerationStructureBuildSizes(blasBuildInfos[i].buildFlags, false, std::span{ aabbData, 1 }, maxPrimCount);
+						buildSizes = m_device->getAccelerationStructureBuildSizes(false, blasBuildInfos[i].buildFlags, false, std::span{ aabbData, 1 }, maxPrimCount);
 					}
 					else
 					{
 						const auto* trianglesData = triangles.data();
-						buildSizes = m_device->getAccelerationStructureBuildSizes(blasBuildInfos[i].buildFlags, false, std::span{ trianglesData,1 }, maxPrimCount);
+						buildSizes = m_device->getAccelerationStructureBuildSizes(false, blasBuildInfos[i].buildFlags, false, std::span{ trianglesData,1 }, maxPrimCount);
 					}
 					if (!buildSizes)
 						return logFail("Failed to get BLAS build sizes");
@@ -2144,8 +2144,8 @@ class RaytracingPipelineApp final : public examples::SimpleWindowedApplication,
 				IGPUBottomLevelAccelerationStructure::CopyInfo copyInfo;
 				copyInfo.src = cleanupBlas[i].get();
 				copyInfo.dst = m_gpuBlasList[i].get();
-				copyInfo.mode = IGPUBottomLevelAccelerationStructure::COPY_MODE::COMPACT;
-				if (!cmdbufCompact->copyAccelerationStructure(copyInfo))
+				copyInfo.compact = true;
+				if (!cmdbufCompact->copyAccelerationStructure<IGPUBottomLevelAccelerationStructure>(copyInfo))
 					return logFail("Failed to copy AS to compact");
 			}
 		}

From c8d4b607a281dfd04e7554253499f88ca055b087 Mon Sep 17 00:00:00 2001
From: devsh <devsh@devsh.eu>
Date: Mon, 26 May 2025 09:44:06 +0200
Subject: [PATCH 305/529] the multi-queue was badly implemented, just use
 Graphics & Compute throughout

---
 71_RayTracingPipeline/main.cpp | 754 +--------------------------------
 1 file changed, 18 insertions(+), 736 deletions(-)

diff --git a/71_RayTracingPipeline/main.cpp b/71_RayTracingPipeline/main.cpp
index 54a37a3d3..0b6b4d724 100644
--- a/71_RayTracingPipeline/main.cpp
+++ b/71_RayTracingPipeline/main.cpp
@@ -6,7 +6,6 @@
 #include "nbl/ext/FullScreenTriangle/FullScreenTriangle.h"
 #include "nbl/builtin/hlsl/indirect_commands.hlsl"
 
-//#define TEST_ASSET_CONV_AS
 
 class RaytracingPipelineApp final : public examples::SimpleWindowedApplication, public application_templates::MonoAssetManagerAndBuiltinResourceApplication
 {
@@ -25,8 +24,6 @@ class RaytracingPipelineApp final : public examples::SimpleWindowedApplication,
 	  "Spot"
 	};
 
-	constexpr static inline clock_t::duration DisplayImageDuration = std::chrono::milliseconds(900);
-
 	struct ShaderBindingTable
 	{
 		SBufferRange<IGPUBuffer> raygenGroupRange;
@@ -93,7 +90,7 @@ class RaytracingPipelineApp final : public examples::SimpleWindowedApplication,
 	inline core::vector<queue_req_t> getQueueRequirements() const override
 	{
 		auto reqs = device_base_t::getQueueRequirements();
-		reqs.front().requiredFlags |= IQueue::FAMILY_FLAGS::TRANSFER_BIT;
+		reqs.front().requiredFlags |= IQueue::FAMILY_FLAGS::COMPUTE_BIT;
 		return reqs;
 	}
 
@@ -405,7 +402,7 @@ class RaytracingPipelineApp final : public examples::SimpleWindowedApplication,
 
 			calculateRayTracingStackSize(m_rayTracingPipeline);
 
-			if (!createShaderBindingTable(gQueue, m_rayTracingPipeline))
+			if (!createShaderBindingTable(m_rayTracingPipeline))
 				return logFail("Could not create shader binding table");
 
 		}
@@ -413,20 +410,11 @@ class RaytracingPipelineApp final : public examples::SimpleWindowedApplication,
 		auto assetManager = make_smart_refctd_ptr<nbl::asset::IAssetManager>(smart_refctd_ptr(system));
 		auto* geometryCreator = assetManager->getGeometryCreator();
 
-		if (!createIndirectBuffer(gQueue))
+		if (!createIndirectBuffer())
 			return logFail("Could not create indirect buffer");
 
-#ifdef TEST_ASSET_CONV_AS
-		if (!createAccelerationStructuresFromGeometry(getComputeQueue(), geometryCreator))
+		if (!createAccelerationStructuresFromGeometry(geometryCreator))
 			return logFail("Could not create acceleration structures from geometry creator");
-#else
-		// create geometry objects
-		if (!createGeometries(gQueue, geometryCreator))
-			return logFail("Could not create geometries from geometry creator");
-
-		if (!createAccelerationStructures(getComputeQueue()))
-			return logFail("Could not create acceleration structures");
-#endif // TEST_ASSET_CONV_AS
 
 		ISampler::SParams samplerParams = {
 		  .AnisotropicFilter = 0
@@ -521,7 +509,7 @@ class RaytracingPipelineApp final : public examples::SimpleWindowedApplication,
 			params.renderpass = smart_refctd_ptr<IGPURenderpass>(renderpass);
 			params.streamingBuffer = nullptr;
 			params.subpassIx = 0u;
-			params.transfer = getTransferUpQueue();
+			params.transfer = getGraphicsQueue();
 			params.utilities = m_utils;
 			{
 				m_ui.manager = ext::imgui::UI::create(std::move(params));
@@ -988,77 +976,7 @@ class RaytracingPipelineApp final : public examples::SimpleWindowedApplication,
 		return (dim + size - 1) / size;
 	}
 
-	smart_refctd_ptr<IGPUBuffer> createBuffer(IGPUBuffer::SCreationParams& params)
-	{
-		smart_refctd_ptr<IGPUBuffer> buffer;
-		buffer = m_device->createBuffer(std::move(params));
-		auto bufReqs = buffer->getMemoryReqs();
-		bufReqs.memoryTypeBits &= m_physicalDevice->getDeviceLocalMemoryTypeBits();
-		m_device->allocate(bufReqs, buffer.get(), IDeviceMemoryAllocation::EMAF_DEVICE_ADDRESS_BIT);
-
-		return buffer;
-	}
-
-	smart_refctd_ptr<IGPUCommandBuffer> getSingleUseCommandBufferAndBegin(smart_refctd_ptr<IGPUCommandPool> pool)
-	{
-		smart_refctd_ptr<IGPUCommandBuffer> cmdbuf;
-		if (!pool->createCommandBuffers(IGPUCommandPool::BUFFER_LEVEL::PRIMARY, 1u, &cmdbuf))
-			return nullptr;
-
-		cmdbuf->reset(IGPUCommandBuffer::RESET_FLAGS::RELEASE_RESOURCES_BIT);
-		cmdbuf->begin(IGPUCommandBuffer::USAGE::ONE_TIME_SUBMIT_BIT);
-
-		return cmdbuf;
-	}
-
-	void cmdbufSubmitAndWait(smart_refctd_ptr<IGPUCommandBuffer> cmdbuf, CThreadSafeQueueAdapter* queue, uint64_t startValue)
-	{
-		cmdbuf->end();
-
-		uint64_t finishedValue = startValue + 1;
-
-		// submit builds
-		{
-			auto completed = m_device->createSemaphore(startValue);
-
-			std::array<IQueue::SSubmitInfo::SSemaphoreInfo, 1u> signals;
-			{
-				auto& signal = signals.front();
-				signal.value = finishedValue;
-				signal.stageMask = bitflag(PIPELINE_STAGE_FLAGS::ALL_TRANSFER_BITS);
-				signal.semaphore = completed.get();
-			}
-
-			const IQueue::SSubmitInfo::SCommandBufferInfo commandBuffers[1] = { {
-			  .cmdbuf = cmdbuf.get()
-			} };
-
-			const IQueue::SSubmitInfo infos[] =
-			{
-			  {
-				.waitSemaphores = {},
-				.commandBuffers = commandBuffers,
-				.signalSemaphores = signals
-			  }
-			};
-
-			if (queue->submit(infos) != IQueue::RESULT::SUCCESS)
-			{
-				m_logger->log("Failed to submit geometry transfer upload operations!", ILogger::ELL_ERROR);
-				return;
-			}
-
-			const ISemaphore::SWaitInfo info[] =
-			{ {
-			  .semaphore = completed.get(),
-			  .value = finishedValue
-			} };
-
-			m_device->blockForSemaphores(info);
-		}
-	}
-
-	bool createIndirectBuffer(video::CThreadSafeQueueAdapter* queue)
+	bool createIndirectBuffer()
 	{
 		const auto getBufferRangeAddress = [](const SBufferRange<IGPUBuffer>& range)
 			{
@@ -1083,7 +1001,7 @@ class RaytracingPipelineApp final : public examples::SimpleWindowedApplication,
 		IGPUBuffer::SCreationParams params;
 		params.usage = IGPUBuffer::EUF_TRANSFER_DST_BIT | IGPUBuffer::EUF_INDIRECT_BUFFER_BIT | IGPUBuffer::EUF_SHADER_DEVICE_ADDRESS_BIT;
 		params.size = sizeof(TraceRaysIndirectCommand_t);
-		m_utils->createFilledDeviceLocalBufferOnDedMem(SIntendedSubmitInfo{ .queue = queue }, std::move(params), &command).move_into(m_indirectBuffer);
+		m_utils->createFilledDeviceLocalBufferOnDedMem(SIntendedSubmitInfo{ .queue = getGraphicsQueue() }, std::move(params), &command).move_into(m_indirectBuffer);
 		return true;
 	}
 
@@ -1110,7 +1028,7 @@ class RaytracingPipelineApp final : public examples::SimpleWindowedApplication,
 		m_rayTracingStackSize = raygenStackSize + std::max(firstDepthStackSizeMax, callableStackMax);
 	}
 
-	bool createShaderBindingTable(video::CThreadSafeQueueAdapter* queue, const smart_refctd_ptr<video::IGPURayTracingPipeline>& pipeline)
+	bool createShaderBindingTable(const smart_refctd_ptr<video::IGPURayTracingPipeline>& pipeline)
 	{
 		const auto& limits = m_device->getPhysicalDevice()->getLimits();
 		const auto handleSize = SPhysicalDeviceLimits::ShaderGroupHandleSize;
@@ -1188,7 +1106,7 @@ class RaytracingPipelineApp final : public examples::SimpleWindowedApplication,
 			IGPUBuffer::SCreationParams params;
 			params.usage = IGPUBuffer::EUF_TRANSFER_DST_BIT | IGPUBuffer::EUF_INLINE_UPDATE_VIA_CMDBUF | IGPUBuffer::EUF_SHADER_DEVICE_ADDRESS_BIT | IGPUBuffer::EUF_SHADER_BINDING_TABLE_BIT;
 			params.size = bufferSize;
-			m_utils->createFilledDeviceLocalBufferOnDedMem(SIntendedSubmitInfo{ .queue = queue }, std::move(params), pData).move_into(raygenRange.buffer);
+			m_utils->createFilledDeviceLocalBufferOnDedMem(SIntendedSubmitInfo{ .queue = getGraphicsQueue() }, std::move(params), pData).move_into(raygenRange.buffer);
 			missRange.buffer = core::smart_refctd_ptr(raygenRange.buffer);
 			hitRange.buffer = core::smart_refctd_ptr(raygenRange.buffer);
 			callableRange.buffer = core::smart_refctd_ptr(raygenRange.buffer);
@@ -1197,9 +1115,9 @@ class RaytracingPipelineApp final : public examples::SimpleWindowedApplication,
 		return true;
 	}
 
-#ifdef TEST_ASSET_CONV_AS
-	bool createAccelerationStructuresFromGeometry(video::CThreadSafeQueueAdapter* queue, const IGeometryCreator* gc)
+	bool createAccelerationStructuresFromGeometry(const IGeometryCreator* gc)
 	{
+		auto queue = getGraphicsQueue();
 		// get geometries into ICPUBuffers
 		auto pool = m_device->createCommandPool(queue->getFamilyIndex(), IGPUCommandPool::CREATE_FLAGS::RESET_COMMAND_BUFFER_BIT);
 		if (!pool)
@@ -1431,23 +1349,23 @@ class RaytracingPipelineApp final : public examples::SimpleWindowedApplication,
 		cpuTlas->setInstances(std::move(geomInstances));
 		cpuTlas->setBuildFlags(IGPUTopLevelAccelerationStructure::BUILD_FLAGS::PREFER_FAST_TRACE_BIT);
 
-//#define TEST_REBAR_FALLBACK
 		// convert with asset converter
 		smart_refctd_ptr<CAssetConverter> converter = CAssetConverter::create({ .device = m_device.get(), .optimizer = {} });
 		struct MyInputs : CAssetConverter::SInputs
 		{
-#ifndef TEST_REBAR_FALLBACK
+			// For the GPU Buffers to be directly writeable and so that we don't need a Transfer Queue submit at all
 			inline uint32_t constrainMemoryTypeBits(const size_t groupCopyID, const IAsset* canonicalAsset, const blake3_hash_t& contentHash, const IDeviceMemoryBacked* memoryBacked) const override
 			{
 				assert(memoryBacked);
 				return memoryBacked->getObjectType() != IDeviceMemoryBacked::EOT_BUFFER ? (~0u) : rebarMemoryTypes;
 			}
-#endif
+
 			uint32_t rebarMemoryTypes;
 		} inputs = {};
 		inputs.logger = m_logger.get();
 		inputs.rebarMemoryTypes = m_physicalDevice->getDirectVRAMAccessMemoryTypeBits();
-#ifndef TEST_REBAR_FALLBACK
+		// the allocator needs to be overriden to hand out memory ranges which have already been mapped so that the ReBAR fast-path can kick in
+		// (multiple buffers can be bound to same memory, but memory can only be mapped once at one place, so Asset Converter can't do it)
 		struct MyAllocator final : public IDeviceMemoryAllocator
 		{
 			ILogicalDevice* getDeviceForAllocations() const override { return device; }
@@ -1465,7 +1383,6 @@ class RaytracingPipelineApp final : public examples::SimpleWindowedApplication,
 		} myalloc;
 		myalloc.device = m_device.get();
 		inputs.allocator = &myalloc;
-#endif
 
 		std::array<ICPUTopLevelAccelerationStructure*, 1u> tmpTlas;
 		std::array<ICPUBuffer*, 2 * std::size(cpuObjects) + 1u> tmpBuffers;
@@ -1510,31 +1427,11 @@ class RaytracingPipelineApp final : public examples::SimpleWindowedApplication,
 			prepass.template operator() < ICPUBuffer > (tmpBuffers);
 		}
 
-		constexpr auto XferBufferCount = 2;
-		std::array<smart_refctd_ptr<IGPUCommandBuffer>, XferBufferCount> xferBufs = {};
-		std::array<IQueue::SSubmitInfo::SCommandBufferInfo, XferBufferCount> xferBufInfos = {};
-		{
-			auto pool = m_device->createCommandPool(getTransferUpQueue()->getFamilyIndex(), IGPUCommandPool::CREATE_FLAGS::RESET_COMMAND_BUFFER_BIT | IGPUCommandPool::CREATE_FLAGS::TRANSIENT_BIT);
-			pool->createCommandBuffers(IGPUCommandPool::BUFFER_LEVEL::PRIMARY, xferBufs);
-			xferBufs.front()->begin(IGPUCommandBuffer::USAGE::ONE_TIME_SUBMIT_BIT);
-			for (auto i = 0; i < XferBufferCount; i++)
-				xferBufInfos[i].cmdbuf = xferBufs[i].get();
-		}
-		auto xferSema = m_device->createSemaphore(0u);
-		SIntendedSubmitInfo transfer = {};
-		transfer.queue = getTransferUpQueue();
-		transfer.scratchCommandBuffers = xferBufInfos;
-		transfer.scratchSemaphore = {
-			.semaphore = xferSema.get(),
-			.value = 0u,
-			.stageMask = PIPELINE_STAGE_FLAGS::ALL_TRANSFER_BITS
-		};
-
 		constexpr auto CompBufferCount = 2;
 		std::array<smart_refctd_ptr<IGPUCommandBuffer>, CompBufferCount> compBufs = {};
 		std::array<IQueue::SSubmitInfo::SCommandBufferInfo, CompBufferCount> compBufInfos = {};
 		{
-			auto pool = m_device->createCommandPool(getComputeQueue()->getFamilyIndex(), IGPUCommandPool::CREATE_FLAGS::RESET_COMMAND_BUFFER_BIT | IGPUCommandPool::CREATE_FLAGS::TRANSIENT_BIT);
+			auto pool = m_device->createCommandPool(queue->getFamilyIndex(), IGPUCommandPool::CREATE_FLAGS::RESET_COMMAND_BUFFER_BIT | IGPUCommandPool::CREATE_FLAGS::TRANSIENT_BIT);
 			pool->createCommandBuffers(IGPUCommandPool::BUFFER_LEVEL::PRIMARY, compBufs);
 			compBufs.front()->begin(IGPUCommandBuffer::USAGE::ONE_TIME_SUBMIT_BIT);
 			for (auto i = 0; i < CompBufferCount; i++)
@@ -1542,7 +1439,7 @@ class RaytracingPipelineApp final : public examples::SimpleWindowedApplication,
 		}
 		auto compSema = m_device->createSemaphore(0u);
 		SIntendedSubmitInfo compute = {};
-		compute.queue = getComputeQueue();
+		compute.queue = queue;
 		compute.scratchCommandBuffers = compBufInfos;
 		compute.scratchSemaphore = {
 			.semaphore = compSema.get(),
@@ -1561,24 +1458,13 @@ class RaytracingPipelineApp final : public examples::SimpleWindowedApplication,
 				IGPUBuffer::SCreationParams creationParams = {};
 				creationParams.size = scratchSize;
 				creationParams.usage = IGPUBuffer::EUF_ACCELERATION_STRUCTURE_BUILD_INPUT_READ_ONLY_BIT | IGPUBuffer::EUF_SHADER_DEVICE_ADDRESS_BIT | IGPUBuffer::EUF_STORAGE_BUFFER_BIT;
-#ifdef TEST_REBAR_FALLBACK
-				creationParams.usage |= IGPUBuffer::EUF_TRANSFER_DST_BIT;
-				core::unordered_set<uint32_t> sharingSet = { compute.queue->getFamilyIndex(),transfer.queue->getFamilyIndex() };
-				core::vector<uint32_t> sharingIndices(sharingSet.begin(), sharingSet.end());
-				if (sharingIndices.size() > 1)
-					creationParams.queueFamilyIndexCount = sharingIndices.size();
-				creationParams.queueFamilyIndices = sharingIndices.data();
-#endif
 				auto scratchBuffer = m_device->createBuffer(std::move(creationParams));
 
 				auto reqs = scratchBuffer->getMemoryReqs();
-#ifndef TEST_REBAR_FALLBACK
 				reqs.memoryTypeBits &= m_physicalDevice->getDirectVRAMAccessMemoryTypeBits();
-#endif
+
 				auto allocation = m_device->allocate(reqs, scratchBuffer.get(), IDeviceMemoryAllocation::EMAF_DEVICE_ADDRESS_BIT);
-#ifndef TEST_REBAR_FALLBACK
 				allocation.memory->map({ .offset = 0,.length = reqs.size });
-#endif
 
 				scratchAlloc = make_smart_refctd_ptr<CAssetConverter::SConvertParams::scratch_for_device_AS_build_t>(
 					SBufferRange<video::IGPUBuffer>{0ull, scratchSize, std::move(scratchBuffer)},
@@ -1599,9 +1485,7 @@ class RaytracingPipelineApp final : public examples::SimpleWindowedApplication,
 
 				uint8_t finalUser;
 			} params = {};
-#undef TEST_REBAR_FALLBACK
 			params.utilities = m_utils.get();
-			params.transfer = &transfer;
 			params.compute = &compute;
 			params.scratchForDeviceASBuild = scratchAlloc.get();
 			params.finalUser = queue->getFamilyIndex();
@@ -1661,608 +1545,7 @@ class RaytracingPipelineApp final : public examples::SimpleWindowedApplication,
 
 		return true;
 	}
-#else
-	bool createGeometries(video::CThreadSafeQueueAdapter* queue, const IGeometryCreator* gc)
-	{
-		auto pool = m_device->createCommandPool(queue->getFamilyIndex(), IGPUCommandPool::CREATE_FLAGS::RESET_COMMAND_BUFFER_BIT);
-		if (!pool)
-			return logFail("Couldn't create Command Pool for geometry creation!");
-
-		const auto defaultMaterial = Material{
-		  .ambient = {0.2, 0.1, 0.1},
-		  .diffuse = {0.8, 0.3, 0.3},
-		  .specular = {0.8, 0.8, 0.8},
-		  .shininess = 1.0f,
-		  .alpha = 1.0f,
-		};
-
-		auto getTranslationMatrix = [](float32_t x, float32_t y, float32_t z)
-			{
-				core::matrix3x4SIMD transform;
-				transform.setTranslation(nbl::core::vectorSIMDf(x, y, z, 0));
-				return transform;
-			};
-
-		core::matrix3x4SIMD planeTransform;
-		planeTransform.setRotation(quaternion::fromAngleAxis(core::radians(-90.0f), vector3df_SIMD{ 1, 0, 0 }));
-
-		const auto cpuObjects = std::array{
-		  ReferenceObjectCpu {
-			.meta = {.type = OT_RECTANGLE, .name = "Plane Mesh"},
-			.data = gc->createRectangleMesh(nbl::core::vector2df_SIMD(10, 10)),
-			.material = defaultMaterial,
-			.transform = planeTransform,
-		  },
-		  ReferenceObjectCpu {
-			.meta = {.type = OT_CUBE, .name = "Cube Mesh"},
-			.data = gc->createCubeMesh(nbl::core::vector3df(1, 1, 1)),
-			.material = defaultMaterial,
-			.transform = getTranslationMatrix(0, 0.5f, 0),
-		  },
-		  ReferenceObjectCpu {
-			.meta = {.type = OT_CUBE, .name = "Cube Mesh 2"},
-			.data = gc->createCubeMesh(nbl::core::vector3df(1.5, 1.5, 1.5)),
-			.material = Material{
-			  .ambient = {0.1, 0.1, 0.2},
-			  .diffuse = {0.2, 0.2, 0.8},
-			  .specular = {0.8, 0.8, 0.8},
-			  .shininess = 1.0f,
-			},
-			.transform = getTranslationMatrix(-5.0f, 1.0f, 0),
-		  },
-		  ReferenceObjectCpu {
-			.meta = {.type = OT_CUBE, .name = "Transparent Cube Mesh"},
-			.data = gc->createCubeMesh(nbl::core::vector3df(1.5, 1.5, 1.5)),
-			.material = Material{
-			  .ambient = {0.1, 0.2, 0.1},
-			  .diffuse = {0.2, 0.8, 0.2},
-			  .specular = {0.8, 0.8, 0.8},
-			  .shininess = 1.0f,
-			  .alpha = 0.2,
-			},
-			.transform = getTranslationMatrix(5.0f, 1.0f, 0),
-		  },
-		};
-
-		struct ScratchVIBindings
-		{
-			nbl::asset::SBufferBinding<ICPUBuffer> vertex, index;
-		};
-		std::array<ScratchVIBindings, std::size(cpuObjects)> scratchBuffers;
-
-		for (uint32_t i = 0; i < cpuObjects.size(); i++)
-		{
-			const auto& cpuObject = cpuObjects[i];
-
-			auto vBuffer = smart_refctd_ptr(cpuObject.data.bindings[0].buffer); // no offset
-			auto vUsage = bitflag(IGPUBuffer::EUF_STORAGE_BUFFER_BIT) | IGPUBuffer::EUF_TRANSFER_DST_BIT | IGPUBuffer::EUF_INLINE_UPDATE_VIA_CMDBUF |
-				IGPUBuffer::EUF_ACCELERATION_STRUCTURE_BUILD_INPUT_READ_ONLY_BIT | IGPUBuffer::EUF_SHADER_DEVICE_ADDRESS_BIT;
-			vBuffer->addUsageFlags(vUsage);
-			vBuffer->setContentHash(vBuffer->computeContentHash());
-
-			auto iBuffer = smart_refctd_ptr(cpuObject.data.indexBuffer.buffer); // no offset
-			auto iUsage = bitflag(IGPUBuffer::EUF_STORAGE_BUFFER_BIT) | IGPUBuffer::EUF_TRANSFER_DST_BIT | IGPUBuffer::EUF_INLINE_UPDATE_VIA_CMDBUF |
-				IGPUBuffer::EUF_ACCELERATION_STRUCTURE_BUILD_INPUT_READ_ONLY_BIT | IGPUBuffer::EUF_SHADER_DEVICE_ADDRESS_BIT;
-
-			if (cpuObject.data.indexType != EIT_UNKNOWN)
-				if (iBuffer)
-				{
-					iBuffer->addUsageFlags(iUsage);
-					iBuffer->setContentHash(iBuffer->computeContentHash());
-				}
-
-			scratchBuffers[i] = {
-			  .vertex = {.offset = 0, .buffer = vBuffer},
-			  .index = {.offset = 0, .buffer = iBuffer},
-			};
-
-		}
-
-		auto cmdbuf = getSingleUseCommandBufferAndBegin(pool);
-		cmdbuf->beginDebugMarker("Build geometry vertex and index buffers");
-
-		CAssetConverter::SInputs inputs = {};
-		inputs.logger = m_logger.get();
-		std::array<ICPUBuffer*, std::size(cpuObjects) * 2u> tmpBuffers;
-		{
-			for (uint32_t i = 0; i < cpuObjects.size(); i++)
-			{
-				tmpBuffers[2 * i + 0] = scratchBuffers[i].vertex.buffer.get();
-				tmpBuffers[2 * i + 1] = scratchBuffers[i].index.buffer.get();
-			}
-
-			std::get<CAssetConverter::SInputs::asset_span_t<ICPUBuffer>>(inputs.assets) = tmpBuffers;
-		}
-
-		auto reservation = m_converter->reserve(inputs);
-		{
-			auto prepass = [&]<typename asset_type_t>(const auto & references) -> bool
-			{
-				auto objects = reservation.getGPUObjects<asset_type_t>();
-				uint32_t counter = {};
-				for (auto& object : objects)
-				{
-					auto gpu = object.value;
-					auto* reference = references[counter];
-
-					if (reference)
-					{
-						if (!gpu)
-						{
-							m_logger->log("Failed to convert a CPU object to GPU!", ILogger::ELL_ERROR);
-							return false;
-						}
-					}
-					counter++;
-				}
-				return true;
-			};
-
-			prepass.template operator() < ICPUBuffer > (tmpBuffers);
-		}
-
-		auto geomInfoBuffer = ICPUBuffer::create({ std::size(cpuObjects) * sizeof(STriangleGeomInfo) });
-		STriangleGeomInfo* geomInfos = reinterpret_cast<STriangleGeomInfo*>(geomInfoBuffer->getPointer());
-
-		m_gpuTriangleGeometries.reserve(std::size(cpuObjects));
-		// convert
-		{
-			// not sure if need this (probably not, originally for transition img view)
-			auto semaphore = m_device->createSemaphore(0u);
-
-			std::array<IQueue::SSubmitInfo::SCommandBufferInfo, 1> cmdbufs = {};
-			cmdbufs.front().cmdbuf = cmdbuf.get();
-
-			SIntendedSubmitInfo transfer = {};
-			transfer.queue = queue;
-			transfer.scratchCommandBuffers = cmdbufs;
-			transfer.scratchSemaphore = {
-			  .semaphore = semaphore.get(),
-			  .value = 0u,
-			  .stageMask = PIPELINE_STAGE_FLAGS::ALL_TRANSFER_BITS
-			};
-
-			CAssetConverter::SConvertParams params = {};
-			params.utilities = m_utils.get();
-			params.transfer = &transfer;
-
-			auto future = reservation.convert(params);
-			if (future.copy() != IQueue::RESULT::SUCCESS)
-			{
-				m_logger->log("Failed to await submission feature!", ILogger::ELL_ERROR);
-				return false;
-			}
-
-			auto&& buffers = reservation.getGPUObjects<ICPUBuffer>();
-			for (uint32_t i = 0; i < cpuObjects.size(); i++)
-			{
-				auto& cpuObject = cpuObjects[i];
-
-				m_gpuTriangleGeometries.push_back(ReferenceObjectGpu{
-				  .meta = cpuObject.meta,
-				  .bindings = {
-					.vertex = {.offset = 0, .buffer = buffers[2 * i + 0].value },
-					.index = {.offset = 0, .buffer = buffers[2 * i + 1].value },
-				  },
-				  .vertexStride = cpuObject.data.inputParams.bindings[0].stride,
-				  .indexType = cpuObject.data.indexType,
-				  .indexCount = cpuObject.data.indexCount,
-				  .material = hlsl::_static_cast<MaterialPacked>(cpuObject.material),
-				  .transform = cpuObject.transform,
-					});
-			}
-
-			for (uint32_t i = 0; i < m_gpuTriangleGeometries.size(); i++)
-			{
-				const auto& gpuObject = m_gpuTriangleGeometries[i];
-				const uint64_t vertexBufferAddress = gpuObject.bindings.vertex.buffer->getDeviceAddress();
-				geomInfos[i] = {
-				  .material = gpuObject.material,
-				  .vertexBufferAddress = vertexBufferAddress,
-				  .indexBufferAddress = gpuObject.useIndex() ? gpuObject.bindings.index.buffer->getDeviceAddress() : vertexBufferAddress,
-				  .vertexStride = gpuObject.vertexStride,
-				  .objType = gpuObject.meta.type,
-				  .indexType = gpuObject.indexType,
-				  .smoothNormals = s_smoothNormals[gpuObject.meta.type],
-				};
-			}
-		}
-
-		{
-			IGPUBuffer::SCreationParams params;
-			params.usage = IGPUBuffer::EUF_STORAGE_BUFFER_BIT | IGPUBuffer::EUF_TRANSFER_DST_BIT | IGPUBuffer::EUF_INLINE_UPDATE_VIA_CMDBUF | IGPUBuffer::EUF_SHADER_DEVICE_ADDRESS_BIT;
-			params.size = geomInfoBuffer->getSize();
-			m_utils->createFilledDeviceLocalBufferOnDedMem(SIntendedSubmitInfo{ .queue = queue }, std::move(params), geomInfos).move_into(m_triangleGeomInfoBuffer);
-		}
-
-		// intersection geometries setup
-		{
-			core::vector<SProceduralGeomInfo> proceduralGeoms;
-			proceduralGeoms.reserve(NumberOfProceduralGeometries);
-			using Aabb = IGPUBottomLevelAccelerationStructure::AABB_t;
-			core::vector<Aabb> aabbs;
-			aabbs.reserve(NumberOfProceduralGeometries);
-			for (int32_t i = 0; i < NumberOfProceduralGeometries; i++)
-			{
-				const auto middle_i = NumberOfProceduralGeometries / 2.0;
-				SProceduralGeomInfo sphere = {
-				  .material = hlsl::_static_cast<MaterialPacked>(Material{
-					.ambient = {0.1, 0.05 * i, 0.1},
-					.diffuse = {0.3, 0.2 * i, 0.3},
-					.specular = {0.8, 0.8, 0.8},
-					.shininess = 1.0f,
-				  }),
-				  .center = float32_t3((i - middle_i) * 4.0, 2, 5.0),
-				  .radius = 1,
-				};
-
-				proceduralGeoms.push_back(sphere);
-				const auto sphereMin = sphere.center - sphere.radius;
-				const auto sphereMax = sphere.center + sphere.radius;
-				aabbs.emplace_back(
-					vector3d(sphereMin.x, sphereMin.y, sphereMin.z),
-					vector3d(sphereMax.x, sphereMax.y, sphereMax.z));
-			}
-
-			{
-				IGPUBuffer::SCreationParams params;
-				params.usage = IGPUBuffer::EUF_STORAGE_BUFFER_BIT | IGPUBuffer::EUF_TRANSFER_DST_BIT | IGPUBuffer::EUF_INLINE_UPDATE_VIA_CMDBUF | IGPUBuffer::EUF_SHADER_DEVICE_ADDRESS_BIT;
-				params.size = proceduralGeoms.size() * sizeof(SProceduralGeomInfo);
-				m_utils->createFilledDeviceLocalBufferOnDedMem(SIntendedSubmitInfo{ .queue = queue }, std::move(params), proceduralGeoms.data()).move_into(m_proceduralGeomInfoBuffer);
-			}
-
-			{
-				IGPUBuffer::SCreationParams params;
-				params.usage = IGPUBuffer::EUF_TRANSFER_DST_BIT | IGPUBuffer::EUF_SHADER_DEVICE_ADDRESS_BIT | IGPUBuffer::EUF_ACCELERATION_STRUCTURE_BUILD_INPUT_READ_ONLY_BIT;
-				params.size = aabbs.size() * sizeof(Aabb);
-				m_utils->createFilledDeviceLocalBufferOnDedMem(SIntendedSubmitInfo{ .queue = queue }, std::move(params), aabbs.data()).move_into(m_proceduralAabbBuffer);
-			}
-		}
-
-		return true;
-	}
-
-	bool createAccelerationStructures(video::CThreadSafeQueueAdapter* queue)
-	{
-		// plus 1 blas for procedural geometry contains {{var::NumberOfProcedural}}
-		// spheres. Each sphere is a primitive instead one instance or geometry
-		const auto blasCount = m_gpuTriangleGeometries.size() + 1;
-		const auto proceduralBlasIdx = m_gpuTriangleGeometries.size();
-
-		IQueryPool::SCreationParams qParams{ .queryCount = static_cast<uint32_t>(blasCount), .queryType = IQueryPool::ACCELERATION_STRUCTURE_COMPACTED_SIZE };
-		smart_refctd_ptr<IQueryPool> queryPool = m_device->createQueryPool(std::move(qParams));
-
-		auto pool = m_device->createCommandPool(queue->getFamilyIndex(), IGPUCommandPool::CREATE_FLAGS::RESET_COMMAND_BUFFER_BIT | IGPUCommandPool::CREATE_FLAGS::TRANSIENT_BIT);
-		if (!pool)
-			return logFail("Couldn't create Command Pool for blas/tlas creation!");
-
-		m_api->startCapture();
-#ifdef TRY_BUILD_FOR_NGFX // NSight is "debugger-challenged" it can't capture anything not happenning "during a frame", so we need to trick it
-		m_currentImageAcquire = m_surface->acquireNextImage();
-		{
-			const IQueue::SSubmitInfo::SSemaphoreInfo acquired[] = { {
-			  .semaphore = m_currentImageAcquire.semaphore,
-			  .value = m_currentImageAcquire.acquireCount,
-			  .stageMask = PIPELINE_STAGE_FLAGS::ALL_COMMANDS_BITS
-			} };
-			m_surface->present(m_currentImageAcquire.imageIndex, acquired);
-		}
-		m_currentImageAcquire = m_surface->acquireNextImage();
-#endif
-		size_t totalScratchSize = 0;
-		const auto scratchOffsetAlignment = m_device->getPhysicalDevice()->getLimits().minAccelerationStructureScratchOffsetAlignment;
-
-		// build bottom level ASes
-		{
-			core::vector<uint32_t> primitiveCounts(blasCount);
-			core::vector<IGPUBottomLevelAccelerationStructure::Triangles<IGPUBuffer>> triangles(m_gpuTriangleGeometries.size());
-			core::vector<uint32_t> scratchSizes(blasCount);
-			IGPUBottomLevelAccelerationStructure::AABBs<IGPUBuffer> aabbs;
-
-			auto blasFlags = bitflag(IGPUBottomLevelAccelerationStructure::BUILD_FLAGS::PREFER_FAST_TRACE_BIT) | IGPUBottomLevelAccelerationStructure::BUILD_FLAGS::ALLOW_COMPACTION_BIT;
-			if (m_physicalDevice->getProperties().limits.rayTracingPositionFetch)
-				blasFlags |= IGPUBottomLevelAccelerationStructure::BUILD_FLAGS::ALLOW_DATA_ACCESS;
-
-			IGPUBottomLevelAccelerationStructure::DeviceBuildInfo initBuildInfo;
-			initBuildInfo.buildFlags = blasFlags;
-			initBuildInfo.geometryCount = 1;	// only 1 geometry object per blas
-			initBuildInfo.srcAS = nullptr;
-			initBuildInfo.dstAS = nullptr;
-			initBuildInfo.scratch = {};
-
-			auto blasBuildInfos = core::vector(blasCount, initBuildInfo);
-
-			m_gpuBlasList.resize(blasCount);
-			// setup blas info for triangle geometries
-			for (uint32_t i = 0; i < blasCount; i++)
-			{
-				const auto isProcedural = i == proceduralBlasIdx;
-				if (isProcedural)
-				{
-					aabbs.data.buffer = smart_refctd_ptr(m_proceduralAabbBuffer);
-					aabbs.data.offset = 0;
-					aabbs.stride = sizeof(IGPUBottomLevelAccelerationStructure::AABB_t);
-					aabbs.geometryFlags = IGPUBottomLevelAccelerationStructure::GEOMETRY_FLAGS::OPAQUE_BIT; // only allow opaque for now
-
-					primitiveCounts[proceduralBlasIdx] = NumberOfProceduralGeometries;
-					blasBuildInfos[proceduralBlasIdx].aabbs = &aabbs;
-					blasBuildInfos[proceduralBlasIdx].buildFlags |= IGPUBottomLevelAccelerationStructure::BUILD_FLAGS::GEOMETRY_TYPE_IS_AABB_BIT;
-				}
-				else
-				{
-					const auto& gpuObject = m_gpuTriangleGeometries[i];
-
-					const uint32_t vertexStride = gpuObject.vertexStride;
-					const uint32_t numVertices = gpuObject.bindings.vertex.buffer->getSize() / vertexStride;
-					if (gpuObject.useIndex())
-						primitiveCounts[i] = gpuObject.indexCount / 3;
-					else
-						primitiveCounts[i] = numVertices / 3;
-
-					triangles[i].vertexData[0] = gpuObject.bindings.vertex;
-					triangles[i].indexData = gpuObject.useIndex() ? gpuObject.bindings.index : gpuObject.bindings.vertex;
-					triangles[i].maxVertex = numVertices - 1;
-					triangles[i].vertexStride = vertexStride;
-					triangles[i].vertexFormat = EF_R32G32B32_SFLOAT;
-					triangles[i].indexType = gpuObject.indexType;
-					triangles[i].geometryFlags = gpuObject.material.isTransparent() ?
-						IGPUBottomLevelAccelerationStructure::GEOMETRY_FLAGS::NO_DUPLICATE_ANY_HIT_INVOCATION_BIT :
-						IGPUBottomLevelAccelerationStructure::GEOMETRY_FLAGS::OPAQUE_BIT;
-
-					blasBuildInfos[i].triangles = &triangles[i];
-				}
-				ILogicalDevice::AccelerationStructureBuildSizes buildSizes;
-				{
-					const uint32_t maxPrimCount[1] = { primitiveCounts[i] };
-					if (isProcedural)
-					{
-						const auto* aabbData = &aabbs;
-						buildSizes = m_device->getAccelerationStructureBuildSizes(false, blasBuildInfos[i].buildFlags, false, std::span{ aabbData, 1 }, maxPrimCount);
-					}
-					else
-					{
-						const auto* trianglesData = triangles.data();
-						buildSizes = m_device->getAccelerationStructureBuildSizes(false, blasBuildInfos[i].buildFlags, false, std::span{ trianglesData,1 }, maxPrimCount);
-					}
-					if (!buildSizes)
-						return logFail("Failed to get BLAS build sizes");
-				}
-
-				scratchSizes[i] = buildSizes.buildScratchSize;
-				totalScratchSize = core::alignUp(totalScratchSize, scratchOffsetAlignment);
-				totalScratchSize += buildSizes.buildScratchSize;
-
-				{
-					IGPUBuffer::SCreationParams params;
-					params.usage = bitflag(IGPUBuffer::EUF_SHADER_DEVICE_ADDRESS_BIT) | IGPUBuffer::EUF_ACCELERATION_STRUCTURE_STORAGE_BIT;
-					params.size = buildSizes.accelerationStructureSize;
-					smart_refctd_ptr<IGPUBuffer> asBuffer = createBuffer(params);
-
-					IGPUBottomLevelAccelerationStructure::SCreationParams blasParams;
-					blasParams.bufferRange.buffer = asBuffer;
-					blasParams.bufferRange.offset = 0u;
-					blasParams.bufferRange.size = buildSizes.accelerationStructureSize;
-					blasParams.flags = IGPUBottomLevelAccelerationStructure::SCreationParams::FLAGS::NONE;
-					m_gpuBlasList[i] = m_device->createBottomLevelAccelerationStructure(std::move(blasParams));
-					if (!m_gpuBlasList[i])
-						return logFail("Could not create BLAS");
-				}
-			}
-
-
-			auto cmdbufBlas = getSingleUseCommandBufferAndBegin(pool);
-			cmdbufBlas->beginDebugMarker("Build BLAS");
-
-			cmdbufBlas->resetQueryPool(queryPool.get(), 0, blasCount);
-
-			smart_refctd_ptr<IGPUBuffer> scratchBuffer;
-			{
-				IGPUBuffer::SCreationParams params;
-				params.usage = bitflag(IGPUBuffer::EUF_SHADER_DEVICE_ADDRESS_BIT) | IGPUBuffer::EUF_STORAGE_BUFFER_BIT;
-				params.size = totalScratchSize;
-				scratchBuffer = createBuffer(params);
-			}
-
-			core::vector<IGPUBottomLevelAccelerationStructure::BuildRangeInfo> buildRangeInfos(blasCount);
-			core::vector<IGPUBottomLevelAccelerationStructure::BuildRangeInfo*> pRangeInfos(blasCount);
-			for (uint32_t i = 0; i < blasCount; i++)
-			{
-				blasBuildInfos[i].dstAS = m_gpuBlasList[i].get();
-				blasBuildInfos[i].scratch.buffer = scratchBuffer;
-				if (i == 0)
-				{
-					blasBuildInfos[i].scratch.offset = 0u;
-				}
-				else
-				{
-					const auto unalignedOffset = blasBuildInfos[i - 1].scratch.offset + scratchSizes[i - 1];
-					blasBuildInfos[i].scratch.offset = core::alignUp(unalignedOffset, scratchOffsetAlignment);
-				}
-
-				buildRangeInfos[i].primitiveCount = primitiveCounts[i];
-				buildRangeInfos[i].primitiveByteOffset = 0u;
-				buildRangeInfos[i].firstVertex = 0u;
-				buildRangeInfos[i].transformByteOffset = 0u;
-
-				pRangeInfos[i] = &buildRangeInfos[i];
-			}
-
-			if (!cmdbufBlas->buildAccelerationStructures(std::span(blasBuildInfos), pRangeInfos.data()))
-				return logFail("Failed to build BLAS");
-
-			{
-				SMemoryBarrier memBarrier;
-				memBarrier.srcStageMask = PIPELINE_STAGE_FLAGS::ACCELERATION_STRUCTURE_BUILD_BIT;
-				memBarrier.srcAccessMask = ACCESS_FLAGS::ACCELERATION_STRUCTURE_WRITE_BIT;
-				memBarrier.dstStageMask = PIPELINE_STAGE_FLAGS::ACCELERATION_STRUCTURE_BUILD_BIT;
-				memBarrier.dstAccessMask = ACCESS_FLAGS::ACCELERATION_STRUCTURE_READ_BIT;
-				cmdbufBlas->pipelineBarrier(E_DEPENDENCY_FLAGS::EDF_NONE, { .memBarriers = {&memBarrier, 1} });
-			}
-
-
-			core::vector<const IGPUAccelerationStructure*> ases(blasCount);
-			for (uint32_t i = 0; i < blasCount; i++)
-				ases[i] = m_gpuBlasList[i].get();
-			if (!cmdbufBlas->writeAccelerationStructureProperties(std::span(ases), IQueryPool::ACCELERATION_STRUCTURE_COMPACTED_SIZE,
-				queryPool.get(), 0))
-				return logFail("Failed to write acceleration structure properties!");
-
-			cmdbufBlas->endDebugMarker();
-			cmdbufSubmitAndWait(cmdbufBlas, queue, 39);
-		}
-
-		auto cmdbufCompact = getSingleUseCommandBufferAndBegin(pool);
-		cmdbufCompact->beginDebugMarker("Compact BLAS");
 
-		// compact blas
-		{
-			core::vector<size_t> asSizes(blasCount);
-			if (!m_device->getQueryPoolResults(queryPool.get(), 0, blasCount, asSizes.data(), sizeof(size_t), bitflag(IQueryPool::WAIT_BIT) | IQueryPool::_64_BIT))
-				return logFail("Could not get query pool results for AS sizes");
-
-			core::vector<smart_refctd_ptr<IGPUBottomLevelAccelerationStructure>> cleanupBlas(blasCount);
-			for (uint32_t i = 0; i < blasCount; i++)
-			{
-				if (asSizes[i] == 0) continue;
-				cleanupBlas[i] = m_gpuBlasList[i];
-				{
-					IGPUBuffer::SCreationParams params;
-					params.usage = bitflag(IGPUBuffer::EUF_SHADER_DEVICE_ADDRESS_BIT) | IGPUBuffer::EUF_ACCELERATION_STRUCTURE_STORAGE_BIT;
-					params.size = asSizes[i];
-					smart_refctd_ptr<IGPUBuffer> asBuffer = createBuffer(params);
-
-					IGPUBottomLevelAccelerationStructure::SCreationParams blasParams;
-					blasParams.bufferRange.buffer = asBuffer;
-					blasParams.bufferRange.offset = 0u;
-					blasParams.bufferRange.size = asSizes[i];
-					blasParams.flags = IGPUBottomLevelAccelerationStructure::SCreationParams::FLAGS::NONE;
-					m_gpuBlasList[i] = m_device->createBottomLevelAccelerationStructure(std::move(blasParams));
-					if (!m_gpuBlasList[i])
-						return logFail("Could not create compacted BLAS");
-				}
-
-				IGPUBottomLevelAccelerationStructure::CopyInfo copyInfo;
-				copyInfo.src = cleanupBlas[i].get();
-				copyInfo.dst = m_gpuBlasList[i].get();
-				copyInfo.compact = true;
-				if (!cmdbufCompact->copyAccelerationStructure<IGPUBottomLevelAccelerationStructure>(copyInfo))
-					return logFail("Failed to copy AS to compact");
-			}
-		}
-
-		cmdbufCompact->endDebugMarker();
-		cmdbufSubmitAndWait(cmdbufCompact, queue, 40);
-
-		auto cmdbufTlas = getSingleUseCommandBufferAndBegin(pool);
-		cmdbufTlas->beginDebugMarker("Build TLAS");
-
-		// build top level AS
-		{
-			const uint32_t instancesCount = blasCount;
-			core::vector<IGPUTopLevelAccelerationStructure::DeviceStaticInstance> instances(instancesCount);
-			for (uint32_t i = 0; i < instancesCount; i++)
-			{
-				const auto isProceduralInstance = i == proceduralBlasIdx;
-				instances[i].base.blas.deviceAddress = m_gpuBlasList[i]->getReferenceForDeviceOperations().deviceAddress;
-				instances[i].base.mask = 0xFF;
-				instances[i].base.instanceCustomIndex = i;
-				instances[i].base.instanceShaderBindingTableRecordOffset = isProceduralInstance ? 2 : 0;
-				instances[i].base.flags = static_cast<uint32_t>(IGPUTopLevelAccelerationStructure::INSTANCE_FLAGS::TRIANGLE_FACING_CULL_DISABLE_BIT);
-				instances[i].transform = isProceduralInstance ? matrix3x4SIMD() : m_gpuTriangleGeometries[i].transform;
-			}
-
-			{
-				size_t bufSize = instancesCount * sizeof(IGPUTopLevelAccelerationStructure::DeviceStaticInstance);
-				IGPUBuffer::SCreationParams params;
-				params.usage = bitflag(IGPUBuffer::EUF_ACCELERATION_STRUCTURE_BUILD_INPUT_READ_ONLY_BIT) | IGPUBuffer::EUF_STORAGE_BUFFER_BIT |
-					IGPUBuffer::EUF_INLINE_UPDATE_VIA_CMDBUF | IGPUBuffer::EUF_TRANSFER_DST_BIT | IGPUBuffer::EUF_SHADER_DEVICE_ADDRESS_BIT;
-				params.size = bufSize;
-				m_instanceBuffer = createBuffer(params);
-
-				SBufferRange<IGPUBuffer> range = { .offset = 0u, .size = bufSize, .buffer = m_instanceBuffer };
-				cmdbufTlas->updateBuffer(range, instances.data());
-			}
-
-			// make sure instances upload complete first
-			{
-				SMemoryBarrier memBarrier;
-				memBarrier.srcStageMask = PIPELINE_STAGE_FLAGS::ALL_TRANSFER_BITS;
-				memBarrier.srcAccessMask = ACCESS_FLAGS::TRANSFER_WRITE_BIT;
-				memBarrier.dstStageMask = PIPELINE_STAGE_FLAGS::ACCELERATION_STRUCTURE_BUILD_BIT;
-				memBarrier.dstAccessMask = ACCESS_FLAGS::ACCELERATION_STRUCTURE_WRITE_BIT;
-				cmdbufTlas->pipelineBarrier(E_DEPENDENCY_FLAGS::EDF_NONE, { .memBarriers = {&memBarrier, 1} });
-			}
-
-			auto tlasFlags = bitflag(IGPUTopLevelAccelerationStructure::BUILD_FLAGS::PREFER_FAST_TRACE_BIT);
-
-			IGPUTopLevelAccelerationStructure::DeviceBuildInfo tlasBuildInfo;
-			tlasBuildInfo.buildFlags = tlasFlags;
-			tlasBuildInfo.srcAS = nullptr;
-			tlasBuildInfo.dstAS = nullptr;
-			tlasBuildInfo.instanceData.buffer = m_instanceBuffer;
-			tlasBuildInfo.instanceData.offset = 0u;
-			tlasBuildInfo.scratch = {};
-
-			auto buildSizes = m_device->getAccelerationStructureBuildSizes(tlasFlags, false, instancesCount);
-			if (!buildSizes)
-				return logFail("Failed to get TLAS build sizes");
-
-			{
-				IGPUBuffer::SCreationParams params;
-				params.usage = bitflag(IGPUBuffer::EUF_SHADER_DEVICE_ADDRESS_BIT) | IGPUBuffer::EUF_ACCELERATION_STRUCTURE_STORAGE_BIT;
-				params.size = buildSizes.accelerationStructureSize;
-				smart_refctd_ptr<IGPUBuffer> asBuffer = createBuffer(params);
-
-				IGPUTopLevelAccelerationStructure::SCreationParams tlasParams;
-				tlasParams.bufferRange.buffer = asBuffer;
-				tlasParams.bufferRange.offset = 0u;
-				tlasParams.bufferRange.size = buildSizes.accelerationStructureSize;
-				tlasParams.flags = IGPUTopLevelAccelerationStructure::SCreationParams::FLAGS::NONE;
-				m_gpuTlas = m_device->createTopLevelAccelerationStructure(std::move(tlasParams));
-				if (!m_gpuTlas)
-					return logFail("Could not create TLAS");
-			}
-
-			smart_refctd_ptr<IGPUBuffer> scratchBuffer;
-			{
-				IGPUBuffer::SCreationParams params;
-				params.usage = bitflag(IGPUBuffer::EUF_SHADER_DEVICE_ADDRESS_BIT) | IGPUBuffer::EUF_STORAGE_BUFFER_BIT;
-				params.size = buildSizes.buildScratchSize;
-				scratchBuffer = createBuffer(params);
-			}
-
-			tlasBuildInfo.dstAS = m_gpuTlas.get();
-			tlasBuildInfo.scratch.buffer = scratchBuffer;
-			tlasBuildInfo.scratch.offset = 0u;
-
-			IGPUTopLevelAccelerationStructure::BuildRangeInfo buildRangeInfo[1u];
-			buildRangeInfo[0].instanceCount = instancesCount;
-			buildRangeInfo[0].instanceByteOffset = 0u;
-			IGPUTopLevelAccelerationStructure::BuildRangeInfo* pRangeInfos;
-			pRangeInfos = &buildRangeInfo[0];
-
-			if (!cmdbufTlas->buildAccelerationStructures({ &tlasBuildInfo, 1 }, pRangeInfos))
-				return logFail("Failed to build TLAS");
-		}
-
-		cmdbufTlas->endDebugMarker();
-		cmdbufSubmitAndWait(cmdbufTlas, queue, 45);
-
-#ifdef TRY_BUILD_FOR_NGFX
-		{
-			const IQueue::SSubmitInfo::SSemaphoreInfo acquired[] = { {
-			  .semaphore = m_currentImageAcquire.semaphore,
-			  .value = m_currentImageAcquire.acquireCount,
-			  .stageMask = PIPELINE_STAGE_FLAGS::ALL_COMMANDS_BITS
-			} };
-			m_surface->present(m_currentImageAcquire.imageIndex, acquired);
-		}
-#endif
-		m_api->endCapture();
-
-		return true;
-	}
-#endif // TEST_ASSET_CONV_AS
 
 
 	smart_refctd_ptr<IWindow> m_window;
@@ -2317,7 +1600,6 @@ class RaytracingPipelineApp final : public examples::SimpleWindowedApplication,
 	core::vector<SProceduralGeomInfo> m_gpuIntersectionSpheres;
 	uint32_t m_intersectionHitGroupIdx;
 
-	std::vector<smart_refctd_ptr<IGPUBottomLevelAccelerationStructure>> m_gpuBlasList;
 	smart_refctd_ptr<IGPUTopLevelAccelerationStructure> m_gpuTlas;
 	smart_refctd_ptr<IGPUBuffer> m_instanceBuffer;
 

From e30938c2615dd5d3ab69cadca3ba11d1e03f8233 Mon Sep 17 00:00:00 2001
From: devsh <devsh@devsh.eu>
Date: Mon, 26 May 2025 10:05:49 +0200
Subject: [PATCH 306/529] test that we're not overflown submitted when
 providing correct max size scratch buffer

---
 71_RayTracingPipeline/main.cpp | 5 ++++-
 1 file changed, 4 insertions(+), 1 deletion(-)

diff --git a/71_RayTracingPipeline/main.cpp b/71_RayTracingPipeline/main.cpp
index 0b6b4d724..968f7c42e 100644
--- a/71_RayTracingPipeline/main.cpp
+++ b/71_RayTracingPipeline/main.cpp
@@ -1452,7 +1452,7 @@ class RaytracingPipelineApp final : public examples::SimpleWindowedApplication,
 			{
 				constexpr auto MaxAlignment = 256;
 				constexpr auto MinAllocationSize = 1024;
-				const auto scratchSize = core::alignUp(reservation.getMinASBuildScratchSize(false), MaxAlignment);
+				const auto scratchSize = core::alignUp(reservation.getMaxASBuildScratchSize(false), MaxAlignment);
 
 
 				IGPUBuffer::SCreationParams creationParams = {};
@@ -1496,6 +1496,9 @@ class RaytracingPipelineApp final : public examples::SimpleWindowedApplication,
 				m_logger->log("Failed to await submission feature!", ILogger::ELL_ERROR);
 				return false;
 			}
+			// 2 submits, BLAS build, TLAS build, DO NOT ADD COMPACTIONS IN THIS EXAMPLE!
+			if (compute.getFutureScratchSemaphore().value>3)
+				m_logger->log("Overflow submitted on Compute Queue despite using ReBAR (no transfer submits or usage of staging buffer) and providing a AS Build Scratch Buffer of correctly queried max size!",system::ILogger::ELL_ERROR);
 
 			// assign gpu objects to output
 			auto&& tlases = reservation.getGPUObjects<ICPUTopLevelAccelerationStructure>();

From 280d119e6435928496ac69da91782e60ddea5dca Mon Sep 17 00:00:00 2001
From: Erfan Ahmadi <ahmadierfan99@gmail.com>
Date: Mon, 26 May 2025 13:52:16 +0400
Subject: [PATCH 307/529] Fixes to images cache regarding georeferenced image
 resize

---
 62_CAD/DrawResourcesFiller.cpp | 37 +++++++++++++++++++++++++---------
 1 file changed, 27 insertions(+), 10 deletions(-)

diff --git a/62_CAD/DrawResourcesFiller.cpp b/62_CAD/DrawResourcesFiller.cpp
index 9fa24e1a0..9b0bdfaac 100644
--- a/62_CAD/DrawResourcesFiller.cpp
+++ b/62_CAD/DrawResourcesFiller.cpp
@@ -433,6 +433,9 @@ bool DrawResourcesFiller::ensureStaticImageAvailability(const StaticImageInfo& s
 					suballocatedDescriptorSet->multi_deallocate(imagesArrayBinding, 1u, &cachedImageRecord->arrayIndex, {});
 					cachedImageRecord->arrayIndex = InvalidTextureIndex;
 				}
+
+				// erase the entry we failed to fill, no need for `evictImage_SubmitIfNeeded`, because it didn't get to be used in any submit to defer it's memory and index deallocation
+				imagesCache->erase(imageID);
 			}
 		}
 		else
@@ -579,6 +582,9 @@ bool DrawResourcesFiller::ensureGeoreferencedImageAvailability_AllocateIfNeeded(
 					suballocatedDescriptorSet->multi_deallocate(imagesArrayBinding, 1u, &cachedImageRecord->arrayIndex, {});
 					cachedImageRecord->arrayIndex = InvalidTextureIndex;
 				}
+				
+				// erase the entry we failed to fill, no need for `evictImage_SubmitIfNeeded`, because it didn't get to be used in any submit to defer it's memory and index deallocation
+				imagesCache->erase(imageID);
 			}
 		}
 		else
@@ -2056,13 +2062,15 @@ void DrawResourcesFiller::evictImage_SubmitIfNeeded(image_id imageID, const Cach
 	}
 }
 
-DrawResourcesFiller::ImageAllocateResults  DrawResourcesFiller::tryCreateAndAllocateImage_SubmitIfNeeded(const nbl::asset::IImage::SCreationParams& imageParams, nbl::video::SIntendedSubmitInfo& intendedNextSubmit, std::string imageDebugName)
+DrawResourcesFiller::ImageAllocateResults DrawResourcesFiller::tryCreateAndAllocateImage_SubmitIfNeeded(const nbl::asset::IImage::SCreationParams& imageParams, nbl::video::SIntendedSubmitInfo& intendedNextSubmit, std::string imageDebugName)
 {
 	ImageAllocateResults ret = {};
 
 	auto* device = m_utilities->getLogicalDevice();
 	auto* physDev = m_utilities->getLogicalDevice()->getPhysicalDevice();
 
+	bool alreadyBlockedForDeferredFrees = false;
+
 	// Attempt to create a GPU image and corresponding image view for this texture.
 	// If creation or memory allocation fails (likely due to VRAM exhaustion),
 	// we'll evict another texture from the LRU cache and retry until successful, or until only the currently-cachedImageRecord image remains.
@@ -2150,22 +2158,31 @@ DrawResourcesFiller::ImageAllocateResults  DrawResourcesFiller::tryCreateAndAllo
 		}
 
 		// Getting here means we failed creating or allocating the image, evict and retry.
-		if (imagesCache->size() == 1u)
+
+
+		// If imageCache size is 1 it means there is nothing else to evict, but there may still be already evicts/frees queued up.
+		// `cull_frees` will make sure all pending deallocations will be blocked for.
+		if (imagesCache->size() == 1u && alreadyBlockedForDeferredFrees)
 		{
-			// Nothing else to evict; give up.
-			// We probably have evicted almost every other texture except the one we just allocated an index for
+			// We give up, it's really nothing we can do, no image to evict (alreadyBlockedForDeferredFrees==1) and no more memory to free up (alreadyBlockedForDeferredFrees).
+			// We probably have evicted almost every other texture except the one we just allocated an index for. 
+			// This is most likely due to current image memory requirement being greater than the whole memory allocated for all images
 			_NBL_DEBUG_BREAK_IF(true);
+			// TODO[LOG]
 			break;
 		}
 
-		assert(imagesCache->size() > 1u);
+		if (imagesCache->size() > 1u)
+		{
+			const image_id evictionCandidate = imagesCache->select_eviction_candidate();
+			CachedImageRecord* imageRef = imagesCache->peek(evictionCandidate);
+			if (imageRef)
+				evictImage_SubmitIfNeeded(evictionCandidate, *imageRef, intendedNextSubmit);
+			imagesCache->erase(evictionCandidate);
+		}
 
-		const image_id evictionCandidate = imagesCache->select_eviction_candidate();
-		CachedImageRecord* imageRef = imagesCache->peek(evictionCandidate);
-		if (imageRef)
-			evictImage_SubmitIfNeeded(evictionCandidate, *imageRef, intendedNextSubmit);
-		imagesCache->erase(evictionCandidate);
 		while (suballocatedDescriptorSet->cull_frees()) {}; // to make sure deallocation requests in eviction callback are blocked for.
+		alreadyBlockedForDeferredFrees = true;
 
 		// we don't hold any references to the GPUImageView or GPUImage so descriptor binding will be the last reference
 		// hopefully by here the suballocated descriptor set freed some VRAM by dropping the image last ref and it's dedicated allocation.

From 0b693638c1f68f13229ad103d10bc4791de2c13b Mon Sep 17 00:00:00 2001
From: Erfan Ahmadi <ahmadierfan99@gmail.com>
Date: Mon, 26 May 2025 15:49:40 +0400
Subject: [PATCH 308/529] edits after LRUCache Improvement

---
 62_CAD/DrawResourcesFiller.cpp | 23 ++++++++---------------
 62_CAD/Images.h                |  9 +--------
 2 files changed, 9 insertions(+), 23 deletions(-)

diff --git a/62_CAD/DrawResourcesFiller.cpp b/62_CAD/DrawResourcesFiller.cpp
index 9b0bdfaac..1d6e95e66 100644
--- a/62_CAD/DrawResourcesFiller.cpp
+++ b/62_CAD/DrawResourcesFiller.cpp
@@ -707,9 +707,6 @@ bool DrawResourcesFiller::pushAllUploads(SIntendedSubmitInfo& intendedNextSubmit
 		bool replayCacheFullyCovered = true;
 		for (auto& [imageID, toReplayRecord] : *currentReplayCache->imagesCache)
 		{
-			// TODO: remove temoprary const_cast workaround.
-			CachedImageRecord& toReplayImageRecord_nonConst = const_cast<CachedImageRecord&>(toReplayRecord);
-
 			if (toReplayRecord.type != ImageType::STATIC) // non-static images (Georeferenced) won't be replayed like this
 				continue;
 
@@ -731,7 +728,7 @@ bool DrawResourcesFiller::pushAllUploads(SIntendedSubmitInfo& intendedNextSubmit
 			// if already resident, just update the state to the cached state (to make sure it doesn't get issued for upload again) and move on.
 			if (alreadyResident)
 			{
-				toReplayImageRecord_nonConst.state = cachedRecord->state; // update the toReplayImageRecords's state, to completely match the currently resident state
+				toReplayRecord.state = cachedRecord->state; // update the toReplayImageRecords's state, to completely match the currently resident state
 				continue;
 			}
 
@@ -764,8 +761,8 @@ bool DrawResourcesFiller::pushAllUploads(SIntendedSubmitInfo& intendedNextSubmit
 					if (newGPUImageView)
 					{
 						successCreateNewImage = true;
-						toReplayImageRecord_nonConst.gpuImageView = newGPUImageView;
-						toReplayImageRecord_nonConst.state = ImageState::CREATED_AND_MEMORY_BOUND;
+						toReplayRecord.gpuImageView = newGPUImageView;
+						toReplayRecord.state = ImageState::CREATED_AND_MEMORY_BOUND;
 						newGPUImageView->setObjectDebugName((std::to_string(imageID) + " Static Image View 2D").c_str());
 					}
 
@@ -781,8 +778,9 @@ bool DrawResourcesFiller::pushAllUploads(SIntendedSubmitInfo& intendedNextSubmit
 		}
 		
 		// Our actual `imageCache` (which represents GPU state) didn't cover the replayCache fully, so new images had to be created, bound to memory. and they need to be written into their respective descriptor array indices again.
+		// imagesCache = std::make_unique<ImagesCache>(*currentReplayCache->imagesCache);
 		imagesCache->clear();
-		for (auto it = currentReplayCache->imagesCache->crbegin(); it != currentReplayCache->imagesCache->crend(); it++)
+		for (auto it = currentReplayCache->imagesCache->rbegin(); it != currentReplayCache->imagesCache->rend(); it++)
 			imagesCache->base_t::insert(it->first, it->second);
 
 		if (!replayCacheFullyCovered)
@@ -903,12 +901,7 @@ std::unique_ptr<DrawResourcesFiller::ReplayCache> DrawResourcesFiller::createRep
 		stagedMSDF.uploadedToGPU = false; // to trigger upload for all msdf functions again.
 	ret->drawCallsData = drawCalls;
 	ret->activeMainObjectIndex = activeMainObjectIndex;
-	ret->imagesCache = std::unique_ptr<ImagesCache>(new ImagesCache(imagesCache->size()));
-	// It should be copyable, here is a temporary hack:
-	for (auto it = imagesCache->crbegin(); it != imagesCache->crend(); it++)
-	{
-		ret->imagesCache->base_t::insert(it->first, it->second);
-	}
+	ret->imagesCache = std::unique_ptr<ImagesCache>(new ImagesCache(*imagesCache));
 	return ret;
 }
 
@@ -1138,7 +1131,7 @@ bool DrawResourcesFiller::bindImagesToArrayIndices(ImagesCache& imagesCache)
 		descriptorWrite.info = &descriptorInfos[descriptorWriteCount];
 		descriptorWrites[descriptorWriteCount] = descriptorWrite;
 
-		const_cast<CachedImageRecord&>(record).state = ImageState::BOUND_TO_DESCRIPTOR_SET;
+		record.state = ImageState::BOUND_TO_DESCRIPTOR_SET;
 		descriptorWriteCount++;
 	}
 
@@ -1157,7 +1150,7 @@ bool DrawResourcesFiller::pushStaticImagesUploads(SIntendedSubmitInfo& intendedN
 	for (auto& [id, record] : imagesCache)
 	{
 		if (record.staticCPUImage && record.type == ImageType::STATIC && record.state < ImageState::GPU_RESIDENT_WITH_VALID_STATIC_DATA)
-			nonResidentImageRecords.push_back(const_cast<CachedImageRecord*>(&record)); // TODO: remove const_cast
+			nonResidentImageRecords.push_back(&record);
 	}
 
 	if (nonResidentImageRecords.size() > 0ull)
diff --git a/62_CAD/Images.h b/62_CAD/Images.h
index 73be7ed50..ed09da9d6 100644
--- a/62_CAD/Images.h
+++ b/62_CAD/Images.h
@@ -163,14 +163,7 @@ class ImagesCache : public core::ResizableLRUCache<image_id, CachedImageRecord>
 	template<std::invocable<image_id, const CachedImageRecord&> EvictionCallback>
 	inline CachedImageRecord* insert(image_id imageID, uint64_t lastUsedSema, EvictionCallback&& evictCallback)
 	{
-		auto lruEvictionCallback = [&](const CachedImageRecord& evicted)
-			{
-				const image_id* evictingKey = base_t::get_least_recently_used();
-				assert(evictingKey != nullptr);
-				if (evictingKey)
-					evictCallback(*evictingKey, evicted);
-			};
-		return base_t::insert(imageID, lastUsedSema, lruEvictionCallback);
+		return base_t::insert(imageID, lastUsedSema, evictCallback);
 	}
 	
 	// Retrieves the image associated with `imageID`, updating its LRU position.

From d33f32e5b7370bc3e87488ae455a8e74419deba1 Mon Sep 17 00:00:00 2001
From: Przemek <minikers21@gmail.com>
Date: Mon, 26 May 2025 16:30:44 +0200
Subject: [PATCH 309/529] Corrections

---
 62_CAD/DrawResourcesFiller.cpp                    |  8 +++++++-
 62_CAD/shaders/main_pipeline/fragment_shader.hlsl | 11 ++++-------
 62_CAD/shaders/main_pipeline/vertex_shader.hlsl   | 15 +++++++--------
 3 files changed, 18 insertions(+), 16 deletions(-)

diff --git a/62_CAD/DrawResourcesFiller.cpp b/62_CAD/DrawResourcesFiller.cpp
index 1d6e95e66..8b440edf7 100644
--- a/62_CAD/DrawResourcesFiller.cpp
+++ b/62_CAD/DrawResourcesFiller.cpp
@@ -637,7 +637,13 @@ void DrawResourcesFiller::drawGridDTM(
 	uint32_t mainObjectIdx = acquireActiveMainObjectIndex_SubmitIfNeeded(intendedNextSubmit);
 	assert(mainObjectIdx != InvalidMainObjectIdx);
 
-	addGridDTM_Internal(gridDTMInfo, mainObjectIdx);
+	if (!addGridDTM_Internal(gridDTMInfo, mainObjectIdx))
+	{
+		// single grid DTM couldn't fit into memory to push to gpu, so we submit rendering current objects and reset geometry buffer and draw objects
+		submitCurrentDrawObjectsAndReset(intendedNextSubmit, mainObjectIdx);
+		bool success = addGridDTM_Internal(gridDTMInfo, mainObjectIdx);
+		assert(success); // this should always be true, otherwise it's either bug in code or not enough memory allocated to hold a single GridDTMInfo
+	}
 
 	endMainObject();
 }
diff --git a/62_CAD/shaders/main_pipeline/fragment_shader.hlsl b/62_CAD/shaders/main_pipeline/fragment_shader.hlsl
index 36fa3abf4..7738b169b 100644
--- a/62_CAD/shaders/main_pipeline/fragment_shader.hlsl
+++ b/62_CAD/shaders/main_pipeline/fragment_shader.hlsl
@@ -443,15 +443,12 @@ float4 fragMain(PSInput input) : SV_TARGET
                 const float MaxCellCoordY = round(gridExtents.y / cellWidth);
 
                 float2 insideCellCoord = gridSpacePos - float2(cellWidth, cellWidth) * cellCoords; // TODO: use fmod instead?
-
-                const float2 DistancesToTriangleALegs = diagonalFromTopLeftToBottomRight ? min(insideCellCoord.x, insideCellCoord.y) : min(insideCellCoord.x, cellWidth - insideCellCoord.y);
-                const float2 DistancesToTriangleBLegs = diagonalFromTopLeftToBottomRight ? min(cellWidth - insideCellCoord.x, cellWidth - insideCellCoord.y) : min(cellWidth - insideCellCoord.x, insideCellCoord.y);
-
-                float distanceToTriangleAExclusiveCorner = min(DistancesToTriangleALegs.x, DistancesToTriangleALegs.y);
-                float distanceToTriangleBExclusiveCorner = min(DistancesToTriangleBLegs.x, DistancesToTriangleBLegs.y);
                 
                 // my ASCII art above explains which triangle is A and which is B
-                const bool triangleA = distanceToTriangleAExclusiveCorner <= distanceToTriangleBExclusiveCorner;
+                const bool triangleA = diagonalFromTopLeftToBottomRight ?
+                    insideCellCoord.x < cellWidth - insideCellCoord.y :
+                    insideCellCoord.x < insideCellCoord.y;
+                
 
                 float2 gridSpaceCellTopLeftCoords = cellCoords * cellWidth;
 
diff --git a/62_CAD/shaders/main_pipeline/vertex_shader.hlsl b/62_CAD/shaders/main_pipeline/vertex_shader.hlsl
index 1074cc265..0624c159d 100644
--- a/62_CAD/shaders/main_pipeline/vertex_shader.hlsl
+++ b/62_CAD/shaders/main_pipeline/vertex_shader.hlsl
@@ -655,11 +655,13 @@ PSInput main(uint vertexID : SV_VertexID)
             float reciprocalOutlineStipplePatternLength = vk::RawBufferLoad<float>(globals.pointers.geometryBuffer + drawObj.geometryAddress + sizeof(pfloat64_t2) + 2 * sizeof(pfloat64_t) + sizeof(uint32_t) + sizeof(float), 8u);
 
             const float2 corner = float2(bool2(vertexIdx & 0x1u, vertexIdx >> 1));
+            pfloat64_t2 gridExtents;
+            gridExtents.x = width;
+            gridExtents.y = -height;
+
             pfloat64_t2 vtxPos = topLeft;
-            if (corner.x)
-                vtxPos.x = vtxPos.x + width;
-            if (corner.y)
-                vtxPos.y = vtxPos.y - height;
+            vtxPos = vtxPos + corner * gridExtents;
+            gridExtents.y = -gridExtents.y;
 
             float2 ndcVtxPos = _static_cast<float2>(transformPointNdc(clipProjectionData.projectionToNDC, vtxPos));
             outV.position = float4(ndcVtxPos, 0.0f, 1.0f);
@@ -667,10 +669,7 @@ PSInput main(uint vertexID : SV_VertexID)
             outV.setGridDTMScreenSpaceCellWidth(gridCellWidth * globals.screenToWorldRatio);
             outV.setGridDTMScreenSpacePosition(transformPointScreenSpace(clipProjectionData.projectionToNDC, globals.resolution, vtxPos));
             outV.setGridDTMScreenSpaceTopLeft(transformPointScreenSpace(clipProjectionData.projectionToNDC, globals.resolution, topLeft));
-            pfloat64_t2 gridExtents;
-            gridExtents.x = width;
-            gridExtents.y = height;
-            outV.setGridDTMScreenSpaceGridExtents(gridExtents * globals.screenToWorldRatio);
+            outV.setGridDTMScreenSpaceGridExtents(_static_cast<float2>(gridExtents) * globals.screenToWorldRatio);
             outV.setImageUV(corner);
             outV.setGridDTMOutlineStipplePatternLengthReciprocal(reciprocalOutlineStipplePatternLength);
         }

From 2a85f4e0911185a85df31f798b92e6902db3383e Mon Sep 17 00:00:00 2001
From: keptsecret <sorchon@gmail.com>
Date: Tue, 27 May 2025 11:24:43 +0700
Subject: [PATCH 310/529] refactor config member name

---
 23_Arithmetic2UnitTest/app_resources/testWorkgroup.comp.hlsl   | 2 +-
 29_Arithmetic2Bench/app_resources/benchmarkWorkgroup.comp.hlsl | 2 +-
 2 files changed, 2 insertions(+), 2 deletions(-)

diff --git a/23_Arithmetic2UnitTest/app_resources/testWorkgroup.comp.hlsl b/23_Arithmetic2UnitTest/app_resources/testWorkgroup.comp.hlsl
index e2256d2f1..048ccf316 100644
--- a/23_Arithmetic2UnitTest/app_resources/testWorkgroup.comp.hlsl
+++ b/23_Arithmetic2UnitTest/app_resources/testWorkgroup.comp.hlsl
@@ -14,7 +14,7 @@ using config_t = nbl::hlsl::workgroup2::ArithmeticConfiguration<WORKGROUP_SIZE_L
 typedef vector<uint32_t, config_t::ItemsPerInvocation_0> type_t;
 
 // final (level 1/2) scan needs to fit in one subgroup exactly
-groupshared uint32_t scratch[config_t::ElementCount];
+groupshared uint32_t scratch[config_t::SharedScratchElementCount];
 
 struct ScratchProxy
 {
diff --git a/29_Arithmetic2Bench/app_resources/benchmarkWorkgroup.comp.hlsl b/29_Arithmetic2Bench/app_resources/benchmarkWorkgroup.comp.hlsl
index 31284c520..fe340cf0c 100644
--- a/29_Arithmetic2Bench/app_resources/benchmarkWorkgroup.comp.hlsl
+++ b/29_Arithmetic2Bench/app_resources/benchmarkWorkgroup.comp.hlsl
@@ -14,7 +14,7 @@ using config_t = nbl::hlsl::workgroup2::ArithmeticConfiguration<WORKGROUP_SIZE_L
 typedef vector<uint32_t, config_t::ItemsPerInvocation_0> type_t;
 
 // final (level 1/2) scan needs to fit in one subgroup exactly
-groupshared uint32_t scratch[config_t::ElementCount];
+groupshared uint32_t scratch[config_t::SharedScratchElementCount];
 
 struct ScratchProxy
 {

From 99f6dfe5b4345cc8bbe7ff2ab2353993e395d3bd Mon Sep 17 00:00:00 2001
From: keptsecret <sorchon@gmail.com>
Date: Tue, 27 May 2025 13:49:02 +0700
Subject: [PATCH 311/529] fit new accessor concepts

---
 .../app_resources/testWorkgroup.comp.hlsl                 | 4 ++--
 .../app_resources/benchmarkWorkgroup.comp.hlsl            | 8 ++++----
 2 files changed, 6 insertions(+), 6 deletions(-)

diff --git a/23_Arithmetic2UnitTest/app_resources/testWorkgroup.comp.hlsl b/23_Arithmetic2UnitTest/app_resources/testWorkgroup.comp.hlsl
index 048ccf316..bda735b44 100644
--- a/23_Arithmetic2UnitTest/app_resources/testWorkgroup.comp.hlsl
+++ b/23_Arithmetic2UnitTest/app_resources/testWorkgroup.comp.hlsl
@@ -18,12 +18,12 @@ groupshared uint32_t scratch[config_t::SharedScratchElementCount];
 
 struct ScratchProxy
 {
-    template<typename AccessType>
+    template<typename IndexType, typename AccessType>
     void get(const uint32_t ix, NBL_REF_ARG(AccessType) value)
     {
         value = scratch[ix];
     }
-    template<typename AccessType>
+    template<typename IndexType, typename AccessType>
     void set(const uint32_t ix, const AccessType value)
     {
         scratch[ix] = value;
diff --git a/29_Arithmetic2Bench/app_resources/benchmarkWorkgroup.comp.hlsl b/29_Arithmetic2Bench/app_resources/benchmarkWorkgroup.comp.hlsl
index fe340cf0c..bfbe30ac9 100644
--- a/29_Arithmetic2Bench/app_resources/benchmarkWorkgroup.comp.hlsl
+++ b/29_Arithmetic2Bench/app_resources/benchmarkWorkgroup.comp.hlsl
@@ -18,13 +18,13 @@ groupshared uint32_t scratch[config_t::SharedScratchElementCount];
 
 struct ScratchProxy
 {
-    template<typename AccessType>
-    void get(const uint32_t ix, NBL_REF_ARG(AccessType) value)
+    template<typename IndexType, typename AccessType>
+    void get(const IndexType ix, NBL_REF_ARG(AccessType) value)
     {
         value = scratch[ix];
     }
-    template<typename AccessType>
-    void set(const uint32_t ix, const AccessType value)
+    template<typename IndexType, typename AccessType>
+    void set(const IndexType ix, const AccessType value)
     {
         scratch[ix] = value;
     }

From 0ed8dc4d42b8e11820f813b1c8281701ef1eebf0 Mon Sep 17 00:00:00 2001
From: devsh <devsh@devsh.eu>
Date: Tue, 27 May 2025 10:51:09 +0200
Subject: [PATCH 312/529] Lets remake the STL/PLY loaders

---
 27_PLYSTLDemo/config.json.template            |   28 -
 27_PLYSTLDemo/main.cpp                        |  579 ------
 .../CMakeLists.txt                            |    0
 .../config.json.template                      |    2 +-
 29_MeshLoaders/main.cpp                       | 1634 +++++++++++++++++
 .../pipeline.groovy                           |    0
 29_SpecializationConstants/CMakeLists.txt     |    7 -
 29_SpecializationConstants/main.cpp           |  566 ------
 29_SpecializationConstants/particles.comp     |   39 -
 29_SpecializationConstants/particles.frag     |   12 -
 29_SpecializationConstants/particles.vert     |   21 -
 29_SpecializationConstants/pipeline.groovy    |   50 -
 CMakeLists.txt                                |    1 +
 13 files changed, 1636 insertions(+), 1303 deletions(-)
 delete mode 100644 27_PLYSTLDemo/config.json.template
 delete mode 100644 27_PLYSTLDemo/main.cpp
 rename {27_PLYSTLDemo => 29_MeshLoaders}/CMakeLists.txt (100%)
 rename {29_SpecializationConstants => 29_MeshLoaders}/config.json.template (90%)
 create mode 100644 29_MeshLoaders/main.cpp
 rename {27_PLYSTLDemo => 29_MeshLoaders}/pipeline.groovy (100%)
 delete mode 100644 29_SpecializationConstants/CMakeLists.txt
 delete mode 100644 29_SpecializationConstants/main.cpp
 delete mode 100644 29_SpecializationConstants/particles.comp
 delete mode 100644 29_SpecializationConstants/particles.frag
 delete mode 100644 29_SpecializationConstants/particles.vert
 delete mode 100644 29_SpecializationConstants/pipeline.groovy

diff --git a/27_PLYSTLDemo/config.json.template b/27_PLYSTLDemo/config.json.template
deleted file mode 100644
index cb1b3b7a7..000000000
--- a/27_PLYSTLDemo/config.json.template
+++ /dev/null
@@ -1,28 +0,0 @@
-{
-  "enableParallelBuild": true,
-  "threadsPerBuildProcess" : 2,
-  "isExecuted": false,
-  "scriptPath": "",
-  "cmake": {
-    "configurations": [ "Release", "Debug", "RelWithDebInfo" ],
-    "buildModes": [],
-    "requiredOptions": [ "NBL_BUILD_MITSUBA_LOADER", "NBL_BUILD_OPTIX" ]
-  }, 
-  "profiles": [
-    {
-      "backend": "vulkan",
-      "platform": "windows",
-      "buildModes": [],
-      "runConfiguration": "Release",
-      "gpuArchitectures": []
-    }
-  ],
-  "dependencies": [],
-  "data": [
-    {
-      "dependencies": [],
-      "command": [""],
-      "outputs": []
-    }
-  ]
-}
\ No newline at end of file
diff --git a/27_PLYSTLDemo/main.cpp b/27_PLYSTLDemo/main.cpp
deleted file mode 100644
index 1e6d470e2..000000000
--- a/27_PLYSTLDemo/main.cpp
+++ /dev/null
@@ -1,579 +0,0 @@
-// Copyright (C) 2018-2020 - DevSH Graphics Programming Sp. z O.O.
-// This file is part of the "Nabla Engine".
-// For conditions of distribution and use, see copyright notice in nabla.h
-
-#define _NBL_STATIC_LIB_
-#include <iostream>
-#include <cstdio>
-#include <nabla.h>
-
-#include "CCamera.hpp"
-#include "../common/CommonAPI.h"
-#include "nbl/ext/ScreenShot/ScreenShot.h"
-
-using namespace nbl;
-using namespace core;
-
-/*
-	Uncomment for more detailed logging
-*/
-
-// #define NBL_MORE_LOGS
-
-/*
-	Uncomment for writing assets
-*/
-
-#define WRITE_ASSETS
-
-class PLYSTLDemo : public ApplicationBase
-{
-	static constexpr uint32_t WIN_W = 1280;
-	static constexpr uint32_t WIN_H = 720;
-	static constexpr uint32_t SC_IMG_COUNT = 3u;
-	static constexpr uint32_t FRAMES_IN_FLIGHT = 5u;
-	static constexpr uint64_t MAX_TIMEOUT = 99999999999999ull;
-	static constexpr size_t NBL_FRAMES_TO_AVERAGE = 100ull;
-	static_assert(FRAMES_IN_FLIGHT > SC_IMG_COUNT);
-
-	using RENDERPASS_INDEPENDENT_PIPELINE_ADRESS = size_t;
-	using GPU_PIPELINE_HASH_CONTAINER = std::map<RENDERPASS_INDEPENDENT_PIPELINE_ADRESS, core::smart_refctd_ptr<video::IGPUGraphicsPipeline>>;
-	using DependentDrawData = std::tuple<core::smart_refctd_ptr<video::IGPUMesh>, core::smart_refctd_ptr<video::IGPUBuffer>, core::smart_refctd_ptr<video::IGPUDescriptorSet>, uint32_t, const asset::IRenderpassIndependentPipelineMetadata*>;
-
-public:
-	nbl::core::smart_refctd_ptr<nbl::ui::IWindowManager> windowManager;
-	nbl::core::smart_refctd_ptr<nbl::ui::IWindow> window;
-	nbl::core::smart_refctd_ptr<CommonAPI::CommonAPIEventCallback> windowCallback;
-	nbl::core::smart_refctd_ptr<nbl::video::IAPIConnection> gl;
-	nbl::core::smart_refctd_ptr<nbl::video::ISurface> surface;
-	nbl::core::smart_refctd_ptr<nbl::video::IUtilities> utilities;
-	nbl::core::smart_refctd_ptr<nbl::video::ILogicalDevice> logicalDevice;
-	nbl::video::IPhysicalDevice* gpuPhysicalDevice;
-	std::array<nbl::video::IGPUQueue*, CommonAPI::InitOutput::MaxQueuesCount> queues = { nullptr, nullptr, nullptr, nullptr };
-	nbl::core::smart_refctd_ptr<nbl::video::ISwapchain> swapchain;
-	nbl::core::smart_refctd_ptr<nbl::video::IGPURenderpass> renderpass;
-	nbl::core::smart_refctd_dynamic_array<nbl::core::smart_refctd_ptr<nbl::video::IGPUFramebuffer>> fbos;
-	std::array<std::array<nbl::core::smart_refctd_ptr<nbl::video::IGPUCommandPool>, CommonAPI::InitOutput::MaxFramesInFlight>, CommonAPI::InitOutput::MaxQueuesCount> commandPools;
-	nbl::core::smart_refctd_ptr<nbl::system::ISystem> system;
-	nbl::core::smart_refctd_ptr<nbl::asset::IAssetManager> assetManager;
-	nbl::video::IGPUObjectFromAssetConverter::SParams cpu2gpuParams;
-	nbl::core::smart_refctd_ptr<nbl::system::ILogger> logger;
-	nbl::core::smart_refctd_ptr<CommonAPI::InputSystem> inputSystem;
-	
-	nbl::core::smart_refctd_ptr<video::IGPUFence> gpuTransferFence;
-	nbl::core::smart_refctd_ptr<video::IGPUFence> gpuComputeFence;
-	nbl::video::IGPUObjectFromAssetConverter cpu2gpu;
-	
-	uint32_t acquiredNextFBO = {};
-	int resourceIx = -1;
-	
-	core::smart_refctd_ptr<video::IGPUCommandBuffer> commandBuffers[FRAMES_IN_FLIGHT];
-	
-	core::smart_refctd_ptr<video::IGPUFence> frameComplete[FRAMES_IN_FLIGHT] = { nullptr };
-	core::smart_refctd_ptr<video::IGPUSemaphore> imageAcquire[FRAMES_IN_FLIGHT] = { nullptr };
-	core::smart_refctd_ptr<video::IGPUSemaphore> renderFinished[FRAMES_IN_FLIGHT] = { nullptr };
-
-	nbl::video::ISwapchain::SCreationParams m_swapchainCreationParams;
-	
-	std::chrono::system_clock::time_point lastTime;
-	bool frameDataFilled = false;
-	size_t frame_count = 0ull;
-	double time_sum = 0;
-	double dtList[NBL_FRAMES_TO_AVERAGE] = {};
-	
-	CommonAPI::InputSystem::ChannelReader<ui::IMouseEventChannel> mouse;
-	CommonAPI::InputSystem::ChannelReader<ui::IKeyboardEventChannel> keyboard;
-	
-	Camera camera = Camera(core::vectorSIMDf(0, 0, 0), core::vectorSIMDf(0, 0, 0), core::matrix4SIMD());
-	
-	GPU_PIPELINE_HASH_CONTAINER gpuPipelinesPly;
-	GPU_PIPELINE_HASH_CONTAINER gpuPipelinesStl;
-	
-	DependentDrawData plyDrawData;
-	DependentDrawData stlDrawData;
-	
-	void setWindow(core::smart_refctd_ptr<nbl::ui::IWindow>&& wnd) override
-	{
-		window = std::move(wnd);
-	}
-	nbl::ui::IWindow* getWindow() override
-	{
-		return window.get();
-	}
-	void setSystem(core::smart_refctd_ptr<nbl::system::ISystem>&& s) override
-	{
-		system = std::move(s);
-	}
-	video::IAPIConnection* getAPIConnection() override
-	{
-		return gl.get();
-	}
-	video::ILogicalDevice* getLogicalDevice()  override
-	{
-		return logicalDevice.get();
-	}
-	video::IGPURenderpass* getRenderpass() override
-	{
-		return renderpass.get();
-	}
-	void setSurface(core::smart_refctd_ptr<video::ISurface>&& s) override
-	{
-		surface = std::move(s);
-	}
-	void setFBOs(std::vector<core::smart_refctd_ptr<video::IGPUFramebuffer>>& f) override
-	{
-		for (int i = 0; i < f.size(); i++)
-		{
-			fbos->begin()[i] = core::smart_refctd_ptr(f[i]);
-		}
-	}
-	void setSwapchain(core::smart_refctd_ptr<video::ISwapchain>&& s) override
-	{
-		swapchain = std::move(s);
-	}
-	uint32_t getSwapchainImageCount() override
-	{
-		return swapchain->getImageCount();
-	}
-	virtual nbl::asset::E_FORMAT getDepthFormat() override
-	{
-		return nbl::asset::EF_D32_SFLOAT;
-	}
-
-APP_CONSTRUCTOR(PLYSTLDemo)
-
-	void onAppInitialized_impl() override
-	{
-		const auto swapchainImageUsage = static_cast<asset::IImage::E_USAGE_FLAGS>(asset::IImage::EUF_COLOR_ATTACHMENT_BIT);
-		CommonAPI::InitParams initParams;
-		initParams.window = core::smart_refctd_ptr(window);
-		initParams.apiType = video::EAT_VULKAN;
-		initParams.appName = { _NBL_APP_NAME_ };
-		initParams.framesInFlight = FRAMES_IN_FLIGHT;
-		initParams.windowWidth = WIN_W;
-		initParams.windowHeight = WIN_H;
-		initParams.swapchainImageCount = SC_IMG_COUNT;
-		initParams.swapchainImageUsage = swapchainImageUsage;
-		initParams.depthFormat = nbl::asset::EF_D32_SFLOAT;
-		auto initOutput = CommonAPI::InitWithDefaultExt(std::move(initParams));
-
-		window = std::move(initParams.window);
-		gl = std::move(initOutput.apiConnection);
-		surface = std::move(initOutput.surface);
-		gpuPhysicalDevice = std::move(initOutput.physicalDevice);
-		logicalDevice = std::move(initOutput.logicalDevice);
-		queues = std::move(initOutput.queues);
-		renderpass = std::move(initOutput.renderToSwapchainRenderpass);
-		commandPools = std::move(initOutput.commandPools);
-		assetManager = std::move(initOutput.assetManager);
-		logger = std::move(initOutput.logger);
-		inputSystem = std::move(initOutput.inputSystem);
-		system = std::move(initOutput.system);
-		windowCallback = std::move(initParams.windowCb);
-		utilities = std::move(initOutput.utilities);
-		m_swapchainCreationParams = std::move(initOutput.swapchainCreationParams);
-
-		CommonAPI::createSwapchain(std::move(logicalDevice), m_swapchainCreationParams, WIN_W, WIN_H, swapchain);
-		assert(swapchain);
-		fbos = CommonAPI::createFBOWithSwapchainImages(
-			swapchain->getImageCount(), WIN_W, WIN_H,
-			logicalDevice, swapchain, renderpass,
-			nbl::asset::EF_D32_SFLOAT
-		);
-
-		auto defaultComputeCommandPool = commandPools[CommonAPI::InitOutput::EQT_COMPUTE][0];
-		auto defaultTransferUpCommandPool = commandPools[CommonAPI::InitOutput::EQT_TRANSFER_UP][0];
-
-		nbl::video::IGPUObjectFromAssetConverter cpu2gpu;
-		nbl::video::IGPUObjectFromAssetConverter::SParams cpu2gpuParams;
-
-		nbl::core::smart_refctd_ptr<nbl::video::IGPUFence> gpuTransferFence;
-		nbl::core::smart_refctd_ptr<nbl::video::IGPUSemaphore> gpuTransferSemaphore;
-
-		nbl::core::smart_refctd_ptr<nbl::video::IGPUFence> gpuComputeFence;
-		nbl::core::smart_refctd_ptr<nbl::video::IGPUSemaphore> gpuComputeSemaphore;
-
-		{
-			gpuTransferFence = logicalDevice->createFence(static_cast<video::IGPUFence::E_CREATE_FLAGS>(0));
-			gpuTransferSemaphore = logicalDevice->createSemaphore();
-
-			gpuComputeFence = logicalDevice->createFence(static_cast<video::IGPUFence::E_CREATE_FLAGS>(0));
-			gpuComputeSemaphore = logicalDevice->createSemaphore();
-
-			cpu2gpuParams.utilities = utilities.get();
-			cpu2gpuParams.device = logicalDevice.get();
-			cpu2gpuParams.assetManager = assetManager.get();
-			cpu2gpuParams.pipelineCache = nullptr;
-			cpu2gpuParams.limits = gpuPhysicalDevice->getLimits();
-			cpu2gpuParams.finalQueueFamIx = queues[decltype(initOutput)::EQT_GRAPHICS]->getFamilyIndex();
-
-			logicalDevice->createCommandBuffers(defaultTransferUpCommandPool.get(),video::IGPUCommandBuffer::EL_PRIMARY,1u,&cpu2gpuParams.perQueue[nbl::video::IGPUObjectFromAssetConverter::EQU_TRANSFER].cmdbuf);
-			cpu2gpuParams.perQueue[nbl::video::IGPUObjectFromAssetConverter::EQU_TRANSFER].queue = queues[decltype(initOutput)::EQT_TRANSFER_UP];
-			cpu2gpuParams.perQueue[nbl::video::IGPUObjectFromAssetConverter::EQU_TRANSFER].semaphore = &gpuTransferSemaphore;
-			
-			logicalDevice->createCommandBuffers(defaultComputeCommandPool.get(),video::IGPUCommandBuffer::EL_PRIMARY,1u,&cpu2gpuParams.perQueue[nbl::video::IGPUObjectFromAssetConverter::EQU_COMPUTE].cmdbuf);
-			cpu2gpuParams.perQueue[nbl::video::IGPUObjectFromAssetConverter::EQU_COMPUTE].queue = queues[decltype(initOutput)::EQT_COMPUTE];
-			cpu2gpuParams.perQueue[nbl::video::IGPUObjectFromAssetConverter::EQU_COMPUTE].semaphore = &gpuComputeSemaphore;
-
-			cpu2gpuParams.beginCommandBuffers();
-		}
-
-		auto loadAndGetCpuMesh = [&](system::path path) -> std::pair<core::smart_refctd_ptr<asset::ICPUMesh>, const asset::IAssetMetadata*>
-		{
-			auto meshes_bundle = assetManager->getAsset(path.string(), {});
-			{
-				bool status = !meshes_bundle.getContents().empty();
-				assert(status);
-			}
-
-			auto mesh = core::smart_refctd_ptr_static_cast<asset::ICPUMesh>(meshes_bundle.getContents().begin()[0]);
-			auto metadata = meshes_bundle.getMetadata();
-			return std::make_pair(mesh, metadata);
-			//return std::make_pair(core::smart_refctd_ptr_static_cast<asset::ICPUMesh>(meshes_bundle.getContents().begin()[0]), meshes_bundle.getMetadata());
-		};
-
-		auto cpuBundlePLYData = loadAndGetCpuMesh(sharedInputCWD / "ply/Spanner-ply.ply");
-		auto cpuBundleSTLData = loadAndGetCpuMesh(sharedInputCWD / "extrusionLogo_TEST_fixed.stl");
-
-		core::smart_refctd_ptr<asset::ICPUMesh> cpuMeshPly = cpuBundlePLYData.first;
-		auto metadataPly = cpuBundlePLYData.second->selfCast<const asset::CPLYMetadata>();
-
-		core::smart_refctd_ptr<asset::ICPUMesh> cpuMeshStl = cpuBundleSTLData.first;
-		auto metadataStl = cpuBundleSTLData.second->selfCast<const asset::CSTLMetadata>();
-
-#ifdef WRITE_ASSETS
-		{
-			asset::IAssetWriter::SAssetWriteParams wp(cpuMeshPly.get());
-			bool status = assetManager->writeAsset("Spanner_ply.ply", wp);
-			assert(status);
-		}
-
-		{
-			asset::IAssetWriter::SAssetWriteParams wp(cpuMeshStl.get());
-			bool status = assetManager->writeAsset("extrusionLogo_TEST_fixedTest.stl", wp);
-			assert(status);
-		}
-#endif // WRITE_ASSETS
-
-		/*
-			For the testing puposes we can safely assume all meshbuffers within mesh loaded from PLY & STL has same DS1 layout (used for camera-specific data)
-		*/
-
-		auto getMeshDependentDrawData = [&](core::smart_refctd_ptr<asset::ICPUMesh> cpuMesh, bool isPLY) -> DependentDrawData
-		{
-			const asset::ICPUMeshBuffer* const firstMeshBuffer = cpuMesh->getMeshBuffers().begin()[0];
-			const asset::ICPUDescriptorSetLayout* ds1layout = firstMeshBuffer->getPipeline()->getLayout()->getDescriptorSetLayout(1u); //! DS1
-			const asset::IRenderpassIndependentPipelineMetadata* pipelineMetadata;
-			{
-				if (isPLY)
-					pipelineMetadata = metadataPly->getAssetSpecificMetadata(firstMeshBuffer->getPipeline());
-				else
-					pipelineMetadata = metadataStl->getAssetSpecificMetadata(firstMeshBuffer->getPipeline());
-			}
-
-			/*
-				So we can create just one DescriptorSet
-			*/
-
-			const uint32_t ds1UboBinding = ds1layout->getDescriptorRedirect(asset::IDescriptor::E_TYPE::ET_UNIFORM_BUFFER).getBinding(asset::ICPUDescriptorSetLayout::CBindingRedirect::storage_range_index_t{ 0 }).data;
-
-			auto getNeededDS1UboByteSize = [&]()
-			{
-				size_t neededDS1UboSize = 0ull;
-				{
-					for (const auto& shaderInputs : pipelineMetadata->m_inputSemantics)
-						if (shaderInputs.descriptorSection.type == asset::IRenderpassIndependentPipelineMetadata::ShaderInput::E_TYPE::ET_UNIFORM_BUFFER && shaderInputs.descriptorSection.uniformBufferObject.set == 1u && shaderInputs.descriptorSection.uniformBufferObject.binding == ds1UboBinding)
-							neededDS1UboSize = std::max<size_t>(neededDS1UboSize, shaderInputs.descriptorSection.uniformBufferObject.relByteoffset + shaderInputs.descriptorSection.uniformBufferObject.bytesize);
-				}
-				return neededDS1UboSize;
-			};
-
-			const uint64_t uboDS1ByteSize = getNeededDS1UboByteSize();
-
-			core::smart_refctd_ptr<video::IGPUDescriptorSetLayout> gpuds1layout;
-			{
-				auto gpu_array = cpu2gpu.getGPUObjectsFromAssets(&ds1layout, &ds1layout + 1, cpu2gpuParams);
-				if (!gpu_array || gpu_array->size() < 1u || !(*gpu_array)[0])
-					assert(false);
-
-				gpuds1layout = (*gpu_array)[0];
-			}
-
-			const uint32_t setCount = 1;
-			auto gpuUBODescriptorPool = logicalDevice->createDescriptorPoolForDSLayouts(video::IDescriptorPool::ECF_NONE, &gpuds1layout.get(), &gpuds1layout.get()+1ull, &setCount);
-
-			video::IGPUBuffer::SCreationParams creationParams;
-			creationParams.usage = asset::IBuffer::E_USAGE_FLAGS(asset::IBuffer::EUF_UNIFORM_BUFFER_BIT | asset::IBuffer::EUF_INLINE_UPDATE_VIA_CMDBUF);
-			creationParams.queueFamilyIndices = 0u;
-			creationParams.queueFamilyIndices = nullptr;
-			creationParams.size = uboDS1ByteSize;
-
-			auto gpuubo = logicalDevice->createBuffer(std::move(creationParams));
-			auto gpuuboMemReqs = gpuubo->getMemoryReqs();
-			gpuuboMemReqs.memoryTypeBits &= logicalDevice->getPhysicalDevice()->getDeviceLocalMemoryTypeBits();
-			logicalDevice->allocate(gpuuboMemReqs, gpuubo.get());
-
-			auto gpuds1 = gpuUBODescriptorPool->createDescriptorSet(std::move(gpuds1layout));
-			{
-				video::IGPUDescriptorSet::SWriteDescriptorSet write;
-				write.dstSet = gpuds1.get();
-				write.binding = ds1UboBinding;
-				write.count = 1u;
-				write.arrayElement = 0u;
-				write.descriptorType = asset::IDescriptor::E_TYPE::ET_UNIFORM_BUFFER;
-				video::IGPUDescriptorSet::SDescriptorInfo info;
-				{
-					info.desc = gpuubo;
-					info.info.buffer.offset = 0ull;
-					info.info.buffer.size = uboDS1ByteSize;
-				}
-				write.info = &info;
-				logicalDevice->updateDescriptorSets(1u, &write, 0u, nullptr);
-			}
-
-			core::smart_refctd_ptr<video::IGPUMesh> gpumesh;
-			{
-				auto gpu_array = cpu2gpu.getGPUObjectsFromAssets(&cpuMesh.get(), &cpuMesh.get() + 1, cpu2gpuParams);
-				cpu2gpuParams.waitForCreationToComplete(true);
-				cpu2gpuParams.beginCommandBuffers();
-				if (!gpu_array || gpu_array->size() < 1u || !(*gpu_array)[0])
-					assert(false);
-
-				gpumesh = (*gpu_array)[0];
-			}
-
-			return std::make_tuple(gpumesh, gpuubo, gpuds1, ds1UboBinding, pipelineMetadata);
-		};
-
-		plyDrawData = getMeshDependentDrawData(cpuMeshPly, true);
-		stlDrawData = getMeshDependentDrawData(cpuMeshStl, false);
-
-		{
-			auto fillGpuPipeline = [&](GPU_PIPELINE_HASH_CONTAINER& container, video::IGPUMesh* gpuMesh)
-			{
-				for (size_t i = 0; i < gpuMesh->getMeshBuffers().size(); ++i)
-				{
-					auto gpuIndependentPipeline = gpuMesh->getMeshBuffers().begin()[i]->getPipeline();
-
-					nbl::video::IGPUGraphicsPipeline::SCreationParams graphicsPipelineParams;
-					graphicsPipelineParams.renderpassIndependent = core::smart_refctd_ptr<nbl::video::IGPURenderpassIndependentPipeline>(const_cast<video::IGPURenderpassIndependentPipeline*>(gpuIndependentPipeline));
-					graphicsPipelineParams.renderpass = core::smart_refctd_ptr(renderpass);
-
-					const RENDERPASS_INDEPENDENT_PIPELINE_ADRESS adress = reinterpret_cast<RENDERPASS_INDEPENDENT_PIPELINE_ADRESS>(graphicsPipelineParams.renderpassIndependent.get());
-					container[adress] = logicalDevice->createGraphicsPipeline(nullptr, std::move(graphicsPipelineParams));
-				}
-			};
-
-			fillGpuPipeline(gpuPipelinesPly, std::get<core::smart_refctd_ptr<video::IGPUMesh>>(plyDrawData).get());
-			fillGpuPipeline(gpuPipelinesStl, std::get<core::smart_refctd_ptr<video::IGPUMesh>>(stlDrawData).get());
-		}
-
-		core::vectorSIMDf cameraPosition(0, 5, -10);
-		matrix4SIMD projectionMatrix = matrix4SIMD::buildProjectionMatrixPerspectiveFovLH(core::radians(60.0f), video::ISurface::getTransformedAspectRatio(swapchain->getPreTransform(), WIN_W, WIN_H), 0.001, 1000);
-		camera = Camera(cameraPosition, core::vectorSIMDf(0, 0, 0), projectionMatrix, 0.01f, 1.f);
-		lastTime = std::chrono::system_clock::now();
-
-		for (size_t i = 0ull; i < NBL_FRAMES_TO_AVERAGE; ++i)
-			dtList[i] = 0.0;
-
- 		const auto& graphicsCommandPools = commandPools[CommonAPI::InitOutput::EQT_GRAPHICS];
-		for (uint32_t i = 0u; i < FRAMES_IN_FLIGHT; i++)
-		{
-			logicalDevice->createCommandBuffers(graphicsCommandPools[i].get(), video::IGPUCommandBuffer::EL_PRIMARY, 1, commandBuffers+i);
-			imageAcquire[i] = logicalDevice->createSemaphore();
-			renderFinished[i] = logicalDevice->createSemaphore();
-		}
-	}
-
-	void onAppTerminated_impl() override
-	{
-		const auto& fboCreationParams = fbos->begin()[acquiredNextFBO]->getCreationParameters();
-		auto gpuSourceImageView = fboCreationParams.attachments[0];
-
-		//TODO: 
-		bool status = ext::ScreenShot::createScreenShot(
-			logicalDevice.get(),
-			queues[CommonAPI::InitOutput::EQT_TRANSFER_UP],
-			renderFinished[resourceIx].get(),
-			gpuSourceImageView.get(),
-			assetManager.get(),
-			"ScreenShot.png",
-			asset::IImage::EL_PRESENT_SRC,
-			asset::EAF_NONE);
-		assert(status);
-	}
-
-	void workLoopBody() override
-	{
-		++resourceIx;
-		if (resourceIx >= FRAMES_IN_FLIGHT)
-			resourceIx = 0;
-
-		auto& commandBuffer = commandBuffers[resourceIx];
-		auto& fence = frameComplete[resourceIx];
-
-		if (fence)
-			while (logicalDevice->waitForFences(1u, &fence.get(), false, MAX_TIMEOUT) == video::IGPUFence::ES_TIMEOUT) {}
-		else
-			fence = logicalDevice->createFence(static_cast<video::IGPUFence::E_CREATE_FLAGS>(0));
-
-		auto renderStart = std::chrono::system_clock::now();
-		const auto renderDt = std::chrono::duration_cast<std::chrono::milliseconds>(renderStart - lastTime).count();
-		lastTime = renderStart;
-		{ // Calculate Simple Moving Average for FrameTime
-			time_sum -= dtList[frame_count];
-			time_sum += renderDt;
-			dtList[frame_count] = renderDt;
-			frame_count++;
-			if (frame_count >= NBL_FRAMES_TO_AVERAGE)
-			{
-				frameDataFilled = true;
-				frame_count = 0;
-			}
-
-		}
-		const double averageFrameTime = frameDataFilled ? (time_sum / (double)NBL_FRAMES_TO_AVERAGE) : (time_sum / frame_count);
-
-#ifdef NBL_MORE_LOGS
-		logger->log("renderDt = %f ------ averageFrameTime = %f", system::ILogger::ELL_INFO, renderDt, averageFrameTime);
-#endif // NBL_MORE_LOGS
-
-		auto averageFrameTimeDuration = std::chrono::duration<double, std::milli>(averageFrameTime);
-		auto nextPresentationTime = renderStart + averageFrameTimeDuration;
-		auto nextPresentationTimeStamp = std::chrono::duration_cast<std::chrono::microseconds>(nextPresentationTime.time_since_epoch());
-
-		inputSystem->getDefaultMouse(&mouse);
-		inputSystem->getDefaultKeyboard(&keyboard);
-
-		camera.beginInputProcessing(nextPresentationTimeStamp);
-		mouse.consumeEvents([&](const ui::IMouseEventChannel::range_t& events) -> void { camera.mouseProcess(events); }, logger.get());
-		keyboard.consumeEvents([&](const ui::IKeyboardEventChannel::range_t& events) -> void { camera.keyboardProcess(events); }, logger.get());
-		camera.endInputProcessing(nextPresentationTimeStamp);
-
-		const auto& viewMatrix = camera.getViewMatrix();
-		const auto& viewProjectionMatrix = matrix4SIMD::concatenateBFollowedByAPrecisely(
-			video::ISurface::getSurfaceTransformationMatrix(swapchain->getPreTransform()),
-			camera.getConcatenatedMatrix()
-		);
-
-		commandBuffer->reset(nbl::video::IGPUCommandBuffer::ERF_RELEASE_RESOURCES_BIT);
-		commandBuffer->begin(video::IGPUCommandBuffer::EU_ONE_TIME_SUBMIT_BIT);  // TODO: Reset Frame's CommandPool
-
-		asset::SViewport viewport;
-		viewport.minDepth = 1.f;
-		viewport.maxDepth = 0.f;
-		viewport.x = 0u;
-		viewport.y = 0u;
-		viewport.width = WIN_W;
-		viewport.height = WIN_H;
-		commandBuffer->setViewport(0u, 1u, &viewport);
-
-		swapchain->acquireNextImage(MAX_TIMEOUT, imageAcquire[resourceIx].get(), nullptr, &acquiredNextFBO);
-
-		nbl::video::IGPUCommandBuffer::SRenderpassBeginInfo beginInfo;
-		{
-			VkRect2D area;
-			area.offset = { 0,0 };
-			area.extent = { WIN_W, WIN_H };
-			asset::SClearValue clear[2] = {};
-			clear[0].color.float32[0] = 1.f;
-			clear[0].color.float32[1] = 1.f;
-			clear[0].color.float32[2] = 1.f;
-			clear[0].color.float32[3] = 1.f;
-			clear[1].depthStencil.depth = 0.f;
-
-			beginInfo.clearValueCount = 2u;
-			beginInfo.framebuffer = fbos->begin()[acquiredNextFBO];
-			beginInfo.renderpass = renderpass;
-			beginInfo.renderArea = area;
-			beginInfo.clearValues = clear;
-		}
-
-		commandBuffer->beginRenderPass(&beginInfo, nbl::asset::ESC_INLINE);
-
-		auto renderMesh = [&](GPU_PIPELINE_HASH_CONTAINER& gpuPipelines, DependentDrawData& drawData, uint32_t index)
-		{
-			auto gpuMesh = std::get<core::smart_refctd_ptr<video::IGPUMesh>>(drawData);
-			auto gpuubo = std::get<core::smart_refctd_ptr<video::IGPUBuffer>>(drawData);
-			auto gpuds1 = std::get<core::smart_refctd_ptr<video::IGPUDescriptorSet>>(drawData);
-			auto ds1UboBinding = std::get<uint32_t>(drawData);
-			const auto* pipelineMetadata = std::get<const asset::IRenderpassIndependentPipelineMetadata*>(drawData);
-
-			core::matrix3x4SIMD modelMatrix;
-
-			if (index == 1)
-				modelMatrix.setScale(core::vectorSIMDf(10, 10, 10));
-			modelMatrix.setTranslation(nbl::core::vectorSIMDf(index * 150, 0, 0, 0));
-
-			core::matrix4SIMD mvp = core::concatenateBFollowedByA(viewProjectionMatrix, modelMatrix);
-
-			core::vector<uint8_t> uboData(gpuubo->getSize());
-			for (const auto& shaderInputs : pipelineMetadata->m_inputSemantics)
-			{
-				if (shaderInputs.descriptorSection.type == asset::IRenderpassIndependentPipelineMetadata::ShaderInput::E_TYPE::ET_UNIFORM_BUFFER && shaderInputs.descriptorSection.uniformBufferObject.set == 1u && shaderInputs.descriptorSection.uniformBufferObject.binding == ds1UboBinding)
-				{
-					switch (shaderInputs.type)
-					{
-					case asset::IRenderpassIndependentPipelineMetadata::ECSI_WORLD_VIEW_PROJ:
-					{
-						memcpy(uboData.data() + shaderInputs.descriptorSection.uniformBufferObject.relByteoffset, mvp.pointer(), shaderInputs.descriptorSection.uniformBufferObject.bytesize);
-					} break;
-
-					case asset::IRenderpassIndependentPipelineMetadata::ECSI_WORLD_VIEW:
-					{
-						memcpy(uboData.data() + shaderInputs.descriptorSection.uniformBufferObject.relByteoffset, viewMatrix.pointer(), shaderInputs.descriptorSection.uniformBufferObject.bytesize);
-					} break;
-
-					case asset::IRenderpassIndependentPipelineMetadata::ECSI_WORLD_VIEW_INVERSE_TRANSPOSE:
-					{
-						memcpy(uboData.data() + shaderInputs.descriptorSection.uniformBufferObject.relByteoffset, viewMatrix.pointer(), shaderInputs.descriptorSection.uniformBufferObject.bytesize);
-					} break;
-					}
-				}
-			}
-
-			commandBuffer->updateBuffer(gpuubo.get(), 0ull, gpuubo->getSize(), uboData.data());
-
-			for (auto gpuMeshBuffer : gpuMesh->getMeshBuffers())
-			{
-				auto gpuGraphicsPipeline = gpuPipelines[reinterpret_cast<RENDERPASS_INDEPENDENT_PIPELINE_ADRESS>(gpuMeshBuffer->getPipeline())];
-
-				const video::IGPURenderpassIndependentPipeline* gpuRenderpassIndependentPipeline = gpuMeshBuffer->getPipeline();
-				const video::IGPUDescriptorSet* ds3 = gpuMeshBuffer->getAttachedDescriptorSet();
-
-				commandBuffer->bindGraphicsPipeline(gpuGraphicsPipeline.get());
-
-				const video::IGPUDescriptorSet* gpuds1_ptr = gpuds1.get();
-				commandBuffer->bindDescriptorSets(asset::EPBP_GRAPHICS, gpuRenderpassIndependentPipeline->getLayout(), 1u, 1u, &gpuds1_ptr, 0u);
-				const video::IGPUDescriptorSet* gpuds3_ptr = gpuMeshBuffer->getAttachedDescriptorSet();
-
-				if (gpuds3_ptr)
-					commandBuffer->bindDescriptorSets(asset::EPBP_GRAPHICS, gpuRenderpassIndependentPipeline->getLayout(), 3u, 1u, &gpuds3_ptr, 0u);
-				if (gpuRenderpassIndependentPipeline->getLayout()->m_pushConstantRanges)
-					commandBuffer->pushConstants(gpuRenderpassIndependentPipeline->getLayout(), video::IGPUShader::ESS_FRAGMENT, 0u, gpuMeshBuffer->MAX_PUSH_CONSTANT_BYTESIZE, gpuMeshBuffer->getPushConstantsDataPtr());
-
-				commandBuffer->drawMeshBuffer(gpuMeshBuffer);
-			}
-		};
-
-		/*
-			Record PLY and STL rendering commands
-		*/
-
-		renderMesh(gpuPipelinesPly, plyDrawData, 0);
-		renderMesh(gpuPipelinesStl, stlDrawData, 1);
-
-		commandBuffer->endRenderPass();
-		commandBuffer->end();
-
-		CommonAPI::Submit(logicalDevice.get(), commandBuffer.get(), queues[CommonAPI::InitOutput::EQT_GRAPHICS], imageAcquire[resourceIx].get(), renderFinished[resourceIx].get(), fence.get());
-		CommonAPI::Present(logicalDevice.get(), swapchain.get(), queues[CommonAPI::InitOutput::EQT_GRAPHICS], renderFinished[resourceIx].get(), acquiredNextFBO);
-	}
-
-	bool keepRunning() override
-	{
-		return windowCallback->isWindowOpen();
-	}
-};
-
-NBL_COMMON_API_MAIN(PLYSTLDemo)
\ No newline at end of file
diff --git a/27_PLYSTLDemo/CMakeLists.txt b/29_MeshLoaders/CMakeLists.txt
similarity index 100%
rename from 27_PLYSTLDemo/CMakeLists.txt
rename to 29_MeshLoaders/CMakeLists.txt
diff --git a/29_SpecializationConstants/config.json.template b/29_MeshLoaders/config.json.template
similarity index 90%
rename from 29_SpecializationConstants/config.json.template
rename to 29_MeshLoaders/config.json.template
index f961745c1..2c42b001d 100644
--- a/29_SpecializationConstants/config.json.template
+++ b/29_MeshLoaders/config.json.template
@@ -6,7 +6,7 @@
   "cmake": {
     "configurations": [ "Release", "Debug", "RelWithDebInfo" ],
     "buildModes": [],
-    "requiredOptions": []
+    "requiredOptions": [ "NBL_BUILD_MITSUBA_LOADER" ]
   }, 
   "profiles": [
     {
diff --git a/29_MeshLoaders/main.cpp b/29_MeshLoaders/main.cpp
new file mode 100644
index 000000000..968f7c42e
--- /dev/null
+++ b/29_MeshLoaders/main.cpp
@@ -0,0 +1,1634 @@
+// Copyright (C) 2018-2024 - DevSH Graphics Programming Sp. z O.O.
+// This file is part of the "Nabla Engine".
+// For conditions of distribution and use, see copyright notice in nabla.h
+
+#include "common.hpp"
+#include "nbl/ext/FullScreenTriangle/FullScreenTriangle.h"
+#include "nbl/builtin/hlsl/indirect_commands.hlsl"
+
+
+class RaytracingPipelineApp final : public examples::SimpleWindowedApplication, public application_templates::MonoAssetManagerAndBuiltinResourceApplication
+{
+	using device_base_t = examples::SimpleWindowedApplication;
+	using asset_base_t = application_templates::MonoAssetManagerAndBuiltinResourceApplication;
+	using clock_t = std::chrono::steady_clock;
+
+	constexpr static inline uint32_t WIN_W = 1280, WIN_H = 720;
+	constexpr static inline uint32_t MaxFramesInFlight = 3u;
+	constexpr static inline uint8_t MaxUITextureCount = 1u;
+	constexpr static inline uint32_t NumberOfProceduralGeometries = 5;
+
+	static constexpr const char* s_lightTypeNames[E_LIGHT_TYPE::ELT_COUNT] = {
+	  "Directional",
+	  "Point",
+	  "Spot"
+	};
+
+	struct ShaderBindingTable
+	{
+		SBufferRange<IGPUBuffer> raygenGroupRange;
+		SBufferRange<IGPUBuffer> hitGroupsRange;
+		uint32_t hitGroupsStride;
+		SBufferRange<IGPUBuffer> missGroupsRange;
+		uint32_t missGroupsStride;
+		SBufferRange<IGPUBuffer> callableGroupsRange;
+		uint32_t callableGroupsStride;
+	};
+
+
+public:
+	inline RaytracingPipelineApp(const path& _localInputCWD, const path& _localOutputCWD, const path& _sharedInputCWD, const path& _sharedOutputCWD)
+		: IApplicationFramework(_localInputCWD, _localOutputCWD, _sharedInputCWD, _sharedOutputCWD)
+	{
+	}
+
+	inline SPhysicalDeviceFeatures getRequiredDeviceFeatures() const override
+	{
+		auto retval = device_base_t::getRequiredDeviceFeatures();
+		retval.rayTracingPipeline = true;
+		retval.accelerationStructure = true;
+		retval.rayQuery = true;
+		return retval;
+	}
+
+	inline SPhysicalDeviceFeatures getPreferredDeviceFeatures() const override
+	{
+		auto retval = device_base_t::getPreferredDeviceFeatures();
+		retval.accelerationStructureHostCommands = true;
+		return retval;
+	}
+
+	inline core::vector<video::SPhysicalDeviceFilter::SurfaceCompatibility> getSurfaces() const override
+	{
+		if (!m_surface)
+		{
+			{
+				auto windowCallback = core::make_smart_refctd_ptr<CEventCallback>(smart_refctd_ptr(m_inputSystem), smart_refctd_ptr(m_logger));
+				IWindow::SCreationParams params = {};
+				params.callback = core::make_smart_refctd_ptr<ISimpleManagedSurface::ICallback>();
+				params.width = WIN_W;
+				params.height = WIN_H;
+				params.x = 32;
+				params.y = 32;
+				params.flags = ui::IWindow::ECF_HIDDEN | IWindow::ECF_BORDERLESS | IWindow::ECF_RESIZABLE;
+				params.windowCaption = "RaytracingPipelineApp";
+				params.callback = windowCallback;
+				const_cast<std::remove_const_t<decltype(m_window)>&>(m_window) = m_winMgr->createWindow(std::move(params));
+			}
+
+			auto surface = CSurfaceVulkanWin32::create(smart_refctd_ptr(m_api), smart_refctd_ptr_static_cast<IWindowWin32>(m_window));
+			const_cast<std::remove_const_t<decltype(m_surface)>&>(m_surface) = CSimpleResizeSurface<ISimpleManagedSurface::ISwapchainResources>::create(std::move(surface));
+		}
+
+		if (m_surface)
+			return { {m_surface->getSurface()/*,EQF_NONE*/} };
+
+		return {};
+	}
+
+	// so that we can use the same queue for asset converter and rendering
+	inline core::vector<queue_req_t> getQueueRequirements() const override
+	{
+		auto reqs = device_base_t::getQueueRequirements();
+		reqs.front().requiredFlags |= IQueue::FAMILY_FLAGS::COMPUTE_BIT;
+		return reqs;
+	}
+
+	inline bool onAppInitialized(smart_refctd_ptr<ISystem>&& system) override
+	{
+		m_inputSystem = make_smart_refctd_ptr<InputSystem>(logger_opt_smart_ptr(smart_refctd_ptr(m_logger)));
+
+		if (!device_base_t::onAppInitialized(smart_refctd_ptr(system)))
+			return false;
+
+		if (!asset_base_t::onAppInitialized(smart_refctd_ptr(system)))
+			return false;
+
+		smart_refctd_ptr<IShaderCompiler::CCache> shaderReadCache = nullptr;
+		smart_refctd_ptr<IShaderCompiler::CCache> shaderWriteCache = core::make_smart_refctd_ptr<IShaderCompiler::CCache>();
+		auto shaderCachePath = localOutputCWD / "main_pipeline_shader_cache.bin";
+
+		{
+			core::smart_refctd_ptr<system::IFile> shaderReadCacheFile;
+			{
+				system::ISystem::future_t<core::smart_refctd_ptr<system::IFile>> future;
+				m_system->createFile(future, shaderCachePath.c_str(), system::IFile::ECF_READ);
+				if (future.wait())
+				{
+					future.acquire().move_into(shaderReadCacheFile);
+					if (shaderReadCacheFile)
+					{
+						const size_t size = shaderReadCacheFile->getSize();
+						if (size > 0ull)
+						{
+							std::vector<uint8_t> contents(size);
+							system::IFile::success_t succ;
+							shaderReadCacheFile->read(succ, contents.data(), 0, size);
+							if (succ)
+								shaderReadCache = IShaderCompiler::CCache::deserialize(contents);
+						}
+					}
+				}
+				else
+					m_logger->log("Failed Openning Shader Cache File.", ILogger::ELL_ERROR);
+			}
+
+		}
+
+		// Load Custom Shader
+		auto loadCompileAndCreateShader = [&](const std::string& relPath) -> smart_refctd_ptr<IGPUShader>
+			{
+				IAssetLoader::SAssetLoadParams lp = {};
+				lp.logger = m_logger.get();
+				lp.workingDirectory = ""; // virtual root
+				auto assetBundle = m_assetMgr->getAsset(relPath, lp);
+				const auto assets = assetBundle.getContents();
+				if (assets.empty())
+					return nullptr;
+
+				// lets go straight from ICPUSpecializedShader to IGPUSpecializedShader
+				auto sourceRaw = IAsset::castDown<ICPUShader>(assets[0]);
+				if (!sourceRaw)
+					return nullptr;
+
+				return m_device->createShader({ sourceRaw.get(), nullptr, shaderReadCache.get(), shaderWriteCache.get() });
+			};
+
+		// load shaders
+		const auto raygenShader = loadCompileAndCreateShader("app_resources/raytrace.rgen.hlsl");
+		const auto closestHitShader = loadCompileAndCreateShader("app_resources/raytrace.rchit.hlsl");
+		const auto proceduralClosestHitShader = loadCompileAndCreateShader("app_resources/raytrace_procedural.rchit.hlsl");
+		const auto intersectionHitShader = loadCompileAndCreateShader("app_resources/raytrace.rint.hlsl");
+		const auto anyHitShaderColorPayload = loadCompileAndCreateShader("app_resources/raytrace.rahit.hlsl");
+		const auto anyHitShaderShadowPayload = loadCompileAndCreateShader("app_resources/raytrace_shadow.rahit.hlsl");
+		const auto missShader = loadCompileAndCreateShader("app_resources/raytrace.rmiss.hlsl");
+		const auto missShadowShader = loadCompileAndCreateShader("app_resources/raytrace_shadow.rmiss.hlsl");
+		const auto directionalLightCallShader = loadCompileAndCreateShader("app_resources/light_directional.rcall.hlsl");
+		const auto pointLightCallShader = loadCompileAndCreateShader("app_resources/light_point.rcall.hlsl");
+		const auto spotLightCallShader = loadCompileAndCreateShader("app_resources/light_spot.rcall.hlsl");
+		const auto fragmentShader = loadCompileAndCreateShader("app_resources/present.frag.hlsl");
+
+		core::smart_refctd_ptr<system::IFile> shaderWriteCacheFile;
+		{
+			system::ISystem::future_t<core::smart_refctd_ptr<system::IFile>> future;
+			m_system->deleteFile(shaderCachePath); // temp solution instead of trimming, to make sure we won't have corrupted json
+			m_system->createFile(future, shaderCachePath.c_str(), system::IFile::ECF_WRITE);
+			if (future.wait())
+			{
+				future.acquire().move_into(shaderWriteCacheFile);
+				if (shaderWriteCacheFile)
+				{
+					auto serializedCache = shaderWriteCache->serialize();
+					if (shaderWriteCacheFile)
+					{
+						system::IFile::success_t succ;
+						shaderWriteCacheFile->write(succ, serializedCache->getPointer(), 0, serializedCache->getSize());
+						if (!succ)
+							m_logger->log("Failed Writing To Shader Cache File.", ILogger::ELL_ERROR);
+					}
+				}
+				else
+					m_logger->log("Failed Creating Shader Cache File.", ILogger::ELL_ERROR);
+			}
+			else
+				m_logger->log("Failed Creating Shader Cache File.", ILogger::ELL_ERROR);
+		}
+
+		m_semaphore = m_device->createSemaphore(m_realFrameIx);
+		if (!m_semaphore)
+			return logFail("Failed to Create a Semaphore!");
+
+		auto gQueue = getGraphicsQueue();
+
+		// Create renderpass and init surface
+		nbl::video::IGPURenderpass* renderpass;
+		{
+			ISwapchain::SCreationParams swapchainParams = { .surface = smart_refctd_ptr<ISurface>(m_surface->getSurface()) };
+			if (!swapchainParams.deduceFormat(m_physicalDevice))
+				return logFail("Could not choose a Surface Format for the Swapchain!");
+
+			const static IGPURenderpass::SCreationParams::SSubpassDependency dependencies[] =
+			{
+			  {
+				.srcSubpass = IGPURenderpass::SCreationParams::SSubpassDependency::External,
+				.dstSubpass = 0,
+				.memoryBarrier =
+				{
+				  .srcStageMask = asset::PIPELINE_STAGE_FLAGS::COPY_BIT,
+				  .srcAccessMask = asset::ACCESS_FLAGS::TRANSFER_WRITE_BIT,
+				  .dstStageMask = asset::PIPELINE_STAGE_FLAGS::COLOR_ATTACHMENT_OUTPUT_BIT,
+				  .dstAccessMask = asset::ACCESS_FLAGS::COLOR_ATTACHMENT_WRITE_BIT
+				}
+			  },
+			  {
+				.srcSubpass = 0,
+				.dstSubpass = IGPURenderpass::SCreationParams::SSubpassDependency::External,
+				.memoryBarrier =
+				{
+				  .srcStageMask = asset::PIPELINE_STAGE_FLAGS::COLOR_ATTACHMENT_OUTPUT_BIT,
+				  .srcAccessMask = asset::ACCESS_FLAGS::COLOR_ATTACHMENT_WRITE_BIT
+				}
+			  },
+			  IGPURenderpass::SCreationParams::DependenciesEnd
+			};
+
+			auto scResources = std::make_unique<CDefaultSwapchainFramebuffers>(m_device.get(), swapchainParams.surfaceFormat.format, dependencies);
+			renderpass = scResources->getRenderpass();
+
+			if (!renderpass)
+				return logFail("Failed to create Renderpass!");
+
+			if (!m_surface || !m_surface->init(gQueue, std::move(scResources), swapchainParams.sharedParams))
+				return logFail("Could not create Window & Surface or initialize the Surface!");
+		}
+
+		auto pool = m_device->createCommandPool(gQueue->getFamilyIndex(), IGPUCommandPool::CREATE_FLAGS::RESET_COMMAND_BUFFER_BIT);
+
+		m_converter = CAssetConverter::create({ .device = m_device.get(), .optimizer = {} });
+
+		for (auto i = 0u; i < MaxFramesInFlight; i++)
+		{
+			if (!pool)
+				return logFail("Couldn't create Command Pool!");
+			if (!pool->createCommandBuffers(IGPUCommandPool::BUFFER_LEVEL::PRIMARY, { m_cmdBufs.data() + i, 1 }))
+				return logFail("Couldn't create Command Buffer!");
+		}
+
+		m_winMgr->setWindowSize(m_window.get(), WIN_W, WIN_H);
+		m_surface->recreateSwapchain();
+
+
+		// create output images
+		m_hdrImage = m_device->createImage({
+			{
+			  .type = IGPUImage::ET_2D,
+			  .samples = ICPUImage::ESCF_1_BIT,
+			  .format = EF_R16G16B16A16_SFLOAT,
+			  .extent = {WIN_W, WIN_H, 1},
+			  .mipLevels = 1,
+			  .arrayLayers = 1,
+			  .flags = IImage::ECF_NONE,
+			  .usage = bitflag(IImage::EUF_STORAGE_BIT) | IImage::EUF_TRANSFER_SRC_BIT | IImage::EUF_SAMPLED_BIT
+			}
+			});
+
+		if (!m_hdrImage || !m_device->allocate(m_hdrImage->getMemoryReqs(), m_hdrImage.get()).isValid())
+			return logFail("Could not create HDR Image");
+
+		m_hdrImageView = m_device->createImageView({
+		  .flags = IGPUImageView::ECF_NONE,
+		  .subUsages = IGPUImage::E_USAGE_FLAGS::EUF_STORAGE_BIT | IGPUImage::E_USAGE_FLAGS::EUF_SAMPLED_BIT,
+		  .image = m_hdrImage,
+		  .viewType = IGPUImageView::E_TYPE::ET_2D,
+		  .format = asset::EF_R16G16B16A16_SFLOAT
+			});
+
+
+
+		// ray trace pipeline and descriptor set layout setup
+		{
+			const IGPUDescriptorSetLayout::SBinding bindings[] = {
+			  {
+				.binding = 0,
+				.type = asset::IDescriptor::E_TYPE::ET_ACCELERATION_STRUCTURE,
+				.createFlags = IGPUDescriptorSetLayout::SBinding::E_CREATE_FLAGS::ECF_NONE,
+				.stageFlags = asset::IShader::E_SHADER_STAGE::ESS_RAYGEN,
+				.count = 1,
+			  },
+			  {
+				.binding = 1,
+				.type = asset::IDescriptor::E_TYPE::ET_STORAGE_IMAGE,
+				.createFlags = IGPUDescriptorSetLayout::SBinding::E_CREATE_FLAGS::ECF_NONE,
+				.stageFlags = asset::IShader::E_SHADER_STAGE::ESS_RAYGEN,
+				.count = 1,
+			  }
+			};
+			const auto descriptorSetLayout = m_device->createDescriptorSetLayout(bindings);
+
+			const std::array<IGPUDescriptorSetLayout*, ICPUPipelineLayout::DESCRIPTOR_SET_COUNT> dsLayoutPtrs = { descriptorSetLayout.get() };
+			m_rayTracingDsPool = m_device->createDescriptorPoolForDSLayouts(IDescriptorPool::ECF_UPDATE_AFTER_BIND_BIT, std::span(dsLayoutPtrs.begin(), dsLayoutPtrs.end()));
+			m_rayTracingDs = m_rayTracingDsPool->createDescriptorSet(descriptorSetLayout);
+
+			const SPushConstantRange pcRange = {
+			  .stageFlags = IShader::E_SHADER_STAGE::ESS_ALL_RAY_TRACING,
+			  .offset = 0u,
+			  .size = sizeof(SPushConstants),
+			};
+			const auto pipelineLayout = m_device->createPipelineLayout({ &pcRange, 1 }, smart_refctd_ptr(descriptorSetLayout), nullptr, nullptr, nullptr);
+
+			IGPURayTracingPipeline::SCreationParams params = {};
+
+			enum RtDemoShader
+			{
+				RTDS_RAYGEN,
+				RTDS_MISS,
+				RTDS_MISS_SHADOW,
+				RTDS_CLOSEST_HIT,
+				RTDS_SPHERE_CLOSEST_HIT,
+				RTDS_ANYHIT_PRIMARY,
+				RTDS_ANYHIT_SHADOW,
+				RTDS_INTERSECTION,
+				RTDS_DIRECTIONAL_CALL,
+				RTDS_POINT_CALL,
+				RTDS_SPOT_CALL,
+				RTDS_COUNT
+			};
+
+			IGPUShader::SSpecInfo shaders[RTDS_COUNT];
+			shaders[RTDS_RAYGEN] = { .shader = raygenShader.get() };
+			shaders[RTDS_MISS] = { .shader = missShader.get() };
+			shaders[RTDS_MISS_SHADOW] = { .shader = missShadowShader.get() };
+			shaders[RTDS_CLOSEST_HIT] = { .shader = closestHitShader.get() };
+			shaders[RTDS_SPHERE_CLOSEST_HIT] = { .shader = proceduralClosestHitShader.get() };
+			shaders[RTDS_ANYHIT_PRIMARY] = { .shader = anyHitShaderColorPayload.get() };
+			shaders[RTDS_ANYHIT_SHADOW] = { .shader = anyHitShaderShadowPayload.get() };
+			shaders[RTDS_INTERSECTION] = { .shader = intersectionHitShader.get() };
+			shaders[RTDS_DIRECTIONAL_CALL] = { .shader = directionalLightCallShader.get() };
+			shaders[RTDS_POINT_CALL] = { .shader = pointLightCallShader.get() };
+			shaders[RTDS_SPOT_CALL] = { .shader = spotLightCallShader.get() };
+
+			params.layout = pipelineLayout.get();
+			params.shaders = std::span(shaders);
+			using RayTracingFlags = IGPURayTracingPipeline::SCreationParams::FLAGS;
+			params.flags = core::bitflag(RayTracingFlags::NO_NULL_MISS_SHADERS) |
+				RayTracingFlags::NO_NULL_INTERSECTION_SHADERS |
+				RayTracingFlags::NO_NULL_ANY_HIT_SHADERS;
+
+			auto& shaderGroups = params.shaderGroups;
+
+			shaderGroups.raygen = { .index = RTDS_RAYGEN };
+
+			IRayTracingPipelineBase::SGeneralShaderGroup missGroups[EMT_COUNT];
+			missGroups[EMT_PRIMARY] = { .index = RTDS_MISS };
+			missGroups[EMT_OCCLUSION] = { .index = RTDS_MISS_SHADOW };
+			shaderGroups.misses = missGroups;
+
+			auto getHitGroupIndex = [](E_GEOM_TYPE geomType, E_RAY_TYPE rayType)
+				{
+					return geomType * ERT_COUNT + rayType;
+				};
+			IRayTracingPipelineBase::SHitShaderGroup hitGroups[E_RAY_TYPE::ERT_COUNT * E_GEOM_TYPE::EGT_COUNT];
+			hitGroups[getHitGroupIndex(EGT_TRIANGLES, ERT_PRIMARY)] = {
+			  .closestHit = RTDS_CLOSEST_HIT,
+			  .anyHit = RTDS_ANYHIT_PRIMARY,
+			};
+			hitGroups[getHitGroupIndex(EGT_TRIANGLES, ERT_OCCLUSION)] = {
+			  .closestHit = IGPURayTracingPipeline::SGeneralShaderGroup::Unused,
+			  .anyHit = RTDS_ANYHIT_SHADOW,
+			};
+			hitGroups[getHitGroupIndex(EGT_PROCEDURAL, ERT_PRIMARY)] = {
+			  .closestHit = RTDS_SPHERE_CLOSEST_HIT,
+			  .anyHit = RTDS_ANYHIT_PRIMARY,
+			  .intersection = RTDS_INTERSECTION,
+			};
+			hitGroups[getHitGroupIndex(EGT_PROCEDURAL, ERT_OCCLUSION)] = {
+			  .closestHit = IGPURayTracingPipeline::SGeneralShaderGroup::Unused,
+			  .anyHit = RTDS_ANYHIT_SHADOW,
+			  .intersection = RTDS_INTERSECTION,
+			};
+			shaderGroups.hits = hitGroups;
+
+			IRayTracingPipelineBase::SGeneralShaderGroup callableGroups[ELT_COUNT];
+			callableGroups[ELT_DIRECTIONAL] = { .index = RTDS_DIRECTIONAL_CALL };
+			callableGroups[ELT_POINT] = { .index = RTDS_POINT_CALL };
+			callableGroups[ELT_SPOT] = { .index = RTDS_SPOT_CALL };
+			shaderGroups.callables = callableGroups;
+
+			params.cached.maxRecursionDepth = 1;
+			params.cached.dynamicStackSize = true;
+
+			if (!m_device->createRayTracingPipelines(nullptr, { &params, 1 }, &m_rayTracingPipeline))
+				return logFail("Failed to create ray tracing pipeline");
+
+			calculateRayTracingStackSize(m_rayTracingPipeline);
+
+			if (!createShaderBindingTable(m_rayTracingPipeline))
+				return logFail("Could not create shader binding table");
+
+		}
+
+		auto assetManager = make_smart_refctd_ptr<nbl::asset::IAssetManager>(smart_refctd_ptr(system));
+		auto* geometryCreator = assetManager->getGeometryCreator();
+
+		if (!createIndirectBuffer())
+			return logFail("Could not create indirect buffer");
+
+		if (!createAccelerationStructuresFromGeometry(geometryCreator))
+			return logFail("Could not create acceleration structures from geometry creator");
+
+		ISampler::SParams samplerParams = {
+		  .AnisotropicFilter = 0
+		};
+		auto defaultSampler = m_device->createSampler(samplerParams);
+
+		{
+			const IGPUDescriptorSetLayout::SBinding bindings[] = {
+			  {
+				.binding = 0u,
+				.type = nbl::asset::IDescriptor::E_TYPE::ET_COMBINED_IMAGE_SAMPLER,
+				.createFlags = ICPUDescriptorSetLayout::SBinding::E_CREATE_FLAGS::ECF_NONE,
+				.stageFlags = IShader::E_SHADER_STAGE::ESS_FRAGMENT,
+				.count = 1u,
+				.immutableSamplers = &defaultSampler
+			  }
+			};
+			auto gpuPresentDescriptorSetLayout = m_device->createDescriptorSetLayout(bindings);
+			const video::IGPUDescriptorSetLayout* const layouts[] = { gpuPresentDescriptorSetLayout.get() };
+			const uint32_t setCounts[] = { 1u };
+			m_presentDsPool = m_device->createDescriptorPoolForDSLayouts(IDescriptorPool::E_CREATE_FLAGS::ECF_NONE, layouts, setCounts);
+			m_presentDs = m_presentDsPool->createDescriptorSet(gpuPresentDescriptorSetLayout);
+
+			auto scRes = static_cast<CDefaultSwapchainFramebuffers*>(m_surface->getSwapchainResources());
+			ext::FullScreenTriangle::ProtoPipeline fsTriProtoPPln(m_assetMgr.get(), m_device.get(), m_logger.get());
+			if (!fsTriProtoPPln)
+				return logFail("Failed to create Full Screen Triangle protopipeline or load its vertex shader!");
+
+			const IGPUShader::SSpecInfo fragSpec = {
+			  .entryPoint = "main",
+			  .shader = fragmentShader.get()
+			};
+
+			auto presentLayout = m_device->createPipelineLayout(
+				{},
+				core::smart_refctd_ptr(gpuPresentDescriptorSetLayout),
+				nullptr,
+				nullptr,
+				nullptr
+			);
+			m_presentPipeline = fsTriProtoPPln.createPipeline(fragSpec, presentLayout.get(), scRes->getRenderpass());
+			if (!m_presentPipeline)
+				return logFail("Could not create Graphics Pipeline!");
+		}
+
+		// write descriptors
+		IGPUDescriptorSet::SDescriptorInfo infos[3];
+		infos[0].desc = m_gpuTlas;
+
+		infos[1].desc = m_hdrImageView;
+		if (!infos[1].desc)
+			return logFail("Failed to create image view");
+		infos[1].info.image.imageLayout = IImage::LAYOUT::GENERAL;
+
+		infos[2].desc = m_hdrImageView;
+		infos[2].info.image.imageLayout = IImage::LAYOUT::READ_ONLY_OPTIMAL;
+
+		IGPUDescriptorSet::SWriteDescriptorSet writes[] = {
+			{.dstSet = m_rayTracingDs.get(), .binding = 0, .arrayElement = 0, .count = 1, .info = &infos[0]},
+			{.dstSet = m_rayTracingDs.get(), .binding = 1, .arrayElement = 0, .count = 1, .info = &infos[1]},
+			{.dstSet = m_presentDs.get(), .binding = 0, .arrayElement = 0, .count = 1, .info = &infos[2] },
+		};
+		m_device->updateDescriptorSets(std::span(writes), {});
+
+		// gui descriptor setup
+		{
+			using binding_flags_t = IGPUDescriptorSetLayout::SBinding::E_CREATE_FLAGS;
+			{
+				IGPUSampler::SParams params;
+				params.AnisotropicFilter = 1u;
+				params.TextureWrapU = ETC_REPEAT;
+				params.TextureWrapV = ETC_REPEAT;
+				params.TextureWrapW = ETC_REPEAT;
+
+				m_ui.samplers.gui = m_device->createSampler(params);
+				m_ui.samplers.gui->setObjectDebugName("Nabla IMGUI UI Sampler");
+			}
+
+			std::array<core::smart_refctd_ptr<IGPUSampler>, 69u> immutableSamplers;
+			for (auto& it : immutableSamplers)
+				it = smart_refctd_ptr(m_ui.samplers.scene);
+
+			immutableSamplers[nbl::ext::imgui::UI::FontAtlasTexId] = smart_refctd_ptr(m_ui.samplers.gui);
+
+			nbl::ext::imgui::UI::SCreationParameters params;
+
+			params.resources.texturesInfo = { .setIx = 0u, .bindingIx = 0u };
+			params.resources.samplersInfo = { .setIx = 0u, .bindingIx = 1u };
+			params.assetManager = m_assetMgr;
+			params.pipelineCache = nullptr;
+			params.pipelineLayout = nbl::ext::imgui::UI::createDefaultPipelineLayout(m_utils->getLogicalDevice(), params.resources.texturesInfo, params.resources.samplersInfo, MaxUITextureCount);
+			params.renderpass = smart_refctd_ptr<IGPURenderpass>(renderpass);
+			params.streamingBuffer = nullptr;
+			params.subpassIx = 0u;
+			params.transfer = getGraphicsQueue();
+			params.utilities = m_utils;
+			{
+				m_ui.manager = ext::imgui::UI::create(std::move(params));
+
+				// note that we use default layout provided by our extension, but you are free to create your own by filling nbl::ext::imgui::UI::S_CREATION_PARAMETERS::resources
+				const auto* descriptorSetLayout = m_ui.manager->getPipeline()->getLayout()->getDescriptorSetLayout(0u);
+				const auto& params = m_ui.manager->getCreationParameters();
+
+				IDescriptorPool::SCreateInfo descriptorPoolInfo = {};
+				descriptorPoolInfo.maxDescriptorCount[static_cast<uint32_t>(asset::IDescriptor::E_TYPE::ET_SAMPLER)] = (uint32_t)nbl::ext::imgui::UI::DefaultSamplerIx::COUNT;
+				descriptorPoolInfo.maxDescriptorCount[static_cast<uint32_t>(asset::IDescriptor::E_TYPE::ET_SAMPLED_IMAGE)] = MaxUITextureCount;
+				descriptorPoolInfo.maxSets = 1u;
+				descriptorPoolInfo.flags = IDescriptorPool::E_CREATE_FLAGS::ECF_UPDATE_AFTER_BIND_BIT;
+
+				m_guiDescriptorSetPool = m_device->createDescriptorPool(std::move(descriptorPoolInfo));
+				assert(m_guiDescriptorSetPool);
+
+				m_guiDescriptorSetPool->createDescriptorSets(1u, &descriptorSetLayout, &m_ui.descriptorSet);
+				assert(m_ui.descriptorSet);
+			}
+		}
+
+		m_ui.manager->registerListener(
+			[this]() -> void {
+				ImGuiIO& io = ImGui::GetIO();
+
+				m_camera.setProjectionMatrix([&]()
+					{
+						static matrix4SIMD projection;
+
+						projection = matrix4SIMD::buildProjectionMatrixPerspectiveFovRH(
+							core::radians(m_cameraSetting.fov),
+							io.DisplaySize.x / io.DisplaySize.y,
+							m_cameraSetting.zNear,
+							m_cameraSetting.zFar);
+
+						return projection;
+					}());
+
+				ImGui::SetNextWindowPos(ImVec2(1024, 100), ImGuiCond_Appearing);
+				ImGui::SetNextWindowSize(ImVec2(256, 256), ImGuiCond_Appearing);
+
+				// create a window and insert the inspector
+				ImGui::SetNextWindowPos(ImVec2(10, 10), ImGuiCond_Appearing);
+				ImGui::SetNextWindowSize(ImVec2(320, 340), ImGuiCond_Appearing);
+				ImGui::Begin("Controls");
+
+				ImGui::SameLine();
+
+				ImGui::Text("Camera");
+
+				ImGui::SliderFloat("Move speed", &m_cameraSetting.moveSpeed, 0.1f, 10.f);
+				ImGui::SliderFloat("Rotate speed", &m_cameraSetting.rotateSpeed, 0.1f, 10.f);
+				ImGui::SliderFloat("Fov", &m_cameraSetting.fov, 20.f, 150.f);
+				ImGui::SliderFloat("zNear", &m_cameraSetting.zNear, 0.1f, 100.f);
+				ImGui::SliderFloat("zFar", &m_cameraSetting.zFar, 110.f, 10000.f);
+				Light m_oldLight = m_light;
+				int light_type = m_light.type;
+				ImGui::ListBox("LightType", &light_type, s_lightTypeNames, ELT_COUNT);
+				m_light.type = static_cast<E_LIGHT_TYPE>(light_type);
+				if (m_light.type == ELT_DIRECTIONAL)
+				{
+					ImGui::SliderFloat3("Light Direction", &m_light.direction.x, -1.f, 1.f);
+				}
+				else if (m_light.type == ELT_POINT)
+				{
+					ImGui::SliderFloat3("Light Position", &m_light.position.x, -20.f, 20.f);
+				}
+				else if (m_light.type == ELT_SPOT)
+				{
+					ImGui::SliderFloat3("Light Direction", &m_light.direction.x, -1.f, 1.f);
+					ImGui::SliderFloat3("Light Position", &m_light.position.x, -20.f, 20.f);
+
+					float32_t dOuterCutoff = hlsl::degrees(acos(m_light.outerCutoff));
+					if (ImGui::SliderFloat("Light Outer Cutoff", &dOuterCutoff, 0.0f, 45.0f))
+					{
+						m_light.outerCutoff = cos(hlsl::radians(dOuterCutoff));
+					}
+				}
+				ImGui::Checkbox("Use Indirect Command", &m_useIndirectCommand);
+				if (m_light != m_oldLight)
+				{
+					m_frameAccumulationCounter = 0;
+				}
+
+				ImGui::Text("X: %f Y: %f", io.MousePos.x, io.MousePos.y);
+
+				ImGui::End();
+			}
+		);
+
+		// Set Camera
+		{
+			core::vectorSIMDf cameraPosition(0, 5, -10);
+			matrix4SIMD proj = matrix4SIMD::buildProjectionMatrixPerspectiveFovRH(
+				core::radians(60.0f),
+				WIN_W / WIN_H,
+				0.01f,
+				500.0f
+			);
+			m_camera = Camera(cameraPosition, core::vectorSIMDf(0, 0, 0), proj);
+		}
+
+		m_winMgr->setWindowSize(m_window.get(), WIN_W, WIN_H);
+		m_surface->recreateSwapchain();
+		m_winMgr->show(m_window.get());
+		m_oracle.reportBeginFrameRecord();
+		m_camera.mapKeysToWASD();
+
+		return true;
+	}
+
+	bool updateGUIDescriptorSet()
+	{
+		// texture atlas, note we don't create info & write pair for the font sampler because UI extension's is immutable and baked into DS layout
+		static std::array<IGPUDescriptorSet::SDescriptorInfo, MaxUITextureCount> descriptorInfo;
+		static IGPUDescriptorSet::SWriteDescriptorSet writes[MaxUITextureCount];
+
+		descriptorInfo[nbl::ext::imgui::UI::FontAtlasTexId].info.image.imageLayout = IImage::LAYOUT::READ_ONLY_OPTIMAL;
+		descriptorInfo[nbl::ext::imgui::UI::FontAtlasTexId].desc = smart_refctd_ptr<IGPUImageView>(m_ui.manager->getFontAtlasView());
+
+		for (uint32_t i = 0; i < descriptorInfo.size(); ++i)
+		{
+			writes[i].dstSet = m_ui.descriptorSet.get();
+			writes[i].binding = 0u;
+			writes[i].arrayElement = i;
+			writes[i].count = 1u;
+		}
+		writes[nbl::ext::imgui::UI::FontAtlasTexId].info = descriptorInfo.data() + nbl::ext::imgui::UI::FontAtlasTexId;
+
+		return m_device->updateDescriptorSets(writes, {});
+	}
+
+	inline void workLoopBody() override
+	{
+		// framesInFlight: ensuring safe execution of command buffers and acquires, `framesInFlight` only affect semaphore waits, don't use this to index your resources because it can change with swapchain recreation.
+		const uint32_t framesInFlight = core::min(MaxFramesInFlight, m_surface->getMaxAcquiresInFlight());
+		// We block for semaphores for 2 reasons here:
+		  // A) Resource: Can't use resource like a command buffer BEFORE previous use is finished! [MaxFramesInFlight]
+		  // B) Acquire: Can't have more acquires in flight than a certain threshold returned by swapchain or your surface helper class. [MaxAcquiresInFlight]
+		if (m_realFrameIx >= framesInFlight)
+		{
+			const ISemaphore::SWaitInfo cbDonePending[] =
+			{
+			  {
+				.semaphore = m_semaphore.get(),
+				.value = m_realFrameIx + 1 - framesInFlight
+			  }
+			};
+			if (m_device->blockForSemaphores(cbDonePending) != ISemaphore::WAIT_RESULT::SUCCESS)
+				return;
+		}
+		const auto resourceIx = m_realFrameIx % MaxFramesInFlight;
+
+		m_api->startCapture();
+
+		update();
+
+		auto queue = getGraphicsQueue();
+		auto cmdbuf = m_cmdBufs[resourceIx].get();
+
+		if (!keepRunning())
+			return;
+
+		cmdbuf->reset(IGPUCommandBuffer::RESET_FLAGS::RELEASE_RESOURCES_BIT);
+		cmdbuf->begin(IGPUCommandBuffer::USAGE::ONE_TIME_SUBMIT_BIT);
+		cmdbuf->beginDebugMarker("RaytracingPipelineApp Frame");
+
+		const auto viewMatrix = m_camera.getViewMatrix();
+		const auto projectionMatrix = m_camera.getProjectionMatrix();
+		const auto viewProjectionMatrix = m_camera.getConcatenatedMatrix();
+
+		core::matrix3x4SIMD modelMatrix;
+		modelMatrix.setTranslation(nbl::core::vectorSIMDf(0, 0, 0, 0));
+		modelMatrix.setRotation(quaternion(0, 0, 0));
+
+		core::matrix4SIMD modelViewProjectionMatrix = core::concatenateBFollowedByA(viewProjectionMatrix, modelMatrix);
+		if (m_cachedModelViewProjectionMatrix != modelViewProjectionMatrix)
+		{
+			m_frameAccumulationCounter = 0;
+			m_cachedModelViewProjectionMatrix = modelViewProjectionMatrix;
+		}
+		core::matrix4SIMD invModelViewProjectionMatrix;
+		modelViewProjectionMatrix.getInverseTransform(invModelViewProjectionMatrix);
+
+		{
+			IGPUCommandBuffer::SPipelineBarrierDependencyInfo::image_barrier_t imageBarriers[1];
+			imageBarriers[0].barrier = {
+			   .dep = {
+				 .srcStageMask = PIPELINE_STAGE_FLAGS::FRAGMENT_SHADER_BIT, // previous frame read from framgent shader
+				 .srcAccessMask = ACCESS_FLAGS::SHADER_READ_BITS,
+				 .dstStageMask = PIPELINE_STAGE_FLAGS::RAY_TRACING_SHADER_BIT,
+				 .dstAccessMask = ACCESS_FLAGS::SHADER_WRITE_BITS
+			  }
+			};
+			imageBarriers[0].image = m_hdrImage.get();
+			imageBarriers[0].subresourceRange = {
+			  .aspectMask = IImage::EAF_COLOR_BIT,
+			  .baseMipLevel = 0u,
+			  .levelCount = 1u,
+			  .baseArrayLayer = 0u,
+			  .layerCount = 1u
+			};
+			imageBarriers[0].oldLayout = m_frameAccumulationCounter == 0 ? IImage::LAYOUT::UNDEFINED : IImage::LAYOUT::READ_ONLY_OPTIMAL;
+			imageBarriers[0].newLayout = IImage::LAYOUT::GENERAL;
+			cmdbuf->pipelineBarrier(E_DEPENDENCY_FLAGS::EDF_NONE, { .imgBarriers = imageBarriers });
+		}
+
+		// Trace Rays Pass
+		{
+			SPushConstants pc;
+			pc.light = m_light;
+			pc.proceduralGeomInfoBuffer = m_proceduralGeomInfoBuffer->getDeviceAddress();
+			pc.triangleGeomInfoBuffer = m_triangleGeomInfoBuffer->getDeviceAddress();
+			pc.frameCounter = m_frameAccumulationCounter;
+			const core::vector3df camPos = m_camera.getPosition().getAsVector3df();
+			pc.camPos = { camPos.X, camPos.Y, camPos.Z };
+			memcpy(&pc.invMVP, invModelViewProjectionMatrix.pointer(), sizeof(pc.invMVP));
+
+			cmdbuf->bindRayTracingPipeline(m_rayTracingPipeline.get());
+			cmdbuf->setRayTracingPipelineStackSize(m_rayTracingStackSize);
+			cmdbuf->pushConstants(m_rayTracingPipeline->getLayout(), IShader::E_SHADER_STAGE::ESS_ALL_RAY_TRACING, 0, sizeof(SPushConstants), &pc);
+			cmdbuf->bindDescriptorSets(EPBP_RAY_TRACING, m_rayTracingPipeline->getLayout(), 0, 1, &m_rayTracingDs.get());
+			if (m_useIndirectCommand)
+			{
+				cmdbuf->traceRaysIndirect(
+					SBufferBinding<const IGPUBuffer>{
+					.offset = 0,
+						.buffer = m_indirectBuffer,
+				});
+			}
+			else
+			{
+				cmdbuf->traceRays(
+					m_shaderBindingTable.raygenGroupRange,
+					m_shaderBindingTable.missGroupsRange, m_shaderBindingTable.missGroupsStride,
+					m_shaderBindingTable.hitGroupsRange, m_shaderBindingTable.hitGroupsStride,
+					m_shaderBindingTable.callableGroupsRange, m_shaderBindingTable.callableGroupsStride,
+					WIN_W, WIN_H, 1);
+			}
+		}
+
+		// pipeline barrier
+		{
+			IGPUCommandBuffer::SPipelineBarrierDependencyInfo::image_barrier_t imageBarriers[1];
+			imageBarriers[0].barrier = {
+			  .dep = {
+				.srcStageMask = PIPELINE_STAGE_FLAGS::RAY_TRACING_SHADER_BIT,
+				.srcAccessMask = ACCESS_FLAGS::SHADER_WRITE_BITS,
+				.dstStageMask = PIPELINE_STAGE_FLAGS::COLOR_ATTACHMENT_OUTPUT_BIT,
+				.dstAccessMask = ACCESS_FLAGS::COLOR_ATTACHMENT_WRITE_BIT
+			  }
+			};
+			imageBarriers[0].image = m_hdrImage.get();
+			imageBarriers[0].subresourceRange = {
+			  .aspectMask = IImage::EAF_COLOR_BIT,
+			  .baseMipLevel = 0u,
+			  .levelCount = 1u,
+			  .baseArrayLayer = 0u,
+			  .layerCount = 1u
+			};
+			imageBarriers[0].oldLayout = IImage::LAYOUT::GENERAL;
+			imageBarriers[0].newLayout = IImage::LAYOUT::READ_ONLY_OPTIMAL;
+
+			cmdbuf->pipelineBarrier(E_DEPENDENCY_FLAGS::EDF_NONE, { .imgBarriers = imageBarriers });
+		}
+
+		{
+			asset::SViewport viewport;
+			{
+				viewport.minDepth = 1.f;
+				viewport.maxDepth = 0.f;
+				viewport.x = 0u;
+				viewport.y = 0u;
+				viewport.width = WIN_W;
+				viewport.height = WIN_H;
+			}
+			cmdbuf->setViewport(0u, 1u, &viewport);
+
+
+			VkRect2D defaultScisors[] = { {.offset = {(int32_t)viewport.x, (int32_t)viewport.y}, .extent = {(uint32_t)viewport.width, (uint32_t)viewport.height}} };
+			cmdbuf->setScissor(defaultScisors);
+
+			auto scRes = static_cast<CDefaultSwapchainFramebuffers*>(m_surface->getSwapchainResources());
+			const VkRect2D currentRenderArea =
+			{
+			  .offset = {0,0},
+			  .extent = {m_window->getWidth(),m_window->getHeight()}
+			};
+			const IGPUCommandBuffer::SClearColorValue clearColor = { .float32 = {0.f,0.f,0.f,1.f} };
+			const IGPUCommandBuffer::SRenderpassBeginInfo info =
+			{
+			  .framebuffer = scRes->getFramebuffer(m_currentImageAcquire.imageIndex),
+			  .colorClearValues = &clearColor,
+			  .depthStencilClearValues = nullptr,
+			  .renderArea = currentRenderArea
+			};
+			nbl::video::ISemaphore::SWaitInfo waitInfo = { .semaphore = m_semaphore.get(), .value = m_realFrameIx + 1u };
+
+			cmdbuf->beginRenderPass(info, IGPUCommandBuffer::SUBPASS_CONTENTS::INLINE);
+
+			cmdbuf->bindGraphicsPipeline(m_presentPipeline.get());
+			cmdbuf->bindDescriptorSets(EPBP_GRAPHICS, m_presentPipeline->getLayout(), 0, 1u, &m_presentDs.get());
+			ext::FullScreenTriangle::recordDrawCall(cmdbuf);
+
+			const auto uiParams = m_ui.manager->getCreationParameters();
+			auto* uiPipeline = m_ui.manager->getPipeline();
+			cmdbuf->bindGraphicsPipeline(uiPipeline);
+			cmdbuf->bindDescriptorSets(EPBP_GRAPHICS, uiPipeline->getLayout(), uiParams.resources.texturesInfo.setIx, 1u, &m_ui.descriptorSet.get());
+			m_ui.manager->render(cmdbuf, waitInfo);
+
+			cmdbuf->endRenderPass();
+
+		}
+
+		cmdbuf->endDebugMarker();
+		cmdbuf->end();
+
+		{
+			const IQueue::SSubmitInfo::SSemaphoreInfo rendered[] =
+			{
+			  {
+				.semaphore = m_semaphore.get(),
+				.value = ++m_realFrameIx,
+				.stageMask = PIPELINE_STAGE_FLAGS::ALL_TRANSFER_BITS
+			  }
+			};
+			{
+				{
+					const IQueue::SSubmitInfo::SCommandBufferInfo commandBuffers[] =
+					{
+					  {.cmdbuf = cmdbuf }
+					};
+
+					const IQueue::SSubmitInfo::SSemaphoreInfo acquired[] =
+					{
+					  {
+						.semaphore = m_currentImageAcquire.semaphore,
+						.value = m_currentImageAcquire.acquireCount,
+						.stageMask = PIPELINE_STAGE_FLAGS::NONE
+					  }
+					};
+					const IQueue::SSubmitInfo infos[] =
+					{
+					  {
+						.waitSemaphores = acquired,
+						.commandBuffers = commandBuffers,
+						.signalSemaphores = rendered
+					  }
+					};
+
+					updateGUIDescriptorSet();
+
+					if (queue->submit(infos) != IQueue::RESULT::SUCCESS)
+						m_realFrameIx--;
+				}
+			}
+
+			m_window->setCaption("[Nabla Engine] Ray Tracing Pipeline");
+			m_surface->present(m_currentImageAcquire.imageIndex, rendered);
+		}
+		m_api->endCapture();
+		m_frameAccumulationCounter++;
+	}
+
+	inline void update()
+	{
+		m_camera.setMoveSpeed(m_cameraSetting.moveSpeed);
+		m_camera.setRotateSpeed(m_cameraSetting.rotateSpeed);
+
+		static std::chrono::microseconds previousEventTimestamp{};
+
+		m_inputSystem->getDefaultMouse(&m_mouse);
+		m_inputSystem->getDefaultKeyboard(&m_keyboard);
+
+		auto updatePresentationTimestamp = [&]()
+			{
+				m_currentImageAcquire = m_surface->acquireNextImage();
+
+				m_oracle.reportEndFrameRecord();
+				const auto timestamp = m_oracle.getNextPresentationTimeStamp();
+				m_oracle.reportBeginFrameRecord();
+
+				return timestamp;
+			};
+
+		const auto nextPresentationTimestamp = updatePresentationTimestamp();
+
+		struct
+		{
+			std::vector<SMouseEvent> mouse{};
+			std::vector<SKeyboardEvent> keyboard{};
+		} capturedEvents;
+
+		m_camera.beginInputProcessing(nextPresentationTimestamp);
+		{
+			const auto& io = ImGui::GetIO();
+			m_mouse.consumeEvents([&](const IMouseEventChannel::range_t& events) -> void
+				{
+					if (!io.WantCaptureMouse)
+						m_camera.mouseProcess(events); // don't capture the events, only let camera handle them with its impl
+
+					for (const auto& e : events) // here capture
+					{
+						if (e.timeStamp < previousEventTimestamp)
+							continue;
+
+						previousEventTimestamp = e.timeStamp;
+						capturedEvents.mouse.emplace_back(e);
+
+					}
+				}, m_logger.get());
+
+			m_keyboard.consumeEvents([&](const IKeyboardEventChannel::range_t& events) -> void
+				{
+					if (!io.WantCaptureKeyboard)
+						m_camera.keyboardProcess(events); // don't capture the events, only let camera handle them with its impl
+
+					for (const auto& e : events) // here capture
+					{
+						if (e.timeStamp < previousEventTimestamp)
+							continue;
+
+						previousEventTimestamp = e.timeStamp;
+						capturedEvents.keyboard.emplace_back(e);
+					}
+				}, m_logger.get());
+
+		}
+		m_camera.endInputProcessing(nextPresentationTimestamp);
+
+		const core::SRange<const nbl::ui::SMouseEvent> mouseEvents(capturedEvents.mouse.data(), capturedEvents.mouse.data() + capturedEvents.mouse.size());
+		const core::SRange<const nbl::ui::SKeyboardEvent> keyboardEvents(capturedEvents.keyboard.data(), capturedEvents.keyboard.data() + capturedEvents.keyboard.size());
+		const auto cursorPosition = m_window->getCursorControl()->getPosition();
+		const auto mousePosition = float32_t2(cursorPosition.x, cursorPosition.y) - float32_t2(m_window->getX(), m_window->getY());
+
+		const ext::imgui::UI::SUpdateParameters params =
+		{
+		  .mousePosition = mousePosition,
+		  .displaySize = { m_window->getWidth(), m_window->getHeight() },
+		  .mouseEvents = mouseEvents,
+		  .keyboardEvents = keyboardEvents
+		};
+
+		m_ui.manager->update(params);
+	}
+
+	inline bool keepRunning() override
+	{
+		if (m_surface->irrecoverable())
+			return false;
+
+		return true;
+	}
+
+	inline bool onAppTerminated() override
+	{
+		return device_base_t::onAppTerminated();
+	}
+
+private:
+	uint32_t getWorkgroupCount(uint32_t dim, uint32_t size)
+	{
+		return (dim + size - 1) / size;
+	}
+
+	bool createIndirectBuffer()
+	{
+		const auto getBufferRangeAddress = [](const SBufferRange<IGPUBuffer>& range)
+			{
+				return range.buffer->getDeviceAddress() + range.offset;
+			};
+		const auto command = TraceRaysIndirectCommand_t{
+		  .raygenShaderRecordAddress = getBufferRangeAddress(m_shaderBindingTable.raygenGroupRange),
+		  .raygenShaderRecordSize = m_shaderBindingTable.raygenGroupRange.size,
+		  .missShaderBindingTableAddress = getBufferRangeAddress(m_shaderBindingTable.missGroupsRange),
+		  .missShaderBindingTableSize = m_shaderBindingTable.missGroupsRange.size,
+		  .missShaderBindingTableStride = m_shaderBindingTable.missGroupsStride,
+		  .hitShaderBindingTableAddress = getBufferRangeAddress(m_shaderBindingTable.hitGroupsRange),
+		  .hitShaderBindingTableSize = m_shaderBindingTable.hitGroupsRange.size,
+		  .hitShaderBindingTableStride = m_shaderBindingTable.hitGroupsStride,
+		  .callableShaderBindingTableAddress = getBufferRangeAddress(m_shaderBindingTable.callableGroupsRange),
+		  .callableShaderBindingTableSize = m_shaderBindingTable.callableGroupsRange.size,
+		  .callableShaderBindingTableStride = m_shaderBindingTable.callableGroupsStride,
+		  .width = WIN_W,
+		  .height = WIN_H,
+		  .depth = 1,
+		};
+		IGPUBuffer::SCreationParams params;
+		params.usage = IGPUBuffer::EUF_TRANSFER_DST_BIT | IGPUBuffer::EUF_INDIRECT_BUFFER_BIT | IGPUBuffer::EUF_SHADER_DEVICE_ADDRESS_BIT;
+		params.size = sizeof(TraceRaysIndirectCommand_t);
+		m_utils->createFilledDeviceLocalBufferOnDedMem(SIntendedSubmitInfo{ .queue = getGraphicsQueue() }, std::move(params), &command).move_into(m_indirectBuffer);
+		return true;
+	}
+
+	void calculateRayTracingStackSize(const smart_refctd_ptr<video::IGPURayTracingPipeline>& pipeline)
+	{
+		const auto raygenStackSize = pipeline->getRaygenStackSize();
+		auto getMaxSize = [&](auto ranges, auto valProj) -> uint16_t
+			{
+				auto maxValue = 0;
+				for (const auto& val : ranges)
+				{
+					maxValue = std::max<uint16_t>(maxValue, std::invoke(valProj, val));
+				}
+				return maxValue;
+			};
+
+		const auto closestHitStackMax = getMaxSize(pipeline->getHitStackSizes(), &IGPURayTracingPipeline::SHitGroupStackSize::closestHit);
+		const auto anyHitStackMax = getMaxSize(pipeline->getHitStackSizes(), &IGPURayTracingPipeline::SHitGroupStackSize::anyHit);
+		const auto intersectionStackMax = getMaxSize(pipeline->getHitStackSizes(), &IGPURayTracingPipeline::SHitGroupStackSize::intersection);
+		const auto missStackMax = getMaxSize(pipeline->getMissStackSizes(), std::identity{});
+		const auto callableStackMax = getMaxSize(pipeline->getCallableStackSizes(), std::identity{});
+		auto firstDepthStackSizeMax = std::max(closestHitStackMax, missStackMax);
+		firstDepthStackSizeMax = std::max<uint16_t>(firstDepthStackSizeMax, intersectionStackMax + anyHitStackMax);
+		m_rayTracingStackSize = raygenStackSize + std::max(firstDepthStackSizeMax, callableStackMax);
+	}
+
+	bool createShaderBindingTable(const smart_refctd_ptr<video::IGPURayTracingPipeline>& pipeline)
+	{
+		const auto& limits = m_device->getPhysicalDevice()->getLimits();
+		const auto handleSize = SPhysicalDeviceLimits::ShaderGroupHandleSize;
+		const auto handleSizeAligned = nbl::core::alignUp(handleSize, limits.shaderGroupHandleAlignment);
+
+		auto& raygenRange = m_shaderBindingTable.raygenGroupRange;
+
+		auto& hitRange = m_shaderBindingTable.hitGroupsRange;
+		const auto hitHandles = pipeline->getHitHandles();
+
+		auto& missRange = m_shaderBindingTable.missGroupsRange;
+		const auto missHandles = pipeline->getMissHandles();
+
+		auto& callableRange = m_shaderBindingTable.callableGroupsRange;
+		const auto callableHandles = pipeline->getCallableHandles();
+
+		raygenRange = {
+		  .offset = 0,
+		  .size = core::alignUp(handleSizeAligned, limits.shaderGroupBaseAlignment)
+		};
+
+		missRange = {
+		  .offset = raygenRange.size,
+		  .size = core::alignUp(missHandles.size() * handleSizeAligned, limits.shaderGroupBaseAlignment),
+		};
+		m_shaderBindingTable.missGroupsStride = handleSizeAligned;
+
+		hitRange = {
+		  .offset = missRange.offset + missRange.size,
+		  .size = core::alignUp(hitHandles.size() * handleSizeAligned, limits.shaderGroupBaseAlignment),
+		};
+		m_shaderBindingTable.hitGroupsStride = handleSizeAligned;
+
+		callableRange = {
+		  .offset = hitRange.offset + hitRange.size,
+		  .size = core::alignUp(callableHandles.size() * handleSizeAligned, limits.shaderGroupBaseAlignment),
+		};
+		m_shaderBindingTable.callableGroupsStride = handleSizeAligned;
+
+		const auto bufferSize = raygenRange.size + missRange.size + hitRange.size + callableRange.size;
+
+		ICPUBuffer::SCreationParams cpuBufferParams;
+		cpuBufferParams.size = bufferSize;
+		auto cpuBuffer = ICPUBuffer::create(std::move(cpuBufferParams));
+		uint8_t* pData = reinterpret_cast<uint8_t*>(cpuBuffer->getPointer());
+
+		// copy raygen region
+		memcpy(pData, &pipeline->getRaygen(), handleSize);
+
+		// copy miss region
+		uint8_t* pMissData = pData + missRange.offset;
+		for (const auto& handle : missHandles)
+		{
+			memcpy(pMissData, &handle, handleSize);
+			pMissData += m_shaderBindingTable.missGroupsStride;
+		}
+
+		// copy hit region
+		uint8_t* pHitData = pData + hitRange.offset;
+		for (const auto& handle : hitHandles)
+		{
+			memcpy(pHitData, &handle, handleSize);
+			pHitData += m_shaderBindingTable.hitGroupsStride;
+		}
+
+		// copy callable region
+		uint8_t* pCallableData = pData + callableRange.offset;
+		for (const auto& handle : callableHandles)
+		{
+			memcpy(pCallableData, &handle, handleSize);
+			pCallableData += m_shaderBindingTable.callableGroupsStride;
+		}
+
+		{
+			IGPUBuffer::SCreationParams params;
+			params.usage = IGPUBuffer::EUF_TRANSFER_DST_BIT | IGPUBuffer::EUF_INLINE_UPDATE_VIA_CMDBUF | IGPUBuffer::EUF_SHADER_DEVICE_ADDRESS_BIT | IGPUBuffer::EUF_SHADER_BINDING_TABLE_BIT;
+			params.size = bufferSize;
+			m_utils->createFilledDeviceLocalBufferOnDedMem(SIntendedSubmitInfo{ .queue = getGraphicsQueue() }, std::move(params), pData).move_into(raygenRange.buffer);
+			missRange.buffer = core::smart_refctd_ptr(raygenRange.buffer);
+			hitRange.buffer = core::smart_refctd_ptr(raygenRange.buffer);
+			callableRange.buffer = core::smart_refctd_ptr(raygenRange.buffer);
+		}
+
+		return true;
+	}
+
+	bool createAccelerationStructuresFromGeometry(const IGeometryCreator* gc)
+	{
+		auto queue = getGraphicsQueue();
+		// get geometries into ICPUBuffers
+		auto pool = m_device->createCommandPool(queue->getFamilyIndex(), IGPUCommandPool::CREATE_FLAGS::RESET_COMMAND_BUFFER_BIT);
+		if (!pool)
+			return logFail("Couldn't create Command Pool for geometry creation!");
+
+		const auto defaultMaterial = Material{
+		  .ambient = {0.2, 0.1, 0.1},
+		  .diffuse = {0.8, 0.3, 0.3},
+		  .specular = {0.8, 0.8, 0.8},
+		  .shininess = 1.0f,
+		  .alpha = 1.0f,
+		};
+
+		auto getTranslationMatrix = [](float32_t x, float32_t y, float32_t z)
+			{
+				core::matrix3x4SIMD transform;
+				transform.setTranslation(nbl::core::vectorSIMDf(x, y, z, 0));
+				return transform;
+			};
+
+		core::matrix3x4SIMD planeTransform;
+		planeTransform.setRotation(quaternion::fromAngleAxis(core::radians(-90.0f), vector3df_SIMD{ 1, 0, 0 }));
+
+		// triangles geometries
+		const auto cpuObjects = std::array{
+			ReferenceObjectCpu {
+				.meta = {.type = OT_RECTANGLE, .name = "Plane Mesh"},
+				.data = gc->createRectangleMesh(nbl::core::vector2df_SIMD(10, 10)),
+				.material = defaultMaterial,
+				.transform = planeTransform,
+			},
+			ReferenceObjectCpu {
+				.meta = {.type = OT_CUBE, .name = "Cube Mesh"},
+				.data = gc->createCubeMesh(nbl::core::vector3df(1, 1, 1)),
+				.material = defaultMaterial,
+				.transform = getTranslationMatrix(0, 0.5f, 0),
+			},
+			ReferenceObjectCpu {
+				.meta = {.type = OT_CUBE, .name = "Cube Mesh 2"},
+				.data = gc->createCubeMesh(nbl::core::vector3df(1.5, 1.5, 1.5)),
+				.material = Material{
+					.ambient = {0.1, 0.1, 0.2},
+					.diffuse = {0.2, 0.2, 0.8},
+					.specular = {0.8, 0.8, 0.8},
+					.shininess = 1.0f,
+				},
+				.transform = getTranslationMatrix(-5.0f, 1.0f, 0),
+			},
+			ReferenceObjectCpu {
+				.meta = {.type = OT_CUBE, .name = "Transparent Cube Mesh"},
+				.data = gc->createCubeMesh(nbl::core::vector3df(1.5, 1.5, 1.5)),
+				.material = Material{
+					.ambient = {0.1, 0.2, 0.1},
+					.diffuse = {0.2, 0.8, 0.2},
+					.specular = {0.8, 0.8, 0.8},
+					.shininess = 1.0f,
+					.alpha = 0.2,
+				},
+				.transform = getTranslationMatrix(5.0f, 1.0f, 0),
+			},
+		};
+
+		struct CPUTriBufferBindings
+		{
+			nbl::asset::SBufferBinding<ICPUBuffer> vertex, index;
+		};
+		std::array<CPUTriBufferBindings, std::size(cpuObjects)> cpuTriBuffers;
+
+		for (uint32_t i = 0; i < cpuObjects.size(); i++)
+		{
+			const auto& cpuObject = cpuObjects[i];
+
+			auto vBuffer = smart_refctd_ptr(cpuObject.data.bindings[0].buffer); // no offset
+			auto vUsage = bitflag(IGPUBuffer::EUF_STORAGE_BUFFER_BIT) | IGPUBuffer::EUF_TRANSFER_DST_BIT | IGPUBuffer::EUF_INLINE_UPDATE_VIA_CMDBUF |
+				IGPUBuffer::EUF_ACCELERATION_STRUCTURE_BUILD_INPUT_READ_ONLY_BIT | IGPUBuffer::EUF_SHADER_DEVICE_ADDRESS_BIT;
+			vBuffer->addUsageFlags(vUsage);
+			vBuffer->setContentHash(vBuffer->computeContentHash());
+
+			auto iBuffer = smart_refctd_ptr(cpuObject.data.indexBuffer.buffer); // no offset
+			auto iUsage = bitflag(IGPUBuffer::EUF_STORAGE_BUFFER_BIT) | IGPUBuffer::EUF_TRANSFER_DST_BIT | IGPUBuffer::EUF_INLINE_UPDATE_VIA_CMDBUF |
+				IGPUBuffer::EUF_ACCELERATION_STRUCTURE_BUILD_INPUT_READ_ONLY_BIT | IGPUBuffer::EUF_SHADER_DEVICE_ADDRESS_BIT;
+
+			if (cpuObject.data.indexType != EIT_UNKNOWN)
+				if (iBuffer)
+				{
+					iBuffer->addUsageFlags(iUsage);
+					iBuffer->setContentHash(iBuffer->computeContentHash());
+				}
+
+			cpuTriBuffers[i] = {
+			  .vertex = {.offset = 0, .buffer = vBuffer},
+			  .index = {.offset = 0, .buffer = iBuffer},
+			};
+
+		}
+
+		// procedural geometries
+		using Aabb = IGPUBottomLevelAccelerationStructure::AABB_t;
+
+		smart_refctd_ptr<ICPUBuffer> cpuProcBuffer;
+		{
+			ICPUBuffer::SCreationParams params;
+			params.size = NumberOfProceduralGeometries * sizeof(Aabb);
+			cpuProcBuffer = ICPUBuffer::create(std::move(params));
+		}
+
+		core::vector<SProceduralGeomInfo> proceduralGeoms;
+		proceduralGeoms.reserve(NumberOfProceduralGeometries);
+		auto proceduralGeometries = reinterpret_cast<Aabb*>(cpuProcBuffer->getPointer());
+		for (int32_t i = 0; i < NumberOfProceduralGeometries; i++)
+		{
+			const auto middle_i = NumberOfProceduralGeometries / 2.0;
+			SProceduralGeomInfo sphere = {
+					.material = hlsl::_static_cast<MaterialPacked>(Material{
+					.ambient = {0.1, 0.05 * i, 0.1},
+					.diffuse = {0.3, 0.2 * i, 0.3},
+					.specular = {0.8, 0.8, 0.8},
+					.shininess = 1.0f,
+				}),
+				.center = float32_t3((i - middle_i) * 4.0, 2, 5.0),
+				.radius = 1,
+			};
+
+			proceduralGeoms.push_back(sphere);
+			const auto sphereMin = sphere.center - sphere.radius;
+			const auto sphereMax = sphere.center + sphere.radius;
+			proceduralGeometries[i] = {
+				vector3d(sphereMin.x, sphereMin.y, sphereMin.z),
+				vector3d(sphereMax.x, sphereMax.y, sphereMax.z)
+			};
+		}
+
+		{
+			IGPUBuffer::SCreationParams params;
+			params.usage = IGPUBuffer::EUF_STORAGE_BUFFER_BIT | IGPUBuffer::EUF_TRANSFER_DST_BIT | IGPUBuffer::EUF_INLINE_UPDATE_VIA_CMDBUF | IGPUBuffer::EUF_SHADER_DEVICE_ADDRESS_BIT;
+			params.size = proceduralGeoms.size() * sizeof(SProceduralGeomInfo);
+			m_utils->createFilledDeviceLocalBufferOnDedMem(SIntendedSubmitInfo{ .queue = queue }, std::move(params), proceduralGeoms.data()).move_into(m_proceduralGeomInfoBuffer);
+		}
+
+		// get ICPUBuffers into ICPUBLAS
+		// TODO use one BLAS and multiple triangles/aabbs in one
+		const auto blasCount = std::size(cpuObjects) + 1;
+		const auto proceduralBlasIdx = std::size(cpuObjects);
+
+		std::array<smart_refctd_ptr<ICPUBottomLevelAccelerationStructure>, std::size(cpuObjects)+1u> cpuBlas;
+		for (uint32_t i = 0; i < blasCount; i++)
+		{
+			auto& blas = cpuBlas[i];
+			blas = make_smart_refctd_ptr<ICPUBottomLevelAccelerationStructure>();
+
+			if (i == proceduralBlasIdx)
+			{
+				auto aabbs = make_refctd_dynamic_array<smart_refctd_dynamic_array<ICPUBottomLevelAccelerationStructure::AABBs<ICPUBuffer>>>(1u);
+				auto primitiveCounts = make_refctd_dynamic_array<smart_refctd_dynamic_array<uint32_t>>(1u);
+
+				auto& aabb = aabbs->front();
+				auto& primCount = primitiveCounts->front();
+				
+				primCount = NumberOfProceduralGeometries;
+				aabb.data = { .offset = 0, .buffer = cpuProcBuffer };
+				aabb.stride = sizeof(IGPUBottomLevelAccelerationStructure::AABB_t);
+				aabb.geometryFlags = IGPUBottomLevelAccelerationStructure::GEOMETRY_FLAGS::OPAQUE_BIT; // only allow opaque for now
+
+				blas->setGeometries(std::move(aabbs), std::move(primitiveCounts));
+			}
+			else
+			{
+				auto triangles = make_refctd_dynamic_array<smart_refctd_dynamic_array<ICPUBottomLevelAccelerationStructure::Triangles<ICPUBuffer>>>(1u);
+				auto primitiveCounts = make_refctd_dynamic_array<smart_refctd_dynamic_array<uint32_t>>(1u);
+
+				auto& tri = triangles->front();
+				auto& primCount = primitiveCounts->front();
+				const auto& geom = cpuObjects[i];
+				const auto& cpuBuf = cpuTriBuffers[i];
+
+				const bool useIndex = geom.data.indexType != EIT_UNKNOWN;
+				const uint32_t vertexStride = geom.data.inputParams.bindings[0].stride;
+				const uint32_t numVertices = cpuBuf.vertex.buffer->getSize() / vertexStride;
+
+				if (useIndex)
+					primCount = geom.data.indexCount / 3;
+				else
+					primCount = numVertices / 3;
+
+				tri.vertexData[0] = cpuBuf.vertex;
+				tri.indexData = useIndex ? cpuBuf.index : cpuBuf.vertex;
+				tri.maxVertex = numVertices - 1;
+				tri.vertexStride = vertexStride;
+				tri.vertexFormat = EF_R32G32B32_SFLOAT;
+				tri.indexType = geom.data.indexType;
+				tri.geometryFlags = geom.material.isTransparent() ?
+					IGPUBottomLevelAccelerationStructure::GEOMETRY_FLAGS::NO_DUPLICATE_ANY_HIT_INVOCATION_BIT :
+					IGPUBottomLevelAccelerationStructure::GEOMETRY_FLAGS::OPAQUE_BIT;
+
+				blas->setGeometries(std::move(triangles), std::move(primitiveCounts));
+			}
+
+			auto blasFlags = bitflag(IGPUBottomLevelAccelerationStructure::BUILD_FLAGS::PREFER_FAST_TRACE_BIT) | IGPUBottomLevelAccelerationStructure::BUILD_FLAGS::ALLOW_COMPACTION_BIT;
+			if (i == proceduralBlasIdx)
+				blasFlags |= IGPUBottomLevelAccelerationStructure::BUILD_FLAGS::GEOMETRY_TYPE_IS_AABB_BIT;
+
+			blas->setBuildFlags(blasFlags);
+			blas->setContentHash(blas->computeContentHash());
+		}
+
+		auto geomInfoBuffer = ICPUBuffer::create({ std::size(cpuObjects) * sizeof(STriangleGeomInfo) });
+		STriangleGeomInfo* geomInfos = reinterpret_cast<STriangleGeomInfo*>(geomInfoBuffer->getPointer());
+
+		// get ICPUBLAS into ICPUTLAS
+		auto geomInstances = make_refctd_dynamic_array<smart_refctd_dynamic_array<ICPUTopLevelAccelerationStructure::PolymorphicInstance>>(blasCount);
+		{
+			uint32_t i = 0;
+			for (auto instance = geomInstances->begin(); instance != geomInstances->end(); instance++, i++)
+			{
+				const auto isProceduralInstance = i == proceduralBlasIdx;
+				ICPUTopLevelAccelerationStructure::StaticInstance inst;
+				inst.base.blas = cpuBlas[i];
+				inst.base.flags = static_cast<uint32_t>(IGPUTopLevelAccelerationStructure::INSTANCE_FLAGS::TRIANGLE_FACING_CULL_DISABLE_BIT);
+				inst.base.instanceCustomIndex = i;
+				inst.base.instanceShaderBindingTableRecordOffset = isProceduralInstance ? 2 : 0;;
+				inst.base.mask = 0xFF;
+				inst.transform = isProceduralInstance ? matrix3x4SIMD() : cpuObjects[i].transform;
+
+				instance->instance = inst;
+			}
+		}
+
+		auto cpuTlas = make_smart_refctd_ptr<ICPUTopLevelAccelerationStructure>();
+		cpuTlas->setInstances(std::move(geomInstances));
+		cpuTlas->setBuildFlags(IGPUTopLevelAccelerationStructure::BUILD_FLAGS::PREFER_FAST_TRACE_BIT);
+
+		// convert with asset converter
+		smart_refctd_ptr<CAssetConverter> converter = CAssetConverter::create({ .device = m_device.get(), .optimizer = {} });
+		struct MyInputs : CAssetConverter::SInputs
+		{
+			// For the GPU Buffers to be directly writeable and so that we don't need a Transfer Queue submit at all
+			inline uint32_t constrainMemoryTypeBits(const size_t groupCopyID, const IAsset* canonicalAsset, const blake3_hash_t& contentHash, const IDeviceMemoryBacked* memoryBacked) const override
+			{
+				assert(memoryBacked);
+				return memoryBacked->getObjectType() != IDeviceMemoryBacked::EOT_BUFFER ? (~0u) : rebarMemoryTypes;
+			}
+
+			uint32_t rebarMemoryTypes;
+		} inputs = {};
+		inputs.logger = m_logger.get();
+		inputs.rebarMemoryTypes = m_physicalDevice->getDirectVRAMAccessMemoryTypeBits();
+		// the allocator needs to be overriden to hand out memory ranges which have already been mapped so that the ReBAR fast-path can kick in
+		// (multiple buffers can be bound to same memory, but memory can only be mapped once at one place, so Asset Converter can't do it)
+		struct MyAllocator final : public IDeviceMemoryAllocator
+		{
+			ILogicalDevice* getDeviceForAllocations() const override { return device; }
+
+			SAllocation allocate(const SAllocateInfo& info) override
+			{
+				auto retval = device->allocate(info);
+				// map what is mappable by default so ReBAR checks succeed
+				if (retval.isValid() && retval.memory->isMappable())
+					retval.memory->map({ .offset = 0,.length = info.size });
+				return retval;
+			}
+
+			ILogicalDevice* device;
+		} myalloc;
+		myalloc.device = m_device.get();
+		inputs.allocator = &myalloc;
+
+		std::array<ICPUTopLevelAccelerationStructure*, 1u> tmpTlas;
+		std::array<ICPUBuffer*, 2 * std::size(cpuObjects) + 1u> tmpBuffers;
+		{
+			tmpTlas[0] = cpuTlas.get();
+			for (uint32_t i = 0; i < cpuObjects.size(); i++)
+			{
+				tmpBuffers[2 * i + 0] = cpuTriBuffers[i].vertex.buffer.get();
+				tmpBuffers[2 * i + 1] = cpuTriBuffers[i].index.buffer.get();
+			}
+			tmpBuffers[2 * proceduralBlasIdx] = cpuProcBuffer.get();
+
+			std::get<CAssetConverter::SInputs::asset_span_t<ICPUTopLevelAccelerationStructure>>(inputs.assets) = tmpTlas;
+			std::get<CAssetConverter::SInputs::asset_span_t<ICPUBuffer>>(inputs.assets) = tmpBuffers;
+		}
+
+		auto reservation = converter->reserve(inputs);
+		{
+			auto prepass = [&]<typename asset_type_t>(const auto & references) -> bool
+			{
+				auto objects = reservation.getGPUObjects<asset_type_t>();
+				uint32_t counter = {};
+				for (auto& object : objects)
+				{
+					auto gpu = object.value;
+					auto* reference = references[counter];
+
+					if (reference)
+					{
+						if (!gpu)
+						{
+							m_logger->log("Failed to convert a CPU object to GPU!", ILogger::ELL_ERROR);
+							return false;
+						}
+					}
+					counter++;
+				}
+				return true;
+			};
+
+			prepass.template operator() < ICPUTopLevelAccelerationStructure > (tmpTlas);
+			prepass.template operator() < ICPUBuffer > (tmpBuffers);
+		}
+
+		constexpr auto CompBufferCount = 2;
+		std::array<smart_refctd_ptr<IGPUCommandBuffer>, CompBufferCount> compBufs = {};
+		std::array<IQueue::SSubmitInfo::SCommandBufferInfo, CompBufferCount> compBufInfos = {};
+		{
+			auto pool = m_device->createCommandPool(queue->getFamilyIndex(), IGPUCommandPool::CREATE_FLAGS::RESET_COMMAND_BUFFER_BIT | IGPUCommandPool::CREATE_FLAGS::TRANSIENT_BIT);
+			pool->createCommandBuffers(IGPUCommandPool::BUFFER_LEVEL::PRIMARY, compBufs);
+			compBufs.front()->begin(IGPUCommandBuffer::USAGE::ONE_TIME_SUBMIT_BIT);
+			for (auto i = 0; i < CompBufferCount; i++)
+				compBufInfos[i].cmdbuf = compBufs[i].get();
+		}
+		auto compSema = m_device->createSemaphore(0u);
+		SIntendedSubmitInfo compute = {};
+		compute.queue = queue;
+		compute.scratchCommandBuffers = compBufInfos;
+		compute.scratchSemaphore = {
+			.semaphore = compSema.get(),
+			.value = 0u,
+			.stageMask = PIPELINE_STAGE_FLAGS::ACCELERATION_STRUCTURE_BUILD_BIT | PIPELINE_STAGE_FLAGS::ACCELERATION_STRUCTURE_COPY_BIT
+		};
+		// convert
+		{
+			smart_refctd_ptr<CAssetConverter::SConvertParams::scratch_for_device_AS_build_t> scratchAlloc;
+			{
+				constexpr auto MaxAlignment = 256;
+				constexpr auto MinAllocationSize = 1024;
+				const auto scratchSize = core::alignUp(reservation.getMaxASBuildScratchSize(false), MaxAlignment);
+
+
+				IGPUBuffer::SCreationParams creationParams = {};
+				creationParams.size = scratchSize;
+				creationParams.usage = IGPUBuffer::EUF_ACCELERATION_STRUCTURE_BUILD_INPUT_READ_ONLY_BIT | IGPUBuffer::EUF_SHADER_DEVICE_ADDRESS_BIT | IGPUBuffer::EUF_STORAGE_BUFFER_BIT;
+				auto scratchBuffer = m_device->createBuffer(std::move(creationParams));
+
+				auto reqs = scratchBuffer->getMemoryReqs();
+				reqs.memoryTypeBits &= m_physicalDevice->getDirectVRAMAccessMemoryTypeBits();
+
+				auto allocation = m_device->allocate(reqs, scratchBuffer.get(), IDeviceMemoryAllocation::EMAF_DEVICE_ADDRESS_BIT);
+				allocation.memory->map({ .offset = 0,.length = reqs.size });
+
+				scratchAlloc = make_smart_refctd_ptr<CAssetConverter::SConvertParams::scratch_for_device_AS_build_t>(
+					SBufferRange<video::IGPUBuffer>{0ull, scratchSize, std::move(scratchBuffer)},
+					core::allocator<uint8_t>(), MaxAlignment, MinAllocationSize
+				);
+			}
+
+			struct MyParams final : CAssetConverter::SConvertParams
+			{
+				inline uint32_t getFinalOwnerQueueFamily(const IGPUBuffer* buffer, const core::blake3_hash_t& createdFrom) override
+				{
+					return finalUser;
+				}
+				inline uint32_t getFinalOwnerQueueFamily(const IGPUAccelerationStructure* image, const core::blake3_hash_t& createdFrom) override
+				{
+					return finalUser;
+				}
+
+				uint8_t finalUser;
+			} params = {};
+			params.utilities = m_utils.get();
+			params.compute = &compute;
+			params.scratchForDeviceASBuild = scratchAlloc.get();
+			params.finalUser = queue->getFamilyIndex();
+
+			auto future = reservation.convert(params);
+			if (future.copy() != IQueue::RESULT::SUCCESS)
+			{
+				m_logger->log("Failed to await submission feature!", ILogger::ELL_ERROR);
+				return false;
+			}
+			// 2 submits, BLAS build, TLAS build, DO NOT ADD COMPACTIONS IN THIS EXAMPLE!
+			if (compute.getFutureScratchSemaphore().value>3)
+				m_logger->log("Overflow submitted on Compute Queue despite using ReBAR (no transfer submits or usage of staging buffer) and providing a AS Build Scratch Buffer of correctly queried max size!",system::ILogger::ELL_ERROR);
+
+			// assign gpu objects to output
+			auto&& tlases = reservation.getGPUObjects<ICPUTopLevelAccelerationStructure>();
+			m_gpuTlas = tlases[0].value;
+			auto&& buffers = reservation.getGPUObjects<ICPUBuffer>();
+			for (uint32_t i = 0; i < cpuObjects.size(); i++)
+			{
+				auto& cpuObject = cpuObjects[i];
+
+				m_gpuTriangleGeometries.push_back(ReferenceObjectGpu{
+				  .meta = cpuObject.meta,
+				  .bindings = {
+					.vertex = {.offset = 0, .buffer = buffers[2 * i + 0].value },
+					.index = {.offset = 0, .buffer = buffers[2 * i + 1].value },
+				  },
+				  .vertexStride = cpuObject.data.inputParams.bindings[0].stride,
+				  .indexType = cpuObject.data.indexType,
+				  .indexCount = cpuObject.data.indexCount,
+				  .material = hlsl::_static_cast<MaterialPacked>(cpuObject.material),
+				  .transform = cpuObject.transform,
+					});
+			}
+			m_proceduralAabbBuffer = buffers[2 * proceduralBlasIdx].value;
+
+			for (uint32_t i = 0; i < m_gpuTriangleGeometries.size(); i++)
+			{
+				const auto& gpuObject = m_gpuTriangleGeometries[i];
+				const uint64_t vertexBufferAddress = gpuObject.bindings.vertex.buffer->getDeviceAddress();
+				geomInfos[i] = {
+				  .material = gpuObject.material,
+				  .vertexBufferAddress = vertexBufferAddress,
+				  .indexBufferAddress = gpuObject.useIndex() ? gpuObject.bindings.index.buffer->getDeviceAddress() : vertexBufferAddress,
+				  .vertexStride = gpuObject.vertexStride,
+				  .objType = gpuObject.meta.type,
+				  .indexType = gpuObject.indexType,
+				  .smoothNormals = s_smoothNormals[gpuObject.meta.type],
+				};
+			}
+		}
+
+		{
+			IGPUBuffer::SCreationParams params;
+			params.usage = IGPUBuffer::EUF_STORAGE_BUFFER_BIT | IGPUBuffer::EUF_TRANSFER_DST_BIT | IGPUBuffer::EUF_INLINE_UPDATE_VIA_CMDBUF | IGPUBuffer::EUF_SHADER_DEVICE_ADDRESS_BIT;
+			params.size = geomInfoBuffer->getSize();
+			m_utils->createFilledDeviceLocalBufferOnDedMem(SIntendedSubmitInfo{ .queue = queue }, std::move(params), geomInfos).move_into(m_triangleGeomInfoBuffer);
+		}
+
+		return true;
+	}
+
+
+
+	smart_refctd_ptr<IWindow> m_window;
+	smart_refctd_ptr<CSimpleResizeSurface<ISimpleManagedSurface::ISwapchainResources>> m_surface;
+	smart_refctd_ptr<ISemaphore> m_semaphore;
+	uint64_t m_realFrameIx = 0;
+	uint32_t m_frameAccumulationCounter = 0;
+	std::array<smart_refctd_ptr<IGPUCommandBuffer>, MaxFramesInFlight> m_cmdBufs;
+	ISimpleManagedSurface::SAcquireResult m_currentImageAcquire = {};
+
+	core::smart_refctd_ptr<InputSystem> m_inputSystem;
+	InputSystem::ChannelReader<IMouseEventChannel> m_mouse;
+	InputSystem::ChannelReader<IKeyboardEventChannel> m_keyboard;
+
+	struct CameraSetting
+	{
+		float fov = 60.f;
+		float zNear = 0.1f;
+		float zFar = 10000.f;
+		float moveSpeed = 1.f;
+		float rotateSpeed = 1.f;
+		float viewWidth = 10.f;
+		float camYAngle = 165.f / 180.f * 3.14159f;
+		float camXAngle = 32.f / 180.f * 3.14159f;
+
+	} m_cameraSetting;
+	Camera m_camera = Camera(core::vectorSIMDf(0, 0, 0), core::vectorSIMDf(0, 0, 0), core::matrix4SIMD());
+
+	Light m_light = {
+	  .direction = {-1.0f, -1.0f, -0.4f},
+	  .position = {10.0f, 15.0f, 8.0f},
+	  .outerCutoff = 0.866025404f, // {cos(radians(30.0f))}, 
+	  .type = ELT_DIRECTIONAL
+	};
+
+	video::CDumbPresentationOracle m_oracle;
+
+	struct C_UI
+	{
+		nbl::core::smart_refctd_ptr<nbl::ext::imgui::UI> manager;
+
+		struct
+		{
+			core::smart_refctd_ptr<video::IGPUSampler> gui, scene;
+		} samplers;
+
+		core::smart_refctd_ptr<IGPUDescriptorSet> descriptorSet;
+	} m_ui;
+	core::smart_refctd_ptr<IDescriptorPool> m_guiDescriptorSetPool;
+
+	core::vector<ReferenceObjectGpu> m_gpuTriangleGeometries;
+	core::vector<SProceduralGeomInfo> m_gpuIntersectionSpheres;
+	uint32_t m_intersectionHitGroupIdx;
+
+	smart_refctd_ptr<IGPUTopLevelAccelerationStructure> m_gpuTlas;
+	smart_refctd_ptr<IGPUBuffer> m_instanceBuffer;
+
+	smart_refctd_ptr<IGPUBuffer> m_triangleGeomInfoBuffer;
+	smart_refctd_ptr<IGPUBuffer> m_proceduralGeomInfoBuffer;
+	smart_refctd_ptr<IGPUBuffer> m_proceduralAabbBuffer;
+	smart_refctd_ptr<IGPUBuffer> m_indirectBuffer;
+
+	smart_refctd_ptr<IGPUImage> m_hdrImage;
+	smart_refctd_ptr<IGPUImageView> m_hdrImageView;
+
+	smart_refctd_ptr<IDescriptorPool> m_rayTracingDsPool;
+	smart_refctd_ptr<IGPUDescriptorSet> m_rayTracingDs;
+	smart_refctd_ptr<IGPURayTracingPipeline> m_rayTracingPipeline;
+	uint64_t m_rayTracingStackSize;
+	ShaderBindingTable m_shaderBindingTable;
+
+	smart_refctd_ptr<IGPUDescriptorSet> m_presentDs;
+	smart_refctd_ptr<IDescriptorPool> m_presentDsPool;
+	smart_refctd_ptr<IGPUGraphicsPipeline> m_presentPipeline;
+
+	smart_refctd_ptr<CAssetConverter> m_converter;
+
+
+	core::matrix4SIMD m_cachedModelViewProjectionMatrix;
+	bool m_useIndirectCommand = false;
+
+};
+NBL_MAIN_FUNC(RaytracingPipelineApp)
diff --git a/27_PLYSTLDemo/pipeline.groovy b/29_MeshLoaders/pipeline.groovy
similarity index 100%
rename from 27_PLYSTLDemo/pipeline.groovy
rename to 29_MeshLoaders/pipeline.groovy
diff --git a/29_SpecializationConstants/CMakeLists.txt b/29_SpecializationConstants/CMakeLists.txt
deleted file mode 100644
index a476b6203..000000000
--- a/29_SpecializationConstants/CMakeLists.txt
+++ /dev/null
@@ -1,7 +0,0 @@
-
-include(common RESULT_VARIABLE RES)
-if(NOT RES)
-	message(FATAL_ERROR "common.cmake not found. Should be in {repo_root}/cmake directory")
-endif()
-
-nbl_create_executable_project("" "" "" "" "${NBL_EXECUTABLE_PROJECT_CREATION_PCH_TARGET}")
\ No newline at end of file
diff --git a/29_SpecializationConstants/main.cpp b/29_SpecializationConstants/main.cpp
deleted file mode 100644
index 11b73a330..000000000
--- a/29_SpecializationConstants/main.cpp
+++ /dev/null
@@ -1,566 +0,0 @@
-// Copyright (C) 2018-2020 - DevSH Graphics Programming Sp. z O.O.
-// This file is part of the "Nabla Engine".
-// For conditions of distribution and use, see copyright notice in nabla.h
-
-#define _NBL_STATIC_LIB_
-#include <nabla.h>
-
-#include "../common/CommonAPI.h"
-using namespace nbl;
-using namespace core;
-using namespace ui;
-
-struct UBOCompute
-{
-	//xyz - gravity point, w - dt
-	core::vectorSIMDf gravPointAndDt;
-};
-
-class SpecializationConstantsSampleApp : public ApplicationBase
-{
-	constexpr static uint32_t WIN_W = 1280u;
-	constexpr static uint32_t WIN_H = 720u;
-	constexpr static uint32_t SC_IMG_COUNT = 3u;
-	constexpr static uint32_t FRAMES_IN_FLIGHT = 5u;
-	static constexpr uint64_t MAX_TIMEOUT = 99999999999999ull;
-	static_assert(FRAMES_IN_FLIGHT > SC_IMG_COUNT);
-
-	core::smart_refctd_ptr<nbl::ui::IWindow> window;
-	core::smart_refctd_ptr<nbl::system::ISystem> system;
-	core::smart_refctd_ptr<CommonAPI::CommonAPIEventCallback> windowCb;
-	core::smart_refctd_ptr<nbl::video::IAPIConnection> api;
-	core::smart_refctd_ptr<nbl::video::ISurface> surface;
-	core::smart_refctd_ptr<nbl::video::IUtilities> utils;
-	core::smart_refctd_ptr<nbl::video::ILogicalDevice> device;
-	video::IPhysicalDevice* gpu;
-	std::array<video::IGPUQueue*, CommonAPI::InitOutput::MaxQueuesCount> queues;
-	core::smart_refctd_ptr<nbl::video::ISwapchain> swapchain;
-	core::smart_refctd_ptr<nbl::video::IGPURenderpass> renderpass;
-	nbl::core::smart_refctd_dynamic_array<nbl::core::smart_refctd_ptr<nbl::video::IGPUFramebuffer>> fbo;
-	std::array<std::array<nbl::core::smart_refctd_ptr<nbl::video::IGPUCommandPool>, CommonAPI::InitOutput::MaxFramesInFlight>, CommonAPI::InitOutput::MaxQueuesCount> commandPools;
-	core::smart_refctd_ptr<nbl::system::ISystem> filesystem;
-	core::smart_refctd_ptr<nbl::asset::IAssetManager> assetManager;
-	video::IGPUObjectFromAssetConverter::SParams cpu2gpuParams;
-	core::smart_refctd_ptr<nbl::system::ILogger> logger;
-	core::smart_refctd_ptr<CommonAPI::InputSystem> inputSystem;
-	video::IGPUObjectFromAssetConverter cpu2gpu;
-
-	constexpr static uint32_t COMPUTE_SET = 0u;
-	constexpr static uint32_t PARTICLE_BUF_BINDING = 0u;
-	constexpr static uint32_t COMPUTE_DATA_UBO_BINDING = 1u;
-	constexpr static uint32_t WORKGROUP_SIZE = 256u;
-	constexpr static uint32_t PARTICLE_COUNT = 1u << 21;
-	constexpr static uint32_t PARTICLE_COUNT_PER_AXIS = 1u << 7;
-	constexpr static uint32_t POS_BUF_IX = 0u;
-	constexpr static uint32_t VEL_BUF_IX = 1u;
-	constexpr static uint32_t BUF_COUNT = 2u;
-	constexpr static uint32_t GRAPHICS_SET = 0u;
-	constexpr static uint32_t GRAPHICS_DATA_UBO_BINDING = 0u;
-
-	std::chrono::high_resolution_clock::time_point m_lastTime;
-	int32_t m_resourceIx = -1;
-	core::smart_refctd_ptr<video::IGPUCommandBuffer> m_cmdbuf[FRAMES_IN_FLIGHT];
-	core::smart_refctd_ptr<video::IGPUFence> m_frameComplete[FRAMES_IN_FLIGHT] = { nullptr };
-	core::smart_refctd_ptr<video::IGPUSemaphore> m_imageAcquire[FRAMES_IN_FLIGHT] = { nullptr };
-	core::smart_refctd_ptr<video::IGPUSemaphore> m_renderFinished[FRAMES_IN_FLIGHT] = { nullptr };
-	core::vectorSIMDf m_cameraPosition;
-	core::vectorSIMDf m_camFront;
-	UBOCompute m_uboComputeData;
-	asset::SBufferRange<video::IGPUBuffer> m_computeUBORange;
-	asset::SBufferRange<video::IGPUBuffer> m_graphicsUBORange;
-	core::smart_refctd_ptr<video::IGPUComputePipeline> m_gpuComputePipeline;
-	core::smart_refctd_ptr<video::IGPUGraphicsPipeline> m_graphicsPipeline;
-	core::smart_refctd_ptr<video::IGPUDescriptorSet> m_gpuds0Compute;
-	core::smart_refctd_ptr<video::IGPUDescriptorSet> m_gpuds0Graphics;
-	asset::SBasicViewParameters m_viewParams;
-	core::matrix4SIMD m_viewProj;
-	core::smart_refctd_ptr<video::IGPUBuffer> m_gpuParticleBuf;
-	core::smart_refctd_ptr<video::IGPURenderpassIndependentPipeline> m_rpIndependentPipeline;
-	nbl::video::ISwapchain::SCreationParams m_swapchainCreationParams;
-
-public:
-
-	void setWindow(core::smart_refctd_ptr<nbl::ui::IWindow>&& wnd) override
-	{
-		window = std::move(wnd);
-	}
-	void setSystem(core::smart_refctd_ptr<nbl::system::ISystem>&& s) override
-	{
-		system = std::move(s);
-	}
-	nbl::ui::IWindow* getWindow() override
-	{
-		return window.get();
-	}
-	video::IAPIConnection* getAPIConnection() override
-	{
-		return api.get();
-	}
-	video::ILogicalDevice* getLogicalDevice()  override
-	{
-		return device.get();
-	}
-	video::IGPURenderpass* getRenderpass() override
-	{
-		return renderpass.get();
-	}
-	void setSurface(core::smart_refctd_ptr<video::ISurface>&& s) override
-	{
-		surface = std::move(s);
-	}
-	void setFBOs(std::vector<core::smart_refctd_ptr<video::IGPUFramebuffer>>& f) override
-	{
-		for (int i = 0; i < f.size(); i++)
-		{
-			fbo->begin()[i] = core::smart_refctd_ptr(f[i]);
-		}
-	}
-	void setSwapchain(core::smart_refctd_ptr<video::ISwapchain>&& s) override
-	{
-		swapchain = std::move(s);
-	}
-	uint32_t getSwapchainImageCount() override
-	{
-		return swapchain->getImageCount();
-	}
-	virtual nbl::asset::E_FORMAT getDepthFormat() override
-	{
-		return nbl::asset::EF_UNKNOWN;
-	}
-
-	APP_CONSTRUCTOR(SpecializationConstantsSampleApp);
-
-	void onAppInitialized_impl() override
-	{
-		const auto swapchainImageUsage = static_cast<asset::IImage::E_USAGE_FLAGS>(asset::IImage::EUF_COLOR_ATTACHMENT_BIT | asset::IImage::EUF_STORAGE_BIT);
-		const asset::E_FORMAT depthFormat = asset::EF_UNKNOWN;
-		CommonAPI::InitParams initParams;
-		initParams.window = core::smart_refctd_ptr(window);
-		initParams.apiType = video::EAT_VULKAN;
-		initParams.appName = { _NBL_APP_NAME_ };
-		initParams.framesInFlight = FRAMES_IN_FLIGHT;
-		initParams.windowWidth = WIN_W;
-		initParams.windowHeight = WIN_H;
-		initParams.swapchainImageCount = SC_IMG_COUNT;
-		initParams.swapchainImageUsage = swapchainImageUsage;
-		initParams.depthFormat = depthFormat;
-		initParams.physicalDeviceFilter.minimumLimits.workgroupSizeFromSpecConstant = true;
-		auto initOutp = CommonAPI::InitWithDefaultExt(std::move(initParams));
-
-		window = std::move(initParams.window);
-		system = std::move(initOutp.system);
-		windowCb = std::move(initParams.windowCb);
-		api = std::move(initOutp.apiConnection);
-		surface = std::move(initOutp.surface);
-		device = std::move(initOutp.logicalDevice);
-		gpu = std::move(initOutp.physicalDevice);
-		queues = std::move(initOutp.queues);
-		renderpass = std::move(initOutp.renderToSwapchainRenderpass);
-		commandPools = std::move(initOutp.commandPools);
-		assetManager = std::move(initOutp.assetManager);
-		filesystem = std::move(initOutp.system);
-		cpu2gpuParams = std::move(initOutp.cpu2gpuParams);
-		utils = std::move(initOutp.utilities);
-		m_swapchainCreationParams = std::move(initOutp.swapchainCreationParams);
-
-		CommonAPI::createSwapchain(std::move(device), m_swapchainCreationParams, WIN_W, WIN_H, swapchain);
-		assert(swapchain);
-		fbo = CommonAPI::createFBOWithSwapchainImages(
-			swapchain->getImageCount(), WIN_W, WIN_H,
-			device, swapchain, renderpass,
-			depthFormat
-		);
-
-		video::IGPUObjectFromAssetConverter CPU2GPU;
-		m_cameraPosition = core::vectorSIMDf(0, 0, -10);
-		matrix4SIMD proj = matrix4SIMD::buildProjectionMatrixPerspectiveFovRH(core::radians(90.0f), video::ISurface::getTransformedAspectRatio(swapchain->getPreTransform(), WIN_W, WIN_H), 0.01, 100);
-		matrix3x4SIMD view = matrix3x4SIMD::buildCameraLookAtMatrixRH(m_cameraPosition, core::vectorSIMDf(0, 0, 0), core::vectorSIMDf(0, 1, 0));
-		m_viewProj = matrix4SIMD::concatenateBFollowedByAPrecisely(
-			video::ISurface::getSurfaceTransformationMatrix(swapchain->getPreTransform()),
-			matrix4SIMD::concatenateBFollowedByA(proj, matrix4SIMD(view))
-		);
-		m_camFront = view[2];
-
-		// auto glslExts = device->getSupportedGLSLExtensions();
-		asset::CSPIRVIntrospector introspector;
-
-		const char* pathToCompShader = "../particles.comp";
-		auto compilerSet = assetManager->getCompilerSet();
-		core::smart_refctd_ptr<asset::ICPUShader> computeUnspec = nullptr;
-		core::smart_refctd_ptr<asset::ICPUShader> computeUnspecSPIRV = nullptr;
-		{
-			auto csBundle = assetManager->getAsset(pathToCompShader, {});
-			auto csContents = csBundle.getContents();
-			if (csContents.empty())
-				assert(false);
-
-			asset::ICPUSpecializedShader* csSpec = static_cast<nbl::asset::ICPUSpecializedShader*>(csContents.begin()->get());
-			computeUnspec = core::smart_refctd_ptr<asset::ICPUShader>(csSpec->getUnspecialized());
-
-			auto compiler = compilerSet->getShaderCompiler(computeUnspec->getContentType());
-
-			asset::IShaderCompiler::SPreprocessorOptions preprocessOptions = {};
-			preprocessOptions.sourceIdentifier = pathToCompShader;
-			preprocessOptions.includeFinder = compiler->getDefaultIncludeFinder();
-			computeUnspec = compilerSet->preprocessShader(computeUnspec.get(), preprocessOptions);
-		}
-
-		core::smart_refctd_ptr<const asset::CSPIRVIntrospector::CIntrospectionData> introspection = nullptr;
-		{
-			//! This example first preprocesses and then compiles the shader, although it could've been done by calling compileToSPIRV with setting compilerOptions.preprocessorOptions 
-			asset::IShaderCompiler::SCompilerOptions compilerOptions = {};
-			// compilerOptions.entryPoint = "main";
-			compilerOptions.stage = computeUnspec->getStage();
-			compilerOptions.debugInfoFlags = asset::IShaderCompiler::E_DEBUG_INFO_FLAGS::EDIF_SOURCE_BIT; // should be DIF_SOURCE_BIT for introspection
-			compilerOptions.preprocessorOptions.sourceIdentifier = computeUnspec->getFilepathHint(); // already preprocessed but for logging it's best to fill sourceIdentifier
-			computeUnspecSPIRV = compilerSet->compileToSPIRV(computeUnspec.get(), compilerOptions);
-
-			asset::CSPIRVIntrospector::SIntrospectionParams params = { "main", computeUnspecSPIRV };
-			introspection = introspector.introspect(params);
-		}
-
-		asset::ISpecializedShader::SInfo specInfo;
-		{
-			struct SpecConstants
-			{
-				int32_t wg_size;
-				int32_t particle_count;
-				int32_t pos_buf_ix;
-				int32_t vel_buf_ix;
-				int32_t buf_count;
-			};
-			SpecConstants swapchain{ WORKGROUP_SIZE, PARTICLE_COUNT, POS_BUF_IX, VEL_BUF_IX, BUF_COUNT };
-
-			auto it_particleBufDescIntro = std::find_if(introspection->descriptorSetBindings[COMPUTE_SET].begin(), introspection->descriptorSetBindings[COMPUTE_SET].end(),
-				[=](auto b) { return b.binding == PARTICLE_BUF_BINDING; }
-			);
-			assert(it_particleBufDescIntro->descCountIsSpecConstant);
-			const uint32_t buf_count_specID = it_particleBufDescIntro->count_specID;
-			auto& particleDataArrayIntro = it_particleBufDescIntro->get<asset::ESRT_STORAGE_BUFFER>().members.array[0];
-			assert(particleDataArrayIntro.countIsSpecConstant);
-			const uint32_t particle_count_specID = particleDataArrayIntro.count_specID;
-
-			auto backbuf = asset::ICPUBuffer::create({ sizeof(swapchain) });
-			memcpy(backbuf->getPointer(), &swapchain, sizeof(swapchain));
-			auto entries = core::make_refctd_dynamic_array<core::smart_refctd_dynamic_array<asset::ISpecializedShader::SInfo::SMapEntry>>(5u);
-			(*entries)[0] = { 0u,offsetof(SpecConstants,wg_size),sizeof(int32_t) };//currently local_size_{x|y|z}_id is not queryable via introspection API
-			(*entries)[1] = { particle_count_specID,offsetof(SpecConstants,particle_count),sizeof(int32_t) };
-			(*entries)[2] = { 2u,offsetof(SpecConstants,pos_buf_ix),sizeof(int32_t) };
-			(*entries)[3] = { 3u,offsetof(SpecConstants,vel_buf_ix),sizeof(int32_t) };
-			(*entries)[4] = { buf_count_specID,offsetof(SpecConstants,buf_count),sizeof(int32_t) };
-
-			specInfo = asset::ISpecializedShader::SInfo(std::move(entries), std::move(backbuf), "main");
-		}
-
-		auto compute = core::make_smart_refctd_ptr<asset::ICPUSpecializedShader>(std::move(computeUnspecSPIRV), std::move(specInfo));
-
-		auto computePipeline = introspector.createApproximateComputePipelineFromIntrospection(compute.get());
-		auto computeLayout = core::make_smart_refctd_ptr<asset::ICPUPipelineLayout>(nullptr, nullptr, core::smart_refctd_ptr<asset::ICPUDescriptorSetLayout>(computePipeline->getLayout()->getDescriptorSetLayout(0)));
-		computePipeline->setLayout(core::smart_refctd_ptr(computeLayout));
-
-		// These conversions don't require command buffers
-		m_gpuComputePipeline = CPU2GPU.getGPUObjectsFromAssets(&computePipeline.get(), &computePipeline.get() + 1, cpu2gpuParams)->front();
-		auto* ds0layoutCompute = computeLayout->getDescriptorSetLayout(0);
-		core::smart_refctd_ptr<video::IGPUDescriptorSetLayout> gpuDs0layoutCompute = CPU2GPU.getGPUObjectsFromAssets(&ds0layoutCompute, &ds0layoutCompute + 1, cpu2gpuParams)->front();
-
-		core::vector<core::vector3df_SIMD> particlePosAndVel;
-		particlePosAndVel.reserve(PARTICLE_COUNT * 2);
-		for (int32_t i = 0; i < PARTICLE_COUNT_PER_AXIS; ++i)
-			for (int32_t j = 0; j < PARTICLE_COUNT_PER_AXIS; ++j)
-				for (int32_t k = 0; k < PARTICLE_COUNT_PER_AXIS; ++k)
-					particlePosAndVel.push_back(core::vector3df_SIMD(i, j, k) * 0.5f);
-
-		for (int32_t i = 0; i < PARTICLE_COUNT; ++i)
-			particlePosAndVel.push_back(core::vector3df_SIMD(0.0f));
-
-		constexpr size_t BUF_SZ = 4ull * sizeof(float) * PARTICLE_COUNT;
-		video::IGPUBuffer::SCreationParams bufferCreationParams = {};
-		bufferCreationParams.usage = static_cast<asset::IBuffer::E_USAGE_FLAGS>(asset::IBuffer::EUF_TRANSFER_DST_BIT | asset::IBuffer::EUF_STORAGE_BUFFER_BIT | asset::IBuffer::EUF_VERTEX_BUFFER_BIT);
-		bufferCreationParams.size = 2ull * BUF_SZ;
-		m_gpuParticleBuf = device->createBuffer(std::move(bufferCreationParams));
-		m_gpuParticleBuf->setObjectDebugName("m_gpuParticleBuf");
-		auto particleBufMemReqs = m_gpuParticleBuf->getMemoryReqs();
-		particleBufMemReqs.memoryTypeBits &= device->getPhysicalDevice()->getDeviceLocalMemoryTypeBits();
-		device->allocate(particleBufMemReqs, m_gpuParticleBuf.get());
-		asset::SBufferRange<video::IGPUBuffer> range;
-		range.buffer = m_gpuParticleBuf;
-		range.offset = 0ull;
-		range.size = BUF_SZ * 2ull;
-		utils->updateBufferRangeViaStagingBufferAutoSubmit(range, particlePosAndVel.data(), queues[CommonAPI::InitOutput::EQT_GRAPHICS]);
-		particlePosAndVel.clear();
-
-		video::IGPUBuffer::SCreationParams uboComputeCreationParams = {};
-		uboComputeCreationParams.usage = static_cast<asset::IBuffer::E_USAGE_FLAGS>(asset::IBuffer::EUF_UNIFORM_BUFFER_BIT | asset::IBuffer::EUF_TRANSFER_DST_BIT | asset::IBuffer::EUF_INLINE_UPDATE_VIA_CMDBUF);
-		uboComputeCreationParams.size = core::roundUp(sizeof(UBOCompute), 64ull);
-		auto gpuUboCompute = device->createBuffer(std::move(uboComputeCreationParams));
-		auto gpuUboComputeMemReqs = gpuUboCompute->getMemoryReqs();
-		gpuUboComputeMemReqs.memoryTypeBits &= device->getPhysicalDevice()->getDeviceLocalMemoryTypeBits();
-		device->allocate(gpuUboComputeMemReqs, gpuUboCompute.get());		
-
-		asset::SBufferBinding<video::IGPUBuffer> vtxBindings[video::IGPUMeshBuffer::MAX_ATTR_BUF_BINDING_COUNT];
-		vtxBindings[0].buffer = m_gpuParticleBuf;
-		vtxBindings[0].offset = 0u;
-		//auto meshbuffer = core::make_smart_refctd_ptr<video::IGPUMeshBuffer>(nullptr, nullptr, vtxBindings, asset::SBufferBinding<video::IGPUBuffer>{});
-		//meshbuffer->setIndexCount(PARTICLE_COUNT);
-		//meshbuffer->setIndexType(asset::EIT_UNKNOWN);
-
-
-		auto createSpecShader = [&](const char* filepath, asset::IShader::E_SHADER_STAGE stage)
-		{
-			auto shaderBundle = assetManager->getAsset(filepath, {});
-			auto shaderContents = shaderBundle.getContents();
-			if (shaderContents.empty())
-				assert(false);
-
-			auto specializedShader = static_cast<nbl::asset::ICPUSpecializedShader*>(shaderContents.begin()->get());
-			auto unspecShader = specializedShader->getUnspecialized();
-
-			auto compiler = compilerSet->getShaderCompiler(computeUnspec->getContentType());
-			asset::IShaderCompiler::SCompilerOptions compilerOptions = {};
-			// compilerOptions.entryPoint = specializedShader->getSpecializationInfo().entryPoint;
-			compilerOptions.stage = unspecShader->getStage();
-			compilerOptions.debugInfoFlags = asset::IShaderCompiler::E_DEBUG_INFO_FLAGS::EDIF_SOURCE_BIT;
-			compilerOptions.preprocessorOptions.sourceIdentifier = unspecShader->getFilepathHint(); // already preprocessed but for logging it's best to fill sourceIdentifier
-			compilerOptions.preprocessorOptions.includeFinder = compiler->getDefaultIncludeFinder();
-			auto unspecSPIRV = compilerSet->compileToSPIRV(unspecShader, compilerOptions);
-
-			return core::make_smart_refctd_ptr<asset::ICPUSpecializedShader>(std::move(unspecSPIRV), asset::ISpecializedShader::SInfo(specializedShader->getSpecializationInfo()));
-		};
-		auto vs = createSpecShader("../particles.vert", asset::IShader::ESS_VERTEX);
-		auto fs = createSpecShader("../particles.frag", asset::IShader::ESS_FRAGMENT);
-
-		asset::ICPUSpecializedShader* shaders[2] = { vs.get(),fs.get() };
-		auto pipeline = introspector.createApproximateRenderpassIndependentPipelineFromIntrospection({ shaders, shaders + 2 });
-		{
-			auto& vtxParams = pipeline->getVertexInputParams();
-			vtxParams.attributes[0].binding = 0u;
-			vtxParams.attributes[0].format = asset::EF_R32G32B32_SFLOAT;
-			vtxParams.attributes[0].relativeOffset = 0u;
-			vtxParams.bindings[0].inputRate = asset::EVIR_PER_VERTEX;
-			vtxParams.bindings[0].stride = 4u * sizeof(float);
-
-			pipeline->getPrimitiveAssemblyParams().primitiveType = asset::EPT_POINT_LIST;
-
-			auto& blendParams = pipeline->getBlendParams();
-			blendParams.logicOpEnable = false;
-			blendParams.logicOp = nbl::asset::ELO_NO_OP;
-		}
-		auto gfxLayout = core::make_smart_refctd_ptr<asset::ICPUPipelineLayout>(nullptr, nullptr, core::smart_refctd_ptr<asset::ICPUDescriptorSetLayout>(pipeline->getLayout()->getDescriptorSetLayout(0)));
-		pipeline->setLayout(core::smart_refctd_ptr(gfxLayout));
-
-		m_rpIndependentPipeline = CPU2GPU.getGPUObjectsFromAssets(&pipeline.get(), &pipeline.get() + 1, cpu2gpuParams)->front();
-		auto* ds0layoutGraphics = gfxLayout->getDescriptorSetLayout(0);
-		core::smart_refctd_ptr<video::IGPUDescriptorSetLayout> gpuDs0layoutGraphics = CPU2GPU.getGPUObjectsFromAssets(&ds0layoutGraphics, &ds0layoutGraphics + 1, cpu2gpuParams)->front();
-
-		video::IGPUDescriptorSetLayout* gpuDSLayouts_raw[2] = { gpuDs0layoutCompute.get(), gpuDs0layoutGraphics.get() };
-		const uint32_t setCount[2] = { 1u, 1u };
-		auto dscPool = device->createDescriptorPoolForDSLayouts(video::IDescriptorPool::ECF_NONE, gpuDSLayouts_raw, gpuDSLayouts_raw + 2ull, setCount);
-
-		m_gpuds0Compute = dscPool->createDescriptorSet(std::move(gpuDs0layoutCompute));
-		{
-			video::IGPUDescriptorSet::SDescriptorInfo i[3];
-			video::IGPUDescriptorSet::SWriteDescriptorSet w[2];
-			w[0].arrayElement = 0u;
-			w[0].binding = PARTICLE_BUF_BINDING;
-			w[0].count = BUF_COUNT;
-			w[0].descriptorType = asset::IDescriptor::E_TYPE::ET_STORAGE_BUFFER;
-			w[0].dstSet = m_gpuds0Compute.get();
-			w[0].info = i;
-			w[1].arrayElement = 0u;
-			w[1].binding = COMPUTE_DATA_UBO_BINDING;
-			w[1].count = 1u;
-			w[1].descriptorType = asset::IDescriptor::E_TYPE::ET_UNIFORM_BUFFER;
-			w[1].dstSet = m_gpuds0Compute.get();
-			w[1].info = i + 2u;
-			i[0].desc = m_gpuParticleBuf;
-			i[0].info.buffer.offset = 0ull;
-			i[0].info.buffer.size = BUF_SZ;
-			i[1].desc = m_gpuParticleBuf;
-			i[1].info.buffer.offset = BUF_SZ;
-			i[1].info.buffer.size = BUF_SZ;
-			i[2].desc = gpuUboCompute;
-			i[2].info.buffer.offset = 0ull;
-			i[2].info.buffer.size = gpuUboCompute->getSize();
-
-			device->updateDescriptorSets(2u, w, 0u, nullptr);
-		}
-
-
-		m_gpuds0Graphics = dscPool->createDescriptorSet(std::move(gpuDs0layoutGraphics));
-
-		video::IGPUGraphicsPipeline::SCreationParams gp_params;
-		gp_params.rasterizationSamples = asset::IImage::ESCF_1_BIT;
-		gp_params.renderpass = core::smart_refctd_ptr<video::IGPURenderpass>(renderpass);
-		gp_params.renderpassIndependent = core::smart_refctd_ptr<video::IGPURenderpassIndependentPipeline>(m_rpIndependentPipeline);
-		gp_params.subpassIx = 0u;
-
-		m_graphicsPipeline = device->createGraphicsPipeline(nullptr, std::move(gp_params));
-
-		video::IGPUBuffer::SCreationParams gfxUboCreationParams = {};
-		gfxUboCreationParams.usage = static_cast<asset::IBuffer::E_USAGE_FLAGS>(asset::IBuffer::EUF_UNIFORM_BUFFER_BIT | asset::IBuffer::EUF_TRANSFER_DST_BIT | asset::IBuffer::EUF_INLINE_UPDATE_VIA_CMDBUF);
-		gfxUboCreationParams.size = sizeof(m_viewParams);
-		auto gpuUboGraphics = device->createBuffer(std::move(gfxUboCreationParams));
-		auto gpuUboGraphicsMemReqs = gpuUboGraphics->getMemoryReqs();
-		gpuUboGraphicsMemReqs.memoryTypeBits &= device->getPhysicalDevice()->getDeviceLocalMemoryTypeBits();
-
-		device->allocate(gpuUboGraphicsMemReqs, gpuUboGraphics.get());
-		{
-			video::IGPUDescriptorSet::SWriteDescriptorSet w;
-			video::IGPUDescriptorSet::SDescriptorInfo i;
-			w.arrayElement = 0u;
-			w.binding = GRAPHICS_DATA_UBO_BINDING;
-			w.count = 1u;
-			w.descriptorType = asset::IDescriptor::E_TYPE::ET_UNIFORM_BUFFER;
-			w.dstSet = m_gpuds0Graphics.get();
-			w.info = &i;
-			i.desc = gpuUboGraphics;
-			i.info.buffer.offset = 0u;
-			i.info.buffer.size = gpuUboGraphics->getSize(); // gpuUboGraphics->getSize();
-
-			device->updateDescriptorSets(1u, &w, 0u, nullptr);
-		}
-
-		m_lastTime = std::chrono::high_resolution_clock::now();
-		constexpr uint32_t FRAME_COUNT = 500000u;
-		constexpr uint64_t MAX_TIMEOUT = 99999999999999ull;
-		m_computeUBORange = { 0, gpuUboCompute->getSize(), gpuUboCompute };
-		m_graphicsUBORange = { 0, gpuUboGraphics->getSize(), gpuUboGraphics };
-
-		const auto& graphicsCommandPools = commandPools[CommonAPI::InitOutput::EQT_GRAPHICS];
-		for (uint32_t i = 0u; i < FRAMES_IN_FLIGHT; i++)
-		{
-			device->createCommandBuffers(graphicsCommandPools[i].get(), video::IGPUCommandBuffer::EL_PRIMARY, 1, m_cmdbuf+i);
-			m_imageAcquire[i] = device->createSemaphore();
-			m_renderFinished[i] = device->createSemaphore();
-		}
-	}
-
-	void onAppTerminated_impl() override
-	{
-		device->waitIdle();
-	}
-
-	void workLoopBody() override
-	{
-		m_resourceIx++;
-		if (m_resourceIx >= FRAMES_IN_FLIGHT)
-			m_resourceIx = 0;
-
-		auto& cb = m_cmdbuf[m_resourceIx];
-		auto& fence = m_frameComplete[m_resourceIx];
-		if (fence)
-		{
-			auto retval = device->waitForFences(1u, &fence.get(), false, MAX_TIMEOUT);
-			assert(retval == video::IGPUFence::ES_TIMEOUT || retval == video::IGPUFence::ES_SUCCESS);
-			device->resetFences(1u, &fence.get());
-		}
-		else
-		{
-			fence = device->createFence(static_cast<video::IGPUFence::E_CREATE_FLAGS>(0));
-		}
-
-		// safe to proceed
-		cb->begin(video::IGPUCommandBuffer::EU_ONE_TIME_SUBMIT_BIT);  // TODO: Reset Frame's CommandPool
-
-		{
-			auto time = std::chrono::high_resolution_clock::now();
-			core::vector3df_SIMD gravPoint = m_cameraPosition + m_camFront * 250.f;
-			m_uboComputeData.gravPointAndDt = gravPoint;
-			m_uboComputeData.gravPointAndDt.w = std::chrono::duration_cast<std::chrono::milliseconds>(time - m_lastTime).count() * 1e-4;
-
-			m_lastTime = time;
-			cb->updateBuffer(m_computeUBORange.buffer.get(), m_computeUBORange.offset, m_computeUBORange.size, &m_uboComputeData);
-		}
-		cb->bindComputePipeline(m_gpuComputePipeline.get());
-		cb->bindDescriptorSets(asset::EPBP_COMPUTE,
-			m_gpuComputePipeline->getLayout(),
-			COMPUTE_SET,
-			1u,
-			&m_gpuds0Compute.get(),
-			0u);
-		cb->dispatch(PARTICLE_COUNT / WORKGROUP_SIZE, 1u, 1u);
-
-		asset::SMemoryBarrier memBarrier;
-		memBarrier.srcAccessMask = asset::EAF_SHADER_WRITE_BIT;
-		memBarrier.dstAccessMask = asset::EAF_VERTEX_ATTRIBUTE_READ_BIT;
-		cb->pipelineBarrier(
-			asset::EPSF_COMPUTE_SHADER_BIT,
-			asset::EPSF_VERTEX_INPUT_BIT,
-			static_cast<asset::E_DEPENDENCY_FLAGS>(0u),
-			1, &memBarrier,
-			0, nullptr,
-			0, nullptr);
-
-		{
-			memcpy(m_viewParams.MVP, &m_viewProj, sizeof(m_viewProj));
-			cb->updateBuffer(m_graphicsUBORange.buffer.get(), m_graphicsUBORange.offset, m_graphicsUBORange.size, &m_viewParams);
-		}
-		{
-			asset::SViewport vp;
-			vp.minDepth = 1.f;
-			vp.maxDepth = 0.f;
-			vp.x = 0u;
-			vp.y = 0u;
-			vp.width = WIN_W;
-			vp.height = WIN_H;
-			cb->setViewport(0u, 1u, &vp);
-
-			VkRect2D scissor;
-			scissor.offset = { 0, 0 };
-			scissor.extent = { WIN_W, WIN_H };
-			cb->setScissor(0u, 1u, &scissor);
-		}
-		// renderpass 
-		uint32_t imgnum = 0u;
-		swapchain->acquireNextImage(MAX_TIMEOUT, m_imageAcquire[m_resourceIx].get(), nullptr, &imgnum);
-		{
-			video::IGPUCommandBuffer::SRenderpassBeginInfo info;
-			asset::SClearValue clear;
-			clear.color.float32[0] = 0.f;
-			clear.color.float32[1] = 0.f;
-			clear.color.float32[2] = 0.f;
-			clear.color.float32[3] = 1.f;
-			info.renderpass = renderpass;
-			info.framebuffer = fbo->begin()[imgnum];
-			info.clearValueCount = 1u;
-			info.clearValues = &clear;
-			info.renderArea.offset = { 0, 0 };
-			info.renderArea.extent = { WIN_W, WIN_H };
-			cb->beginRenderPass(&info, asset::ESC_INLINE);
-		}
-		// individual draw
-		{
-			cb->bindGraphicsPipeline(m_graphicsPipeline.get());
-			size_t vbOffset = 0;
-			cb->bindVertexBuffers(0, 1, &m_gpuParticleBuf.get(), &vbOffset);
-			cb->bindDescriptorSets(asset::EPBP_GRAPHICS, m_rpIndependentPipeline->getLayout(), GRAPHICS_SET, 1u, &m_gpuds0Graphics.get(), 0u);
-			cb->draw(PARTICLE_COUNT, 1, 0, 0);
-		}
-		cb->endRenderPass();
-		cb->end();
-
-		CommonAPI::Submit(
-			device.get(),
-			cb.get(),
-			queues[CommonAPI::InitOutput::EQT_GRAPHICS],
-			m_imageAcquire[m_resourceIx].get(),
-			m_renderFinished[m_resourceIx].get(),
-			fence.get());
-
-		CommonAPI::Present(
-			device.get(),
-			swapchain.get(),
-			queues[CommonAPI::InitOutput::EQT_GRAPHICS],
-			m_renderFinished[m_resourceIx].get(),
-			imgnum);
-	}
-
-	bool keepRunning() override
-	{
-		return windowCb->isWindowOpen();
-	}
-};
-
-NBL_COMMON_API_MAIN(SpecializationConstantsSampleApp)
-
-extern "C" {  _declspec(dllexport) DWORD NvOptimusEnablement = 0x00000001; }
\ No newline at end of file
diff --git a/29_SpecializationConstants/particles.comp b/29_SpecializationConstants/particles.comp
deleted file mode 100644
index 5889af74c..000000000
--- a/29_SpecializationConstants/particles.comp
+++ /dev/null
@@ -1,39 +0,0 @@
-// Copyright (C) 2018-2020 - DevSH Graphics Programming Sp. z O.O.
-// This file is part of the "Nabla Engine".
-// For conditions of distribution and use, see copyright notice in nabla.h
-
-#version 430 core
-
-layout (constant_id = 1) const int PARTICLE_COUNT = 256;
-layout (constant_id = 2) const int POS_BUF_IX = 0;
-layout (constant_id = 3) const int VEL_BUF_IX = 1;
-layout (constant_id = 4) const int BUF_COUNT = 2;
-
-layout (local_size_x_id = 0) in;
-
-layout (set = 0, binding = 0, std430) restrict buffer PARTICLE_DATA
-{
-	vec3 p[PARTICLE_COUNT];
-} data[BUF_COUNT];
-layout (set = 0, binding = 1, std140) uniform UBO
-{
-	vec3 gravP;
-	float dt;
-} ubo;
-
-void main()
-{
-	uint GID = gl_GlobalInvocationID.x;
-	
-	vec3 p = data[POS_BUF_IX].p[GID];
-	vec3 v = data[VEL_BUF_IX].p[GID];
-	
-	v *= 1.0 - 0.99*ubo.dt;
-	float d = distance(ubo.gravP,p);
-	float a = 10000.0 / max(1.0, 0.01*pow(d,1.5));
-	v += (ubo.gravP-p)/d * a * ubo.dt;
-	p += v*ubo.dt;
-	
-	data[POS_BUF_IX].p[GID] = p;
-	data[VEL_BUF_IX].p[GID] = v;
-}
\ No newline at end of file
diff --git a/29_SpecializationConstants/particles.frag b/29_SpecializationConstants/particles.frag
deleted file mode 100644
index c03ba9afc..000000000
--- a/29_SpecializationConstants/particles.frag
+++ /dev/null
@@ -1,12 +0,0 @@
-// Copyright (C) 2018-2020 - DevSH Graphics Programming Sp. z O.O.
-// This file is part of the "Nabla Engine".
-// For conditions of distribution and use, see copyright notice in nabla.h
-
-#version 430 core
-
-layout (location = 0) out vec4 Color;
-
-void main()
-{
-	Color = vec4(1.0);
-}
\ No newline at end of file
diff --git a/29_SpecializationConstants/particles.vert b/29_SpecializationConstants/particles.vert
deleted file mode 100644
index f87486cac..000000000
--- a/29_SpecializationConstants/particles.vert
+++ /dev/null
@@ -1,21 +0,0 @@
-// Copyright (C) 2018-2020 - DevSH Graphics Programming Sp. z O.O.
-// This file is part of the "Nabla Engine".
-// For conditions of distribution and use, see copyright notice in nabla.h
-
-#version 430 core
-
-layout (location = 0) in vec3 vPos;
-
-#include <nbl/builtin/glsl/utils/common.glsl>
-#include <nbl/builtin/glsl/utils/transform.glsl>
-
-layout (set = 0, binding = 0, row_major, std140) uniform UBO
-{
-    nbl_glsl_SBasicViewParameters params;
-} CamData;
-
-void main()
-{
-	gl_PointSize = 1;
-	gl_Position = nbl_glsl_pseudoMul4x4with3x1(CamData.params.MVP, vPos);
-}
\ No newline at end of file
diff --git a/29_SpecializationConstants/pipeline.groovy b/29_SpecializationConstants/pipeline.groovy
deleted file mode 100644
index d61a3c808..000000000
--- a/29_SpecializationConstants/pipeline.groovy
+++ /dev/null
@@ -1,50 +0,0 @@
-import org.DevshGraphicsProgramming.Agent
-import org.DevshGraphicsProgramming.BuilderInfo
-import org.DevshGraphicsProgramming.IBuilder
-
-class CSpecializationConstantsBuilder extends IBuilder
-{
-	public CSpecializationConstantsBuilder(Agent _agent, _info)
-	{
-		super(_agent, _info)
-	}
-	
-	@Override
-	public boolean prepare(Map axisMapping)
-	{
-		return true
-	}
-	
-	@Override
-  	public boolean build(Map axisMapping)
-	{
-		IBuilder.CONFIGURATION config = axisMapping.get("CONFIGURATION")
-		IBuilder.BUILD_TYPE buildType = axisMapping.get("BUILD_TYPE")
-		
-		def nameOfBuildDirectory = getNameOfBuildDirectory(buildType)
-		def nameOfConfig = getNameOfConfig(config)
-		
-		agent.execute("cmake --build ${info.rootProjectPath}/${nameOfBuildDirectory}/${info.targetProjectPathRelativeToRoot} --target ${info.targetBaseName} --config ${nameOfConfig} -j12 -v")
-		
-		return true
-	}
-	
-	@Override
-  	public boolean test(Map axisMapping)
-	{
-		return true
-	}
-	
-	@Override
-	public boolean install(Map axisMapping)
-	{
-		return true
-	}
-}
-
-def create(Agent _agent, _info)
-{
-	return new CSpecializationConstantsBuilder(_agent, _info)
-}
-
-return this
\ No newline at end of file
diff --git a/CMakeLists.txt b/CMakeLists.txt
index 0b3279a48..0c0584ebe 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -64,6 +64,7 @@ if(NBL_BUILD_EXAMPLES)
 	add_subdirectory(26_Blur EXCLUDE_FROM_ALL)
 	add_subdirectory(27_MPMCScheduler EXCLUDE_FROM_ALL)	
 	add_subdirectory(28_FFTBloom EXCLUDE_FROM_ALL)
+	add_subdirectory(29_MeshLoaders EXCLUDE_FROM_ALL)
 	# add_subdirectory(36_CUDAInterop EXCLUDE_FROM_ALL)
 
 	# Showcase compute pathtracing

From 3d898943fb9bd4690aa3b92b7a80f5a61198f0de Mon Sep 17 00:00:00 2001
From: keptsecret <sorchon@gmail.com>
Date: Tue, 27 May 2025 16:12:55 +0700
Subject: [PATCH 313/529] fix template accessors

---
 .../app_resources/testWorkgroup.comp.hlsl     | 24 +++++++++----------
 .../benchmarkWorkgroup.comp.hlsl              | 20 ++++++++--------
 2 files changed, 22 insertions(+), 22 deletions(-)

diff --git a/23_Arithmetic2UnitTest/app_resources/testWorkgroup.comp.hlsl b/23_Arithmetic2UnitTest/app_resources/testWorkgroup.comp.hlsl
index bda735b44..0a7fde9ba 100644
--- a/23_Arithmetic2UnitTest/app_resources/testWorkgroup.comp.hlsl
+++ b/23_Arithmetic2UnitTest/app_resources/testWorkgroup.comp.hlsl
@@ -18,12 +18,12 @@ groupshared uint32_t scratch[config_t::SharedScratchElementCount];
 
 struct ScratchProxy
 {
-    template<typename IndexType, typename AccessType>
+    template<typename AccessType, typename IndexType>
     void get(const uint32_t ix, NBL_REF_ARG(AccessType) value)
     {
         value = scratch[ix];
     }
-    template<typename IndexType, typename AccessType>
+    template<typename AccessType, typename IndexType>
     void set(const uint32_t ix, const AccessType value)
     {
         scratch[ix] = value;
@@ -47,18 +47,18 @@ struct DataProxy
     using dtype_t = vector<uint32_t, Config::ItemsPerInvocation_0>;
     static_assert(nbl::hlsl::is_same_v<dtype_t, type_t>);
 
-    template<typename AccessType>
-    void get(const uint32_t ix, NBL_REF_ARG(dtype_t) value)
+    template<typename AccessType, typename IndexType>
+    void get(const IndexType ix, NBL_REF_ARG(AccessType) value)
     {
         const uint32_t workgroupOffset = nbl::hlsl::glsl::gl_WorkGroupID().x * Config::VirtualWorkgroupSize;
-        value = vk::RawBufferLoad<dtype_t>(pc.inputBufAddress + (workgroupOffset + ix) * sizeof(dtype_t));
+        value = vk::RawBufferLoad<AccessType>(pc.inputBufAddress + (workgroupOffset + ix) * sizeof(AccessType));
     }
-    template<typename AccessType>
-    void set(const uint32_t ix, const dtype_t value)
+    template<typename AccessType, typename IndexType>
+    void set(const IndexType ix, const AccessType value)
     {
         const uint32_t workgroupOffset = nbl::hlsl::glsl::gl_WorkGroupID().x * Config::VirtualWorkgroupSize;
         uint64_t outputBufAddr = vk::RawBufferLoad<uint64_t>(pc.outputAddressBufAddress + Binop::BindingIndex * sizeof(uint64_t));
-        vk::RawBufferStore<dtype_t>(outputBufAddr + sizeof(uint32_t) + sizeof(dtype_t) * (workgroupOffset+ix), value, sizeof(uint32_t));
+        vk::RawBufferStore<AccessType>(outputBufAddr + sizeof(uint32_t) + sizeof(AccessType) * (workgroupOffset+ix), value, sizeof(uint32_t));
     }
 
     void workgroupExecutionAndMemoryBarrier()
@@ -76,13 +76,13 @@ struct PreloadedDataProxy
 
     NBL_CONSTEXPR_STATIC_INLINE uint32_t PreloadedDataCount = Config::VirtualWorkgroupSize / Config::WorkgroupSize;
 
-    template<typename AccessType>
-    void get(const uint32_t ix, NBL_REF_ARG(dtype_t) value)
+    template<typename AccessType, typename IndexType>
+    void get(const IndexType ix, NBL_REF_ARG(AccessType) value)
     {
         value = preloaded[(ix-nbl::hlsl::workgroup::SubgroupContiguousIndex())>>Config::WorkgroupSizeLog2];
     }
-    template<typename AccessType>
-    void set(const uint32_t ix, const dtype_t value)
+    template<typename AccessType, typename IndexType>
+    void set(const IndexType ix, const AccessType value)
     {
         preloaded[(ix-nbl::hlsl::workgroup::SubgroupContiguousIndex())>>Config::WorkgroupSizeLog2] = value;
     }
diff --git a/29_Arithmetic2Bench/app_resources/benchmarkWorkgroup.comp.hlsl b/29_Arithmetic2Bench/app_resources/benchmarkWorkgroup.comp.hlsl
index bfbe30ac9..e44bf4f06 100644
--- a/29_Arithmetic2Bench/app_resources/benchmarkWorkgroup.comp.hlsl
+++ b/29_Arithmetic2Bench/app_resources/benchmarkWorkgroup.comp.hlsl
@@ -18,12 +18,12 @@ groupshared uint32_t scratch[config_t::SharedScratchElementCount];
 
 struct ScratchProxy
 {
-    template<typename IndexType, typename AccessType>
+    template<typename AccessType, typename IndexType>
     void get(const IndexType ix, NBL_REF_ARG(AccessType) value)
     {
         value = scratch[ix];
     }
-    template<typename IndexType, typename AccessType>
+    template<typename AccessType, typename IndexType>
     void set(const IndexType ix, const AccessType value)
     {
         scratch[ix] = value;
@@ -49,14 +49,14 @@ struct DataProxy
     static_assert(nbl::hlsl::is_same_v<dtype_t, type_t>);
 
     // we don't want to write/read storage multiple times in loop; doesn't seem optimized out in generated spirv
-    template<typename AccessType>
-    void get(const uint32_t ix, NBL_REF_ARG(dtype_t) value)
+    template<typename AccessType, typename IndexType>
+    void get(const IndexType ix, NBL_REF_ARG(dtype_t) value)
     {
         // value = inputValue[ix];
         value = nbl::hlsl::promote<dtype_t>(globalIndex());
     }
-    template<typename AccessType>
-    void set(const uint32_t ix, const dtype_t value)
+    template<typename AccessType, typename IndexType>
+    void set(const IndexType ix, const dtype_t value)
     {
         // output[Binop::BindingIndex].template Store<type_t>(sizeof(uint32_t) + sizeof(type_t) * ix, value);
     }
@@ -76,13 +76,13 @@ struct PreloadedDataProxy
 
     NBL_CONSTEXPR_STATIC_INLINE uint32_t PreloadedDataCount = Config::VirtualWorkgroupSize / Config::WorkgroupSize;
 
-    template<typename AccessType>
-    void get(const uint32_t ix, NBL_REF_ARG(dtype_t) value)
+    template<typename AccessType, typename IndexType>
+    void get(const IndexType ix, NBL_REF_ARG(AccessType) value)
     {
         value = preloaded[(ix-nbl::hlsl::workgroup::SubgroupContiguousIndex())>>Config::WorkgroupSizeLog2];
     }
-    template<typename AccessType>
-    void set(const uint32_t ix, const dtype_t value)
+    template<typename AccessType, typename IndexType>
+    void set(const IndexType ix, const AccessType value)
     {
         preloaded[(ix-nbl::hlsl::workgroup::SubgroupContiguousIndex())>>Config::WorkgroupSizeLog2] = value;
     }

From 3d63ed732838c3073dfb7993d3eb1305fb5882be Mon Sep 17 00:00:00 2001
From: keptsecret <sorchon@gmail.com>
Date: Tue, 27 May 2025 16:30:55 +0700
Subject: [PATCH 314/529] add accessor index template type

---
 28_FFTBloom/app_resources/fft_common.hlsl | 20 ++++++++++----------
 1 file changed, 10 insertions(+), 10 deletions(-)

diff --git a/28_FFTBloom/app_resources/fft_common.hlsl b/28_FFTBloom/app_resources/fft_common.hlsl
index 41f8821cc..9f2be1432 100644
--- a/28_FFTBloom/app_resources/fft_common.hlsl
+++ b/28_FFTBloom/app_resources/fft_common.hlsl
@@ -5,13 +5,13 @@ groupshared uint32_t sharedmem[FFTParameters::SharedMemoryDWORDs];
 
 struct SharedMemoryAccessor
 {
-	template <typename IndexType, typename AccessType>
+	template <typename AccessType, typename IndexType>
 	void set(IndexType idx, AccessType value)
 	{
 		sharedmem[idx] = value;
 	}
 
-	template <typename IndexType, typename AccessType>
+	template <typename AccessType, typename IndexType>
 	void get(IndexType idx, NBL_REF_ARG(AccessType) value)
 	{
 		value = sharedmem[idx];
@@ -36,14 +36,14 @@ struct PreloadedAccessorCommonBase
 
 struct PreloadedAccessorBase : PreloadedAccessorCommonBase
 {
-	template <typename AccessType>
-	void set(uint32_t idx, AccessType value)
+	template <typename AccessType, typename IndexType>
+	void set(IndexType idx, AccessType value)
 	{
 		preloaded[idx >> WorkgroupSizeLog2] = value;
 	}
 
-	template <typename AccessType>
-	void get(uint32_t idx, NBL_REF_ARG(AccessType) value)
+	template <typename AccessType, typename IndexType>
+	void get(IndexType idx, NBL_REF_ARG(AccessType) value)
 	{
 		value = preloaded[idx >> WorkgroupSizeLog2];
 	}
@@ -54,14 +54,14 @@ struct PreloadedAccessorBase : PreloadedAccessorCommonBase
 // In the case for preloading all channels at once we make it stateful so we track which channel we're running FFT on
 struct MultiChannelPreloadedAccessorBase : PreloadedAccessorCommonBase
 {
-	template <typename AccessType>
-	void set(uint32_t idx, AccessType value)
+	template <typename AccessType, typename IndexType>
+	void set(IndexType idx, AccessType value)
 	{
 		preloaded[currentChannel][idx >> WorkgroupSizeLog2] = value;
 	}
 
-	template <typename AccessType>
-	void get(uint32_t idx, NBL_REF_ARG(AccessType) value)
+	template <typename AccessType, typename IndexType>
+	void get(IndexType idx, NBL_REF_ARG(AccessType) value)
 	{
 		value = preloaded[currentChannel][idx >> WorkgroupSizeLog2];
 	}

From 2f743b5a9f9c1f83e2c0b5e6aae62f632416c888 Mon Sep 17 00:00:00 2001
From: Przemek <minikers21@gmail.com>
Date: Tue, 27 May 2025 16:06:31 +0200
Subject: [PATCH 315/529] Implemented height map

---
 62_CAD/DrawResourcesFiller.cpp                |  4 +-
 62_CAD/DrawResourcesFiller.h                  |  3 +-
 62_CAD/main.cpp                               | 30 +++++++++++++--
 62_CAD/shaders/main_pipeline/common.hlsl      |  2 +-
 .../main_pipeline/fragment_shader.hlsl        | 37 +++++++++++--------
 .../shaders/main_pipeline/vertex_shader.hlsl  |  3 +-
 6 files changed, 55 insertions(+), 24 deletions(-)

diff --git a/62_CAD/DrawResourcesFiller.cpp b/62_CAD/DrawResourcesFiller.cpp
index 8b440edf7..c81f781bf 100644
--- a/62_CAD/DrawResourcesFiller.cpp
+++ b/62_CAD/DrawResourcesFiller.cpp
@@ -613,9 +613,10 @@ bool DrawResourcesFiller::queueGeoreferencedImageCopy_Internal(image_id imageID,
 // We don't have an allocator or memory management for texture updates yet, see how `_test_addImageObject` is being temporarily used (Descriptor updates and pipeline barriers) to upload an image into gpu and update a descriptor slot (it will become more sophisticated but doesn't block you)
 void DrawResourcesFiller::drawGridDTM(
 	const float64_t2& topLeft,
-	float64_t height,
 	float64_t width,
+	float64_t height,
 	float gridCellWidth,
+	uint64_t textureID,
 	const DTMSettingsInfo& dtmSettingsInfo,
 	SIntendedSubmitInfo& intendedNextSubmit)
 {
@@ -624,6 +625,7 @@ void DrawResourcesFiller::drawGridDTM(
 	gridDTMInfo.height = height;
 	gridDTMInfo.width = width;
 	gridDTMInfo.gridCellWidth = gridCellWidth;
+	gridDTMInfo.textureID = getImageIndexFromID(textureID, intendedNextSubmit); // for this to be valid and safe, this function needs to be called immediately after `addStaticImage` function to make sure image is in memory
 
 	if (dtmSettingsInfo.mode & E_DTM_MODE::OUTLINE)
 	{
diff --git a/62_CAD/DrawResourcesFiller.h b/62_CAD/DrawResourcesFiller.h
index 04bc08df3..619257e2c 100644
--- a/62_CAD/DrawResourcesFiller.h
+++ b/62_CAD/DrawResourcesFiller.h
@@ -210,9 +210,10 @@ struct DrawResourcesFiller
 		SIntendedSubmitInfo& intendedNextSubmit);
 
 	void drawGridDTM(const float64_t2& topLeft,
-		float64_t height,
 		float64_t width,
+		float64_t height,
 		float gridCellWidth,
+		uint64_t textureID,
 		const DTMSettingsInfo& dtmSettingsInfo,
 		SIntendedSubmitInfo& intendedNextSubmit);
 	
diff --git a/62_CAD/main.cpp b/62_CAD/main.cpp
index c5123473b..7114c1a06 100644
--- a/62_CAD/main.cpp
+++ b/62_CAD/main.cpp
@@ -1153,7 +1153,7 @@ class ComputerAidedDesign final : public examples::SimpleWindowedApplication, pu
 			"../../media/color_space_test/R8G8B8A8_1.png",
 		};
 
-		for (const auto& imagePath : imagePaths)
+		auto loadImage = [&](const std::string& imagePath) -> smart_refctd_ptr<ICPUImage>
 		{
 			constexpr auto cachingFlags = static_cast<IAssetLoader::E_CACHING_FLAGS>(IAssetLoader::ECF_DONT_CACHE_REFERENCES & IAssetLoader::ECF_DONT_CACHE_TOP_LEVEL);
 			const IAssetLoader::SAssetLoadParams loadParams(0ull, nullptr, cachingFlags, IAssetLoader::ELPF_NONE, m_logger.get(), m_loadCWD);
@@ -1162,6 +1162,7 @@ class ComputerAidedDesign final : public examples::SimpleWindowedApplication, pu
 			if (contents.empty())
 			{
 				m_logger->log("Failed to load image with path %s, skipping!", ILogger::ELL_ERROR, (m_loadCWD / imagePath).c_str());
+				return nullptr;
 			}
 
 			smart_refctd_ptr<ICPUImageView> cpuImgView;
@@ -1195,6 +1196,7 @@ class ComputerAidedDesign final : public examples::SimpleWindowedApplication, pu
 				break;
 			default:
 				m_logger->log("Failed to load ICPUImage or ICPUImageView got some other Asset Type, skipping!", ILogger::ELL_ERROR);
+				return nullptr;
 			}
 
 
@@ -1243,14 +1245,24 @@ class ComputerAidedDesign final : public examples::SimpleWindowedApplication, pu
 				promotedCPUImage->setBufferAndRegions(std::move(promotedCPUImageBuffer), newRegions);
 
 				performImageFormatPromotionCopy(loadedCPUImage, promotedCPUImage);
-				sampleImages.push_back(promotedCPUImage);
+				return promotedCPUImage;
 			}
 			else
 			{
-				sampleImages.push_back(loadedCPUImage);
+				return loadedCPUImage;
 			}
+		};
+
+		for (const auto& imagePath : imagePaths)
+		{
+			auto image = loadImage(imagePath);
+			if (image)
+				sampleImages.push_back(image);
 		}
 
+		gridDTMHeightMap = loadImage("../../media/gridDTMHeightMap.exr");
+		assert(gridDTMHeightMap);
+
 		return true;
 	}
 
@@ -3598,7 +3610,16 @@ class ComputerAidedDesign final : public examples::SimpleWindowedApplication, pu
 				}
 			}
 
-			drawResourcesFiller.drawGridDTM({ 0.0f, 200.0f }, 400.0f, 800.0f, 40.0f, dtmInfo, intendedNextSubmit);
+			constexpr float HeightMapCellWidth = 50.0f;
+			const auto heightMapExtent = gridDTMHeightMap->getCreationParameters().extent;
+			assert(heightMapExtent.width > 0 && heightMapExtent.height > 0);
+			const float heightMapWidth = (heightMapExtent.width - 1) * HeightMapCellWidth;
+			const float heightMapHeight = (heightMapExtent.height - 1) * HeightMapCellWidth;
+
+			const uint64_t heightMapTextureID = 0ull;
+			if (!drawResourcesFiller.ensureStaticImageAvailability({ heightMapTextureID, gridDTMHeightMap }, intendedNextSubmit))
+				m_logger->log("Grid DTM height map texture unavailable!", ILogger::ELL_ERROR);
+			drawResourcesFiller.drawGridDTM({ 0.0f, 200.0f }, heightMapWidth, heightMapHeight, HeightMapCellWidth, heightMapTextureID,  dtmInfo, intendedNextSubmit);
 		}
 	}
 
@@ -3673,6 +3694,7 @@ class ComputerAidedDesign final : public examples::SimpleWindowedApplication, pu
 	std::vector<std::unique_ptr<msdfgen::Shape>> m_shapeMSDFImages = {};
 
 	std::vector<smart_refctd_ptr<ICPUImage>> sampleImages;
+	smart_refctd_ptr<ICPUImage> gridDTMHeightMap;
 
 	static constexpr char FirstGeneratedCharacter = ' ';
 	static constexpr char LastGeneratedCharacter = '~';
diff --git a/62_CAD/shaders/main_pipeline/common.hlsl b/62_CAD/shaders/main_pipeline/common.hlsl
index e492fe4ec..79dbc0bd1 100644
--- a/62_CAD/shaders/main_pipeline/common.hlsl
+++ b/62_CAD/shaders/main_pipeline/common.hlsl
@@ -232,7 +232,7 @@ struct PSInput
 #endif
 
     /* GRID DTM */
-    uint getGridDTMHeightTextureID(uint textureID) { return data1.z; }
+    uint getGridDTMHeightTextureID() { return data1.z; }
     float2 getGridDTMScreenSpaceTopLeft() { return data2.xy; }
     float2 getGridDTMScreenSpaceGridExtents() { return data2.zw; }
     float getGridDTMScreenSpaceCellWidth() { return data3.x; }
diff --git a/62_CAD/shaders/main_pipeline/fragment_shader.hlsl b/62_CAD/shaders/main_pipeline/fragment_shader.hlsl
index 7738b169b..081a4ef16 100644
--- a/62_CAD/shaders/main_pipeline/fragment_shader.hlsl
+++ b/62_CAD/shaders/main_pipeline/fragment_shader.hlsl
@@ -404,7 +404,10 @@ float4 fragMain(PSInput input) : SV_TARGET
             // [NOTE] Do dilation as last step, when everything else works fine
 
             DTMSettings dtmSettings = loadDTMSettings(mainObj.dtmSettingsIdx);
+
             float2 pos = input.getGridDTMScreenSpacePosition();
+            float2 uv = input.getImageUV();
+            const uint32_t textureId = input.getGridDTMHeightTextureID();
 
             // grid consists of square cells and cells are divided into two triangles:
             // depending on mode it is
@@ -438,37 +441,39 @@ float4 fragMain(PSInput input) : SV_TARGET
                     cellCoords.y = uint32_t(gridSpacePosDivGridCellWidth.y);
                 }
 
-                // TODO: do we want to calculate it in the vertex shader?
-                const float MaxCellCoordX = round(gridExtents.x / cellWidth);
-                const float MaxCellCoordY = round(gridExtents.y / cellWidth);
-
                 float2 insideCellCoord = gridSpacePos - float2(cellWidth, cellWidth) * cellCoords; // TODO: use fmod instead?
                 
                 // my ASCII art above explains which triangle is A and which is B
                 const bool triangleA = diagonalFromTopLeftToBottomRight ?
                     insideCellCoord.x < cellWidth - insideCellCoord.y :
                     insideCellCoord.x < insideCellCoord.y;
-                
 
                 float2 gridSpaceCellTopLeftCoords = cellCoords * cellWidth;
 
+                const float InvalidHeightValue = asfloat(0x7FC00000);
+                float4 cellHeights = float4(InvalidHeightValue, InvalidHeightValue, InvalidHeightValue, InvalidHeightValue);
+                if (textureId != InvalidTextureIndex)
+                {
+                    const float2 maxCellCoords = float2(round(gridExtents.x / cellWidth), round(gridExtents.y / cellWidth));
+                    const float2 location = (cellCoords + float2(0.5f, 0.5f)) / maxCellCoords;
+
+                    cellHeights = textures[NonUniformResourceIndex(textureId)].Gather(textureSampler, float2(location.x, location.y), 0);
+                    if (cellHeights.x == 100.0f)
+                        printf("uv = { %f, %f }cellHeights = { %f, %f, %f, %f }", location.x, location.y, cellHeights.x, cellHeights.y, cellHeights.z, cellHeights.w);
+                }
+
                 if (diagonalFromTopLeftToBottomRight)
                 {
-                    v[0] = float3(gridSpaceCellTopLeftCoords.x, gridSpaceCellTopLeftCoords.y + cellWidth, 0.0f);
-                    v[1] = float3(gridSpaceCellTopLeftCoords.x + cellWidth, gridSpaceCellTopLeftCoords.y, 0.0f);
-                    v[2] = triangleA ? float3(gridSpaceCellTopLeftCoords.x, gridSpaceCellTopLeftCoords.y, 0.0f) : float3(gridSpaceCellTopLeftCoords.x + cellWidth, gridSpaceCellTopLeftCoords.y + cellWidth, 0.0f);
+                    v[0] = float3(gridSpaceCellTopLeftCoords.x, gridSpaceCellTopLeftCoords.y + cellWidth, cellHeights.x);
+                    v[1] = float3(gridSpaceCellTopLeftCoords.x + cellWidth, gridSpaceCellTopLeftCoords.y, cellHeights.z);
+                    v[2] = triangleA ? float3(gridSpaceCellTopLeftCoords.x, gridSpaceCellTopLeftCoords.y, cellHeights.w) : float3(gridSpaceCellTopLeftCoords.x + cellWidth, gridSpaceCellTopLeftCoords.y + cellWidth, cellHeights.y);
                 }
                 else
                 {
-                    v[0] = float3(gridSpaceCellTopLeftCoords.x, gridSpaceCellTopLeftCoords.y, 0.0f);
-                    v[1] = float3(gridSpaceCellTopLeftCoords.x + cellWidth, gridSpaceCellTopLeftCoords.y + cellWidth, 0.0f);
-                    v[2] = triangleA ? float3(gridSpaceCellTopLeftCoords.x, gridSpaceCellTopLeftCoords.y + cellWidth, 0.0f) : float3(gridSpaceCellTopLeftCoords.x + cellWidth, gridSpaceCellTopLeftCoords.y, 0.0f);
+                    v[0] = float3(gridSpaceCellTopLeftCoords.x, gridSpaceCellTopLeftCoords.y, cellHeights.w);
+                    v[1] = float3(gridSpaceCellTopLeftCoords.x + cellWidth, gridSpaceCellTopLeftCoords.y + cellWidth, cellHeights.y);
+                    v[2] = triangleA ? float3(gridSpaceCellTopLeftCoords.x, gridSpaceCellTopLeftCoords.y + cellWidth, cellHeights.x) : float3(gridSpaceCellTopLeftCoords.x + cellWidth, gridSpaceCellTopLeftCoords.y, cellHeights.z);
                 }
-                 
-                // TODO: remove when implementing height texture
-                [unroll]
-                for (uint i = 0; i < 3; ++i)
-                    v[i].z = -20.0f + 5.0f * (v[i].x + v[i].y) / cellWidth;
 
                 if (isnan(v[0].z) || isnan(v[1].z) || isnan(v[2].z))
                     discard;
diff --git a/62_CAD/shaders/main_pipeline/vertex_shader.hlsl b/62_CAD/shaders/main_pipeline/vertex_shader.hlsl
index 0624c159d..7fd533439 100644
--- a/62_CAD/shaders/main_pipeline/vertex_shader.hlsl
+++ b/62_CAD/shaders/main_pipeline/vertex_shader.hlsl
@@ -650,7 +650,7 @@ PSInput main(uint vertexID : SV_VertexID)
             pfloat64_t2 topLeft = vk::RawBufferLoad<pfloat64_t2>(globals.pointers.geometryBuffer + drawObj.geometryAddress, 8u);
             pfloat64_t height = vk::RawBufferLoad<pfloat64_t>(globals.pointers.geometryBuffer + drawObj.geometryAddress + sizeof(pfloat64_t2), 8u);
             pfloat64_t width = vk::RawBufferLoad<pfloat64_t>(globals.pointers.geometryBuffer + drawObj.geometryAddress + sizeof(pfloat64_t2) + sizeof(pfloat64_t), 8u);
-            uint32_t dtmSettingsID = vk::RawBufferLoad<uint32_t>(globals.pointers.geometryBuffer + drawObj.geometryAddress + sizeof(pfloat64_t2) + 2 * sizeof(pfloat64_t), 8u);
+            uint32_t textureID = vk::RawBufferLoad<uint32_t>(globals.pointers.geometryBuffer + drawObj.geometryAddress + sizeof(pfloat64_t2) + 2 * sizeof(pfloat64_t), 8u);
             float gridCellWidth = vk::RawBufferLoad<float>(globals.pointers.geometryBuffer + drawObj.geometryAddress + sizeof(pfloat64_t2) + 2 * sizeof(pfloat64_t) + sizeof(uint32_t), 8u);
             float reciprocalOutlineStipplePatternLength = vk::RawBufferLoad<float>(globals.pointers.geometryBuffer + drawObj.geometryAddress + sizeof(pfloat64_t2) + 2 * sizeof(pfloat64_t) + sizeof(uint32_t) + sizeof(float), 8u);
 
@@ -666,6 +666,7 @@ PSInput main(uint vertexID : SV_VertexID)
             float2 ndcVtxPos = _static_cast<float2>(transformPointNdc(clipProjectionData.projectionToNDC, vtxPos));
             outV.position = float4(ndcVtxPos, 0.0f, 1.0f);
 
+            outV.setGridDTMHeightTextureID(textureID);
             outV.setGridDTMScreenSpaceCellWidth(gridCellWidth * globals.screenToWorldRatio);
             outV.setGridDTMScreenSpacePosition(transformPointScreenSpace(clipProjectionData.projectionToNDC, globals.resolution, vtxPos));
             outV.setGridDTMScreenSpaceTopLeft(transformPointScreenSpace(clipProjectionData.projectionToNDC, globals.resolution, topLeft));

From 1100876a9901f66adf362c17fb85ff23e6addf27 Mon Sep 17 00:00:00 2001
From: keptsecret <sorchon@gmail.com>
Date: Wed, 28 May 2025 15:06:52 +0700
Subject: [PATCH 316/529] limit workgroup count

---
 23_Arithmetic2UnitTest/main.cpp | 4 +++-
 1 file changed, 3 insertions(+), 1 deletion(-)

diff --git a/23_Arithmetic2UnitTest/main.cpp b/23_Arithmetic2UnitTest/main.cpp
index 2daa772ae..176ef993e 100644
--- a/23_Arithmetic2UnitTest/main.cpp
+++ b/23_Arithmetic2UnitTest/main.cpp
@@ -365,7 +365,9 @@ class Workgroup2ScanTestApp final : public application_templates::BasicMultiQueu
 		{
 			itemsPerWG = workgroupSize;
 			workgroupCount = elementCount / (itemsPerWG * itemsPerInvoc);
-		}	
+		}
+		workgroupCount = min(workgroupCount, m_physicalDevice->getLimits().maxComputeWorkGroupCount[0]);
+
 		cmdbuf->begin(IGPUCommandBuffer::USAGE::NONE);
 		cmdbuf->bindComputePipeline(pipeline.get());
 		cmdbuf->pushConstants(pipelineLayout.get(), IShader::E_SHADER_STAGE::ESS_COMPUTE, 0, sizeof(PushConstantData), &pc);

From a1c6dd1a16bb09a76e43525c091a8561111ee16d Mon Sep 17 00:00:00 2001
From: Erfan Ahmadi <ahmadierfan99@gmail.com>
Date: Wed, 28 May 2025 09:18:42 +0400
Subject: [PATCH 317/529] forceUpdate in ensureStaticImageAvailability

---
 62_CAD/DrawResourcesFiller.cpp | 49 +++++++++++++++++++++++++++-------
 62_CAD/DrawResourcesFiller.h   | 11 +++-----
 62_CAD/Images.h                |  8 ++++++
 3 files changed, 50 insertions(+), 18 deletions(-)

diff --git a/62_CAD/DrawResourcesFiller.cpp b/62_CAD/DrawResourcesFiller.cpp
index c81f781bf..bbdb4eb7c 100644
--- a/62_CAD/DrawResourcesFiller.cpp
+++ b/62_CAD/DrawResourcesFiller.cpp
@@ -362,15 +362,44 @@ void DrawResourcesFiller::drawFontGlyph(
 
 bool DrawResourcesFiller::ensureStaticImageAvailability(const StaticImageInfo& staticImage, SIntendedSubmitInfo& intendedNextSubmit)
 {
-	const auto& imageID = staticImage.imageID;
-	const auto& cpuImage = staticImage.cpuImage;
-	
 	// Try inserting or updating the image usage in the cache.
 	// If the image is already present, updates its semaphore value.
 	auto evictCallback = [&](image_id imageID, const CachedImageRecord& evicted) { evictImage_SubmitIfNeeded(imageID, evicted, intendedNextSubmit); };
-	CachedImageRecord* cachedImageRecord = imagesCache->insert(imageID, intendedNextSubmit.getFutureScratchSemaphore().value, evictCallback);
+	CachedImageRecord* cachedImageRecord = imagesCache->insert(staticImage.imageID, intendedNextSubmit.getFutureScratchSemaphore().value, evictCallback);
 	cachedImageRecord->lastUsedFrameIndex = currentFrameIndex; // in case there was an eviction + auto-submit, we need to update AGAIN
 
+	if (cachedImageRecord->arrayIndex != InvalidTextureIndex && staticImage.forceUpdate)
+	{
+		// found in cache, and we want to force new data into the image
+		if (cachedImageRecord->staticCPUImage)
+		{
+			const auto cachedImageParams = cachedImageRecord->staticCPUImage->getCreationParameters();
+			const auto newImageParams = staticImage.cpuImage->getCreationParameters();
+			const bool needsRecreation = newImageParams != cachedImageParams;
+			if (needsRecreation)
+			{
+				// call the eviction callback so the currently cached imageID gets eventually deallocated from memory arena along with it's allocated array slot from the suballocated descriptor set
+				evictCallback(staticImage.imageID, *cachedImageRecord);
+					
+				// Instead of erasing and inserting the imageID into the cache, we just reset it, so the next block of code goes into array index allocation + creating our new image
+				// imagesCache->erase(imageID);
+				// cachedImageRecord = imagesCache->insert(imageID, intendedNextSubmit.getFutureScratchSemaphore().value, evictCallback);
+				*cachedImageRecord = CachedImageRecord(currentFrameIndex);
+			}
+			else
+			{
+				// Doesn't need image recreation, we'll use the same array index in descriptor set + the same bound memory.
+				// reset it's state + update the cpu image used for copying.
+				cachedImageRecord->state = ImageState::CREATED_AND_MEMORY_BOUND; 
+				cachedImageRecord->staticCPUImage = staticImage.cpuImage;
+			}
+		}
+		else
+		{
+			// TODO[LOG]: ? found static image has empty cpu image, shouldn't happen
+		}
+	}
+
 	// if cachedImageRecord->index was not InvalidTextureIndex then it means we had a cache hit and updated the value of our sema
 	// in which case we don't queue anything for upload, and return the idx
 	if (cachedImageRecord->arrayIndex == InvalidTextureIndex)
@@ -386,7 +415,7 @@ bool DrawResourcesFiller::ensureStaticImageAvailability(const StaticImageInfo& s
 			auto* physDev = m_utilities->getLogicalDevice()->getPhysicalDevice();
 
 			IGPUImage::SCreationParams imageParams = {};
-			imageParams = cpuImage->getCreationParameters();
+			imageParams = staticImage.cpuImage->getCreationParameters();
 			imageParams.usage |= IGPUImage::EUF_TRANSFER_DST_BIT|IGPUImage::EUF_SAMPLED_BIT;
 			// promote format because RGB8 and friends don't actually exist in HW
 			{
@@ -398,7 +427,7 @@ bool DrawResourcesFiller::ensureStaticImageAvailability(const StaticImageInfo& s
 			}
 
 			// Attempt to create a GPU image and image view for this texture.
-			ImageAllocateResults allocResults = tryCreateAndAllocateImage_SubmitIfNeeded(imageParams, intendedNextSubmit, std::to_string(imageID));
+			ImageAllocateResults allocResults = tryCreateAndAllocateImage_SubmitIfNeeded(imageParams, intendedNextSubmit, std::to_string(staticImage.imageID));
 
 			if (allocResults.isValid())
 			{
@@ -408,7 +437,7 @@ bool DrawResourcesFiller::ensureStaticImageAvailability(const StaticImageInfo& s
 				cachedImageRecord->allocationOffset = allocResults.allocationOffset;
 				cachedImageRecord->allocationSize = allocResults.allocationSize;
 				cachedImageRecord->gpuImageView = allocResults.gpuImageView;
-				cachedImageRecord->staticCPUImage = cpuImage;
+				cachedImageRecord->staticCPUImage = staticImage.cpuImage;
 			}
 			else
 			{
@@ -434,8 +463,8 @@ bool DrawResourcesFiller::ensureStaticImageAvailability(const StaticImageInfo& s
 					cachedImageRecord->arrayIndex = InvalidTextureIndex;
 				}
 
-				// erase the entry we failed to fill, no need for `evictImage_SubmitIfNeeded`, because it didn't get to be used in any submit to defer it's memory and index deallocation
-				imagesCache->erase(imageID);
+				// erase the entry we failed to allocate an image for, no need for `evictImage_SubmitIfNeeded`, because it didn't get to be used in any submit to defer it's memory and index deallocation
+				imagesCache->erase(staticImage.imageID);
 			}
 		}
 		else
@@ -516,7 +545,7 @@ bool DrawResourcesFiller::ensureGeoreferencedImageAvailability_AllocateIfNeeded(
 				const bool needsRecreation = cachedImageType != georeferenceImageType || cachedParams != currentParams;
 				if (needsRecreation)
 				{
-					// call the eviction callbacl so the currently cached imageID gets eventually deallocated from memory arena.
+					// call the eviction callback so the currently cached imageID gets eventually deallocated from memory arena.
 					evictCallback(imageID, *cachedImageRecord);
 					
 					// instead of erasing and inserting the imageID into the cache, we just reset it, so the next block of code goes into array index allocation + creating our new image
diff --git a/62_CAD/DrawResourcesFiller.h b/62_CAD/DrawResourcesFiller.h
index 619257e2c..b12eb8920 100644
--- a/62_CAD/DrawResourcesFiller.h
+++ b/62_CAD/DrawResourcesFiller.h
@@ -216,12 +216,6 @@ struct DrawResourcesFiller
 		uint64_t textureID,
 		const DTMSettingsInfo& dtmSettingsInfo,
 		SIntendedSubmitInfo& intendedNextSubmit);
-	
-	struct StaticImageInfo
-	{
-		image_id imageID;
-		core::smart_refctd_ptr<ICPUImage> cpuImage;
-	};
 
 	/**
 	 * @brief Adds a static 2D image to the draw resource set for rendering.
@@ -237,8 +231,9 @@ struct DrawResourcesFiller
 	 *   - Queues the image for uploading via staging in the next submit.
 	 *   - If memory is constrained, attempts to evict other images to free up space.
 	 *
-	 * @param staticImage              Unique identifier for the image resource plus the CPU-side image resource to (possibly) upload.
-	 * @param intendedNextSubmit   Struct representing the upcoming submission, including a semaphore for safe scheduling.
+	 * @param staticImage                       Unique identifier for the image resource plus the CPU-side image resource to (possibly) upload.
+	 * @param staticImage::forceUpdate          If true, bypasses the existing GPU-side cache and forces an update of the image data; Useful when replacing the contents of a static image that may already be resident.
+	 * @param intendedNextSubmit                Struct representing the upcoming submission, including a semaphore for safe scheduling.
 	 *
 	 * @note This function ensures that the descriptor slot is not reused while the GPU may still be reading from it.
 	 *       If an eviction is required and the evicted image is scheduled to be used in the next submit, it triggers
diff --git a/62_CAD/Images.h b/62_CAD/Images.h
index ed09da9d6..bb7b7d3ae 100644
--- a/62_CAD/Images.h
+++ b/62_CAD/Images.h
@@ -208,3 +208,11 @@ struct StreamedImageCopy
 	core::smart_refctd_ptr<ICPUBuffer> srcBuffer; // Make it 'std::future' later?
 	asset::IImage::SBufferCopy region;
 };
+
+// TODO: Rename to StaticImageAvailabilityRequest?
+struct StaticImageInfo
+{
+	image_id imageID = ~0ull;
+	core::smart_refctd_ptr<ICPUImage> cpuImage = nullptr;
+	bool forceUpdate = false; // If true, bypasses the existing GPU-side cache and forces an update of the image data; Useful when replacing the contents of a static image that may already be resident.
+};

From f202ef563249c172d4a6c699379c6793ae939863 Mon Sep 17 00:00:00 2001
From: keptsecret <sorchon@gmail.com>
Date: Thu, 29 May 2025 17:29:00 +0700
Subject: [PATCH 318/529] utility func to get items per wg

---
 23_Arithmetic2UnitTest/main.cpp | 23 ++++++++++++++++++++++-
 1 file changed, 22 insertions(+), 1 deletion(-)

diff --git a/23_Arithmetic2UnitTest/main.cpp b/23_Arithmetic2UnitTest/main.cpp
index 176ef993e..73e6a144e 100644
--- a/23_Arithmetic2UnitTest/main.cpp
+++ b/23_Arithmetic2UnitTest/main.cpp
@@ -214,7 +214,7 @@ class Workgroup2ScanTestApp final : public application_templates::BasicMultiQueu
 					passed = runTest<emulatedScanExclusive, false>(subgroupTestSource, elementCount, subgroupSizeLog2, workgroupSize, ~0u, itemsPerInvocation) && passed;
 					logTestOutcome(passed, workgroupSize);
 
-					const uint32_t itemsPerWG = workgroupSize <= subgroupSize ? workgroupSize * itemsPerInvocation : itemsPerInvocation * max(workgroupSize >> subgroupSizeLog2, subgroupSize) << subgroupSizeLog2;	// TODO use Config somehow
+					const uint32_t itemsPerWG = calculateItemsPerWorkgroup(workgroupSize, subgroupSize, itemsPerInvocation);
 					m_logger->log("Testing Item Count %u", ILogger::ELL_INFO, itemsPerWG);
 					passed = runTest<emulatedReduction, true>(workgroupTestSource, elementCount, subgroupSizeLog2, workgroupSize, itemsPerWG, itemsPerInvocation) && passed;
 					logTestOutcome(passed, itemsPerWG);
@@ -267,6 +267,27 @@ class Workgroup2ScanTestApp final : public application_templates::BasicMultiQueu
 		}
 	}
 
+	// reflects calculations in workgroup2::ArithmeticConfiguration
+	uint32_t calculateItemsPerWorkgroup(const uint32_t workgroupSize, const uint32_t subgroupSize, const uint32_t itemsPerInvocation)
+	{
+		if (workgroupSize <= subgroupSize)
+			return workgroupSize * itemsPerInvocation;
+		
+		const uint8_t subgroupSizeLog2 = hlsl::findMSB(subgroupSize);
+		const uint8_t workgroupSizeLog2 = hlsl::findMSB(workgroupSize);
+
+		const uint16_t levels = (workgroupSizeLog2 == subgroupSizeLog2) ? 1 :
+			(workgroupSizeLog2 > subgroupSizeLog2 * 2 + 2) ? 3 : 2;
+
+		const uint16_t itemsPerInvocationProductLog2 = max(workgroupSizeLog2 - subgroupSizeLog2 * levels, 0);
+		uint16_t itemsPerInvocation1 = (levels == 3) ? min(itemsPerInvocationProductLog2, 2) : itemsPerInvocationProductLog2;
+		itemsPerInvocation1 = uint16_t(1u) << itemsPerInvocation1;
+
+		uint32_t virtualWorkgroupSize = 1u << max(subgroupSizeLog2 * levels, workgroupSizeLog2);
+
+		return itemsPerInvocation * virtualWorkgroupSize;
+	}
+
 	// create pipeline (specialized every test) [TODO: turn into a future/async]
 	smart_refctd_ptr<IGPUComputePipeline> createPipeline(const ICPUShader* overridenUnspecialized, const uint8_t subgroupSizeLog2)
 	{

From 3ccb6f957f978b9f675bce85927e311161e81db2 Mon Sep 17 00:00:00 2001
From: Przemek <minikers21@gmail.com>
Date: Thu, 29 May 2025 13:19:34 +0200
Subject: [PATCH 319/529] Implemented nan height value handling

---
 62_CAD/DrawResourcesFiller.cpp                |  6 +-
 62_CAD/DrawResourcesFiller.h                  |  3 +-
 62_CAD/main.cpp                               | 16 ++--
 62_CAD/shaders/globals.hlsl                   |  3 +-
 .../main_pipeline/fragment_shader.hlsl        | 85 ++++++++++++++-----
 .../shaders/main_pipeline/vertex_shader.hlsl  | 19 ++---
 6 files changed, 87 insertions(+), 45 deletions(-)

diff --git a/62_CAD/DrawResourcesFiller.cpp b/62_CAD/DrawResourcesFiller.cpp
index c81f781bf..a33d16b74 100644
--- a/62_CAD/DrawResourcesFiller.cpp
+++ b/62_CAD/DrawResourcesFiller.cpp
@@ -613,8 +613,7 @@ bool DrawResourcesFiller::queueGeoreferencedImageCopy_Internal(image_id imageID,
 // We don't have an allocator or memory management for texture updates yet, see how `_test_addImageObject` is being temporarily used (Descriptor updates and pipeline barriers) to upload an image into gpu and update a descriptor slot (it will become more sophisticated but doesn't block you)
 void DrawResourcesFiller::drawGridDTM(
 	const float64_t2& topLeft,
-	float64_t width,
-	float64_t height,
+	float64_t2 worldSpaceExtents,
 	float gridCellWidth,
 	uint64_t textureID,
 	const DTMSettingsInfo& dtmSettingsInfo,
@@ -622,8 +621,7 @@ void DrawResourcesFiller::drawGridDTM(
 {
 	GridDTMInfo gridDTMInfo;
 	gridDTMInfo.topLeft = topLeft;
-	gridDTMInfo.height = height;
-	gridDTMInfo.width = width;
+	gridDTMInfo.worldSpaceExtents = worldSpaceExtents;
 	gridDTMInfo.gridCellWidth = gridCellWidth;
 	gridDTMInfo.textureID = getImageIndexFromID(textureID, intendedNextSubmit); // for this to be valid and safe, this function needs to be called immediately after `addStaticImage` function to make sure image is in memory
 
diff --git a/62_CAD/DrawResourcesFiller.h b/62_CAD/DrawResourcesFiller.h
index 619257e2c..09caa3b4f 100644
--- a/62_CAD/DrawResourcesFiller.h
+++ b/62_CAD/DrawResourcesFiller.h
@@ -210,8 +210,7 @@ struct DrawResourcesFiller
 		SIntendedSubmitInfo& intendedNextSubmit);
 
 	void drawGridDTM(const float64_t2& topLeft,
-		float64_t width,
-		float64_t height,
+		float64_t2 worldSpaceExtents,
 		float gridCellWidth,
 		uint64_t textureID,
 		const DTMSettingsInfo& dtmSettingsInfo,
diff --git a/62_CAD/main.cpp b/62_CAD/main.cpp
index 7114c1a06..3336f3609 100644
--- a/62_CAD/main.cpp
+++ b/62_CAD/main.cpp
@@ -77,7 +77,7 @@ constexpr std::array<float, (uint32_t)ExampleMode::CASE_COUNT> cameraExtents =
 	600.0,	// CASE_8
 	600.0,	// CASE_9
 	10.0,	// CASE_BUG
-	600.0	// CASE_11
+	1000.0	// CASE_11
 };
 
 constexpr ExampleMode mode = ExampleMode::CASE_11;
@@ -3569,6 +3569,7 @@ class ComputerAidedDesign final : public examples::SimpleWindowedApplication, pu
 			// 2 - DISCRETE_FIXED_LENGTH_INTERVALS
 			// 3 - CONTINOUS_INTERVALS
 			float animatedAlpha = (std::cos(m_timeElapsed * 0.0005) + 1.0) * 0.5;
+			animatedAlpha = 1.0f;
 			switch (m_shadingModeExample)
 			{
 				case E_HEIGHT_SHADING_MODE::DISCRETE_VARIABLE_LENGTH_INTERVALS:
@@ -3589,8 +3590,8 @@ class ComputerAidedDesign final : public examples::SimpleWindowedApplication, pu
 					dtmInfo.heightShadingInfo.intervalIndexToHeightMultiplier = dtmInfo.heightShadingInfo.intervalLength;
 					dtmInfo.heightShadingInfo.isCenteredShading = false;
 					dtmInfo.heightShadingInfo.heightShadingMode = E_HEIGHT_SHADING_MODE::DISCRETE_FIXED_LENGTH_INTERVALS;
-					dtmInfo.heightShadingInfo.addHeightColorMapEntry(0.0f, float32_t4(0.0f, 0.0f, 1.0f, animatedAlpha));
-					dtmInfo.heightShadingInfo.addHeightColorMapEntry(25.0f, float32_t4(0.0f, 1.0f, 1.0f, animatedAlpha));
+					dtmInfo.heightShadingInfo.addHeightColorMapEntry(-20.0f, float32_t4(0.0f, 0.5f, 0.0f, animatedAlpha));
+					dtmInfo.heightShadingInfo.addHeightColorMapEntry(25.0f, float32_t4(0.0f, 0.7f, 0.0f, animatedAlpha));
 					dtmInfo.heightShadingInfo.addHeightColorMapEntry(50.0f, float32_t4(0.0f, 1.0f, 0.0f, animatedAlpha));
 					dtmInfo.heightShadingInfo.addHeightColorMapEntry(75.0f, float32_t4(1.0f, 1.0f, 0.0f, animatedAlpha));
 					dtmInfo.heightShadingInfo.addHeightColorMapEntry(100.0f, float32_t4(1.0f, 0.0f, 0.0f, animatedAlpha));
@@ -3610,16 +3611,17 @@ class ComputerAidedDesign final : public examples::SimpleWindowedApplication, pu
 				}
 			}
 
-			constexpr float HeightMapCellWidth = 50.0f;
+			constexpr float HeightMapCellWidth = 20.0f;
 			const auto heightMapExtent = gridDTMHeightMap->getCreationParameters().extent;
 			assert(heightMapExtent.width > 0 && heightMapExtent.height > 0);
-			const float heightMapWidth = (heightMapExtent.width - 1) * HeightMapCellWidth;
-			const float heightMapHeight = (heightMapExtent.height - 1) * HeightMapCellWidth;
 
+			float64_t2 worldSpaceExtents;
+			worldSpaceExtents.x = (heightMapExtent.width - 1) * HeightMapCellWidth;
+			worldSpaceExtents.y = (heightMapExtent.height - 1) * HeightMapCellWidth;
 			const uint64_t heightMapTextureID = 0ull;
 			if (!drawResourcesFiller.ensureStaticImageAvailability({ heightMapTextureID, gridDTMHeightMap }, intendedNextSubmit))
 				m_logger->log("Grid DTM height map texture unavailable!", ILogger::ELL_ERROR);
-			drawResourcesFiller.drawGridDTM({ 0.0f, 200.0f }, heightMapWidth, heightMapHeight, HeightMapCellWidth, heightMapTextureID,  dtmInfo, intendedNextSubmit);
+			drawResourcesFiller.drawGridDTM({ -400.0f, 400.0f }, worldSpaceExtents, HeightMapCellWidth, heightMapTextureID,  dtmInfo, intendedNextSubmit);
 		}
 	}
 
diff --git a/62_CAD/shaders/globals.hlsl b/62_CAD/shaders/globals.hlsl
index cd88773f1..0ff238289 100644
--- a/62_CAD/shaders/globals.hlsl
+++ b/62_CAD/shaders/globals.hlsl
@@ -256,8 +256,7 @@ struct GeoreferencedImageInfo
 struct GridDTMInfo
 {
     pfloat64_t2 topLeft; // 2 * 8 = 16 bytes (16)
-    pfloat64_t height; // 8 bytes (24)
-    pfloat64_t width; // 8 bytes (32)
+    pfloat64_t2 worldSpaceExtents; // 16 bytes (32)
     uint32_t textureID; // 4 bytes (36)
     float gridCellWidth; // 4 bytes (40)
     float outlineStipplePatternLengthReciprocal; // 4 bytes (44)
diff --git a/62_CAD/shaders/main_pipeline/fragment_shader.hlsl b/62_CAD/shaders/main_pipeline/fragment_shader.hlsl
index 081a4ef16..45f64aac6 100644
--- a/62_CAD/shaders/main_pipeline/fragment_shader.hlsl
+++ b/62_CAD/shaders/main_pipeline/fragment_shader.hlsl
@@ -117,6 +117,40 @@ float32_t4 calculateFinalColor<true>(const uint2 fragCoord, const float localAlp
     return color;
 }
 
+enum E_CELL_DIAGONAL
+{
+    TOP_LEFT_TO_BOTTOM_RIGHT,
+    BOTTOM_LEFT_TO_TOP_RIGHT,
+    INVALID
+};
+
+E_CELL_DIAGONAL resolveGridDTMCellDiagonal(in float4 cellHeights)
+{
+    static const E_CELL_DIAGONAL DefaultDiagonal = TOP_LEFT_TO_BOTTOM_RIGHT;
+
+    const bool4 invalidHeights = bool4(
+        isnan(cellHeights.x),
+        isnan(cellHeights.y),
+        isnan(cellHeights.z),
+        isnan(cellHeights.w)
+    );
+
+    int invalidHeightsCount = 0;
+    for (int i = 0; i < 4; ++i)
+        invalidHeightsCount += int(invalidHeights[i]);
+
+    if (invalidHeightsCount == 0)
+        return DefaultDiagonal;
+
+    if (invalidHeightsCount > 1)
+        return INVALID;
+
+    if (invalidHeights.x || invalidHeights.z)
+        return TOP_LEFT_TO_BOTTOM_RIGHT;
+    else
+        return BOTTOM_LEFT_TO_TOP_RIGHT;
+}
+
 [[vk::spvexecutionmode(spv::ExecutionModePixelInterlockOrderedEXT)]]
 [shader("pixel")]
 float4 fragMain(PSInput input) : SV_TARGET
@@ -419,9 +453,6 @@ float4 fragMain(PSInput input) : SV_TARGET
             // v0-------v2b   v2a-------v1
             // 
 
-            // TODO: probably needs to be a part of grid dtm settings struct
-            const bool diagonalFromTopLeftToBottomRight = true;
-
             // calculate screen space coordinates of vertices of the current tiranlge within the grid
             float3 v[3];
             nbl::hlsl::shapes::Line<float> outlineLineSegments[2];
@@ -443,13 +474,6 @@ float4 fragMain(PSInput input) : SV_TARGET
 
                 float2 insideCellCoord = gridSpacePos - float2(cellWidth, cellWidth) * cellCoords; // TODO: use fmod instead?
                 
-                // my ASCII art above explains which triangle is A and which is B
-                const bool triangleA = diagonalFromTopLeftToBottomRight ?
-                    insideCellCoord.x < cellWidth - insideCellCoord.y :
-                    insideCellCoord.x < insideCellCoord.y;
-
-                float2 gridSpaceCellTopLeftCoords = cellCoords * cellWidth;
-
                 const float InvalidHeightValue = asfloat(0x7FC00000);
                 float4 cellHeights = float4(InvalidHeightValue, InvalidHeightValue, InvalidHeightValue, InvalidHeightValue);
                 if (textureId != InvalidTextureIndex)
@@ -458,24 +482,47 @@ float4 fragMain(PSInput input) : SV_TARGET
                     const float2 location = (cellCoords + float2(0.5f, 0.5f)) / maxCellCoords;
 
                     cellHeights = textures[NonUniformResourceIndex(textureId)].Gather(textureSampler, float2(location.x, location.y), 0);
-                    if (cellHeights.x == 100.0f)
-                        printf("uv = { %f, %f }cellHeights = { %f, %f, %f, %f }", location.x, location.y, cellHeights.x, cellHeights.y, cellHeights.z, cellHeights.w);
                 }
 
+
+                const E_CELL_DIAGONAL cellDiagonal = resolveGridDTMCellDiagonal(cellHeights);
+                const bool diagonalFromTopLeftToBottomRight = cellDiagonal == E_CELL_DIAGONAL::TOP_LEFT_TO_BOTTOM_RIGHT;
+
+                if (cellDiagonal == E_CELL_DIAGONAL::INVALID)
+                    discard;
+
+                // my ASCII art above explains which triangle is A and which is B
+                const bool triangleA = diagonalFromTopLeftToBottomRight ?
+                    insideCellCoord.x < insideCellCoord.y :
+                    insideCellCoord.x < cellWidth - insideCellCoord.y;
+
+                float2 gridSpaceCellTopLeftCoords = cellCoords * cellWidth;
+
+                //printf("uv = { %f, %f } diagonalTLtoBR = %i triangleA = %i, insiceCellCoords = { %f, %f }", uv.x, uv.y, int(diagonalFromTopLeftToBottomRight), int(triangleA), insideCellCoord.x / cellWidth, insideCellCoord.y / cellWidth);
+
                 if (diagonalFromTopLeftToBottomRight)
-                {
-                    v[0] = float3(gridSpaceCellTopLeftCoords.x, gridSpaceCellTopLeftCoords.y + cellWidth, cellHeights.x);
-                    v[1] = float3(gridSpaceCellTopLeftCoords.x + cellWidth, gridSpaceCellTopLeftCoords.y, cellHeights.z);
-                    v[2] = triangleA ? float3(gridSpaceCellTopLeftCoords.x, gridSpaceCellTopLeftCoords.y, cellHeights.w) : float3(gridSpaceCellTopLeftCoords.x + cellWidth, gridSpaceCellTopLeftCoords.y + cellWidth, cellHeights.y);
-                }
-                else
                 {
                     v[0] = float3(gridSpaceCellTopLeftCoords.x, gridSpaceCellTopLeftCoords.y, cellHeights.w);
                     v[1] = float3(gridSpaceCellTopLeftCoords.x + cellWidth, gridSpaceCellTopLeftCoords.y + cellWidth, cellHeights.y);
                     v[2] = triangleA ? float3(gridSpaceCellTopLeftCoords.x, gridSpaceCellTopLeftCoords.y + cellWidth, cellHeights.x) : float3(gridSpaceCellTopLeftCoords.x + cellWidth, gridSpaceCellTopLeftCoords.y, cellHeights.z);
                 }
+                else
+                {
+                    v[0] = float3(gridSpaceCellTopLeftCoords.x, gridSpaceCellTopLeftCoords.y + cellWidth, cellHeights.x);
+                    v[1] = float3(gridSpaceCellTopLeftCoords.x + cellWidth, gridSpaceCellTopLeftCoords.y, cellHeights.z);
+                    v[2] = triangleA ? float3(gridSpaceCellTopLeftCoords.x, gridSpaceCellTopLeftCoords.y, cellHeights.w) : float3(gridSpaceCellTopLeftCoords.x + cellWidth, gridSpaceCellTopLeftCoords.y + cellWidth, cellHeights.y);
+                }
+
+                if (triangleA)
+                    printf("v0 = { %f, %f }, v1 = { %f, %f }, v2 = { %f, %f }", v[0].x, v[0].y, v[1].x, v[1].y, v[2].x, v[2].y);
+
+                bool isTriangleInvalid = isnan(v[0].z) || isnan(v[1].z) || isnan(v[2].z);
+                bool isCellPartiallyInvalid = isnan(cellHeights.x) || isnan(cellHeights.y) || isnan(cellHeights.z) || isnan(cellHeights.w);
+
+                if (!isTriangleInvalid && isCellPartiallyInvalid)
+                    printf("asdf");
 
-                if (isnan(v[0].z) || isnan(v[1].z) || isnan(v[2].z))
+                if (isTriangleInvalid)
                     discard;
 
                 // move from grid space to screen space
diff --git a/62_CAD/shaders/main_pipeline/vertex_shader.hlsl b/62_CAD/shaders/main_pipeline/vertex_shader.hlsl
index 7fd533439..cdeea3569 100644
--- a/62_CAD/shaders/main_pipeline/vertex_shader.hlsl
+++ b/62_CAD/shaders/main_pipeline/vertex_shader.hlsl
@@ -648,20 +648,17 @@ PSInput main(uint vertexID : SV_VertexID)
         else if (objType == ObjectType::GRID_DTM)
         {
             pfloat64_t2 topLeft = vk::RawBufferLoad<pfloat64_t2>(globals.pointers.geometryBuffer + drawObj.geometryAddress, 8u);
-            pfloat64_t height = vk::RawBufferLoad<pfloat64_t>(globals.pointers.geometryBuffer + drawObj.geometryAddress + sizeof(pfloat64_t2), 8u);
-            pfloat64_t width = vk::RawBufferLoad<pfloat64_t>(globals.pointers.geometryBuffer + drawObj.geometryAddress + sizeof(pfloat64_t2) + sizeof(pfloat64_t), 8u);
-            uint32_t textureID = vk::RawBufferLoad<uint32_t>(globals.pointers.geometryBuffer + drawObj.geometryAddress + sizeof(pfloat64_t2) + 2 * sizeof(pfloat64_t), 8u);
-            float gridCellWidth = vk::RawBufferLoad<float>(globals.pointers.geometryBuffer + drawObj.geometryAddress + sizeof(pfloat64_t2) + 2 * sizeof(pfloat64_t) + sizeof(uint32_t), 8u);
-            float reciprocalOutlineStipplePatternLength = vk::RawBufferLoad<float>(globals.pointers.geometryBuffer + drawObj.geometryAddress + sizeof(pfloat64_t2) + 2 * sizeof(pfloat64_t) + sizeof(uint32_t) + sizeof(float), 8u);
+            pfloat64_t2 worldSpaceExtents = vk::RawBufferLoad<pfloat64_t2>(globals.pointers.geometryBuffer + drawObj.geometryAddress + sizeof(pfloat64_t2), 8u);
+            uint32_t textureID = vk::RawBufferLoad<uint32_t>(globals.pointers.geometryBuffer + drawObj.geometryAddress + 2 * sizeof(pfloat64_t2), 8u);
+            float gridCellWidth = vk::RawBufferLoad<float>(globals.pointers.geometryBuffer + drawObj.geometryAddress + 2 * sizeof(pfloat64_t2) + sizeof(uint32_t), 8u);
+            float reciprocalOutlineStipplePatternLength = vk::RawBufferLoad<float>(globals.pointers.geometryBuffer + drawObj.geometryAddress + 2 * sizeof(pfloat64_t2) + sizeof(uint32_t) + sizeof(float), 8u);
 
             const float2 corner = float2(bool2(vertexIdx & 0x1u, vertexIdx >> 1));
-            pfloat64_t2 gridExtents;
-            gridExtents.x = width;
-            gridExtents.y = -height;
+            worldSpaceExtents.y = -worldSpaceExtents.y;
 
             pfloat64_t2 vtxPos = topLeft;
-            vtxPos = vtxPos + corner * gridExtents;
-            gridExtents.y = -gridExtents.y;
+            vtxPos = vtxPos + corner * worldSpaceExtents;
+            worldSpaceExtents.y = -worldSpaceExtents.y;
 
             float2 ndcVtxPos = _static_cast<float2>(transformPointNdc(clipProjectionData.projectionToNDC, vtxPos));
             outV.position = float4(ndcVtxPos, 0.0f, 1.0f);
@@ -670,7 +667,7 @@ PSInput main(uint vertexID : SV_VertexID)
             outV.setGridDTMScreenSpaceCellWidth(gridCellWidth * globals.screenToWorldRatio);
             outV.setGridDTMScreenSpacePosition(transformPointScreenSpace(clipProjectionData.projectionToNDC, globals.resolution, vtxPos));
             outV.setGridDTMScreenSpaceTopLeft(transformPointScreenSpace(clipProjectionData.projectionToNDC, globals.resolution, topLeft));
-            outV.setGridDTMScreenSpaceGridExtents(_static_cast<float2>(gridExtents) * globals.screenToWorldRatio);
+            outV.setGridDTMScreenSpaceGridExtents(_static_cast<float2>(worldSpaceExtents) * globals.screenToWorldRatio);
             outV.setImageUV(corner);
             outV.setGridDTMOutlineStipplePatternLengthReciprocal(reciprocalOutlineStipplePatternLength);
         }

From 52d5670455047142c2865a4341398071a439bdfc Mon Sep 17 00:00:00 2001
From: Przemek <minikers21@gmail.com>
Date: Thu, 29 May 2025 16:29:11 +0200
Subject: [PATCH 320/529] Added test polyline draw

---
 62_CAD/main.cpp | 24 +++++++++++++++++++++++-
 1 file changed, 23 insertions(+), 1 deletion(-)

diff --git a/62_CAD/main.cpp b/62_CAD/main.cpp
index 3336f3609..27589f1d2 100644
--- a/62_CAD/main.cpp
+++ b/62_CAD/main.cpp
@@ -3616,12 +3616,34 @@ class ComputerAidedDesign final : public examples::SimpleWindowedApplication, pu
 			assert(heightMapExtent.width > 0 && heightMapExtent.height > 0);
 
 			float64_t2 worldSpaceExtents;
+			const float64_t2 topLeft = { -400.0f, 400.0f };
 			worldSpaceExtents.x = (heightMapExtent.width - 1) * HeightMapCellWidth;
 			worldSpaceExtents.y = (heightMapExtent.height - 1) * HeightMapCellWidth;
 			const uint64_t heightMapTextureID = 0ull;
 			if (!drawResourcesFiller.ensureStaticImageAvailability({ heightMapTextureID, gridDTMHeightMap }, intendedNextSubmit))
 				m_logger->log("Grid DTM height map texture unavailable!", ILogger::ELL_ERROR);
-			drawResourcesFiller.drawGridDTM({ -400.0f, 400.0f }, worldSpaceExtents, HeightMapCellWidth, heightMapTextureID,  dtmInfo, intendedNextSubmit);
+			drawResourcesFiller.drawGridDTM(topLeft, worldSpaceExtents, HeightMapCellWidth, heightMapTextureID,  dtmInfo, intendedNextSubmit);
+
+			// draw test polyline
+			{
+				LineStyleInfo style = {};
+				style.screenSpaceLineWidth = 0.0f;
+				style.worldSpaceLineWidth = 15.0f;
+				style.color = float32_t4(0.7f, 0.3f, 0.1f, 0.5f);
+
+				CPolyline polyline;
+				{
+					std::vector<float64_t2> linePoints;
+					linePoints.push_back(topLeft);
+					linePoints.push_back(topLeft + float64_t2(worldSpaceExtents.x, 0.0));
+					linePoints.push_back(topLeft + float64_t2(worldSpaceExtents.x, -worldSpaceExtents.y));
+					linePoints.push_back(topLeft + float64_t2(0.0, -worldSpaceExtents.y));
+					linePoints.push_back(topLeft);
+					polyline.addLinePoints(linePoints);
+				}
+
+				drawResourcesFiller.drawPolyline(polyline, style, intendedNextSubmit);
+			}
 		}
 	}
 

From 93b78108b433cfb85407c5f6816adc4c58b0fb7b Mon Sep 17 00:00:00 2001
From: keptsecret <sorchon@gmail.com>
Date: Fri, 30 May 2025 15:56:16 +0700
Subject: [PATCH 321/529] added check for vk spec requirement

---
 23_Arithmetic2UnitTest/main.cpp | 4 ++++
 1 file changed, 4 insertions(+)

diff --git a/23_Arithmetic2UnitTest/main.cpp b/23_Arithmetic2UnitTest/main.cpp
index 73e6a144e..b172d79e7 100644
--- a/23_Arithmetic2UnitTest/main.cpp
+++ b/23_Arithmetic2UnitTest/main.cpp
@@ -193,11 +193,15 @@ class Workgroup2ScanTestApp final : public application_templates::BasicMultiQueu
 		const auto MaxWorkgroupSize = m_physicalDevice->getLimits().maxComputeWorkGroupInvocations;
 		const auto MinSubgroupSize = m_physicalDevice->getLimits().minSubgroupSize;
 		const auto MaxSubgroupSize = m_physicalDevice->getLimits().maxSubgroupSize;
+		const auto MaxComputeWorkgroupSubgroups = m_physicalDevice->getLimits().maxComputeWorkgroupSubgroups;
 		for (auto subgroupSize=MinSubgroupSize; subgroupSize <= MaxSubgroupSize; subgroupSize *= 2u)
 		{
 			const uint8_t subgroupSizeLog2 = hlsl::findMSB(subgroupSize);
 			for (uint32_t workgroupSize = subgroupSize; workgroupSize <= MaxWorkgroupSize; workgroupSize *= 2u)
 			{
+				if (workgroupSize > subgroupSize * MaxComputeWorkgroupSubgroups)
+					continue;	// vk spec requirement: https://vulkan.lunarg.com/doc/view/1.4.304.0/windows/1.4-extensions/vkspec.html#VUID-VkPipelineShaderStageCreateInfo-pNext-02756
+
 				// make sure renderdoc captures everything for debugging
 				m_api->startCapture();
 				m_logger->log("Testing Workgroup Size %u with Subgroup Size %u", ILogger::ELL_INFO, workgroupSize, subgroupSize);

From f72a308feefebfd47e22fea756ed8a50dd78e30d Mon Sep 17 00:00:00 2001
From: Przemek <minikers21@gmail.com>
Date: Fri, 30 May 2025 16:36:19 +0200
Subject: [PATCH 322/529] Fixed vertex shader so it now compiles with emulated
 float

---
 .../main_pipeline/fragment_shader.hlsl        |  7 ---
 .../shaders/main_pipeline/vertex_shader.hlsl  | 56 ++++++++++++++++---
 2 files changed, 49 insertions(+), 14 deletions(-)

diff --git a/62_CAD/shaders/main_pipeline/fragment_shader.hlsl b/62_CAD/shaders/main_pipeline/fragment_shader.hlsl
index 45f64aac6..eacc4ae64 100644
--- a/62_CAD/shaders/main_pipeline/fragment_shader.hlsl
+++ b/62_CAD/shaders/main_pipeline/fragment_shader.hlsl
@@ -461,7 +461,6 @@ float4 fragMain(PSInput input) : SV_TARGET
                 float2 topLeft = input.getGridDTMScreenSpaceTopLeft();
                 float2 gridExtents = input.getGridDTMScreenSpaceGridExtents();
                 float cellWidth = input.getGridDTMScreenSpaceCellWidth();
-                float2 uv = input.getImageUV();
 
                 float2 gridSpacePos = uv * gridExtents;
 
@@ -513,15 +512,9 @@ float4 fragMain(PSInput input) : SV_TARGET
                     v[2] = triangleA ? float3(gridSpaceCellTopLeftCoords.x, gridSpaceCellTopLeftCoords.y, cellHeights.w) : float3(gridSpaceCellTopLeftCoords.x + cellWidth, gridSpaceCellTopLeftCoords.y + cellWidth, cellHeights.y);
                 }
 
-                if (triangleA)
-                    printf("v0 = { %f, %f }, v1 = { %f, %f }, v2 = { %f, %f }", v[0].x, v[0].y, v[1].x, v[1].y, v[2].x, v[2].y);
-
                 bool isTriangleInvalid = isnan(v[0].z) || isnan(v[1].z) || isnan(v[2].z);
                 bool isCellPartiallyInvalid = isnan(cellHeights.x) || isnan(cellHeights.y) || isnan(cellHeights.z) || isnan(cellHeights.w);
 
-                if (!isTriangleInvalid && isCellPartiallyInvalid)
-                    printf("asdf");
-
                 if (isTriangleInvalid)
                     discard;
 
diff --git a/62_CAD/shaders/main_pipeline/vertex_shader.hlsl b/62_CAD/shaders/main_pipeline/vertex_shader.hlsl
index cdeea3569..f01f1ca4b 100644
--- a/62_CAD/shaders/main_pipeline/vertex_shader.hlsl
+++ b/62_CAD/shaders/main_pipeline/vertex_shader.hlsl
@@ -654,22 +654,64 @@ PSInput main(uint vertexID : SV_VertexID)
             float reciprocalOutlineStipplePatternLength = vk::RawBufferLoad<float>(globals.pointers.geometryBuffer + drawObj.geometryAddress + 2 * sizeof(pfloat64_t2) + sizeof(uint32_t) + sizeof(float), 8u);
 
             const float2 corner = float2(bool2(vertexIdx & 0x1u, vertexIdx >> 1));
-            worldSpaceExtents.y = -worldSpaceExtents.y;
+            worldSpaceExtents.y = ieee754::flipSign(worldSpaceExtents.y);
 
             pfloat64_t2 vtxPos = topLeft;
-            vtxPos = vtxPos + corner * worldSpaceExtents;
-            worldSpaceExtents.y = -worldSpaceExtents.y;
-
-            float2 ndcVtxPos = _static_cast<float2>(transformPointNdc(clipProjectionData.projectionToNDC, vtxPos));
-            outV.position = float4(ndcVtxPos, 0.0f, 1.0f);
+            vtxPos.x = vtxPos.x + worldSpaceExtents.x * corner.x;
+            vtxPos.y = vtxPos.y + worldSpaceExtents.y * corner.y;
+            worldSpaceExtents.y = ieee754::flipSign(worldSpaceExtents.y);
 
             outV.setGridDTMHeightTextureID(textureID);
             outV.setGridDTMScreenSpaceCellWidth(gridCellWidth * globals.screenToWorldRatio);
             outV.setGridDTMScreenSpacePosition(transformPointScreenSpace(clipProjectionData.projectionToNDC, globals.resolution, vtxPos));
             outV.setGridDTMScreenSpaceTopLeft(transformPointScreenSpace(clipProjectionData.projectionToNDC, globals.resolution, topLeft));
             outV.setGridDTMScreenSpaceGridExtents(_static_cast<float2>(worldSpaceExtents) * globals.screenToWorldRatio);
-            outV.setImageUV(corner);
             outV.setGridDTMOutlineStipplePatternLengthReciprocal(reciprocalOutlineStipplePatternLength);
+
+            // TODO: finish implementing grid dilation
+            // TODO: calculate actual thicknessOfTheThickestLine
+            /*float thicknessOfTheThickestLine = 20.0f;
+
+            static const float SquareRootOfTwo = 1.4142135f;
+            const pfloat64_t dilationFactor = SquareRootOfTwo * thicknessOfTheThickestLine;
+            pfloat64_t2 dilationVector = pfloat64_t2(dilationFactor, dilationFactor);
+
+            if (corner.x == 0.0f && corner.y == 0.0f)
+            {
+                dilationVector.x = -dilationVector.x;
+            }
+            else if (corner.x == 0.0f && corner.y == 1.0f)
+            {
+                dilationVector.x = -dilationVector.x;
+                dilationVector.y = -dilationVector.y;
+            }
+            else if (corner.x == 1.0f && corner.y == 1.0f)
+            {
+                dilationVector.y = -dilationVector.y;
+            }
+
+            const pfloat64_t dilationFactorTimesTwo = dilationFactor * 2.0f;
+            const pfloat64_t2 dilatedGridExtents = worldSpaceExtents + pfloat64_t2(dilationFactorTimesTwo, dilationFactorTimesTwo);
+            
+            float2 uvScale = _static_cast<float2>(worldSpaceExtents) / _static_cast<float2>(dilatedGridExtents);
+            float2 uvOffset = float2(-dilationFactor, -dilationFactor) / _static_cast<float2>(dilatedGridExtents);
+
+            outV.setImageUV(corner * uvScale + uvOffset);
+
+            pfloat64_t2 topLeftToGridCenterVector = worldSpaceExtents * 0.5;
+            topLeftToGridCenterVector.y = -topLeftToGridCenterVector.y;
+            pfloat64_t2 gridCenter = topLeft + topLeftToGridCenterVector;
+
+            pfloat64_t2 dilatedVtxPos = vtxPos + dilationVector;
+
+            printf("actual = { %f, %f } dialated = { %f, %f }", _static_cast<float>(uvScale.x), _static_cast<float>(uvScale.y), _static_cast<float>(dilatedVtxPos.x), _static_cast<float>(dilatedVtxPos.y));
+
+            float2 ndcVtxPos = _static_cast<float2>(transformPointNdc(clipProjectionData.projectionToNDC, dilatedVtxPos));
+            outV.position = float4(ndcVtxPos, 0.0f, 1.0f);*/
+
+            outV.setImageUV(corner);
+            float2 ndcVtxPos = _static_cast<float2>(transformPointNdc(clipProjectionData.projectionToNDC, vtxPos));
+            outV.position = float4(ndcVtxPos, 0.0f, 1.0f);
         }
         else if (objType == ObjectType::STREAMED_IMAGE)
         {

From 5d139c4fbaae84decc0ca84b8eac3b070b396b85 Mon Sep 17 00:00:00 2001
From: Przemek <minikers21@gmail.com>
Date: Sat, 31 May 2025 12:44:50 +0200
Subject: [PATCH 323/529] Updated media

---
 media | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/media b/media
index a98646358..286860ce9 160000
--- a/media
+++ b/media
@@ -1 +1 @@
-Subproject commit a9864635879e5a616ac400eecd8b6451b498fbf1
+Subproject commit 286860ce9510571820d5f6d7e14abdd8ac1b22be

From 7762984070daa3ac424d320ca91ecd3e8f9f0892 Mon Sep 17 00:00:00 2001
From: Przemek <minikers21@gmail.com>
Date: Sat, 31 May 2025 12:50:45 +0200
Subject: [PATCH 324/529] Updated media

---
 media | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/media b/media
index 286860ce9..4d9fcebb1 160000
--- a/media
+++ b/media
@@ -1 +1 @@
-Subproject commit 286860ce9510571820d5f6d7e14abdd8ac1b22be
+Subproject commit 4d9fcebb12f8c52f61882054b0da9bd60b295ced

From 00a1a1dd4f50e6b802b72991cd2437556fe5e65d Mon Sep 17 00:00:00 2001
From: Erfan Ahmadi <ahmadierfan99@gmail.com>
Date: Sat, 31 May 2025 15:37:12 +0400
Subject: [PATCH 325/529] GRID DTM Small Fixes

---
 62_CAD/DrawResourcesFiller.cpp                    | 2 +-
 62_CAD/shaders/main_pipeline/fragment_shader.hlsl | 6 +++---
 2 files changed, 4 insertions(+), 4 deletions(-)

diff --git a/62_CAD/DrawResourcesFiller.cpp b/62_CAD/DrawResourcesFiller.cpp
index 4a63d01a8..3491218a9 100644
--- a/62_CAD/DrawResourcesFiller.cpp
+++ b/62_CAD/DrawResourcesFiller.cpp
@@ -1959,7 +1959,7 @@ bool DrawResourcesFiller::addGridDTM_Internal(const GridDTMInfo& gridDTMInfo, ui
 	DrawObject drawObj = {};
 	drawObj.mainObjIndex = mainObjIdx;
 	drawObj.type_subsectionIdx = uint32_t(static_cast<uint16_t>(ObjectType::GRID_DTM) | (0 << 16));
-	//drawObj.geometryAddress = 0;
+	drawObj.geometryAddress = geometryBufferOffset;
 	drawObjectsToBeFilled[0u] = drawObj;
 
 	return true;
diff --git a/62_CAD/shaders/main_pipeline/fragment_shader.hlsl b/62_CAD/shaders/main_pipeline/fragment_shader.hlsl
index eacc4ae64..49e5c7ac1 100644
--- a/62_CAD/shaders/main_pipeline/fragment_shader.hlsl
+++ b/62_CAD/shaders/main_pipeline/fragment_shader.hlsl
@@ -554,13 +554,13 @@ float4 fragMain(PSInput input) : SV_TARGET
             float2 heightDeriv = fwidth(height);
 
             float4 dtmColor = float4(0.0f, 0.0f, 0.0f, 0.0f);
-            if (dtmSettings.drawOutlineEnabled())
-                dtmColor = dtm::blendUnder(dtmColor, dtm::calculateGridDTMOutlineColor(dtmSettings.outlineLineStyleIdx, outlineLineSegments, input.position.xy, outlinePhaseShift));
             if (dtmSettings.drawContourEnabled())
             {
-                for (uint32_t i = 0; i < dtmSettings.contourSettingsCount; ++i) // TODO: should reverse the order with blendUnder
+                for (int i = dtmSettings.contourSettingsCount-1u; i >= 0; --i) 
                     dtmColor = dtm::blendUnder(dtmColor, dtm::calculateDTMContourColor(dtmSettings.contourSettings[i], v, input.position.xy, height));
             }
+            if (dtmSettings.drawOutlineEnabled())
+                dtmColor = dtm::blendUnder(dtmColor, dtm::calculateGridDTMOutlineColor(dtmSettings.outlineLineStyleIdx, outlineLineSegments, input.position.xy, outlinePhaseShift));
             if (dtmSettings.drawHeightShadingEnabled())
                 dtmColor = dtm::blendUnder(dtmColor, dtm::calculateDTMHeightColor(dtmSettings.heightShadingSettings, v, heightDeriv, input.position.xy, height));
 

From cd802d6ad83510c467b651ca48cdba1fd55c106b Mon Sep 17 00:00:00 2001
From: Erfan Ahmadi <ahmadierfan99@gmail.com>
Date: Sat, 31 May 2025 16:08:40 +0400
Subject: [PATCH 326/529] fixed memcmp in DtmSettings ==

---
 62_CAD/shaders/globals.hlsl | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/62_CAD/shaders/globals.hlsl b/62_CAD/shaders/globals.hlsl
index 0ff238289..56c8b438c 100644
--- a/62_CAD/shaders/globals.hlsl
+++ b/62_CAD/shaders/globals.hlsl
@@ -491,7 +491,7 @@ inline bool operator==(const DTMSettings& lhs, const DTMSettings& rhs)
     {
         if (lhs.contourSettingsCount != rhs.contourSettingsCount)
             return false;
-        if (!memcmp(lhs.contourSettings, rhs.contourSettings, lhs.contourSettingsCount * sizeof(DTMContourSettings)))
+        if (memcmp(lhs.contourSettings, rhs.contourSettings, lhs.contourSettingsCount * sizeof(DTMContourSettings)))
             return false;
     }
 
@@ -507,9 +507,9 @@ inline bool operator==(const DTMSettings& lhs, const DTMSettings& rhs)
             return false;
         
                 
-        if(!memcmp(lhs.heightShadingSettings.heightColorMapHeights, rhs.heightShadingSettings.heightColorMapHeights, lhs.heightShadingSettings.heightColorEntryCount * sizeof(float)))
+        if(memcmp(lhs.heightShadingSettings.heightColorMapHeights, rhs.heightShadingSettings.heightColorMapHeights, lhs.heightShadingSettings.heightColorEntryCount * sizeof(float)))
             return false;
-        if(!memcmp(lhs.heightShadingSettings.heightColorMapColors, rhs.heightShadingSettings.heightColorMapColors, lhs.heightShadingSettings.heightColorEntryCount * sizeof(float32_t4)))
+        if(memcmp(lhs.heightShadingSettings.heightColorMapColors, rhs.heightShadingSettings.heightColorMapColors, lhs.heightShadingSettings.heightColorEntryCount * sizeof(float32_t4)))
             return false;
     }
 

From 3a3aaa9fce04cda7726170e2128124d466252a27 Mon Sep 17 00:00:00 2001
From: keptsecret <sorchon@gmail.com>
Date: Mon, 2 Jun 2025 11:31:03 +0700
Subject: [PATCH 327/529] removed maxComputeWorkgroupSubgroups*subgroupsize
 check

---
 23_Arithmetic2UnitTest/main.cpp | 4 ----
 1 file changed, 4 deletions(-)

diff --git a/23_Arithmetic2UnitTest/main.cpp b/23_Arithmetic2UnitTest/main.cpp
index b172d79e7..73e6a144e 100644
--- a/23_Arithmetic2UnitTest/main.cpp
+++ b/23_Arithmetic2UnitTest/main.cpp
@@ -193,15 +193,11 @@ class Workgroup2ScanTestApp final : public application_templates::BasicMultiQueu
 		const auto MaxWorkgroupSize = m_physicalDevice->getLimits().maxComputeWorkGroupInvocations;
 		const auto MinSubgroupSize = m_physicalDevice->getLimits().minSubgroupSize;
 		const auto MaxSubgroupSize = m_physicalDevice->getLimits().maxSubgroupSize;
-		const auto MaxComputeWorkgroupSubgroups = m_physicalDevice->getLimits().maxComputeWorkgroupSubgroups;
 		for (auto subgroupSize=MinSubgroupSize; subgroupSize <= MaxSubgroupSize; subgroupSize *= 2u)
 		{
 			const uint8_t subgroupSizeLog2 = hlsl::findMSB(subgroupSize);
 			for (uint32_t workgroupSize = subgroupSize; workgroupSize <= MaxWorkgroupSize; workgroupSize *= 2u)
 			{
-				if (workgroupSize > subgroupSize * MaxComputeWorkgroupSubgroups)
-					continue;	// vk spec requirement: https://vulkan.lunarg.com/doc/view/1.4.304.0/windows/1.4-extensions/vkspec.html#VUID-VkPipelineShaderStageCreateInfo-pNext-02756
-
 				// make sure renderdoc captures everything for debugging
 				m_api->startCapture();
 				m_logger->log("Testing Workgroup Size %u with Subgroup Size %u", ILogger::ELL_INFO, workgroupSize, subgroupSize);

From 6340d2f0ca983463721c61f2db35630ac7aa5248 Mon Sep 17 00:00:00 2001
From: Erfan Ahmadi <ahmadierfan99@gmail.com>
Date: Mon, 2 Jun 2025 11:32:42 +0400
Subject: [PATCH 328/529] logging in DrawResourcesFiller

---
 62_CAD/DrawResourcesFiller.cpp | 165 +++++++++++++++++++++++----------
 62_CAD/DrawResourcesFiller.h   |   5 +-
 2 files changed, 120 insertions(+), 50 deletions(-)

diff --git a/62_CAD/DrawResourcesFiller.cpp b/62_CAD/DrawResourcesFiller.cpp
index 3491218a9..eaa8eccd2 100644
--- a/62_CAD/DrawResourcesFiller.cpp
+++ b/62_CAD/DrawResourcesFiller.cpp
@@ -3,9 +3,10 @@
 DrawResourcesFiller::DrawResourcesFiller()
 {}
 
-DrawResourcesFiller::DrawResourcesFiller(smart_refctd_ptr<IUtilities>&& utils, IQueue* copyQueue) :
-	m_utilities(utils),
-	m_copyQueue(copyQueue)
+DrawResourcesFiller::DrawResourcesFiller(smart_refctd_ptr<IUtilities>&& utils, IQueue* copyQueue, core::smart_refctd_ptr<system::ILogger>&& logger) :
+	m_utilities(std::move(utils)),
+	m_copyQueue(copyQueue),
+	m_logger(std::move(logger))
 {
 	imagesCache = std::unique_ptr<ImagesCache>(new ImagesCache(ImagesBindingArraySize));
 }
@@ -56,7 +57,7 @@ void DrawResourcesFiller::allocateResourcesBuffer(ILogicalDevice* logicalDevice,
 
 		if (memoryTypeIdx == ~0u)
 		{
-			// TODO: Log, no device local memory found?! weird
+			m_logger.log("allocateResourcesBuffer: no device local memory type found.", nbl::system::ILogger::ELL_ERROR);
 			assert(false);
 		}
 
@@ -76,7 +77,7 @@ void DrawResourcesFiller::allocateResourcesBuffer(ILogicalDevice* logicalDevice,
 		}
 		else
 		{
-			// LOG: Allocation failure to allocate memory arena for images 
+			m_logger.log("failure to allocate memory arena for images", nbl::system::ILogger::ELL_ERROR);
 			assert(false);
 		}
 	}
@@ -88,8 +89,12 @@ void DrawResourcesFiller::allocateMSDFTextures(ILogicalDevice* logicalDevice, ui
 	// TODO: Make this function failable and report insufficient memory
 	asset::E_FORMAT msdfFormat = MSDFTextureFormat;
 	asset::VkExtent3D MSDFsExtent = { msdfsExtent.x, msdfsExtent.y, 1u }; 
-	assert(maxMSDFs <= logicalDevice->getPhysicalDevice()->getLimits().maxImageArrayLayers);
-
+	if (maxMSDFs > logicalDevice->getPhysicalDevice()->getLimits().maxImageArrayLayers)
+	{
+		m_logger.log("requested maxMSDFs is greater than maxImageArrayLayers. lowering the limit...", nbl::system::ILogger::ELL_WARNING);
+		maxMSDFs = logicalDevice->getPhysicalDevice()->getLimits().maxImageArrayLayers;
+	}
+	
 	IPhysicalDevice::SImageFormatPromotionRequest promotionRequest = {};
 	promotionRequest.originalFormat = msdfFormat;
 	promotionRequest.usages = {};
@@ -176,7 +181,7 @@ void DrawResourcesFiller::drawPolyline(const CPolylineBase& polyline, SIntendedS
 	uint32_t mainObjectIdx = acquireActiveMainObjectIndex_SubmitIfNeeded(intendedNextSubmit);
 	if (mainObjectIdx == InvalidMainObjectIdx)
 	{
-		// TODO: assert or log error here
+		m_logger.log("drawPolyline: acquireActiveMainObjectIndex returned invalid index", nbl::system::ILogger::ELL_ERROR);
 		assert(false);
 		return;
 	}
@@ -227,6 +232,12 @@ void DrawResourcesFiller::drawTriangleMesh(
 	drawCallData.isDTMRendering = true;
 
 	uint32_t mainObjectIdx = acquireActiveMainObjectIndex_SubmitIfNeeded(intendedNextSubmit);
+	if (mainObjectIdx == InvalidMainObjectIdx)
+	{
+		m_logger.log("drawTriangleMesh: acquireActiveMainObjectIndex returned invalid index", nbl::system::ILogger::ELL_ERROR);
+		assert(false);
+		return;
+	}
 	drawCallData.dtm.triangleMeshMainObjectIndex = mainObjectIdx;
 
 	ICPUBuffer::SCreationParams geometryBuffParams;
@@ -297,7 +308,9 @@ void DrawResourcesFiller::drawHatch(
 		textureIdx = getMSDFIndexFromInputInfo(msdfInfo, intendedNextSubmit);
 		if (textureIdx == InvalidTextureIndex)
 			textureIdx = addMSDFTexture(msdfInfo, getHatchFillPatternMSDF(fillPattern), intendedNextSubmit);
-		_NBL_DEBUG_BREAK_IF(textureIdx == InvalidTextureIndex); // probably getHatchFillPatternMSDF returned nullptr
+
+		if (textureIdx == InvalidTextureIndex)
+			m_logger.log("drawHatch: textureIdx returned invalid index", nbl::system::ILogger::ELL_ERROR);
 	}
 
 	LineStyleInfo lineStyle = {};
@@ -308,6 +321,13 @@ void DrawResourcesFiller::drawHatch(
 	beginMainObject(MainObjectType::HATCH);
 	
 	uint32_t mainObjectIdx = acquireActiveMainObjectIndex_SubmitIfNeeded(intendedNextSubmit);
+	if (mainObjectIdx == InvalidMainObjectIdx)
+	{
+		m_logger.log("drawHatch: acquireActiveMainObjectIndex returned invalid index", nbl::system::ILogger::ELL_ERROR);
+		assert(false);
+		return;
+	}
+
 	uint32_t currentObjectInSection = 0u; // Object here refers to DrawObject. You can think of it as a Cage.
 	while (currentObjectInSection < hatch.getHatchBoxCount())
 	{
@@ -340,8 +360,13 @@ void DrawResourcesFiller::drawFontGlyph(
 		textureIdx = addMSDFTexture(msdfInput, getGlyphMSDF(fontFace, glyphIdx), intendedNextSubmit);
 
 	uint32_t mainObjIdx = acquireActiveMainObjectIndex_SubmitIfNeeded(intendedNextSubmit);
-	assert(mainObjIdx != InvalidMainObjectIdx);
-
+	if (mainObjIdx == InvalidMainObjectIdx)
+	{
+		m_logger.log("drawFontGlyph: acquireActiveMainObjectIndex returned invalid index", nbl::system::ILogger::ELL_ERROR);
+		assert(false);
+		return;
+	}
+	
 	if (textureIdx != InvalidTextureIndex)
 	{
 		GlyphInfo glyphInfo = GlyphInfo(topLeft, dirU, aspectRatio, textureIdx, minUV);
@@ -349,13 +374,17 @@ void DrawResourcesFiller::drawFontGlyph(
 		{
 			// single font glyph couldn't fit into memory to push to gpu, so we submit rendering current objects and reset geometry buffer and draw objects
 			submitCurrentDrawObjectsAndReset(intendedNextSubmit, mainObjIdx);
-			bool success = addFontGlyph_Internal(glyphInfo, mainObjIdx);
-			assert(success); // this should always be true, otherwise it's either bug in code or not enough memory allocated to hold a single GlyphInfo
+			const bool success = addFontGlyph_Internal(glyphInfo, mainObjIdx);
+			if (!success)
+			{
+				m_logger.log("addFontGlyph_Internal failed, even after overflow-submission, this is irrecoverable.", nbl::system::ILogger::ELL_ERROR);
+				assert(false);
+			}
 		}
 	}
 	else
 	{
-		// TODO: Log, probably getGlyphMSDF(face,glyphIdx) returned nullptr ICPUImage ptr
+		m_logger.log("drawFontGlyph: textureIdx is invalid.", nbl::system::ILogger::ELL_ERROR);
 		_NBL_DEBUG_BREAK_IF(true);
 	}
 }
@@ -396,7 +425,7 @@ bool DrawResourcesFiller::ensureStaticImageAvailability(const StaticImageInfo& s
 		}
 		else
 		{
-			// TODO[LOG]: ? found static image has empty cpu image, shouldn't happen
+			m_logger.log("found static image has empty cpu image, shouldn't happen", nbl::system::ILogger::ELL_ERROR);
 		}
 	}
 
@@ -443,7 +472,7 @@ bool DrawResourcesFiller::ensureStaticImageAvailability(const StaticImageInfo& s
 			{
 				// All attempts to try create the GPU image and its corresponding view have failed.
 				// Most likely cause: insufficient GPU memory or unsupported image parameters.
-				// TODO: Log a warning or error here � `addStaticImage2D` failed, likely due to low VRAM.
+				m_logger.log("ensureStaticImageAvailability failed, likely due to low VRAM.", nbl::system::ILogger::ELL_ERROR);
 				_NBL_DEBUG_BREAK_IF(true);
 
 				if (cachedImageRecord->allocationOffset != ImagesMemorySubAllocator::InvalidAddress)
@@ -469,7 +498,7 @@ bool DrawResourcesFiller::ensureStaticImageAvailability(const StaticImageInfo& s
 		}
 		else
 		{
-			// TODO: log here, index allocation failed.
+			m_logger.log("ensureStaticImageAvailability failed index allocation. shouldn't have happened.", nbl::system::ILogger::ELL_ERROR);
 			cachedImageRecord->arrayIndex = InvalidTextureIndex;
 		}
 	}
@@ -515,8 +544,6 @@ bool DrawResourcesFiller::ensureGeoreferencedImageAvailability_AllocateIfNeeded(
 	ImageType georeferenceImageType;
 	determineGeoreferencedImageCreationParams(imageCreationParams, georeferenceImageType, params);
 
-	assert(georeferenceImageType != ImageType::STATIC);
-
 	// imageParams = cpuImage->getCreationParameters();
 	imageCreationParams.usage |= IGPUImage::EUF_TRANSFER_DST_BIT|IGPUImage::EUF_SAMPLED_BIT;
 	// promote format because RGB8 and friends don't actually exist in HW
@@ -556,12 +583,12 @@ bool DrawResourcesFiller::ensureGeoreferencedImageAvailability_AllocateIfNeeded(
 			}
 			else
 			{
-				// TODO[LOG]
+				m_logger.log("Cached georeferenced image has invalid gpu image.", nbl::system::ILogger::ELL_ERROR);
 			}
 		}
 		else
 		{
-			// TODO[LOG]
+			m_logger.log("Cached georeferenced image has invalid gpu image view.", nbl::system::ILogger::ELL_ERROR);
 		}
 	}
 
@@ -592,7 +619,8 @@ bool DrawResourcesFiller::ensureGeoreferencedImageAvailability_AllocateIfNeeded(
 			{
 				// All attempts to try create the GPU image and its corresponding view have failed.
 				// Most likely cause: insufficient GPU memory or unsupported image parameters.
-				// TODO: Log a warning or error here � `addStaticImage2D` failed, likely due to low VRAM.
+				
+				m_logger.log("ensureGeoreferencedImageAvailability_AllocateIfNeeded failed, likely due to low VRAM.", nbl::system::ILogger::ELL_ERROR);
 				_NBL_DEBUG_BREAK_IF(true);
 
 				if (cachedImageRecord->allocationOffset != ImagesMemorySubAllocator::InvalidAddress)
@@ -618,7 +646,7 @@ bool DrawResourcesFiller::ensureGeoreferencedImageAvailability_AllocateIfNeeded(
 		}
 		else
 		{
-			// TODO: log here, index allocation failed.
+			m_logger.log("ensureGeoreferencedImageAvailability_AllocateIfNeeded failed index allocation. shouldn't have happened.", nbl::system::ILogger::ELL_ERROR);
 			cachedImageRecord->arrayIndex = InvalidTextureIndex;
 		}
 	}
@@ -664,14 +692,23 @@ void DrawResourcesFiller::drawGridDTM(
 	beginMainObject(MainObjectType::GRID_DTM);
 
 	uint32_t mainObjectIdx = acquireActiveMainObjectIndex_SubmitIfNeeded(intendedNextSubmit);
-	assert(mainObjectIdx != InvalidMainObjectIdx);
+	if (mainObjectIdx == InvalidMainObjectIdx)
+	{
+		m_logger.log("drawGridDTM: acquireActiveMainObjectIndex returned invalid index", nbl::system::ILogger::ELL_ERROR);
+		assert(false);
+		return;
+	}
 
 	if (!addGridDTM_Internal(gridDTMInfo, mainObjectIdx))
 	{
 		// single grid DTM couldn't fit into memory to push to gpu, so we submit rendering current objects and reset geometry buffer and draw objects
 		submitCurrentDrawObjectsAndReset(intendedNextSubmit, mainObjectIdx);
-		bool success = addGridDTM_Internal(gridDTMInfo, mainObjectIdx);
-		assert(success); // this should always be true, otherwise it's either bug in code or not enough memory allocated to hold a single GridDTMInfo
+		const bool success = addGridDTM_Internal(gridDTMInfo, mainObjectIdx);
+		if (!success)
+		{
+			m_logger.log("addGridDTM_Internal failed, even after overflow-submission, this is irrecoverable.", nbl::system::ILogger::ELL_ERROR);
+			assert(false);
+		}
 	}
 
 	endMainObject();
@@ -682,6 +719,12 @@ void DrawResourcesFiller::addImageObject(image_id imageID, const OrientedBoundin
 	beginMainObject(MainObjectType::STATIC_IMAGE);
 
 	uint32_t mainObjIdx = acquireActiveMainObjectIndex_SubmitIfNeeded(intendedNextSubmit);
+	if (mainObjIdx == InvalidMainObjectIdx)
+	{
+		m_logger.log("addImageObject: acquireActiveMainObjectIndex returned invalid index", nbl::system::ILogger::ELL_ERROR);
+		assert(false);
+		return;
+	}
 
 	ImageObjectInfo info = {};
 	info.topLeft = obb.topLeft;
@@ -692,8 +735,12 @@ void DrawResourcesFiller::addImageObject(image_id imageID, const OrientedBoundin
 	{
 		// single image object couldn't fit into memory to push to gpu, so we submit rendering current objects and reset geometry buffer and draw objects
 		submitCurrentDrawObjectsAndReset(intendedNextSubmit, mainObjIdx);
-		bool success = addImageObject_Internal(info, mainObjIdx);
-		assert(success); // this should always be true, otherwise it's either bug in code or not enough memory allocated to hold a single image object 
+		const bool success = addImageObject_Internal(info, mainObjIdx);
+		if (!success)
+		{
+			m_logger.log("addImageObject_Internal failed, even after overflow-submission, this is irrecoverable.", nbl::system::ILogger::ELL_ERROR);
+			assert(false);
+		}
 	}
 
 	endMainObject();
@@ -704,6 +751,12 @@ void DrawResourcesFiller::addGeoreferencedImage(image_id imageID, const Georefer
 	beginMainObject(MainObjectType::STREAMED_IMAGE);
 
 	uint32_t mainObjIdx = acquireActiveMainObjectIndex_SubmitIfNeeded(intendedNextSubmit);
+	if (mainObjIdx == InvalidMainObjectIdx)
+	{
+		m_logger.log("addGeoreferencedImage: acquireActiveMainObjectIndex returned invalid index", nbl::system::ILogger::ELL_ERROR);
+		assert(false);
+		return;
+	}
 
 	GeoreferencedImageInfo info = {};
 	info.topLeft = params.worldspaceOBB.topLeft;
@@ -714,8 +767,12 @@ void DrawResourcesFiller::addGeoreferencedImage(image_id imageID, const Georefer
 	{
 		// single image object couldn't fit into memory to push to gpu, so we submit rendering current objects and reset geometry buffer and draw objects
 		submitCurrentDrawObjectsAndReset(intendedNextSubmit, mainObjIdx);
-		bool success = addGeoreferencedImageInfo_Internal(info, mainObjIdx);
-		assert(success); // this should always be true, otherwise it's either bug in code or not enough memory allocated to hold a single GeoreferencedImageInfo 
+		const bool success = addGeoreferencedImageInfo_Internal(info, mainObjIdx);
+		if (!success)
+		{
+			m_logger.log("addGeoreferencedImageInfo_Internal failed, even after overflow-submission, this is irrecoverable.", nbl::system::ILogger::ELL_ERROR);
+			assert(false);
+		}
 	}
 
 	endMainObject();
@@ -806,7 +863,7 @@ bool DrawResourcesFiller::pushAllUploads(SIntendedSubmitInfo& intendedNextSubmit
 
 			if (!successCreateNewImage)
 			{
-				// TODO: Log
+				m_logger.log("Couldn't create new gpu image in pushAllUploads: cache and replay mode.", nbl::system::ILogger::ELL_ERROR);
 				_NBL_DEBUG_BREAK_IF(true);
 				success = false;
 			}
@@ -954,7 +1011,12 @@ bool DrawResourcesFiller::pushBufferUploads(SIntendedSubmitInfo& intendedNextSub
 {
 	copiedResourcesSize = 0ull;
 
-	assert(resourcesCollection.calculateTotalConsumption() <= resourcesGPUBuffer->getSize());
+	if (resourcesCollection.calculateTotalConsumption() > resourcesGPUBuffer->getSize())
+	{
+		m_logger.log("some bug has caused the resourcesCollection to consume more memory than available in resourcesGPUBuffer without overflow submit", nbl::system::ILogger::ELL_ERROR);
+		assert(false);
+		return false;
+	}
 
 	auto copyCPUFilledDrawBuffer = [&](auto& drawBuffer) -> bool
 		{
@@ -963,7 +1025,7 @@ bool DrawResourcesFiller::pushBufferUploads(SIntendedSubmitInfo& intendedNextSub
 
 			if (copyRange.offset + copyRange.size > resourcesGPUBuffer->getSize())
 			{
-				// TODO: LOG ERROR, this shouldn't happen with correct auto-submission mechanism
+				m_logger.log("`copyRange.offset + copyRange.size > resourcesGPUBuffer->getSize()` is true in `copyCPUFilledDrawBuffer`, this shouldn't happen with correct auto-submission mechanism.", nbl::system::ILogger::ELL_ERROR);
 				assert(false);
 				return false;
 			}
@@ -985,7 +1047,7 @@ bool DrawResourcesFiller::pushBufferUploads(SIntendedSubmitInfo& intendedNextSub
 
 			if (copyRange.offset + copyRange.size > resourcesGPUBuffer->getSize())
 			{
-				// TODO: LOG ERROR, this shouldn't happen with correct auto-submission mechanism
+				m_logger.log("`copyRange.offset + copyRange.size > resourcesGPUBuffer->getSize()` is true in `addComputeReservedFilledDrawBuffer`, this shouldn't happen with correct auto-submission mechanism.", nbl::system::ILogger::ELL_ERROR);
 				assert(false);
 				return false;
 			}
@@ -1127,7 +1189,7 @@ bool DrawResourcesFiller::pushMSDFImagesUploads(SIntendedSubmitInfo& intendedNex
 	}
 	else
 	{
-		// TODO: Log no valid command buffer to record into
+		m_logger.log("`copyRange.offset + copyRange.size > resourcesGPUBuffer->getSize()` is true in `addComputeReservedFilledDrawBuffer`, this shouldn't happen with correct auto-submission mechanism.", nbl::system::ILogger::ELL_ERROR);
 		return false;
 	}
 }
@@ -1244,7 +1306,7 @@ bool DrawResourcesFiller::pushStaticImagesUploads(SIntendedSubmitInfo& intendedN
 					imageRecord.state = ImageState::GPU_RESIDENT_WITH_VALID_STATIC_DATA;
 				else
 				{
-					// TODO: LOG
+					m_logger.log("Failed `updateImageViaStagingBuffer` in pushStaticImagesUploads.", nbl::system::ILogger::ELL_ERROR);
 				}
 			}
 
@@ -1292,7 +1354,7 @@ bool DrawResourcesFiller::pushStaticImagesUploads(SIntendedSubmitInfo& intendedN
 
 	if (!success)
 	{
-		// TODO: Log
+		m_logger.log("Failure in `pushStaticImagesUploads`.", nbl::system::ILogger::ELL_ERROR);
 		_NBL_DEBUG_BREAK_IF(true);
 	}
 	return success;
@@ -1416,7 +1478,7 @@ bool DrawResourcesFiller::pushStreamedImagesUploads(SIntendedSubmitInfo& intende
 
 	if (!success)
 	{
-		// TODO: Log
+		m_logger.log("Failure in `pushStreamedImagesUploads`.", nbl::system::ILogger::ELL_ERROR);
 		_NBL_DEBUG_BREAK_IF(true);
 	}
 	return success;
@@ -2055,7 +2117,8 @@ void DrawResourcesFiller::evictImage_SubmitIfNeeded(image_id imageID, const Cach
 {
 	if (evicted.arrayIndex == InvalidTextureIndex)
 	{
-		_NBL_DEBUG_BREAK_IF(true); // shouldn't happen under normal circumstances, TODO: LOG warning
+		m_logger.log("evictImage_SubmitIfNeeded: `evicted.arrayIndex == InvalidTextureIndex` is true, shouldn't happen under normal circumstances.", nbl::system::ILogger::ELL_WARNING);
+		_NBL_DEBUG_BREAK_IF(true);
 		return;
 	}
 	// Later used to release the image's memory range.
@@ -2148,7 +2211,7 @@ DrawResourcesFiller::ImageAllocateResults DrawResourcesFiller::tryCreateAndAlloc
 						else
 						{
 							// irrecoverable error if simple image creation fails.
-							// TODO[LOG]: that's rare, image view creation failed.
+							m_logger.log("tryCreateAndAllocateImage_SubmitIfNeeded: gpuImageView creation failed, that's rare and irrecoverable when adding a new image.", nbl::system::ILogger::ELL_ERROR);
 							_NBL_DEBUG_BREAK_IF(true);
 						}
 
@@ -2158,7 +2221,7 @@ DrawResourcesFiller::ImageAllocateResults DrawResourcesFiller::tryCreateAndAlloc
 					else
 					{
 						// irrecoverable error if simple bindImageMemory fails.
-						// TODO: LOG
+						m_logger.log("tryCreateAndAllocateImage_SubmitIfNeeded: bindImageMemory failed, that's irrecoverable when adding a new image.", nbl::system::ILogger::ELL_ERROR);
 						_NBL_DEBUG_BREAK_IF(true);
 						break;
 					}
@@ -2171,16 +2234,14 @@ DrawResourcesFiller::ImageAllocateResults DrawResourcesFiller::tryCreateAndAlloc
 			}
 			else
 			{
-				// irrecoverable error if memory requirements of the image don't match our preallocated devicememory
-				// TODO: LOG
+				m_logger.log("tryCreateAndAllocateImage_SubmitIfNeeded: memory requirements of the gpu image doesn't match our preallocated device memory, that's irrecoverable when adding a new image.", nbl::system::ILogger::ELL_ERROR);
 				_NBL_DEBUG_BREAK_IF(true);
 				break;
 			}
 		}
 		else
 		{
-			// irrecoverable error if simple image creation fails.
-			// TODO: LOG
+			m_logger.log("tryCreateAndAllocateImage_SubmitIfNeeded: gpuImage creation failed, that's irrecoverable when adding a new image.", nbl::system::ILogger::ELL_ERROR);
 			_NBL_DEBUG_BREAK_IF(true);
 			break;
 		}
@@ -2195,8 +2256,8 @@ DrawResourcesFiller::ImageAllocateResults DrawResourcesFiller::tryCreateAndAlloc
 			// We give up, it's really nothing we can do, no image to evict (alreadyBlockedForDeferredFrees==1) and no more memory to free up (alreadyBlockedForDeferredFrees).
 			// We probably have evicted almost every other texture except the one we just allocated an index for. 
 			// This is most likely due to current image memory requirement being greater than the whole memory allocated for all images
+			m_logger.log("tryCreateAndAllocateImage_SubmitIfNeeded: failed allocating an image, there is nothing more from mcache to evict, the current memory requirement is simply greater than the whole memory allocated for all images.", nbl::system::ILogger::ELL_ERROR);
 			_NBL_DEBUG_BREAK_IF(true);
-			// TODO[LOG]
 			break;
 		}
 
@@ -2282,12 +2343,18 @@ uint32_t DrawResourcesFiller::getMSDFIndexFromInputInfo(const MSDFInputInfo& msd
 uint32_t DrawResourcesFiller::addMSDFTexture(const MSDFInputInfo& msdfInput, core::smart_refctd_ptr<ICPUImage>&& cpuImage, SIntendedSubmitInfo& intendedNextSubmit)
 {
 	if (!cpuImage)
-		return InvalidTextureIndex; // TODO: Log
+	{
+		m_logger.log("addMSDFTexture: cpuImage is nullptr.", nbl::system::ILogger::ELL_ERROR);
+		return InvalidTextureIndex;
+	}
 
 	const auto cpuImageSize = cpuImage->getMipSize(0);
 	const bool sizeMatch = cpuImageSize.x == getMSDFResolution().x && cpuImageSize.y == getMSDFResolution().y && cpuImageSize.z == 1u;
 	if (!sizeMatch)
-		return InvalidTextureIndex; // TODO: Log
+	{
+		m_logger.log("addMSDFTexture: cpuImage size doesn't match with msdf array image.", nbl::system::ILogger::ELL_ERROR);
+		return InvalidTextureIndex;
+	}
 
 	/*
 	 * The `msdfTextureArrayIndexAllocator` manages indices (slots) into a texture array for MSDF images.
@@ -2355,7 +2422,7 @@ uint32_t DrawResourcesFiller::addMSDFTexture(const MSDFInputInfo& msdfInput, cor
 		}
 		else
 		{
-			// TODO: log here, assert will be called in a few lines
+			m_logger.log("addMSDFTexture: index allocation failed.", nbl::system::ILogger::ELL_ERROR);
 			inserted->alloc_idx = InvalidTextureIndex;
 		}
 	}
diff --git a/62_CAD/DrawResourcesFiller.h b/62_CAD/DrawResourcesFiller.h
index e91ff6413..f482d8435 100644
--- a/62_CAD/DrawResourcesFiller.h
+++ b/62_CAD/DrawResourcesFiller.h
@@ -122,7 +122,7 @@ struct DrawResourcesFiller
 	
 	DrawResourcesFiller();
 
-	DrawResourcesFiller(smart_refctd_ptr<IUtilities>&& utils, IQueue* copyQueue);
+	DrawResourcesFiller(smart_refctd_ptr<IUtilities>&& utils, IQueue* copyQueue, core::smart_refctd_ptr<system::ILogger>&& logger);
 
 	typedef std::function<void(SIntendedSubmitInfo&)> SubmitFunc;
 	void setSubmitDrawsFunction(const SubmitFunc& func);
@@ -708,6 +708,9 @@ struct DrawResourcesFiller
 	// Flushes Current Draw Call and adds to drawCalls
 	void flushDrawObjects();
 
+	// Logger
+	nbl::system::logger_opt_smart_ptr m_logger = nullptr;
+
 	// FrameIndex used as a criteria for resource/image eviction in case of limitations
 	uint32_t currentFrameIndex = 0u;
 

From 9e0448c171e3db745de5ee146cd05c8ec5597781 Mon Sep 17 00:00:00 2001
From: Przemek <minikers21@gmail.com>
Date: Tue, 3 Jun 2025 14:46:05 +0200
Subject: [PATCH 329/529] Saving work

---
 62_CAD/shaders/globals.hlsl                   |  2 +-
 .../main_pipeline/fragment_shader.hlsl        | 13 ++++++--
 .../shaders/main_pipeline/vertex_shader.hlsl  | 32 ++++++++++++-------
 3 files changed, 32 insertions(+), 15 deletions(-)

diff --git a/62_CAD/shaders/globals.hlsl b/62_CAD/shaders/globals.hlsl
index 0ff238289..21f33eda3 100644
--- a/62_CAD/shaders/globals.hlsl
+++ b/62_CAD/shaders/globals.hlsl
@@ -260,7 +260,7 @@ struct GridDTMInfo
     uint32_t textureID; // 4 bytes (36)
     float gridCellWidth; // 4 bytes (40)
     float outlineStipplePatternLengthReciprocal; // 4 bytes (44)
-    float _padding; // 4 bytes (48)
+    float thicknessOfTheThickestLine; // 4 bytes (48)
 };
 
 static uint32_t packR11G11B10_UNORM(float32_t3 color)
diff --git a/62_CAD/shaders/main_pipeline/fragment_shader.hlsl b/62_CAD/shaders/main_pipeline/fragment_shader.hlsl
index eacc4ae64..0e5ca93c0 100644
--- a/62_CAD/shaders/main_pipeline/fragment_shader.hlsl
+++ b/62_CAD/shaders/main_pipeline/fragment_shader.hlsl
@@ -453,7 +453,8 @@ float4 fragMain(PSInput input) : SV_TARGET
             // v0-------v2b   v2a-------v1
             // 
 
-            // calculate screen space coordinates of vertices of the current tiranlge within the grid
+            // calculate screen space coordinates of vertices of t
+            // he current tiranlge within the grid
             float3 v[3];
             nbl::hlsl::shapes::Line<float> outlineLineSegments[2];
             float outlinePhaseShift;
@@ -553,6 +554,7 @@ float4 fragMain(PSInput input) : SV_TARGET
             float height = baryCoord.x * v[0].z + baryCoord.y * v[1].z + baryCoord.z * v[2].z;
             float2 heightDeriv = fwidth(height);
 
+
             float4 dtmColor = float4(0.0f, 0.0f, 0.0f, 0.0f);
             if (dtmSettings.drawOutlineEnabled())
                 dtmColor = dtm::blendUnder(dtmColor, dtm::calculateGridDTMOutlineColor(dtmSettings.outlineLineStyleIdx, outlineLineSegments, input.position.xy, outlinePhaseShift));
@@ -561,12 +563,19 @@ float4 fragMain(PSInput input) : SV_TARGET
                 for (uint32_t i = 0; i < dtmSettings.contourSettingsCount; ++i) // TODO: should reverse the order with blendUnder
                     dtmColor = dtm::blendUnder(dtmColor, dtm::calculateDTMContourColor(dtmSettings.contourSettings[i], v, input.position.xy, height));
             }
-            if (dtmSettings.drawHeightShadingEnabled())
+            const bool outOfBoundsUV = uv.x < 0.0f || uv.y < 0.0f || uv.x > 1.0f || uv.y > 1.0f;
+            if (dtmSettings.drawHeightShadingEnabled() && !outOfBoundsUV)
                 dtmColor = dtm::blendUnder(dtmColor, dtm::calculateDTMHeightColor(dtmSettings.heightShadingSettings, v, heightDeriv, input.position.xy, height));
 
             textureColor = dtmColor.rgb;
             localAlpha = dtmColor.a;
 
+            /*if (outOfBoundsUV)
+                textureColor = float3(0.0f, 1.0f, 0.0f);
+            else
+                textureColor = float3(0.0f, 0.0f, 1.0f);
+
+            localAlpha = 0.5f;*/
         }
         else if (objType == ObjectType::STREAMED_IMAGE) 
         {
diff --git a/62_CAD/shaders/main_pipeline/vertex_shader.hlsl b/62_CAD/shaders/main_pipeline/vertex_shader.hlsl
index f01f1ca4b..65f3eea64 100644
--- a/62_CAD/shaders/main_pipeline/vertex_shader.hlsl
+++ b/62_CAD/shaders/main_pipeline/vertex_shader.hlsl
@@ -670,33 +670,42 @@ PSInput main(uint vertexID : SV_VertexID)
 
             // TODO: finish implementing grid dilation
             // TODO: calculate actual thicknessOfTheThickestLine
-            /*float thicknessOfTheThickestLine = 20.0f;
+            float thicknessOfTheThickestLine = 200.0f;
 
             static const float SquareRootOfTwo = 1.4142135f;
             const pfloat64_t dilationFactor = SquareRootOfTwo * thicknessOfTheThickestLine;
             pfloat64_t2 dilationVector = pfloat64_t2(dilationFactor, dilationFactor);
 
+            const pfloat64_t dilationFactorTimesTwo = dilationFactor * 2.0f;
+            const pfloat64_t2 dilatedGridExtents = worldSpaceExtents + pfloat64_t2(dilationFactorTimesTwo, dilationFactorTimesTwo);
+            const float2 uvScale = _static_cast<float2>(worldSpaceExtents) / _static_cast<float2>(dilatedGridExtents);
+            float2 uvOffset = float2(dilationFactor, dilationFactor) / _static_cast<float2>(dilatedGridExtents);
+            uvOffset /= uvScale;
+
             if (corner.x == 0.0f && corner.y == 0.0f)
             {
                 dilationVector.x = -dilationVector.x;
+                uvOffset.x = -uvOffset.x;
+                uvOffset.y = -uvOffset.y;
             }
             else if (corner.x == 0.0f && corner.y == 1.0f)
             {
                 dilationVector.x = -dilationVector.x;
                 dilationVector.y = -dilationVector.y;
+                uvOffset.x = -uvOffset.x;
             }
             else if (corner.x == 1.0f && corner.y == 1.0f)
             {
                 dilationVector.y = -dilationVector.y;
             }
+            else if (corner.x == 1.0f && corner.y == 0.0f)
+            {
+                uvOffset.y = -uvOffset.y;
+            }
 
-            const pfloat64_t dilationFactorTimesTwo = dilationFactor * 2.0f;
-            const pfloat64_t2 dilatedGridExtents = worldSpaceExtents + pfloat64_t2(dilationFactorTimesTwo, dilationFactorTimesTwo);
-            
-            float2 uvScale = _static_cast<float2>(worldSpaceExtents) / _static_cast<float2>(dilatedGridExtents);
-            float2 uvOffset = float2(-dilationFactor, -dilationFactor) / _static_cast<float2>(dilatedGridExtents);
-
-            outV.setImageUV(corner * uvScale + uvOffset);
+            const float2 uv = corner + uvOffset;
+            outV.setImageUV(uv);
+            printf("uv = { %f, %f } scale = { %f, %f }", _static_cast<float>(uv.x), _static_cast<float>(uv.y), _static_cast<float>(uvScale.x), _static_cast<float>(uvScale.y));
 
             pfloat64_t2 topLeftToGridCenterVector = worldSpaceExtents * 0.5;
             topLeftToGridCenterVector.y = -topLeftToGridCenterVector.y;
@@ -704,14 +713,13 @@ PSInput main(uint vertexID : SV_VertexID)
 
             pfloat64_t2 dilatedVtxPos = vtxPos + dilationVector;
 
-            printf("actual = { %f, %f } dialated = { %f, %f }", _static_cast<float>(uvScale.x), _static_cast<float>(uvScale.y), _static_cast<float>(dilatedVtxPos.x), _static_cast<float>(dilatedVtxPos.y));
 
             float2 ndcVtxPos = _static_cast<float2>(transformPointNdc(clipProjectionData.projectionToNDC, dilatedVtxPos));
-            outV.position = float4(ndcVtxPos, 0.0f, 1.0f);*/
+            outV.position = float4(ndcVtxPos, 0.0f, 1.0f);
 
-            outV.setImageUV(corner);
+            /*outV.setImageUV(corner);
             float2 ndcVtxPos = _static_cast<float2>(transformPointNdc(clipProjectionData.projectionToNDC, vtxPos));
-            outV.position = float4(ndcVtxPos, 0.0f, 1.0f);
+            outV.position = float4(ndcVtxPos, 0.0f, 1.0f);*/
         }
         else if (objType == ObjectType::STREAMED_IMAGE)
         {

From fef1cd5f1502ce9cf356f6cfd1045ee9bfb6bd21 Mon Sep 17 00:00:00 2001
From: devsh <devsh@devsh.eu>
Date: Tue, 3 Jun 2025 14:59:19 +0200
Subject: [PATCH 330/529] first outline

---
 29_MeshLoaders/CMakeLists.txt |   36 +-
 29_MeshLoaders/main.cpp       | 1603 ++++++++++++++-------------------
 2 files changed, 723 insertions(+), 916 deletions(-)

diff --git a/29_MeshLoaders/CMakeLists.txt b/29_MeshLoaders/CMakeLists.txt
index a476b6203..07b0fd396 100644
--- a/29_MeshLoaders/CMakeLists.txt
+++ b/29_MeshLoaders/CMakeLists.txt
@@ -1,7 +1,37 @@
-
 include(common RESULT_VARIABLE RES)
 if(NOT RES)
-	message(FATAL_ERROR "common.cmake not found. Should be in {repo_root}/cmake directory")
+        message(FATAL_ERROR "common.cmake not found. Should be in {repo_root}/cmake directory")
 endif()
 
-nbl_create_executable_project("" "" "" "" "${NBL_EXECUTABLE_PROJECT_CREATION_PCH_TARGET}")
\ No newline at end of file
+if(NBL_BUILD_IMGUI)
+	set(NBL_INCLUDE_SERACH_DIRECTORIES
+		"${CMAKE_CURRENT_SOURCE_DIR}/include"
+	)
+
+	list(APPEND NBL_LIBRARIES 
+		imtestengine
+		"${NBL_EXT_IMGUI_UI_LIB}"
+	)
+
+	nbl_create_executable_project("" "" "${NBL_INCLUDE_SERACH_DIRECTORIES}" "${NBL_LIBRARIES}" "${NBL_EXECUTABLE_PROJECT_CREATION_PCH_TARGET}")
+
+	if(NBL_EMBED_BUILTIN_RESOURCES)
+		set(_BR_TARGET_ ${EXECUTABLE_NAME}_builtinResourceData)
+		set(RESOURCE_DIR "app_resources")
+
+		get_filename_component(_SEARCH_DIRECTORIES_ "${CMAKE_CURRENT_SOURCE_DIR}" ABSOLUTE)
+		get_filename_component(_OUTPUT_DIRECTORY_SOURCE_ "${CMAKE_CURRENT_BINARY_DIR}/src" ABSOLUTE)
+		get_filename_component(_OUTPUT_DIRECTORY_HEADER_ "${CMAKE_CURRENT_BINARY_DIR}/include" ABSOLUTE)
+
+		file(GLOB_RECURSE BUILTIN_RESOURCE_FILES RELATIVE "${CMAKE_CURRENT_SOURCE_DIR}/${RESOURCE_DIR}" "${CMAKE_CURRENT_SOURCE_DIR}/${RESOURCE_DIR}/*")
+		foreach(RES_FILE ${BUILTIN_RESOURCE_FILES})
+			LIST_BUILTIN_RESOURCE(RESOURCES_TO_EMBED "${RES_FILE}")
+		endforeach()
+
+		ADD_CUSTOM_BUILTIN_RESOURCES(${_BR_TARGET_} RESOURCES_TO_EMBED "${_SEARCH_DIRECTORIES_}" "${RESOURCE_DIR}" "nbl::this_example::builtin" "${_OUTPUT_DIRECTORY_HEADER_}" "${_OUTPUT_DIRECTORY_SOURCE_}")
+
+		LINK_BUILTIN_RESOURCES_TO_TARGET(${EXECUTABLE_NAME} ${_BR_TARGET_})
+	endif()
+endif()
+
+
diff --git a/29_MeshLoaders/main.cpp b/29_MeshLoaders/main.cpp
index 968f7c42e..feb52936a 100644
--- a/29_MeshLoaders/main.cpp
+++ b/29_MeshLoaders/main.cpp
@@ -2,139 +2,101 @@
 // This file is part of the "Nabla Engine".
 // For conditions of distribution and use, see copyright notice in nabla.h
 
-#include "common.hpp"
-#include "nbl/ext/FullScreenTriangle/FullScreenTriangle.h"
-#include "nbl/builtin/hlsl/indirect_commands.hlsl"
+#include <nabla.h>
+#include "nbl/asset/utils/CGeometryCreator.h"
+#include "nbl/application_templates/MonoAssetManagerAndBuiltinResourceApplication.hpp"
 
+#include "SimpleWindowedApplication.hpp"
 
-class RaytracingPipelineApp final : public examples::SimpleWindowedApplication, public application_templates::MonoAssetManagerAndBuiltinResourceApplication
-{
-	using device_base_t = examples::SimpleWindowedApplication;
-	using asset_base_t = application_templates::MonoAssetManagerAndBuiltinResourceApplication;
-	using clock_t = std::chrono::steady_clock;
-
-	constexpr static inline uint32_t WIN_W = 1280, WIN_H = 720;
-	constexpr static inline uint32_t MaxFramesInFlight = 3u;
-	constexpr static inline uint8_t MaxUITextureCount = 1u;
-	constexpr static inline uint32_t NumberOfProceduralGeometries = 5;
-
-	static constexpr const char* s_lightTypeNames[E_LIGHT_TYPE::ELT_COUNT] = {
-	  "Directional",
-	  "Point",
-	  "Spot"
-	};
-
-	struct ShaderBindingTable
-	{
-		SBufferRange<IGPUBuffer> raygenGroupRange;
-		SBufferRange<IGPUBuffer> hitGroupsRange;
-		uint32_t hitGroupsStride;
-		SBufferRange<IGPUBuffer> missGroupsRange;
-		uint32_t missGroupsStride;
-		SBufferRange<IGPUBuffer> callableGroupsRange;
-		uint32_t callableGroupsStride;
-	};
-
-
-public:
-	inline RaytracingPipelineApp(const path& _localInputCWD, const path& _localOutputCWD, const path& _sharedInputCWD, const path& _sharedOutputCWD)
-		: IApplicationFramework(_localInputCWD, _localOutputCWD, _sharedInputCWD, _sharedOutputCWD)
-	{
-	}
+#include "InputSystem.hpp"
+#include "CEventCallback.hpp"
 
-	inline SPhysicalDeviceFeatures getRequiredDeviceFeatures() const override
-	{
-		auto retval = device_base_t::getRequiredDeviceFeatures();
-		retval.rayTracingPipeline = true;
-		retval.accelerationStructure = true;
-		retval.rayQuery = true;
-		return retval;
-	}
-
-	inline SPhysicalDeviceFeatures getPreferredDeviceFeatures() const override
-	{
-		auto retval = device_base_t::getPreferredDeviceFeatures();
-		retval.accelerationStructureHostCommands = true;
-		return retval;
-	}
+#include "CCamera.hpp"
 
-	inline core::vector<video::SPhysicalDeviceFilter::SurfaceCompatibility> getSurfaces() const override
-	{
-		if (!m_surface)
-		{
-			{
-				auto windowCallback = core::make_smart_refctd_ptr<CEventCallback>(smart_refctd_ptr(m_inputSystem), smart_refctd_ptr(m_logger));
-				IWindow::SCreationParams params = {};
-				params.callback = core::make_smart_refctd_ptr<ISimpleManagedSurface::ICallback>();
-				params.width = WIN_W;
-				params.height = WIN_H;
-				params.x = 32;
-				params.y = 32;
-				params.flags = ui::IWindow::ECF_HIDDEN | IWindow::ECF_BORDERLESS | IWindow::ECF_RESIZABLE;
-				params.windowCaption = "RaytracingPipelineApp";
-				params.callback = windowCallback;
-				const_cast<std::remove_const_t<decltype(m_window)>&>(m_window) = m_winMgr->createWindow(std::move(params));
-			}
+#include <nbl/builtin/hlsl/cpp_compat.hlsl>
+#include <nbl/builtin/hlsl/cpp_compat/matrix.hlsl>
 
-			auto surface = CSurfaceVulkanWin32::create(smart_refctd_ptr(m_api), smart_refctd_ptr_static_cast<IWindowWin32>(m_window));
-			const_cast<std::remove_const_t<decltype(m_surface)>&>(m_surface) = CSimpleResizeSurface<ISimpleManagedSurface::ISwapchainResources>::create(std::move(surface));
-		}
+using namespace nbl;
+using namespace core;
+using namespace hlsl;
+using namespace system;
+using namespace asset;
+using namespace ui;
+using namespace video;
 
-		if (m_surface)
-			return { {m_surface->getSurface()/*,EQF_NONE*/} };
 
-		return {};
-	}
+class MeshLoadersApp final : public examples::SimpleWindowedApplication, public application_templates::MonoAssetManagerAndBuiltinResourceApplication
+{
+		using device_base_t = examples::SimpleWindowedApplication;
+		using asset_base_t = application_templates::MonoAssetManagerAndBuiltinResourceApplication;
 
-	// so that we can use the same queue for asset converter and rendering
-	inline core::vector<queue_req_t> getQueueRequirements() const override
-	{
-		auto reqs = device_base_t::getQueueRequirements();
-		reqs.front().requiredFlags |= IQueue::FAMILY_FLAGS::COMPUTE_BIT;
-		return reqs;
-	}
+		constexpr static inline uint32_t WIN_W = 1280, WIN_H = 720;
+		constexpr static inline uint32_t MaxFramesInFlight = 3u;
+		constexpr static inline uint8_t MaxUITextureCount = 1u;
 
-	inline bool onAppInitialized(smart_refctd_ptr<ISystem>&& system) override
-	{
-		m_inputSystem = make_smart_refctd_ptr<InputSystem>(logger_opt_smart_ptr(smart_refctd_ptr(m_logger)));
 
-		if (!device_base_t::onAppInitialized(smart_refctd_ptr(system)))
-			return false;
-
-		if (!asset_base_t::onAppInitialized(smart_refctd_ptr(system)))
-			return false;
+	public:
+		inline MeshLoadersApp(const path& _localInputCWD, const path& _localOutputCWD, const path& _sharedInputCWD, const path& _sharedOutputCWD)
+			: IApplicationFramework(_localInputCWD, _localOutputCWD, _sharedInputCWD, _sharedOutputCWD)
+		{
+		}
 
-		smart_refctd_ptr<IShaderCompiler::CCache> shaderReadCache = nullptr;
-		smart_refctd_ptr<IShaderCompiler::CCache> shaderWriteCache = core::make_smart_refctd_ptr<IShaderCompiler::CCache>();
-		auto shaderCachePath = localOutputCWD / "main_pipeline_shader_cache.bin";
+		inline SPhysicalDeviceFeatures getPreferredDeviceFeatures() const override
+		{
+			auto retval = device_base_t::getPreferredDeviceFeatures();
+			retval.accelerationStructure = true;
+			retval.rayQuery = true;
+			return retval;
+		}
 
+		inline core::vector<video::SPhysicalDeviceFilter::SurfaceCompatibility> getSurfaces() const override
 		{
-			core::smart_refctd_ptr<system::IFile> shaderReadCacheFile;
+			if (!m_surface)
 			{
-				system::ISystem::future_t<core::smart_refctd_ptr<system::IFile>> future;
-				m_system->createFile(future, shaderCachePath.c_str(), system::IFile::ECF_READ);
-				if (future.wait())
 				{
-					future.acquire().move_into(shaderReadCacheFile);
-					if (shaderReadCacheFile)
-					{
-						const size_t size = shaderReadCacheFile->getSize();
-						if (size > 0ull)
-						{
-							std::vector<uint8_t> contents(size);
-							system::IFile::success_t succ;
-							shaderReadCacheFile->read(succ, contents.data(), 0, size);
-							if (succ)
-								shaderReadCache = IShaderCompiler::CCache::deserialize(contents);
-						}
-					}
+					auto windowCallback = core::make_smart_refctd_ptr<CEventCallback>(smart_refctd_ptr(m_inputSystem), smart_refctd_ptr(m_logger));
+					IWindow::SCreationParams params = {};
+					params.callback = core::make_smart_refctd_ptr<ISimpleManagedSurface::ICallback>();
+					params.width = WIN_W;
+					params.height = WIN_H;
+					params.x = 32;
+					params.y = 32;
+					params.flags = ui::IWindow::ECF_HIDDEN | IWindow::ECF_BORDERLESS | IWindow::ECF_RESIZABLE;
+					params.windowCaption = "MeshLoadersApp";
+					params.callback = windowCallback;
+					const_cast<std::remove_const_t<decltype(m_window)>&>(m_window) = m_winMgr->createWindow(std::move(params));
 				}
-				else
-					m_logger->log("Failed Openning Shader Cache File.", ILogger::ELL_ERROR);
+
+				auto surface = CSurfaceVulkanWin32::create(smart_refctd_ptr(m_api), smart_refctd_ptr_static_cast<IWindowWin32>(m_window));
+				const_cast<std::remove_const_t<decltype(m_surface)>&>(m_surface) = CSimpleResizeSurface<ISimpleManagedSurface::ISwapchainResources>::create(std::move(surface));
 			}
 
+			if (m_surface)
+				return { {m_surface->getSurface()/*,EQF_NONE*/} };
+
+			return {};
 		}
 
+		// so that we can use the same queue for asset converter and rendering
+		inline core::vector<queue_req_t> getQueueRequirements() const override
+		{
+			auto reqs = device_base_t::getQueueRequirements();
+			reqs.front().requiredFlags |= IQueue::FAMILY_FLAGS::TRANSFER_BIT;
+			reqs.front().requiredFlags |= IQueue::FAMILY_FLAGS::COMPUTE_BIT;
+			return reqs;
+		}
+
+		inline bool onAppInitialized(smart_refctd_ptr<ISystem>&& system) override
+		{
+			m_inputSystem = make_smart_refctd_ptr<InputSystem>(logger_opt_smart_ptr(smart_refctd_ptr(m_logger)));
+
+			if (!device_base_t::onAppInitialized(smart_refctd_ptr(system)))
+				return false;
+
+			if (!asset_base_t::onAppInitialized(smart_refctd_ptr(system)))
+				return false;
+
+#if 0
 		// Load Custom Shader
 		auto loadCompileAndCreateShader = [&](const std::string& relPath) -> smart_refctd_ptr<IGPUShader>
 			{
@@ -167,82 +129,57 @@ class RaytracingPipelineApp final : public examples::SimpleWindowedApplication,
 		const auto pointLightCallShader = loadCompileAndCreateShader("app_resources/light_point.rcall.hlsl");
 		const auto spotLightCallShader = loadCompileAndCreateShader("app_resources/light_spot.rcall.hlsl");
 		const auto fragmentShader = loadCompileAndCreateShader("app_resources/present.frag.hlsl");
+#endif
 
-		core::smart_refctd_ptr<system::IFile> shaderWriteCacheFile;
-		{
-			system::ISystem::future_t<core::smart_refctd_ptr<system::IFile>> future;
-			m_system->deleteFile(shaderCachePath); // temp solution instead of trimming, to make sure we won't have corrupted json
-			m_system->createFile(future, shaderCachePath.c_str(), system::IFile::ECF_WRITE);
-			if (future.wait())
-			{
-				future.acquire().move_into(shaderWriteCacheFile);
-				if (shaderWriteCacheFile)
-				{
-					auto serializedCache = shaderWriteCache->serialize();
-					if (shaderWriteCacheFile)
-					{
-						system::IFile::success_t succ;
-						shaderWriteCacheFile->write(succ, serializedCache->getPointer(), 0, serializedCache->getSize());
-						if (!succ)
-							m_logger->log("Failed Writing To Shader Cache File.", ILogger::ELL_ERROR);
-					}
-				}
-				else
-					m_logger->log("Failed Creating Shader Cache File.", ILogger::ELL_ERROR);
-			}
-			else
-				m_logger->log("Failed Creating Shader Cache File.", ILogger::ELL_ERROR);
-		}
-
-		m_semaphore = m_device->createSemaphore(m_realFrameIx);
-		if (!m_semaphore)
-			return logFail("Failed to Create a Semaphore!");
+			m_semaphore = m_device->createSemaphore(m_realFrameIx);
+			if (!m_semaphore)
+				return logFail("Failed to Create a Semaphore!");
 
-		auto gQueue = getGraphicsQueue();
+			auto gQueue = getGraphicsQueue();
 
-		// Create renderpass and init surface
-		nbl::video::IGPURenderpass* renderpass;
-		{
-			ISwapchain::SCreationParams swapchainParams = { .surface = smart_refctd_ptr<ISurface>(m_surface->getSurface()) };
-			if (!swapchainParams.deduceFormat(m_physicalDevice))
-				return logFail("Could not choose a Surface Format for the Swapchain!");
-
-			const static IGPURenderpass::SCreationParams::SSubpassDependency dependencies[] =
+			// Create renderpass and init surface
+			nbl::video::IGPURenderpass* renderpass;
 			{
-			  {
-				.srcSubpass = IGPURenderpass::SCreationParams::SSubpassDependency::External,
-				.dstSubpass = 0,
-				.memoryBarrier =
-				{
-				  .srcStageMask = asset::PIPELINE_STAGE_FLAGS::COPY_BIT,
-				  .srcAccessMask = asset::ACCESS_FLAGS::TRANSFER_WRITE_BIT,
-				  .dstStageMask = asset::PIPELINE_STAGE_FLAGS::COLOR_ATTACHMENT_OUTPUT_BIT,
-				  .dstAccessMask = asset::ACCESS_FLAGS::COLOR_ATTACHMENT_WRITE_BIT
-				}
-			  },
-			  {
-				.srcSubpass = 0,
-				.dstSubpass = IGPURenderpass::SCreationParams::SSubpassDependency::External,
-				.memoryBarrier =
-				{
-				  .srcStageMask = asset::PIPELINE_STAGE_FLAGS::COLOR_ATTACHMENT_OUTPUT_BIT,
-				  .srcAccessMask = asset::ACCESS_FLAGS::COLOR_ATTACHMENT_WRITE_BIT
-				}
-			  },
-			  IGPURenderpass::SCreationParams::DependenciesEnd
-			};
+				ISwapchain::SCreationParams swapchainParams = { .surface = smart_refctd_ptr<ISurface>(m_surface->getSurface()) };
+				if (!swapchainParams.deduceFormat(m_physicalDevice))
+					return logFail("Could not choose a Surface Format for the Swapchain!");
 
-			auto scResources = std::make_unique<CDefaultSwapchainFramebuffers>(m_device.get(), swapchainParams.surfaceFormat.format, dependencies);
-			renderpass = scResources->getRenderpass();
+				const static IGPURenderpass::SCreationParams::SSubpassDependency dependencies[] =
+				{
+				  {
+					.srcSubpass = IGPURenderpass::SCreationParams::SSubpassDependency::External,
+					.dstSubpass = 0,
+					.memoryBarrier =
+					{
+					  .srcStageMask = asset::PIPELINE_STAGE_FLAGS::COPY_BIT,
+					  .srcAccessMask = asset::ACCESS_FLAGS::TRANSFER_WRITE_BIT,
+					  .dstStageMask = asset::PIPELINE_STAGE_FLAGS::COLOR_ATTACHMENT_OUTPUT_BIT,
+					  .dstAccessMask = asset::ACCESS_FLAGS::COLOR_ATTACHMENT_WRITE_BIT
+					}
+				  },
+				  {
+					.srcSubpass = 0,
+					.dstSubpass = IGPURenderpass::SCreationParams::SSubpassDependency::External,
+					.memoryBarrier =
+					{
+					  .srcStageMask = asset::PIPELINE_STAGE_FLAGS::COLOR_ATTACHMENT_OUTPUT_BIT,
+					  .srcAccessMask = asset::ACCESS_FLAGS::COLOR_ATTACHMENT_WRITE_BIT
+					}
+				  },
+				  IGPURenderpass::SCreationParams::DependenciesEnd
+				};
 
-			if (!renderpass)
-				return logFail("Failed to create Renderpass!");
+				auto scResources = std::make_unique<CDefaultSwapchainFramebuffers>(m_device.get(), swapchainParams.surfaceFormat.format, dependencies);
+				renderpass = scResources->getRenderpass();
 
-			if (!m_surface || !m_surface->init(gQueue, std::move(scResources), swapchainParams.sharedParams))
-				return logFail("Could not create Window & Surface or initialize the Surface!");
-		}
+				if (!renderpass)
+					return logFail("Failed to create Renderpass!");
 
-		auto pool = m_device->createCommandPool(gQueue->getFamilyIndex(), IGPUCommandPool::CREATE_FLAGS::RESET_COMMAND_BUFFER_BIT);
+				if (!m_surface || !m_surface->init(gQueue, std::move(scResources), swapchainParams.sharedParams))
+					return logFail("Could not create Window & Surface or initialize the Surface!");
+			}
+#if 0
+			auto pool = m_device->createCommandPool(gQueue->getFamilyIndex(), IGPUCommandPool::CREATE_FLAGS::RESET_COMMAND_BUFFER_BIT);
 
 		m_converter = CAssetConverter::create({ .device = m_device.get(), .optimizer = {} });
 
@@ -253,11 +190,11 @@ class RaytracingPipelineApp final : public examples::SimpleWindowedApplication,
 			if (!pool->createCommandBuffers(IGPUCommandPool::BUFFER_LEVEL::PRIMARY, { m_cmdBufs.data() + i, 1 }))
 				return logFail("Couldn't create Command Buffer!");
 		}
+#endif
+			m_winMgr->setWindowSize(m_window.get(), WIN_W, WIN_H);
+			m_surface->recreateSwapchain();
 
-		m_winMgr->setWindowSize(m_window.get(), WIN_W, WIN_H);
-		m_surface->recreateSwapchain();
-
-
+#if 0
 		// create output images
 		m_hdrImage = m_device->createImage({
 			{
@@ -600,84 +537,84 @@ class RaytracingPipelineApp final : public examples::SimpleWindowedApplication,
 				ImGui::End();
 			}
 		);
+#endif
+			// Set Camera
+			{
+				core::vectorSIMDf cameraPosition(0, 5, -10);
+				matrix4SIMD proj = matrix4SIMD::buildProjectionMatrixPerspectiveFovRH(
+					core::radians(60.0f),
+					WIN_W / WIN_H,
+					0.01f,
+					500.0f
+				);
+				m_camera = Camera(cameraPosition, core::vectorSIMDf(0, 0, 0), proj);
+			}
 
-		// Set Camera
-		{
-			core::vectorSIMDf cameraPosition(0, 5, -10);
-			matrix4SIMD proj = matrix4SIMD::buildProjectionMatrixPerspectiveFovRH(
-				core::radians(60.0f),
-				WIN_W / WIN_H,
-				0.01f,
-				500.0f
-			);
-			m_camera = Camera(cameraPosition, core::vectorSIMDf(0, 0, 0), proj);
-		}
+			m_winMgr->setWindowSize(m_window.get(), WIN_W, WIN_H);
+			m_surface->recreateSwapchain();
+			m_winMgr->show(m_window.get());
+			m_oracle.reportBeginFrameRecord();
+			m_camera.mapKeysToWASD();
 
-		m_winMgr->setWindowSize(m_window.get(), WIN_W, WIN_H);
-		m_surface->recreateSwapchain();
-		m_winMgr->show(m_window.get());
-		m_oracle.reportBeginFrameRecord();
-		m_camera.mapKeysToWASD();
+			return true;
+		}
 
-		return true;
-	}
+		bool updateGUIDescriptorSet()
+		{
+			// texture atlas, note we don't create info & write pair for the font sampler because UI extension's is immutable and baked into DS layout
+			static std::array<IGPUDescriptorSet::SDescriptorInfo, MaxUITextureCount> descriptorInfo;
+			static IGPUDescriptorSet::SWriteDescriptorSet writes[MaxUITextureCount];
 
-	bool updateGUIDescriptorSet()
-	{
-		// texture atlas, note we don't create info & write pair for the font sampler because UI extension's is immutable and baked into DS layout
-		static std::array<IGPUDescriptorSet::SDescriptorInfo, MaxUITextureCount> descriptorInfo;
-		static IGPUDescriptorSet::SWriteDescriptorSet writes[MaxUITextureCount];
+			descriptorInfo[ext::imgui::UI::FontAtlasTexId].info.image.imageLayout = IImage::LAYOUT::READ_ONLY_OPTIMAL;
+			descriptorInfo[ext::imgui::UI::FontAtlasTexId].desc = smart_refctd_ptr<IGPUImageView>(m_ui.manager->getFontAtlasView());
 
-		descriptorInfo[nbl::ext::imgui::UI::FontAtlasTexId].info.image.imageLayout = IImage::LAYOUT::READ_ONLY_OPTIMAL;
-		descriptorInfo[nbl::ext::imgui::UI::FontAtlasTexId].desc = smart_refctd_ptr<IGPUImageView>(m_ui.manager->getFontAtlasView());
+			for (uint32_t i = 0; i < descriptorInfo.size(); ++i)
+			{
+				writes[i].dstSet = m_ui.descriptorSet.get();
+				writes[i].binding = 0u;
+				writes[i].arrayElement = i;
+				writes[i].count = 1u;
+			}
+			writes[ext::imgui::UI::FontAtlasTexId].info = descriptorInfo.data() + ext::imgui::UI::FontAtlasTexId;
 
-		for (uint32_t i = 0; i < descriptorInfo.size(); ++i)
-		{
-			writes[i].dstSet = m_ui.descriptorSet.get();
-			writes[i].binding = 0u;
-			writes[i].arrayElement = i;
-			writes[i].count = 1u;
+			return m_device->updateDescriptorSets(writes, {});
 		}
-		writes[nbl::ext::imgui::UI::FontAtlasTexId].info = descriptorInfo.data() + nbl::ext::imgui::UI::FontAtlasTexId;
-
-		return m_device->updateDescriptorSets(writes, {});
-	}
 
-	inline void workLoopBody() override
-	{
-		// framesInFlight: ensuring safe execution of command buffers and acquires, `framesInFlight` only affect semaphore waits, don't use this to index your resources because it can change with swapchain recreation.
-		const uint32_t framesInFlight = core::min(MaxFramesInFlight, m_surface->getMaxAcquiresInFlight());
-		// We block for semaphores for 2 reasons here:
-		  // A) Resource: Can't use resource like a command buffer BEFORE previous use is finished! [MaxFramesInFlight]
-		  // B) Acquire: Can't have more acquires in flight than a certain threshold returned by swapchain or your surface helper class. [MaxAcquiresInFlight]
-		if (m_realFrameIx >= framesInFlight)
+		inline void workLoopBody() override
 		{
-			const ISemaphore::SWaitInfo cbDonePending[] =
+			// framesInFlight: ensuring safe execution of command buffers and acquires, `framesInFlight` only affect semaphore waits, don't use this to index your resources because it can change with swapchain recreation.
+			const uint32_t framesInFlight = core::min(MaxFramesInFlight, m_surface->getMaxAcquiresInFlight());
+			// We block for semaphores for 2 reasons here:
+			  // A) Resource: Can't use resource like a command buffer BEFORE previous use is finished! [MaxFramesInFlight]
+			  // B) Acquire: Can't have more acquires in flight than a certain threshold returned by swapchain or your surface helper class. [MaxAcquiresInFlight]
+			if (m_realFrameIx >= framesInFlight)
 			{
-			  {
-				.semaphore = m_semaphore.get(),
-				.value = m_realFrameIx + 1 - framesInFlight
-			  }
-			};
-			if (m_device->blockForSemaphores(cbDonePending) != ISemaphore::WAIT_RESULT::SUCCESS)
-				return;
-		}
-		const auto resourceIx = m_realFrameIx % MaxFramesInFlight;
-
-		m_api->startCapture();
+				const ISemaphore::SWaitInfo cbDonePending[] =
+				{
+				  {
+					.semaphore = m_semaphore.get(),
+					.value = m_realFrameIx + 1 - framesInFlight
+				  }
+				};
+				if (m_device->blockForSemaphores(cbDonePending) != ISemaphore::WAIT_RESULT::SUCCESS)
+					return;
+			}
+			const auto resourceIx = m_realFrameIx % MaxFramesInFlight;
 
-		update();
+			m_api->startCapture();
 
-		auto queue = getGraphicsQueue();
-		auto cmdbuf = m_cmdBufs[resourceIx].get();
+//		update();
 
-		if (!keepRunning())
-			return;
+			auto queue = getGraphicsQueue();
+			auto cmdbuf = m_cmdBufs[resourceIx].get();
 
-		cmdbuf->reset(IGPUCommandBuffer::RESET_FLAGS::RELEASE_RESOURCES_BIT);
-		cmdbuf->begin(IGPUCommandBuffer::USAGE::ONE_TIME_SUBMIT_BIT);
-		cmdbuf->beginDebugMarker("RaytracingPipelineApp Frame");
+			if (!keepRunning())
+				return;
 
+			cmdbuf->reset(IGPUCommandBuffer::RESET_FLAGS::RELEASE_RESOURCES_BIT);
+			cmdbuf->begin(IGPUCommandBuffer::USAGE::ONE_TIME_SUBMIT_BIT);
+			cmdbuf->beginDebugMarker("Frame");
+#if 0
 		const auto viewMatrix = m_camera.getViewMatrix();
 		const auto projectionMatrix = m_camera.getProjectionMatrix();
 		const auto viewProjectionMatrix = m_camera.getConcatenatedMatrix();
@@ -686,12 +623,6 @@ class RaytracingPipelineApp final : public examples::SimpleWindowedApplication,
 		modelMatrix.setTranslation(nbl::core::vectorSIMDf(0, 0, 0, 0));
 		modelMatrix.setRotation(quaternion(0, 0, 0));
 
-		core::matrix4SIMD modelViewProjectionMatrix = core::concatenateBFollowedByA(viewProjectionMatrix, modelMatrix);
-		if (m_cachedModelViewProjectionMatrix != modelViewProjectionMatrix)
-		{
-			m_frameAccumulationCounter = 0;
-			m_cachedModelViewProjectionMatrix = modelViewProjectionMatrix;
-		}
 		core::matrix4SIMD invModelViewProjectionMatrix;
 		modelViewProjectionMatrix.getInverseTransform(invModelViewProjectionMatrix);
 
@@ -824,768 +755,619 @@ class RaytracingPipelineApp final : public examples::SimpleWindowedApplication,
 			cmdbuf->endRenderPass();
 
 		}
+#endif
+			cmdbuf->endDebugMarker();
+			cmdbuf->end();
 
-		cmdbuf->endDebugMarker();
-		cmdbuf->end();
-
-		{
-			const IQueue::SSubmitInfo::SSemaphoreInfo rendered[] =
-			{
-			  {
-				.semaphore = m_semaphore.get(),
-				.value = ++m_realFrameIx,
-				.stageMask = PIPELINE_STAGE_FLAGS::ALL_TRANSFER_BITS
-			  }
-			};
 			{
+				const IQueue::SSubmitInfo::SSemaphoreInfo rendered[] =
+				{
+				  {
+					.semaphore = m_semaphore.get(),
+					.value = ++m_realFrameIx,
+					.stageMask = PIPELINE_STAGE_FLAGS::ALL_TRANSFER_BITS
+				  }
+				};
 				{
-					const IQueue::SSubmitInfo::SCommandBufferInfo commandBuffers[] =
 					{
-					  {.cmdbuf = cmdbuf }
-					};
+						const IQueue::SSubmitInfo::SCommandBufferInfo commandBuffers[] =
+						{
+						  {.cmdbuf = cmdbuf }
+						};
 
-					const IQueue::SSubmitInfo::SSemaphoreInfo acquired[] =
-					{
-					  {
-						.semaphore = m_currentImageAcquire.semaphore,
-						.value = m_currentImageAcquire.acquireCount,
-						.stageMask = PIPELINE_STAGE_FLAGS::NONE
-					  }
-					};
-					const IQueue::SSubmitInfo infos[] =
-					{
-					  {
-						.waitSemaphores = acquired,
-						.commandBuffers = commandBuffers,
-						.signalSemaphores = rendered
-					  }
-					};
+						const IQueue::SSubmitInfo::SSemaphoreInfo acquired[] =
+						{
+						  {
+							.semaphore = m_currentImageAcquire.semaphore,
+							.value = m_currentImageAcquire.acquireCount,
+							.stageMask = PIPELINE_STAGE_FLAGS::NONE
+						  }
+						};
+						const IQueue::SSubmitInfo infos[] =
+						{
+						  {
+							.waitSemaphores = acquired,
+							.commandBuffers = commandBuffers,
+							.signalSemaphores = rendered
+						  }
+						};
 
-					updateGUIDescriptorSet();
+//						updateGUIDescriptorSet();
 
-					if (queue->submit(infos) != IQueue::RESULT::SUCCESS)
-						m_realFrameIx--;
+						if (queue->submit(infos) != IQueue::RESULT::SUCCESS)
+							m_realFrameIx--;
+					}
 				}
-			}
 
-			m_window->setCaption("[Nabla Engine] Ray Tracing Pipeline");
-			m_surface->present(m_currentImageAcquire.imageIndex, rendered);
+				m_window->setCaption("[Nabla Engine] Ray Tracing Pipeline");
+				m_surface->present(m_currentImageAcquire.imageIndex, rendered);
+			}
+			m_api->endCapture();
+			m_frameAccumulationCounter++;
 		}
-		m_api->endCapture();
-		m_frameAccumulationCounter++;
-	}
-
-	inline void update()
-	{
-		m_camera.setMoveSpeed(m_cameraSetting.moveSpeed);
-		m_camera.setRotateSpeed(m_cameraSetting.rotateSpeed);
-
-		static std::chrono::microseconds previousEventTimestamp{};
-
-		m_inputSystem->getDefaultMouse(&m_mouse);
-		m_inputSystem->getDefaultKeyboard(&m_keyboard);
-
-		auto updatePresentationTimestamp = [&]()
-			{
-				m_currentImageAcquire = m_surface->acquireNextImage();
-
-				m_oracle.reportEndFrameRecord();
-				const auto timestamp = m_oracle.getNextPresentationTimeStamp();
-				m_oracle.reportBeginFrameRecord();
+#if 0
+		inline void update()
+		{
+			m_camera.setMoveSpeed(m_cameraSetting.moveSpeed);
+			m_camera.setRotateSpeed(m_cameraSetting.rotateSpeed);
 
-				return timestamp;
-			};
+			static std::chrono::microseconds previousEventTimestamp{};
 
-		const auto nextPresentationTimestamp = updatePresentationTimestamp();
+			m_inputSystem->getDefaultMouse(&m_mouse);
+			m_inputSystem->getDefaultKeyboard(&m_keyboard);
 
-		struct
-		{
-			std::vector<SMouseEvent> mouse{};
-			std::vector<SKeyboardEvent> keyboard{};
-		} capturedEvents;
-
-		m_camera.beginInputProcessing(nextPresentationTimestamp);
-		{
-			const auto& io = ImGui::GetIO();
-			m_mouse.consumeEvents([&](const IMouseEventChannel::range_t& events) -> void
+			auto updatePresentationTimestamp = [&]()
 				{
-					if (!io.WantCaptureMouse)
-						m_camera.mouseProcess(events); // don't capture the events, only let camera handle them with its impl
+					m_currentImageAcquire = m_surface->acquireNextImage();
 
-					for (const auto& e : events) // here capture
-					{
-						if (e.timeStamp < previousEventTimestamp)
-							continue;
+					m_oracle.reportEndFrameRecord();
+					const auto timestamp = m_oracle.getNextPresentationTimeStamp();
+					m_oracle.reportBeginFrameRecord();
 
-						previousEventTimestamp = e.timeStamp;
-						capturedEvents.mouse.emplace_back(e);
+					return timestamp;
+				};
 
-					}
-				}, m_logger.get());
+			const auto nextPresentationTimestamp = updatePresentationTimestamp();
 
-			m_keyboard.consumeEvents([&](const IKeyboardEventChannel::range_t& events) -> void
-				{
-					if (!io.WantCaptureKeyboard)
-						m_camera.keyboardProcess(events); // don't capture the events, only let camera handle them with its impl
+			struct
+			{
+				std::vector<SMouseEvent> mouse{};
+				std::vector<SKeyboardEvent> keyboard{};
+			} capturedEvents;
 
-					for (const auto& e : events) // here capture
+			m_camera.beginInputProcessing(nextPresentationTimestamp);
+			{
+				const auto& io = ImGui::GetIO();
+				m_mouse.consumeEvents([&](const IMouseEventChannel::range_t& events) -> void
 					{
-						if (e.timeStamp < previousEventTimestamp)
-							continue;
-
-						previousEventTimestamp = e.timeStamp;
-						capturedEvents.keyboard.emplace_back(e);
-					}
-				}, m_logger.get());
+						if (!io.WantCaptureMouse)
+							m_camera.mouseProcess(events); // don't capture the events, only let camera handle them with its impl
 
-		}
-		m_camera.endInputProcessing(nextPresentationTimestamp);
+						for (const auto& e : events) // here capture
+						{
+							if (e.timeStamp < previousEventTimestamp)
+								continue;
 
-		const core::SRange<const nbl::ui::SMouseEvent> mouseEvents(capturedEvents.mouse.data(), capturedEvents.mouse.data() + capturedEvents.mouse.size());
-		const core::SRange<const nbl::ui::SKeyboardEvent> keyboardEvents(capturedEvents.keyboard.data(), capturedEvents.keyboard.data() + capturedEvents.keyboard.size());
-		const auto cursorPosition = m_window->getCursorControl()->getPosition();
-		const auto mousePosition = float32_t2(cursorPosition.x, cursorPosition.y) - float32_t2(m_window->getX(), m_window->getY());
+							previousEventTimestamp = e.timeStamp;
+							capturedEvents.mouse.emplace_back(e);
 
-		const ext::imgui::UI::SUpdateParameters params =
-		{
-		  .mousePosition = mousePosition,
-		  .displaySize = { m_window->getWidth(), m_window->getHeight() },
-		  .mouseEvents = mouseEvents,
-		  .keyboardEvents = keyboardEvents
-		};
+						}
+					}, m_logger.get());
 
-		m_ui.manager->update(params);
-	}
+				m_keyboard.consumeEvents([&](const IKeyboardEventChannel::range_t& events) -> void
+					{
+						if (!io.WantCaptureKeyboard)
+							m_camera.keyboardProcess(events); // don't capture the events, only let camera handle them with its impl
 
-	inline bool keepRunning() override
-	{
-		if (m_surface->irrecoverable())
-			return false;
+						for (const auto& e : events) // here capture
+						{
+							if (e.timeStamp < previousEventTimestamp)
+								continue;
 
-		return true;
-	}
+							previousEventTimestamp = e.timeStamp;
+							capturedEvents.keyboard.emplace_back(e);
+						}
+					}, m_logger.get());
 
-	inline bool onAppTerminated() override
-	{
-		return device_base_t::onAppTerminated();
-	}
+			}
+			m_camera.endInputProcessing(nextPresentationTimestamp);
 
-private:
-	uint32_t getWorkgroupCount(uint32_t dim, uint32_t size)
-	{
-		return (dim + size - 1) / size;
-	}
+			const core::SRange<const nbl::ui::SMouseEvent> mouseEvents(capturedEvents.mouse.data(), capturedEvents.mouse.data() + capturedEvents.mouse.size());
+			const core::SRange<const nbl::ui::SKeyboardEvent> keyboardEvents(capturedEvents.keyboard.data(), capturedEvents.keyboard.data() + capturedEvents.keyboard.size());
+			const auto cursorPosition = m_window->getCursorControl()->getPosition();
+			const auto mousePosition = float32_t2(cursorPosition.x, cursorPosition.y) - float32_t2(m_window->getX(), m_window->getY());
 
-	bool createIndirectBuffer()
-	{
-		const auto getBufferRangeAddress = [](const SBufferRange<IGPUBuffer>& range)
+			const ext::imgui::UI::SUpdateParameters params =
 			{
-				return range.buffer->getDeviceAddress() + range.offset;
-			};
-		const auto command = TraceRaysIndirectCommand_t{
-		  .raygenShaderRecordAddress = getBufferRangeAddress(m_shaderBindingTable.raygenGroupRange),
-		  .raygenShaderRecordSize = m_shaderBindingTable.raygenGroupRange.size,
-		  .missShaderBindingTableAddress = getBufferRangeAddress(m_shaderBindingTable.missGroupsRange),
-		  .missShaderBindingTableSize = m_shaderBindingTable.missGroupsRange.size,
-		  .missShaderBindingTableStride = m_shaderBindingTable.missGroupsStride,
-		  .hitShaderBindingTableAddress = getBufferRangeAddress(m_shaderBindingTable.hitGroupsRange),
-		  .hitShaderBindingTableSize = m_shaderBindingTable.hitGroupsRange.size,
-		  .hitShaderBindingTableStride = m_shaderBindingTable.hitGroupsStride,
-		  .callableShaderBindingTableAddress = getBufferRangeAddress(m_shaderBindingTable.callableGroupsRange),
-		  .callableShaderBindingTableSize = m_shaderBindingTable.callableGroupsRange.size,
-		  .callableShaderBindingTableStride = m_shaderBindingTable.callableGroupsStride,
-		  .width = WIN_W,
-		  .height = WIN_H,
-		  .depth = 1,
-		};
-		IGPUBuffer::SCreationParams params;
-		params.usage = IGPUBuffer::EUF_TRANSFER_DST_BIT | IGPUBuffer::EUF_INDIRECT_BUFFER_BIT | IGPUBuffer::EUF_SHADER_DEVICE_ADDRESS_BIT;
-		params.size = sizeof(TraceRaysIndirectCommand_t);
-		m_utils->createFilledDeviceLocalBufferOnDedMem(SIntendedSubmitInfo{ .queue = getGraphicsQueue() }, std::move(params), &command).move_into(m_indirectBuffer);
-		return true;
-	}
-
-	void calculateRayTracingStackSize(const smart_refctd_ptr<video::IGPURayTracingPipeline>& pipeline)
-	{
-		const auto raygenStackSize = pipeline->getRaygenStackSize();
-		auto getMaxSize = [&](auto ranges, auto valProj) -> uint16_t
-			{
-				auto maxValue = 0;
-				for (const auto& val : ranges)
-				{
-					maxValue = std::max<uint16_t>(maxValue, std::invoke(valProj, val));
-				}
-				return maxValue;
+			  .mousePosition = mousePosition,
+			  .displaySize = { m_window->getWidth(), m_window->getHeight() },
+			  .mouseEvents = mouseEvents,
+			  .keyboardEvents = keyboardEvents
 			};
 
-		const auto closestHitStackMax = getMaxSize(pipeline->getHitStackSizes(), &IGPURayTracingPipeline::SHitGroupStackSize::closestHit);
-		const auto anyHitStackMax = getMaxSize(pipeline->getHitStackSizes(), &IGPURayTracingPipeline::SHitGroupStackSize::anyHit);
-		const auto intersectionStackMax = getMaxSize(pipeline->getHitStackSizes(), &IGPURayTracingPipeline::SHitGroupStackSize::intersection);
-		const auto missStackMax = getMaxSize(pipeline->getMissStackSizes(), std::identity{});
-		const auto callableStackMax = getMaxSize(pipeline->getCallableStackSizes(), std::identity{});
-		auto firstDepthStackSizeMax = std::max(closestHitStackMax, missStackMax);
-		firstDepthStackSizeMax = std::max<uint16_t>(firstDepthStackSizeMax, intersectionStackMax + anyHitStackMax);
-		m_rayTracingStackSize = raygenStackSize + std::max(firstDepthStackSizeMax, callableStackMax);
-	}
-
-	bool createShaderBindingTable(const smart_refctd_ptr<video::IGPURayTracingPipeline>& pipeline)
-	{
-		const auto& limits = m_device->getPhysicalDevice()->getLimits();
-		const auto handleSize = SPhysicalDeviceLimits::ShaderGroupHandleSize;
-		const auto handleSizeAligned = nbl::core::alignUp(handleSize, limits.shaderGroupHandleAlignment);
-
-		auto& raygenRange = m_shaderBindingTable.raygenGroupRange;
-
-		auto& hitRange = m_shaderBindingTable.hitGroupsRange;
-		const auto hitHandles = pipeline->getHitHandles();
-
-		auto& missRange = m_shaderBindingTable.missGroupsRange;
-		const auto missHandles = pipeline->getMissHandles();
-
-		auto& callableRange = m_shaderBindingTable.callableGroupsRange;
-		const auto callableHandles = pipeline->getCallableHandles();
-
-		raygenRange = {
-		  .offset = 0,
-		  .size = core::alignUp(handleSizeAligned, limits.shaderGroupBaseAlignment)
-		};
-
-		missRange = {
-		  .offset = raygenRange.size,
-		  .size = core::alignUp(missHandles.size() * handleSizeAligned, limits.shaderGroupBaseAlignment),
-		};
-		m_shaderBindingTable.missGroupsStride = handleSizeAligned;
-
-		hitRange = {
-		  .offset = missRange.offset + missRange.size,
-		  .size = core::alignUp(hitHandles.size() * handleSizeAligned, limits.shaderGroupBaseAlignment),
-		};
-		m_shaderBindingTable.hitGroupsStride = handleSizeAligned;
-
-		callableRange = {
-		  .offset = hitRange.offset + hitRange.size,
-		  .size = core::alignUp(callableHandles.size() * handleSizeAligned, limits.shaderGroupBaseAlignment),
-		};
-		m_shaderBindingTable.callableGroupsStride = handleSizeAligned;
-
-		const auto bufferSize = raygenRange.size + missRange.size + hitRange.size + callableRange.size;
-
-		ICPUBuffer::SCreationParams cpuBufferParams;
-		cpuBufferParams.size = bufferSize;
-		auto cpuBuffer = ICPUBuffer::create(std::move(cpuBufferParams));
-		uint8_t* pData = reinterpret_cast<uint8_t*>(cpuBuffer->getPointer());
-
-		// copy raygen region
-		memcpy(pData, &pipeline->getRaygen(), handleSize);
-
-		// copy miss region
-		uint8_t* pMissData = pData + missRange.offset;
-		for (const auto& handle : missHandles)
-		{
-			memcpy(pMissData, &handle, handleSize);
-			pMissData += m_shaderBindingTable.missGroupsStride;
+			m_ui.manager->update(params);
 		}
-
-		// copy hit region
-		uint8_t* pHitData = pData + hitRange.offset;
-		for (const auto& handle : hitHandles)
+#endif
+		inline bool keepRunning() override
 		{
-			memcpy(pHitData, &handle, handleSize);
-			pHitData += m_shaderBindingTable.hitGroupsStride;
-		}
+			if (m_surface->irrecoverable())
+				return false;
 
-		// copy callable region
-		uint8_t* pCallableData = pData + callableRange.offset;
-		for (const auto& handle : callableHandles)
-		{
-			memcpy(pCallableData, &handle, handleSize);
-			pCallableData += m_shaderBindingTable.callableGroupsStride;
+			return true;
 		}
 
+		inline bool onAppTerminated() override
 		{
-			IGPUBuffer::SCreationParams params;
-			params.usage = IGPUBuffer::EUF_TRANSFER_DST_BIT | IGPUBuffer::EUF_INLINE_UPDATE_VIA_CMDBUF | IGPUBuffer::EUF_SHADER_DEVICE_ADDRESS_BIT | IGPUBuffer::EUF_SHADER_BINDING_TABLE_BIT;
-			params.size = bufferSize;
-			m_utils->createFilledDeviceLocalBufferOnDedMem(SIntendedSubmitInfo{ .queue = getGraphicsQueue() }, std::move(params), pData).move_into(raygenRange.buffer);
-			missRange.buffer = core::smart_refctd_ptr(raygenRange.buffer);
-			hitRange.buffer = core::smart_refctd_ptr(raygenRange.buffer);
-			callableRange.buffer = core::smart_refctd_ptr(raygenRange.buffer);
+			return device_base_t::onAppTerminated();
 		}
 
-		return true;
-	}
+	private:
+#if 0
+		bool createAccelerationStructuresFromGeometry(const IGeometryCreator* gc)
+		{
+			auto queue = getGraphicsQueue();
+			// get geometries into ICPUBuffers
+			auto pool = m_device->createCommandPool(queue->getFamilyIndex(), IGPUCommandPool::CREATE_FLAGS::RESET_COMMAND_BUFFER_BIT);
+			if (!pool)
+				return logFail("Couldn't create Command Pool for geometry creation!");
+
+			const auto defaultMaterial = Material{
+			  .ambient = {0.2, 0.1, 0.1},
+			  .diffuse = {0.8, 0.3, 0.3},
+			  .specular = {0.8, 0.8, 0.8},
+			  .shininess = 1.0f,
+			  .alpha = 1.0f,
+			};
 
-	bool createAccelerationStructuresFromGeometry(const IGeometryCreator* gc)
-	{
-		auto queue = getGraphicsQueue();
-		// get geometries into ICPUBuffers
-		auto pool = m_device->createCommandPool(queue->getFamilyIndex(), IGPUCommandPool::CREATE_FLAGS::RESET_COMMAND_BUFFER_BIT);
-		if (!pool)
-			return logFail("Couldn't create Command Pool for geometry creation!");
-
-		const auto defaultMaterial = Material{
-		  .ambient = {0.2, 0.1, 0.1},
-		  .diffuse = {0.8, 0.3, 0.3},
-		  .specular = {0.8, 0.8, 0.8},
-		  .shininess = 1.0f,
-		  .alpha = 1.0f,
-		};
+			auto getTranslationMatrix = [](float32_t x, float32_t y, float32_t z)
+				{
+					core::matrix3x4SIMD transform;
+					transform.setTranslation(nbl::core::vectorSIMDf(x, y, z, 0));
+					return transform;
+				};
 
-		auto getTranslationMatrix = [](float32_t x, float32_t y, float32_t z)
-			{
-				core::matrix3x4SIMD transform;
-				transform.setTranslation(nbl::core::vectorSIMDf(x, y, z, 0));
-				return transform;
-			};
+			core::matrix3x4SIMD planeTransform;
+			planeTransform.setRotation(quaternion::fromAngleAxis(core::radians(-90.0f), vector3df_SIMD{ 1, 0, 0 }));
 
-		core::matrix3x4SIMD planeTransform;
-		planeTransform.setRotation(quaternion::fromAngleAxis(core::radians(-90.0f), vector3df_SIMD{ 1, 0, 0 }));
-
-		// triangles geometries
-		const auto cpuObjects = std::array{
-			ReferenceObjectCpu {
-				.meta = {.type = OT_RECTANGLE, .name = "Plane Mesh"},
-				.data = gc->createRectangleMesh(nbl::core::vector2df_SIMD(10, 10)),
-				.material = defaultMaterial,
-				.transform = planeTransform,
-			},
-			ReferenceObjectCpu {
-				.meta = {.type = OT_CUBE, .name = "Cube Mesh"},
-				.data = gc->createCubeMesh(nbl::core::vector3df(1, 1, 1)),
-				.material = defaultMaterial,
-				.transform = getTranslationMatrix(0, 0.5f, 0),
-			},
-			ReferenceObjectCpu {
-				.meta = {.type = OT_CUBE, .name = "Cube Mesh 2"},
-				.data = gc->createCubeMesh(nbl::core::vector3df(1.5, 1.5, 1.5)),
-				.material = Material{
-					.ambient = {0.1, 0.1, 0.2},
-					.diffuse = {0.2, 0.2, 0.8},
-					.specular = {0.8, 0.8, 0.8},
-					.shininess = 1.0f,
+			// triangles geometries
+			const auto cpuObjects = std::array{
+				ReferenceObjectCpu {
+					.meta = {.type = OT_RECTANGLE, .name = "Plane Mesh"},
+					.data = gc->createRectangleMesh(nbl::core::vector2df_SIMD(10, 10)),
+					.material = defaultMaterial,
+					.transform = planeTransform,
 				},
-				.transform = getTranslationMatrix(-5.0f, 1.0f, 0),
-			},
-			ReferenceObjectCpu {
-				.meta = {.type = OT_CUBE, .name = "Transparent Cube Mesh"},
-				.data = gc->createCubeMesh(nbl::core::vector3df(1.5, 1.5, 1.5)),
-				.material = Material{
-					.ambient = {0.1, 0.2, 0.1},
-					.diffuse = {0.2, 0.8, 0.2},
-					.specular = {0.8, 0.8, 0.8},
-					.shininess = 1.0f,
-					.alpha = 0.2,
+				ReferenceObjectCpu {
+					.meta = {.type = OT_CUBE, .name = "Cube Mesh"},
+					.data = gc->createCubeMesh(nbl::core::vector3df(1, 1, 1)),
+					.material = defaultMaterial,
+					.transform = getTranslationMatrix(0, 0.5f, 0),
 				},
-				.transform = getTranslationMatrix(5.0f, 1.0f, 0),
-			},
-		};
-
-		struct CPUTriBufferBindings
-		{
-			nbl::asset::SBufferBinding<ICPUBuffer> vertex, index;
-		};
-		std::array<CPUTriBufferBindings, std::size(cpuObjects)> cpuTriBuffers;
-
-		for (uint32_t i = 0; i < cpuObjects.size(); i++)
-		{
-			const auto& cpuObject = cpuObjects[i];
-
-			auto vBuffer = smart_refctd_ptr(cpuObject.data.bindings[0].buffer); // no offset
-			auto vUsage = bitflag(IGPUBuffer::EUF_STORAGE_BUFFER_BIT) | IGPUBuffer::EUF_TRANSFER_DST_BIT | IGPUBuffer::EUF_INLINE_UPDATE_VIA_CMDBUF |
-				IGPUBuffer::EUF_ACCELERATION_STRUCTURE_BUILD_INPUT_READ_ONLY_BIT | IGPUBuffer::EUF_SHADER_DEVICE_ADDRESS_BIT;
-			vBuffer->addUsageFlags(vUsage);
-			vBuffer->setContentHash(vBuffer->computeContentHash());
-
-			auto iBuffer = smart_refctd_ptr(cpuObject.data.indexBuffer.buffer); // no offset
-			auto iUsage = bitflag(IGPUBuffer::EUF_STORAGE_BUFFER_BIT) | IGPUBuffer::EUF_TRANSFER_DST_BIT | IGPUBuffer::EUF_INLINE_UPDATE_VIA_CMDBUF |
-				IGPUBuffer::EUF_ACCELERATION_STRUCTURE_BUILD_INPUT_READ_ONLY_BIT | IGPUBuffer::EUF_SHADER_DEVICE_ADDRESS_BIT;
-
-			if (cpuObject.data.indexType != EIT_UNKNOWN)
-				if (iBuffer)
-				{
-					iBuffer->addUsageFlags(iUsage);
-					iBuffer->setContentHash(iBuffer->computeContentHash());
-				}
+				ReferenceObjectCpu {
+					.meta = {.type = OT_CUBE, .name = "Cube Mesh 2"},
+					.data = gc->createCubeMesh(nbl::core::vector3df(1.5, 1.5, 1.5)),
+					.material = Material{
+						.ambient = {0.1, 0.1, 0.2},
+						.diffuse = {0.2, 0.2, 0.8},
+						.specular = {0.8, 0.8, 0.8},
+						.shininess = 1.0f,
+					},
+					.transform = getTranslationMatrix(-5.0f, 1.0f, 0),
+				},
+				ReferenceObjectCpu {
+					.meta = {.type = OT_CUBE, .name = "Transparent Cube Mesh"},
+					.data = gc->createCubeMesh(nbl::core::vector3df(1.5, 1.5, 1.5)),
+					.material = Material{
+						.ambient = {0.1, 0.2, 0.1},
+						.diffuse = {0.2, 0.8, 0.2},
+						.specular = {0.8, 0.8, 0.8},
+						.shininess = 1.0f,
+						.alpha = 0.2,
+					},
+					.transform = getTranslationMatrix(5.0f, 1.0f, 0),
+				},
+			};
 
-			cpuTriBuffers[i] = {
-			  .vertex = {.offset = 0, .buffer = vBuffer},
-			  .index = {.offset = 0, .buffer = iBuffer},
+			struct CPUTriBufferBindings
+			{
+				nbl::asset::SBufferBinding<ICPUBuffer> vertex, index;
 			};
+			std::array<CPUTriBufferBindings, std::size(cpuObjects)> cpuTriBuffers;
 
-		}
+			for (uint32_t i = 0; i < cpuObjects.size(); i++)
+			{
+				const auto& cpuObject = cpuObjects[i];
 
-		// procedural geometries
-		using Aabb = IGPUBottomLevelAccelerationStructure::AABB_t;
+				auto vBuffer = smart_refctd_ptr(cpuObject.data.bindings[0].buffer); // no offset
+				auto vUsage = bitflag(IGPUBuffer::EUF_STORAGE_BUFFER_BIT) | IGPUBuffer::EUF_TRANSFER_DST_BIT | IGPUBuffer::EUF_INLINE_UPDATE_VIA_CMDBUF |
+					IGPUBuffer::EUF_ACCELERATION_STRUCTURE_BUILD_INPUT_READ_ONLY_BIT | IGPUBuffer::EUF_SHADER_DEVICE_ADDRESS_BIT;
+				vBuffer->addUsageFlags(vUsage);
+				vBuffer->setContentHash(vBuffer->computeContentHash());
 
-		smart_refctd_ptr<ICPUBuffer> cpuProcBuffer;
-		{
-			ICPUBuffer::SCreationParams params;
-			params.size = NumberOfProceduralGeometries * sizeof(Aabb);
-			cpuProcBuffer = ICPUBuffer::create(std::move(params));
-		}
+				auto iBuffer = smart_refctd_ptr(cpuObject.data.indexBuffer.buffer); // no offset
+				auto iUsage = bitflag(IGPUBuffer::EUF_STORAGE_BUFFER_BIT) | IGPUBuffer::EUF_TRANSFER_DST_BIT | IGPUBuffer::EUF_INLINE_UPDATE_VIA_CMDBUF |
+					IGPUBuffer::EUF_ACCELERATION_STRUCTURE_BUILD_INPUT_READ_ONLY_BIT | IGPUBuffer::EUF_SHADER_DEVICE_ADDRESS_BIT;
 
-		core::vector<SProceduralGeomInfo> proceduralGeoms;
-		proceduralGeoms.reserve(NumberOfProceduralGeometries);
-		auto proceduralGeometries = reinterpret_cast<Aabb*>(cpuProcBuffer->getPointer());
-		for (int32_t i = 0; i < NumberOfProceduralGeometries; i++)
-		{
-			const auto middle_i = NumberOfProceduralGeometries / 2.0;
-			SProceduralGeomInfo sphere = {
-					.material = hlsl::_static_cast<MaterialPacked>(Material{
-					.ambient = {0.1, 0.05 * i, 0.1},
-					.diffuse = {0.3, 0.2 * i, 0.3},
-					.specular = {0.8, 0.8, 0.8},
-					.shininess = 1.0f,
-				}),
-				.center = float32_t3((i - middle_i) * 4.0, 2, 5.0),
-				.radius = 1,
-			};
+				if (cpuObject.data.indexType != EIT_UNKNOWN)
+					if (iBuffer)
+					{
+						iBuffer->addUsageFlags(iUsage);
+						iBuffer->setContentHash(iBuffer->computeContentHash());
+					}
 
-			proceduralGeoms.push_back(sphere);
-			const auto sphereMin = sphere.center - sphere.radius;
-			const auto sphereMax = sphere.center + sphere.radius;
-			proceduralGeometries[i] = {
-				vector3d(sphereMin.x, sphereMin.y, sphereMin.z),
-				vector3d(sphereMax.x, sphereMax.y, sphereMax.z)
-			};
-		}
+				cpuTriBuffers[i] = {
+				  .vertex = {.offset = 0, .buffer = vBuffer},
+				  .index = {.offset = 0, .buffer = iBuffer},
+				};
 
-		{
-			IGPUBuffer::SCreationParams params;
-			params.usage = IGPUBuffer::EUF_STORAGE_BUFFER_BIT | IGPUBuffer::EUF_TRANSFER_DST_BIT | IGPUBuffer::EUF_INLINE_UPDATE_VIA_CMDBUF | IGPUBuffer::EUF_SHADER_DEVICE_ADDRESS_BIT;
-			params.size = proceduralGeoms.size() * sizeof(SProceduralGeomInfo);
-			m_utils->createFilledDeviceLocalBufferOnDedMem(SIntendedSubmitInfo{ .queue = queue }, std::move(params), proceduralGeoms.data()).move_into(m_proceduralGeomInfoBuffer);
-		}
+			}
 
-		// get ICPUBuffers into ICPUBLAS
-		// TODO use one BLAS and multiple triangles/aabbs in one
-		const auto blasCount = std::size(cpuObjects) + 1;
-		const auto proceduralBlasIdx = std::size(cpuObjects);
+			// procedural geometries
+			using Aabb = IGPUBottomLevelAccelerationStructure::AABB_t;
 
-		std::array<smart_refctd_ptr<ICPUBottomLevelAccelerationStructure>, std::size(cpuObjects)+1u> cpuBlas;
-		for (uint32_t i = 0; i < blasCount; i++)
-		{
-			auto& blas = cpuBlas[i];
-			blas = make_smart_refctd_ptr<ICPUBottomLevelAccelerationStructure>();
+			smart_refctd_ptr<ICPUBuffer> cpuProcBuffer;
+			{
+				ICPUBuffer::SCreationParams params;
+				params.size = NumberOfProceduralGeometries * sizeof(Aabb);
+				cpuProcBuffer = ICPUBuffer::create(std::move(params));
+			}
 
-			if (i == proceduralBlasIdx)
+			core::vector<SProceduralGeomInfo> proceduralGeoms;
+			proceduralGeoms.reserve(NumberOfProceduralGeometries);
+			auto proceduralGeometries = reinterpret_cast<Aabb*>(cpuProcBuffer->getPointer());
+			for (int32_t i = 0; i < NumberOfProceduralGeometries; i++)
 			{
-				auto aabbs = make_refctd_dynamic_array<smart_refctd_dynamic_array<ICPUBottomLevelAccelerationStructure::AABBs<ICPUBuffer>>>(1u);
-				auto primitiveCounts = make_refctd_dynamic_array<smart_refctd_dynamic_array<uint32_t>>(1u);
+				const auto middle_i = NumberOfProceduralGeometries / 2.0;
+				SProceduralGeomInfo sphere = {
+						.material = hlsl::_static_cast<MaterialPacked>(Material{
+						.ambient = {0.1, 0.05 * i, 0.1},
+						.diffuse = {0.3, 0.2 * i, 0.3},
+						.specular = {0.8, 0.8, 0.8},
+						.shininess = 1.0f,
+					}),
+					.center = float32_t3((i - middle_i) * 4.0, 2, 5.0),
+					.radius = 1,
+				};
 
-				auto& aabb = aabbs->front();
-				auto& primCount = primitiveCounts->front();
-				
-				primCount = NumberOfProceduralGeometries;
-				aabb.data = { .offset = 0, .buffer = cpuProcBuffer };
-				aabb.stride = sizeof(IGPUBottomLevelAccelerationStructure::AABB_t);
-				aabb.geometryFlags = IGPUBottomLevelAccelerationStructure::GEOMETRY_FLAGS::OPAQUE_BIT; // only allow opaque for now
+				proceduralGeoms.push_back(sphere);
+				const auto sphereMin = sphere.center - sphere.radius;
+				const auto sphereMax = sphere.center + sphere.radius;
+				proceduralGeometries[i] = {
+					vector3d(sphereMin.x, sphereMin.y, sphereMin.z),
+					vector3d(sphereMax.x, sphereMax.y, sphereMax.z)
+				};
+			}
 
-				blas->setGeometries(std::move(aabbs), std::move(primitiveCounts));
+			{
+				IGPUBuffer::SCreationParams params;
+				params.usage = IGPUBuffer::EUF_STORAGE_BUFFER_BIT | IGPUBuffer::EUF_TRANSFER_DST_BIT | IGPUBuffer::EUF_INLINE_UPDATE_VIA_CMDBUF | IGPUBuffer::EUF_SHADER_DEVICE_ADDRESS_BIT;
+				params.size = proceduralGeoms.size() * sizeof(SProceduralGeomInfo);
+				m_utils->createFilledDeviceLocalBufferOnDedMem(SIntendedSubmitInfo{ .queue = queue }, std::move(params), proceduralGeoms.data()).move_into(m_proceduralGeomInfoBuffer);
 			}
-			else
+
+			// get ICPUBuffers into ICPUBLAS
+			// TODO use one BLAS and multiple triangles/aabbs in one
+			const auto blasCount = std::size(cpuObjects) + 1;
+			const auto proceduralBlasIdx = std::size(cpuObjects);
+
+			std::array<smart_refctd_ptr<ICPUBottomLevelAccelerationStructure>, std::size(cpuObjects)+1u> cpuBlas;
+			for (uint32_t i = 0; i < blasCount; i++)
 			{
-				auto triangles = make_refctd_dynamic_array<smart_refctd_dynamic_array<ICPUBottomLevelAccelerationStructure::Triangles<ICPUBuffer>>>(1u);
-				auto primitiveCounts = make_refctd_dynamic_array<smart_refctd_dynamic_array<uint32_t>>(1u);
+				auto& blas = cpuBlas[i];
+				blas = make_smart_refctd_ptr<ICPUBottomLevelAccelerationStructure>();
 
-				auto& tri = triangles->front();
-				auto& primCount = primitiveCounts->front();
-				const auto& geom = cpuObjects[i];
-				const auto& cpuBuf = cpuTriBuffers[i];
+				if (i == proceduralBlasIdx)
+				{
+					auto aabbs = make_refctd_dynamic_array<smart_refctd_dynamic_array<ICPUBottomLevelAccelerationStructure::AABBs<ICPUBuffer>>>(1u);
+					auto primitiveCounts = make_refctd_dynamic_array<smart_refctd_dynamic_array<uint32_t>>(1u);
 
-				const bool useIndex = geom.data.indexType != EIT_UNKNOWN;
-				const uint32_t vertexStride = geom.data.inputParams.bindings[0].stride;
-				const uint32_t numVertices = cpuBuf.vertex.buffer->getSize() / vertexStride;
+					auto& aabb = aabbs->front();
+					auto& primCount = primitiveCounts->front();
+				
+					primCount = NumberOfProceduralGeometries;
+					aabb.data = { .offset = 0, .buffer = cpuProcBuffer };
+					aabb.stride = sizeof(IGPUBottomLevelAccelerationStructure::AABB_t);
+					aabb.geometryFlags = IGPUBottomLevelAccelerationStructure::GEOMETRY_FLAGS::OPAQUE_BIT; // only allow opaque for now
 
-				if (useIndex)
-					primCount = geom.data.indexCount / 3;
+					blas->setGeometries(std::move(aabbs), std::move(primitiveCounts));
+				}
 				else
-					primCount = numVertices / 3;
-
-				tri.vertexData[0] = cpuBuf.vertex;
-				tri.indexData = useIndex ? cpuBuf.index : cpuBuf.vertex;
-				tri.maxVertex = numVertices - 1;
-				tri.vertexStride = vertexStride;
-				tri.vertexFormat = EF_R32G32B32_SFLOAT;
-				tri.indexType = geom.data.indexType;
-				tri.geometryFlags = geom.material.isTransparent() ?
-					IGPUBottomLevelAccelerationStructure::GEOMETRY_FLAGS::NO_DUPLICATE_ANY_HIT_INVOCATION_BIT :
-					IGPUBottomLevelAccelerationStructure::GEOMETRY_FLAGS::OPAQUE_BIT;
-
-				blas->setGeometries(std::move(triangles), std::move(primitiveCounts));
-			}
+				{
+					auto triangles = make_refctd_dynamic_array<smart_refctd_dynamic_array<ICPUBottomLevelAccelerationStructure::Triangles<ICPUBuffer>>>(1u);
+					auto primitiveCounts = make_refctd_dynamic_array<smart_refctd_dynamic_array<uint32_t>>(1u);
+
+					auto& tri = triangles->front();
+					auto& primCount = primitiveCounts->front();
+					const auto& geom = cpuObjects[i];
+					const auto& cpuBuf = cpuTriBuffers[i];
+
+					const bool useIndex = geom.data.indexType != EIT_UNKNOWN;
+					const uint32_t vertexStride = geom.data.inputParams.bindings[0].stride;
+					const uint32_t numVertices = cpuBuf.vertex.buffer->getSize() / vertexStride;
+
+					if (useIndex)
+						primCount = geom.data.indexCount / 3;
+					else
+						primCount = numVertices / 3;
+
+					tri.vertexData[0] = cpuBuf.vertex;
+					tri.indexData = useIndex ? cpuBuf.index : cpuBuf.vertex;
+					tri.maxVertex = numVertices - 1;
+					tri.vertexStride = vertexStride;
+					tri.vertexFormat = EF_R32G32B32_SFLOAT;
+					tri.indexType = geom.data.indexType;
+					tri.geometryFlags = geom.material.isTransparent() ?
+						IGPUBottomLevelAccelerationStructure::GEOMETRY_FLAGS::NO_DUPLICATE_ANY_HIT_INVOCATION_BIT :
+						IGPUBottomLevelAccelerationStructure::GEOMETRY_FLAGS::OPAQUE_BIT;
+
+					blas->setGeometries(std::move(triangles), std::move(primitiveCounts));
+				}
 
-			auto blasFlags = bitflag(IGPUBottomLevelAccelerationStructure::BUILD_FLAGS::PREFER_FAST_TRACE_BIT) | IGPUBottomLevelAccelerationStructure::BUILD_FLAGS::ALLOW_COMPACTION_BIT;
-			if (i == proceduralBlasIdx)
-				blasFlags |= IGPUBottomLevelAccelerationStructure::BUILD_FLAGS::GEOMETRY_TYPE_IS_AABB_BIT;
+				auto blasFlags = bitflag(IGPUBottomLevelAccelerationStructure::BUILD_FLAGS::PREFER_FAST_TRACE_BIT) | IGPUBottomLevelAccelerationStructure::BUILD_FLAGS::ALLOW_COMPACTION_BIT;
+				if (i == proceduralBlasIdx)
+					blasFlags |= IGPUBottomLevelAccelerationStructure::BUILD_FLAGS::GEOMETRY_TYPE_IS_AABB_BIT;
 
-			blas->setBuildFlags(blasFlags);
-			blas->setContentHash(blas->computeContentHash());
-		}
+				blas->setBuildFlags(blasFlags);
+				blas->setContentHash(blas->computeContentHash());
+			}
 
-		auto geomInfoBuffer = ICPUBuffer::create({ std::size(cpuObjects) * sizeof(STriangleGeomInfo) });
-		STriangleGeomInfo* geomInfos = reinterpret_cast<STriangleGeomInfo*>(geomInfoBuffer->getPointer());
+			auto geomInfoBuffer = ICPUBuffer::create({ std::size(cpuObjects) * sizeof(STriangleGeomInfo) });
+			STriangleGeomInfo* geomInfos = reinterpret_cast<STriangleGeomInfo*>(geomInfoBuffer->getPointer());
 
-		// get ICPUBLAS into ICPUTLAS
-		auto geomInstances = make_refctd_dynamic_array<smart_refctd_dynamic_array<ICPUTopLevelAccelerationStructure::PolymorphicInstance>>(blasCount);
-		{
-			uint32_t i = 0;
-			for (auto instance = geomInstances->begin(); instance != geomInstances->end(); instance++, i++)
+			// get ICPUBLAS into ICPUTLAS
+			auto geomInstances = make_refctd_dynamic_array<smart_refctd_dynamic_array<ICPUTopLevelAccelerationStructure::PolymorphicInstance>>(blasCount);
 			{
-				const auto isProceduralInstance = i == proceduralBlasIdx;
-				ICPUTopLevelAccelerationStructure::StaticInstance inst;
-				inst.base.blas = cpuBlas[i];
-				inst.base.flags = static_cast<uint32_t>(IGPUTopLevelAccelerationStructure::INSTANCE_FLAGS::TRIANGLE_FACING_CULL_DISABLE_BIT);
-				inst.base.instanceCustomIndex = i;
-				inst.base.instanceShaderBindingTableRecordOffset = isProceduralInstance ? 2 : 0;;
-				inst.base.mask = 0xFF;
-				inst.transform = isProceduralInstance ? matrix3x4SIMD() : cpuObjects[i].transform;
-
-				instance->instance = inst;
+				uint32_t i = 0;
+				for (auto instance = geomInstances->begin(); instance != geomInstances->end(); instance++, i++)
+				{
+					const auto isProceduralInstance = i == proceduralBlasIdx;
+					ICPUTopLevelAccelerationStructure::StaticInstance inst;
+					inst.base.blas = cpuBlas[i];
+					inst.base.flags = static_cast<uint32_t>(IGPUTopLevelAccelerationStructure::INSTANCE_FLAGS::TRIANGLE_FACING_CULL_DISABLE_BIT);
+					inst.base.instanceCustomIndex = i;
+					inst.base.instanceShaderBindingTableRecordOffset = isProceduralInstance ? 2 : 0;;
+					inst.base.mask = 0xFF;
+					inst.transform = isProceduralInstance ? matrix3x4SIMD() : cpuObjects[i].transform;
+
+					instance->instance = inst;
+				}
 			}
-		}
 
-		auto cpuTlas = make_smart_refctd_ptr<ICPUTopLevelAccelerationStructure>();
-		cpuTlas->setInstances(std::move(geomInstances));
-		cpuTlas->setBuildFlags(IGPUTopLevelAccelerationStructure::BUILD_FLAGS::PREFER_FAST_TRACE_BIT);
+			auto cpuTlas = make_smart_refctd_ptr<ICPUTopLevelAccelerationStructure>();
+			cpuTlas->setInstances(std::move(geomInstances));
+			cpuTlas->setBuildFlags(IGPUTopLevelAccelerationStructure::BUILD_FLAGS::PREFER_FAST_TRACE_BIT);
 
-		// convert with asset converter
-		smart_refctd_ptr<CAssetConverter> converter = CAssetConverter::create({ .device = m_device.get(), .optimizer = {} });
-		struct MyInputs : CAssetConverter::SInputs
-		{
-			// For the GPU Buffers to be directly writeable and so that we don't need a Transfer Queue submit at all
-			inline uint32_t constrainMemoryTypeBits(const size_t groupCopyID, const IAsset* canonicalAsset, const blake3_hash_t& contentHash, const IDeviceMemoryBacked* memoryBacked) const override
+			// convert with asset converter
+			smart_refctd_ptr<CAssetConverter> converter = CAssetConverter::create({ .device = m_device.get(), .optimizer = {} });
+			struct MyInputs : CAssetConverter::SInputs
 			{
-				assert(memoryBacked);
-				return memoryBacked->getObjectType() != IDeviceMemoryBacked::EOT_BUFFER ? (~0u) : rebarMemoryTypes;
-			}
-
-			uint32_t rebarMemoryTypes;
-		} inputs = {};
-		inputs.logger = m_logger.get();
-		inputs.rebarMemoryTypes = m_physicalDevice->getDirectVRAMAccessMemoryTypeBits();
-		// the allocator needs to be overriden to hand out memory ranges which have already been mapped so that the ReBAR fast-path can kick in
-		// (multiple buffers can be bound to same memory, but memory can only be mapped once at one place, so Asset Converter can't do it)
-		struct MyAllocator final : public IDeviceMemoryAllocator
-		{
-			ILogicalDevice* getDeviceForAllocations() const override { return device; }
+				// For the GPU Buffers to be directly writeable and so that we don't need a Transfer Queue submit at all
+				inline uint32_t constrainMemoryTypeBits(const size_t groupCopyID, const IAsset* canonicalAsset, const blake3_hash_t& contentHash, const IDeviceMemoryBacked* memoryBacked) const override
+				{
+					assert(memoryBacked);
+					return memoryBacked->getObjectType() != IDeviceMemoryBacked::EOT_BUFFER ? (~0u) : rebarMemoryTypes;
+				}
 
-			SAllocation allocate(const SAllocateInfo& info) override
+				uint32_t rebarMemoryTypes;
+			} inputs = {};
+			inputs.logger = m_logger.get();
+			inputs.rebarMemoryTypes = m_physicalDevice->getDirectVRAMAccessMemoryTypeBits();
+			// the allocator needs to be overriden to hand out memory ranges which have already been mapped so that the ReBAR fast-path can kick in
+			// (multiple buffers can be bound to same memory, but memory can only be mapped once at one place, so Asset Converter can't do it)
+			struct MyAllocator final : public IDeviceMemoryAllocator
 			{
-				auto retval = device->allocate(info);
-				// map what is mappable by default so ReBAR checks succeed
-				if (retval.isValid() && retval.memory->isMappable())
-					retval.memory->map({ .offset = 0,.length = info.size });
-				return retval;
-			}
+				ILogicalDevice* getDeviceForAllocations() const override { return device; }
 
-			ILogicalDevice* device;
-		} myalloc;
-		myalloc.device = m_device.get();
-		inputs.allocator = &myalloc;
+				SAllocation allocate(const SAllocateInfo& info) override
+				{
+					auto retval = device->allocate(info);
+					// map what is mappable by default so ReBAR checks succeed
+					if (retval.isValid() && retval.memory->isMappable())
+						retval.memory->map({ .offset = 0,.length = info.size });
+					return retval;
+				}
 
-		std::array<ICPUTopLevelAccelerationStructure*, 1u> tmpTlas;
-		std::array<ICPUBuffer*, 2 * std::size(cpuObjects) + 1u> tmpBuffers;
-		{
-			tmpTlas[0] = cpuTlas.get();
-			for (uint32_t i = 0; i < cpuObjects.size(); i++)
+				ILogicalDevice* device;
+			} myalloc;
+			myalloc.device = m_device.get();
+			inputs.allocator = &myalloc;
+
+			std::array<ICPUTopLevelAccelerationStructure*, 1u> tmpTlas;
+			std::array<ICPUBuffer*, 2 * std::size(cpuObjects) + 1u> tmpBuffers;
 			{
-				tmpBuffers[2 * i + 0] = cpuTriBuffers[i].vertex.buffer.get();
-				tmpBuffers[2 * i + 1] = cpuTriBuffers[i].index.buffer.get();
-			}
-			tmpBuffers[2 * proceduralBlasIdx] = cpuProcBuffer.get();
+				tmpTlas[0] = cpuTlas.get();
+				for (uint32_t i = 0; i < cpuObjects.size(); i++)
+				{
+					tmpBuffers[2 * i + 0] = cpuTriBuffers[i].vertex.buffer.get();
+					tmpBuffers[2 * i + 1] = cpuTriBuffers[i].index.buffer.get();
+				}
+				tmpBuffers[2 * proceduralBlasIdx] = cpuProcBuffer.get();
 
-			std::get<CAssetConverter::SInputs::asset_span_t<ICPUTopLevelAccelerationStructure>>(inputs.assets) = tmpTlas;
-			std::get<CAssetConverter::SInputs::asset_span_t<ICPUBuffer>>(inputs.assets) = tmpBuffers;
-		}
+				std::get<CAssetConverter::SInputs::asset_span_t<ICPUTopLevelAccelerationStructure>>(inputs.assets) = tmpTlas;
+				std::get<CAssetConverter::SInputs::asset_span_t<ICPUBuffer>>(inputs.assets) = tmpBuffers;
+			}
 
-		auto reservation = converter->reserve(inputs);
-		{
-			auto prepass = [&]<typename asset_type_t>(const auto & references) -> bool
+			auto reservation = converter->reserve(inputs);
 			{
-				auto objects = reservation.getGPUObjects<asset_type_t>();
-				uint32_t counter = {};
-				for (auto& object : objects)
+				auto prepass = [&]<typename asset_type_t>(const auto & references) -> bool
 				{
-					auto gpu = object.value;
-					auto* reference = references[counter];
-
-					if (reference)
+					auto objects = reservation.getGPUObjects<asset_type_t>();
+					uint32_t counter = {};
+					for (auto& object : objects)
 					{
-						if (!gpu)
+						auto gpu = object.value;
+						auto* reference = references[counter];
+
+						if (reference)
 						{
-							m_logger->log("Failed to convert a CPU object to GPU!", ILogger::ELL_ERROR);
-							return false;
+							if (!gpu)
+							{
+								m_logger->log("Failed to convert a CPU object to GPU!", ILogger::ELL_ERROR);
+								return false;
+							}
 						}
+						counter++;
 					}
-					counter++;
-				}
-				return true;
-			};
+					return true;
+				};
 
-			prepass.template operator() < ICPUTopLevelAccelerationStructure > (tmpTlas);
-			prepass.template operator() < ICPUBuffer > (tmpBuffers);
-		}
+				prepass.template operator() < ICPUTopLevelAccelerationStructure > (tmpTlas);
+				prepass.template operator() < ICPUBuffer > (tmpBuffers);
+			}
 
-		constexpr auto CompBufferCount = 2;
-		std::array<smart_refctd_ptr<IGPUCommandBuffer>, CompBufferCount> compBufs = {};
-		std::array<IQueue::SSubmitInfo::SCommandBufferInfo, CompBufferCount> compBufInfos = {};
-		{
-			auto pool = m_device->createCommandPool(queue->getFamilyIndex(), IGPUCommandPool::CREATE_FLAGS::RESET_COMMAND_BUFFER_BIT | IGPUCommandPool::CREATE_FLAGS::TRANSIENT_BIT);
-			pool->createCommandBuffers(IGPUCommandPool::BUFFER_LEVEL::PRIMARY, compBufs);
-			compBufs.front()->begin(IGPUCommandBuffer::USAGE::ONE_TIME_SUBMIT_BIT);
-			for (auto i = 0; i < CompBufferCount; i++)
-				compBufInfos[i].cmdbuf = compBufs[i].get();
-		}
-		auto compSema = m_device->createSemaphore(0u);
-		SIntendedSubmitInfo compute = {};
-		compute.queue = queue;
-		compute.scratchCommandBuffers = compBufInfos;
-		compute.scratchSemaphore = {
-			.semaphore = compSema.get(),
-			.value = 0u,
-			.stageMask = PIPELINE_STAGE_FLAGS::ACCELERATION_STRUCTURE_BUILD_BIT | PIPELINE_STAGE_FLAGS::ACCELERATION_STRUCTURE_COPY_BIT
-		};
-		// convert
-		{
-			smart_refctd_ptr<CAssetConverter::SConvertParams::scratch_for_device_AS_build_t> scratchAlloc;
+			constexpr auto CompBufferCount = 2;
+			std::array<smart_refctd_ptr<IGPUCommandBuffer>, CompBufferCount> compBufs = {};
+			std::array<IQueue::SSubmitInfo::SCommandBufferInfo, CompBufferCount> compBufInfos = {};
 			{
-				constexpr auto MaxAlignment = 256;
-				constexpr auto MinAllocationSize = 1024;
-				const auto scratchSize = core::alignUp(reservation.getMaxASBuildScratchSize(false), MaxAlignment);
+				auto pool = m_device->createCommandPool(queue->getFamilyIndex(), IGPUCommandPool::CREATE_FLAGS::RESET_COMMAND_BUFFER_BIT | IGPUCommandPool::CREATE_FLAGS::TRANSIENT_BIT);
+				pool->createCommandBuffers(IGPUCommandPool::BUFFER_LEVEL::PRIMARY, compBufs);
+				compBufs.front()->begin(IGPUCommandBuffer::USAGE::ONE_TIME_SUBMIT_BIT);
+				for (auto i = 0; i < CompBufferCount; i++)
+					compBufInfos[i].cmdbuf = compBufs[i].get();
+			}
+			auto compSema = m_device->createSemaphore(0u);
+			SIntendedSubmitInfo compute = {};
+			compute.queue = queue;
+			compute.scratchCommandBuffers = compBufInfos;
+			compute.scratchSemaphore = {
+				.semaphore = compSema.get(),
+				.value = 0u,
+				.stageMask = PIPELINE_STAGE_FLAGS::ACCELERATION_STRUCTURE_BUILD_BIT | PIPELINE_STAGE_FLAGS::ACCELERATION_STRUCTURE_COPY_BIT
+			};
+			// convert
+			{
+				smart_refctd_ptr<CAssetConverter::SConvertParams::scratch_for_device_AS_build_t> scratchAlloc;
+				{
+					constexpr auto MaxAlignment = 256;
+					constexpr auto MinAllocationSize = 1024;
+					const auto scratchSize = core::alignUp(reservation.getMaxASBuildScratchSize(false), MaxAlignment);
 
 
-				IGPUBuffer::SCreationParams creationParams = {};
-				creationParams.size = scratchSize;
-				creationParams.usage = IGPUBuffer::EUF_ACCELERATION_STRUCTURE_BUILD_INPUT_READ_ONLY_BIT | IGPUBuffer::EUF_SHADER_DEVICE_ADDRESS_BIT | IGPUBuffer::EUF_STORAGE_BUFFER_BIT;
-				auto scratchBuffer = m_device->createBuffer(std::move(creationParams));
+					IGPUBuffer::SCreationParams creationParams = {};
+					creationParams.size = scratchSize;
+					creationParams.usage = IGPUBuffer::EUF_ACCELERATION_STRUCTURE_BUILD_INPUT_READ_ONLY_BIT | IGPUBuffer::EUF_SHADER_DEVICE_ADDRESS_BIT | IGPUBuffer::EUF_STORAGE_BUFFER_BIT;
+					auto scratchBuffer = m_device->createBuffer(std::move(creationParams));
 
-				auto reqs = scratchBuffer->getMemoryReqs();
-				reqs.memoryTypeBits &= m_physicalDevice->getDirectVRAMAccessMemoryTypeBits();
+					auto reqs = scratchBuffer->getMemoryReqs();
+					reqs.memoryTypeBits &= m_physicalDevice->getDirectVRAMAccessMemoryTypeBits();
 
-				auto allocation = m_device->allocate(reqs, scratchBuffer.get(), IDeviceMemoryAllocation::EMAF_DEVICE_ADDRESS_BIT);
-				allocation.memory->map({ .offset = 0,.length = reqs.size });
+					auto allocation = m_device->allocate(reqs, scratchBuffer.get(), IDeviceMemoryAllocation::EMAF_DEVICE_ADDRESS_BIT);
+					allocation.memory->map({ .offset = 0,.length = reqs.size });
 
-				scratchAlloc = make_smart_refctd_ptr<CAssetConverter::SConvertParams::scratch_for_device_AS_build_t>(
-					SBufferRange<video::IGPUBuffer>{0ull, scratchSize, std::move(scratchBuffer)},
-					core::allocator<uint8_t>(), MaxAlignment, MinAllocationSize
-				);
-			}
+					scratchAlloc = make_smart_refctd_ptr<CAssetConverter::SConvertParams::scratch_for_device_AS_build_t>(
+						SBufferRange<video::IGPUBuffer>{0ull, scratchSize, std::move(scratchBuffer)},
+						core::allocator<uint8_t>(), MaxAlignment, MinAllocationSize
+					);
+				}
 
-			struct MyParams final : CAssetConverter::SConvertParams
-			{
-				inline uint32_t getFinalOwnerQueueFamily(const IGPUBuffer* buffer, const core::blake3_hash_t& createdFrom) override
+				struct MyParams final : CAssetConverter::SConvertParams
 				{
-					return finalUser;
+					inline uint32_t getFinalOwnerQueueFamily(const IGPUBuffer* buffer, const core::blake3_hash_t& createdFrom) override
+					{
+						return finalUser;
+					}
+					inline uint32_t getFinalOwnerQueueFamily(const IGPUAccelerationStructure* image, const core::blake3_hash_t& createdFrom) override
+					{
+						return finalUser;
+					}
+
+					uint8_t finalUser;
+				} params = {};
+				params.utilities = m_utils.get();
+				params.compute = &compute;
+				params.scratchForDeviceASBuild = scratchAlloc.get();
+				params.finalUser = queue->getFamilyIndex();
+
+				auto future = reservation.convert(params);
+				if (future.copy() != IQueue::RESULT::SUCCESS)
+				{
+					m_logger->log("Failed to await submission feature!", ILogger::ELL_ERROR);
+					return false;
 				}
-				inline uint32_t getFinalOwnerQueueFamily(const IGPUAccelerationStructure* image, const core::blake3_hash_t& createdFrom) override
+				// 2 submits, BLAS build, TLAS build, DO NOT ADD COMPACTIONS IN THIS EXAMPLE!
+				if (compute.getFutureScratchSemaphore().value>3)
+					m_logger->log("Overflow submitted on Compute Queue despite using ReBAR (no transfer submits or usage of staging buffer) and providing a AS Build Scratch Buffer of correctly queried max size!",system::ILogger::ELL_ERROR);
+
+				// assign gpu objects to output
+				auto&& tlases = reservation.getGPUObjects<ICPUTopLevelAccelerationStructure>();
+				m_gpuTlas = tlases[0].value;
+				auto&& buffers = reservation.getGPUObjects<ICPUBuffer>();
+				for (uint32_t i = 0; i < cpuObjects.size(); i++)
 				{
-					return finalUser;
+					auto& cpuObject = cpuObjects[i];
+
+					m_gpuTriangleGeometries.push_back(ReferenceObjectGpu{
+					  .meta = cpuObject.meta,
+					  .bindings = {
+						.vertex = {.offset = 0, .buffer = buffers[2 * i + 0].value },
+						.index = {.offset = 0, .buffer = buffers[2 * i + 1].value },
+					  },
+					  .vertexStride = cpuObject.data.inputParams.bindings[0].stride,
+					  .indexType = cpuObject.data.indexType,
+					  .indexCount = cpuObject.data.indexCount,
+					  .material = hlsl::_static_cast<MaterialPacked>(cpuObject.material),
+					  .transform = cpuObject.transform,
+						});
 				}
+				m_proceduralAabbBuffer = buffers[2 * proceduralBlasIdx].value;
 
-				uint8_t finalUser;
-			} params = {};
-			params.utilities = m_utils.get();
-			params.compute = &compute;
-			params.scratchForDeviceASBuild = scratchAlloc.get();
-			params.finalUser = queue->getFamilyIndex();
-
-			auto future = reservation.convert(params);
-			if (future.copy() != IQueue::RESULT::SUCCESS)
-			{
-				m_logger->log("Failed to await submission feature!", ILogger::ELL_ERROR);
-				return false;
-			}
-			// 2 submits, BLAS build, TLAS build, DO NOT ADD COMPACTIONS IN THIS EXAMPLE!
-			if (compute.getFutureScratchSemaphore().value>3)
-				m_logger->log("Overflow submitted on Compute Queue despite using ReBAR (no transfer submits or usage of staging buffer) and providing a AS Build Scratch Buffer of correctly queried max size!",system::ILogger::ELL_ERROR);
-
-			// assign gpu objects to output
-			auto&& tlases = reservation.getGPUObjects<ICPUTopLevelAccelerationStructure>();
-			m_gpuTlas = tlases[0].value;
-			auto&& buffers = reservation.getGPUObjects<ICPUBuffer>();
-			for (uint32_t i = 0; i < cpuObjects.size(); i++)
-			{
-				auto& cpuObject = cpuObjects[i];
-
-				m_gpuTriangleGeometries.push_back(ReferenceObjectGpu{
-				  .meta = cpuObject.meta,
-				  .bindings = {
-					.vertex = {.offset = 0, .buffer = buffers[2 * i + 0].value },
-					.index = {.offset = 0, .buffer = buffers[2 * i + 1].value },
-				  },
-				  .vertexStride = cpuObject.data.inputParams.bindings[0].stride,
-				  .indexType = cpuObject.data.indexType,
-				  .indexCount = cpuObject.data.indexCount,
-				  .material = hlsl::_static_cast<MaterialPacked>(cpuObject.material),
-				  .transform = cpuObject.transform,
-					});
+				for (uint32_t i = 0; i < m_gpuTriangleGeometries.size(); i++)
+				{
+					const auto& gpuObject = m_gpuTriangleGeometries[i];
+					const uint64_t vertexBufferAddress = gpuObject.bindings.vertex.buffer->getDeviceAddress();
+					geomInfos[i] = {
+					  .material = gpuObject.material,
+					  .vertexBufferAddress = vertexBufferAddress,
+					  .indexBufferAddress = gpuObject.useIndex() ? gpuObject.bindings.index.buffer->getDeviceAddress() : vertexBufferAddress,
+					  .vertexStride = gpuObject.vertexStride,
+					  .objType = gpuObject.meta.type,
+					  .indexType = gpuObject.indexType,
+					  .smoothNormals = s_smoothNormals[gpuObject.meta.type],
+					};
+				}
 			}
-			m_proceduralAabbBuffer = buffers[2 * proceduralBlasIdx].value;
 
-			for (uint32_t i = 0; i < m_gpuTriangleGeometries.size(); i++)
 			{
-				const auto& gpuObject = m_gpuTriangleGeometries[i];
-				const uint64_t vertexBufferAddress = gpuObject.bindings.vertex.buffer->getDeviceAddress();
-				geomInfos[i] = {
-				  .material = gpuObject.material,
-				  .vertexBufferAddress = vertexBufferAddress,
-				  .indexBufferAddress = gpuObject.useIndex() ? gpuObject.bindings.index.buffer->getDeviceAddress() : vertexBufferAddress,
-				  .vertexStride = gpuObject.vertexStride,
-				  .objType = gpuObject.meta.type,
-				  .indexType = gpuObject.indexType,
-				  .smoothNormals = s_smoothNormals[gpuObject.meta.type],
-				};
+				IGPUBuffer::SCreationParams params;
+				params.usage = IGPUBuffer::EUF_STORAGE_BUFFER_BIT | IGPUBuffer::EUF_TRANSFER_DST_BIT | IGPUBuffer::EUF_INLINE_UPDATE_VIA_CMDBUF | IGPUBuffer::EUF_SHADER_DEVICE_ADDRESS_BIT;
+				params.size = geomInfoBuffer->getSize();
+				m_utils->createFilledDeviceLocalBufferOnDedMem(SIntendedSubmitInfo{ .queue = queue }, std::move(params), geomInfos).move_into(m_triangleGeomInfoBuffer);
 			}
-		}
 
-		{
-			IGPUBuffer::SCreationParams params;
-			params.usage = IGPUBuffer::EUF_STORAGE_BUFFER_BIT | IGPUBuffer::EUF_TRANSFER_DST_BIT | IGPUBuffer::EUF_INLINE_UPDATE_VIA_CMDBUF | IGPUBuffer::EUF_SHADER_DEVICE_ADDRESS_BIT;
-			params.size = geomInfoBuffer->getSize();
-			m_utils->createFilledDeviceLocalBufferOnDedMem(SIntendedSubmitInfo{ .queue = queue }, std::move(params), geomInfos).move_into(m_triangleGeomInfoBuffer);
+			return true;
 		}
+#endif
+		smart_refctd_ptr<CAssetConverter> m_converter;
+
+		smart_refctd_ptr<IWindow> m_window;
+		smart_refctd_ptr<CSimpleResizeSurface<ISimpleManagedSurface::ISwapchainResources>> m_surface;
+		smart_refctd_ptr<ISemaphore> m_semaphore;
+		uint64_t m_realFrameIx = 0;
+uint32_t m_frameAccumulationCounter = 0;
+		std::array<smart_refctd_ptr<IGPUCommandBuffer>, MaxFramesInFlight> m_cmdBufs;
+		ISimpleManagedSurface::SAcquireResult m_currentImageAcquire = {};
+
+		core::smart_refctd_ptr<InputSystem> m_inputSystem;
+		InputSystem::ChannelReader<IMouseEventChannel> m_mouse;
+		InputSystem::ChannelReader<IKeyboardEventChannel> m_keyboard;
+
+		struct CameraSetting
+		{
+			float fov = 60.f;
+			float zNear = 0.1f;
+			float zFar = 10000.f;
+			float moveSpeed = 1.f;
+			float rotateSpeed = 1.f;
+			float viewWidth = 10.f;
+			float camYAngle = 165.f / 180.f * 3.14159f;
+			float camXAngle = 32.f / 180.f * 3.14159f;
 
-		return true;
-	}
-
-
-
-	smart_refctd_ptr<IWindow> m_window;
-	smart_refctd_ptr<CSimpleResizeSurface<ISimpleManagedSurface::ISwapchainResources>> m_surface;
-	smart_refctd_ptr<ISemaphore> m_semaphore;
-	uint64_t m_realFrameIx = 0;
-	uint32_t m_frameAccumulationCounter = 0;
-	std::array<smart_refctd_ptr<IGPUCommandBuffer>, MaxFramesInFlight> m_cmdBufs;
-	ISimpleManagedSurface::SAcquireResult m_currentImageAcquire = {};
+		} m_cameraSetting;
+		Camera m_camera = Camera(core::vectorSIMDf(0, 0, 0), core::vectorSIMDf(0, 0, 0), core::matrix4SIMD());
 
-	core::smart_refctd_ptr<InputSystem> m_inputSystem;
-	InputSystem::ChannelReader<IMouseEventChannel> m_mouse;
-	InputSystem::ChannelReader<IKeyboardEventChannel> m_keyboard;
-
-	struct CameraSetting
-	{
-		float fov = 60.f;
-		float zNear = 0.1f;
-		float zFar = 10000.f;
-		float moveSpeed = 1.f;
-		float rotateSpeed = 1.f;
-		float viewWidth = 10.f;
-		float camYAngle = 165.f / 180.f * 3.14159f;
-		float camXAngle = 32.f / 180.f * 3.14159f;
-
-	} m_cameraSetting;
-	Camera m_camera = Camera(core::vectorSIMDf(0, 0, 0), core::vectorSIMDf(0, 0, 0), core::matrix4SIMD());
-
-	Light m_light = {
-	  .direction = {-1.0f, -1.0f, -0.4f},
-	  .position = {10.0f, 15.0f, 8.0f},
-	  .outerCutoff = 0.866025404f, // {cos(radians(30.0f))}, 
-	  .type = ELT_DIRECTIONAL
-	};
-
-	video::CDumbPresentationOracle m_oracle;
+		video::CDumbPresentationOracle m_oracle;
 
+#if 0
 	struct C_UI
 	{
 		nbl::core::smart_refctd_ptr<nbl::ext::imgui::UI> manager;
@@ -1624,11 +1406,6 @@ class RaytracingPipelineApp final : public examples::SimpleWindowedApplication,
 	smart_refctd_ptr<IDescriptorPool> m_presentDsPool;
 	smart_refctd_ptr<IGPUGraphicsPipeline> m_presentPipeline;
 
-	smart_refctd_ptr<CAssetConverter> m_converter;
-
-
-	core::matrix4SIMD m_cachedModelViewProjectionMatrix;
-	bool m_useIndirectCommand = false;
-
+#endif
 };
-NBL_MAIN_FUNC(RaytracingPipelineApp)
+NBL_MAIN_FUNC(MeshLoadersApp)

From 27cd66fe70542eb1147fccf4cd9d4073de493924 Mon Sep 17 00:00:00 2001
From: Przemek <minikers21@gmail.com>
Date: Tue, 3 Jun 2025 15:44:59 +0200
Subject: [PATCH 331/529] Implemented grid dilation

---
 62_CAD/DrawResourcesFiller.cpp                 | 18 ++++++++++++++++++
 62_CAD/main.cpp                                |  7 ++++---
 .../shaders/main_pipeline/fragment_shader.hlsl |  1 +
 .../shaders/main_pipeline/vertex_shader.hlsl   |  9 ++++-----
 4 files changed, 27 insertions(+), 8 deletions(-)

diff --git a/62_CAD/DrawResourcesFiller.cpp b/62_CAD/DrawResourcesFiller.cpp
index eaa8eccd2..517334ad9 100644
--- a/62_CAD/DrawResourcesFiller.cpp
+++ b/62_CAD/DrawResourcesFiller.cpp
@@ -682,6 +682,24 @@ void DrawResourcesFiller::drawGridDTM(
 	gridDTMInfo.gridCellWidth = gridCellWidth;
 	gridDTMInfo.textureID = getImageIndexFromID(textureID, intendedNextSubmit); // for this to be valid and safe, this function needs to be called immediately after `addStaticImage` function to make sure image is in memory
 
+	// determine the thickes line
+	float thickestLineThickness = 0.0f;
+
+	if (dtmSettingsInfo.mode & E_DTM_MODE::OUTLINE)
+	{
+		thickestLineThickness = dtmSettingsInfo.outlineStyleInfo.worldSpaceLineWidth + dtmSettingsInfo.outlineStyleInfo.screenSpaceLineWidth;
+	}
+	else if (dtmSettingsInfo.mode & E_DTM_MODE::CONTOUR)
+	{
+		for (int i = 0; i < dtmSettingsInfo.contourSettingsCount; ++i)
+		{
+			const auto& contourLineStyle = dtmSettingsInfo.contourSettings[i].lineStyleInfo;
+			const float contourLineThickness = contourLineStyle.worldSpaceLineWidth + contourLineStyle.screenSpaceLineWidth;
+			thickestLineThickness = std::max(thickestLineThickness, contourLineThickness);
+		}
+	}
+	gridDTMInfo.thicknessOfTheThickestLine = thickestLineThickness;
+
 	if (dtmSettingsInfo.mode & E_DTM_MODE::OUTLINE)
 	{
 		const bool isOutlineStippled = dtmSettingsInfo.outlineStyleInfo.stipplePatternSize > 0;
diff --git a/62_CAD/main.cpp b/62_CAD/main.cpp
index 27589f1d2..a3f4016d7 100644
--- a/62_CAD/main.cpp
+++ b/62_CAD/main.cpp
@@ -372,7 +372,7 @@ class ComputerAidedDesign final : public examples::SimpleWindowedApplication, pu
 
 	void allocateResources()
 	{
-		drawResourcesFiller = DrawResourcesFiller(core::smart_refctd_ptr(m_utils), getGraphicsQueue());
+		drawResourcesFiller = DrawResourcesFiller(core::smart_refctd_ptr(m_utils), getGraphicsQueue(), core::smart_refctd_ptr(m_logger));
 		
 		size_t bufferSize = 512u * 1024u * 1024u; // 512 MB
 		drawResourcesFiller.allocateResourcesBuffer(m_device.get(), bufferSize);
@@ -3544,7 +3544,7 @@ class ComputerAidedDesign final : public examples::SimpleWindowedApplication, pu
 			dtmInfo.mode |= E_DTM_MODE::CONTOUR;
 
 			dtmInfo.outlineStyleInfo.screenSpaceLineWidth = 0.0f;
-			dtmInfo.outlineStyleInfo.worldSpaceLineWidth = 1.0f;
+			dtmInfo.outlineStyleInfo.worldSpaceLineWidth = 2.0f;
 			dtmInfo.outlineStyleInfo.color = float32_t4(0.0f, 0.39f, 0.0f, 1.0f);
 			std::array<double, 4> outlineStipplePattern = { 0.0f, -5.0f, 20.0f, -5.0f };
 			dtmInfo.outlineStyleInfo.setStipplePatternData(outlineStipplePattern);
@@ -3625,6 +3625,7 @@ class ComputerAidedDesign final : public examples::SimpleWindowedApplication, pu
 			drawResourcesFiller.drawGridDTM(topLeft, worldSpaceExtents, HeightMapCellWidth, heightMapTextureID,  dtmInfo, intendedNextSubmit);
 
 			// draw test polyline
+#if 0
 			{
 				LineStyleInfo style = {};
 				style.screenSpaceLineWidth = 0.0f;
@@ -3644,6 +3645,7 @@ class ComputerAidedDesign final : public examples::SimpleWindowedApplication, pu
 
 				drawResourcesFiller.drawPolyline(polyline, style, intendedNextSubmit);
 			}
+#endif
 		}
 	}
 
@@ -3656,7 +3658,6 @@ class ComputerAidedDesign final : public examples::SimpleWindowedApplication, pu
 	}
 
 protected:
-
 	std::chrono::seconds timeout = std::chrono::seconds(0x7fffFFFFu);
 	clock_t::time_point start;
 
diff --git a/62_CAD/shaders/main_pipeline/fragment_shader.hlsl b/62_CAD/shaders/main_pipeline/fragment_shader.hlsl
index 4a47a65a1..d4f269413 100644
--- a/62_CAD/shaders/main_pipeline/fragment_shader.hlsl
+++ b/62_CAD/shaders/main_pipeline/fragment_shader.hlsl
@@ -569,6 +569,7 @@ float4 fragMain(PSInput input) : SV_TARGET
             textureColor = dtmColor.rgb;
             localAlpha = dtmColor.a;
 
+            // test out of bounds draw
             /*if (outOfBoundsUV)
                 textureColor = float3(0.0f, 1.0f, 0.0f);
             else
diff --git a/62_CAD/shaders/main_pipeline/vertex_shader.hlsl b/62_CAD/shaders/main_pipeline/vertex_shader.hlsl
index 65f3eea64..7f669f34b 100644
--- a/62_CAD/shaders/main_pipeline/vertex_shader.hlsl
+++ b/62_CAD/shaders/main_pipeline/vertex_shader.hlsl
@@ -189,7 +189,6 @@ PSInput main(uint vertexID : SV_VertexID)
         uint32_t subsectionIdx = drawObj.type_subsectionIdx >> 16;
         outV.setObjType(objType);
         outV.setMainObjectIdx(drawObj.mainObjIndex);
-    
 
         MainObject mainObj = loadMainObject(drawObj.mainObjIndex);
         clipProjectionData = getClipProjectionData(mainObj);
@@ -652,6 +651,10 @@ PSInput main(uint vertexID : SV_VertexID)
             uint32_t textureID = vk::RawBufferLoad<uint32_t>(globals.pointers.geometryBuffer + drawObj.geometryAddress + 2 * sizeof(pfloat64_t2), 8u);
             float gridCellWidth = vk::RawBufferLoad<float>(globals.pointers.geometryBuffer + drawObj.geometryAddress + 2 * sizeof(pfloat64_t2) + sizeof(uint32_t), 8u);
             float reciprocalOutlineStipplePatternLength = vk::RawBufferLoad<float>(globals.pointers.geometryBuffer + drawObj.geometryAddress + 2 * sizeof(pfloat64_t2) + sizeof(uint32_t) + sizeof(float), 8u);
+            float thicknessOfTheThickestLine = vk::RawBufferLoad<float>(globals.pointers.geometryBuffer + drawObj.geometryAddress + 2 * sizeof(pfloat64_t2) + sizeof(uint32_t) + 2u * sizeof(float), 8u);
+
+            // for testing purpose
+            thicknessOfTheThickestLine += 200.0f;
 
             const float2 corner = float2(bool2(vertexIdx & 0x1u, vertexIdx >> 1));
             worldSpaceExtents.y = ieee754::flipSign(worldSpaceExtents.y);
@@ -668,10 +671,6 @@ PSInput main(uint vertexID : SV_VertexID)
             outV.setGridDTMScreenSpaceGridExtents(_static_cast<float2>(worldSpaceExtents) * globals.screenToWorldRatio);
             outV.setGridDTMOutlineStipplePatternLengthReciprocal(reciprocalOutlineStipplePatternLength);
 
-            // TODO: finish implementing grid dilation
-            // TODO: calculate actual thicknessOfTheThickestLine
-            float thicknessOfTheThickestLine = 200.0f;
-
             static const float SquareRootOfTwo = 1.4142135f;
             const pfloat64_t dilationFactor = SquareRootOfTwo * thicknessOfTheThickestLine;
             pfloat64_t2 dilationVector = pfloat64_t2(dilationFactor, dilationFactor);

From 157bd8f407e108f258356205b8d5a8c36c2eee5c Mon Sep 17 00:00:00 2001
From: devsh <devsh@devsh.eu>
Date: Wed, 4 Jun 2025 12:06:27 +0200
Subject: [PATCH 332/529] outline how I want stuff reorganised

---
 common/CMakeLists.txt                         |  21 +---
 common/CommonAPI.h                            | 111 ------------------
 common/CommonPCH/PCH.hpp                      |  13 --
 common/include/CEventCallback.hpp             |  49 --------
 common/include/nbl/examples/PCH.hpp           |  22 ++++
 .../{ => nbl/examples/cameras}/CCamera.hpp    |   9 +-
 .../nbl/examples/common/CEventCallback.hpp    |  49 ++++++++
 .../{ => nbl/examples/common}/InputSystem.hpp |   0
 .../common}/SBasicViewParameters.hlsl         |   0
 .../common}/SimpleWindowedApplication.hpp     |   3 +-
 .../geometry/CGeometryCreatorScene.hpp}       |  19 +--
 common/src/camera/CMakeLists.txt              |   7 --
 common/src/empty.cpp                          |   0
 common/src/geometry/CMakeLists.txt            |   1 -
 common/src/geometry/creator/CMakeLists.txt    |  69 -----------
 common/src/{ => nbl/examples}/CMakeLists.txt  |  10 +-
 .../src/nbl/examples/cameras/CMakeLists.txt   |   7 ++
 .../src/nbl/examples/geometry/CMakeLists.txt  |  69 +++++++++++
 .../geometry}/shaders/gc.basic.fragment.hlsl  |   0
 .../geometry}/shaders/gc.basic.vertex.hlsl    |   0
 .../geometry}/shaders/gc.cone.vertex.hlsl     |   0
 .../geometry}/shaders/gc.ico.vertex.hlsl      |   0
 .../geometry}/shaders/grid.fragment.hlsl      |   0
 .../geometry}/shaders/grid.vertex.hlsl        |   0
 .../template/gc.basic.vertex.input.hlsl       |   0
 .../geometry}/shaders/template/gc.common.hlsl |   2 +-
 .../template/gc.cone.vertex.input.hlsl        |   0
 .../shaders/template/gc.ico.vertex.input.hlsl |   0
 .../geometry}/shaders/template/gc.vertex.hlsl |   0
 .../shaders/template/grid.common.hlsl         |   2 +-
 .../nbl/examples/pch}/CMakeLists.txt          |   0
 .../nbl/examples/pch}/main.cpp                |   0
 32 files changed, 179 insertions(+), 284 deletions(-)
 delete mode 100644 common/CommonAPI.h
 delete mode 100644 common/CommonPCH/PCH.hpp
 delete mode 100644 common/include/CEventCallback.hpp
 create mode 100644 common/include/nbl/examples/PCH.hpp
 rename common/include/{ => nbl/examples/cameras}/CCamera.hpp (99%)
 create mode 100644 common/include/nbl/examples/common/CEventCallback.hpp
 rename common/include/{ => nbl/examples/common}/InputSystem.hpp (100%)
 rename common/include/{ => nbl/examples/common}/SBasicViewParameters.hlsl (100%)
 rename common/include/{ => nbl/examples/common}/SimpleWindowedApplication.hpp (99%)
 rename common/include/{CGeomtryCreatorScene.hpp => nbl/examples/geometry/CGeometryCreatorScene.hpp} (99%)
 delete mode 100644 common/src/camera/CMakeLists.txt
 delete mode 100644 common/src/empty.cpp
 delete mode 100644 common/src/geometry/CMakeLists.txt
 delete mode 100644 common/src/geometry/creator/CMakeLists.txt
 rename common/src/{ => nbl/examples}/CMakeLists.txt (64%)
 create mode 100644 common/src/nbl/examples/cameras/CMakeLists.txt
 create mode 100644 common/src/nbl/examples/geometry/CMakeLists.txt
 rename common/src/{geometry/creator => nbl/examples/geometry}/shaders/gc.basic.fragment.hlsl (100%)
 rename common/src/{geometry/creator => nbl/examples/geometry}/shaders/gc.basic.vertex.hlsl (100%)
 rename common/src/{geometry/creator => nbl/examples/geometry}/shaders/gc.cone.vertex.hlsl (100%)
 rename common/src/{geometry/creator => nbl/examples/geometry}/shaders/gc.ico.vertex.hlsl (100%)
 rename common/src/{geometry/creator => nbl/examples/geometry}/shaders/grid.fragment.hlsl (100%)
 rename common/src/{geometry/creator => nbl/examples/geometry}/shaders/grid.vertex.hlsl (100%)
 rename common/src/{geometry/creator => nbl/examples/geometry}/shaders/template/gc.basic.vertex.input.hlsl (100%)
 rename common/src/{geometry/creator => nbl/examples/geometry}/shaders/template/gc.common.hlsl (88%)
 rename common/src/{geometry/creator => nbl/examples/geometry}/shaders/template/gc.cone.vertex.input.hlsl (100%)
 rename common/src/{geometry/creator => nbl/examples/geometry}/shaders/template/gc.ico.vertex.input.hlsl (100%)
 rename common/src/{geometry/creator => nbl/examples/geometry}/shaders/template/gc.vertex.hlsl (100%)
 rename common/src/{geometry/creator => nbl/examples/geometry}/shaders/template/grid.common.hlsl (95%)
 rename common/{CommonPCH => src/nbl/examples/pch}/CMakeLists.txt (100%)
 rename common/{CommonPCH => src/nbl/examples/pch}/main.cpp (100%)

diff --git a/common/CMakeLists.txt b/common/CMakeLists.txt
index d9073f273..32c0ed6cf 100644
--- a/common/CMakeLists.txt
+++ b/common/CMakeLists.txt
@@ -7,22 +7,11 @@
 ##
 
 # interface libraries don't have build rules (except custom commands however it doesn't matter here) but properties
-add_library(nblCommonAPI INTERFACE)
+add_library(nblExamplesAPI INTERFACE)
+# TODO: change every variable prefix from `NBL_COMMON_API` to `NBL_EXAMPLES_API` here and elsewhere
 set(NBL_COMMON_API_INCLUDE_DIRECTORY "${CMAKE_CURRENT_SOURCE_DIR}/include")
-target_include_directories(nblCommonAPI INTERFACE "${NBL_COMMON_API_INCLUDE_DIRECTORY}")
+target_include_directories(nblExamplesAPI INTERFACE "${NBL_COMMON_API_INCLUDE_DIRECTORY}")
 
-add_subdirectory(src EXCLUDE_FROM_ALL)
+add_subdirectory("src/nbl/examples" EXCLUDE_FROM_ALL)
 
-########## <-
-# TODO: disable this CommonPCH thing! + DEPRICATED!
-# TODO: move asset converer into separate library
-
-nbl_create_ext_library_project(CommonAPI "" "${CMAKE_CURRENT_SOURCE_DIR}/src/empty.cpp" "" "" "")
-set(NBL_EXECUTABLE_COMMON_API_TARGET "${LIB_NAME}" CACHE INTERNAL "")
-
-add_subdirectory(CommonPCH EXCLUDE_FROM_ALL)
-
-#target_precompile_headers("${NBL_EXECUTABLE_COMMON_API_TARGET}" REUSE_FROM "${NBL_EXECUTABLE_PROJECT_CREATION_PCH_TARGET}")
-########## <-
-
-set(NBL_COMMON_API_TARGETS nblCommonAPI ${NBL_COMMON_API_TARGETS} ${NBL_EXECUTABLE_COMMON_API_TARGET} PARENT_SCOPE)
+set(NBL_COMMON_API_TARGETS nblExamplesAPI ${NBL_COMMON_API_TARGETS} ${NBL_EXECUTABLE_COMMON_API_TARGET} PARENT_SCOPE)
diff --git a/common/CommonAPI.h b/common/CommonAPI.h
deleted file mode 100644
index aca8c0741..000000000
--- a/common/CommonAPI.h
+++ /dev/null
@@ -1,111 +0,0 @@
-#ifndef __NBL_COMMON_API_H_INCLUDED__
-#define __NBL_COMMON_API_H_INCLUDED__
-
-#include <nabla.h>
-
-#include "MonoSystemMonoLoggerApplication.hpp"
-
-#include "nbl/ui/CGraphicalApplicationAndroid.h"
-#include "nbl/ui/CWindowManagerAndroid.h"
-
-// TODO: see TODO below
-// TODO: make these include themselves via `nabla.h`
-
-#include "nbl/video/utilities/SPhysicalDeviceFilter.h"
-
-#if 0
-class CommonAPI
-{
-	CommonAPI() = delete;
-public:		
-	class CommonAPIEventCallback : public nbl::ui::IWindow::IEventCallback
-	{
-	public:
-		CommonAPIEventCallback(nbl::core::smart_refctd_ptr<InputSystem>&& inputSystem, nbl::system::logger_opt_smart_ptr&& logger) : m_inputSystem(std::move(inputSystem)), m_logger(std::move(logger)), m_gotWindowClosedMsg(false){}
-		CommonAPIEventCallback() {}
-		bool isWindowOpen() const {return !m_gotWindowClosedMsg;}
-		void setLogger(nbl::system::logger_opt_smart_ptr& logger)
-		{
-			m_logger = logger;
-		}
-		void setInputSystem(nbl::core::smart_refctd_ptr<InputSystem>&& inputSystem)
-		{
-			m_inputSystem = std::move(inputSystem);
-		}
-	private:
-		
-		bool onWindowClosed_impl() override
-		{
-			m_logger.log("Window closed");
-			m_gotWindowClosedMsg = true;
-			return true;
-		}
-
-		void onMouseConnected_impl(nbl::core::smart_refctd_ptr<nbl::ui::IMouseEventChannel>&& mch) override
-		{
-			m_logger.log("A mouse %p has been connected", nbl::system::ILogger::ELL_INFO, mch.get());
-			m_inputSystem.get()->add(m_inputSystem.get()->m_mouse,std::move(mch));
-		}
-		void onMouseDisconnected_impl(nbl::ui::IMouseEventChannel* mch) override
-		{
-			m_logger.log("A mouse %p has been disconnected", nbl::system::ILogger::ELL_INFO, mch);
-			m_inputSystem.get()->remove(m_inputSystem.get()->m_mouse,mch);
-		}
-		void onKeyboardConnected_impl(nbl::core::smart_refctd_ptr<nbl::ui::IKeyboardEventChannel>&& kbch) override
-		{
-			m_logger.log("A keyboard %p has been connected", nbl::system::ILogger::ELL_INFO, kbch.get());
-			m_inputSystem.get()->add(m_inputSystem.get()->m_keyboard,std::move(kbch));
-		}
-		void onKeyboardDisconnected_impl(nbl::ui::IKeyboardEventChannel* kbch) override
-		{
-			m_logger.log("A keyboard %p has been disconnected", nbl::system::ILogger::ELL_INFO, kbch);
-			m_inputSystem.get()->remove(m_inputSystem.get()->m_keyboard,kbch);
-		}
-
-	private:
-		nbl::core::smart_refctd_ptr<InputSystem> m_inputSystem = nullptr;
-		nbl::system::logger_opt_smart_ptr m_logger = nullptr;
-		bool m_gotWindowClosedMsg;
-	};
-
-	// old code from init
-	{
-		// ... 
-
-		result.inputSystem = nbl::core::make_smart_refctd_ptr<InputSystem>(system::logger_opt_smart_ptr(nbl::core::smart_refctd_ptr(result.logger)));
-		result.assetManager = nbl::core::make_smart_refctd_ptr<nbl::asset::IAssetManager>(nbl::core::smart_refctd_ptr(result.system), nbl::core::smart_refctd_ptr(result.compilerSet)); // we should let user choose it?
-
-		if (!headlessCompute)
-		{
-			params.windowCb->setInputSystem(nbl::core::smart_refctd_ptr(result.inputSystem));
-			if (!params.window)
-			{
-				#ifdef _NBL_PLATFORM_WINDOWS_
-					result.windowManager = ui::IWindowManagerWin32::create(); // on the Windows path
-				#elif defined(_NBL_PLATFORM_LINUX_)
-					result.windowManager = nbl::core::make_smart_refctd_ptr<nbl::ui::CWindowManagerX11>(); // on the Android path
-				#else
-					#error "Unsupported platform"
-				#endif
-				
-				nbl::ui::IWindow::SCreationParams windowsCreationParams;
-				windowsCreationParams.width = params.windowWidth;
-				windowsCreationParams.height = params.windowHeight;
-				windowsCreationParams.x = 64u;
-				windowsCreationParams.y = 64u;
-				windowsCreationParams.flags = nbl::ui::IWindow::ECF_RESIZABLE;
-				windowsCreationParams.windowCaption = params.appName.data();
-				windowsCreationParams.callback = params.windowCb;
-
-				params.window = result.windowManager->createWindow(std::move(windowsCreationParams));
-			}
-			params.windowCb = nbl::core::smart_refctd_ptr<CommonAPIEventCallback>((CommonAPIEventCallback*) params.window->getEventCallback());
-		}
-
-		// ...
-	}
-};
-
-#endif
-
-#endif
diff --git a/common/CommonPCH/PCH.hpp b/common/CommonPCH/PCH.hpp
deleted file mode 100644
index 5b9d6a433..000000000
--- a/common/CommonPCH/PCH.hpp
+++ /dev/null
@@ -1,13 +0,0 @@
-// Copyright (C) 2018-2022 - DevSH Graphics Programming Sp. z O.O.
-// This file is part of the "Nabla Engine".
-// For conditions of distribution and use, see copyright notice in nabla.h
-#ifndef _EXAMPLES_COMMON_PCH_HPP_
-#define _EXAMPLES_COMMON_PCH_HPP_
-
-#include <nabla.h>
-
-#include <InputSystem.hpp>
-#include <CCamera.hpp>
-#include <SimpleWindowedApplication.hpp>
-
-#endif // _EXAMPLES_COMMON_PCH_HPP_
\ No newline at end of file
diff --git a/common/include/CEventCallback.hpp b/common/include/CEventCallback.hpp
deleted file mode 100644
index 2d4e36932..000000000
--- a/common/include/CEventCallback.hpp
+++ /dev/null
@@ -1,49 +0,0 @@
-#ifndef __NBL_C_EVENT_CALLBACK_HPP_INCLUDED__
-#define __NBL_C_EVENT_CALLBACK_HPP_INCLUDED__
-
-#include "nbl/video/utilities/CSimpleResizeSurface.h"
-#include "InputSystem.hpp"
-
-class CEventCallback : public nbl::video::ISimpleManagedSurface::ICallback
-{
-public:
-	CEventCallback(nbl::core::smart_refctd_ptr<InputSystem>&& m_inputSystem, nbl::system::logger_opt_smart_ptr&& logger) : m_inputSystem(std::move(m_inputSystem)), m_logger(std::move(logger)) {}
-	CEventCallback() {}
-
-	void setLogger(nbl::system::logger_opt_smart_ptr& logger)
-	{
-		m_logger = logger;
-	}
-	void setInputSystem(nbl::core::smart_refctd_ptr<InputSystem>&& m_inputSystem)
-	{
-		m_inputSystem = std::move(m_inputSystem);
-	}
-private:
-
-	void onMouseConnected_impl(nbl::core::smart_refctd_ptr<nbl::ui::IMouseEventChannel>&& mch) override
-	{
-		m_logger.log("A mouse %p has been connected", nbl::system::ILogger::ELL_INFO, mch.get());
-		m_inputSystem.get()->add(m_inputSystem.get()->m_mouse, std::move(mch));
-	}
-	void onMouseDisconnected_impl(nbl::ui::IMouseEventChannel* mch) override
-	{
-		m_logger.log("A mouse %p has been disconnected", nbl::system::ILogger::ELL_INFO, mch);
-		m_inputSystem.get()->remove(m_inputSystem.get()->m_mouse, mch);
-	}
-	void onKeyboardConnected_impl(nbl::core::smart_refctd_ptr<nbl::ui::IKeyboardEventChannel>&& kbch) override
-	{
-		m_logger.log("A keyboard %p has been connected", nbl::system::ILogger::ELL_INFO, kbch.get());
-		m_inputSystem.get()->add(m_inputSystem.get()->m_keyboard, std::move(kbch));
-	}
-	void onKeyboardDisconnected_impl(nbl::ui::IKeyboardEventChannel* kbch) override
-	{
-		m_logger.log("A keyboard %p has been disconnected", nbl::system::ILogger::ELL_INFO, kbch);
-		m_inputSystem.get()->remove(m_inputSystem.get()->m_keyboard, kbch);
-	}
-
-private:
-	nbl::core::smart_refctd_ptr<InputSystem> m_inputSystem = nullptr;
-	nbl::system::logger_opt_smart_ptr m_logger = nullptr;
-};
-
-#endif // __NBL_C_EVENT_CALLBACK_HPP_INCLUDED__
\ No newline at end of file
diff --git a/common/include/nbl/examples/PCH.hpp b/common/include/nbl/examples/PCH.hpp
new file mode 100644
index 000000000..7a1b6bdc6
--- /dev/null
+++ b/common/include/nbl/examples/PCH.hpp
@@ -0,0 +1,22 @@
+// Copyright (C) 2018-2025 - DevSH Graphics Programming Sp. z O.O.
+// This file is part of the "Nabla Engine".
+// For conditions of distribution and use, see copyright notice in nabla.h
+#ifndef _NBL_EXAMPLES_PCH_HPP_
+#define _NBL_EXAMPLES_PCH_HPP_
+
+
+#include <nabla.h>
+
+// #include "nbl/ui/CGraphicalApplicationAndroid.h"
+// #include "nbl/ui/CWindowManagerAndroid.h"
+
+#include "nbl/examples/common/SimpleWindowedApplication.hpp"
+#include "nbl/examples/common/InputSystem.hpp"
+#include "nbl/examples/common/CEventCallback.hpp"
+
+#include "nbl/examples/cameras/CCamera.hpp"
+
+#include "nbl/examples/geometry/CGeometryCreatorScene.hpp"
+
+
+#endif // _NBL_EXAMPLES_COMMON_PCH_HPP_
\ No newline at end of file
diff --git a/common/include/CCamera.hpp b/common/include/nbl/examples/cameras/CCamera.hpp
similarity index 99%
rename from common/include/CCamera.hpp
rename to common/include/nbl/examples/cameras/CCamera.hpp
index 1b0fe9c0f..3b3cd38d8 100644
--- a/common/include/CCamera.hpp
+++ b/common/include/nbl/examples/cameras/CCamera.hpp
@@ -1,16 +1,18 @@
 // Copyright (C) 2018-2020 - DevSH Graphics Programming Sp. z O.O.
 // This file is part of the "Nabla Engine".
 // For conditions of distribution and use, see copyright notice in nabla.h
+#ifndef _NBL_COMMON_CAMERA_IMPL_
+#define _NBL_COMMON_CAMERA_IMPL_
 
-#ifndef _CAMERA_IMPL_
-#define _CAMERA_IMPL_
 
 #include <nabla.h>
+
 #include <iostream>
 #include <cstdio>
 #include <fstream>
 #include <chrono>
 
+
 class Camera 
 { 
 public:
@@ -322,5 +324,4 @@ class Camera
 
 	std::chrono::microseconds nextPresentationTimeStamp, lastVirtualUpTimeStamp;
 };
-
-#endif // _CAMERA_IMPL_
\ No newline at end of file
+#endif 
\ No newline at end of file
diff --git a/common/include/nbl/examples/common/CEventCallback.hpp b/common/include/nbl/examples/common/CEventCallback.hpp
new file mode 100644
index 000000000..4670ca7f6
--- /dev/null
+++ b/common/include/nbl/examples/common/CEventCallback.hpp
@@ -0,0 +1,49 @@
+#ifndef _NBL_COMMON_C_EVENT_CALLBACK_HPP_INCLUDED_
+#define _NBL_C_EVENT_CALLBACK_HPP_INCLUDED_
+
+#include "nbl/video/utilities/CSimpleResizeSurface.h"
+#include "InputSystem.hpp"
+
+class CEventCallback : public nbl::video::ISimpleManagedSurface::ICallback
+{
+	public:
+		CEventCallback(nbl::core::smart_refctd_ptr<InputSystem>&& m_inputSystem, nbl::system::logger_opt_smart_ptr&& logger) : m_inputSystem(std::move(m_inputSystem)), m_logger(std::move(logger)) {}
+		CEventCallback() {}
+
+		void setLogger(nbl::system::logger_opt_smart_ptr& logger)
+		{
+			m_logger = logger;
+		}
+		void setInputSystem(nbl::core::smart_refctd_ptr<InputSystem>&& m_inputSystem)
+		{
+			m_inputSystem = std::move(m_inputSystem);
+		}
+
+	private:
+		void onMouseConnected_impl(nbl::core::smart_refctd_ptr<nbl::ui::IMouseEventChannel>&& mch) override
+		{
+			m_logger.log("A mouse %p has been connected", nbl::system::ILogger::ELL_INFO, mch.get());
+			m_inputSystem.get()->add(m_inputSystem.get()->m_mouse, std::move(mch));
+		}
+		void onMouseDisconnected_impl(nbl::ui::IMouseEventChannel* mch) override
+		{
+			m_logger.log("A mouse %p has been disconnected", nbl::system::ILogger::ELL_INFO, mch);
+			m_inputSystem.get()->remove(m_inputSystem.get()->m_mouse, mch);
+		}
+		void onKeyboardConnected_impl(nbl::core::smart_refctd_ptr<nbl::ui::IKeyboardEventChannel>&& kbch) override
+		{
+			m_logger.log("A keyboard %p has been connected", nbl::system::ILogger::ELL_INFO, kbch.get());
+			m_inputSystem.get()->add(m_inputSystem.get()->m_keyboard, std::move(kbch));
+		}
+		void onKeyboardDisconnected_impl(nbl::ui::IKeyboardEventChannel* kbch) override
+		{
+			m_logger.log("A keyboard %p has been disconnected", nbl::system::ILogger::ELL_INFO, kbch);
+			m_inputSystem.get()->remove(m_inputSystem.get()->m_keyboard, kbch);
+		}
+
+	private:
+		nbl::core::smart_refctd_ptr<InputSystem> m_inputSystem = nullptr;
+		nbl::system::logger_opt_smart_ptr m_logger = nullptr;
+};
+
+#endif // _NBL_C_EVENT_CALLBACK_HPP_INCLUDED_
\ No newline at end of file
diff --git a/common/include/InputSystem.hpp b/common/include/nbl/examples/common/InputSystem.hpp
similarity index 100%
rename from common/include/InputSystem.hpp
rename to common/include/nbl/examples/common/InputSystem.hpp
diff --git a/common/include/SBasicViewParameters.hlsl b/common/include/nbl/examples/common/SBasicViewParameters.hlsl
similarity index 100%
rename from common/include/SBasicViewParameters.hlsl
rename to common/include/nbl/examples/common/SBasicViewParameters.hlsl
diff --git a/common/include/SimpleWindowedApplication.hpp b/common/include/nbl/examples/common/SimpleWindowedApplication.hpp
similarity index 99%
rename from common/include/SimpleWindowedApplication.hpp
rename to common/include/nbl/examples/common/SimpleWindowedApplication.hpp
index 802a93188..ddb510eb7 100644
--- a/common/include/SimpleWindowedApplication.hpp
+++ b/common/include/nbl/examples/common/SimpleWindowedApplication.hpp
@@ -88,5 +88,4 @@ class SimpleWindowedApplication : public virtual application_templates::BasicMul
 };
 
 }
-
-#endif // _CAMERA_IMPL_
\ No newline at end of file
+#endif
\ No newline at end of file
diff --git a/common/include/CGeomtryCreatorScene.hpp b/common/include/nbl/examples/geometry/CGeometryCreatorScene.hpp
similarity index 99%
rename from common/include/CGeomtryCreatorScene.hpp
rename to common/include/nbl/examples/geometry/CGeometryCreatorScene.hpp
index 0d9bc6edd..7a3f253f3 100644
--- a/common/include/CGeomtryCreatorScene.hpp
+++ b/common/include/nbl/examples/geometry/CGeometryCreatorScene.hpp
@@ -1,14 +1,19 @@
-#ifndef _NBL_GEOMETRY_CREATOR_SCENE_H_INCLUDED_
-#define _NBL_GEOMETRY_CREATOR_SCENE_H_INCLUDED_
+#ifndef _NBL_EXAMPLES_C_GEOMETRY_CREATOR_SCENE_H_INCLUDED_
+#define _NBL_EXAMPLES_C_GEOMETRY_CREATOR_SCENE_H_INCLUDED_
+
 
 #include <nabla.h>
 
-#include "nbl/asset/utils/CGeometryCreator.h"
-#include "SBasicViewParameters.hlsl"
-#include "geometry/creator/spirv/builtin/CArchive.h"
-#include "geometry/creator/spirv/builtin/builtinResources.h"
+#include "nbl/asset/utils/CPolygonGeometryCreator.h"
+
+// soon to be deprecated!
+#include "nbl/examples/common/SBasicViewParameters.hlsl"
+
+#include "nbl/examples/geometry/creator/spirv/builtin/CArchive.h"
+#include "nbl/examples/geometry/creator/spirv/builtin/builtinResources.h"
+
 
-namespace nbl::scene::geometrycreator
+namespace nbl::examples
 {
 
 enum ObjectType : uint8_t
diff --git a/common/src/camera/CMakeLists.txt b/common/src/camera/CMakeLists.txt
deleted file mode 100644
index eedf690aa..000000000
--- a/common/src/camera/CMakeLists.txt
+++ /dev/null
@@ -1,7 +0,0 @@
-# header only currently
-
-#set(NBL_LIB_SOURCES
-#    "${CMAKE_CURRENT_SOURCE_DIR}/*.cpp"
-#)
-
-#nbl_create_ext_library_project(Camera "" "${NBL_LIB_SOURCES}" "" "" "")
\ No newline at end of file
diff --git a/common/src/empty.cpp b/common/src/empty.cpp
deleted file mode 100644
index e69de29bb..000000000
diff --git a/common/src/geometry/CMakeLists.txt b/common/src/geometry/CMakeLists.txt
deleted file mode 100644
index fb33ec637..000000000
--- a/common/src/geometry/CMakeLists.txt
+++ /dev/null
@@ -1 +0,0 @@
-add_subdirectory(creator EXCLUDE_FROM_ALL)
\ No newline at end of file
diff --git a/common/src/geometry/creator/CMakeLists.txt b/common/src/geometry/creator/CMakeLists.txt
deleted file mode 100644
index 336d32fe5..000000000
--- a/common/src/geometry/creator/CMakeLists.txt
+++ /dev/null
@@ -1,69 +0,0 @@
-# shaders IO directories
-set(NBL_THIS_EXAMPLE_INPUT_SHADERS_DIRECTORY "${CMAKE_CURRENT_SOURCE_DIR}/shaders")
-get_filename_component(_THIS_EXAMPLE_SPIRV_BR_BUNDLE_SEARCH_DIRECTORY_ "${CMAKE_CURRENT_BINARY_DIR}/shaders/include" ABSOLUTE)
-get_filename_component(_THIS_EXAMPLE_SPIRV_BR_OUTPUT_DIRECTORY_HEADER_ "${CMAKE_CURRENT_BINARY_DIR}/builtin/include" ABSOLUTE)
-get_filename_component(_THIS_EXAMPLE_SPIRV_BR_OUTPUT_DIRECTORY_SOURCE_ "${CMAKE_CURRENT_BINARY_DIR}/builtin/src" ABSOLUTE)
-set(NBL_THIS_EXAMPLE_OUTPUT_SPIRV_DIRECTORY "${_THIS_EXAMPLE_SPIRV_BR_BUNDLE_SEARCH_DIRECTORY_}/nbl/geometryCreator/spirv")
-
-# list of input source shaders
-set(NBL_THIS_EXAMPLE_INPUT_SHADERS
-	# geometry creator
-	"${NBL_THIS_EXAMPLE_INPUT_SHADERS_DIRECTORY}/gc.basic.fragment.hlsl"
-	"${NBL_THIS_EXAMPLE_INPUT_SHADERS_DIRECTORY}/gc.basic.vertex.hlsl"
-	"${NBL_THIS_EXAMPLE_INPUT_SHADERS_DIRECTORY}/gc.cone.vertex.hlsl"
-	"${NBL_THIS_EXAMPLE_INPUT_SHADERS_DIRECTORY}/gc.ico.vertex.hlsl"
-	
-	# grid
-	"${NBL_THIS_EXAMPLE_INPUT_SHADERS_DIRECTORY}/grid.vertex.hlsl"
-	"${NBL_THIS_EXAMPLE_INPUT_SHADERS_DIRECTORY}/grid.fragment.hlsl"
-)
-
-file(GLOB_RECURSE NBL_THIS_EXAMPLE_INPUT_COMMONS CONFIGURE_DEPENDS "${NBL_THIS_EXAMPLE_INPUT_SHADERS_DIRECTORY}/template/*.hlsl")
-
-include("${NBL_ROOT_PATH}/src/nbl/builtin/utils.cmake")
-
-foreach(NBL_INPUT_SHADER IN LISTS NBL_THIS_EXAMPLE_INPUT_SHADERS)
-	cmake_path(GET NBL_INPUT_SHADER FILENAME NBL_INPUT_SHADER_FILENAME)
-	cmake_path(GET NBL_INPUT_SHADER_FILENAME STEM LAST_ONLY NBL_SHADER_STEM) # filename without .hlsl extension
-	cmake_path(GET NBL_SHADER_STEM EXTENSION LAST_ONLY NBL_SHADER_TYPE) # .<shader type>
-	
-	set(NBL_OUTPUT_SPIRV_FILENAME "${NBL_SHADER_STEM}.spv")
-	set(NBL_OUTPUT_SPIRV_PATH "${NBL_THIS_EXAMPLE_OUTPUT_SPIRV_DIRECTORY}/${NBL_OUTPUT_SPIRV_FILENAME}")
-
-	if(NBL_SHADER_TYPE STREQUAL .vertex)
-		set(NBL_NSC_COMPILE_OPTIONS -T vs_6_7 -E VSMain)
-	elseif(NBL_SHADER_TYPE STREQUAL .geometry)
-		set(NBL_NSC_COMPILE_OPTIONS -T gs_6_7 -E GSMain)
-	elseif(NBL_SHADER_TYPE STREQUAL .fragment)
-		set(NBL_NSC_COMPILE_OPTIONS -T ps_6_7 -E PSMain)
-	else()
-		message(FATAL_ERROR "Input shader is supposed to be <name>.<shader type>.hlsl!")
-	endif()
-	
-	set(NBL_NSC_COMPILE_COMMAND
-		"$<TARGET_FILE:nsc>"
-		-Fc "${NBL_OUTPUT_SPIRV_PATH}"
-		-I "${NBL_COMMON_API_INCLUDE_DIRECTORY}"
-		${NBL_NSC_COMPILE_OPTIONS} # this should come from shader's [#pragma WAVE <compile options>] but our NSC doesn't seem to work properly currently
-		"${NBL_INPUT_SHADER}"
-	)
-	
-	set(NBL_DEPENDS
-		"${NBL_INPUT_SHADER}"
-		${NBL_THIS_EXAMPLE_INPUT_COMMONS}
-	)
-		
-	add_custom_command(OUTPUT "${NBL_OUTPUT_SPIRV_PATH}"
-	   COMMAND ${NBL_NSC_COMPILE_COMMAND}
-	   DEPENDS ${NBL_DEPENDS}
-	   WORKING_DIRECTORY "${NBL_THIS_EXAMPLE_INPUT_SHADERS_DIRECTORY}"
-	   COMMENT "Generating \"${NBL_OUTPUT_SPIRV_PATH}\""
-	   VERBATIM
-	   COMMAND_EXPAND_LISTS
-	)
-	
-	list(APPEND NBL_THIS_EXAMPLE_OUTPUT_SPIRV_BUILTINS "${NBL_OUTPUT_SPIRV_PATH}")
-	LIST_BUILTIN_RESOURCE(GEOMETRY_CREATOR_SPIRV_RESOURCES_TO_EMBED "geometryCreator/spirv/${NBL_OUTPUT_SPIRV_FILENAME}")
-endforeach()
-
-ADD_CUSTOM_BUILTIN_RESOURCES(geometryCreatorSpirvBRD GEOMETRY_CREATOR_SPIRV_RESOURCES_TO_EMBED "${_THIS_EXAMPLE_SPIRV_BR_BUNDLE_SEARCH_DIRECTORY_}" "nbl" "geometry::creator::spirv::builtin" "${_THIS_EXAMPLE_SPIRV_BR_OUTPUT_DIRECTORY_HEADER_}" "${_THIS_EXAMPLE_SPIRV_BR_OUTPUT_DIRECTORY_SOURCE_}" "STATIC" "INTERNAL")
\ No newline at end of file
diff --git a/common/src/CMakeLists.txt b/common/src/nbl/examples/CMakeLists.txt
similarity index 64%
rename from common/src/CMakeLists.txt
rename to common/src/nbl/examples/CMakeLists.txt
index 1399b949e..96ccaabea 100644
--- a/common/src/CMakeLists.txt
+++ b/common/src/nbl/examples/CMakeLists.txt
@@ -1,5 +1,8 @@
+# TODO: @AnastaZluk redo the PCH
+# add_subdirectory(pch EXCLUDE_FROM_ALL)
+
 # we add common libraries
-# add_subdirectory(camera EXCLUDE_FROM_ALL) # header only currently
+# add_subdirectory(cameras EXCLUDE_FROM_ALL) # header only currently
 add_subdirectory(geometry EXCLUDE_FROM_ALL)
 
 # we get all available targets inclusive & below this directory
@@ -7,8 +10,9 @@ NBL_GET_ALL_TARGETS(NBL_SUBDIRECTORY_TARGETS)
 
 # then we expose common include search directories to all common libraries + create link interface
 foreach(NBL_TARGET IN LISTS NBL_SUBDIRECTORY_TARGETS)
-    target_include_directories(${NBL_TARGET} PUBLIC $<TARGET_PROPERTY:nblCommonAPI,INTERFACE_INCLUDE_DIRECTORIES>)
-    target_link_libraries(nblCommonAPI INTERFACE ${NBL_TARGET})
+    target_include_directories(${NBL_TARGET} PUBLIC $<TARGET_PROPERTY:nblExamplesAPI,INTERFACE_INCLUDE_DIRECTORIES>)
+    target_link_libraries(nblExamplesAPI INTERFACE ${NBL_TARGET})
 endforeach()
 
+#
 set(NBL_COMMON_API_TARGETS ${NBL_SUBDIRECTORY_TARGETS} PARENT_SCOPE)
\ No newline at end of file
diff --git a/common/src/nbl/examples/cameras/CMakeLists.txt b/common/src/nbl/examples/cameras/CMakeLists.txt
new file mode 100644
index 000000000..0b0e59cdc
--- /dev/null
+++ b/common/src/nbl/examples/cameras/CMakeLists.txt
@@ -0,0 +1,7 @@
+# header only currently
+
+#set(NBL_EXAMPLES_CAMERA_LIB_SOURCES
+#    "${CMAKE_CURRENT_SOURCE_DIR}/*.cpp"
+#)
+
+#nbl_create_ext_library_project(ExampleCameras "" "${NBL_EXAMPLES_CAMERA_LIB_SOURCES}" "" "" "")
\ No newline at end of file
diff --git a/common/src/nbl/examples/geometry/CMakeLists.txt b/common/src/nbl/examples/geometry/CMakeLists.txt
new file mode 100644
index 000000000..0eb09263b
--- /dev/null
+++ b/common/src/nbl/examples/geometry/CMakeLists.txt
@@ -0,0 +1,69 @@
+# shaders IO directories
+set(NBL_EXAMPLES_GEOMETRY_INPUT_SHADERS_DIRECTORY "${CMAKE_CURRENT_SOURCE_DIR}/shaders")
+get_filename_component(_EXAMPLES_GEOMETRY_SPIRV_BR_BUNDLE_SEARCH_DIRECTORY_ "${CMAKE_CURRENT_BINARY_DIR}/shaders/include" ABSOLUTE)
+get_filename_component(_EXAMPLES_GEOMETRY_SPIRV_BR_OUTPUT_DIRECTORY_HEADER_ "${CMAKE_CURRENT_BINARY_DIR}/builtin/include" ABSOLUTE)
+get_filename_component(_EXAMPLES_GEOMETRY_SPIRV_BR_OUTPUT_DIRECTORY_SOURCE_ "${CMAKE_CURRENT_BINARY_DIR}/builtin/src" ABSOLUTE)
+set(NBL_EXAMPLES_GEOMETRY_OUTPUT_SPIRV_DIRECTORY "${_EXAMPLES_GEOMETRY_SPIRV_BR_BUNDLE_SEARCH_DIRECTORY_}/nbl/examples/geometry/spirv")
+
+# list of input source shaders
+set(NBL_EXAMPLES_GEOMETRY_INPUT_SHADERS
+	# geometry creator
+	"${NBL_EXAMPLES_GEOMETRY_INPUT_SHADERS_DIRECTORY}/gc.basic.fragment.hlsl"
+	"${NBL_EXAMPLES_GEOMETRY_INPUT_SHADERS_DIRECTORY}/gc.basic.vertex.hlsl"
+	"${NBL_EXAMPLES_GEOMETRY_INPUT_SHADERS_DIRECTORY}/gc.cone.vertex.hlsl"
+	"${NBL_EXAMPLES_GEOMETRY_INPUT_SHADERS_DIRECTORY}/gc.ico.vertex.hlsl"
+	
+	# grid
+	"${NBL_EXAMPLES_GEOMETRY_INPUT_SHADERS_DIRECTORY}/grid.vertex.hlsl"
+	"${NBL_EXAMPLES_GEOMETRY_INPUT_SHADERS_DIRECTORY}/grid.fragment.hlsl"
+)
+
+file(GLOB_RECURSE NBL_EXAMPLES_GEOMETRY_INPUT_COMMONS CONFIGURE_DEPENDS "${NBL_EXAMPLES_GEOMETRY_INPUT_SHADERS_DIRECTORY}/template/*.hlsl")
+
+include("${NBL_ROOT_PATH}/src/nbl/builtin/utils.cmake")
+
+foreach(NBL_INPUT_SHADER IN LISTS NBL_EXAMPLES_GEOMETRY_INPUT_SHADERS)
+	cmake_path(GET NBL_INPUT_SHADER FILENAME NBL_INPUT_SHADER_FILENAME)
+	cmake_path(GET NBL_INPUT_SHADER_FILENAME STEM LAST_ONLY NBL_SHADER_STEM) # filename without .hlsl extension
+	cmake_path(GET NBL_SHADER_STEM EXTENSION LAST_ONLY NBL_SHADER_TYPE) # .<shader type>
+	
+	set(NBL_OUTPUT_SPIRV_FILENAME "${NBL_SHADER_STEM}.spv")
+	set(NBL_OUTPUT_SPIRV_PATH "${NBL_EXAMPLES_GEOMETRY_OUTPUT_SPIRV_DIRECTORY}/${NBL_OUTPUT_SPIRV_FILENAME}")
+
+	if(NBL_SHADER_TYPE STREQUAL .vertex)
+		set(NBL_NSC_COMPILE_OPTIONS -T vs_6_8 -E VSMain)
+	elseif(NBL_SHADER_TYPE STREQUAL .geometry)
+		set(NBL_NSC_COMPILE_OPTIONS -T gs_6_8 -E GSMain)
+	elseif(NBL_SHADER_TYPE STREQUAL .fragment)
+		set(NBL_NSC_COMPILE_OPTIONS -T ps_6_8 -E PSMain)
+	else()
+		message(FATAL_ERROR "Input shader is supposed to be <name>.<shader type>.hlsl!")
+	endif()
+	
+	set(NBL_NSC_COMPILE_COMMAND
+		"$<TARGET_FILE:nsc>"
+		-Fc "${NBL_OUTPUT_SPIRV_PATH}"
+		-I "${NBL_COMMON_API_INCLUDE_DIRECTORY}"
+		${NBL_NSC_COMPILE_OPTIONS} # this should come from shader's [#pragma WAVE <compile options>] but our NSC doesn't seem to work properly currently
+		"${NBL_INPUT_SHADER}"
+	)
+	
+	set(NBL_DEPENDS
+		"${NBL_INPUT_SHADER}"
+		${NBL_EXAMPLES_GEOMETRY_INPUT_COMMONS}
+	)
+		
+	add_custom_command(OUTPUT "${NBL_OUTPUT_SPIRV_PATH}"
+	   COMMAND ${NBL_NSC_COMPILE_COMMAND}
+	   DEPENDS ${NBL_DEPENDS}
+	   WORKING_DIRECTORY "${NBL_EXAMPLES_GEOMETRY_INPUT_SHADERS_DIRECTORY}"
+	   COMMENT "Generating \"${NBL_OUTPUT_SPIRV_PATH}\""
+	   VERBATIM
+	   COMMAND_EXPAND_LISTS
+	)
+	
+	list(APPEND NBL_EXAMPLES_GEOMETRY_OUTPUT_SPIRV_BUILTINS "${NBL_OUTPUT_SPIRV_PATH}")
+	LIST_BUILTIN_RESOURCE(GEOMETRY_CREATOR_SPIRV_RESOURCES_TO_EMBED "geometry/spirv/${NBL_OUTPUT_SPIRV_FILENAME}")
+endforeach()
+
+ADD_CUSTOM_BUILTIN_RESOURCES(geometryCreatorSpirvBRD GEOMETRY_CREATOR_SPIRV_RESOURCES_TO_EMBED "${_EXAMPLES_GEOMETRY_SPIRV_BR_BUNDLE_SEARCH_DIRECTORY_}" "nbl" "geometry::spirv::builtin" "${_EXAMPLES_GEOMETRY_SPIRV_BR_OUTPUT_DIRECTORY_HEADER_}" "${_EXAMPLES_GEOMETRY_SPIRV_BR_OUTPUT_DIRECTORY_SOURCE_}" "STATIC" "INTERNAL")
\ No newline at end of file
diff --git a/common/src/geometry/creator/shaders/gc.basic.fragment.hlsl b/common/src/nbl/examples/geometry/shaders/gc.basic.fragment.hlsl
similarity index 100%
rename from common/src/geometry/creator/shaders/gc.basic.fragment.hlsl
rename to common/src/nbl/examples/geometry/shaders/gc.basic.fragment.hlsl
diff --git a/common/src/geometry/creator/shaders/gc.basic.vertex.hlsl b/common/src/nbl/examples/geometry/shaders/gc.basic.vertex.hlsl
similarity index 100%
rename from common/src/geometry/creator/shaders/gc.basic.vertex.hlsl
rename to common/src/nbl/examples/geometry/shaders/gc.basic.vertex.hlsl
diff --git a/common/src/geometry/creator/shaders/gc.cone.vertex.hlsl b/common/src/nbl/examples/geometry/shaders/gc.cone.vertex.hlsl
similarity index 100%
rename from common/src/geometry/creator/shaders/gc.cone.vertex.hlsl
rename to common/src/nbl/examples/geometry/shaders/gc.cone.vertex.hlsl
diff --git a/common/src/geometry/creator/shaders/gc.ico.vertex.hlsl b/common/src/nbl/examples/geometry/shaders/gc.ico.vertex.hlsl
similarity index 100%
rename from common/src/geometry/creator/shaders/gc.ico.vertex.hlsl
rename to common/src/nbl/examples/geometry/shaders/gc.ico.vertex.hlsl
diff --git a/common/src/geometry/creator/shaders/grid.fragment.hlsl b/common/src/nbl/examples/geometry/shaders/grid.fragment.hlsl
similarity index 100%
rename from common/src/geometry/creator/shaders/grid.fragment.hlsl
rename to common/src/nbl/examples/geometry/shaders/grid.fragment.hlsl
diff --git a/common/src/geometry/creator/shaders/grid.vertex.hlsl b/common/src/nbl/examples/geometry/shaders/grid.vertex.hlsl
similarity index 100%
rename from common/src/geometry/creator/shaders/grid.vertex.hlsl
rename to common/src/nbl/examples/geometry/shaders/grid.vertex.hlsl
diff --git a/common/src/geometry/creator/shaders/template/gc.basic.vertex.input.hlsl b/common/src/nbl/examples/geometry/shaders/template/gc.basic.vertex.input.hlsl
similarity index 100%
rename from common/src/geometry/creator/shaders/template/gc.basic.vertex.input.hlsl
rename to common/src/nbl/examples/geometry/shaders/template/gc.basic.vertex.input.hlsl
diff --git a/common/src/geometry/creator/shaders/template/gc.common.hlsl b/common/src/nbl/examples/geometry/shaders/template/gc.common.hlsl
similarity index 88%
rename from common/src/geometry/creator/shaders/template/gc.common.hlsl
rename to common/src/nbl/examples/geometry/shaders/template/gc.common.hlsl
index 4590cd4a3..26e2885f7 100644
--- a/common/src/geometry/creator/shaders/template/gc.common.hlsl
+++ b/common/src/nbl/examples/geometry/shaders/template/gc.common.hlsl
@@ -9,7 +9,7 @@
 	};
 #endif // __HLSL_VERSION
 
-#include "SBasicViewParameters.hlsl"
+#include "common/SBasicViewParameters.hlsl"
 
 #endif // _THIS_EXAMPLE_GC_COMMON_HLSL_
 
diff --git a/common/src/geometry/creator/shaders/template/gc.cone.vertex.input.hlsl b/common/src/nbl/examples/geometry/shaders/template/gc.cone.vertex.input.hlsl
similarity index 100%
rename from common/src/geometry/creator/shaders/template/gc.cone.vertex.input.hlsl
rename to common/src/nbl/examples/geometry/shaders/template/gc.cone.vertex.input.hlsl
diff --git a/common/src/geometry/creator/shaders/template/gc.ico.vertex.input.hlsl b/common/src/nbl/examples/geometry/shaders/template/gc.ico.vertex.input.hlsl
similarity index 100%
rename from common/src/geometry/creator/shaders/template/gc.ico.vertex.input.hlsl
rename to common/src/nbl/examples/geometry/shaders/template/gc.ico.vertex.input.hlsl
diff --git a/common/src/geometry/creator/shaders/template/gc.vertex.hlsl b/common/src/nbl/examples/geometry/shaders/template/gc.vertex.hlsl
similarity index 100%
rename from common/src/geometry/creator/shaders/template/gc.vertex.hlsl
rename to common/src/nbl/examples/geometry/shaders/template/gc.vertex.hlsl
diff --git a/common/src/geometry/creator/shaders/template/grid.common.hlsl b/common/src/nbl/examples/geometry/shaders/template/grid.common.hlsl
similarity index 95%
rename from common/src/geometry/creator/shaders/template/grid.common.hlsl
rename to common/src/nbl/examples/geometry/shaders/template/grid.common.hlsl
index bc6516600..616412245 100644
--- a/common/src/geometry/creator/shaders/template/grid.common.hlsl
+++ b/common/src/nbl/examples/geometry/shaders/template/grid.common.hlsl
@@ -31,7 +31,7 @@
     }
 #endif // __HLSL_VERSION
 
-#include "SBasicViewParameters.hlsl"
+#include "common/SBasicViewParameters.hlsl"
 
 #endif // _THIS_EXAMPLE_GRID_COMMON_HLSL_
 
diff --git a/common/CommonPCH/CMakeLists.txt b/common/src/nbl/examples/pch/CMakeLists.txt
similarity index 100%
rename from common/CommonPCH/CMakeLists.txt
rename to common/src/nbl/examples/pch/CMakeLists.txt
diff --git a/common/CommonPCH/main.cpp b/common/src/nbl/examples/pch/main.cpp
similarity index 100%
rename from common/CommonPCH/main.cpp
rename to common/src/nbl/examples/pch/main.cpp

From 837071974d01e908a4cbce8ff0cca05bd5aecf39 Mon Sep 17 00:00:00 2001
From: devsh <devsh@devsh.eu>
Date: Wed, 4 Jun 2025 12:32:07 +0200
Subject: [PATCH 333/529] temporarily disable some things that I couldn't
 figure out withour Arek's help

Yes geometry creator scene default SPIR-V shaders are gone
---
 09_GeometryCreator/CMakeLists.txt             |  6 ++++--
 09_GeometryCreator/include/common.hpp         | 20 +++++--------------
 61_UI/CMakeLists.txt                          |  8 +++++---
 CMakeLists.txt                                | 10 +++++-----
 common/CMakeLists.txt                         |  8 ++++----
 .../geometry/CGeometryCreatorScene.hpp        |  7 ++++---
 .../src/nbl/examples/geometry/CMakeLists.txt  |  8 ++++++--
 common/src/nbl/examples/pch/CMakeLists.txt    |  5 ++++-
 8 files changed, 37 insertions(+), 35 deletions(-)

diff --git a/09_GeometryCreator/CMakeLists.txt b/09_GeometryCreator/CMakeLists.txt
index 928ef5761..2dd253226 100644
--- a/09_GeometryCreator/CMakeLists.txt
+++ b/09_GeometryCreator/CMakeLists.txt
@@ -2,5 +2,7 @@ set(NBL_INCLUDE_SERACH_DIRECTORIES
 	"${CMAKE_CURRENT_SOURCE_DIR}/include"
 )
 
-nbl_create_executable_project("" "" "${NBL_INCLUDE_SERACH_DIRECTORIES}" "" "${NBL_EXECUTABLE_PROJECT_CREATION_PCH_TARGET}")
-LINK_BUILTIN_RESOURCES_TO_TARGET(${EXECUTABLE_NAME} geometryCreatorSpirvBRD)
\ No newline at end of file
+	# TODO; Arek I removed `NBL_EXECUTABLE_PROJECT_CREATION_PCH_TARGET` from the last parameter here, doesn't this macro have 4 arguments anyway !?
+nbl_create_executable_project("" "" "${NBL_INCLUDE_SERACH_DIRECTORIES}" "" "")
+# TODO: Arek temporarily disabled cause I haven't figured out how to make this target yet
+# LINK_BUILTIN_RESOURCES_TO_TARGET(${EXECUTABLE_NAME} nblExamplesGeometrySpirvBRD)
\ No newline at end of file
diff --git a/09_GeometryCreator/include/common.hpp b/09_GeometryCreator/include/common.hpp
index 3661e5697..946f2982f 100644
--- a/09_GeometryCreator/include/common.hpp
+++ b/09_GeometryCreator/include/common.hpp
@@ -1,20 +1,10 @@
-#ifndef __NBL_THIS_EXAMPLE_COMMON_H_INCLUDED__
-#define __NBL_THIS_EXAMPLE_COMMON_H_INCLUDED__
+#ifndef _NBL_THIS_EXAMPLE_COMMON_H_INCLUDED_
+#define _NBL_THIS_EXAMPLE_COMMON_H_INCLUDED_
 
 #include <nabla.h>
-#include "nbl/asset/utils/CGeometryCreator.h"
 
-#include "SimpleWindowedApplication.hpp"
-#include "InputSystem.hpp"
-#include "CEventCallback.hpp"
-
-#include "CCamera.hpp"
-#include "SBasicViewParameters.hlsl"
-
-#include "geometry/creator/spirv/builtin/CArchive.h"
-#include "geometry/creator/spirv/builtin/builtinResources.h"
-
-#include "CGeomtryCreatorScene.hpp"
+// TODO: @AnastaZIuk do we even make that explicit?
+#include "nbl/examples/PCH.hpp"
 
 using namespace nbl;
 using namespace core;
@@ -24,6 +14,6 @@ using namespace asset;
 using namespace ui;
 using namespace video;
 using namespace scene;
-using namespace geometrycreator;
+using namespace examples;
 
 #endif // __NBL_THIS_EXAMPLE_COMMON_H_INCLUDED__
\ No newline at end of file
diff --git a/61_UI/CMakeLists.txt b/61_UI/CMakeLists.txt
index a34e46ce6..5d0021f61 100644
--- a/61_UI/CMakeLists.txt
+++ b/61_UI/CMakeLists.txt
@@ -12,7 +12,9 @@ if(NBL_BUILD_IMGUI)
 		imguizmo
 		"${NBL_EXT_IMGUI_UI_LIB}"
 	)
-
-	nbl_create_executable_project("${NBL_EXTRA_SOURCES}" "" "${NBL_INCLUDE_SERACH_DIRECTORIES}" "${NBL_LIBRARIES}" "${NBL_EXECUTABLE_PROJECT_CREATION_PCH_TARGET}")
-	LINK_BUILTIN_RESOURCES_TO_TARGET(${EXECUTABLE_NAME} geometryCreatorSpirvBRD)
+	
+	# TODO; Arek I removed `NBL_EXECUTABLE_PROJECT_CREATION_PCH_TARGET` from the last parameter here, doesn't this macro have 4 arguments anyway !?
+	nbl_create_executable_project("${NBL_EXTRA_SOURCES}" "" "${NBL_INCLUDE_SERACH_DIRECTORIES}" "${NBL_LIBRARIES}")
+	# TODO: Arek temporarily disabled cause I haven't figured out how to make this target yet
+	# LINK_BUILTIN_RESOURCES_TO_TARGET(${EXECUTABLE_NAME} nblExamplesGeometrySpirvBRD)
 endif()
\ No newline at end of file
diff --git a/CMakeLists.txt b/CMakeLists.txt
index 0c0584ebe..789e96937 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -3,8 +3,8 @@
 # For conditions of distribution and use, see copyright notice in nabla.h
 
 function(NBL_HOOK_COMMON_API NBL_EXCLUDE_TARGETS_LIST)
-	if(NOT TARGET nblCommonAPI)
-		message(FATAL_ERROR "nblCommonAPI not defined!")
+	if(NOT TARGET nblExamplesAPI)
+		message(FATAL_ERROR "nblExamplesAPI not defined!")
 	endif()
 
     NBL_GET_ALL_TARGETS(NBL_TARGETS)
@@ -13,8 +13,8 @@ function(NBL_HOOK_COMMON_API NBL_EXCLUDE_TARGETS_LIST)
 		# TODO: exclude builtin targets created by examples as well - doesn't impact anything at all now
 		if(NOT ${NBL_TARGET} IN_LIST NBL_EXCLUDE_TARGETS_LIST)
 
-			target_include_directories(${NBL_TARGET} PRIVATE $<TARGET_PROPERTY:nblCommonAPI,INTERFACE_INCLUDE_DIRECTORIES>)
-			target_link_libraries(${NBL_TARGET} PRIVATE nblCommonAPI)
+			target_include_directories(${NBL_TARGET} PRIVATE $<TARGET_PROPERTY:nblExamplesAPI,INTERFACE_INCLUDE_DIRECTORIES>)
+			target_link_libraries(${NBL_TARGET} PRIVATE nblExamplesAPI)
 		endif()
     endforeach()
 endfunction()
@@ -92,5 +92,5 @@ if(NBL_BUILD_EXAMPLES)
   	add_subdirectory(70_FLIPFluids EXCLUDE_FROM_ALL)
 	add_subdirectory(71_RayTracingPipeline EXCLUDE_FROM_ALL)
 
-	NBL_HOOK_COMMON_API("${NBL_COMMON_API_TARGETS}")
+	NBL_HOOK_COMMON_API("${NBL_EXAMPLES_API_TARGETS}")
 endif()
diff --git a/common/CMakeLists.txt b/common/CMakeLists.txt
index 32c0ed6cf..9560a8f42 100644
--- a/common/CMakeLists.txt
+++ b/common/CMakeLists.txt
@@ -8,10 +8,10 @@
 
 # interface libraries don't have build rules (except custom commands however it doesn't matter here) but properties
 add_library(nblExamplesAPI INTERFACE)
-# TODO: change every variable prefix from `NBL_COMMON_API` to `NBL_EXAMPLES_API` here and elsewhere
-set(NBL_COMMON_API_INCLUDE_DIRECTORY "${CMAKE_CURRENT_SOURCE_DIR}/include")
-target_include_directories(nblExamplesAPI INTERFACE "${NBL_COMMON_API_INCLUDE_DIRECTORY}")
+set(NBL_EXAMPLES_API_INCLUDE_DIRECTORY "${CMAKE_CURRENT_SOURCE_DIR}/include")
+target_include_directories(nblExamplesAPI INTERFACE "${NBL_EXAMPLES_API_INCLUDE_DIRECTORY}")
 
 add_subdirectory("src/nbl/examples" EXCLUDE_FROM_ALL)
 
-set(NBL_COMMON_API_TARGETS nblExamplesAPI ${NBL_COMMON_API_TARGETS} ${NBL_EXECUTABLE_COMMON_API_TARGET} PARENT_SCOPE)
+# TODO: Arek what was `NBL_EXECUTABLE_COMMON_API_TARGET` ? I removed it.
+set(NBL_EXAMPLES_API_TARGETS nblExamplesAPI ${NBL_EXAMPLES_API_TARGETS} PARENT_SCOPE)
diff --git a/common/include/nbl/examples/geometry/CGeometryCreatorScene.hpp b/common/include/nbl/examples/geometry/CGeometryCreatorScene.hpp
index 7a3f253f3..9ebd244aa 100644
--- a/common/include/nbl/examples/geometry/CGeometryCreatorScene.hpp
+++ b/common/include/nbl/examples/geometry/CGeometryCreatorScene.hpp
@@ -4,13 +4,14 @@
 
 #include <nabla.h>
 
-#include "nbl/asset/utils/CPolygonGeometryCreator.h"
+#include "nbl/asset/utils/CGeometryCreator.h"
 
 // soon to be deprecated!
 #include "nbl/examples/common/SBasicViewParameters.hlsl"
 
-#include "nbl/examples/geometry/creator/spirv/builtin/CArchive.h"
-#include "nbl/examples/geometry/creator/spirv/builtin/builtinResources.h"
+// TODO: Arek bring back
+//#include "nbl/examples/geometry/spirv/builtin/CArchive.h"
+//#include "nbl/examples/geometry/spirv/builtin/builtinResources.h"
 
 
 namespace nbl::examples
diff --git a/common/src/nbl/examples/geometry/CMakeLists.txt b/common/src/nbl/examples/geometry/CMakeLists.txt
index 0eb09263b..c402a2b8a 100644
--- a/common/src/nbl/examples/geometry/CMakeLists.txt
+++ b/common/src/nbl/examples/geometry/CMakeLists.txt
@@ -1,3 +1,6 @@
+# TODO: let arek figure out how to redo the shaders
+#[===[
+
 # shaders IO directories
 set(NBL_EXAMPLES_GEOMETRY_INPUT_SHADERS_DIRECTORY "${CMAKE_CURRENT_SOURCE_DIR}/shaders")
 get_filename_component(_EXAMPLES_GEOMETRY_SPIRV_BR_BUNDLE_SEARCH_DIRECTORY_ "${CMAKE_CURRENT_BINARY_DIR}/shaders/include" ABSOLUTE)
@@ -43,7 +46,7 @@ foreach(NBL_INPUT_SHADER IN LISTS NBL_EXAMPLES_GEOMETRY_INPUT_SHADERS)
 	set(NBL_NSC_COMPILE_COMMAND
 		"$<TARGET_FILE:nsc>"
 		-Fc "${NBL_OUTPUT_SPIRV_PATH}"
-		-I "${NBL_COMMON_API_INCLUDE_DIRECTORY}"
+		-I "${NBL_EXAMPLES_API_INCLUDE_DIRECTORY}"
 		${NBL_NSC_COMPILE_OPTIONS} # this should come from shader's [#pragma WAVE <compile options>] but our NSC doesn't seem to work properly currently
 		"${NBL_INPUT_SHADER}"
 	)
@@ -66,4 +69,5 @@ foreach(NBL_INPUT_SHADER IN LISTS NBL_EXAMPLES_GEOMETRY_INPUT_SHADERS)
 	LIST_BUILTIN_RESOURCE(GEOMETRY_CREATOR_SPIRV_RESOURCES_TO_EMBED "geometry/spirv/${NBL_OUTPUT_SPIRV_FILENAME}")
 endforeach()
 
-ADD_CUSTOM_BUILTIN_RESOURCES(geometryCreatorSpirvBRD GEOMETRY_CREATOR_SPIRV_RESOURCES_TO_EMBED "${_EXAMPLES_GEOMETRY_SPIRV_BR_BUNDLE_SEARCH_DIRECTORY_}" "nbl" "geometry::spirv::builtin" "${_EXAMPLES_GEOMETRY_SPIRV_BR_OUTPUT_DIRECTORY_HEADER_}" "${_EXAMPLES_GEOMETRY_SPIRV_BR_OUTPUT_DIRECTORY_SOURCE_}" "STATIC" "INTERNAL")
\ No newline at end of file
+ADD_CUSTOM_BUILTIN_RESOURCES(geometryCreatorSpirvBRD GEOMETRY_CREATOR_SPIRV_RESOURCES_TO_EMBED "${_EXAMPLES_GEOMETRY_SPIRV_BR_BUNDLE_SEARCH_DIRECTORY_}" "nbl" "geometry::spirv::builtin" "${_EXAMPLES_GEOMETRY_SPIRV_BR_OUTPUT_DIRECTORY_HEADER_}" "${_EXAMPLES_GEOMETRY_SPIRV_BR_OUTPUT_DIRECTORY_SOURCE_}" "STATIC" "INTERNAL")
+]===]
\ No newline at end of file
diff --git a/common/src/nbl/examples/pch/CMakeLists.txt b/common/src/nbl/examples/pch/CMakeLists.txt
index 5e62f885f..34f16c2d2 100644
--- a/common/src/nbl/examples/pch/CMakeLists.txt
+++ b/common/src/nbl/examples/pch/CMakeLists.txt
@@ -1,3 +1,5 @@
+# TODO: let arek figure out how to redo the PCH
+#[===[
 include(common RESULT_VARIABLE RES)
 if(NOT RES)
 	message(FATAL_ERROR "common.cmake not found. Should be in '${NBL_ROOT_PATH}/cmake' directory")
@@ -12,4 +14,5 @@ target_precompile_headers("${EXECUTABLE_NAME}" PUBLIC
 	"${CMAKE_CURRENT_SOURCE_DIR}/PCH.hpp" # Common PCH for examples
 	"${NBL_NABLA_TARGET_SOURCE_DIR}/pch.h" # Nabla's PCH
 )
-unset(NBL_NABLA_TARGET_SOURCE_DIR)
\ No newline at end of file
+unset(NBL_NABLA_TARGET_SOURCE_DIR)
+]===]
\ No newline at end of file

From 83443a75be9d732989cc77fe8cdfd18b7e6fa52f Mon Sep 17 00:00:00 2001
From: devsh <devsh@devsh.eu>
Date: Wed, 4 Jun 2025 15:55:08 +0200
Subject: [PATCH 334/529] factored out the `CSwapchainFramebuffersAndDepth`
 into `nbl/examples/common`

---
 09_GeometryCreator/include/common.hpp         |   2 -
 09_GeometryCreator/main.cpp                   | 394 ++++--------------
 common/include/nbl/examples/PCH.hpp           |   1 +
 .../nbl/examples/common/CEventCallback.hpp    |  15 +-
 .../common/CSwapchainFramebuffersAndDepth.hpp | 101 +++++
 .../nbl/examples/common/InputSystem.hpp       |  37 +-
 .../examples/common/MonoWindowApplication.hpp | 189 +++++++++
 .../geometry/CGeometryCreatorScene.hpp        |  35 +-
 8 files changed, 433 insertions(+), 341 deletions(-)
 create mode 100644 common/include/nbl/examples/common/CSwapchainFramebuffersAndDepth.hpp
 create mode 100644 common/include/nbl/examples/common/MonoWindowApplication.hpp

diff --git a/09_GeometryCreator/include/common.hpp b/09_GeometryCreator/include/common.hpp
index 946f2982f..02197171d 100644
--- a/09_GeometryCreator/include/common.hpp
+++ b/09_GeometryCreator/include/common.hpp
@@ -1,8 +1,6 @@
 #ifndef _NBL_THIS_EXAMPLE_COMMON_H_INCLUDED_
 #define _NBL_THIS_EXAMPLE_COMMON_H_INCLUDED_
 
-#include <nabla.h>
-
 // TODO: @AnastaZIuk do we even make that explicit?
 #include "nbl/examples/PCH.hpp"
 
diff --git a/09_GeometryCreator/main.cpp b/09_GeometryCreator/main.cpp
index 4ac527e08..2a3a1553e 100644
--- a/09_GeometryCreator/main.cpp
+++ b/09_GeometryCreator/main.cpp
@@ -1,205 +1,34 @@
-// Copyright (C) 2018-2024 - DevSH Graphics Programming Sp. z O.O.
+// Copyright (C) 2018-2025 - DevSH Graphics Programming Sp. z O.O.
 // This file is part of the "Nabla Engine".
 // For conditions of distribution and use, see copyright notice in nabla.h
 
 #include "common.hpp"
 
-class CSwapchainFramebuffersAndDepth final : public nbl::video::CDefaultSwapchainFramebuffers
+class GeometryCreatorApp final : public examples::MonoWindowApplication
 {
-	using base_t = CDefaultSwapchainFramebuffers;
-
-public:
-	template<typename... Args>
-	inline CSwapchainFramebuffersAndDepth(ILogicalDevice* device, const asset::E_FORMAT _desiredDepthFormat, Args&&... args) : CDefaultSwapchainFramebuffers(device, std::forward<Args>(args)...)
-	{
-		const IPhysicalDevice::SImageFormatPromotionRequest req = {
-			.originalFormat = _desiredDepthFormat,
-			.usages = {IGPUImage::EUF_RENDER_ATTACHMENT_BIT}
-		};
-		m_depthFormat = m_device->getPhysicalDevice()->promoteImageFormat(req, IGPUImage::TILING::OPTIMAL);
-
-		const static IGPURenderpass::SCreationParams::SDepthStencilAttachmentDescription depthAttachments[] = {
-			{{
-				{
-					.format = m_depthFormat,
-					.samples = IGPUImage::ESCF_1_BIT,
-					.mayAlias = false
-				},
-			/*.loadOp = */{IGPURenderpass::LOAD_OP::CLEAR},
-			/*.storeOp = */{IGPURenderpass::STORE_OP::STORE},
-			/*.initialLayout = */{IGPUImage::LAYOUT::UNDEFINED}, // because we clear we don't care about contents
-			/*.finalLayout = */{IGPUImage::LAYOUT::ATTACHMENT_OPTIMAL} // transition to presentation right away so we can skip a barrier
-		}},
-		IGPURenderpass::SCreationParams::DepthStencilAttachmentsEnd
-		};
-		m_params.depthStencilAttachments = depthAttachments;
-
-		static IGPURenderpass::SCreationParams::SSubpassDescription subpasses[] = {
-			m_params.subpasses[0],
-			IGPURenderpass::SCreationParams::SubpassesEnd
-		};
-		subpasses[0].depthStencilAttachment.render = { .attachmentIndex = 0,.layout = IGPUImage::LAYOUT::ATTACHMENT_OPTIMAL };
-		m_params.subpasses = subpasses;
-	}
-
-protected:
-	inline bool onCreateSwapchain_impl(const uint8_t qFam) override
-	{
-		auto device = const_cast<ILogicalDevice*>(m_renderpass->getOriginDevice());
-
-		const auto depthFormat = m_renderpass->getCreationParameters().depthStencilAttachments[0].format;
-		const auto& sharedParams = getSwapchain()->getCreationParameters().sharedParams;
-		auto image = device->createImage({ IImage::SCreationParams{
-			.type = IGPUImage::ET_2D,
-			.samples = IGPUImage::ESCF_1_BIT,
-			.format = depthFormat,
-			.extent = {sharedParams.width,sharedParams.height,1},
-			.mipLevels = 1,
-			.arrayLayers = 1,
-			.depthUsage = IGPUImage::EUF_RENDER_ATTACHMENT_BIT
-		} });
-
-		device->allocate(image->getMemoryReqs(), image.get());
-
-		m_depthBuffer = device->createImageView({
-			.flags = IGPUImageView::ECF_NONE,
-			.subUsages = IGPUImage::EUF_RENDER_ATTACHMENT_BIT,
-			.image = std::move(image),
-			.viewType = IGPUImageView::ET_2D,
-			.format = depthFormat,
-			.subresourceRange = {IGPUImage::EAF_DEPTH_BIT,0,1,0,1}
-			});
-
-		const auto retval = base_t::onCreateSwapchain_impl(qFam);
-		m_depthBuffer = nullptr;
-		return retval;
-	}
-
-	inline smart_refctd_ptr<IGPUFramebuffer> createFramebuffer(IGPUFramebuffer::SCreationParams&& params) override
-	{
-		params.depthStencilAttachments = &m_depthBuffer.get();
-		return m_device->createFramebuffer(std::move(params));
-	}
-
-	E_FORMAT m_depthFormat;
-	// only used to pass a parameter from `onCreateSwapchain_impl` to `createFramebuffer`
-	smart_refctd_ptr<IGPUImageView> m_depthBuffer;
-};
-
-class GeometryCreatorApp final : public examples::SimpleWindowedApplication
-{
-		using device_base_t = examples::SimpleWindowedApplication;
-		using clock_t = std::chrono::steady_clock;
-
-		constexpr static inline uint32_t WIN_W = 1280, WIN_H = 720;
-		
-		// Maximum frames which can be simultaneously submitted, used to cycle through our per-frame resources like command buffers
-		constexpr static inline uint32_t MaxFramesInFlight = 3u;
-
-		constexpr static inline clock_t::duration DisplayImageDuration = std::chrono::milliseconds(900);
+		using base_t = examples::MonoWindowApplication;
 
 	public:
-		inline GeometryCreatorApp(const path& _localInputCWD, const path& _localOutputCWD, const path& _sharedInputCWD, const path& _sharedOutputCWD)
-			: IApplicationFramework(_localInputCWD, _localOutputCWD, _sharedInputCWD, _sharedOutputCWD) {}
+		GeometryCreatorApp(const path& _localInputCWD, const path& _localOutputCWD, const path& _sharedInputCWD, const path& _sharedOutputCWD)
+			: base_t({1280,720}, EF_D16_UNORM, _localInputCWD, _localOutputCWD, _sharedInputCWD, _sharedOutputCWD) {}
 
-		virtual SPhysicalDeviceFeatures getRequiredDeviceFeatures() const override
+		SPhysicalDeviceFeatures getRequiredDeviceFeatures() const override
 		{
-			auto retval = device_base_t::getRequiredDeviceFeatures();
+			auto retval = base_t::getRequiredDeviceFeatures();
 			retval.geometryShader = true;
 			return retval;
 		}
 
-		inline core::vector<video::SPhysicalDeviceFilter::SurfaceCompatibility> getSurfaces() const override
-		{
-			if (!m_surface)
-			{
-				{
-					auto windowCallback = core::make_smart_refctd_ptr<CEventCallback>(smart_refctd_ptr(m_inputSystem), smart_refctd_ptr(m_logger));
-					IWindow::SCreationParams params = {};
-					params.callback = core::make_smart_refctd_ptr<nbl::video::ISimpleManagedSurface::ICallback>();
-					params.width = WIN_W;
-					params.height = WIN_H;
-					params.x = 32;
-					params.y = 32;
-					params.flags = ui::IWindow::ECF_HIDDEN | IWindow::ECF_BORDERLESS | IWindow::ECF_RESIZABLE;
-					params.windowCaption = "GeometryCreatorApp";
-					params.callback = windowCallback;
-					const_cast<std::remove_const_t<decltype(m_window)>&>(m_window) = m_winMgr->createWindow(std::move(params));
-				}
-
-				auto surface = CSurfaceVulkanWin32::create(smart_refctd_ptr(m_api), smart_refctd_ptr_static_cast<IWindowWin32>(m_window));
-				const_cast<std::remove_const_t<decltype(m_surface)>&>(m_surface) = nbl::video::CSimpleResizeSurface<CSwapchainFramebuffersAndDepth>::create(std::move(surface));
-			}
-
-			if (m_surface)
-				return { {m_surface->getSurface()/*,EQF_NONE*/} };
-
-			return {};
-		}
-
 		inline bool onAppInitialized(smart_refctd_ptr<ISystem>&& system) override
 		{
-			m_inputSystem = make_smart_refctd_ptr<InputSystem>(logger_opt_smart_ptr(smart_refctd_ptr(m_logger)));
-
-			if (!device_base_t::onAppInitialized(smart_refctd_ptr(system)))
+			if (!base_t::onAppInitialized(smart_refctd_ptr(system)))
 				return false;
 
 			m_semaphore = m_device->createSemaphore(m_realFrameIx);
 			if (!m_semaphore)
 				return logFail("Failed to Create a Semaphore!");
 
-			ISwapchain::SCreationParams swapchainParams = { .surface = m_surface->getSurface() };
-			if (!swapchainParams.deduceFormat(m_physicalDevice))
-				return logFail("Could not choose a Surface Format for the Swapchain!");
-
-			// Subsequent submits don't wait for each other, hence its important to have External Dependencies which prevent users of the depth attachment overlapping.
-			const static IGPURenderpass::SCreationParams::SSubpassDependency dependencies[] = {
-				// wipe-transition of Color to ATTACHMENT_OPTIMAL
-				{
-					.srcSubpass = IGPURenderpass::SCreationParams::SSubpassDependency::External,
-					.dstSubpass = 0,
-					.memoryBarrier = {
-					// last place where the depth can get modified in previous frame
-					.srcStageMask = PIPELINE_STAGE_FLAGS::LATE_FRAGMENT_TESTS_BIT,
-					// only write ops, reads can't be made available
-					.srcAccessMask = ACCESS_FLAGS::DEPTH_STENCIL_ATTACHMENT_WRITE_BIT,
-					// destination needs to wait as early as possible
-					.dstStageMask = PIPELINE_STAGE_FLAGS::EARLY_FRAGMENT_TESTS_BIT,
-					// because of depth test needing a read and a write
-					.dstAccessMask = ACCESS_FLAGS::DEPTH_STENCIL_ATTACHMENT_WRITE_BIT | ACCESS_FLAGS::DEPTH_STENCIL_ATTACHMENT_READ_BIT
-				}
-				// leave view offsets and flags default
-			},
-				// color from ATTACHMENT_OPTIMAL to PRESENT_SRC
-				{
-					.srcSubpass = 0,
-					.dstSubpass = IGPURenderpass::SCreationParams::SSubpassDependency::External,
-					.memoryBarrier = {
-					// last place where the depth can get modified
-					.srcStageMask = PIPELINE_STAGE_FLAGS::COLOR_ATTACHMENT_OUTPUT_BIT,
-					// only write ops, reads can't be made available
-					.srcAccessMask = ACCESS_FLAGS::COLOR_ATTACHMENT_WRITE_BIT
-					// spec says nothing is needed when presentation is the destination
-				}
-				// leave view offsets and flags default
-			},
-			IGPURenderpass::SCreationParams::DependenciesEnd
-			};
-
-			// TODO: promote the depth format if D16 not supported, or quote the spec if there's guaranteed support for it
-			auto scResources = std::make_unique<CSwapchainFramebuffersAndDepth>(m_device.get(), EF_D16_UNORM, swapchainParams.surfaceFormat.format, dependencies);
-
-			auto* renderpass = scResources->getRenderpass();
-
-			if (!renderpass)
-				return logFail("Failed to create Renderpass!");
-
-			auto gQueue = getGraphicsQueue();
-			if (!m_surface || !m_surface->init(gQueue, std::move(scResources), swapchainParams.sharedParams))
-				return logFail("Could not create Window & Surface or initialize the Surface!");
-
-			auto pool = m_device->createCommandPool(gQueue->getFamilyIndex(), IGPUCommandPool::CREATE_FLAGS::RESET_COMMAND_BUFFER_BIT);
-
+			auto pool = m_device->createCommandPool(getGraphicsQueue()->getFamilyIndex(),IGPUCommandPool::CREATE_FLAGS::RESET_COMMAND_BUFFER_BIT);
 			for (auto i = 0u; i < MaxFramesInFlight; i++)
 			{
 				if (!pool)
@@ -208,12 +37,7 @@ class GeometryCreatorApp final : public examples::SimpleWindowedApplication
 					return logFail("Couldn't create Command Buffer!");
 			}
 
-			m_winMgr->setWindowSize(m_window.get(), WIN_W, WIN_H);
-			m_surface->recreateSwapchain();
-
-			auto assetManager = make_smart_refctd_ptr<nbl::asset::IAssetManager>(smart_refctd_ptr(system));
-			auto* geometry = assetManager->getGeometryCreator();
-
+#if 0
 			//using Builder = typename CScene::CreateResourcesDirectlyWithDevice::Builder;
 			using Builder = typename CScene::CreateResourcesWithAssetConverter::Builder;
 			auto oneRunCmd = CScene::createCommandBuffer(m_utils->getLogicalDevice(), m_utils->getLogger(), gQueue->getFamilyIndex());
@@ -227,61 +51,25 @@ class GeometryCreatorApp final : public examples::SimpleWindowedApplication
 			}
 			else
 				m_logger->log("Could not build resource objects!", ILogger::ELL_ERROR);
-
+#endif
 			// camera
 			{
 				core::vectorSIMDf cameraPosition(-5.81655884, 2.58630896, -4.23974705);
 				core::vectorSIMDf cameraTarget(-0.349590302, -0.213266611, 0.317821503);
-				matrix4SIMD projectionMatrix = matrix4SIMD::buildProjectionMatrixPerspectiveFovLH(core::radians(60.0f), float(WIN_W) / WIN_H, 0.1, 10000);
+				matrix4SIMD projectionMatrix = matrix4SIMD::buildProjectionMatrixPerspectiveFovLH(core::radians(60.0f), float(m_initialResolution.x)/float(m_initialResolution.y), 0.1, 10000);
 				camera = Camera(cameraPosition, cameraTarget, projectionMatrix, 1.069f, 0.4f);
 			}
 
-			m_winMgr->show(m_window.get());
-			oracle.reportBeginFrameRecord();
-
+			onAppInitializedFinish();
 			return true;
 		}
 
-		inline void workLoopBody() override
+		inline IQueue::SSubmitInfo::SSemaphoreInfo renderFrame(const std::chrono::microseconds nextPresentationTimestamp) override
 		{
-			// framesInFlight: ensuring safe execution of command buffers and acquires, `framesInFlight` only affect semaphore waits, don't use this to index your resources because it can change with swapchain recreation.
-			const uint32_t framesInFlight = core::min(MaxFramesInFlight, m_surface->getMaxAcquiresInFlight());
-			// We block for semaphores for 2 reasons here:
-				// A) Resource: Can't use resource like a command buffer BEFORE previous use is finished! [MaxFramesInFlight]
-				// B) Acquire: Can't have more acquires in flight than a certain threshold returned by swapchain or your surface helper class. [MaxAcquiresInFlight]
-			if (m_realFrameIx >= framesInFlight)
-			{
-				const ISemaphore::SWaitInfo cbDonePending[] =
-				{
-					{
-						.semaphore = m_semaphore.get(),
-						.value = m_realFrameIx + 1 - framesInFlight
-					}
-				};
-				if (m_device->blockForSemaphores(cbDonePending) != ISemaphore::WAIT_RESULT::SUCCESS)
-					return;
-			}
-
-			const auto resourceIx = m_realFrameIx % MaxFramesInFlight;
-
 			m_inputSystem->getDefaultMouse(&mouse);
 			m_inputSystem->getDefaultKeyboard(&keyboard);
 
-			auto updatePresentationTimestamp = [&]()
-			{
-				m_currentImageAcquire = m_surface->acquireNextImage();
-
-				oracle.reportEndFrameRecord();
-				const auto timestamp = oracle.getNextPresentationTimeStamp();
-				oracle.reportBeginFrameRecord();
-
-				return timestamp;
-			};
-
-			const auto nextPresentationTimestamp = updatePresentationTimestamp();
-
-			if (!m_currentImageAcquire)
-				return;
+			const auto resourceIx = m_realFrameIx % base_t::MaxFramesInFlight;
 
 			auto* const cb = m_cmdBufs.data()[resourceIx].get();
 			cb->reset(IGPUCommandBuffer::RESET_FLAGS::RELEASE_RESOURCES_BIT);
@@ -292,12 +80,13 @@ class GeometryCreatorApp final : public examples::SimpleWindowedApplication
 				mouse.consumeEvents([&](const IMouseEventChannel::range_t& events) -> void { camera.mouseProcess(events); mouseProcess(events); }, m_logger.get());
 				keyboard.consumeEvents([&](const IKeyboardEventChannel::range_t& events) -> void { camera.keyboardProcess(events); }, m_logger.get());
 				camera.endInputProcessing(nextPresentationTimestamp);
-
+#if 0
 				const auto type = static_cast<ObjectType>(gcIndex);
 				const auto& [gpu, meta] = resources.objects[type];
 
 				object.meta.type = type;
 				object.meta.name = meta.name;
+#endif
 			}
 
 			const auto viewMatrix = camera.getViewMatrix();
@@ -312,7 +101,7 @@ class GeometryCreatorApp final : public examples::SimpleWindowedApplication
 
 			core::matrix3x4SIMD normalMatrix;
 			modelViewMatrix.getSub3x3InverseTranspose(normalMatrix);
-
+#if 0
 			SBasicViewParameters uboData;
 			memcpy(uboData.MVP, modelViewProjectionMatrix.pointer(), sizeof(uboData.MVP));
 			memcpy(uboData.MV, modelViewMatrix.pointer(), sizeof(uboData.MV));
@@ -324,7 +113,7 @@ class GeometryCreatorApp final : public examples::SimpleWindowedApplication
 
 				cb->updateBuffer(range, &uboData);
 			}
-
+#endif
 			auto* queue = getGraphicsQueue();
 
 			asset::SViewport viewport;
@@ -357,7 +146,7 @@ class GeometryCreatorApp final : public examples::SimpleWindowedApplication
 				auto scRes = static_cast<CDefaultSwapchainFramebuffers*>(m_surface->getSwapchainResources());
 				const IGPUCommandBuffer::SRenderpassBeginInfo info =
 				{
-					.framebuffer = scRes->getFramebuffer(m_currentImageAcquire.imageIndex),
+					.framebuffer = scRes->getFramebuffer(base_t::getCurrentAcquire().imageIndex),
 					.colorClearValues = &clearValue,
 					.depthStencilClearValues = &depthValue,
 					.renderArea = currentRenderArea
@@ -365,7 +154,7 @@ class GeometryCreatorApp final : public examples::SimpleWindowedApplication
 
 				cb->beginRenderPass(info, IGPUCommandBuffer::SUBPASS_CONTENTS::INLINE);
 			}
-
+#if 0
 			const auto& [hook, meta] = resources.objects[object.meta.type];
 			auto* rawPipeline = hook.pipeline.get();
 
@@ -382,96 +171,97 @@ class GeometryCreatorApp final : public examples::SimpleWindowedApplication
 			}
 			else
 				cb->draw(hook.indexCount, 1, 0, 0);
-
+#endif
 			cb->endRenderPass();
 			cb->end();
+
+			IQueue::SSubmitInfo::SSemaphoreInfo retval =
+			{
+				.semaphore = m_semaphore.get(),
+				.value = ++m_realFrameIx,
+				.stageMask = PIPELINE_STAGE_FLAGS::ALL_GRAPHICS_BITS
+			};
+			const IQueue::SSubmitInfo::SCommandBufferInfo commandBuffers[] =
+			{
+				{.cmdbuf = cb }
+			};
+			const IQueue::SSubmitInfo::SSemaphoreInfo acquired[] = {
+				{.semaphore = base_t::getCurrentAcquire().semaphore, .value = base_t::getCurrentAcquire().acquireCount, .stageMask = PIPELINE_STAGE_FLAGS::NONE}
+			};
+			const IQueue::SSubmitInfo infos[] =
 			{
-				const IQueue::SSubmitInfo::SSemaphoreInfo rendered[] =
-				{
-					{
-						.semaphore = m_semaphore.get(),
-						.value = ++m_realFrameIx,
-						.stageMask = PIPELINE_STAGE_FLAGS::ALL_GRAPHICS_BITS
-					}
-				};
 				{
-					{
-						const IQueue::SSubmitInfo::SCommandBufferInfo commandBuffers[] =
-						{
-							{.cmdbuf = cb }
-						};
-
-						const IQueue::SSubmitInfo::SSemaphoreInfo acquired[] =
-						{
-							{
-								.semaphore = m_currentImageAcquire.semaphore,
-								.value = m_currentImageAcquire.acquireCount,
-								.stageMask = PIPELINE_STAGE_FLAGS::NONE
-							}
-						};
-						const IQueue::SSubmitInfo infos[] =
-						{
-							{
-								.waitSemaphores = acquired,
-								.commandBuffers = commandBuffers,
-								.signalSemaphores = rendered
-							}
-						};
-
-						if (queue->submit(infos) == IQueue::RESULT::SUCCESS)
-						{
-							const nbl::video::ISemaphore::SWaitInfo waitInfos[] =
-							{ {
-								.semaphore = m_semaphore.get(),
-								.value = m_realFrameIx
-							} };
-
-							m_device->blockForSemaphores(waitInfos); // this is not solution, quick wa to not throw validation errors
-						}
-						else
-							--m_realFrameIx;
-					}
+					.waitSemaphores = acquired,
+					.commandBuffers = commandBuffers,
+					.signalSemaphores = {&retval,1}
 				}
+			};
 
-				std::string caption = "[Nabla Engine] Geometry Creator";
-				{
-					caption += ", displaying [" + std::string(object.meta.name.data()) + "]";
-					m_window->setCaption(caption);
-				}
-				m_surface->present(m_currentImageAcquire.imageIndex, rendered);
+			if (queue->submit(infos) != IQueue::RESULT::SUCCESS)
+			{
+				retval.semaphore = nullptr; // so that we don't wait on semaphore that will never signal
+				m_realFrameIx--;
 			}
-		}
 
-		inline bool keepRunning() override
-		{
-			if (m_surface->irrecoverable())
-				return false;
-
-			return true;
+			std::string caption = "[Nabla Engine] Geometry Creator";
+			{
+//					caption += ", displaying [" + std::string(object.meta.name.data()) + "]";
+				m_window->setCaption(caption);
+			}
+			return retval;
 		}
-
-		inline bool onAppTerminated() override
+		
+	protected:
+		const video::IGPURenderpass::SCreationParams::SSubpassDependency* getDefaultSubpassDependencies() const override
 		{
-			return device_base_t::onAppTerminated();
+			// Subsequent submits don't wait for each other, hence its important to have External Dependencies which prevent users of the depth attachment overlapping.
+			const static IGPURenderpass::SCreationParams::SSubpassDependency dependencies[] = {
+				// wipe-transition of Color to ATTACHMENT_OPTIMAL
+				{
+					.srcSubpass = IGPURenderpass::SCreationParams::SSubpassDependency::External,
+					.dstSubpass = 0,
+					.memoryBarrier = {
+						// last place where the depth can get modified in previous frame
+						.srcStageMask = PIPELINE_STAGE_FLAGS::LATE_FRAGMENT_TESTS_BIT,
+						// only write ops, reads can't be made available
+						.srcAccessMask = ACCESS_FLAGS::DEPTH_STENCIL_ATTACHMENT_WRITE_BIT,
+						// destination needs to wait as early as possible
+						.dstStageMask = PIPELINE_STAGE_FLAGS::EARLY_FRAGMENT_TESTS_BIT,
+						// because of depth test needing a read and a write
+						.dstAccessMask = ACCESS_FLAGS::DEPTH_STENCIL_ATTACHMENT_WRITE_BIT | ACCESS_FLAGS::DEPTH_STENCIL_ATTACHMENT_READ_BIT
+					}
+					// leave view offsets and flags default
+				},
+				// color from ATTACHMENT_OPTIMAL to PRESENT_SRC
+				{
+					.srcSubpass = 0,
+					.dstSubpass = IGPURenderpass::SCreationParams::SSubpassDependency::External,
+					.memoryBarrier = {
+						// last place where the depth can get modified
+						.srcStageMask = PIPELINE_STAGE_FLAGS::COLOR_ATTACHMENT_OUTPUT_BIT,
+						// only write ops, reads can't be made available
+						.srcAccessMask = ACCESS_FLAGS::COLOR_ATTACHMENT_WRITE_BIT
+						// spec says nothing is needed when presentation is the destination
+					}
+					// leave view offsets and flags default
+				},
+				IGPURenderpass::SCreationParams::DependenciesEnd
+			};
+			return dependencies;
 		}
 
 	private:
-		smart_refctd_ptr<IWindow> m_window;
-		smart_refctd_ptr<CSimpleResizeSurface<CSwapchainFramebuffersAndDepth>> m_surface;
 		smart_refctd_ptr<ISemaphore> m_semaphore;
 		uint64_t m_realFrameIx = 0;
-		std::array<smart_refctd_ptr<IGPUCommandBuffer>, MaxFramesInFlight> m_cmdBufs;
-		ISimpleManagedSurface::SAcquireResult m_currentImageAcquire = {};
+		std::array<smart_refctd_ptr<IGPUCommandBuffer>,base_t::MaxFramesInFlight> m_cmdBufs;
 
-		core::smart_refctd_ptr<InputSystem> m_inputSystem;
 		InputSystem::ChannelReader<IMouseEventChannel> mouse;
 		InputSystem::ChannelReader<IKeyboardEventChannel> keyboard;
 
 		Camera camera = Camera(core::vectorSIMDf(0, 0, 0), core::vectorSIMDf(0, 0, 0), core::matrix4SIMD());
-		video::CDumbPresentationOracle oracle;
 
-		ResourcesBundle resources;
-		ObjectDrawHookCpu object;
+//		ResourcesBundle resources;
+//		ObjectDrawHookCpu object;
 		uint16_t gcIndex = {};
 
 		void mouseProcess(const nbl::ui::IMouseEventChannel::range_t& events)
@@ -481,7 +271,7 @@ class GeometryCreatorApp final : public examples::SimpleWindowedApplication
 				auto ev = *eventIt;
 
 				if (ev.type == nbl::ui::SMouseEvent::EET_SCROLL)
-					gcIndex = std::clamp<uint16_t>(int16_t(gcIndex) + int16_t(core::sign(ev.scrollEvent.verticalScroll)), int64_t(0), int64_t(OT_COUNT - (uint8_t)1u));
+					gcIndex = std::clamp<uint16_t>(int16_t(gcIndex) + int16_t(core::sign(ev.scrollEvent.verticalScroll)), int64_t(0), int64_t(CGeometryCreatorScene::OT_COUNT - (uint8_t)1u));
 			}
 		}
 };
diff --git a/common/include/nbl/examples/PCH.hpp b/common/include/nbl/examples/PCH.hpp
index 7a1b6bdc6..179c9f037 100644
--- a/common/include/nbl/examples/PCH.hpp
+++ b/common/include/nbl/examples/PCH.hpp
@@ -11,6 +11,7 @@
 // #include "nbl/ui/CWindowManagerAndroid.h"
 
 #include "nbl/examples/common/SimpleWindowedApplication.hpp"
+#include "nbl/examples/common/MonoWindowApplication.hpp"
 #include "nbl/examples/common/InputSystem.hpp"
 #include "nbl/examples/common/CEventCallback.hpp"
 
diff --git a/common/include/nbl/examples/common/CEventCallback.hpp b/common/include/nbl/examples/common/CEventCallback.hpp
index 4670ca7f6..cae6dc7de 100644
--- a/common/include/nbl/examples/common/CEventCallback.hpp
+++ b/common/include/nbl/examples/common/CEventCallback.hpp
@@ -1,9 +1,14 @@
-#ifndef _NBL_COMMON_C_EVENT_CALLBACK_HPP_INCLUDED_
-#define _NBL_C_EVENT_CALLBACK_HPP_INCLUDED_
+#ifndef _NBL_EXAMPLES_COMMON_C_EVENT_CALLBACK_HPP_INCLUDED_
+#define _NBL_EXAMPLES_COMMON_C_EVENT_CALLBACK_HPP_INCLUDED_
+
 
 #include "nbl/video/utilities/CSimpleResizeSurface.h"
-#include "InputSystem.hpp"
 
+#include "nbl/examples/common/InputSystem.hpp"
+
+
+namespace nbl::examples
+{
 class CEventCallback : public nbl::video::ISimpleManagedSurface::ICallback
 {
 	public:
@@ -45,5 +50,5 @@ class CEventCallback : public nbl::video::ISimpleManagedSurface::ICallback
 		nbl::core::smart_refctd_ptr<InputSystem> m_inputSystem = nullptr;
 		nbl::system::logger_opt_smart_ptr m_logger = nullptr;
 };
-
-#endif // _NBL_C_EVENT_CALLBACK_HPP_INCLUDED_
\ No newline at end of file
+}
+#endif
\ No newline at end of file
diff --git a/common/include/nbl/examples/common/CSwapchainFramebuffersAndDepth.hpp b/common/include/nbl/examples/common/CSwapchainFramebuffersAndDepth.hpp
new file mode 100644
index 000000000..a79d59730
--- /dev/null
+++ b/common/include/nbl/examples/common/CSwapchainFramebuffersAndDepth.hpp
@@ -0,0 +1,101 @@
+// Copyright (C) 2023-2025 - DevSH Graphics Programming Sp. z O.O.
+// This file is part of the "Nabla Engine".
+// For conditions of distribution and use, see copyright notice in nabla.h
+#ifndef _NBL_EXAMPLES_COMMON_C_SWAPCHAIN_FRAMEBUFFERS_AND_DEPTH_HPP_INCLUDED_
+#define _NBL_EXAMPLES_COMMON_C_SWAPCHAIN_FRAMEBUFFERS_AND_DEPTH_HPP_INCLUDED_
+
+// Build on top of the previous one
+#include "nbl/application_templates/BasicMultiQueueApplication.hpp"
+
+namespace nbl::examples
+{
+	
+class CSwapchainFramebuffersAndDepth final : public video::CDefaultSwapchainFramebuffers
+{
+		using base_t = CDefaultSwapchainFramebuffers;
+
+	public:
+		template<typename... Args>
+		inline CSwapchainFramebuffersAndDepth(video::ILogicalDevice* device, const asset::E_FORMAT _desiredDepthFormat, Args&&... args) : base_t(device,std::forward<Args>(args)...)
+		{
+			using namespace nbl::asset;
+			using namespace nbl::video;
+			const IPhysicalDevice::SImageFormatPromotionRequest req = {
+				.originalFormat = _desiredDepthFormat,
+				.usages = {IGPUImage::EUF_RENDER_ATTACHMENT_BIT}
+			};
+			m_depthFormat = m_device->getPhysicalDevice()->promoteImageFormat(req,IGPUImage::TILING::OPTIMAL);
+
+			const static IGPURenderpass::SCreationParams::SDepthStencilAttachmentDescription depthAttachments[] = {
+				{{
+					{
+						.format = m_depthFormat,
+						.samples = IGPUImage::ESCF_1_BIT,
+						.mayAlias = false
+					},
+					/*.loadOp = */{IGPURenderpass::LOAD_OP::CLEAR},
+					/*.storeOp = */{IGPURenderpass::STORE_OP::STORE},
+					/*.initialLayout = */{IGPUImage::LAYOUT::UNDEFINED}, // because we clear we don't care about contents
+					/*.finalLayout = */{IGPUImage::LAYOUT::ATTACHMENT_OPTIMAL} // transition to presentation right away so we can skip a barrier
+				}},
+				IGPURenderpass::SCreationParams::DepthStencilAttachmentsEnd
+			};
+			m_params.depthStencilAttachments = depthAttachments;
+
+			static IGPURenderpass::SCreationParams::SSubpassDescription subpasses[] = {
+				m_params.subpasses[0],
+				IGPURenderpass::SCreationParams::SubpassesEnd
+			};
+			subpasses[0].depthStencilAttachment.render = { .attachmentIndex = 0,.layout = IGPUImage::LAYOUT::ATTACHMENT_OPTIMAL };
+			m_params.subpasses = subpasses;
+		}
+
+	protected:
+		inline bool onCreateSwapchain_impl(const uint8_t qFam) override
+		{
+			using namespace nbl::asset;
+			using namespace nbl::video;
+			// DOCS: why are we not using `m_device` here? any particular reason?
+			auto device = const_cast<ILogicalDevice*>(m_renderpass->getOriginDevice());
+
+			const auto depthFormat = m_renderpass->getCreationParameters().depthStencilAttachments[0].format;
+			const auto& sharedParams = getSwapchain()->getCreationParameters().sharedParams;
+			auto image = device->createImage({ IImage::SCreationParams{
+				.type = IGPUImage::ET_2D,
+				.samples = IGPUImage::ESCF_1_BIT,
+				.format = depthFormat,
+				.extent = {sharedParams.width,sharedParams.height,1},
+				.mipLevels = 1,
+				.arrayLayers = 1,
+				.depthUsage = IGPUImage::EUF_RENDER_ATTACHMENT_BIT
+			} });
+
+			device->allocate(image->getMemoryReqs(), image.get());
+
+			m_depthBuffer = device->createImageView({
+				.flags = IGPUImageView::ECF_NONE,
+				.subUsages = IGPUImage::EUF_RENDER_ATTACHMENT_BIT,
+				.image = std::move(image),
+				.viewType = IGPUImageView::ET_2D,
+				.format = depthFormat,
+				.subresourceRange = {IGPUImage::EAF_DEPTH_BIT,0,1,0,1}
+				});
+
+			const auto retval = base_t::onCreateSwapchain_impl(qFam);
+			m_depthBuffer = nullptr;
+			return retval;
+		}
+
+		inline core::smart_refctd_ptr<video::IGPUFramebuffer> createFramebuffer(video::IGPUFramebuffer::SCreationParams&& params) override
+		{
+			params.depthStencilAttachments = &m_depthBuffer.get();
+			return m_device->createFramebuffer(std::move(params));
+		}
+
+		asset::E_FORMAT m_depthFormat;
+		// only used to pass a parameter from `onCreateSwapchain_impl` to `createFramebuffer`
+		core::smart_refctd_ptr<video::IGPUImageView> m_depthBuffer;
+};
+
+}
+#endif
\ No newline at end of file
diff --git a/common/include/nbl/examples/common/InputSystem.hpp b/common/include/nbl/examples/common/InputSystem.hpp
index c42b738d0..c30fc1212 100644
--- a/common/include/nbl/examples/common/InputSystem.hpp
+++ b/common/include/nbl/examples/common/InputSystem.hpp
@@ -4,16 +4,19 @@
 #ifndef _NBL_EXAMPLES_COMMON_INPUT_SYSTEM_HPP_INCLUDED_
 #define _NBL_EXAMPLES_COMMON_INPUT_SYSTEM_HPP_INCLUDED_
 
-class InputSystem : public nbl::core::IReferenceCounted
+namespace nbl::examples
+{
+
+class InputSystem : public core::IReferenceCounted
 {
 	public:
 		template <class ChannelType>
 		struct Channels
 		{
-			nbl::core::mutex lock;
+			core::mutex lock;
 			std::condition_variable added;
-			nbl::core::vector<nbl::core::smart_refctd_ptr<ChannelType>> channels;
-			nbl::core::vector<std::chrono::microseconds> timeStamps;
+			core::vector<core::smart_refctd_ptr<ChannelType>> channels;
+			core::vector<std::chrono::microseconds> timeStamps;
 			uint32_t defaultChannelIndex = 0;
 		};
 		// TODO: move to "nbl/ui/InputEventChannel.h" once the interface of this utility struct matures, also maybe rename to `Consumer` ?
@@ -21,7 +24,7 @@ class InputSystem : public nbl::core::IReferenceCounted
 		struct ChannelReader
 		{
 			template<typename F>
-			inline void consumeEvents(F&& processFunc, nbl::system::logger_opt_ptr logger=nullptr)
+			inline void consumeEvents(F&& processFunc, system::logger_opt_ptr logger=nullptr)
 			{
 				auto events = channel->getEvents();
 				const auto frontBufferCapacity = channel->getFrontBufferCapacity();
@@ -29,7 +32,7 @@ class InputSystem : public nbl::core::IReferenceCounted
 				{
 					logger.log(
 						"Detected overflow, %d unconsumed events in channel of size %d!",
-						nbl::system::ILogger::ELL_ERROR,events.size()-consumedCounter,frontBufferCapacity
+						system::ILogger::ELL_ERROR,events.size()-consumedCounter,frontBufferCapacity
 					);
 					consumedCounter = events.size()-frontBufferCapacity;
 				}
@@ -38,22 +41,22 @@ class InputSystem : public nbl::core::IReferenceCounted
 				consumedCounter = events.size();
 			}
 
-			nbl::core::smart_refctd_ptr<ChannelType> channel = nullptr;
+			core::smart_refctd_ptr<ChannelType> channel = nullptr;
 			uint64_t consumedCounter = 0ull;
 		};
 		
-		InputSystem(nbl::system::logger_opt_smart_ptr&& logger) : m_logger(std::move(logger)) {}
+		InputSystem(system::logger_opt_smart_ptr&& logger) : m_logger(std::move(logger)) {}
 
-		void getDefaultMouse(ChannelReader<nbl::ui::IMouseEventChannel>* reader)
+		void getDefaultMouse(ChannelReader<ui::IMouseEventChannel>* reader)
 		{
 			getDefault(m_mouse,reader);
 		}
-		void getDefaultKeyboard(ChannelReader<nbl::ui::IKeyboardEventChannel>* reader)
+		void getDefaultKeyboard(ChannelReader<ui::IKeyboardEventChannel>* reader)
 		{
 			getDefault(m_keyboard,reader);
 		}
 		template<class ChannelType>
-		void add(Channels<ChannelType>& channels, nbl::core::smart_refctd_ptr<ChannelType>&& channel)
+		void add(Channels<ChannelType>& channels, core::smart_refctd_ptr<ChannelType>&& channel)
 		{
 			std::unique_lock lock(channels.lock);
 			channels.channels.push_back(std::move(channel));
@@ -94,7 +97,7 @@ class InputSystem : public nbl::core::IReferenceCounted
 			std::unique_lock lock(channels.lock);
 			while (channels.channels.empty())
 			{
-				m_logger.log("Waiting For Input Device to be connected...",nbl::system::ILogger::ELL_INFO);
+				m_logger.log("Waiting For Input Device to be connected...",system::ILogger::ELL_INFO);
 				channels.added.wait(lock);
 			}
 				
@@ -159,7 +162,7 @@ class InputSystem : public nbl::core::IReferenceCounted
 					}
 
 					if(defaultIdx != newDefaultIdx) {
-						m_logger.log("Default InputChannel for ChannelType changed from %u to %u",nbl::system::ILogger::ELL_INFO, defaultIdx, newDefaultIdx);
+						m_logger.log("Default InputChannel for ChannelType changed from %u to %u",system::ILogger::ELL_INFO, defaultIdx, newDefaultIdx);
 
 						defaultIdx = newDefaultIdx;
 						channels.defaultChannelIndex = newDefaultIdx;
@@ -177,10 +180,10 @@ class InputSystem : public nbl::core::IReferenceCounted
 			reader->consumedCounter = consumedCounter;
 		}
 
-		nbl::system::logger_opt_smart_ptr m_logger;
-		Channels<nbl::ui::IMouseEventChannel> m_mouse;
-		Channels<nbl::ui::IKeyboardEventChannel> m_keyboard;
+		system::logger_opt_smart_ptr m_logger;
+		Channels<ui::IMouseEventChannel> m_mouse;
+		Channels<ui::IKeyboardEventChannel> m_keyboard;
 };
 
-
+}
 #endif
diff --git a/common/include/nbl/examples/common/MonoWindowApplication.hpp b/common/include/nbl/examples/common/MonoWindowApplication.hpp
new file mode 100644
index 000000000..0f18012c0
--- /dev/null
+++ b/common/include/nbl/examples/common/MonoWindowApplication.hpp
@@ -0,0 +1,189 @@
+// Copyright (C) 2023-2023 - DevSH Graphics Programming Sp. z O.O.
+// This file is part of the "Nabla Engine".
+// For conditions of distribution and use, see copyright notice in nabla.h
+#ifndef _NBL_EXAMPLES_COMMON_MONO_WINDOW_APPLICATION_HPP_INCLUDED_
+#define _NBL_EXAMPLES_COMMON_MONO_WINDOW_APPLICATION_HPP_INCLUDED_
+
+// Build on top of the previous one
+#include "nbl/examples/common/SimpleWindowedApplication.hpp"
+#include "nbl/examples/common/CSwapchainFramebuffersAndDepth.hpp"
+#include "nbl/examples/common/CEventCallback.hpp"
+
+namespace nbl::examples
+{
+	
+// Virtual Inheritance because apps might end up doing diamond inheritance
+class MonoWindowApplication : public virtual SimpleWindowedApplication
+{
+		using base_t = SimpleWindowedApplication;
+
+	public:
+		// Maximum frames which can be simultaneously submitted, used to cycle through our per-frame resources like command buffers
+		constexpr static inline uint8_t MaxFramesInFlight = 3;
+
+		template<typename... Args>
+		MonoWindowApplication(const hlsl::uint16_t2 _initialResolution, const asset::E_FORMAT _depthFormat, Args&&... args) :
+			base_t(std::forward<Args>(args)...), m_initialResolution(_initialResolution), m_depthFormat(_depthFormat) {}
+
+		//
+		inline core::vector<video::SPhysicalDeviceFilter::SurfaceCompatibility> getSurfaces() const override final
+		{
+			if (!m_surface)
+			{
+				using namespace nbl::core;
+				using namespace nbl::ui;
+				using namespace nbl::video;
+				{
+					auto windowCallback = make_smart_refctd_ptr<CEventCallback>(smart_refctd_ptr(m_inputSystem),smart_refctd_ptr(m_logger));
+					IWindow::SCreationParams params = {};
+					params.callback = make_smart_refctd_ptr<ISimpleManagedSurface::ICallback>();
+					params.width = m_initialResolution[0];
+					params.height = m_initialResolution[1];
+					params.x = 32;
+					params.y = 32;
+					params.flags = ui::IWindow::ECF_HIDDEN | IWindow::ECF_BORDERLESS | IWindow::ECF_RESIZABLE;
+					params.windowCaption = "MonoWindowApplication";
+					params.callback = windowCallback;
+					const_cast<std::remove_const_t<decltype(m_window)>&>(m_window) = m_winMgr->createWindow(std::move(params));
+				}
+
+				auto surface = CSurfaceVulkanWin32::create(smart_refctd_ptr(m_api), smart_refctd_ptr_static_cast<IWindowWin32>(m_window));
+				const_cast<std::remove_const_t<decltype(m_surface)>&>(m_surface) = CSimpleResizeSurface<CSwapchainFramebuffersAndDepth>::create(std::move(surface));
+			}
+
+			if (m_surface)
+				return { {m_surface->getSurface()/*,EQF_NONE*/} };
+
+			return {};
+		}
+		
+		virtual inline bool onAppInitialized(core::smart_refctd_ptr<system::ISystem>&& system) override
+		{
+			using namespace nbl::core;
+			using namespace nbl::video;
+			// want to have a usable system and logger first
+			if (!MonoSystemMonoLoggerApplication::onAppInitialized(std::move(system)))
+				return false;
+
+			m_inputSystem = make_smart_refctd_ptr<InputSystem>(system::logger_opt_smart_ptr(smart_refctd_ptr(m_logger)));
+			if (!base_t::onAppInitialized(std::move(system)))
+				return false;
+			
+			ISwapchain::SCreationParams swapchainParams = { .surface = smart_refctd_ptr<ISurface>(m_surface->getSurface()) };
+			if (!swapchainParams.deduceFormat(m_physicalDevice))
+				return logFail("Could not choose a Surface Format for the Swapchain!");
+			
+			// TODO: option without depth
+			auto scResources = std::make_unique<CSwapchainFramebuffersAndDepth>(m_device.get(),m_depthFormat,swapchainParams.surfaceFormat.format,getDefaultSubpassDependencies());
+			auto* renderpass = scResources->getRenderpass();
+
+			if (!renderpass)
+				return logFail("Failed to create Renderpass!");
+
+			auto gQueue = getGraphicsQueue();
+			if (!m_surface || !m_surface->init(gQueue,std::move(scResources),swapchainParams.sharedParams))
+				return logFail("Could not create Window & Surface or initialize the Surface!");
+			
+			m_winMgr->setWindowSize(m_window.get(),m_initialResolution[0],m_initialResolution[1]);
+			m_surface->recreateSwapchain();
+
+			return true;
+		}
+
+		// we do slight inversion of control here
+		inline void workLoopBody() override final
+		{
+			using namespace nbl::core;
+			using namespace nbl::video;
+			// framesInFlight: ensuring safe execution of command buffers and acquires, `framesInFlight` only affect semaphore waits, don't use this to index your resources because it can change with swapchain recreation.
+			const uint32_t framesInFlightCount = hlsl::min(MaxFramesInFlight,m_surface->getMaxAcquiresInFlight());
+			// We block for semaphores for 2 reasons here:
+				// A) Resource: Can't use resource like a command buffer BEFORE previous use is finished! [MaxFramesInFlight]
+				// B) Acquire: Can't have more acquires in flight than a certain threshold returned by swapchain or your surface helper class. [MaxAcquiresInFlight]
+			if (m_framesInFlight.size()>=framesInFlightCount)
+			{
+				const ISemaphore::SWaitInfo framesDone[] =
+				{
+					{
+						.semaphore = m_framesInFlight.front().semaphore.get(),
+						.value = m_framesInFlight.front().value
+					}
+				};
+				if (m_device->blockForSemaphores(framesDone)!=ISemaphore::WAIT_RESULT::SUCCESS)
+					return;
+				m_framesInFlight.pop_front();
+			}
+
+			auto updatePresentationTimestamp = [&]()
+			{
+				m_currentImageAcquire = m_surface->acquireNextImage();
+
+				// TODO: better frame pacing than this
+				oracle.reportEndFrameRecord();
+				const auto timestamp = oracle.getNextPresentationTimeStamp();
+				oracle.reportBeginFrameRecord();
+
+				return timestamp;
+			};
+
+			const auto nextPresentationTimestamp = updatePresentationTimestamp();
+
+			if (!m_currentImageAcquire)
+				return;
+
+			const IQueue::SSubmitInfo::SSemaphoreInfo rendered[] = {renderFrame(nextPresentationTimestamp)};
+			m_surface->present(m_currentImageAcquire.imageIndex,rendered);
+			if (rendered->semaphore)
+				m_framesInFlight.emplace_back(smart_refctd_ptr<ISemaphore>(rendered->semaphore),rendered->value);
+		}
+
+		//
+		virtual inline bool keepRunning() override
+		{
+			if (m_surface->irrecoverable())
+				return false;
+
+			return true;
+		}
+
+		//
+		virtual inline bool onAppTerminated()
+		{
+			m_inputSystem = nullptr;
+			m_device->waitIdle();
+			m_framesInFlight.clear();
+			m_surface = nullptr;
+			m_window = nullptr;
+			return base_t::onAppTerminated();
+		}
+
+	protected:
+		inline void onAppInitializedFinish()
+		{
+			m_winMgr->show(m_window.get());
+			oracle.reportBeginFrameRecord();
+		}
+		inline const auto& getCurrentAcquire() const {return m_currentImageAcquire;}
+
+		virtual const video::IGPURenderpass::SCreationParams::SSubpassDependency* getDefaultSubpassDependencies() const = 0;
+		virtual video::IQueue::SSubmitInfo::SSemaphoreInfo renderFrame(const std::chrono::microseconds nextPresentationTimestamp) = 0;
+
+		const hlsl::uint16_t2 m_initialResolution;
+		const asset::E_FORMAT m_depthFormat;
+		core::smart_refctd_ptr<InputSystem> m_inputSystem;
+		core::smart_refctd_ptr<ui::IWindow> m_window;
+		core::smart_refctd_ptr<video::CSimpleResizeSurface<CSwapchainFramebuffersAndDepth>> m_surface;
+
+	private:
+		struct SSubmittedFrame
+		{
+			core::smart_refctd_ptr<video::ISemaphore> semaphore;
+			uint64_t value;
+		};
+		core::deque<SSubmittedFrame> m_framesInFlight;
+		video::ISimpleManagedSurface::SAcquireResult m_currentImageAcquire = {};
+		video::CDumbPresentationOracle oracle;
+};
+
+}
+#endif
\ No newline at end of file
diff --git a/common/include/nbl/examples/geometry/CGeometryCreatorScene.hpp b/common/include/nbl/examples/geometry/CGeometryCreatorScene.hpp
index 9ebd244aa..e68441ffe 100644
--- a/common/include/nbl/examples/geometry/CGeometryCreatorScene.hpp
+++ b/common/include/nbl/examples/geometry/CGeometryCreatorScene.hpp
@@ -17,20 +17,25 @@
 namespace nbl::examples
 {
 
-enum ObjectType : uint8_t
+class CGeometryCreatorScene
 {
-	OT_CUBE,
-	OT_SPHERE,
-	OT_CYLINDER,
-	OT_RECTANGLE,
-	OT_DISK,
-	OT_ARROW,
-	OT_CONE,
-	OT_ICOSPHERE,
-
-	OT_COUNT,
-	OT_UNKNOWN = std::numeric_limits<uint8_t>::max()
+	public:
+		enum ObjectType : uint8_t
+		{
+			OT_CUBE,
+			OT_SPHERE,
+			OT_CYLINDER,
+			OT_RECTANGLE,
+			OT_DISK,
+			OT_ARROW,
+			OT_CONE,
+			OT_ICOSPHERE,
+
+			OT_COUNT,
+			OT_UNKNOWN = std::numeric_limits<uint8_t>::max()
+		};
 };
+#if 0
 
 struct ObjectMeta
 {
@@ -1346,7 +1351,7 @@ class CScene final : public nbl::core::IReferenceCounted
 
 	ResourcesBundle resources;
 };
+#endif
 
-} // nbl::scene::geometrycreator
-
-#endif // _NBL_GEOMETRY_CREATOR_SCENE_H_INCLUDED_
\ No newline at end of file
+}
+#endif
\ No newline at end of file

From 90ba9265ae0ed7cdb460ff90bb5bb2a1c439655c Mon Sep 17 00:00:00 2001
From: keptsecret <sorchon@gmail.com>
Date: Thu, 5 Jun 2025 15:00:14 +0700
Subject: [PATCH 335/529] various minor adjustments to unit tests

---
 .../app_resources/common.hlsl                 |   9 ++
 .../app_resources/shaderCommon.hlsl           |  14 +--
 .../app_resources/testSubgroup.comp.hlsl      |  40 +++----
 .../app_resources/testWorkgroup.comp.hlsl     | 110 ++++++++----------
 23_Arithmetic2UnitTest/main.cpp               |  26 ++---
 5 files changed, 88 insertions(+), 111 deletions(-)

diff --git a/23_Arithmetic2UnitTest/app_resources/common.hlsl b/23_Arithmetic2UnitTest/app_resources/common.hlsl
index 10892a2b9..2daffa56c 100644
--- a/23_Arithmetic2UnitTest/app_resources/common.hlsl
+++ b/23_Arithmetic2UnitTest/app_resources/common.hlsl
@@ -10,6 +10,14 @@ struct Output
 	uint32_t data[ScanElementCount];
 };
 
+struct PushConstantData
+{
+    uint64_t pInputBuf;
+    uint64_t ppOutputBuf;
+};
+
+namespace arithmetic
+{
 // Thanks to our unified HLSL/C++ STD lib we're able to remove a whole load of code
 template<typename T>
 struct bit_and : nbl::hlsl::bit_and<T>
@@ -92,5 +100,6 @@ struct ballot : nbl::hlsl::plus<T>
 	static inline constexpr const char* name = "bitcount";
 #endif
 };
+}
 
 #include "nbl/builtin/hlsl/subgroup/basic.hlsl"
\ No newline at end of file
diff --git a/23_Arithmetic2UnitTest/app_resources/shaderCommon.hlsl b/23_Arithmetic2UnitTest/app_resources/shaderCommon.hlsl
index 31d59121b..9045d62e8 100644
--- a/23_Arithmetic2UnitTest/app_resources/shaderCommon.hlsl
+++ b/23_Arithmetic2UnitTest/app_resources/shaderCommon.hlsl
@@ -2,6 +2,9 @@
 
 #include "nbl/builtin/hlsl/jit/device_capabilities.hlsl"
 
+using namespace nbl;
+using namespace hlsl;
+
 // https://github.com/microsoft/DirectXShaderCompiler/issues/6144
 uint32_t3 nbl::hlsl::glsl::gl_WorkGroupSize() {return uint32_t3(WORKGROUP_SIZE,1,1);}
 
@@ -9,19 +12,8 @@ uint32_t3 nbl::hlsl::glsl::gl_WorkGroupSize() {return uint32_t3(WORKGROUP_SIZE,1
 #error "Define ITEMS_PER_INVOCATION!"
 #endif
 
-struct PushConstantData
-{
-    uint64_t inputBufAddress;
-    uint64_t outputAddressBufAddress;
-};
-
 [[vk::push_constant]] PushConstantData pc;
 
-// because subgroups don't match `gl_LocalInvocationIndex` snake curve addressing, we also can't load inputs that way
-uint32_t globalIndex();
-// since we test ITEMS_PER_WG<WorkgroupSize we need this so workgroups don't overwrite each other's outputs
-bool canStore();
-
 #ifndef OPERATION
 #error "Define OPERATION!"
 #endif
diff --git a/23_Arithmetic2UnitTest/app_resources/testSubgroup.comp.hlsl b/23_Arithmetic2UnitTest/app_resources/testSubgroup.comp.hlsl
index 838f7adf9..585a8498c 100644
--- a/23_Arithmetic2UnitTest/app_resources/testSubgroup.comp.hlsl
+++ b/23_Arithmetic2UnitTest/app_resources/testSubgroup.comp.hlsl
@@ -11,16 +11,23 @@
 
 typedef vector<uint32_t, ITEMS_PER_INVOCATION> type_t;
 
+uint32_t globalIndex()
+{
+    return glsl::gl_WorkGroupID().x*WORKGROUP_SIZE+workgroup::SubgroupContiguousIndex();
+}
+
+bool canStore() { return true; }
+
 template<class Binop, uint32_t N>
 static void subtest(NBL_CONST_REF_ARG(type_t) sourceVal)
 {
-    using config_t = nbl::hlsl::subgroup2::Configuration<SUBGROUP_SIZE_LOG2>;
-    using params_t = nbl::hlsl::subgroup2::ArithmeticParams<config_t, typename Binop::base_t, N, nbl::hlsl::jit::device_capabilities>;
+    using config_t = subgroup2::Configuration<SUBGROUP_SIZE_LOG2>;
+    using params_t = subgroup2::ArithmeticParams<config_t, typename Binop::base_t, N, jit::device_capabilities>;
 
-    const uint64_t outputBufAddr = vk::RawBufferLoad<uint64_t>(pc.outputAddressBufAddress + Binop::BindingIndex * sizeof(uint64_t), sizeof(uint64_t));
+    const uint64_t outputBufAddr = vk::RawBufferLoad<uint64_t>(pc.ppOutputBuf + Binop::BindingIndex * sizeof(uint64_t), sizeof(uint64_t));
 
     if (globalIndex()==0u)
-        vk::RawBufferStore<uint32_t>(outputBufAddr, nbl::hlsl::glsl::gl_SubgroupSize());
+        vk::RawBufferStore<uint32_t>(outputBufAddr, glsl::gl_SubgroupSize());
 
     operation_t<params_t> func;
     type_t val = func(sourceVal);
@@ -31,25 +38,18 @@ static void subtest(NBL_CONST_REF_ARG(type_t) sourceVal)
 type_t test()
 {
     const uint32_t idx = globalIndex();
-    type_t sourceVal = vk::RawBufferLoad<type_t>(pc.inputBufAddress + idx * sizeof(type_t));
-
-    subtest<bit_and<uint32_t>, ITEMS_PER_INVOCATION>(sourceVal);
-    subtest<bit_xor<uint32_t>, ITEMS_PER_INVOCATION>(sourceVal);
-    subtest<bit_or<uint32_t>, ITEMS_PER_INVOCATION>(sourceVal);
-    subtest<plus<uint32_t>, ITEMS_PER_INVOCATION>(sourceVal);
-    subtest<multiplies<uint32_t>, ITEMS_PER_INVOCATION>(sourceVal);
-    subtest<minimum<uint32_t>, ITEMS_PER_INVOCATION>(sourceVal);
-    subtest<maximum<uint32_t>, ITEMS_PER_INVOCATION>(sourceVal);
+    type_t sourceVal = vk::RawBufferLoad<type_t>(pc.pInputBuf + idx * sizeof(type_t));
+
+    subtest<arithmetic::bit_and<uint32_t>, ITEMS_PER_INVOCATION>(sourceVal);
+    subtest<arithmetic::bit_xor<uint32_t>, ITEMS_PER_INVOCATION>(sourceVal);
+    subtest<arithmetic::bit_or<uint32_t>, ITEMS_PER_INVOCATION>(sourceVal);
+    subtest<arithmetic::plus<uint32_t>, ITEMS_PER_INVOCATION>(sourceVal);
+    subtest<arithmetic::multiplies<uint32_t>, ITEMS_PER_INVOCATION>(sourceVal);
+    subtest<arithmetic::minimum<uint32_t>, ITEMS_PER_INVOCATION>(sourceVal);
+    subtest<arithmetic::maximum<uint32_t>, ITEMS_PER_INVOCATION>(sourceVal);
     return sourceVal;
 }
 
-uint32_t globalIndex()
-{
-    return nbl::hlsl::glsl::gl_WorkGroupID().x*WORKGROUP_SIZE+nbl::hlsl::workgroup::SubgroupContiguousIndex();
-}
-
-bool canStore() {return true;}
-
 [numthreads(WORKGROUP_SIZE,1,1)]
 void main()
 {
diff --git a/23_Arithmetic2UnitTest/app_resources/testWorkgroup.comp.hlsl b/23_Arithmetic2UnitTest/app_resources/testWorkgroup.comp.hlsl
index 0a7fde9ba..0f97c7b54 100644
--- a/23_Arithmetic2UnitTest/app_resources/testWorkgroup.comp.hlsl
+++ b/23_Arithmetic2UnitTest/app_resources/testWorkgroup.comp.hlsl
@@ -9,12 +9,12 @@ static const uint32_t WORKGROUP_SIZE = 1u << WORKGROUP_SIZE_LOG2;
 
 #include "shaderCommon.hlsl"
 
-using config_t = nbl::hlsl::workgroup2::ArithmeticConfiguration<WORKGROUP_SIZE_LOG2, SUBGROUP_SIZE_LOG2, ITEMS_PER_INVOCATION>;
+using config_t = workgroup2::ArithmeticConfiguration<WORKGROUP_SIZE_LOG2, SUBGROUP_SIZE_LOG2, ITEMS_PER_INVOCATION>;
 
 typedef vector<uint32_t, config_t::ItemsPerInvocation_0> type_t;
 
 // final (level 1/2) scan needs to fit in one subgroup exactly
-groupshared uint32_t scratch[config_t::SharedScratchElementCount];
+groupshared uint32_t scratch[mpl::max_v<int16_t,config_t::SharedScratchElementCount,1>];
 
 struct ScratchProxy
 {
@@ -31,13 +31,13 @@ struct ScratchProxy
 
     uint32_t atomicOr(const uint32_t ix, const uint32_t value)
     {
-        return nbl::hlsl::glsl::atomicOr(scratch[ix],value);
+        return glsl::atomicOr(scratch[ix],value);
     }
 
     void workgroupExecutionAndMemoryBarrier()
     {
-        nbl::hlsl::glsl::barrier();
-        //nbl::hlsl::glsl::memoryBarrierShared(); implied by the above
+        glsl::barrier();
+        //glsl::memoryBarrierShared(); implied by the above
     }
 };
 
@@ -45,26 +45,26 @@ template<class Config, class Binop>
 struct DataProxy
 {
     using dtype_t = vector<uint32_t, Config::ItemsPerInvocation_0>;
-    static_assert(nbl::hlsl::is_same_v<dtype_t, type_t>);
+    static_assert(is_same_v<dtype_t, type_t>);
 
     template<typename AccessType, typename IndexType>
     void get(const IndexType ix, NBL_REF_ARG(AccessType) value)
     {
-        const uint32_t workgroupOffset = nbl::hlsl::glsl::gl_WorkGroupID().x * Config::VirtualWorkgroupSize;
-        value = vk::RawBufferLoad<AccessType>(pc.inputBufAddress + (workgroupOffset + ix) * sizeof(AccessType));
+        const uint32_t workgroupOffset = glsl::gl_WorkGroupID().x * Config::VirtualWorkgroupSize;
+        value = vk::RawBufferLoad<AccessType>(pc.pInputBuf + (workgroupOffset + ix) * sizeof(AccessType));
     }
     template<typename AccessType, typename IndexType>
     void set(const IndexType ix, const AccessType value)
     {
-        const uint32_t workgroupOffset = nbl::hlsl::glsl::gl_WorkGroupID().x * Config::VirtualWorkgroupSize;
-        uint64_t outputBufAddr = vk::RawBufferLoad<uint64_t>(pc.outputAddressBufAddress + Binop::BindingIndex * sizeof(uint64_t));
+        const uint32_t workgroupOffset = glsl::gl_WorkGroupID().x * Config::VirtualWorkgroupSize;
+        uint64_t outputBufAddr = vk::RawBufferLoad<uint64_t>(pc.ppOutputBuf + Binop::BindingIndex * sizeof(uint64_t));
         vk::RawBufferStore<AccessType>(outputBufAddr + sizeof(uint32_t) + sizeof(AccessType) * (workgroupOffset+ix), value, sizeof(uint32_t));
     }
 
     void workgroupExecutionAndMemoryBarrier()
     {
-        nbl::hlsl::glsl::barrier();
-        //nbl::hlsl::glsl::memoryBarrierShared(); implied by the above
+        glsl::barrier();
+        //glsl::memoryBarrierShared(); implied by the above
     }
 };
 
@@ -72,41 +72,41 @@ template<class Config, class Binop>
 struct PreloadedDataProxy
 {
     using dtype_t = vector<uint32_t, Config::ItemsPerInvocation_0>;
-    static_assert(nbl::hlsl::is_same_v<dtype_t, type_t>);
+    static_assert(is_same_v<dtype_t, type_t>);
 
     NBL_CONSTEXPR_STATIC_INLINE uint32_t PreloadedDataCount = Config::VirtualWorkgroupSize / Config::WorkgroupSize;
 
     template<typename AccessType, typename IndexType>
     void get(const IndexType ix, NBL_REF_ARG(AccessType) value)
     {
-        value = preloaded[(ix-nbl::hlsl::workgroup::SubgroupContiguousIndex())>>Config::WorkgroupSizeLog2];
+        value = preloaded[ix>>Config::WorkgroupSizeLog2];
     }
     template<typename AccessType, typename IndexType>
     void set(const IndexType ix, const AccessType value)
     {
-        preloaded[(ix-nbl::hlsl::workgroup::SubgroupContiguousIndex())>>Config::WorkgroupSizeLog2] = value;
+        preloaded[ix>>Config::WorkgroupSizeLog2] = value;
     }
 
     void preload()
     {
-        const uint32_t workgroupOffset = nbl::hlsl::glsl::gl_WorkGroupID().x * Config::VirtualWorkgroupSize;
+        const uint32_t workgroupOffset = glsl::gl_WorkGroupID().x * Config::VirtualWorkgroupSize;
         [unroll]
         for (uint32_t idx = 0; idx < PreloadedDataCount; idx++)
-            preloaded[idx] = vk::RawBufferLoad<dtype_t>(pc.inputBufAddress + (workgroupOffset + idx * Config::WorkgroupSize + nbl::hlsl::workgroup::SubgroupContiguousIndex()) * sizeof(dtype_t));
+            preloaded[idx] = vk::RawBufferLoad<dtype_t>(pc.pInputBuf + (workgroupOffset + idx * Config::WorkgroupSize + workgroup::SubgroupContiguousIndex()) * sizeof(dtype_t));
     }
     void unload()
     {
-        const uint32_t workgroupOffset = nbl::hlsl::glsl::gl_WorkGroupID().x * Config::VirtualWorkgroupSize;
-        uint64_t outputBufAddr = vk::RawBufferLoad<uint64_t>(pc.outputAddressBufAddress + Binop::BindingIndex * sizeof(uint64_t));
+        const uint32_t workgroupOffset = glsl::gl_WorkGroupID().x * Config::VirtualWorkgroupSize;
+        uint64_t outputBufAddr = vk::RawBufferLoad<uint64_t>(pc.ppOutputBuf + Binop::BindingIndex * sizeof(uint64_t));
         [unroll]
         for (uint32_t idx = 0; idx < PreloadedDataCount; idx++)
-            vk::RawBufferStore<dtype_t>(outputBufAddr + sizeof(uint32_t) + sizeof(dtype_t) * (workgroupOffset + idx * Config::WorkgroupSize + nbl::hlsl::workgroup::SubgroupContiguousIndex()), preloaded[idx], sizeof(uint32_t));
+            vk::RawBufferStore<dtype_t>(outputBufAddr + sizeof(uint32_t) + sizeof(dtype_t) * (workgroupOffset + idx * Config::WorkgroupSize + workgroup::SubgroupContiguousIndex()), preloaded[idx], sizeof(uint32_t));
     }
 
     void workgroupExecutionAndMemoryBarrier()
     {
-        nbl::hlsl::glsl::barrier();
-        //nbl::hlsl::glsl::memoryBarrierShared(); implied by the above
+        glsl::barrier();
+        //glsl::memoryBarrierShared(); implied by the above
     }
 
     dtype_t preloaded[PreloadedDataCount];
@@ -122,73 +122,55 @@ struct operation_t
 
     // workgroup reduction returns the value of the reduction
     // workgroup scans do no return anything, but use the data accessor to do the storing directly
-#if IS_REDUCTION
     void operator()()
     {
         PreloadedDataProxy<config_t,Binop> dataAccessor;
         dataAccessor.preload();
-        otype_t value = nbl::hlsl::OPERATION<config_t,binop_base_t,device_capabilities>::template __call<PreloadedDataProxy<config_t,Binop>, ScratchProxy>(dataAccessor,arithmeticAccessor);
+#if IS_REDUCTION
+        otype_t value =
+#endif
+        OPERATION<config_t,binop_base_t,device_capabilities>::template __call<PreloadedDataProxy<config_t,Binop>, ScratchProxy>(dataAccessor,arithmeticAccessor);
         // we barrier before because we alias the accessors for Binop
         arithmeticAccessor.workgroupExecutionAndMemoryBarrier();
-
+#if IS_REDUCTION
         [unroll]
         for (uint32_t i = 0; i < PreloadedDataProxy<config_t,Binop>::PreloadedDataCount; i++)
             dataAccessor.preloaded[i] = value;
+#endif
         dataAccessor.unload();
     }
-#else
-    void operator()()
-    {
-        PreloadedDataProxy<config_t,Binop> dataAccessor;
-        dataAccessor.preload();
-        nbl::hlsl::OPERATION<config_t,binop_base_t,device_capabilities>::template __call<PreloadedDataProxy<config_t,Binop>, ScratchProxy>(dataAccessor,arithmeticAccessor);
-        // we barrier before because we alias the accessors for Binop
-        arithmeticAccessor.workgroupExecutionAndMemoryBarrier();
-        dataAccessor.unload();
-    }
-#endif
 };
 
 
-template<class Binop>
-static void subtest(NBL_CONST_REF_ARG(type_t) sourceVal)
+uint32_t globalIndex()
 {
-    uint64_t outputBufAddr = vk::RawBufferLoad<uint64_t>(pc.outputAddressBufAddress + Binop::BindingIndex * sizeof(uint64_t));
-    if (globalIndex()==0u)
-        vk::RawBufferStore<uint32_t>(outputBufAddr, nbl::hlsl::glsl::gl_SubgroupSize());
-
-    operation_t<Binop,nbl::hlsl::jit::device_capabilities> func;
-    func();
+    return glsl::gl_WorkGroupID().x*ITEMS_PER_WG+workgroup::SubgroupContiguousIndex();
 }
 
-
-type_t test()
+template<class Binop>
+static void subtest()
 {
-    type_t sourceVal = vk::RawBufferLoad<type_t>(pc.inputBufAddress + globalIndex() * sizeof(type_t));
-
-    subtest<bit_and<uint32_t> >(sourceVal);
-    subtest<bit_xor<uint32_t> >(sourceVal);
-    subtest<bit_or<uint32_t> >(sourceVal);
-    subtest<plus<uint32_t> >(sourceVal);
-    subtest<multiplies<uint32_t> >(sourceVal);
-    subtest<minimum<uint32_t> >(sourceVal);
-    subtest<maximum<uint32_t> >(sourceVal);
-    return sourceVal;
-}
-
+    uint64_t outputBufAddr = vk::RawBufferLoad<uint64_t>(pc.ppOutputBuf + Binop::BindingIndex * sizeof(uint64_t));
+    if (globalIndex()==0u)
+        vk::RawBufferStore<uint32_t>(outputBufAddr, glsl::gl_SubgroupSize());
 
-uint32_t globalIndex()
-{
-    return nbl::hlsl::glsl::gl_WorkGroupID().x*ITEMS_PER_WG+nbl::hlsl::workgroup::SubgroupContiguousIndex();
+    operation_t<Binop,jit::device_capabilities> func;
+    func();
 }
 
-bool canStore()
+void test()
 {
-    return nbl::hlsl::workgroup::SubgroupContiguousIndex()<ITEMS_PER_WG;
+    subtest<arithmetic::bit_and<uint32_t> >();
+    subtest<arithmetic::bit_xor<uint32_t> >();
+    subtest<arithmetic::bit_or<uint32_t> >();
+    subtest<arithmetic::plus<uint32_t> >();
+    subtest<arithmetic::multiplies<uint32_t> >();
+    subtest<arithmetic::minimum<uint32_t> >();
+    subtest<arithmetic::maximum<uint32_t> >();
 }
 
 [numthreads(WORKGROUP_SIZE,1,1)]
 void main()
 {
-    const type_t sourceVal = test();
+    test();
 }
\ No newline at end of file
diff --git a/23_Arithmetic2UnitTest/main.cpp b/23_Arithmetic2UnitTest/main.cpp
index 73e6a144e..71642b631 100644
--- a/23_Arithmetic2UnitTest/main.cpp
+++ b/23_Arithmetic2UnitTest/main.cpp
@@ -45,12 +45,6 @@ struct emulatedScanExclusive
 	static inline constexpr const char* name = "exclusive_scan";
 };
 
-struct PushConstantData
-{
-	uint64_t inputBufAddress;
-	uint64_t outputAddressBufAddress;
-};
-
 class Workgroup2ScanTestApp final : public application_templates::BasicMultiQueueApplication, public application_templates::MonoAssetManagerAndBuiltinResourceApplication
 {
 	using device_base_t = application_templates::BasicMultiQueueApplication;
@@ -118,8 +112,8 @@ class Workgroup2ScanTestApp final : public application_templates::BasicMultiQueu
 			params.size = OutputBufferCount * sizeof(uint64_t);
 			m_utils->createFilledDeviceLocalBufferOnDedMem(SIntendedSubmitInfo{ .queue = getTransferUpQueue() }, std::move(params), outputAddresses.data()).move_into(gpuOutputAddressesBuffer);
 		}
-		pc.inputBufAddress = gpuinputDataBuffer->getDeviceAddress();
-		pc.outputAddressBufAddress = gpuOutputAddressesBuffer->getDeviceAddress();
+		pc.pInputBuf = gpuinputDataBuffer->getDeviceAddress();
+		pc.ppOutputBuf = gpuOutputAddressesBuffer->getDeviceAddress();
 
 		// create Pipeline Layout
 		{
@@ -310,7 +304,7 @@ class Workgroup2ScanTestApp final : public application_templates::BasicMultiQueu
 	template<template<class> class Arithmetic, bool WorkgroupTest>
 	bool runTest(const smart_refctd_ptr<const ICPUShader>& source, const uint32_t elementCount, const uint8_t subgroupSizeLog2, const uint32_t workgroupSize, uint32_t itemsPerWG = ~0u, uint32_t itemsPerInvoc = 1u)
 	{
-		std::string arith_name = Arithmetic<bit_xor<float>>::name;
+		std::string arith_name = Arithmetic<arithmetic::bit_xor<float>>::name;
 		const uint32_t workgroupSizeLog2 = hlsl::findMSB(workgroupSize);
 
 		auto compiler = make_smart_refctd_ptr<asset::CHLSLCompiler>(smart_refctd_ptr(m_system));
@@ -423,13 +417,13 @@ class Workgroup2ScanTestApp final : public application_templates::BasicMultiQueu
 		m_device->blockForSemaphores(wait);
 
 		// check results
-		bool passed = validateResults<Arithmetic, bit_and<uint32_t>, WorkgroupTest>(itemsPerWG, workgroupCount, itemsPerInvoc);
-		passed = validateResults<Arithmetic, bit_xor<uint32_t>, WorkgroupTest>(itemsPerWG, workgroupCount, itemsPerInvoc) && passed;
-		passed = validateResults<Arithmetic, bit_or<uint32_t>, WorkgroupTest>(itemsPerWG, workgroupCount, itemsPerInvoc) && passed;
-		passed = validateResults<Arithmetic, plus<uint32_t>, WorkgroupTest>(itemsPerWG, workgroupCount, itemsPerInvoc) && passed;
-		passed = validateResults<Arithmetic, multiplies<uint32_t>, WorkgroupTest>(itemsPerWG, workgroupCount, itemsPerInvoc) && passed;
-		passed = validateResults<Arithmetic, minimum<uint32_t>, WorkgroupTest>(itemsPerWG, workgroupCount, itemsPerInvoc) && passed;
-		passed = validateResults<Arithmetic, maximum<uint32_t>, WorkgroupTest>(itemsPerWG, workgroupCount, itemsPerInvoc) && passed;
+		bool passed = validateResults<Arithmetic, arithmetic::bit_and<uint32_t>, WorkgroupTest>(itemsPerWG, workgroupCount, itemsPerInvoc);
+		passed = validateResults<Arithmetic, arithmetic::bit_xor<uint32_t>, WorkgroupTest>(itemsPerWG, workgroupCount, itemsPerInvoc) && passed;
+		passed = validateResults<Arithmetic, arithmetic::bit_or<uint32_t>, WorkgroupTest>(itemsPerWG, workgroupCount, itemsPerInvoc) && passed;
+		passed = validateResults<Arithmetic, arithmetic::plus<uint32_t>, WorkgroupTest>(itemsPerWG, workgroupCount, itemsPerInvoc) && passed;
+		passed = validateResults<Arithmetic, arithmetic::multiplies<uint32_t>, WorkgroupTest>(itemsPerWG, workgroupCount, itemsPerInvoc) && passed;
+		passed = validateResults<Arithmetic, arithmetic::minimum<uint32_t>, WorkgroupTest>(itemsPerWG, workgroupCount, itemsPerInvoc) && passed;
+		passed = validateResults<Arithmetic, arithmetic::maximum<uint32_t>, WorkgroupTest>(itemsPerWG, workgroupCount, itemsPerInvoc) && passed;
 
 		return passed;
 	}

From 19d7fe0fa35a0e6ddf7061b1ed22460ebdb56273 Mon Sep 17 00:00:00 2001
From: keptsecret <sorchon@gmail.com>
Date: Thu, 5 Jun 2025 15:21:42 +0700
Subject: [PATCH 336/529] simplified data accessors

---
 .../app_resources/testWorkgroup.comp.hlsl     | 37 +++++++++++++------
 1 file changed, 25 insertions(+), 12 deletions(-)

diff --git a/23_Arithmetic2UnitTest/app_resources/testWorkgroup.comp.hlsl b/23_Arithmetic2UnitTest/app_resources/testWorkgroup.comp.hlsl
index 0f97c7b54..5cb316578 100644
--- a/23_Arithmetic2UnitTest/app_resources/testWorkgroup.comp.hlsl
+++ b/23_Arithmetic2UnitTest/app_resources/testWorkgroup.comp.hlsl
@@ -47,18 +47,23 @@ struct DataProxy
     using dtype_t = vector<uint32_t, Config::ItemsPerInvocation_0>;
     static_assert(is_same_v<dtype_t, type_t>);
 
+    static DataProxy<Config, Binop> create()
+    {
+        DataProxy<Config, Binop> retval;
+        retval.workgroupOffset = glsl::gl_WorkGroupID().x * Config::VirtualWorkgroupSize;
+        retval.outputBufAddr = sizeof(uint32_t) + vk::RawBufferLoad<uint64_t>(pc.ppOutputBuf + Binop::BindingIndex * sizeof(uint64_t));
+        return retval;
+    }
+
     template<typename AccessType, typename IndexType>
     void get(const IndexType ix, NBL_REF_ARG(AccessType) value)
     {
-        const uint32_t workgroupOffset = glsl::gl_WorkGroupID().x * Config::VirtualWorkgroupSize;
         value = vk::RawBufferLoad<AccessType>(pc.pInputBuf + (workgroupOffset + ix) * sizeof(AccessType));
     }
     template<typename AccessType, typename IndexType>
     void set(const IndexType ix, const AccessType value)
     {
-        const uint32_t workgroupOffset = glsl::gl_WorkGroupID().x * Config::VirtualWorkgroupSize;
-        uint64_t outputBufAddr = vk::RawBufferLoad<uint64_t>(pc.ppOutputBuf + Binop::BindingIndex * sizeof(uint64_t));
-        vk::RawBufferStore<AccessType>(outputBufAddr + sizeof(uint32_t) + sizeof(AccessType) * (workgroupOffset+ix), value, sizeof(uint32_t));
+        vk::RawBufferStore<AccessType>(outputBufAddr + sizeof(AccessType) * (workgroupOffset+ix), value, sizeof(uint32_t));
     }
 
     void workgroupExecutionAndMemoryBarrier()
@@ -66,6 +71,9 @@ struct DataProxy
         glsl::barrier();
         //glsl::memoryBarrierShared(); implied by the above
     }
+
+    uint32_t workgroupOffset;
+    uint64_t outputBufAddr;
 };
 
 template<class Config, class Binop>
@@ -76,6 +84,13 @@ struct PreloadedDataProxy
 
     NBL_CONSTEXPR_STATIC_INLINE uint32_t PreloadedDataCount = Config::VirtualWorkgroupSize / Config::WorkgroupSize;
 
+    static PreloadedDataProxy<Config, Binop> create()
+    {
+        PreloadedDataProxy<Config, Binop> retval;
+        retval.data = DataProxy<Config, Binop>::create();
+        return retval;
+    }
+
     template<typename AccessType, typename IndexType>
     void get(const IndexType ix, NBL_REF_ARG(AccessType) value)
     {
@@ -89,18 +104,15 @@ struct PreloadedDataProxy
 
     void preload()
     {
-        const uint32_t workgroupOffset = glsl::gl_WorkGroupID().x * Config::VirtualWorkgroupSize;
         [unroll]
-        for (uint32_t idx = 0; idx < PreloadedDataCount; idx++)
-            preloaded[idx] = vk::RawBufferLoad<dtype_t>(pc.pInputBuf + (workgroupOffset + idx * Config::WorkgroupSize + workgroup::SubgroupContiguousIndex()) * sizeof(dtype_t));
+        for (uint16_t idx = 0; idx < PreloadedDataCount; idx++)
+            data.template get<dtype_t, uint16_t>(idx * Config::WorkgroupSize + workgroup::SubgroupContiguousIndex(), preloaded[idx]);
     }
     void unload()
     {
-        const uint32_t workgroupOffset = glsl::gl_WorkGroupID().x * Config::VirtualWorkgroupSize;
-        uint64_t outputBufAddr = vk::RawBufferLoad<uint64_t>(pc.ppOutputBuf + Binop::BindingIndex * sizeof(uint64_t));
         [unroll]
-        for (uint32_t idx = 0; idx < PreloadedDataCount; idx++)
-            vk::RawBufferStore<dtype_t>(outputBufAddr + sizeof(uint32_t) + sizeof(dtype_t) * (workgroupOffset + idx * Config::WorkgroupSize + workgroup::SubgroupContiguousIndex()), preloaded[idx], sizeof(uint32_t));
+        for (uint16_t idx = 0; idx < PreloadedDataCount; idx++)
+            data.template set<dtype_t, uint16_t>(idx * Config::WorkgroupSize + workgroup::SubgroupContiguousIndex(), preloaded[idx]);
     }
 
     void workgroupExecutionAndMemoryBarrier()
@@ -109,6 +121,7 @@ struct PreloadedDataProxy
         //glsl::memoryBarrierShared(); implied by the above
     }
 
+    DataProxy<Config, Binop> data;
     dtype_t preloaded[PreloadedDataCount];
 };
 
@@ -124,7 +137,7 @@ struct operation_t
     // workgroup scans do no return anything, but use the data accessor to do the storing directly
     void operator()()
     {
-        PreloadedDataProxy<config_t,Binop> dataAccessor;
+        PreloadedDataProxy<config_t,Binop> dataAccessor = PreloadedDataProxy<config_t,Binop>::create();
         dataAccessor.preload();
 #if IS_REDUCTION
         otype_t value =

From fdace317db64525773dcf0cca9bc647331db7540 Mon Sep 17 00:00:00 2001
From: keptsecret <sorchon@gmail.com>
Date: Thu, 5 Jun 2025 15:47:23 +0700
Subject: [PATCH 337/529] tests for native and emulated subgroup op

---
 .../app_resources/shaderCommon.hlsl           |  11 +-
 .../app_resources/testSubgroup.comp.hlsl      |   2 +-
 .../app_resources/testWorkgroup.comp.hlsl     |   2 +-
 23_Arithmetic2UnitTest/main.cpp               | 112 +++++++++++-------
 4 files changed, 80 insertions(+), 47 deletions(-)

diff --git a/23_Arithmetic2UnitTest/app_resources/shaderCommon.hlsl b/23_Arithmetic2UnitTest/app_resources/shaderCommon.hlsl
index 9045d62e8..6b9575ccd 100644
--- a/23_Arithmetic2UnitTest/app_resources/shaderCommon.hlsl
+++ b/23_Arithmetic2UnitTest/app_resources/shaderCommon.hlsl
@@ -1,7 +1,5 @@
 #include "common.hlsl"
 
-#include "nbl/builtin/hlsl/jit/device_capabilities.hlsl"
-
 using namespace nbl;
 using namespace hlsl;
 
@@ -14,6 +12,15 @@ uint32_t3 nbl::hlsl::glsl::gl_WorkGroupSize() {return uint32_t3(WORKGROUP_SIZE,1
 
 [[vk::push_constant]] PushConstantData pc;
 
+struct device_capabilities
+{
+#ifdef TEST_NATIVE
+    NBL_CONSTEXPR_STATIC_INLINE bool shaderSubgroupArithmetic = true;
+#else
+    NBL_CONSTEXPR_STATIC_INLINE bool shaderSubgroupArithmetic = false;
+#endif
+};
+
 #ifndef OPERATION
 #error "Define OPERATION!"
 #endif
diff --git a/23_Arithmetic2UnitTest/app_resources/testSubgroup.comp.hlsl b/23_Arithmetic2UnitTest/app_resources/testSubgroup.comp.hlsl
index 585a8498c..8d8557ccd 100644
--- a/23_Arithmetic2UnitTest/app_resources/testSubgroup.comp.hlsl
+++ b/23_Arithmetic2UnitTest/app_resources/testSubgroup.comp.hlsl
@@ -22,7 +22,7 @@ template<class Binop, uint32_t N>
 static void subtest(NBL_CONST_REF_ARG(type_t) sourceVal)
 {
     using config_t = subgroup2::Configuration<SUBGROUP_SIZE_LOG2>;
-    using params_t = subgroup2::ArithmeticParams<config_t, typename Binop::base_t, N, jit::device_capabilities>;
+    using params_t = subgroup2::ArithmeticParams<config_t, typename Binop::base_t, N, device_capabilities>;
 
     const uint64_t outputBufAddr = vk::RawBufferLoad<uint64_t>(pc.ppOutputBuf + Binop::BindingIndex * sizeof(uint64_t), sizeof(uint64_t));
 
diff --git a/23_Arithmetic2UnitTest/app_resources/testWorkgroup.comp.hlsl b/23_Arithmetic2UnitTest/app_resources/testWorkgroup.comp.hlsl
index 5cb316578..cdd4af4b2 100644
--- a/23_Arithmetic2UnitTest/app_resources/testWorkgroup.comp.hlsl
+++ b/23_Arithmetic2UnitTest/app_resources/testWorkgroup.comp.hlsl
@@ -167,7 +167,7 @@ static void subtest()
     if (globalIndex()==0u)
         vk::RawBufferStore<uint32_t>(outputBufAddr, glsl::gl_SubgroupSize());
 
-    operation_t<Binop,jit::device_capabilities> func;
+    operation_t<Binop,device_capabilities> func;
     func();
 }
 
diff --git a/23_Arithmetic2UnitTest/main.cpp b/23_Arithmetic2UnitTest/main.cpp
index 71642b631..98a9def2e 100644
--- a/23_Arithmetic2UnitTest/main.cpp
+++ b/23_Arithmetic2UnitTest/main.cpp
@@ -187,47 +187,65 @@ class Workgroup2ScanTestApp final : public application_templates::BasicMultiQueu
 		const auto MaxWorkgroupSize = m_physicalDevice->getLimits().maxComputeWorkGroupInvocations;
 		const auto MinSubgroupSize = m_physicalDevice->getLimits().minSubgroupSize;
 		const auto MaxSubgroupSize = m_physicalDevice->getLimits().maxSubgroupSize;
-		for (auto subgroupSize=MinSubgroupSize; subgroupSize <= MaxSubgroupSize; subgroupSize *= 2u)
+		for (uint32_t useNative = 0; useNative < 2; useNative++)
 		{
-			const uint8_t subgroupSizeLog2 = hlsl::findMSB(subgroupSize);
-			for (uint32_t workgroupSize = subgroupSize; workgroupSize <= MaxWorkgroupSize; workgroupSize *= 2u)
+			bool b_useNative = false;
+			if (!m_physicalDevice->getProperties().limits.shaderSubgroupArithmetic && useNative == 0)
 			{
-				// make sure renderdoc captures everything for debugging
-				m_api->startCapture();
-				m_logger->log("Testing Workgroup Size %u with Subgroup Size %u", ILogger::ELL_INFO, workgroupSize, subgroupSize);
+				m_logger->log("Device property shaderSubgroupArithmetic is false! Skipping to emulated arithmetic...", ILogger::ELL_INFO);
+				continue;
+			}
 
-				for (uint32_t j = 0; j < ItemsPerInvocations.size(); j++)
-				{
-					const uint32_t itemsPerInvocation = ItemsPerInvocations[j];
-					m_logger->log("Testing Items per Invocation %u", ILogger::ELL_INFO, itemsPerInvocation);
-					bool passed = true;
-					passed = runTest<emulatedReduction, false>(subgroupTestSource, elementCount, subgroupSizeLog2, workgroupSize, ~0u, itemsPerInvocation) && passed;
-					logTestOutcome(passed, workgroupSize);
-					passed = runTest<emulatedScanInclusive, false>(subgroupTestSource, elementCount, subgroupSizeLog2, workgroupSize, ~0u, itemsPerInvocation) && passed;
-					logTestOutcome(passed, workgroupSize);
-					passed = runTest<emulatedScanExclusive, false>(subgroupTestSource, elementCount, subgroupSizeLog2, workgroupSize, ~0u, itemsPerInvocation) && passed;
-					logTestOutcome(passed, workgroupSize);
-
-					const uint32_t itemsPerWG = calculateItemsPerWorkgroup(workgroupSize, subgroupSize, itemsPerInvocation);
-					m_logger->log("Testing Item Count %u", ILogger::ELL_INFO, itemsPerWG);
-					passed = runTest<emulatedReduction, true>(workgroupTestSource, elementCount, subgroupSizeLog2, workgroupSize, itemsPerWG, itemsPerInvocation) && passed;
-					logTestOutcome(passed, itemsPerWG);
-					passed = runTest<emulatedScanInclusive, true>(workgroupTestSource, elementCount, subgroupSizeLog2, workgroupSize, itemsPerWG, itemsPerInvocation) && passed;
-					logTestOutcome(passed, itemsPerWG);
-					passed = runTest<emulatedScanExclusive, true>(workgroupTestSource, elementCount, subgroupSizeLog2, workgroupSize, itemsPerWG, itemsPerInvocation) && passed;
-					logTestOutcome(passed, itemsPerWG);
-				}
-				m_api->endCapture();
+			if (useNative)
+				m_logger->log("Testing with emulated subgroup arithmetic", ILogger::ELL_INFO);
+			else
+			{
+				m_logger->log("Testing with native subgroup arithmetic", ILogger::ELL_INFO);
+				b_useNative = true;
+			}
 
-				// save cache every now and then	
+			for (auto subgroupSize = MinSubgroupSize; subgroupSize <= MaxSubgroupSize; subgroupSize *= 2u)
+			{
+				const uint8_t subgroupSizeLog2 = hlsl::findMSB(subgroupSize);
+				for (uint32_t workgroupSize = subgroupSize; workgroupSize <= MaxWorkgroupSize; workgroupSize *= 2u)
 				{
-					auto cpu = m_spirv_isa_cache->convertToCPUCache();
-					// Normally we'd beautifully JSON serialize the thing, allow multiple devices & drivers + metadata
-					auto bin = cpu->getEntries().begin()->second.bin;
-					IFile::success_t success;
-					m_spirv_isa_cache_output->write(success, bin->data(), 0ull, bin->size());
-					if (!success)
-						logFail("Could not write Create SPIR-V to ISA cache to disk!");
+					// make sure renderdoc captures everything for debugging
+					m_api->startCapture();
+					m_logger->log("Testing Workgroup Size %u with Subgroup Size %u", ILogger::ELL_INFO, workgroupSize, subgroupSize);
+
+					for (uint32_t j = 0; j < ItemsPerInvocations.size(); j++)
+					{
+						const uint32_t itemsPerInvocation = ItemsPerInvocations[j];
+						m_logger->log("Testing Items per Invocation %u", ILogger::ELL_INFO, itemsPerInvocation);
+						bool passed = true;
+						passed = runTest<emulatedReduction, false>(subgroupTestSource, elementCount, subgroupSizeLog2, workgroupSize, b_useNative, ~0u, itemsPerInvocation) && passed;
+						logTestOutcome(passed, workgroupSize);
+						passed = runTest<emulatedScanInclusive, false>(subgroupTestSource, elementCount, subgroupSizeLog2, workgroupSize, b_useNative, ~0u, itemsPerInvocation) && passed;
+						logTestOutcome(passed, workgroupSize);
+						passed = runTest<emulatedScanExclusive, false>(subgroupTestSource, elementCount, subgroupSizeLog2, workgroupSize, b_useNative, ~0u, itemsPerInvocation) && passed;
+						logTestOutcome(passed, workgroupSize);
+
+						const uint32_t itemsPerWG = calculateItemsPerWorkgroup(workgroupSize, subgroupSize, itemsPerInvocation);
+						m_logger->log("Testing Item Count %u", ILogger::ELL_INFO, itemsPerWG);
+						passed = runTest<emulatedReduction, true>(workgroupTestSource, elementCount, subgroupSizeLog2, workgroupSize, b_useNative, itemsPerWG, itemsPerInvocation) && passed;
+						logTestOutcome(passed, itemsPerWG);
+						passed = runTest<emulatedScanInclusive, true>(workgroupTestSource, elementCount, subgroupSizeLog2, workgroupSize, b_useNative, itemsPerWG, itemsPerInvocation) && passed;
+						logTestOutcome(passed, itemsPerWG);
+						passed = runTest<emulatedScanExclusive, true>(workgroupTestSource, elementCount, subgroupSizeLog2, workgroupSize, b_useNative, itemsPerWG, itemsPerInvocation) && passed;
+						logTestOutcome(passed, itemsPerWG);
+					}
+					m_api->endCapture();
+
+					// save cache every now and then	
+					{
+						auto cpu = m_spirv_isa_cache->convertToCPUCache();
+						// Normally we'd beautifully JSON serialize the thing, allow multiple devices & drivers + metadata
+						auto bin = cpu->getEntries().begin()->second.bin;
+						IFile::success_t success;
+						m_spirv_isa_cache_output->write(success, bin->data(), 0ull, bin->size());
+						if (!success)
+							logFail("Could not write Create SPIR-V to ISA cache to disk!");
+					}
 				}
 			}
 		}
@@ -302,7 +320,7 @@ class Workgroup2ScanTestApp final : public application_templates::BasicMultiQueu
 	}
 
 	template<template<class> class Arithmetic, bool WorkgroupTest>
-	bool runTest(const smart_refctd_ptr<const ICPUShader>& source, const uint32_t elementCount, const uint8_t subgroupSizeLog2, const uint32_t workgroupSize, uint32_t itemsPerWG = ~0u, uint32_t itemsPerInvoc = 1u)
+	bool runTest(const smart_refctd_ptr<const ICPUShader>& source, const uint32_t elementCount, const uint8_t subgroupSizeLog2, const uint32_t workgroupSize, bool useNative, uint32_t itemsPerWG = ~0u, uint32_t itemsPerInvoc = 1u)
 	{
 		std::string arith_name = Arithmetic<arithmetic::bit_xor<float>>::name;
 		const uint32_t workgroupSizeLog2 = hlsl::findMSB(workgroupSize);
@@ -338,15 +356,19 @@ class Workgroup2ScanTestApp final : public application_templates::BasicMultiQueu
 				std::to_string(arith_name=="reduction")
 			};
 
-			const IShaderCompiler::SMacroDefinition defines[6] = {
+			const IShaderCompiler::SMacroDefinition defines[7] = {
 				{ "OPERATION", definitions[0] },
 				{ "WORKGROUP_SIZE_LOG2", definitions[1] },
 				{ "ITEMS_PER_WG", definitions[2] },
 				{ "ITEMS_PER_INVOCATION", definitions[3] },
 				{ "SUBGROUP_SIZE_LOG2", definitions[4] },
-				{ "IS_REDUCTION", definitions[5] }
+				{ "IS_REDUCTION", definitions[5] },
+				{ "TEST_NATIVE", "1" }
 			};
-			options.preprocessorOptions.extraDefines = { defines, defines + 6 };
+			if (useNative)
+				options.preprocessorOptions.extraDefines = { defines, defines + 7 };
+			else
+				options.preprocessorOptions.extraDefines = { defines, defines + 6 };
 
 			overriddenUnspecialized = compiler->compileToSPIRV((const char*)source->getContent()->getPointer(), options);
 		}
@@ -359,13 +381,17 @@ class Workgroup2ScanTestApp final : public application_templates::BasicMultiQueu
 				std::to_string(subgroupSizeLog2)
 			};
 
-			const IShaderCompiler::SMacroDefinition defines[4] = {
+			const IShaderCompiler::SMacroDefinition defines[5] = {
 				{ "OPERATION", definitions[0] },
 				{ "WORKGROUP_SIZE", definitions[1] },
 				{ "ITEMS_PER_INVOCATION", definitions[2] },
-				{ "SUBGROUP_SIZE_LOG2", definitions[3] }
+				{ "SUBGROUP_SIZE_LOG2", definitions[3] },
+				{ "TEST_NATIVE", "1" }
 			};
-			options.preprocessorOptions.extraDefines = { defines, defines + 4 };
+			if (useNative)
+				options.preprocessorOptions.extraDefines = { defines, defines + 5 };
+			else
+				options.preprocessorOptions.extraDefines = { defines, defines + 4 };
 
 			overriddenUnspecialized = compiler->compileToSPIRV((const char*)source->getContent()->getPointer(), options);
 		}

From d6680f2996d7acf56085b9e072c29698d9d06469 Mon Sep 17 00:00:00 2001
From: keptsecret <sorchon@gmail.com>
Date: Thu, 5 Jun 2025 15:58:32 +0700
Subject: [PATCH 338/529] removed redundant stuff

---
 23_Arithmetic2UnitTest/app_resources/common.hlsl | 9 ---------
 23_Arithmetic2UnitTest/main.cpp                  | 5 ++---
 2 files changed, 2 insertions(+), 12 deletions(-)

diff --git a/23_Arithmetic2UnitTest/app_resources/common.hlsl b/23_Arithmetic2UnitTest/app_resources/common.hlsl
index 2daffa56c..ddf5dc00f 100644
--- a/23_Arithmetic2UnitTest/app_resources/common.hlsl
+++ b/23_Arithmetic2UnitTest/app_resources/common.hlsl
@@ -1,15 +1,6 @@
 #include "nbl/builtin/hlsl/cpp_compat.hlsl"
 #include "nbl/builtin/hlsl/functional.hlsl"
 
-template<uint32_t kScanElementCount=1024*1024>
-struct Output
-{
-	NBL_CONSTEXPR_STATIC_INLINE uint32_t ScanElementCount = kScanElementCount;
-
-	uint32_t subgroupSize;
-	uint32_t data[ScanElementCount];
-};
-
 struct PushConstantData
 {
     uint64_t pInputBuf;
diff --git a/23_Arithmetic2UnitTest/main.cpp b/23_Arithmetic2UnitTest/main.cpp
index 98a9def2e..326c9e57f 100644
--- a/23_Arithmetic2UnitTest/main.cpp
+++ b/23_Arithmetic2UnitTest/main.cpp
@@ -65,7 +65,7 @@ class Workgroup2ScanTestApp final : public application_templates::BasicMultiQueu
 		computeQueue = getComputeQueue();
 
 		// TODO: get the element count from argv
-		const uint32_t elementCount = Output<>::ScanElementCount;
+		const uint32_t elementCount = 1024 * 1024;
 		// populate our random data buffer on the CPU and create a GPU copy
 		inputData = new uint32_t[elementCount];
 		smart_refctd_ptr<IGPUBuffer> gpuinputDataBuffer;
@@ -75,7 +75,7 @@ class Workgroup2ScanTestApp final : public application_templates::BasicMultiQueu
 				inputData[i] = randGenerator(); // TODO: change to using xoroshiro, then we can skip having the input buffer at all
 
 			IGPUBuffer::SCreationParams inputDataBufferCreationParams = {};
-			inputDataBufferCreationParams.size = sizeof(Output<>::data[0]) * elementCount;
+			inputDataBufferCreationParams.size = sizeof(uint32_t) * elementCount;
 			inputDataBufferCreationParams.usage = IGPUBuffer::EUF_STORAGE_BUFFER_BIT | IGPUBuffer::EUF_TRANSFER_DST_BIT | IGPUBuffer::EUF_SHADER_DEVICE_ADDRESS_BIT;
 			m_utils->createFilledDeviceLocalBufferOnDedMem(
 				SIntendedSubmitInfo{.queue=getTransferUpQueue()},
@@ -341,7 +341,6 @@ class Workgroup2ScanTestApp final : public application_templates::BasicMultiQueu
 		options.preprocessorOptions.logger = m_logger.get();
 
 		auto* includeFinder = compiler->getDefaultIncludeFinder();
-		includeFinder->addSearchPath("nbl/builtin/hlsl/jit", core::make_smart_refctd_ptr<CJITIncludeLoader>(m_physicalDevice->getLimits(), m_device->getEnabledFeatures()));
 		options.preprocessorOptions.includeFinder = includeFinder;
 
 		smart_refctd_ptr<ICPUShader> overriddenUnspecialized;

From bafad3ecd353863a1f12feada096814799a1ee04 Mon Sep 17 00:00:00 2001
From: keptsecret <sorchon@gmail.com>
Date: Fri, 6 Jun 2025 11:04:06 +0700
Subject: [PATCH 339/529] bind swapchain image directly, explicit surface
 format swapchain

---
 29_Arithmetic2Bench/main.cpp | 286 ++++++++++++++++++++++-------------
 1 file changed, 181 insertions(+), 105 deletions(-)

diff --git a/29_Arithmetic2Bench/main.cpp b/29_Arithmetic2Bench/main.cpp
index 165427750..9f59f38d8 100644
--- a/29_Arithmetic2Bench/main.cpp
+++ b/29_Arithmetic2Bench/main.cpp
@@ -53,6 +53,160 @@ struct PushConstantData
 	uint64_t outputAddressBufAddress;
 };
 
+template<typename SwapchainResources> requires std::is_base_of_v<ISimpleManagedSurface::ISwapchainResources, SwapchainResources>
+class CExplicitSurfaceFormatResizeSurface final : public ISimpleManagedSurface
+{
+public:
+	using this_t = CExplicitSurfaceFormatResizeSurface<SwapchainResources>;
+
+	// Factory method so we can fail, requires a `_surface` created from a window and with a callback that inherits from `ICallback` declared just above
+	template<typename Surface> requires std::is_base_of_v<CSurface<typename Surface::window_t, typename Surface::immediate_base_t>, Surface>
+	static inline core::smart_refctd_ptr<this_t> create(core::smart_refctd_ptr<Surface>&& _surface)
+	{
+		if (!_surface)
+			return nullptr;
+
+		auto _window = _surface->getWindow();
+		ICallback* cb = nullptr;
+		if (_window)
+			cb = dynamic_cast<ICallback*>(_window->getEventCallback());
+
+		return core::smart_refctd_ptr<this_t>(new this_t(std::move(_surface), cb), core::dont_grab);
+	}
+
+	// Factory method so we can fail, requires a `_surface` created from a native surface
+	template<typename Surface> requires std::is_base_of_v<CSurfaceNative<typename Surface::window_t, typename Surface::immediate_base_t>, Surface>
+	static inline core::smart_refctd_ptr<this_t> create(core::smart_refctd_ptr<Surface>&& _surface, ICallback* cb)
+	{
+		if (!_surface)
+			return nullptr;
+
+		return core::smart_refctd_ptr<this_t>(new this_t(std::move(_surface), cb), core::dont_grab);
+	}
+
+	//
+	inline bool init(CThreadSafeQueueAdapter* queue, std::unique_ptr<SwapchainResources>&& scResources, const ISwapchain::SSharedCreationParams& sharedParams = {})
+	{
+		if (!scResources || !base_init(queue))
+			return init_fail();
+
+		m_sharedParams = sharedParams;
+		if (!m_sharedParams.deduce(queue->getOriginDevice()->getPhysicalDevice(), getSurface()))
+			return init_fail();
+
+		m_swapchainResources = std::move(scResources);
+		return true;
+	}
+
+	// Can be public because we don't need to worry about mutexes unlike the Smooth Resize class
+	inline ISwapchainResources* getSwapchainResources() override { return m_swapchainResources.get(); }
+
+	// need to see if the swapchain is invalidated (e.g. because we're starting from 0-area old Swapchain) and try to recreate the swapchain
+	inline SAcquireResult acquireNextImage()
+	{
+		if (!isWindowOpen())
+		{
+			becomeIrrecoverable();
+			return {};
+		}
+
+		if (!m_swapchainResources || (m_swapchainResources->getStatus() != ISwapchainResources::STATUS::USABLE && !recreateSwapchain(m_surfaceFormat)))
+			return {};
+
+		return ISimpleManagedSurface::acquireNextImage();
+	}
+
+	// its enough to just foward though
+	inline bool present(const uint8_t imageIndex, const std::span<const IQueue::SSubmitInfo::SSemaphoreInfo> waitSemaphores)
+	{
+		return ISimpleManagedSurface::present(imageIndex, waitSemaphores);
+	}
+
+	//
+	inline bool recreateSwapchain(const ISurface::SFormat& explicitSurfaceFormat)
+	{
+		assert(m_swapchainResources);
+		// dont assign straight to `m_swapchainResources` because of complex refcounting and cycles
+		core::smart_refctd_ptr<ISwapchain> newSwapchain;
+		// TODO: This block of code could be rolled up into `ISimpleManagedSurface::ISwapchainResources` eventually
+		{
+			auto* surface = getSurface();
+			auto device = const_cast<ILogicalDevice*>(getAssignedQueue()->getOriginDevice());
+			// 0s are invalid values, so they indicate we want them deduced
+			m_sharedParams.width = 0;
+			m_sharedParams.height = 0;
+			// Question: should we re-query the supported queues, formats, present modes, etc. just-in-time??
+			auto* swapchain = m_swapchainResources->getSwapchain();
+			if (swapchain ? swapchain->deduceRecreationParams(m_sharedParams) : m_sharedParams.deduce(device->getPhysicalDevice(), surface))
+			{
+				// super special case, we can't re-create the swapchain but its possible to recover later on
+				if (m_sharedParams.width == 0 || m_sharedParams.height == 0)
+				{
+					// we need to keep the old-swapchain around, but can drop the rest
+					m_swapchainResources->invalidate();
+					return false;
+				}
+				// now lets try to create a new swapchain
+				if (swapchain)
+					newSwapchain = swapchain->recreate(m_sharedParams);
+				else
+				{
+					ISwapchain::SCreationParams params = {
+						.surface = core::smart_refctd_ptr<ISurface>(surface),
+						.surfaceFormat = explicitSurfaceFormat,
+						.sharedParams = m_sharedParams
+						// we're not going to support concurrent sharing in this simple class
+					};
+					m_surfaceFormat = explicitSurfaceFormat;
+					newSwapchain = CVulkanSwapchain::create(core::smart_refctd_ptr<const ILogicalDevice>(device), std::move(params));
+				}
+			}
+			else // parameter deduction failed
+				return false;
+		}
+
+		if (newSwapchain)
+		{
+			m_swapchainResources->invalidate();
+			return m_swapchainResources->onCreateSwapchain(getAssignedQueue()->getFamilyIndex(), std::move(newSwapchain));
+		}
+		else
+			becomeIrrecoverable();
+
+		return false;
+	}
+
+protected:
+	using ISimpleManagedSurface::ISimpleManagedSurface;
+
+	//
+	inline void deinit_impl() override final
+	{
+		becomeIrrecoverable();
+	}
+
+	//
+	inline void becomeIrrecoverable() override { m_swapchainResources = nullptr; }
+
+	// gets called when OUT_OF_DATE upon an acquire
+	inline SAcquireResult handleOutOfDate() override final
+	{
+		// recreate swapchain and try to acquire again
+		if (recreateSwapchain(m_surfaceFormat))
+			return ISimpleManagedSurface::acquireNextImage();
+		return {};
+	}
+
+private:
+	// Because the surface can start minimized (extent={0,0}) we might not be able to create the swapchain right away, so store creation parameters until we can create it.
+	ISwapchain::SSharedCreationParams m_sharedParams = {};
+	// The swapchain might not be possible to create or recreate right away, so this might be
+	// either nullptr before the first successful acquire or the old to-be-retired swapchain.
+	std::unique_ptr<SwapchainResources> m_swapchainResources = {};
+
+	ISurface::SFormat m_surfaceFormat = {};
+};
+
 // NOTE added swapchain + drawing frames to be able to profile with Nsight, which still doesn't support profiling headless compute shaders
 class ArithmeticBenchApp final : public examples::SimpleWindowedApplication, public application_templates::MonoAssetManagerAndBuiltinResourceApplication
 {
@@ -86,7 +240,7 @@ class ArithmeticBenchApp final : public examples::SimpleWindowedApplication, pub
 			}
 
 			auto surface = CSurfaceVulkanWin32::create(smart_refctd_ptr(m_api), smart_refctd_ptr_static_cast<IWindowWin32>(m_window));
-			const_cast<std::remove_const_t<decltype(m_surface)>&>(m_surface) = CSimpleResizeSurface<ISimpleManagedSurface::ISwapchainResources>::create(std::move(surface));
+			const_cast<std::remove_const_t<decltype(m_surface)>&>(m_surface) = CExplicitSurfaceFormatResizeSurface<ISimpleManagedSurface::ISwapchainResources>::create(std::move(surface));
 		}
 
 		if (m_surface)
@@ -109,9 +263,12 @@ class ArithmeticBenchApp final : public examples::SimpleWindowedApplication, pub
 			return logFail("Failed to Create a Semaphore!");
 
 		ISwapchain::SCreationParams swapchainParams = { .surface = m_surface->getSurface() };
-		if (!swapchainParams.deduceFormat(m_physicalDevice))
+		asset::E_FORMAT preferredFormats[] = { asset::EF_R8G8B8A8_UNORM };
+		if (!swapchainParams.deduceFormat(m_physicalDevice, preferredFormats))
 			return logFail("Could not choose a Surface Format for the Swapchain!");
 
+		swapchainParams.sharedParams.imageUsage = IGPUImage::E_USAGE_FLAGS::EUF_RENDER_ATTACHMENT_BIT | IGPUImage::E_USAGE_FLAGS::EUF_STORAGE_BIT;
+
 		auto graphicsQueue = getGraphicsQueue();
 		if (!m_surface || !m_surface->init(graphicsQueue, std::make_unique<ISimpleManagedSurface::ISwapchainResources>(), swapchainParams.sharedParams))
 			return logFail("Could not create Window & Surface or initialize the Surface!");
@@ -127,7 +284,7 @@ class ArithmeticBenchApp final : public examples::SimpleWindowedApplication, pub
 		}
 
 		m_winMgr->setWindowSize(m_window.get(), WIN_W, WIN_H);
-		m_surface->recreateSwapchain();
+		m_surface->recreateSwapchain(swapchainParams.surfaceFormat);
 
 		transferDownQueue = getTransferDownQueue();
 		computeQueue = getComputeQueue();
@@ -181,21 +338,21 @@ class ArithmeticBenchApp final : public examples::SimpleWindowedApplication, pub
 		pc.inputBufAddress = gpuinputDataBuffer->getDeviceAddress();
 		pc.outputAddressBufAddress = gpuOutputAddressesBuffer->getDeviceAddress();
 
-		// create dummy image
-		dummyImg = m_device->createImage({
-				{
-					.type = IGPUImage::ET_2D,
-					.samples = asset::ICPUImage::ESCF_1_BIT,
-					.format = asset::EF_R16G16B16A16_SFLOAT,
-					.extent = {WIN_W, WIN_H, 1},
-					.mipLevels = 1,
-					.arrayLayers = 1,
-					.flags = IImage::ECF_NONE,
-					.usage = core::bitflag(asset::IImage::EUF_STORAGE_BIT) | asset::IImage::EUF_TRANSFER_SRC_BIT
-				}
-			});
-		if (!dummyImg || !m_device->allocate(dummyImg->getMemoryReqs(), dummyImg.get()).isValid())
-			return logFail("Could not create HDR Image");
+		// create image views for swapchain images
+		for (uint32_t i = 0; i < ISwapchain::MaxImages; i++)
+		{
+			IGPUImage* scImg = m_surface->getSwapchainResources()->getImage(i);
+			if (scImg == nullptr)
+				continue;
+			IGPUImageView::SCreationParams viewParams = {
+				.flags = IGPUImageView::ECF_NONE,
+				.subUsages = IGPUImage::E_USAGE_FLAGS::EUF_STORAGE_BIT,
+				.image = smart_refctd_ptr<IGPUImage>(scImg),
+				.viewType = IGPUImageView::ET_2D,
+				.format = scImg->getCreationParameters().format
+			};
+			swapchainImageViews[i] = m_device->createImageView(std::move(viewParams));
+		}
 
 		// create Descriptor Sets and Pipeline Layouts
 		smart_refctd_ptr<IGPUPipelineLayout> benchPplnLayout;
@@ -322,7 +479,7 @@ class ArithmeticBenchApp final : public examples::SimpleWindowedApplication, pub
 					   .dstAccessMask = ACCESS_FLAGS::SHADER_WRITE_BITS
 					}
 			};
-			imageBarriers[0].image = dummyImg.get();
+			imageBarriers[0].image = m_surface->getSwapchainResources()->getImage(m_currentImageAcquire.imageIndex);
 			imageBarriers[0].subresourceRange = {
 				.aspectMask = IImage::EAF_COLOR_BIT,
 				.baseMipLevel = 0u,
@@ -336,19 +493,9 @@ class ArithmeticBenchApp final : public examples::SimpleWindowedApplication, pub
 			cmdbuf->pipelineBarrier(E_DEPENDENCY_FLAGS::EDF_NONE, { .imgBarriers = imageBarriers });
 		}
 
-		// bind dummy image
-		IGPUImageView::SCreationParams viewParams = {
-			.flags = IGPUImageView::ECF_NONE,
-			.subUsages = IGPUImage::E_USAGE_FLAGS::EUF_STORAGE_BIT,
-			.image = dummyImg,
-			.viewType = IGPUImageView::ET_2D,
-			.format = dummyImg->getCreationParameters().format
-		};
-		auto dummyImgView = m_device->createImageView(std::move(viewParams));
-
 		video::IGPUDescriptorSet::SDescriptorInfo dsInfo;
 		dsInfo.info.image.imageLayout = IImage::LAYOUT::GENERAL;
-		dsInfo.desc = dummyImgView;
+		dsInfo.desc = swapchainImageViews[m_currentImageAcquire.imageIndex];
 
 		IGPUDescriptorSet::SWriteDescriptorSet dsWrites[1u] =
 		{
@@ -366,7 +513,7 @@ class ArithmeticBenchApp final : public examples::SimpleWindowedApplication, pub
 		const auto MinSubgroupSize = m_physicalDevice->getLimits().minSubgroupSize;
 		const auto MaxSubgroupSize = m_physicalDevice->getLimits().maxSubgroupSize;
 
-		const auto SubgroupSizeLog2 = hlsl::findMSB(MinSubgroupSize);
+		const auto SubgroupSizeLog2 = hlsl::findMSB(MaxSubgroupSize);
 
 		cmdbuf->bindDescriptorSets(EPBP_COMPUTE, benchSets[0].pipeline->getLayout(), 0u, 1u, &benchDs.get());
 		cmdbuf->pushConstants(benchSets[0].pipeline->getLayout(), IShader::E_SHADER_STAGE::ESS_COMPUTE, 0, sizeof(PushConstantData), &pc);
@@ -374,72 +521,6 @@ class ArithmeticBenchApp final : public examples::SimpleWindowedApplication, pub
 		for (uint32_t i = 0; i < benchSets.size(); i++)
 			runBenchmark<DoWorkgroupBenchmarks>(cmdbuf, benchSets[i], elementCount, SubgroupSizeLog2);
 
-
-		// blit
-		{
-			IGPUCommandBuffer::SPipelineBarrierDependencyInfo::image_barrier_t imageBarriers[2];
-			imageBarriers[0].barrier = {
-			   .dep = {
-				   .srcStageMask = PIPELINE_STAGE_FLAGS::COMPUTE_SHADER_BIT,
-				   .srcAccessMask = ACCESS_FLAGS::SHADER_WRITE_BITS,
-				   .dstStageMask = PIPELINE_STAGE_FLAGS::BLIT_BIT,
-				   .dstAccessMask = ACCESS_FLAGS::TRANSFER_WRITE_BIT
-				}
-			};
-			imageBarriers[0].image = dummyImg.get();
-			imageBarriers[0].subresourceRange = {
-				.aspectMask = IImage::EAF_COLOR_BIT,
-				.baseMipLevel = 0u,
-				.levelCount = 1u,
-				.baseArrayLayer = 0u,
-				.layerCount = 1u
-			};
-			imageBarriers[0].oldLayout = IImage::LAYOUT::UNDEFINED;
-			imageBarriers[0].newLayout = IImage::LAYOUT::TRANSFER_SRC_OPTIMAL;
-
-			imageBarriers[1].barrier = {
-			   .dep = {
-				   .srcStageMask = PIPELINE_STAGE_FLAGS::NONE,
-				   .srcAccessMask = ACCESS_FLAGS::NONE,
-				   .dstStageMask = PIPELINE_STAGE_FLAGS::BLIT_BIT,
-				   .dstAccessMask = ACCESS_FLAGS::TRANSFER_WRITE_BIT
-				}
-			};
-			imageBarriers[1].image = m_surface->getSwapchainResources()->getImage(m_currentImageAcquire.imageIndex);
-			imageBarriers[1].subresourceRange = {
-				.aspectMask = IImage::EAF_COLOR_BIT,
-				.baseMipLevel = 0u,
-				.levelCount = 1u,
-				.baseArrayLayer = 0u,
-				.layerCount = 1u
-			};
-			imageBarriers[1].oldLayout = IImage::LAYOUT::UNDEFINED;
-			imageBarriers[1].newLayout = IImage::LAYOUT::TRANSFER_DST_OPTIMAL;
-
-			cmdbuf->pipelineBarrier(E_DEPENDENCY_FLAGS::EDF_NONE, { .imgBarriers = imageBarriers });
-		}
-
-		{
-			IGPUCommandBuffer::SImageBlit regions[] = { {
-				.srcMinCoord = {0,0,0},
-				.srcMaxCoord = {WIN_W,WIN_H,1},
-				.dstMinCoord = {0,0,0},
-				.dstMaxCoord = {WIN_W,WIN_H,1},
-				.layerCount = 1,
-				.srcBaseLayer = 0,
-				.dstBaseLayer = 0,
-				.srcMipLevel = 0,
-				.dstMipLevel = 0,
-				.aspectMask = IGPUImage::E_ASPECT_FLAGS::EAF_COLOR_BIT
-			} };
-
-			auto srcImg = dummyImg.get();
-			auto scRes = static_cast<CDefaultSwapchainFramebuffers*>(m_surface->getSwapchainResources());
-			auto dstImg = scRes->getImage(m_currentImageAcquire.imageIndex);
-
-			cmdbuf->blitImage(srcImg, IImage::LAYOUT::TRANSFER_SRC_OPTIMAL, dstImg, IImage::LAYOUT::TRANSFER_DST_OPTIMAL, regions, ISampler::ETF_NEAREST);
-		}
-
 		// barrier transition to PRESENT
 		{
 			IGPUCommandBuffer::SPipelineBarrierDependencyInfo::image_barrier_t imageBarriers[1];
@@ -459,7 +540,7 @@ class ArithmeticBenchApp final : public examples::SimpleWindowedApplication, pub
 				.baseArrayLayer = 0u,
 				.layerCount = 1u
 			};
-			imageBarriers[0].oldLayout = IImage::LAYOUT::TRANSFER_DST_OPTIMAL;
+			imageBarriers[0].oldLayout = IImage::LAYOUT::GENERAL;
 			imageBarriers[0].newLayout = IImage::LAYOUT::PRESENT_SRC;
 
 			cmdbuf->pipelineBarrier(E_DEPENDENCY_FLAGS::EDF_NONE, { .imgBarriers = imageBarriers });
@@ -517,11 +598,6 @@ class ArithmeticBenchApp final : public examples::SimpleWindowedApplication, pub
 				}
 			}
 
-			std::string caption = "[Nabla Engine] Geometry Creator";
-			{
-				caption += ", displaying [all objects]";
-				m_window->setCaption(caption);
-			}
 			m_surface->present(m_currentImageAcquire.imageIndex, rendered);
 		}
 
@@ -696,7 +772,7 @@ class ArithmeticBenchApp final : public examples::SimpleWindowedApplication, pub
 	IQueue* computeQueue;
 
 	smart_refctd_ptr<IWindow> m_window;
-	smart_refctd_ptr<CSimpleResizeSurface<ISimpleManagedSurface::ISwapchainResources>> m_surface;
+	smart_refctd_ptr<CExplicitSurfaceFormatResizeSurface<ISimpleManagedSurface::ISwapchainResources>> m_surface;
 	smart_refctd_ptr<ISemaphore> m_semaphore;
 	uint64_t m_realFrameIx = 0;
 	std::array<smart_refctd_ptr<IGPUCommandBuffer>, MaxFramesInFlight> m_cmdBufs;
@@ -704,7 +780,7 @@ class ArithmeticBenchApp final : public examples::SimpleWindowedApplication, pub
 
 	smart_refctd_ptr<InputSystem> m_inputSystem;
 
-	smart_refctd_ptr<IGPUImage> dummyImg;
+	std::array<smart_refctd_ptr<IGPUImageView>, ISwapchain::MaxImages> swapchainImageViews;
 
 	constexpr static inline uint32_t MaxNumSubmits = 30;
 	uint32_t numSubmits = 0;

From 32dc78f065a8414dacafc216f31b7d333e301083 Mon Sep 17 00:00:00 2001
From: keptsecret <sorchon@gmail.com>
Date: Fri, 6 Jun 2025 14:48:03 +0700
Subject: [PATCH 340/529] shared data accessor header between test and bench,
 same shader adjustments as test

---
 .../app_resources/testWorkgroup.comp.hlsl     | 109 +-----------
 .../app_resources/benchmarkSubgroup.comp.hlsl |  31 ++--
 .../benchmarkWorkgroup.comp.hlsl              | 160 ++++--------------
 29_Arithmetic2Bench/app_resources/common.hlsl |  12 +-
 .../app_resources/shaderCommon.hlsl           |  16 +-
 29_Arithmetic2Bench/main.cpp                  |  57 ++-----
 common/include/WorkgroupDataAccessors.hlsl    | 119 +++++++++++++
 7 files changed, 191 insertions(+), 313 deletions(-)
 create mode 100644 common/include/WorkgroupDataAccessors.hlsl

diff --git a/23_Arithmetic2UnitTest/app_resources/testWorkgroup.comp.hlsl b/23_Arithmetic2UnitTest/app_resources/testWorkgroup.comp.hlsl
index cdd4af4b2..a38124b0c 100644
--- a/23_Arithmetic2UnitTest/app_resources/testWorkgroup.comp.hlsl
+++ b/23_Arithmetic2UnitTest/app_resources/testWorkgroup.comp.hlsl
@@ -16,114 +16,7 @@ typedef vector<uint32_t, config_t::ItemsPerInvocation_0> type_t;
 // final (level 1/2) scan needs to fit in one subgroup exactly
 groupshared uint32_t scratch[mpl::max_v<int16_t,config_t::SharedScratchElementCount,1>];
 
-struct ScratchProxy
-{
-    template<typename AccessType, typename IndexType>
-    void get(const uint32_t ix, NBL_REF_ARG(AccessType) value)
-    {
-        value = scratch[ix];
-    }
-    template<typename AccessType, typename IndexType>
-    void set(const uint32_t ix, const AccessType value)
-    {
-        scratch[ix] = value;
-    }
-
-    uint32_t atomicOr(const uint32_t ix, const uint32_t value)
-    {
-        return glsl::atomicOr(scratch[ix],value);
-    }
-
-    void workgroupExecutionAndMemoryBarrier()
-    {
-        glsl::barrier();
-        //glsl::memoryBarrierShared(); implied by the above
-    }
-};
-
-template<class Config, class Binop>
-struct DataProxy
-{
-    using dtype_t = vector<uint32_t, Config::ItemsPerInvocation_0>;
-    static_assert(is_same_v<dtype_t, type_t>);
-
-    static DataProxy<Config, Binop> create()
-    {
-        DataProxy<Config, Binop> retval;
-        retval.workgroupOffset = glsl::gl_WorkGroupID().x * Config::VirtualWorkgroupSize;
-        retval.outputBufAddr = sizeof(uint32_t) + vk::RawBufferLoad<uint64_t>(pc.ppOutputBuf + Binop::BindingIndex * sizeof(uint64_t));
-        return retval;
-    }
-
-    template<typename AccessType, typename IndexType>
-    void get(const IndexType ix, NBL_REF_ARG(AccessType) value)
-    {
-        value = vk::RawBufferLoad<AccessType>(pc.pInputBuf + (workgroupOffset + ix) * sizeof(AccessType));
-    }
-    template<typename AccessType, typename IndexType>
-    void set(const IndexType ix, const AccessType value)
-    {
-        vk::RawBufferStore<AccessType>(outputBufAddr + sizeof(AccessType) * (workgroupOffset+ix), value, sizeof(uint32_t));
-    }
-
-    void workgroupExecutionAndMemoryBarrier()
-    {
-        glsl::barrier();
-        //glsl::memoryBarrierShared(); implied by the above
-    }
-
-    uint32_t workgroupOffset;
-    uint64_t outputBufAddr;
-};
-
-template<class Config, class Binop>
-struct PreloadedDataProxy
-{
-    using dtype_t = vector<uint32_t, Config::ItemsPerInvocation_0>;
-    static_assert(is_same_v<dtype_t, type_t>);
-
-    NBL_CONSTEXPR_STATIC_INLINE uint32_t PreloadedDataCount = Config::VirtualWorkgroupSize / Config::WorkgroupSize;
-
-    static PreloadedDataProxy<Config, Binop> create()
-    {
-        PreloadedDataProxy<Config, Binop> retval;
-        retval.data = DataProxy<Config, Binop>::create();
-        return retval;
-    }
-
-    template<typename AccessType, typename IndexType>
-    void get(const IndexType ix, NBL_REF_ARG(AccessType) value)
-    {
-        value = preloaded[ix>>Config::WorkgroupSizeLog2];
-    }
-    template<typename AccessType, typename IndexType>
-    void set(const IndexType ix, const AccessType value)
-    {
-        preloaded[ix>>Config::WorkgroupSizeLog2] = value;
-    }
-
-    void preload()
-    {
-        [unroll]
-        for (uint16_t idx = 0; idx < PreloadedDataCount; idx++)
-            data.template get<dtype_t, uint16_t>(idx * Config::WorkgroupSize + workgroup::SubgroupContiguousIndex(), preloaded[idx]);
-    }
-    void unload()
-    {
-        [unroll]
-        for (uint16_t idx = 0; idx < PreloadedDataCount; idx++)
-            data.template set<dtype_t, uint16_t>(idx * Config::WorkgroupSize + workgroup::SubgroupContiguousIndex(), preloaded[idx]);
-    }
-
-    void workgroupExecutionAndMemoryBarrier()
-    {
-        glsl::barrier();
-        //glsl::memoryBarrierShared(); implied by the above
-    }
-
-    DataProxy<Config, Binop> data;
-    dtype_t preloaded[PreloadedDataCount];
-};
+#include "../../common/include/WorkgroupDataAccessors.hlsl"
 
 static ScratchProxy arithmeticAccessor;
 
diff --git a/29_Arithmetic2Bench/app_resources/benchmarkSubgroup.comp.hlsl b/29_Arithmetic2Bench/app_resources/benchmarkSubgroup.comp.hlsl
index 113ec2bae..553103bef 100644
--- a/29_Arithmetic2Bench/app_resources/benchmarkSubgroup.comp.hlsl
+++ b/29_Arithmetic2Bench/app_resources/benchmarkSubgroup.comp.hlsl
@@ -13,41 +13,38 @@ typedef vector<uint32_t, ITEMS_PER_INVOCATION> type_t;
 
 uint32_t globalIndex()
 {
-    return nbl::hlsl::glsl::gl_WorkGroupID().x*WORKGROUP_SIZE+nbl::hlsl::workgroup::SubgroupContiguousIndex();
+    return glsl::gl_WorkGroupID().x*WORKGROUP_SIZE+workgroup::SubgroupContiguousIndex();
 }
 
-bool canStore() {return true;}
-
 template<class Binop, uint32_t N>
 static void subbench(NBL_CONST_REF_ARG(type_t) sourceVal)
 {
-    using config_t = nbl::hlsl::subgroup2::Configuration<SUBGROUP_SIZE_LOG2>;
-    using params_t = nbl::hlsl::subgroup2::ArithmeticParams<config_t, typename Binop::base_t, N, nbl::hlsl::jit::device_capabilities>;
+    using config_t = subgroup2::Configuration<SUBGROUP_SIZE_LOG2>;
+    using params_t = subgroup2::ArithmeticParams<config_t, typename Binop::base_t, N, device_capabilities>;
     type_t value = sourceVal;
 
-    const uint64_t outputBufAddr = vk::RawBufferLoad<uint64_t>(pc.outputAddressBufAddress + Binop::BindingIndex * sizeof(uint64_t), sizeof(uint64_t));
+    const uint64_t outputBufAddr = vk::RawBufferLoad<uint64_t>(pc.ppOutputBuf + Binop::BindingIndex * sizeof(uint64_t), sizeof(uint64_t));
 
     operation_t<params_t> func;
     // [unroll]
     for (uint32_t i = 0; i < NUM_LOOPS; i++)
         value = func(value);
 
-    if (canStore())
-        vk::RawBufferStore<type_t>(outputBufAddr + sizeof(uint32_t) + sizeof(type_t) * globalIndex(), value, sizeof(uint32_t));
+    vk::RawBufferStore<type_t>(outputBufAddr + sizeof(uint32_t) + sizeof(type_t) * globalIndex(), value, sizeof(uint32_t));
 }
 
 void benchmark()
 {
     const uint32_t idx = globalIndex();
-    type_t sourceVal = vk::RawBufferLoad<type_t>(pc.inputBufAddress + idx * sizeof(type_t));
-
-    subbench<bit_and<uint32_t>, ITEMS_PER_INVOCATION>(sourceVal);
-    subbench<bit_xor<uint32_t>, ITEMS_PER_INVOCATION>(sourceVal);
-    subbench<bit_or<uint32_t>, ITEMS_PER_INVOCATION>(sourceVal);
-    subbench<plus<uint32_t>, ITEMS_PER_INVOCATION>(sourceVal);
-    subbench<multiplies<uint32_t>, ITEMS_PER_INVOCATION>(sourceVal);
-    subbench<minimum<uint32_t>, ITEMS_PER_INVOCATION>(sourceVal);
-    subbench<maximum<uint32_t>, ITEMS_PER_INVOCATION>(sourceVal);
+    type_t sourceVal = vk::RawBufferLoad<type_t>(pc.pInputBuf + idx * sizeof(type_t));
+
+    subbench<arithmetic::bit_and<uint32_t>, ITEMS_PER_INVOCATION>(sourceVal);
+    subbench<arithmetic::bit_xor<uint32_t>, ITEMS_PER_INVOCATION>(sourceVal);
+    subbench<arithmetic::bit_or<uint32_t>, ITEMS_PER_INVOCATION>(sourceVal);
+    subbench<arithmetic::plus<uint32_t>, ITEMS_PER_INVOCATION>(sourceVal);
+    subbench<arithmetic::multiplies<uint32_t>, ITEMS_PER_INVOCATION>(sourceVal);
+    subbench<arithmetic::minimum<uint32_t>, ITEMS_PER_INVOCATION>(sourceVal);
+    subbench<arithmetic::maximum<uint32_t>, ITEMS_PER_INVOCATION>(sourceVal);
 }
 
 [numthreads(WORKGROUP_SIZE,1,1)]
diff --git a/29_Arithmetic2Bench/app_resources/benchmarkWorkgroup.comp.hlsl b/29_Arithmetic2Bench/app_resources/benchmarkWorkgroup.comp.hlsl
index e44bf4f06..504cc36de 100644
--- a/29_Arithmetic2Bench/app_resources/benchmarkWorkgroup.comp.hlsl
+++ b/29_Arithmetic2Bench/app_resources/benchmarkWorkgroup.comp.hlsl
@@ -9,108 +9,14 @@ static const uint32_t WORKGROUP_SIZE = 1u << WORKGROUP_SIZE_LOG2;
 
 #include "shaderCommon.hlsl"
 
-using config_t = nbl::hlsl::workgroup2::ArithmeticConfiguration<WORKGROUP_SIZE_LOG2, SUBGROUP_SIZE_LOG2, ITEMS_PER_INVOCATION>;
+using config_t = workgroup2::ArithmeticConfiguration<WORKGROUP_SIZE_LOG2, SUBGROUP_SIZE_LOG2, ITEMS_PER_INVOCATION>;
 
 typedef vector<uint32_t, config_t::ItemsPerInvocation_0> type_t;
 
 // final (level 1/2) scan needs to fit in one subgroup exactly
 groupshared uint32_t scratch[config_t::SharedScratchElementCount];
 
-struct ScratchProxy
-{
-    template<typename AccessType, typename IndexType>
-    void get(const IndexType ix, NBL_REF_ARG(AccessType) value)
-    {
-        value = scratch[ix];
-    }
-    template<typename AccessType, typename IndexType>
-    void set(const IndexType ix, const AccessType value)
-    {
-        scratch[ix] = value;
-    }
-
-    uint32_t atomicOr(const uint32_t ix, const uint32_t value)
-    {
-        return nbl::hlsl::glsl::atomicOr(scratch[ix],value);
-    }
-
-    void workgroupExecutionAndMemoryBarrier()
-    {
-        nbl::hlsl::glsl::barrier();
-        //nbl::hlsl::glsl::memoryBarrierShared(); implied by the above
-    }
-};
-
-
-template<class Config, class Binop>
-struct DataProxy
-{
-    using dtype_t = vector<uint32_t, Config::ItemsPerInvocation_0>;
-    static_assert(nbl::hlsl::is_same_v<dtype_t, type_t>);
-
-    // we don't want to write/read storage multiple times in loop; doesn't seem optimized out in generated spirv
-    template<typename AccessType, typename IndexType>
-    void get(const IndexType ix, NBL_REF_ARG(dtype_t) value)
-    {
-        // value = inputValue[ix];
-        value = nbl::hlsl::promote<dtype_t>(globalIndex());
-    }
-    template<typename AccessType, typename IndexType>
-    void set(const IndexType ix, const dtype_t value)
-    {
-        // output[Binop::BindingIndex].template Store<type_t>(sizeof(uint32_t) + sizeof(type_t) * ix, value);
-    }
-
-    void workgroupExecutionAndMemoryBarrier()
-    {
-        nbl::hlsl::glsl::barrier();
-        //nbl::hlsl::glsl::memoryBarrierShared(); implied by the above
-    }
-};
-
-template<class Config, class Binop>
-struct PreloadedDataProxy
-{
-    using dtype_t = vector<uint32_t, Config::ItemsPerInvocation_0>;
-    static_assert(nbl::hlsl::is_same_v<dtype_t, type_t>);
-
-    NBL_CONSTEXPR_STATIC_INLINE uint32_t PreloadedDataCount = Config::VirtualWorkgroupSize / Config::WorkgroupSize;
-
-    template<typename AccessType, typename IndexType>
-    void get(const IndexType ix, NBL_REF_ARG(AccessType) value)
-    {
-        value = preloaded[(ix-nbl::hlsl::workgroup::SubgroupContiguousIndex())>>Config::WorkgroupSizeLog2];
-    }
-    template<typename AccessType, typename IndexType>
-    void set(const IndexType ix, const AccessType value)
-    {
-        preloaded[(ix-nbl::hlsl::workgroup::SubgroupContiguousIndex())>>Config::WorkgroupSizeLog2] = value;
-    }
-
-    void preload()
-    {
-        const uint32_t workgroupOffset = nbl::hlsl::glsl::gl_WorkGroupID().x * Config::VirtualWorkgroupSize;
-        [unroll]
-        for (uint32_t idx = 0; idx < PreloadedDataCount; idx++)
-            preloaded[idx] = vk::RawBufferLoad<dtype_t>(pc.inputBufAddress + (workgroupOffset + idx * Config::WorkgroupSize + nbl::hlsl::workgroup::SubgroupContiguousIndex()) * sizeof(dtype_t));
-    }
-    void unload()
-    {
-        const uint32_t workgroupOffset = nbl::hlsl::glsl::gl_WorkGroupID().x * Config::VirtualWorkgroupSize;
-        uint64_t outputBufAddr = vk::RawBufferLoad<uint64_t>(pc.outputAddressBufAddress + Binop::BindingIndex * sizeof(uint64_t));
-        [unroll]
-        for (uint32_t idx = 0; idx < PreloadedDataCount; idx++)
-            vk::RawBufferStore<dtype_t>(outputBufAddr + sizeof(uint32_t) + sizeof(dtype_t) * (workgroupOffset + idx * Config::WorkgroupSize + nbl::hlsl::workgroup::SubgroupContiguousIndex()), preloaded[idx], sizeof(uint32_t));
-    }
-
-    void workgroupExecutionAndMemoryBarrier()
-    {
-        nbl::hlsl::glsl::barrier();
-        //nbl::hlsl::glsl::memoryBarrierShared(); implied by the above
-    }
-
-    dtype_t preloaded[PreloadedDataCount];
-};
+#include "../../common/include/WorkgroupDataAccessors.hlsl"
 
 static ScratchProxy arithmeticAccessor;
 
@@ -120,74 +26,70 @@ struct operation_t
     using binop_base_t = typename Binop::base_t;
     using otype_t = typename Binop::type_t;
 
-#if IS_REDUCTION
     void operator()(PreloadedDataProxy<config_t,Binop> dataAccessor)
     {
-        otype_t value = nbl::hlsl::OPERATION<config_t,binop_base_t,device_capabilities>::template __call<PreloadedDataProxy<config_t,Binop>, ScratchProxy>(dataAccessor,arithmeticAccessor);
+#if IS_REDUCTION
+        otype_t value = 
+#endif
+        OPERATION<config_t,binop_base_t,device_capabilities>::template __call<PreloadedDataProxy<config_t,Binop>, ScratchProxy>(dataAccessor,arithmeticAccessor);
         // we barrier before because we alias the accessors for Binop
         arithmeticAccessor.workgroupExecutionAndMemoryBarrier();
-
+#if IS_REDUCTION
         [unroll]
         for (uint32_t i = 0; i < PreloadedDataProxy<config_t,Binop>::PreloadedDataCount; i++)
             dataAccessor.preloaded[i] = value;
-    }
-#else
-    void operator()(PreloadedDataProxy<config_t,Binop> dataAccessor)
-    {
-        nbl::hlsl::OPERATION<config_t,binop_base_t,device_capabilities>::template __call<PreloadedDataProxy<config_t,Binop>, ScratchProxy>(dataAccessor,arithmeticAccessor);
-        // we barrier before because we alias the accessors for Binop
-        arithmeticAccessor.workgroupExecutionAndMemoryBarrier();
-    }
 #endif
+    }
+// #else
+//     void operator()(PreloadedDataProxy<config_t,Binop> dataAccessor)
+//     {
+//         OPERATION<config_t,binop_base_t,device_capabilities>::template __call<PreloadedDataProxy<config_t,Binop>, ScratchProxy>(dataAccessor,arithmeticAccessor);
+//         // we barrier before because we alias the accessors for Binop
+//         arithmeticAccessor.workgroupExecutionAndMemoryBarrier();
+//     }
+// #endif
 
 };
 
 template<class Binop>
-static void subbench(NBL_CONST_REF_ARG(type_t) sourceVal)
+static void subbench()
 {
-    const uint64_t outputBufAddr = vk::RawBufferLoad<uint64_t>(pc.outputAddressBufAddress + Binop::BindingIndex * sizeof(uint64_t), sizeof(uint64_t));
-
-    if (globalIndex()==0u)
-        vk::RawBufferStore<uint32_t>(outputBufAddr, nbl::hlsl::glsl::gl_SubgroupSize());
+    const uint64_t outputBufAddr = vk::RawBufferLoad<uint64_t>(pc.ppOutputBuf + Binop::BindingIndex * sizeof(uint64_t), sizeof(uint64_t));
 
-    PreloadedDataProxy<config_t,Binop> dataAccessor;
+    PreloadedDataProxy<config_t,Binop> dataAccessor = PreloadedDataProxy<config_t,Binop>::create();
     dataAccessor.preload();
 
-    operation_t<Binop,nbl::hlsl::jit::device_capabilities> func;
+    operation_t<Binop,device_capabilities> func;
     for (uint32_t i = 0; i < NUM_LOOPS; i++)
         func(dataAccessor);
 
     dataAccessor.unload();
 }
 
-
-type_t benchmark()
+void benchmark()
 {
-    const type_t sourceVal = vk::RawBufferLoad<type_t>(pc.inputBufAddress + globalIndex() * sizeof(type_t));
-
-    subbench<bit_and<uint32_t> >(sourceVal);
-    subbench<bit_xor<uint32_t> >(sourceVal);
-    subbench<bit_or<uint32_t> >(sourceVal);
-    subbench<plus<uint32_t> >(sourceVal);
-    subbench<multiplies<uint32_t> >(sourceVal);
-    subbench<minimum<uint32_t> >(sourceVal);
-    subbench<maximum<uint32_t> >(sourceVal);
-    return sourceVal;
+    subbench<arithmetic::bit_and<uint32_t> >();
+    subbench<arithmetic::bit_xor<uint32_t> >();
+    subbench<arithmetic::bit_or<uint32_t> >();
+    subbench<arithmetic::plus<uint32_t> >();
+    subbench<arithmetic::multiplies<uint32_t> >();
+    subbench<arithmetic::minimum<uint32_t> >();
+    subbench<arithmetic::maximum<uint32_t> >();
 }
 
 
 uint32_t globalIndex()
 {
-    return nbl::hlsl::glsl::gl_WorkGroupID().x*ITEMS_PER_WG+nbl::hlsl::workgroup::SubgroupContiguousIndex();
+    return glsl::gl_WorkGroupID().x*ITEMS_PER_WG+workgroup::SubgroupContiguousIndex();
 }
 
 bool canStore()
 {
-    return nbl::hlsl::workgroup::SubgroupContiguousIndex()<ITEMS_PER_WG;
+    return workgroup::SubgroupContiguousIndex()<ITEMS_PER_WG;
 }
 
 [numthreads(WORKGROUP_SIZE,1,1)]
 void main()
 {
-    const type_t sourceVal = benchmark();
+    benchmark();
 }
diff --git a/29_Arithmetic2Bench/app_resources/common.hlsl b/29_Arithmetic2Bench/app_resources/common.hlsl
index 67d3f16ca..a1d74b13b 100644
--- a/29_Arithmetic2Bench/app_resources/common.hlsl
+++ b/29_Arithmetic2Bench/app_resources/common.hlsl
@@ -1,15 +1,14 @@
 #include "nbl/builtin/hlsl/cpp_compat.hlsl"
 #include "nbl/builtin/hlsl/functional.hlsl"
 
-template<uint32_t kScanElementCount=1024*1024>
-struct Output
+struct PushConstantData
 {
-    NBL_CONSTEXPR_STATIC_INLINE uint32_t ScanElementCount = kScanElementCount;
-
-    uint32_t subgroupSize;
-    uint32_t data[ScanElementCount];
+    uint64_t pInputBuf;
+    uint64_t ppOutputBuf;
 };
 
+namespace arithmetic
+{
 template<typename T>
 struct bit_and : nbl::hlsl::bit_and<T>
 {
@@ -91,5 +90,6 @@ struct ballot : nbl::hlsl::plus<T>
     static inline constexpr const char* name = "bitcount";
 #endif
 };
+}
 
 #include "nbl/builtin/hlsl/subgroup/basic.hlsl"
diff --git a/29_Arithmetic2Bench/app_resources/shaderCommon.hlsl b/29_Arithmetic2Bench/app_resources/shaderCommon.hlsl
index a14986e0d..8659fd054 100644
--- a/29_Arithmetic2Bench/app_resources/shaderCommon.hlsl
+++ b/29_Arithmetic2Bench/app_resources/shaderCommon.hlsl
@@ -1,6 +1,7 @@
 #include "common.hlsl"
 
-#include "nbl/builtin/hlsl/jit/device_capabilities.hlsl"
+using namespace nbl;
+using namespace hlsl;
 
 // https://github.com/microsoft/DirectXShaderCompiler/issues/6144
 uint32_t3 nbl::hlsl::glsl::gl_WorkGroupSize() {return uint32_t3(WORKGROUP_SIZE,1,1);}
@@ -9,14 +10,17 @@ uint32_t3 nbl::hlsl::glsl::gl_WorkGroupSize() {return uint32_t3(WORKGROUP_SIZE,1
 #error "Define ITEMS_PER_INVOCATION!"
 #endif
 
-struct PushConstantData
+[[vk::push_constant]] PushConstantData pc;
+
+struct device_capabilities
 {
-    uint64_t inputBufAddress;
-    uint64_t outputAddressBufAddress;
+#ifdef TEST_NATIVE
+    NBL_CONSTEXPR_STATIC_INLINE bool shaderSubgroupArithmetic = true;
+#else
+    NBL_CONSTEXPR_STATIC_INLINE bool shaderSubgroupArithmetic = true;
+#endif
 };
 
-[[vk::push_constant]] PushConstantData pc;
-
 // because subgroups don't match `gl_LocalInvocationIndex` snake curve addressing, we also can't load inputs that way
 uint32_t globalIndex();
 // since we test ITEMS_PER_WG<WorkgroupSize we need this so workgroups don't overwrite each other's outputs
diff --git a/29_Arithmetic2Bench/main.cpp b/29_Arithmetic2Bench/main.cpp
index 9f59f38d8..0b6639bec 100644
--- a/29_Arithmetic2Bench/main.cpp
+++ b/29_Arithmetic2Bench/main.cpp
@@ -47,12 +47,6 @@ struct emulatedScanExclusive
 	static inline constexpr const char* name = "exclusive_scan";
 };
 
-struct PushConstantData
-{
-	uint64_t inputBufAddress;
-	uint64_t outputAddressBufAddress;
-};
-
 template<typename SwapchainResources> requires std::is_base_of_v<ISimpleManagedSurface::ISwapchainResources, SwapchainResources>
 class CExplicitSurfaceFormatResizeSurface final : public ISimpleManagedSurface
 {
@@ -289,8 +283,6 @@ class ArithmeticBenchApp final : public examples::SimpleWindowedApplication, pub
 		transferDownQueue = getTransferDownQueue();
 		computeQueue = getComputeQueue();
 
-		// TODO: get the element count from argv
-		const uint32_t elementCount = Output<>::ScanElementCount;
 		// populate our random data buffer on the CPU and create a GPU copy
 		inputData = new uint32_t[elementCount];
 		{
@@ -299,7 +291,7 @@ class ArithmeticBenchApp final : public examples::SimpleWindowedApplication, pub
 				inputData[i] = randGenerator(); // TODO: change to using xoroshiro, then we can skip having the input buffer at all
 
 			IGPUBuffer::SCreationParams inputDataBufferCreationParams = {};
-			inputDataBufferCreationParams.size = sizeof(Output<>::data[0]) * elementCount;
+			inputDataBufferCreationParams.size = sizeof(uint32_t) * elementCount;
 			inputDataBufferCreationParams.usage = IGPUBuffer::EUF_STORAGE_BUFFER_BIT | IGPUBuffer::EUF_TRANSFER_DST_BIT | IGPUBuffer::EUF_SHADER_DEVICE_ADDRESS_BIT;
 			m_utils->createFilledDeviceLocalBufferOnDedMem(
 				SIntendedSubmitInfo{.queue=getTransferUpQueue()},
@@ -335,8 +327,8 @@ class ArithmeticBenchApp final : public examples::SimpleWindowedApplication, pub
 			params.size = OutputBufferCount * sizeof(uint64_t);
 			m_utils->createFilledDeviceLocalBufferOnDedMem(SIntendedSubmitInfo{ .queue = getTransferUpQueue() }, std::move(params), outputAddresses.data()).move_into(gpuOutputAddressesBuffer);
 		}
-		pc.inputBufAddress = gpuinputDataBuffer->getDeviceAddress();
-		pc.outputAddressBufAddress = gpuOutputAddressesBuffer->getDeviceAddress();
+		pc.pInputBuf = gpuinputDataBuffer->getDeviceAddress();
+		pc.ppOutputBuf = gpuOutputAddressesBuffer->getDeviceAddress();
 
 		// create image views for swapchain images
 		for (uint32_t i = 0; i < ISwapchain::MaxImages; i++)
@@ -357,16 +349,6 @@ class ArithmeticBenchApp final : public examples::SimpleWindowedApplication, pub
 		// create Descriptor Sets and Pipeline Layouts
 		smart_refctd_ptr<IGPUPipelineLayout> benchPplnLayout;
 		{
-			// create Descriptor Set Layout
-			smart_refctd_ptr<IGPUDescriptorSetLayout> dsLayout;
-			{
-				IGPUDescriptorSetLayout::SBinding binding[2];
-				for (uint32_t i = 0u; i < 2; i++)
-					binding[i] = {{},i,IDescriptor::E_TYPE::ET_STORAGE_BUFFER,IGPUDescriptorSetLayout::SBinding::E_CREATE_FLAGS::ECF_NONE,IShader::E_SHADER_STAGE::ESS_COMPUTE,1u,nullptr };
-				binding[1].count = OutputBufferCount;
-				dsLayout = m_device->createDescriptorSetLayout(binding);
-			}
-
 			// set and transient pool
 			smart_refctd_ptr<IGPUDescriptorSetLayout> benchLayout;
 			{
@@ -402,7 +384,6 @@ class ArithmeticBenchApp final : public examples::SimpleWindowedApplication, pub
 		auto workgroupBenchSource = getShaderSource("app_resources/benchmarkWorkgroup.comp.hlsl");
 		// now create or retrieve final resources to run our tests
 		sema = m_device->createSemaphore(timelineValue);
-		resultsBuffer = ICPUBuffer::create({ outputBuffers[0]->getSize() });
 		smart_refctd_ptr<IGPUCommandBuffer> cmdbuf;
 		{
 			smart_refctd_ptr<nbl::video::IGPUCommandPool> cmdpool = m_device->createCommandPool(computeQueue->getFamilyIndex(),IGPUCommandPool::CREATE_FLAGS::RESET_COMMAND_BUFFER_BIT);
@@ -413,20 +394,17 @@ class ArithmeticBenchApp final : public examples::SimpleWindowedApplication, pub
 			}
 		}
 
-		// const auto MaxWorkgroupSize = m_physicalDevice->getLimits().maxComputeWorkGroupInvocations;
-		const auto MinSubgroupSize = m_physicalDevice->getLimits().minSubgroupSize;
 		const auto MaxSubgroupSize = m_physicalDevice->getLimits().maxSubgroupSize;
-
 		// for each workgroup size (manually adjust items per invoc, operation else uses up a lot of ram)
 		if constexpr (DoWorkgroupBenchmarks)
 		{
 			for (uint32_t i = 0; i < workgroupSizes.size(); i++)
-				benchSets[i] = createBenchmarkPipelines<ArithmeticOp, DoWorkgroupBenchmarks>(workgroupBenchSource, benchPplnLayout.get(), elementCount, hlsl::findMSB(MinSubgroupSize), workgroupSizes[i], ItemsPerInvocation, NumLoops);
+				benchSets[i] = createBenchmarkPipelines<ArithmeticOp, DoWorkgroupBenchmarks>(workgroupBenchSource, benchPplnLayout.get(), elementCount, hlsl::findMSB(MaxSubgroupSize), workgroupSizes[i], ItemsPerInvocation, NumLoops);
 		}
 		else
 		{
 			for (uint32_t i = 0; i < workgroupSizes.size(); i++)
-				benchSets[i] = createBenchmarkPipelines<ArithmeticOp, DoWorkgroupBenchmarks>(subgroupBenchSource, benchPplnLayout.get(), elementCount, hlsl::findMSB(MinSubgroupSize), workgroupSizes[i], ItemsPerInvocation, NumLoops);
+				benchSets[i] = createBenchmarkPipelines<ArithmeticOp, DoWorkgroupBenchmarks>(subgroupBenchSource, benchPplnLayout.get(), elementCount, hlsl::findMSB(MaxSubgroupSize), workgroupSizes[i], ItemsPerInvocation, NumLoops);
 		}
 
 		m_winMgr->show(m_window.get());
@@ -509,10 +487,8 @@ class ArithmeticBenchApp final : public examples::SimpleWindowedApplication, pub
 		};
 		m_device->updateDescriptorSets(1u, dsWrites, 0u, nullptr);
 
-		const uint32_t elementCount = Output<>::ScanElementCount;
-		const auto MinSubgroupSize = m_physicalDevice->getLimits().minSubgroupSize;
+		const uint32_t elementCount = 1024*1024;
 		const auto MaxSubgroupSize = m_physicalDevice->getLimits().maxSubgroupSize;
-
 		const auto SubgroupSizeLog2 = hlsl::findMSB(MaxSubgroupSize);
 
 		cmdbuf->bindDescriptorSets(EPBP_COMPUTE, benchSets[0].pipeline->getLayout(), 0u, 1u, &benchDs.get());
@@ -608,17 +584,6 @@ class ArithmeticBenchApp final : public examples::SimpleWindowedApplication, pub
 	bool keepRunning() override { return numSubmits < MaxNumSubmits; }
 
 private:
-	void logTestOutcome(bool passed, uint32_t workgroupSize)
-	{
-		if (passed)
-			m_logger->log("Passed test #%u", ILogger::ELL_INFO, workgroupSize);
-		else
-		{
-			totalFailCount++;
-			m_logger->log("Failed test #%u", ILogger::ELL_ERROR, workgroupSize);
-		}
-	}
-
 	// create pipeline (specialized every test) [TODO: turn into a future/async]
 	smart_refctd_ptr<IGPUComputePipeline> createPipeline(const ICPUShader* overridenUnspecialized, const IGPUPipelineLayout* layout, const uint8_t subgroupSizeLog2)
 	{
@@ -648,7 +613,7 @@ class ArithmeticBenchApp final : public examples::SimpleWindowedApplication, pub
 	template<template<class> class Arithmetic, bool WorkgroupBench>
 	BenchmarkSet createBenchmarkPipelines(const smart_refctd_ptr<const ICPUShader>&source, const IGPUPipelineLayout* layout, const uint32_t elementCount, const uint8_t subgroupSizeLog2, const uint32_t workgroupSize, uint32_t itemsPerInvoc = 1u, uint32_t numLoops = 8u)
 	{
-		std::string arith_name = Arithmetic<plus<uint32_t>>::name;
+		std::string arith_name = Arithmetic<arithmetic::plus<uint32_t>>::name;
 
 		auto compiler = make_smart_refctd_ptr<asset::CHLSLCompiler>(smart_refctd_ptr(m_system));
 		CHLSLCompiler::SOptions options = {};
@@ -784,13 +749,14 @@ class ArithmeticBenchApp final : public examples::SimpleWindowedApplication, pub
 
 	constexpr static inline uint32_t MaxNumSubmits = 30;
 	uint32_t numSubmits = 0;
+	uint32_t elementCount = 1024 * 1024;
 
 	/* PARAMETERS TO CHANGE FOR DIFFERENT BENCHMARKS */
 	constexpr static inline bool DoWorkgroupBenchmarks = true;
 	uint32_t ItemsPerInvocation = 4u;
 	constexpr static inline uint32_t NumLoops = 1000u;
-	constexpr static inline uint32_t NumBenchmarks = 6u;
-	constexpr static inline std::array<uint32_t, NumBenchmarks> workgroupSizes = { 32, 64, 128, 256, 512, 1024 };
+	constexpr static inline uint32_t NumBenchmarks = 2u;
+	constexpr static inline std::array<uint32_t, NumBenchmarks> workgroupSizes = { 32, 64 };// 128, 256, 512, 1024};
 	template<class BinOp>
 	using ArithmeticOp = emulatedReduction<BinOp>;	// change this to test other arithmetic ops
 
@@ -807,9 +773,6 @@ class ArithmeticBenchApp final : public examples::SimpleWindowedApplication, pub
 
 	smart_refctd_ptr<ISemaphore> sema;
 	uint64_t timelineValue = 0;
-	smart_refctd_ptr<ICPUBuffer> resultsBuffer;
-
-	uint32_t totalFailCount = 0;
 };
 
 NBL_MAIN_FUNC(ArithmeticBenchApp)
\ No newline at end of file
diff --git a/common/include/WorkgroupDataAccessors.hlsl b/common/include/WorkgroupDataAccessors.hlsl
new file mode 100644
index 000000000..267c81a73
--- /dev/null
+++ b/common/include/WorkgroupDataAccessors.hlsl
@@ -0,0 +1,119 @@
+#ifndef _WORKGROUP_DATA_ACCESSORS_HLSL_
+#define _WORKGROUP_DATA_ACCESSORS_HLSL_
+
+namespace nbl
+{
+namespace hlsl
+{
+
+struct ScratchProxy
+{
+    template<typename AccessType, typename IndexType>
+    void get(const uint32_t ix, NBL_REF_ARG(AccessType) value)
+    {
+        value = scratch[ix];
+    }
+    template<typename AccessType, typename IndexType>
+    void set(const uint32_t ix, const AccessType value)
+    {
+        scratch[ix] = value;
+    }
+
+    uint32_t atomicOr(const uint32_t ix, const uint32_t value)
+    {
+        return glsl::atomicOr(scratch[ix],value);
+    }
+
+    void workgroupExecutionAndMemoryBarrier()
+    {
+        glsl::barrier();
+        //glsl::memoryBarrierShared(); implied by the above
+    }
+};
+
+template<class Config, class Binop>
+struct DataProxy
+{
+    using dtype_t = vector<uint32_t, Config::ItemsPerInvocation_0>;
+
+    static DataProxy<Config, Binop> create()
+    {
+        DataProxy<Config, Binop> retval;
+        retval.workgroupOffset = glsl::gl_WorkGroupID().x * Config::VirtualWorkgroupSize;
+        retval.outputBufAddr = sizeof(uint32_t) + vk::RawBufferLoad<uint64_t>(pc.ppOutputBuf + Binop::BindingIndex * sizeof(uint64_t));
+        return retval;
+    }
+
+    template<typename AccessType, typename IndexType>
+    void get(const IndexType ix, NBL_REF_ARG(AccessType) value)
+    {
+        value = vk::RawBufferLoad<AccessType>(pc.pInputBuf + (workgroupOffset + ix) * sizeof(AccessType));
+    }
+    template<typename AccessType, typename IndexType>
+    void set(const IndexType ix, const AccessType value)
+    {
+        vk::RawBufferStore<AccessType>(outputBufAddr + sizeof(AccessType) * (workgroupOffset+ix), value, sizeof(uint32_t));
+    }
+
+    void workgroupExecutionAndMemoryBarrier()
+    {
+        glsl::barrier();
+        //glsl::memoryBarrierShared(); implied by the above
+    }
+
+    uint32_t workgroupOffset;
+    uint64_t outputBufAddr;
+};
+
+template<class Config, class Binop>
+struct PreloadedDataProxy
+{
+    using dtype_t = vector<uint32_t, Config::ItemsPerInvocation_0>;
+
+    NBL_CONSTEXPR_STATIC_INLINE uint32_t PreloadedDataCount = Config::VirtualWorkgroupSize / Config::WorkgroupSize;
+
+    static PreloadedDataProxy<Config, Binop> create()
+    {
+        PreloadedDataProxy<Config, Binop> retval;
+        retval.data = DataProxy<Config, Binop>::create();
+        return retval;
+    }
+
+    template<typename AccessType, typename IndexType>
+    void get(const IndexType ix, NBL_REF_ARG(AccessType) value)
+    {
+        value = preloaded[ix>>Config::WorkgroupSizeLog2];
+    }
+    template<typename AccessType, typename IndexType>
+    void set(const IndexType ix, const AccessType value)
+    {
+        preloaded[ix>>Config::WorkgroupSizeLog2] = value;
+    }
+
+    void preload()
+    {
+        [unroll]
+        for (uint16_t idx = 0; idx < PreloadedDataCount; idx++)
+            data.template get<dtype_t, uint16_t>(idx * Config::WorkgroupSize + workgroup::SubgroupContiguousIndex(), preloaded[idx]);
+    }
+    void unload()
+    {
+        [unroll]
+        for (uint16_t idx = 0; idx < PreloadedDataCount; idx++)
+            data.template set<dtype_t, uint16_t>(idx * Config::WorkgroupSize + workgroup::SubgroupContiguousIndex(), preloaded[idx]);
+    }
+
+    void workgroupExecutionAndMemoryBarrier()
+    {
+        glsl::barrier();
+        //glsl::memoryBarrierShared(); implied by the above
+    }
+
+    DataProxy<Config, Binop> data;
+    dtype_t preloaded[PreloadedDataCount];
+};
+
+}
+}
+
+#endif

From 2aef6d343f68dbf9db15505ae50ed6ce2a249d4c Mon Sep 17 00:00:00 2001
From: keptsecret <sorchon@gmail.com>
Date: Fri, 6 Jun 2025 15:42:05 +0700
Subject: [PATCH 341/529] generate benchmark inputs with xoroshiro

---
 .../app_resources/benchmarkSubgroup.comp.hlsl |  8 ++-
 .../benchmarkWorkgroup.comp.hlsl              | 71 +++++++++++++++----
 .../app_resources/shaderCommon.hlsl           |  2 +-
 29_Arithmetic2Bench/main.cpp                  | 54 +++++++-------
 common/include/WorkgroupDataAccessors.hlsl    |  6 +-
 5 files changed, 93 insertions(+), 48 deletions(-)

diff --git a/29_Arithmetic2Bench/app_resources/benchmarkSubgroup.comp.hlsl b/29_Arithmetic2Bench/app_resources/benchmarkSubgroup.comp.hlsl
index 553103bef..2e5d3e146 100644
--- a/29_Arithmetic2Bench/app_resources/benchmarkSubgroup.comp.hlsl
+++ b/29_Arithmetic2Bench/app_resources/benchmarkSubgroup.comp.hlsl
@@ -5,6 +5,7 @@
 #include "nbl/builtin/hlsl/glsl_compat/core.hlsl"
 #include "nbl/builtin/hlsl/glsl_compat/subgroup_basic.hlsl"
 #include "nbl/builtin/hlsl/subgroup2/arithmetic_portability.hlsl"
+#include "nbl/builtin/hlsl/random/xoroshiro.hlsl"
 
 #include "shaderCommon.hlsl"
 #include "nbl/builtin/hlsl/workgroup/basic.hlsl"
@@ -35,8 +36,11 @@ static void subbench(NBL_CONST_REF_ARG(type_t) sourceVal)
 
 void benchmark()
 {
-    const uint32_t idx = globalIndex();
-    type_t sourceVal = vk::RawBufferLoad<type_t>(pc.pInputBuf + idx * sizeof(type_t));
+    type_t sourceVal;
+    Xoroshiro64Star xoroshiro = Xoroshiro64Star::construct(uint32_t2(invocationIndex,invocationIndex+1));
+    [unroll]
+    for (uint16_t i = 0; i < Config::ItemsPerInvocation_0; i++)
+        sourceVal[i] = xoroshiro();
 
     subbench<arithmetic::bit_and<uint32_t>, ITEMS_PER_INVOCATION>(sourceVal);
     subbench<arithmetic::bit_xor<uint32_t>, ITEMS_PER_INVOCATION>(sourceVal);
diff --git a/29_Arithmetic2Bench/app_resources/benchmarkWorkgroup.comp.hlsl b/29_Arithmetic2Bench/app_resources/benchmarkWorkgroup.comp.hlsl
index 504cc36de..4e611476a 100644
--- a/29_Arithmetic2Bench/app_resources/benchmarkWorkgroup.comp.hlsl
+++ b/29_Arithmetic2Bench/app_resources/benchmarkWorkgroup.comp.hlsl
@@ -4,6 +4,7 @@
 #include "nbl/builtin/hlsl/glsl_compat/subgroup_basic.hlsl"
 #include "nbl/builtin/hlsl/subgroup2/arithmetic_portability.hlsl"
 #include "nbl/builtin/hlsl/workgroup2/arithmetic.hlsl"
+#include "nbl/builtin/hlsl/random/xoroshiro.hlsl"
 
 static const uint32_t WORKGROUP_SIZE = 1u << WORKGROUP_SIZE_LOG2;
 
@@ -18,6 +19,59 @@ groupshared uint32_t scratch[config_t::SharedScratchElementCount];
 
 #include "../../common/include/WorkgroupDataAccessors.hlsl"
 
+template<class Config, class Binop>
+struct RandomizedInputDataProxy
+{
+    using dtype_t = vector<uint32_t, Config::ItemsPerInvocation_0>;
+
+    NBL_CONSTEXPR_STATIC_INLINE uint32_t PreloadedDataCount = Config::VirtualWorkgroupSize / Config::WorkgroupSize;
+
+    static RandomizedInputDataProxy<Config, Binop> create()
+    {
+        RandomizedInputDataProxy<Config, Binop> retval;
+        retval.data = DataProxy<Config, Binop>::create();
+        return retval;
+    }
+
+    template<typename AccessType, typename IndexType>
+    void get(const IndexType ix, NBL_REF_ARG(AccessType) value)
+    {
+        value = preloaded[ix>>Config::WorkgroupSizeLog2];
+    }
+    template<typename AccessType, typename IndexType>
+    void set(const IndexType ix, const AccessType value)
+    {
+        preloaded[ix>>Config::WorkgroupSizeLog2] = value;
+    }
+
+    void preload()
+    {
+        const uint16_t invocationIndex = workgroup::SubgroupContiguousIndex();
+        Xoroshiro64Star xoroshiro = Xoroshiro64Star::construct(uint32_t2(invocationIndex,invocationIndex+1));
+        [unroll]
+        for (uint16_t idx = 0; idx < PreloadedDataCount; idx++)
+            [unroll]
+            for (uint16_t i = 0; i < Config::ItemsPerInvocation_0; i++)
+               preloaded[idx][i] = xoroshiro();
+    }
+    void unload()
+    {
+        const uint16_t invocationIndex = workgroup::SubgroupContiguousIndex();
+        [unroll]
+        for (uint16_t idx = 0; idx < PreloadedDataCount; idx++)
+            data.template set<dtype_t, uint16_t>(idx * Config::WorkgroupSize + invocationIndex, preloaded[idx]);
+    }
+
+    void workgroupExecutionAndMemoryBarrier()
+    {
+        glsl::barrier();
+        //glsl::memoryBarrierShared(); implied by the above
+    }
+
+    DataProxy<Config, Binop> data;
+    dtype_t preloaded[PreloadedDataCount];
+};
+
 static ScratchProxy arithmeticAccessor;
 
 template<class Binop, class device_capabilities>
@@ -26,29 +80,20 @@ struct operation_t
     using binop_base_t = typename Binop::base_t;
     using otype_t = typename Binop::type_t;
 
-    void operator()(PreloadedDataProxy<config_t,Binop> dataAccessor)
+    void operator()(RandomizedInputDataProxy<config_t,Binop> dataAccessor)
     {
 #if IS_REDUCTION
         otype_t value = 
 #endif
-        OPERATION<config_t,binop_base_t,device_capabilities>::template __call<PreloadedDataProxy<config_t,Binop>, ScratchProxy>(dataAccessor,arithmeticAccessor);
+        OPERATION<config_t,binop_base_t,device_capabilities>::template __call<RandomizedInputDataProxy<config_t,Binop>, ScratchProxy>(dataAccessor,arithmeticAccessor);
         // we barrier before because we alias the accessors for Binop
         arithmeticAccessor.workgroupExecutionAndMemoryBarrier();
 #if IS_REDUCTION
         [unroll]
-        for (uint32_t i = 0; i < PreloadedDataProxy<config_t,Binop>::PreloadedDataCount; i++)
+        for (uint32_t i = 0; i < RandomizedInputDataProxy<config_t,Binop>::PreloadedDataCount; i++)
             dataAccessor.preloaded[i] = value;
 #endif
     }
-// #else
-//     void operator()(PreloadedDataProxy<config_t,Binop> dataAccessor)
-//     {
-//         OPERATION<config_t,binop_base_t,device_capabilities>::template __call<PreloadedDataProxy<config_t,Binop>, ScratchProxy>(dataAccessor,arithmeticAccessor);
-//         // we barrier before because we alias the accessors for Binop
-//         arithmeticAccessor.workgroupExecutionAndMemoryBarrier();
-//     }
-// #endif
-
 };
 
 template<class Binop>
@@ -56,7 +101,7 @@ static void subbench()
 {
     const uint64_t outputBufAddr = vk::RawBufferLoad<uint64_t>(pc.ppOutputBuf + Binop::BindingIndex * sizeof(uint64_t), sizeof(uint64_t));
 
-    PreloadedDataProxy<config_t,Binop> dataAccessor = PreloadedDataProxy<config_t,Binop>::create();
+    RandomizedInputDataProxy<config_t,Binop> dataAccessor = RandomizedInputDataProxy<config_t,Binop>::create();
     dataAccessor.preload();
 
     operation_t<Binop,device_capabilities> func;
diff --git a/29_Arithmetic2Bench/app_resources/shaderCommon.hlsl b/29_Arithmetic2Bench/app_resources/shaderCommon.hlsl
index 8659fd054..bf66de500 100644
--- a/29_Arithmetic2Bench/app_resources/shaderCommon.hlsl
+++ b/29_Arithmetic2Bench/app_resources/shaderCommon.hlsl
@@ -17,7 +17,7 @@ struct device_capabilities
 #ifdef TEST_NATIVE
     NBL_CONSTEXPR_STATIC_INLINE bool shaderSubgroupArithmetic = true;
 #else
-    NBL_CONSTEXPR_STATIC_INLINE bool shaderSubgroupArithmetic = true;
+    NBL_CONSTEXPR_STATIC_INLINE bool shaderSubgroupArithmetic = false;
 #endif
 };
 
diff --git a/29_Arithmetic2Bench/main.cpp b/29_Arithmetic2Bench/main.cpp
index 0b6639bec..002471b22 100644
--- a/29_Arithmetic2Bench/main.cpp
+++ b/29_Arithmetic2Bench/main.cpp
@@ -283,28 +283,11 @@ class ArithmeticBenchApp final : public examples::SimpleWindowedApplication, pub
 		transferDownQueue = getTransferDownQueue();
 		computeQueue = getComputeQueue();
 
-		// populate our random data buffer on the CPU and create a GPU copy
-		inputData = new uint32_t[elementCount];
-		{
-			std::mt19937 randGenerator(0xdeadbeefu);
-			for (uint32_t i = 0u; i < elementCount; i++)
-				inputData[i] = randGenerator(); // TODO: change to using xoroshiro, then we can skip having the input buffer at all
-
-			IGPUBuffer::SCreationParams inputDataBufferCreationParams = {};
-			inputDataBufferCreationParams.size = sizeof(uint32_t) * elementCount;
-			inputDataBufferCreationParams.usage = IGPUBuffer::EUF_STORAGE_BUFFER_BIT | IGPUBuffer::EUF_TRANSFER_DST_BIT | IGPUBuffer::EUF_SHADER_DEVICE_ADDRESS_BIT;
-			m_utils->createFilledDeviceLocalBufferOnDedMem(
-				SIntendedSubmitInfo{.queue=getTransferUpQueue()},
-				std::move(inputDataBufferCreationParams),
-				inputData
-			).move_into(gpuinputDataBuffer);
-		}
-
 		// create 8 buffers for 8 operations
 		for (auto i=0u; i<OutputBufferCount; i++)
 		{
 			IGPUBuffer::SCreationParams params = {};
-			params.size = sizeof(uint32_t) + gpuinputDataBuffer->getSize();
+			params.size = sizeof(uint32_t) * (elementCount+1);
 			params.usage = bitflag(IGPUBuffer::EUF_STORAGE_BUFFER_BIT) | IGPUBuffer::EUF_TRANSFER_SRC_BIT | IGPUBuffer::EUF_SHADER_DEVICE_ADDRESS_BIT;
 
 			outputBuffers[i] = m_device->createBuffer(std::move(params));
@@ -327,7 +310,6 @@ class ArithmeticBenchApp final : public examples::SimpleWindowedApplication, pub
 			params.size = OutputBufferCount * sizeof(uint64_t);
 			m_utils->createFilledDeviceLocalBufferOnDedMem(SIntendedSubmitInfo{ .queue = getTransferUpQueue() }, std::move(params), outputAddresses.data()).move_into(gpuOutputAddressesBuffer);
 		}
-		pc.pInputBuf = gpuinputDataBuffer->getDeviceAddress();
 		pc.ppOutputBuf = gpuOutputAddressesBuffer->getDeviceAddress();
 
 		// create image views for swapchain images
@@ -363,6 +345,12 @@ class ArithmeticBenchApp final : public examples::SimpleWindowedApplication, pub
 			SPushConstantRange pcRange = { .stageFlags = IShader::E_SHADER_STAGE::ESS_COMPUTE, .offset = 0,.size = sizeof(PushConstantData) };
 			benchPplnLayout = m_device->createPipelineLayout({ &pcRange, 1 }, std::move(benchLayout));
 		}
+		if (UseNativeArithmetic && !m_physicalDevice->getProperties().limits.shaderSubgroupArithmetic)
+		{
+			m_logger->log("UseNativeArithmetic is true but device does not support shaderSubgroupArithmetic!", ILogger::ELL_ERROR);
+			exit(-1);
+		}
+			
 
 		// load shader source from file
 		auto getShaderSource = [&](const char* filePath) -> auto
@@ -414,7 +402,6 @@ class ArithmeticBenchApp final : public examples::SimpleWindowedApplication, pub
 
 	virtual bool onAppTerminated() override
 	{
-		delete[] inputData;
 		return true;
 	}
 
@@ -650,16 +637,20 @@ class ArithmeticBenchApp final : public examples::SimpleWindowedApplication, pub
 				std::to_string(arith_name=="reduction")
 			};
 
-			const IShaderCompiler::SMacroDefinition defines[7] = {
+			const IShaderCompiler::SMacroDefinition defines[8] = {
 				{ "OPERATION", definitions[0] },
 				{ "WORKGROUP_SIZE_LOG2", definitions[1] },
 				{ "ITEMS_PER_WG", definitions[2] },
 				{ "ITEMS_PER_INVOCATION", definitions[3] },
 				{ "SUBGROUP_SIZE_LOG2", definitions[4] },
 				{ "NUM_LOOPS", definitions[5] },
-				{ "IS_REDUCTION", definitions[6] }
+				{ "IS_REDUCTION", definitions[6] },
+				{ "TEST_NATIVE", "1" }
 			};
-			options.preprocessorOptions.extraDefines = { defines, defines + 7 };
+			if (UseNativeArithmetic)
+				options.preprocessorOptions.extraDefines = { defines, defines + 8 };
+			else
+				options.preprocessorOptions.extraDefines = { defines, defines + 7 };
 
 			overriddenUnspecialized = compiler->compileToSPIRV((const char*)source->getContent()->getPointer(), options);
 		}
@@ -673,14 +664,18 @@ class ArithmeticBenchApp final : public examples::SimpleWindowedApplication, pub
 				std::to_string(numLoops)
 			};
 
-			const IShaderCompiler::SMacroDefinition defines[5] = {
+			const IShaderCompiler::SMacroDefinition defines[6] = {
 				{ "OPERATION", definitions[0] },
 				{ "WORKGROUP_SIZE", definitions[1] },
 				{ "ITEMS_PER_INVOCATION", definitions[2] },
 				{ "SUBGROUP_SIZE_LOG2", definitions[3] },
-				{ "NUM_LOOPS", definitions[4] }
+				{ "NUM_LOOPS", definitions[4] },
+				{ "TEST_NATIVE", "1" }
 			};
-			options.preprocessorOptions.extraDefines = { defines, defines + 5 };
+			if (UseNativeArithmetic)
+				options.preprocessorOptions.extraDefines = { defines, defines + 6 };
+			else
+				options.preprocessorOptions.extraDefines = { defines, defines + 5 };
 
 			overriddenUnspecialized = compiler->compileToSPIRV((const char*)source->getContent()->getPointer(), options);
 		}
@@ -753,10 +748,11 @@ class ArithmeticBenchApp final : public examples::SimpleWindowedApplication, pub
 
 	/* PARAMETERS TO CHANGE FOR DIFFERENT BENCHMARKS */
 	constexpr static inline bool DoWorkgroupBenchmarks = true;
+	constexpr static inline bool UseNativeArithmetic = true;
 	uint32_t ItemsPerInvocation = 4u;
 	constexpr static inline uint32_t NumLoops = 1000u;
-	constexpr static inline uint32_t NumBenchmarks = 2u;
-	constexpr static inline std::array<uint32_t, NumBenchmarks> workgroupSizes = { 32, 64 };// 128, 256, 512, 1024};
+	constexpr static inline uint32_t NumBenchmarks = 6u;
+	constexpr static inline std::array<uint32_t, NumBenchmarks> workgroupSizes = { 32, 64, 128, 256, 512, 1024 };
 	template<class BinOp>
 	using ArithmeticOp = emulatedReduction<BinOp>;	// change this to test other arithmetic ops
 
@@ -764,8 +760,6 @@ class ArithmeticBenchApp final : public examples::SimpleWindowedApplication, pub
 	smart_refctd_ptr<IDescriptorPool> benchPool;
 	smart_refctd_ptr<IGPUDescriptorSet> benchDs;
 
-	uint32_t* inputData = nullptr;
-	smart_refctd_ptr<IGPUBuffer> gpuinputDataBuffer;
 	constexpr static inline uint32_t OutputBufferCount = 8u;
 	smart_refctd_ptr<IGPUBuffer> outputBuffers[OutputBufferCount];
 	smart_refctd_ptr<IGPUBuffer> gpuOutputAddressesBuffer;
diff --git a/common/include/WorkgroupDataAccessors.hlsl b/common/include/WorkgroupDataAccessors.hlsl
index 267c81a73..e6112f797 100644
--- a/common/include/WorkgroupDataAccessors.hlsl
+++ b/common/include/WorkgroupDataAccessors.hlsl
@@ -92,15 +92,17 @@ struct PreloadedDataProxy
 
     void preload()
     {
+        const uint16_t invocationIndex = workgroup::SubgroupContiguousIndex();
         [unroll]
         for (uint16_t idx = 0; idx < PreloadedDataCount; idx++)
-            data.template get<dtype_t, uint16_t>(idx * Config::WorkgroupSize + workgroup::SubgroupContiguousIndex(), preloaded[idx]);
+            data.template get<dtype_t, uint16_t>(idx * Config::WorkgroupSize + invocationIndex, preloaded[idx]);
     }
     void unload()
     {
+        const uint16_t invocationIndex = workgroup::SubgroupContiguousIndex();
         [unroll]
         for (uint16_t idx = 0; idx < PreloadedDataCount; idx++)
-            data.template set<dtype_t, uint16_t>(idx * Config::WorkgroupSize + workgroup::SubgroupContiguousIndex(), preloaded[idx]);
+            data.template set<dtype_t, uint16_t>(idx * Config::WorkgroupSize + invocationIndex, preloaded[idx]);
     }
 
     void workgroupExecutionAndMemoryBarrier()

From 149a2375c3ffb43fa4c3e403c6d6eae056828fb3 Mon Sep 17 00:00:00 2001
From: keptsecret <sorchon@gmail.com>
Date: Fri, 6 Jun 2025 16:20:40 +0700
Subject: [PATCH 342/529] only have to benchmark plus op

---
 .../app_resources/benchmarkSubgroup.comp.hlsl |  6 --
 .../benchmarkWorkgroup.comp.hlsl              | 17 +----
 29_Arithmetic2Bench/app_resources/common.hlsl | 64 +------------------
 .../app_resources/shaderCommon.hlsl           |  5 --
 29_Arithmetic2Bench/main.cpp                  | 15 +----
 5 files changed, 4 insertions(+), 103 deletions(-)

diff --git a/29_Arithmetic2Bench/app_resources/benchmarkSubgroup.comp.hlsl b/29_Arithmetic2Bench/app_resources/benchmarkSubgroup.comp.hlsl
index 2e5d3e146..ba11890d1 100644
--- a/29_Arithmetic2Bench/app_resources/benchmarkSubgroup.comp.hlsl
+++ b/29_Arithmetic2Bench/app_resources/benchmarkSubgroup.comp.hlsl
@@ -42,13 +42,7 @@ void benchmark()
     for (uint16_t i = 0; i < Config::ItemsPerInvocation_0; i++)
         sourceVal[i] = xoroshiro();
 
-    subbench<arithmetic::bit_and<uint32_t>, ITEMS_PER_INVOCATION>(sourceVal);
-    subbench<arithmetic::bit_xor<uint32_t>, ITEMS_PER_INVOCATION>(sourceVal);
-    subbench<arithmetic::bit_or<uint32_t>, ITEMS_PER_INVOCATION>(sourceVal);
     subbench<arithmetic::plus<uint32_t>, ITEMS_PER_INVOCATION>(sourceVal);
-    subbench<arithmetic::multiplies<uint32_t>, ITEMS_PER_INVOCATION>(sourceVal);
-    subbench<arithmetic::minimum<uint32_t>, ITEMS_PER_INVOCATION>(sourceVal);
-    subbench<arithmetic::maximum<uint32_t>, ITEMS_PER_INVOCATION>(sourceVal);
 }
 
 [numthreads(WORKGROUP_SIZE,1,1)]
diff --git a/29_Arithmetic2Bench/app_resources/benchmarkWorkgroup.comp.hlsl b/29_Arithmetic2Bench/app_resources/benchmarkWorkgroup.comp.hlsl
index 4e611476a..58a3624cd 100644
--- a/29_Arithmetic2Bench/app_resources/benchmarkWorkgroup.comp.hlsl
+++ b/29_Arithmetic2Bench/app_resources/benchmarkWorkgroup.comp.hlsl
@@ -113,26 +113,11 @@ static void subbench()
 
 void benchmark()
 {
-    subbench<arithmetic::bit_and<uint32_t> >();
-    subbench<arithmetic::bit_xor<uint32_t> >();
-    subbench<arithmetic::bit_or<uint32_t> >();
+    // only benchmark plus op
     subbench<arithmetic::plus<uint32_t> >();
-    subbench<arithmetic::multiplies<uint32_t> >();
-    subbench<arithmetic::minimum<uint32_t> >();
-    subbench<arithmetic::maximum<uint32_t> >();
 }
 
 
-uint32_t globalIndex()
-{
-    return glsl::gl_WorkGroupID().x*ITEMS_PER_WG+workgroup::SubgroupContiguousIndex();
-}
-
-bool canStore()
-{
-    return workgroup::SubgroupContiguousIndex()<ITEMS_PER_WG;
-}
-
 [numthreads(WORKGROUP_SIZE,1,1)]
 void main()
 {
diff --git a/29_Arithmetic2Bench/app_resources/common.hlsl b/29_Arithmetic2Bench/app_resources/common.hlsl
index a1d74b13b..388be324f 100644
--- a/29_Arithmetic2Bench/app_resources/common.hlsl
+++ b/29_Arithmetic2Bench/app_resources/common.hlsl
@@ -10,82 +10,22 @@ struct PushConstantData
 namespace arithmetic
 {
 template<typename T>
-struct bit_and : nbl::hlsl::bit_and<T>
-{
-    using base_t = nbl::hlsl::bit_and<T>;
-
-    NBL_CONSTEXPR_STATIC_INLINE uint16_t BindingIndex = 0;
-#ifndef __HLSL_VERSION
-    static inline constexpr const char* name = "bit_and";
-#endif
-};
-template<typename T>
-struct bit_or : nbl::hlsl::bit_or<T>
-{
-    using base_t = nbl::hlsl::bit_or<T>;
-
-    NBL_CONSTEXPR_STATIC_INLINE uint16_t BindingIndex = 1;
-#ifndef __HLSL_VERSION
-    static inline constexpr const char* name = "bit_xor";
-#endif
-};
-template<typename T>
-struct bit_xor : nbl::hlsl::bit_xor<T>
-{
-    using base_t = nbl::hlsl::bit_xor<T>;
-
-    NBL_CONSTEXPR_STATIC_INLINE uint16_t BindingIndex = 2;
-#ifndef __HLSL_VERSION
-    static inline constexpr const char* name = "bit_or";
-#endif
-};
-template<typename T>
 struct plus : nbl::hlsl::plus<T>
 {
     using base_t = nbl::hlsl::plus<T>;
 
-    NBL_CONSTEXPR_STATIC_INLINE uint16_t BindingIndex = 3;
+    NBL_CONSTEXPR_STATIC_INLINE uint16_t BindingIndex = 0;
 #ifndef __HLSL_VERSION
     static inline constexpr const char* name = "plus";
 #endif
 };
-template<typename T>
-struct multiplies : nbl::hlsl::multiplies<T>
-{
-    using base_t = nbl::hlsl::multiplies<T>;
-
-    NBL_CONSTEXPR_STATIC_INLINE uint16_t BindingIndex = 4;
-#ifndef __HLSL_VERSION
-    static inline constexpr const char* name = "multiplies";
-#endif
-};
-template<typename T>
-struct minimum : nbl::hlsl::minimum<T>
-{
-    using base_t = nbl::hlsl::minimum<T>;
-
-    NBL_CONSTEXPR_STATIC_INLINE uint16_t BindingIndex = 5;
-#ifndef __HLSL_VERSION
-    static inline constexpr const char* name = "minimum";
-#endif
-};
-template<typename T>
-struct maximum : nbl::hlsl::maximum<T>
-{
-    using base_t = nbl::hlsl::maximum<T>;
-
-    NBL_CONSTEXPR_STATIC_INLINE uint16_t BindingIndex = 6;
-#ifndef __HLSL_VERSION
-    static inline constexpr const char* name = "maximum";
-#endif
-};
 
 template<typename T>
 struct ballot : nbl::hlsl::plus<T>
 {
     using base_t = nbl::hlsl::plus<T>;
 
-    NBL_CONSTEXPR_STATIC_INLINE uint16_t BindingIndex = 7;
+    NBL_CONSTEXPR_STATIC_INLINE uint16_t BindingIndex = 1;
 #ifndef __HLSL_VERSION
     static inline constexpr const char* name = "bitcount";
 #endif
diff --git a/29_Arithmetic2Bench/app_resources/shaderCommon.hlsl b/29_Arithmetic2Bench/app_resources/shaderCommon.hlsl
index bf66de500..4866efe81 100644
--- a/29_Arithmetic2Bench/app_resources/shaderCommon.hlsl
+++ b/29_Arithmetic2Bench/app_resources/shaderCommon.hlsl
@@ -21,11 +21,6 @@ struct device_capabilities
 #endif
 };
 
-// because subgroups don't match `gl_LocalInvocationIndex` snake curve addressing, we also can't load inputs that way
-uint32_t globalIndex();
-// since we test ITEMS_PER_WG<WorkgroupSize we need this so workgroups don't overwrite each other's outputs
-bool canStore();
-
 #ifndef OPERATION
 #error "Define OPERATION!"
 #endif
diff --git a/29_Arithmetic2Bench/main.cpp b/29_Arithmetic2Bench/main.cpp
index 002471b22..c91cbe4aa 100644
--- a/29_Arithmetic2Bench/main.cpp
+++ b/29_Arithmetic2Bench/main.cpp
@@ -370,18 +370,6 @@ class ArithmeticBenchApp final : public examples::SimpleWindowedApplication, pub
 
 		auto subgroupBenchSource = getShaderSource("app_resources/benchmarkSubgroup.comp.hlsl");
 		auto workgroupBenchSource = getShaderSource("app_resources/benchmarkWorkgroup.comp.hlsl");
-		// now create or retrieve final resources to run our tests
-		sema = m_device->createSemaphore(timelineValue);
-		smart_refctd_ptr<IGPUCommandBuffer> cmdbuf;
-		{
-			smart_refctd_ptr<nbl::video::IGPUCommandPool> cmdpool = m_device->createCommandPool(computeQueue->getFamilyIndex(),IGPUCommandPool::CREATE_FLAGS::RESET_COMMAND_BUFFER_BIT);
-			if (!cmdpool->createCommandBuffers(IGPUCommandPool::BUFFER_LEVEL::PRIMARY,{&cmdbuf,1}))
-			{
-				logFail("Failed to create Command Buffers!\n");
-				return false;
-			}
-		}
-
 		const auto MaxSubgroupSize = m_physicalDevice->getLimits().maxSubgroupSize;
 		// for each workgroup size (manually adjust items per invoc, operation else uses up a lot of ram)
 		if constexpr (DoWorkgroupBenchmarks)
@@ -760,12 +748,11 @@ class ArithmeticBenchApp final : public examples::SimpleWindowedApplication, pub
 	smart_refctd_ptr<IDescriptorPool> benchPool;
 	smart_refctd_ptr<IGPUDescriptorSet> benchDs;
 
-	constexpr static inline uint32_t OutputBufferCount = 8u;
+	constexpr static inline uint32_t OutputBufferCount = 2u;
 	smart_refctd_ptr<IGPUBuffer> outputBuffers[OutputBufferCount];
 	smart_refctd_ptr<IGPUBuffer> gpuOutputAddressesBuffer;
 	PushConstantData pc;
 
-	smart_refctd_ptr<ISemaphore> sema;
 	uint64_t timelineValue = 0;
 };
 

From 00ed9beaddced9d0bd01e18d510ea7d58e48cfb5 Mon Sep 17 00:00:00 2001
From: keptsecret <sorchon@gmail.com>
Date: Fri, 6 Jun 2025 16:55:15 +0700
Subject: [PATCH 343/529] benchmark all reduce/scan in one run (lots of
 shaders)

---
 29_Arithmetic2Bench/main.cpp | 96 ++++++++++++++----------------------
 1 file changed, 37 insertions(+), 59 deletions(-)

diff --git a/29_Arithmetic2Bench/main.cpp b/29_Arithmetic2Bench/main.cpp
index c91cbe4aa..38f995264 100644
--- a/29_Arithmetic2Bench/main.cpp
+++ b/29_Arithmetic2Bench/main.cpp
@@ -10,43 +10,6 @@ using namespace asset;
 using namespace ui;
 using namespace video;
 
-// method emulations on the CPU, to verify the results of the GPU methods
-template<class Binop>
-struct emulatedReduction
-{
-	using type_t = typename Binop::type_t;
-
-	static inline void impl(type_t* out, const type_t* in, const uint32_t itemCount)
-	{
-		const type_t red = std::reduce(in,in+itemCount,Binop::identity,Binop());
-		std::fill(out,out+itemCount,red);
-	}
-
-	static inline constexpr const char* name = "reduction";
-};
-template<class Binop>
-struct emulatedScanInclusive
-{
-	using type_t = typename Binop::type_t;
-
-	static inline void impl(type_t* out, const type_t* in, const uint32_t itemCount)
-	{
-		std::inclusive_scan(in,in+itemCount,out,Binop());
-	}
-	static inline constexpr const char* name = "inclusive_scan";
-};
-template<class Binop>
-struct emulatedScanExclusive
-{
-	using type_t = typename Binop::type_t;
-
-	static inline void impl(type_t* out, const type_t* in, const uint32_t itemCount)
-	{
-		std::exclusive_scan(in,in+itemCount,out,Binop::identity,Binop());
-	}
-	static inline constexpr const char* name = "exclusive_scan";
-};
-
 template<typename SwapchainResources> requires std::is_base_of_v<ISimpleManagedSurface::ISwapchainResources, SwapchainResources>
 class CExplicitSurfaceFormatResizeSurface final : public ISimpleManagedSurface
 {
@@ -287,7 +250,7 @@ class ArithmeticBenchApp final : public examples::SimpleWindowedApplication, pub
 		for (auto i=0u; i<OutputBufferCount; i++)
 		{
 			IGPUBuffer::SCreationParams params = {};
-			params.size = sizeof(uint32_t) * (elementCount+1);
+			params.size = sizeof(uint32_t) * (ElementCount+1);
 			params.usage = bitflag(IGPUBuffer::EUF_STORAGE_BUFFER_BIT) | IGPUBuffer::EUF_TRANSFER_SRC_BIT | IGPUBuffer::EUF_SHADER_DEVICE_ADDRESS_BIT;
 
 			outputBuffers[i] = m_device->createBuffer(std::move(params));
@@ -368,20 +331,17 @@ class ArithmeticBenchApp final : public examples::SimpleWindowedApplication, pub
 			return smart_refctd_ptr_static_cast<ICPUShader>(firstAssetInBundle);
 		};
 
-		auto subgroupBenchSource = getShaderSource("app_resources/benchmarkSubgroup.comp.hlsl");
-		auto workgroupBenchSource = getShaderSource("app_resources/benchmarkWorkgroup.comp.hlsl");
-		const auto MaxSubgroupSize = m_physicalDevice->getLimits().maxSubgroupSize;
 		// for each workgroup size (manually adjust items per invoc, operation else uses up a lot of ram)
+		const auto MaxSubgroupSize = m_physicalDevice->getLimits().maxSubgroupSize;
+		smart_refctd_ptr<ICPUShader> shaderSource;
 		if constexpr (DoWorkgroupBenchmarks)
-		{
-			for (uint32_t i = 0; i < workgroupSizes.size(); i++)
-				benchSets[i] = createBenchmarkPipelines<ArithmeticOp, DoWorkgroupBenchmarks>(workgroupBenchSource, benchPplnLayout.get(), elementCount, hlsl::findMSB(MaxSubgroupSize), workgroupSizes[i], ItemsPerInvocation, NumLoops);
-		}
+			shaderSource = getShaderSource("app_resources/benchmarkWorkgroup.comp.hlsl");
 		else
-		{
+			shaderSource = getShaderSource("app_resources/benchmarkSubgroup.comp.hlsl");
+
+		for (uint32_t op = 0; op < arithmeticOperations.size(); op++)
 			for (uint32_t i = 0; i < workgroupSizes.size(); i++)
-				benchSets[i] = createBenchmarkPipelines<ArithmeticOp, DoWorkgroupBenchmarks>(subgroupBenchSource, benchPplnLayout.get(), elementCount, hlsl::findMSB(MaxSubgroupSize), workgroupSizes[i], ItemsPerInvocation, NumLoops);
-		}
+				benchSets[op*workgroupSizes.size()+i] = createBenchmarkPipelines<DoWorkgroupBenchmarks>(shaderSource, benchPplnLayout.get(), ElementCount, arithmeticOperations[op], hlsl::findMSB(MaxSubgroupSize), workgroupSizes[i], ItemsPerInvocation, NumLoops);
 
 		m_winMgr->show(m_window.get());
 
@@ -559,6 +519,27 @@ class ArithmeticBenchApp final : public examples::SimpleWindowedApplication, pub
 	bool keepRunning() override { return numSubmits < MaxNumSubmits; }
 
 private:
+	// reflects calculations in workgroup2::ArithmeticConfiguration
+	uint32_t calculateItemsPerWorkgroup(const uint32_t workgroupSize, const uint32_t subgroupSize, const uint32_t itemsPerInvocation)
+	{
+		if (workgroupSize <= subgroupSize)
+			return workgroupSize * itemsPerInvocation;
+
+		const uint8_t subgroupSizeLog2 = hlsl::findMSB(subgroupSize);
+		const uint8_t workgroupSizeLog2 = hlsl::findMSB(workgroupSize);
+
+		const uint16_t levels = (workgroupSizeLog2 == subgroupSizeLog2) ? 1 :
+			(workgroupSizeLog2 > subgroupSizeLog2 * 2 + 2) ? 3 : 2;
+
+		const uint16_t itemsPerInvocationProductLog2 = max(workgroupSizeLog2 - subgroupSizeLog2 * levels, 0);
+		uint16_t itemsPerInvocation1 = (levels == 3) ? min(itemsPerInvocationProductLog2, 2) : itemsPerInvocationProductLog2;
+		itemsPerInvocation1 = uint16_t(1u) << itemsPerInvocation1;
+
+		uint32_t virtualWorkgroupSize = 1u << max(subgroupSizeLog2 * levels, workgroupSizeLog2);
+
+		return itemsPerInvocation * virtualWorkgroupSize;
+	}
+
 	// create pipeline (specialized every test) [TODO: turn into a future/async]
 	smart_refctd_ptr<IGPUComputePipeline> createPipeline(const ICPUShader* overridenUnspecialized, const IGPUPipelineLayout* layout, const uint8_t subgroupSizeLog2)
 	{
@@ -585,11 +566,9 @@ class ArithmeticBenchApp final : public examples::SimpleWindowedApplication, pub
 		uint32_t itemsPerInvocation;
 	};
 
-	template<template<class> class Arithmetic, bool WorkgroupBench>
-	BenchmarkSet createBenchmarkPipelines(const smart_refctd_ptr<const ICPUShader>&source, const IGPUPipelineLayout* layout, const uint32_t elementCount, const uint8_t subgroupSizeLog2, const uint32_t workgroupSize, uint32_t itemsPerInvoc = 1u, uint32_t numLoops = 8u)
+	template<bool WorkgroupBench>
+	BenchmarkSet createBenchmarkPipelines(const smart_refctd_ptr<const ICPUShader>&source, const IGPUPipelineLayout* layout, const uint32_t elementCount, const std::string& arith_name, const uint8_t subgroupSizeLog2, const uint32_t workgroupSize, uint32_t itemsPerInvoc = 1u, uint32_t numLoops = 8u)
 	{
-		std::string arith_name = Arithmetic<arithmetic::plus<uint32_t>>::name;
-
 		auto compiler = make_smart_refctd_ptr<asset::CHLSLCompiler>(smart_refctd_ptr(m_system));
 		CHLSLCompiler::SOptions options = {};
 		options.stage = IShader::E_SHADER_STAGE::ESS_COMPUTE;
@@ -606,11 +585,10 @@ class ArithmeticBenchApp final : public examples::SimpleWindowedApplication, pub
 		options.preprocessorOptions.logger = m_logger.get();
 
 		auto* includeFinder = compiler->getDefaultIncludeFinder();
-		includeFinder->addSearchPath("nbl/builtin/hlsl/jit", core::make_smart_refctd_ptr<CJITIncludeLoader>(m_physicalDevice->getLimits(), m_device->getEnabledFeatures()));
 		options.preprocessorOptions.includeFinder = includeFinder;
 
 		const uint32_t subgroupSize = 0x1u << subgroupSizeLog2;
-		const uint32_t itemsPerWG = workgroupSize <= subgroupSize ? workgroupSize * itemsPerInvoc : itemsPerInvoc * max(workgroupSize >> subgroupSizeLog2, subgroupSize) << subgroupSizeLog2;	// TODO use Config somehow
+		const uint32_t itemsPerWG = calculateItemsPerWorkgroup(workgroupSize, subgroupSize, itemsPerInvoc);
 		smart_refctd_ptr<ICPUShader> overriddenUnspecialized;
 		if constexpr (WorkgroupBench)
 		{
@@ -732,7 +710,7 @@ class ArithmeticBenchApp final : public examples::SimpleWindowedApplication, pub
 
 	constexpr static inline uint32_t MaxNumSubmits = 30;
 	uint32_t numSubmits = 0;
-	uint32_t elementCount = 1024 * 1024;
+	constexpr static inline uint32_t ElementCount = 1024 * 1024;
 
 	/* PARAMETERS TO CHANGE FOR DIFFERENT BENCHMARKS */
 	constexpr static inline bool DoWorkgroupBenchmarks = true;
@@ -740,11 +718,11 @@ class ArithmeticBenchApp final : public examples::SimpleWindowedApplication, pub
 	uint32_t ItemsPerInvocation = 4u;
 	constexpr static inline uint32_t NumLoops = 1000u;
 	constexpr static inline uint32_t NumBenchmarks = 6u;
-	constexpr static inline std::array<uint32_t, NumBenchmarks> workgroupSizes = { 32, 64, 128, 256, 512, 1024 };
-	template<class BinOp>
-	using ArithmeticOp = emulatedReduction<BinOp>;	// change this to test other arithmetic ops
+	std::array<uint32_t, NumBenchmarks> workgroupSizes = { 32, 64, 128, 256, 512, 1024 };
+	std::array<std::string, 3u> arithmeticOperations = { "reduction", "inclusive_scan", "exclusive_scan" };
+
 
-	std::array<BenchmarkSet, NumBenchmarks> benchSets;
+	std::array<BenchmarkSet, NumBenchmarks*3u> benchSets;
 	smart_refctd_ptr<IDescriptorPool> benchPool;
 	smart_refctd_ptr<IGPUDescriptorSet> benchDs;
 

From d1a4e8e8e7f21390a7d817bc0bf7f2d984ac4253 Mon Sep 17 00:00:00 2001
From: Przemek <minikers21@gmail.com>
Date: Sat, 7 Jun 2025 15:51:01 +0200
Subject: [PATCH 344/529] Added diagonal mode info to grid DTM height map

---
 62_CAD/main.cpp                               | 26 +++++++++++++
 62_CAD/shaders/globals.hlsl                   | 28 +++++++++++++
 .../main_pipeline/fragment_shader.hlsl        | 39 +++++++++++++------
 .../shaders/main_pipeline/vertex_shader.hlsl  |  5 ++-
 4 files changed, 85 insertions(+), 13 deletions(-)

diff --git a/62_CAD/main.cpp b/62_CAD/main.cpp
index a3f4016d7..41d8fbfd3 100644
--- a/62_CAD/main.cpp
+++ b/62_CAD/main.cpp
@@ -1261,6 +1261,31 @@ class ComputerAidedDesign final : public examples::SimpleWindowedApplication, pu
 		}
 
 		gridDTMHeightMap = loadImage("../../media/gridDTMHeightMap.exr");
+
+		// set diagonals of even cells to TOP_LEFT_TO_BOTTOM_RIGHT and diagonals of odd cells to BOTTOM_LEFT_TO_TOP_RIGHT
+		{
+			// assumption is that format of the grid DTM height map is *_SRGB, I don't think we need any code to ensure that
+
+			auto* region = gridDTMHeightMap->getRegion(0, core::vectorSIMDu32(0.0f));
+			auto imageExtent = region->getExtent();
+			auto imagePixelSize = asset::getBytesPerPixel(gridDTMHeightMap->getCreationParameters().format).getIntegerApprox();
+			float* imageData = static_cast<float*>(gridDTMHeightMap->getBuffer()->getPointer()) + region->bufferOffset;
+			const size_t imageByteSize = gridDTMHeightMap->getImageDataSizeInBytes();
+			assert(imageByteSize % sizeof(float) == 0);
+
+			for (int i = 0; i < imageByteSize; i += sizeof(float))
+			{
+				const bool isCellEven = i % (2 * sizeof(float)) == 0;
+				E_CELL_DIAGONAL diagonal = isCellEven ? TOP_LEFT_TO_BOTTOM_RIGHT : BOTTOM_LEFT_TO_TOP_RIGHT;
+
+				// test
+				diagonal = BOTTOM_LEFT_TO_TOP_RIGHT;
+
+				setDiagonalModeBit(imageData, diagonal);
+				imageData++;
+			}
+		}
+
 		assert(gridDTMHeightMap);
 
 		return true;
@@ -3735,3 +3760,4 @@ class ComputerAidedDesign final : public examples::SimpleWindowedApplication, pu
 };
 
 NBL_MAIN_FUNC(ComputerAidedDesign)
+
diff --git a/62_CAD/shaders/globals.hlsl b/62_CAD/shaders/globals.hlsl
index 1397f78e6..2361de5e2 100644
--- a/62_CAD/shaders/globals.hlsl
+++ b/62_CAD/shaders/globals.hlsl
@@ -263,6 +263,34 @@ struct GridDTMInfo
     float thicknessOfTheThickestLine; // 4 bytes (48)
 };
 
+enum E_CELL_DIAGONAL : uint32_t
+{
+    TOP_LEFT_TO_BOTTOM_RIGHT = 0u,
+    BOTTOM_LEFT_TO_TOP_RIGHT = 1u,
+    INVALID = 2u
+};
+
+#ifndef __HLSL_VERSION
+
+// sets last bit of data to 1 or 0 depending on diagonalMode
+static void setDiagonalModeBit(float* data, E_CELL_DIAGONAL diagonalMode)
+{
+    if (diagonalMode == E_CELL_DIAGONAL::INVALID)
+        return;
+
+    uint32_t dataAsUint = reinterpret_cast<uint32_t&>(*data);
+    dataAsUint |= static_cast<uint32_t>(diagonalMode);
+    *data = reinterpret_cast<float&>(dataAsUint);
+}
+
+#endif
+
+// Top left corner holds diagonal mode info of a cell 
+static E_CELL_DIAGONAL getDiagonalModeFromCellCornerData(float cellCornerData)
+{
+    return (nbl::hlsl::bit_cast<uint32_t, float>(cellCornerData) & 0x1u) ? BOTTOM_LEFT_TO_TOP_RIGHT : TOP_LEFT_TO_BOTTOM_RIGHT;
+}
+
 static uint32_t packR11G11B10_UNORM(float32_t3 color)
 {
     // Scale and convert to integers
diff --git a/62_CAD/shaders/main_pipeline/fragment_shader.hlsl b/62_CAD/shaders/main_pipeline/fragment_shader.hlsl
index d4f269413..10a2348a7 100644
--- a/62_CAD/shaders/main_pipeline/fragment_shader.hlsl
+++ b/62_CAD/shaders/main_pipeline/fragment_shader.hlsl
@@ -117,17 +117,8 @@ float32_t4 calculateFinalColor<true>(const uint2 fragCoord, const float localAlp
     return color;
 }
 
-enum E_CELL_DIAGONAL
-{
-    TOP_LEFT_TO_BOTTOM_RIGHT,
-    BOTTOM_LEFT_TO_TOP_RIGHT,
-    INVALID
-};
-
 E_CELL_DIAGONAL resolveGridDTMCellDiagonal(in float4 cellHeights)
 {
-    static const E_CELL_DIAGONAL DefaultDiagonal = TOP_LEFT_TO_BOTTOM_RIGHT;
-
     const bool4 invalidHeights = bool4(
         isnan(cellHeights.x),
         isnan(cellHeights.y),
@@ -140,15 +131,36 @@ E_CELL_DIAGONAL resolveGridDTMCellDiagonal(in float4 cellHeights)
         invalidHeightsCount += int(invalidHeights[i]);
 
     if (invalidHeightsCount == 0)
-        return DefaultDiagonal;
+    {
+        E_CELL_DIAGONAL a = getDiagonalModeFromCellCornerData(cellHeights.w);
+
+        if (a == TOP_LEFT_TO_BOTTOM_RIGHT)
+        {
+            uint32_t asdf = nbl::hlsl::bit_cast<uint32_t, float>(cellHeights.w);
+            printf("a %f %u", cellHeights.w, asdf);
+        }
+        else if (a == BOTTOM_LEFT_TO_TOP_RIGHT)
+        {
+            uint32_t asdf = nbl::hlsl::bit_cast<uint32_t, float>(cellHeights.w);
+            printf("b %f %u", cellHeights.w, asdf);
+        }
+        else
+        {
+            printf("wtf");
+        }
+
+        return getDiagonalModeFromCellCornerData(cellHeights.w);
+    }
 
     if (invalidHeightsCount > 1)
         return INVALID;
 
     if (invalidHeights.x || invalidHeights.z)
         return TOP_LEFT_TO_BOTTOM_RIGHT;
-    else
+    else if (invalidHeights.y || invalidHeights.w)
         return BOTTOM_LEFT_TO_TOP_RIGHT;
+
+    return INVALID;
 }
 
 [[vk::spvexecutionmode(spv::ExecutionModePixelInterlockOrderedEXT)]]
@@ -488,6 +500,11 @@ float4 fragMain(PSInput input) : SV_TARGET
                 const E_CELL_DIAGONAL cellDiagonal = resolveGridDTMCellDiagonal(cellHeights);
                 const bool diagonalFromTopLeftToBottomRight = cellDiagonal == E_CELL_DIAGONAL::TOP_LEFT_TO_BOTTOM_RIGHT;
 
+                /*if (!diagonalFromTopLeftToBottomRight)
+                    printf("a");
+                else
+                    printf("b");*/
+
                 if (cellDiagonal == E_CELL_DIAGONAL::INVALID)
                     discard;
 
diff --git a/62_CAD/shaders/main_pipeline/vertex_shader.hlsl b/62_CAD/shaders/main_pipeline/vertex_shader.hlsl
index 7f669f34b..11c8f8e22 100644
--- a/62_CAD/shaders/main_pipeline/vertex_shader.hlsl
+++ b/62_CAD/shaders/main_pipeline/vertex_shader.hlsl
@@ -654,7 +654,8 @@ PSInput main(uint vertexID : SV_VertexID)
             float thicknessOfTheThickestLine = vk::RawBufferLoad<float>(globals.pointers.geometryBuffer + drawObj.geometryAddress + 2 * sizeof(pfloat64_t2) + sizeof(uint32_t) + 2u * sizeof(float), 8u);
 
             // for testing purpose
-            thicknessOfTheThickestLine += 200.0f;
+            //thicknessOfTheThickestLine += 200.0f;
+            thicknessOfTheThickestLine = 0.0f;
 
             const float2 corner = float2(bool2(vertexIdx & 0x1u, vertexIdx >> 1));
             worldSpaceExtents.y = ieee754::flipSign(worldSpaceExtents.y);
@@ -704,7 +705,7 @@ PSInput main(uint vertexID : SV_VertexID)
 
             const float2 uv = corner + uvOffset;
             outV.setImageUV(uv);
-            printf("uv = { %f, %f } scale = { %f, %f }", _static_cast<float>(uv.x), _static_cast<float>(uv.y), _static_cast<float>(uvScale.x), _static_cast<float>(uvScale.y));
+            /*printf("uv = { %f, %f } scale = { %f, %f }", _static_cast<float>(uv.x), _static_cast<float>(uv.y), _static_cast<float>(uvScale.x), _static_cast<float>(uvScale.y));*/
 
             pfloat64_t2 topLeftToGridCenterVector = worldSpaceExtents * 0.5;
             topLeftToGridCenterVector.y = -topLeftToGridCenterVector.y;

From a5a21fd577fab8dae00995e5f1cefbc63f48cd5e Mon Sep 17 00:00:00 2001
From: keptsecret <sorchon@gmail.com>
Date: Mon, 9 Jun 2025 11:09:53 +0700
Subject: [PATCH 345/529] minor changes to passing subgroup size and items per
 wg

---
 .../app_resources/testSubgroup.comp.hlsl      |  8 +--
 .../app_resources/testWorkgroup.comp.hlsl     |  7 +--
 23_Arithmetic2UnitTest/main.cpp               | 60 ++++++++-----------
 .../benchmarkWorkgroup.comp.hlsl              |  2 +-
 4 files changed, 30 insertions(+), 47 deletions(-)

diff --git a/23_Arithmetic2UnitTest/app_resources/testSubgroup.comp.hlsl b/23_Arithmetic2UnitTest/app_resources/testSubgroup.comp.hlsl
index 8d8557ccd..2adb4dc81 100644
--- a/23_Arithmetic2UnitTest/app_resources/testSubgroup.comp.hlsl
+++ b/23_Arithmetic2UnitTest/app_resources/testSubgroup.comp.hlsl
@@ -16,8 +16,6 @@ uint32_t globalIndex()
     return glsl::gl_WorkGroupID().x*WORKGROUP_SIZE+workgroup::SubgroupContiguousIndex();
 }
 
-bool canStore() { return true; }
-
 template<class Binop, uint32_t N>
 static void subtest(NBL_CONST_REF_ARG(type_t) sourceVal)
 {
@@ -26,13 +24,13 @@ static void subtest(NBL_CONST_REF_ARG(type_t) sourceVal)
 
     const uint64_t outputBufAddr = vk::RawBufferLoad<uint64_t>(pc.ppOutputBuf + Binop::BindingIndex * sizeof(uint64_t), sizeof(uint64_t));
 
-    if (globalIndex()==0u)
+    if (glsl::gl_SubgroupSize()!=1u<<SUBGROUP_SIZE_LOG2)
         vk::RawBufferStore<uint32_t>(outputBufAddr, glsl::gl_SubgroupSize());
 
     operation_t<params_t> func;
     type_t val = func(sourceVal);
-    if (canStore())
-        vk::RawBufferStore<type_t>(outputBufAddr + sizeof(uint32_t) + sizeof(type_t) * globalIndex(), val, sizeof(uint32_t));
+
+    vk::RawBufferStore<type_t>(outputBufAddr + sizeof(uint32_t) + sizeof(type_t) * globalIndex(), val, sizeof(uint32_t));
 }
 
 type_t test()
diff --git a/23_Arithmetic2UnitTest/app_resources/testWorkgroup.comp.hlsl b/23_Arithmetic2UnitTest/app_resources/testWorkgroup.comp.hlsl
index a38124b0c..efaa25874 100644
--- a/23_Arithmetic2UnitTest/app_resources/testWorkgroup.comp.hlsl
+++ b/23_Arithmetic2UnitTest/app_resources/testWorkgroup.comp.hlsl
@@ -48,16 +48,11 @@ struct operation_t
 };
 
 
-uint32_t globalIndex()
-{
-    return glsl::gl_WorkGroupID().x*ITEMS_PER_WG+workgroup::SubgroupContiguousIndex();
-}
-
 template<class Binop>
 static void subtest()
 {
     uint64_t outputBufAddr = vk::RawBufferLoad<uint64_t>(pc.ppOutputBuf + Binop::BindingIndex * sizeof(uint64_t));
-    if (globalIndex()==0u)
+    if (glsl::gl_SubgroupSize()!=1u<<SUBGROUP_SIZE_LOG2)
         vk::RawBufferStore<uint32_t>(outputBufAddr, glsl::gl_SubgroupSize());
 
     operation_t<Binop,device_capabilities> func;
diff --git a/23_Arithmetic2UnitTest/main.cpp b/23_Arithmetic2UnitTest/main.cpp
index 326c9e57f..ad867bc92 100644
--- a/23_Arithmetic2UnitTest/main.cpp
+++ b/23_Arithmetic2UnitTest/main.cpp
@@ -216,16 +216,17 @@ class Workgroup2ScanTestApp final : public application_templates::BasicMultiQueu
 					for (uint32_t j = 0; j < ItemsPerInvocations.size(); j++)
 					{
 						const uint32_t itemsPerInvocation = ItemsPerInvocations[j];
+						uint32_t itemsPerWG = workgroupSize * itemsPerInvocation;
 						m_logger->log("Testing Items per Invocation %u", ILogger::ELL_INFO, itemsPerInvocation);
 						bool passed = true;
-						passed = runTest<emulatedReduction, false>(subgroupTestSource, elementCount, subgroupSizeLog2, workgroupSize, b_useNative, ~0u, itemsPerInvocation) && passed;
-						logTestOutcome(passed, workgroupSize);
-						passed = runTest<emulatedScanInclusive, false>(subgroupTestSource, elementCount, subgroupSizeLog2, workgroupSize, b_useNative, ~0u, itemsPerInvocation) && passed;
-						logTestOutcome(passed, workgroupSize);
-						passed = runTest<emulatedScanExclusive, false>(subgroupTestSource, elementCount, subgroupSizeLog2, workgroupSize, b_useNative, ~0u, itemsPerInvocation) && passed;
-						logTestOutcome(passed, workgroupSize);
-
-						const uint32_t itemsPerWG = calculateItemsPerWorkgroup(workgroupSize, subgroupSize, itemsPerInvocation);
+						passed = runTest<emulatedReduction, false>(subgroupTestSource, elementCount, subgroupSizeLog2, workgroupSize, b_useNative, itemsPerWG, itemsPerInvocation) && passed;
+						logTestOutcome(passed, itemsPerWG);
+						passed = runTest<emulatedScanInclusive, false>(subgroupTestSource, elementCount, subgroupSizeLog2, workgroupSize, b_useNative, itemsPerWG, itemsPerInvocation) && passed;
+						logTestOutcome(passed, itemsPerWG);
+						passed = runTest<emulatedScanExclusive, false>(subgroupTestSource, elementCount, subgroupSizeLog2, workgroupSize, b_useNative, itemsPerWG, itemsPerInvocation) && passed;
+						logTestOutcome(passed, itemsPerWG);
+
+						itemsPerWG = calculateItemsPerWorkgroup(workgroupSize, subgroupSize, itemsPerInvocation);
 						m_logger->log("Testing Item Count %u", ILogger::ELL_INFO, itemsPerWG);
 						passed = runTest<emulatedReduction, true>(workgroupTestSource, elementCount, subgroupSizeLog2, workgroupSize, b_useNative, itemsPerWG, itemsPerInvocation) && passed;
 						logTestOutcome(passed, itemsPerWG);
@@ -320,7 +321,7 @@ class Workgroup2ScanTestApp final : public application_templates::BasicMultiQueu
 	}
 
 	template<template<class> class Arithmetic, bool WorkgroupTest>
-	bool runTest(const smart_refctd_ptr<const ICPUShader>& source, const uint32_t elementCount, const uint8_t subgroupSizeLog2, const uint32_t workgroupSize, bool useNative, uint32_t itemsPerWG = ~0u, uint32_t itemsPerInvoc = 1u)
+	bool runTest(const smart_refctd_ptr<const ICPUShader>& source, const uint32_t elementCount, const uint8_t subgroupSizeLog2, const uint32_t workgroupSize, bool useNative, uint32_t itemsPerWG, uint32_t itemsPerInvoc = 1u)
 	{
 		std::string arith_name = Arithmetic<arithmetic::bit_xor<float>>::name;
 		const uint32_t workgroupSizeLog2 = hlsl::findMSB(workgroupSize);
@@ -398,15 +399,7 @@ class Workgroup2ScanTestApp final : public application_templates::BasicMultiQueu
 		auto pipeline = createPipeline(overriddenUnspecialized.get(),subgroupSizeLog2);
 
 		// TODO: overlap dispatches with memory readbacks (requires multiple copies of `buffers`)
-		uint32_t workgroupCount;
-		if constexpr (WorkgroupTest)
-			workgroupCount = elementCount / itemsPerWG;
-		else
-		{
-			itemsPerWG = workgroupSize;
-			workgroupCount = elementCount / (itemsPerWG * itemsPerInvoc);
-		}
-		workgroupCount = min(workgroupCount, m_physicalDevice->getLimits().maxComputeWorkGroupCount[0]);
+		uint32_t workgroupCount = min(elementCount / itemsPerWG, m_physicalDevice->getLimits().maxComputeWorkGroupCount[0]);
 
 		cmdbuf->begin(IGPUCommandBuffer::USAGE::NONE);
 		cmdbuf->bindComputePipeline(pipeline.get());
@@ -441,21 +434,22 @@ class Workgroup2ScanTestApp final : public application_templates::BasicMultiQueu
 		const ISemaphore::SWaitInfo wait[1] = {{.semaphore=sema.get(),.value=timelineValue}};
 		m_device->blockForSemaphores(wait);
 
+		const uint32_t subgroupSize = 1u << subgroupSizeLog2;
 		// check results
-		bool passed = validateResults<Arithmetic, arithmetic::bit_and<uint32_t>, WorkgroupTest>(itemsPerWG, workgroupCount, itemsPerInvoc);
-		passed = validateResults<Arithmetic, arithmetic::bit_xor<uint32_t>, WorkgroupTest>(itemsPerWG, workgroupCount, itemsPerInvoc) && passed;
-		passed = validateResults<Arithmetic, arithmetic::bit_or<uint32_t>, WorkgroupTest>(itemsPerWG, workgroupCount, itemsPerInvoc) && passed;
-		passed = validateResults<Arithmetic, arithmetic::plus<uint32_t>, WorkgroupTest>(itemsPerWG, workgroupCount, itemsPerInvoc) && passed;
-		passed = validateResults<Arithmetic, arithmetic::multiplies<uint32_t>, WorkgroupTest>(itemsPerWG, workgroupCount, itemsPerInvoc) && passed;
-		passed = validateResults<Arithmetic, arithmetic::minimum<uint32_t>, WorkgroupTest>(itemsPerWG, workgroupCount, itemsPerInvoc) && passed;
-		passed = validateResults<Arithmetic, arithmetic::maximum<uint32_t>, WorkgroupTest>(itemsPerWG, workgroupCount, itemsPerInvoc) && passed;
+		bool passed = validateResults<Arithmetic, arithmetic::bit_and<uint32_t>, WorkgroupTest>(itemsPerWG, workgroupCount, subgroupSize, itemsPerInvoc);
+		passed = validateResults<Arithmetic, arithmetic::bit_xor<uint32_t>, WorkgroupTest>(itemsPerWG, workgroupCount, subgroupSize, itemsPerInvoc) && passed;
+		passed = validateResults<Arithmetic, arithmetic::bit_or<uint32_t>, WorkgroupTest>(itemsPerWG, workgroupCount, subgroupSize, itemsPerInvoc) && passed;
+		passed = validateResults<Arithmetic, arithmetic::plus<uint32_t>, WorkgroupTest>(itemsPerWG, workgroupCount, subgroupSize, itemsPerInvoc) && passed;
+		passed = validateResults<Arithmetic, arithmetic::multiplies<uint32_t>, WorkgroupTest>(itemsPerWG, workgroupCount, subgroupSize, itemsPerInvoc) && passed;
+		passed = validateResults<Arithmetic, arithmetic::minimum<uint32_t>, WorkgroupTest>(itemsPerWG, workgroupCount, subgroupSize, itemsPerInvoc) && passed;
+		passed = validateResults<Arithmetic, arithmetic::maximum<uint32_t>, WorkgroupTest>(itemsPerWG, workgroupCount, subgroupSize, itemsPerInvoc) && passed;
 
 		return passed;
 	}
 
 	//returns true if result matches
 	template<template<class> class Arithmetic, class Binop, bool WorkgroupTest>
-	bool validateResults(const uint32_t itemsPerWG, const uint32_t workgroupCount, const uint32_t itemsPerInvoc)
+	bool validateResults(const uint32_t itemsPerWG, const uint32_t workgroupCount, const uint32_t subgroupSize, const uint32_t itemsPerInvoc)
 	{
 		bool success = true;
 
@@ -465,7 +459,6 @@ class Workgroup2ScanTestApp final : public application_templates::BasicMultiQueu
 
 		using type_t = typename Binop::type_t;
 		const auto dataFromBuffer = reinterpret_cast<const uint32_t*>(resultsBuffer->getPointer());
-		const auto subgroupSize = dataFromBuffer[0];
 		if (subgroupSize<nbl::hlsl::subgroup::MinSubgroupSize || subgroupSize>nbl::hlsl::subgroup::MaxSubgroupSize)
 		{
 			m_logger->log("Unexpected Subgroup Size %u", ILogger::ELL_ERROR, subgroupSize);
@@ -475,11 +468,7 @@ class Workgroup2ScanTestApp final : public application_templates::BasicMultiQueu
 		const auto testData = reinterpret_cast<const type_t*>(dataFromBuffer + 1);
 		// TODO: parallel for (the temporary values need to be threadlocal or what?)
 		// now check if the data obtained has valid values
-		type_t* tmp;
-		if constexpr (WorkgroupTest)
-			tmp = new type_t[itemsPerWG];
-		else
-			tmp = new type_t[itemsPerWG * itemsPerInvoc];
+		type_t* tmp = new type_t[itemsPerWG];
 		for (uint32_t workgroupID = 0u; success && workgroupID < workgroupCount; workgroupID++)
 		{
 			if constexpr (WorkgroupTest)
@@ -506,11 +495,12 @@ class Workgroup2ScanTestApp final : public application_templates::BasicMultiQueu
 			}
 			else
 			{
-				const auto workgroupOffset = workgroupID * itemsPerWG * itemsPerInvoc;
-				for (uint32_t pseudoSubgroupID = 0u; pseudoSubgroupID < itemsPerWG; pseudoSubgroupID += subgroupSize)
+				const auto workgroupOffset = workgroupID * itemsPerWG;
+				const auto workgroupSize = itemsPerWG / itemsPerInvoc;
+				for (uint32_t pseudoSubgroupID = 0u; pseudoSubgroupID < workgroupSize; pseudoSubgroupID += subgroupSize)
 					Arithmetic<Binop>::impl(tmp + pseudoSubgroupID * itemsPerInvoc, inputData + workgroupOffset + pseudoSubgroupID * itemsPerInvoc, subgroupSize * itemsPerInvoc);
 
-				for (uint32_t localInvocationIndex = 0u; localInvocationIndex < itemsPerWG; localInvocationIndex++)
+				for (uint32_t localInvocationIndex = 0u; localInvocationIndex < workgroupSize; localInvocationIndex++)
 				{
 					const auto localOffset = localInvocationIndex * itemsPerInvoc;
 					const auto globalInvocationIndex = workgroupOffset + localOffset;
diff --git a/29_Arithmetic2Bench/app_resources/benchmarkWorkgroup.comp.hlsl b/29_Arithmetic2Bench/app_resources/benchmarkWorkgroup.comp.hlsl
index 58a3624cd..72a42f9a1 100644
--- a/29_Arithmetic2Bench/app_resources/benchmarkWorkgroup.comp.hlsl
+++ b/29_Arithmetic2Bench/app_resources/benchmarkWorkgroup.comp.hlsl
@@ -15,7 +15,7 @@ using config_t = workgroup2::ArithmeticConfiguration<WORKGROUP_SIZE_LOG2, SUBGRO
 typedef vector<uint32_t, config_t::ItemsPerInvocation_0> type_t;
 
 // final (level 1/2) scan needs to fit in one subgroup exactly
-groupshared uint32_t scratch[config_t::SharedScratchElementCount];
+groupshared uint32_t scratch[mpl::max_v<int16_t,config_t::SharedScratchElementCount,1>];
 
 #include "../../common/include/WorkgroupDataAccessors.hlsl"
 

From 1710b698621796aa767edf7bc940e55e6758c2a8 Mon Sep 17 00:00:00 2001
From: keptsecret <sorchon@gmail.com>
Date: Mon, 9 Jun 2025 12:20:43 +0700
Subject: [PATCH 346/529] push constant stores array of output addresses
 directly because static addressing

---
 .../app_resources/common.hlsl                 |  4 ++--
 .../app_resources/testSubgroup.comp.hlsl      |  2 +-
 .../app_resources/testWorkgroup.comp.hlsl     |  3 +--
 23_Arithmetic2UnitTest/main.cpp               | 21 ++-----------------
 .../app_resources/benchmarkSubgroup.comp.hlsl |  2 +-
 .../benchmarkWorkgroup.comp.hlsl              |  2 --
 29_Arithmetic2Bench/app_resources/common.hlsl |  4 ++--
 29_Arithmetic2Bench/main.cpp                  | 17 +++------------
 common/include/WorkgroupDataAccessors.hlsl    |  2 +-
 9 files changed, 13 insertions(+), 44 deletions(-)

diff --git a/23_Arithmetic2UnitTest/app_resources/common.hlsl b/23_Arithmetic2UnitTest/app_resources/common.hlsl
index ddf5dc00f..6654645cf 100644
--- a/23_Arithmetic2UnitTest/app_resources/common.hlsl
+++ b/23_Arithmetic2UnitTest/app_resources/common.hlsl
@@ -4,7 +4,7 @@
 struct PushConstantData
 {
     uint64_t pInputBuf;
-    uint64_t ppOutputBuf;
+    uint64_t pOutputBuf[8];
 };
 
 namespace arithmetic
@@ -93,4 +93,4 @@ struct ballot : nbl::hlsl::plus<T>
 };
 }
 
-#include "nbl/builtin/hlsl/subgroup/basic.hlsl"
\ No newline at end of file
+#include "nbl/builtin/hlsl/glsl_compat/subgroup_basic.hlsl"
diff --git a/23_Arithmetic2UnitTest/app_resources/testSubgroup.comp.hlsl b/23_Arithmetic2UnitTest/app_resources/testSubgroup.comp.hlsl
index 2adb4dc81..6cd496648 100644
--- a/23_Arithmetic2UnitTest/app_resources/testSubgroup.comp.hlsl
+++ b/23_Arithmetic2UnitTest/app_resources/testSubgroup.comp.hlsl
@@ -22,7 +22,7 @@ static void subtest(NBL_CONST_REF_ARG(type_t) sourceVal)
     using config_t = subgroup2::Configuration<SUBGROUP_SIZE_LOG2>;
     using params_t = subgroup2::ArithmeticParams<config_t, typename Binop::base_t, N, device_capabilities>;
 
-    const uint64_t outputBufAddr = vk::RawBufferLoad<uint64_t>(pc.ppOutputBuf + Binop::BindingIndex * sizeof(uint64_t), sizeof(uint64_t));
+    const uint64_t outputBufAddr = pc.pOutputBuf[Binop::BindingIndex];
 
     if (glsl::gl_SubgroupSize()!=1u<<SUBGROUP_SIZE_LOG2)
         vk::RawBufferStore<uint32_t>(outputBufAddr, glsl::gl_SubgroupSize());
diff --git a/23_Arithmetic2UnitTest/app_resources/testWorkgroup.comp.hlsl b/23_Arithmetic2UnitTest/app_resources/testWorkgroup.comp.hlsl
index efaa25874..97ff31481 100644
--- a/23_Arithmetic2UnitTest/app_resources/testWorkgroup.comp.hlsl
+++ b/23_Arithmetic2UnitTest/app_resources/testWorkgroup.comp.hlsl
@@ -51,9 +51,8 @@ struct operation_t
 template<class Binop>
 static void subtest()
 {
-    uint64_t outputBufAddr = vk::RawBufferLoad<uint64_t>(pc.ppOutputBuf + Binop::BindingIndex * sizeof(uint64_t));
     if (glsl::gl_SubgroupSize()!=1u<<SUBGROUP_SIZE_LOG2)
-        vk::RawBufferStore<uint32_t>(outputBufAddr, glsl::gl_SubgroupSize());
+        vk::RawBufferStore<uint32_t>(pc.pOutputBuf[Binop::BindingIndex], glsl::gl_SubgroupSize());
 
     operation_t<Binop,device_capabilities> func;
     func();
diff --git a/23_Arithmetic2UnitTest/main.cpp b/23_Arithmetic2UnitTest/main.cpp
index ad867bc92..85d6e610f 100644
--- a/23_Arithmetic2UnitTest/main.cpp
+++ b/23_Arithmetic2UnitTest/main.cpp
@@ -99,21 +99,9 @@ class Workgroup2ScanTestApp final : public application_templates::BasicMultiQueu
 			auto bufferMem = m_device->allocate(mreq, outputBuffers[i].get(), IDeviceMemoryAllocation::EMAF_DEVICE_ADDRESS_BIT);
 			assert(bufferMem.isValid());
 		}
-
-		// create buffer to store BDA of output buffers
-		smart_refctd_ptr<IGPUBuffer> gpuOutputAddressesBuffer;
-		{
-			std::array<uint64_t, OutputBufferCount> outputAddresses;
-			for (uint32_t i = 0; i < OutputBufferCount; i++)
-				outputAddresses[i] = outputBuffers[i]->getDeviceAddress();
-
-			IGPUBuffer::SCreationParams params;
-			params.usage = IGPUBuffer::EUF_STORAGE_BUFFER_BIT | IGPUBuffer::EUF_TRANSFER_DST_BIT | IGPUBuffer::EUF_INLINE_UPDATE_VIA_CMDBUF | IGPUBuffer::EUF_SHADER_DEVICE_ADDRESS_BIT;
-			params.size = OutputBufferCount * sizeof(uint64_t);
-			m_utils->createFilledDeviceLocalBufferOnDedMem(SIntendedSubmitInfo{ .queue = getTransferUpQueue() }, std::move(params), outputAddresses.data()).move_into(gpuOutputAddressesBuffer);
-		}
 		pc.pInputBuf = gpuinputDataBuffer->getDeviceAddress();
-		pc.ppOutputBuf = gpuOutputAddressesBuffer->getDeviceAddress();
+		for (uint32_t i = 0; i < OutputBufferCount; i++)
+			pc.pOutputBuf[i] = outputBuffers[i]->getDeviceAddress();
 
 		// create Pipeline Layout
 		{
@@ -459,11 +447,6 @@ class Workgroup2ScanTestApp final : public application_templates::BasicMultiQueu
 
 		using type_t = typename Binop::type_t;
 		const auto dataFromBuffer = reinterpret_cast<const uint32_t*>(resultsBuffer->getPointer());
-		if (subgroupSize<nbl::hlsl::subgroup::MinSubgroupSize || subgroupSize>nbl::hlsl::subgroup::MaxSubgroupSize)
-		{
-			m_logger->log("Unexpected Subgroup Size %u", ILogger::ELL_ERROR, subgroupSize);
-			return false;
-		}
 
 		const auto testData = reinterpret_cast<const type_t*>(dataFromBuffer + 1);
 		// TODO: parallel for (the temporary values need to be threadlocal or what?)
diff --git a/29_Arithmetic2Bench/app_resources/benchmarkSubgroup.comp.hlsl b/29_Arithmetic2Bench/app_resources/benchmarkSubgroup.comp.hlsl
index ba11890d1..2da7de38f 100644
--- a/29_Arithmetic2Bench/app_resources/benchmarkSubgroup.comp.hlsl
+++ b/29_Arithmetic2Bench/app_resources/benchmarkSubgroup.comp.hlsl
@@ -24,7 +24,7 @@ static void subbench(NBL_CONST_REF_ARG(type_t) sourceVal)
     using params_t = subgroup2::ArithmeticParams<config_t, typename Binop::base_t, N, device_capabilities>;
     type_t value = sourceVal;
 
-    const uint64_t outputBufAddr = vk::RawBufferLoad<uint64_t>(pc.ppOutputBuf + Binop::BindingIndex * sizeof(uint64_t), sizeof(uint64_t));
+    const uint64_t outputBufAddr = pc.pOutputBuf[Binop::BindingIndex];
 
     operation_t<params_t> func;
     // [unroll]
diff --git a/29_Arithmetic2Bench/app_resources/benchmarkWorkgroup.comp.hlsl b/29_Arithmetic2Bench/app_resources/benchmarkWorkgroup.comp.hlsl
index 72a42f9a1..ad861a30d 100644
--- a/29_Arithmetic2Bench/app_resources/benchmarkWorkgroup.comp.hlsl
+++ b/29_Arithmetic2Bench/app_resources/benchmarkWorkgroup.comp.hlsl
@@ -99,8 +99,6 @@ struct operation_t
 template<class Binop>
 static void subbench()
 {
-    const uint64_t outputBufAddr = vk::RawBufferLoad<uint64_t>(pc.ppOutputBuf + Binop::BindingIndex * sizeof(uint64_t), sizeof(uint64_t));
-
     RandomizedInputDataProxy<config_t,Binop> dataAccessor = RandomizedInputDataProxy<config_t,Binop>::create();
     dataAccessor.preload();
 
diff --git a/29_Arithmetic2Bench/app_resources/common.hlsl b/29_Arithmetic2Bench/app_resources/common.hlsl
index 388be324f..0cdcd7dad 100644
--- a/29_Arithmetic2Bench/app_resources/common.hlsl
+++ b/29_Arithmetic2Bench/app_resources/common.hlsl
@@ -4,7 +4,7 @@
 struct PushConstantData
 {
     uint64_t pInputBuf;
-    uint64_t ppOutputBuf;
+    uint64_t pOutputBuf[2];
 };
 
 namespace arithmetic
@@ -32,4 +32,4 @@ struct ballot : nbl::hlsl::plus<T>
 };
 }
 
-#include "nbl/builtin/hlsl/subgroup/basic.hlsl"
+#include "nbl/builtin/hlsl/glsl_compat/subgroup_basic.hlsl"
diff --git a/29_Arithmetic2Bench/main.cpp b/29_Arithmetic2Bench/main.cpp
index 38f995264..d317f07df 100644
--- a/29_Arithmetic2Bench/main.cpp
+++ b/29_Arithmetic2Bench/main.cpp
@@ -246,7 +246,7 @@ class ArithmeticBenchApp final : public examples::SimpleWindowedApplication, pub
 		transferDownQueue = getTransferDownQueue();
 		computeQueue = getComputeQueue();
 
-		// create 8 buffers for 8 operations
+		// create 2 buffers for 2 operations
 		for (auto i=0u; i<OutputBufferCount; i++)
 		{
 			IGPUBuffer::SCreationParams params = {};
@@ -261,19 +261,8 @@ class ArithmeticBenchApp final : public examples::SimpleWindowedApplication, pub
 			auto bufferMem = m_device->allocate(mreq, outputBuffers[i].get(), IDeviceMemoryAllocation::EMAF_DEVICE_ADDRESS_BIT);
 			assert(bufferMem.isValid());
 		}
-
-		// create buffer to store BDA of output buffers
-		{
-			std::array<uint64_t, OutputBufferCount> outputAddresses;
-			for (uint32_t i = 0; i < OutputBufferCount; i++)
-				outputAddresses[i] = outputBuffers[i]->getDeviceAddress();
-
-			IGPUBuffer::SCreationParams params;
-			params.usage = IGPUBuffer::EUF_STORAGE_BUFFER_BIT | IGPUBuffer::EUF_TRANSFER_DST_BIT | IGPUBuffer::EUF_INLINE_UPDATE_VIA_CMDBUF | IGPUBuffer::EUF_SHADER_DEVICE_ADDRESS_BIT;
-			params.size = OutputBufferCount * sizeof(uint64_t);
-			m_utils->createFilledDeviceLocalBufferOnDedMem(SIntendedSubmitInfo{ .queue = getTransferUpQueue() }, std::move(params), outputAddresses.data()).move_into(gpuOutputAddressesBuffer);
-		}
-		pc.ppOutputBuf = gpuOutputAddressesBuffer->getDeviceAddress();
+		for (auto i = 0u; i < OutputBufferCount; i++)
+			pc.pOutputBuf[i] = outputBuffers[i]->getDeviceAddress();
 
 		// create image views for swapchain images
 		for (uint32_t i = 0; i < ISwapchain::MaxImages; i++)
diff --git a/common/include/WorkgroupDataAccessors.hlsl b/common/include/WorkgroupDataAccessors.hlsl
index e6112f797..6beadfbc9 100644
--- a/common/include/WorkgroupDataAccessors.hlsl
+++ b/common/include/WorkgroupDataAccessors.hlsl
@@ -40,7 +40,7 @@ struct DataProxy
     {
         DataProxy<Config, Binop> retval;
         retval.workgroupOffset = glsl::gl_WorkGroupID().x * Config::VirtualWorkgroupSize;
-        retval.outputBufAddr = sizeof(uint32_t) + vk::RawBufferLoad<uint64_t>(pc.ppOutputBuf + Binop::BindingIndex * sizeof(uint64_t));
+        retval.outputBufAddr = sizeof(uint32_t) + pc.pOutputBuf[Binop::BindingIndex];
         return retval;
     }
 

From 47dec6df8d84c2e60e0f2df813aa32eedcea4ae7 Mon Sep 17 00:00:00 2001
From: Erfan Ahmadi <ahmadierfan99@gmail.com>
Date: Mon, 9 Jun 2025 12:39:09 +0400
Subject: [PATCH 347/529] IUtilities constructor to static create function

---
 05_StreamingAndBufferDeviceAddressApp/main.cpp | 4 ++--
 11_FFT/main.cpp                                | 2 +-
 28_FFTBloom/main.cpp                           | 2 +-
 old_to_refactor/20_Megatexture/main.cpp        | 2 +-
 4 files changed, 5 insertions(+), 5 deletions(-)

diff --git a/05_StreamingAndBufferDeviceAddressApp/main.cpp b/05_StreamingAndBufferDeviceAddressApp/main.cpp
index e8f7dbd33..7fa72235b 100644
--- a/05_StreamingAndBufferDeviceAddressApp/main.cpp
+++ b/05_StreamingAndBufferDeviceAddressApp/main.cpp
@@ -117,8 +117,8 @@ class StreamingAndBufferDeviceAddressApp final : public application_templates::M
 			// `CAsyncSingleBufferSubAllocator` just allows you suballocate subranges of any `IGPUBuffer` range with deferred/latched frees.
 			constexpr uint32_t DownstreamBufferSize = sizeof(output_t)<<23;
 			constexpr uint32_t UpstreamBufferSize = sizeof(input_t)<<23;
-
-			m_utils = make_smart_refctd_ptr<IUtilities>(smart_refctd_ptr(m_device),smart_refctd_ptr(m_logger),DownstreamBufferSize,UpstreamBufferSize);
+			
+			m_utils = IUtilities::create(smart_refctd_ptr(m_device),smart_refctd_ptr(m_logger),DownstreamBufferSize,UpstreamBufferSize);
 			if (!m_utils)
 				return logFail("Failed to create Utilities!");
 			m_upStreamingBuffer = m_utils->getDefaultUpStreamingBuffer();
diff --git a/11_FFT/main.cpp b/11_FFT/main.cpp
index 80f5f856c..b10efbf31 100644
--- a/11_FFT/main.cpp
+++ b/11_FFT/main.cpp
@@ -96,7 +96,7 @@ class FFT_Test final : public application_templates::MonoDeviceApplication, publ
 		constexpr uint32_t DownstreamBufferSize = sizeof(scalar_t) << 23;
 		constexpr uint32_t UpstreamBufferSize = sizeof(scalar_t) << 23;
 
-		m_utils = make_smart_refctd_ptr<IUtilities>(smart_refctd_ptr(m_device), smart_refctd_ptr(m_logger), DownstreamBufferSize, UpstreamBufferSize);
+		m_utils = IUtilities::create(smart_refctd_ptr(m_device), smart_refctd_ptr(m_logger), DownstreamBufferSize, UpstreamBufferSize);
 		if (!m_utils)
 			return logFail("Failed to create Utilities!");
 		m_upStreamingBuffer = m_utils->getDefaultUpStreamingBuffer();
diff --git a/28_FFTBloom/main.cpp b/28_FFTBloom/main.cpp
index cc312c3be..fddb45586 100644
--- a/28_FFTBloom/main.cpp
+++ b/28_FFTBloom/main.cpp
@@ -461,7 +461,7 @@ class FFTBloomApp final : public examples::SimpleWindowedApplication, public app
 			assert(m_kerImageView);
 
 			// Going to need an IUtils to perform uploads/downloads
-			m_utils = make_smart_refctd_ptr<IUtilities>(smart_refctd_ptr(m_device), smart_refctd_ptr(m_logger));
+			m_utils = IUtilities::create(smart_refctd_ptr(m_device), smart_refctd_ptr(m_logger));
 
 			// Now convert uploads
 			// Get graphics queue for image transfer
diff --git a/old_to_refactor/20_Megatexture/main.cpp b/old_to_refactor/20_Megatexture/main.cpp
index 35d0692af..5c309ff24 100644
--- a/old_to_refactor/20_Megatexture/main.cpp
+++ b/old_to_refactor/20_Megatexture/main.cpp
@@ -684,7 +684,7 @@ APP_CONSTRUCTOR(MegaTextureApp)
             video::IGPUBuffer::SCreationParams bufferCreationParams;
             bufferCreationParams.usage = asset::IBuffer::EUF_STORAGE_BUFFER_BIT;
             bufferCreationParams.size = sizeof(video::IGPUVirtualTexture::SPrecomputedData);
-            core::smart_refctd_ptr<video::IUtilities> utilities = core::make_smart_refctd_ptr<video::IUtilities>(core::smart_refctd_ptr(logicalDevice));
+            core::smart_refctd_ptr<video::IUtilities> utilities = video::IUtilities::create(core::smart_refctd_ptr(logicalDevice));
             core::smart_refctd_ptr<video::IGPUBuffer> buffer = utilities->createFilledDeviceLocalBufferOnDedMem(queues[CommonAPI::InitOutput::EQT_TRANSFER_UP], std::move(bufferCreationParams), &gpuvt->getPrecomputedData());
 
             {

From 07774344371d182cbcbc716928375cca29356521 Mon Sep 17 00:00:00 2001
From: Erfan Ahmadi <ahmadierfan99@gmail.com>
Date: Mon, 9 Jun 2025 14:04:11 +0400
Subject: [PATCH 348/529] allocateDrawResourcesWithinAvailableVRAM

---
 62_CAD/DrawResourcesFiller.cpp | 160 ++++++++++++++++++++++++---------
 62_CAD/DrawResourcesFiller.h   |  46 ++++++++--
 62_CAD/main.cpp                |   5 +-
 3 files changed, 157 insertions(+), 54 deletions(-)

diff --git a/62_CAD/DrawResourcesFiller.cpp b/62_CAD/DrawResourcesFiller.cpp
index 517334ad9..1d0eaaf16 100644
--- a/62_CAD/DrawResourcesFiller.cpp
+++ b/62_CAD/DrawResourcesFiller.cpp
@@ -25,66 +25,131 @@ void DrawResourcesFiller::setTexturesDescriptorSetAndBinding(core::smart_refctd_
 	suballocatedDescriptorSet = core::make_smart_refctd_ptr<SubAllocatedDescriptorSet>(std::move(descriptorSet));
 }
 
-void DrawResourcesFiller::allocateResourcesBuffer(ILogicalDevice* logicalDevice, size_t size)
-{
-	// TODO: Make this function failable and report insufficient memory if less that getMinimumRequiredResourcesBufferSize, TODO: Have retry mechanism to allocate less mem
-	// TODO: Allocate buffer memory and image memory with 1 allocation, so that failure and retries are more straightforward.
-	size = core::alignUp(size, ResourcesMaxNaturalAlignment);
-	size = core::max(size, getMinimumRequiredResourcesBufferSize());
-	// size = 368u; STRESS TEST
-	IGPUBuffer::SCreationParams geometryCreationParams = {};
-	geometryCreationParams.size = size;
-	geometryCreationParams.usage = bitflag(IGPUBuffer::EUF_SHADER_DEVICE_ADDRESS_BIT) | IGPUBuffer::EUF_TRANSFER_DST_BIT | IGPUBuffer::EUF_INDEX_BUFFER_BIT;
-	resourcesGPUBuffer = logicalDevice->createBuffer(std::move(geometryCreationParams));
+bool DrawResourcesFiller::allocateDrawResources(ILogicalDevice* logicalDevice, size_t requiredImageMemorySize, size_t requiredBufferMemorySize)
+{
+	// single memory allocation sectioned into images+buffers (images start at offset=0)
+	const size_t adjustedImagesMemorySize = core::alignUp(requiredImageMemorySize, GPUStructsMaxNaturalAlignment);
+	const size_t adjustedBuffersMemorySize = core::max(requiredBufferMemorySize, getMinimumRequiredResourcesBufferSize());
+	const size_t totalResourcesSize = adjustedImagesMemorySize + adjustedBuffersMemorySize;
+
+	IGPUBuffer::SCreationParams resourcesBufferCreationParams = {};
+	resourcesBufferCreationParams.size = adjustedBuffersMemorySize;
+	resourcesBufferCreationParams.usage = bitflag(IGPUBuffer::EUF_SHADER_DEVICE_ADDRESS_BIT) | IGPUBuffer::EUF_TRANSFER_DST_BIT | IGPUBuffer::EUF_INDEX_BUFFER_BIT;
+	resourcesGPUBuffer = logicalDevice->createBuffer(std::move(resourcesBufferCreationParams));
 	resourcesGPUBuffer->setObjectDebugName("drawResourcesBuffer");
 
 	IDeviceMemoryBacked::SDeviceMemoryRequirements memReq = resourcesGPUBuffer->getMemoryReqs();
-	memReq.memoryTypeBits &= logicalDevice->getPhysicalDevice()->getDeviceLocalMemoryTypeBits();
-	auto mem = logicalDevice->allocate(memReq, resourcesGPUBuffer.get(), IDeviceMemoryAllocation::EMAF_DEVICE_ADDRESS_BIT);
+	
+	nbl::video::IDeviceMemoryBacked::SDeviceMemoryRequirements gpuBufferMemoryReqs = resourcesGPUBuffer->getMemoryReqs();
+	const bool memoryRequirementsMatch =
+		(logicalDevice->getPhysicalDevice()->getDeviceLocalMemoryTypeBits() & gpuBufferMemoryReqs.memoryTypeBits) != 0 && // should have device local memory compatible
+		(gpuBufferMemoryReqs.requiresDedicatedAllocation == false); // should not require dedicated allocation
+
+	if (!memoryRequirementsMatch)
+	{
+		m_logger.log("Shouldn't happen: Buffer Memory Requires Dedicated Allocation or can't biind to device local memory.", nbl::system::ILogger::ELL_ERROR);
+		return false;
+	}
+	
+	const auto& memoryProperties = logicalDevice->getPhysicalDevice()->getMemoryProperties();
+
+	uint32_t memoryTypeIdx = ~0u;
 
-	// Allocate for Images  
+	video::IDeviceMemoryAllocator::SAllocation allocation = {};
+	for (uint32_t i = 0u; i < memoryProperties.memoryTypeCount; ++i)
 	{
-		const auto& memoryProperties = logicalDevice->getPhysicalDevice()->getMemoryProperties();
-		uint32_t memoryTypeIdx = ~0u;
-		for (uint32_t i = 0u; i < memoryProperties.memoryTypeCount; ++i)
+		if (memoryProperties.memoryTypes[i].propertyFlags.hasFlags(IDeviceMemoryAllocation::EMPF_DEVICE_LOCAL_BIT))
 		{
-			if (memoryProperties.memoryTypes[i].propertyFlags.hasFlags(IDeviceMemoryAllocation::EMPF_DEVICE_LOCAL_BIT))
+			memoryTypeIdx = i;
+
+			IDeviceMemoryAllocator::SAllocateInfo allocationInfo =
 			{
-				memoryTypeIdx = i;
+				.size = totalResourcesSize,
+				.flags = IDeviceMemoryAllocation::E_MEMORY_ALLOCATE_FLAGS::EMAF_DEVICE_ADDRESS_BIT, // for the buffers
+				.memoryTypeIndex = memoryTypeIdx,
+				.dedication = nullptr,
+			};
+
+			allocation = logicalDevice->allocate(allocationInfo);
+			
+			if (allocation.isValid())
 				break;
-			}
 		}
+	}
 
-		if (memoryTypeIdx == ~0u)
-		{
-			m_logger.log("allocateResourcesBuffer: no device local memory type found.", nbl::system::ILogger::ELL_ERROR);
-			assert(false);
-		}
+	if (memoryTypeIdx == ~0u)
+	{
+		m_logger.log("allocateResourcesBuffer: no device local memory type found!", nbl::system::ILogger::ELL_ERROR);
+		return false;
+	}
 
-		IDeviceMemoryAllocator::SAllocateInfo allocationInfo =
-		{
-			// TODO: Get from user side.
-			.size = 65 * 1024 * 1024, // 70 MB
-			.flags = IDeviceMemoryAllocation::E_MEMORY_ALLOCATE_FLAGS::EMAF_NONE,
-			.memoryTypeIndex = memoryTypeIdx,
-			.dedication = nullptr,
-		};
-		imagesMemoryArena = logicalDevice->allocate(allocationInfo);
+	if (!allocation.isValid())
+		return false;
 
-		if (imagesMemoryArena.isValid())
-		{
-			imagesMemorySubAllocator = core::make_smart_refctd_ptr<ImagesMemorySubAllocator>(static_cast<uint64_t>(allocationInfo.size));
-		}
-		else
-		{
-			m_logger.log("failure to allocate memory arena for images", nbl::system::ILogger::ELL_ERROR);
-			assert(false);
+	imagesMemoryArena = {
+		.memory = allocation.memory,
+		.offset = allocation.offset,
+	};
+
+	buffersMemoryArena = {
+		.memory = allocation.memory,
+		.offset = core::alignUp(allocation.offset + adjustedImagesMemorySize, GPUStructsMaxNaturalAlignment), // first natural alignment after images section of the memory allocation
+	};
+
+	imagesMemorySubAllocator = core::make_smart_refctd_ptr<ImagesMemorySubAllocator>(adjustedImagesMemorySize);
+
+	video::ILogicalDevice::SBindBufferMemoryInfo bindBufferMemory = {
+		.buffer = resourcesGPUBuffer.get(),
+		.binding = {
+			.memory = buffersMemoryArena.memory.get(),
+			.offset  = buffersMemoryArena.offset,
 		}
+	};
+
+	if (!logicalDevice->bindBufferMemory(1, &bindBufferMemory))
+	{
+		m_logger.log("DrawResourcesFiller::allocateDrawResources, bindBufferMemory failed.", nbl::system::ILogger::ELL_ERROR);
+		return false;
 	}
 
+	return true;
 }
 
-void DrawResourcesFiller::allocateMSDFTextures(ILogicalDevice* logicalDevice, uint32_t maxMSDFs, uint32_t2 msdfsExtent)
+bool DrawResourcesFiller::allocateDrawResourcesWithinAvailableVRAM(ILogicalDevice* logicalDevice, size_t maxImageMemorySize, size_t maxBufferMemorySize, uint32_t reductionPercent, uint32_t maxTries)
+{
+	const size_t minimumAcceptableSize = core::max(MinimumDrawResourcesMemorySize, getMinimumRequiredResourcesBufferSize());
+
+	size_t currentBufferSize = maxBufferMemorySize;
+	size_t currentImageSize = maxImageMemorySize;
+	const size_t totalInitialSize = currentBufferSize + currentImageSize;
+
+	// If initial size is less than minimum acceptable then increase the buffer and image size to sum up to minimumAcceptableSize with image:buffer ratios preserved
+	if (totalInitialSize < minimumAcceptableSize)
+	{
+		// Preserve ratio: R = buffer / (buffer + image)
+		// scaleFactor = minimumAcceptableSize / totalInitialSize;
+		const double scaleFactor = static_cast<double>(minimumAcceptableSize) / totalInitialSize;
+		currentBufferSize = static_cast<size_t>(currentBufferSize * scaleFactor);
+		currentImageSize = minimumAcceptableSize - currentBufferSize; // ensures exact sum
+	}
+
+	uint32_t numTries = 0u;
+	while ((currentBufferSize + currentImageSize) >= minimumAcceptableSize && numTries < maxTries)
+	{
+		if (allocateDrawResources(logicalDevice, currentBufferSize, currentImageSize))
+			return true;
+
+		currentBufferSize = (currentBufferSize * (100 - reductionPercent)) / 100;
+		currentImageSize = (currentImageSize * (100 - reductionPercent)) / 100;
+		numTries++;
+		m_logger.log("Allocation of memory for images(%zu) and buffers(%zu) failed; Reducing allocation size by %u%% and retrying...", system::ILogger::ELL_WARNING, currentImageSize, currentBufferSize, reductionPercent);
+	}
+
+	m_logger.log("All attempts to allocate memory for images(%zu) and buffers(%zu) failed.", system::ILogger::ELL_ERROR, currentImageSize, currentBufferSize);
+	return false;
+}
+
+bool DrawResourcesFiller::allocateMSDFTextures(ILogicalDevice* logicalDevice, uint32_t maxMSDFs, uint32_t2 msdfsExtent)
 {
 	// TODO: Make this function failable and report insufficient memory
 	asset::E_FORMAT msdfFormat = MSDFTextureFormat;
@@ -116,7 +181,10 @@ void DrawResourcesFiller::allocateMSDFTextures(ILogicalDevice* logicalDevice, ui
 		auto image = logicalDevice->createImage(std::move(imgInfo));
 		auto imageMemReqs = image->getMemoryReqs();
 		imageMemReqs.memoryTypeBits &= logicalDevice->getPhysicalDevice()->getDeviceLocalMemoryTypeBits();
-		logicalDevice->allocate(imageMemReqs, image.get());
+		const auto allocation = logicalDevice->allocate(imageMemReqs, image.get());
+
+		if (!allocation.isValid())
+			return false;
 
 		image->setObjectDebugName("MSDFs Texture Array");
 
@@ -134,9 +202,13 @@ void DrawResourcesFiller::allocateMSDFTextures(ILogicalDevice* logicalDevice, ui
 		msdfTextureArray = logicalDevice->createImageView(std::move(imgViewInfo));
 	}
 
+	if (!msdfTextureArray)
+		return false;
+
 	msdfLRUCache = std::unique_ptr<MSDFsLRUCache>(new MSDFsLRUCache(maxMSDFs));
 	msdfTextureArrayIndexAllocator = core::make_smart_refctd_ptr<IndexAllocator>(core::smart_refctd_ptr<ILogicalDevice>(logicalDevice), maxMSDFs);
 	msdfImagesState.resize(maxMSDFs);
+	return true;
 }
 
 void DrawResourcesFiller::drawPolyline(const CPolylineBase& polyline, const LineStyleInfo& lineStyleInfo, SIntendedSubmitInfo& intendedNextSubmit)
diff --git a/62_CAD/DrawResourcesFiller.h b/62_CAD/DrawResourcesFiller.h
index f482d8435..981facaec 100644
--- a/62_CAD/DrawResourcesFiller.h
+++ b/62_CAD/DrawResourcesFiller.h
@@ -29,7 +29,8 @@ struct DrawResourcesFiller
 public:
 	
 	// We pack multiple data types in a single buffer, we need to makes sure each offset starts aligned to avoid mis-aligned accesses
-	static constexpr size_t ResourcesMaxNaturalAlignment = 8u;
+	static constexpr size_t GPUStructsMaxNaturalAlignment = 8u;
+	static constexpr size_t MinimumDrawResourcesMemorySize = 512u * 1 << 20u; // 512MB
 
 	/// @brief general parent struct for 1.ReservedCompute and 2.CPUGenerated Resources
 	struct ResourceBase
@@ -38,7 +39,7 @@ struct DrawResourcesFiller
 		size_t bufferOffset = InvalidBufferOffset; // set when copy to gpu buffer is issued
 		virtual size_t getCount() const = 0;
 		virtual size_t getStorageSize() const = 0;
-		virtual size_t getAlignedStorageSize() const { return core::alignUp(getStorageSize(), ResourcesMaxNaturalAlignment); }
+		virtual size_t getAlignedStorageSize() const { return core::alignUp(getStorageSize(), GPUStructsMaxNaturalAlignment); }
 	};
 
 	/// @brief ResourceBase reserved for compute shader stages input/output
@@ -67,11 +68,11 @@ struct DrawResourcesFiller
 		}
 
 		/// @brief increases size of general-purpose resources that hold bytes
-		/// @param alignment: Alignment of the pointer returned to be filled, should be PoT and <= ResourcesMaxNaturalAlignment, only use this if storing raw bytes in vector
+		/// @param alignment: Alignment of the pointer returned to be filled, should be PoT and <= GPUStructsMaxNaturalAlignment, only use this if storing raw bytes in vector
 		/// @return pointer to start of the data to be filled, up to additional size
 		size_t increaseSizeAndGetOffset(size_t additionalSize, size_t alignment) 
 		{
-			assert(core::isPoT(alignment) && alignment <= ResourcesMaxNaturalAlignment);
+			assert(core::isPoT(alignment) && alignment <= GPUStructsMaxNaturalAlignment);
 			size_t offset = core::alignUp(vector.size(), alignment);
 			vector.resize(offset + additionalSize);
 			return offset;
@@ -104,7 +105,7 @@ struct DrawResourcesFiller
 		CPUGeneratedResource<uint32_t> indexBuffer; // TODO: this is going to change to ReservedComputeResource where index buffer gets filled by compute shaders
 		CPUGeneratedResource<uint8_t> geometryInfo; // general purpose byte buffer for custom data for geometries (eg. line points, bezier definitions, aabbs)
 
-		// Get Total memory consumption, If all ResourcesCollection get packed together with ResourcesMaxNaturalAlignment
+		// Get Total memory consumption, If all ResourcesCollection get packed together with GPUStructsMaxNaturalAlignment
 		// used to decide the remaining memory and when to overflow
 		size_t calculateTotalConsumption() const
 		{
@@ -135,12 +136,40 @@ struct DrawResourcesFiller
 	{
 		// for auto-submission to work correctly, memory needs to serve at least 2 linestyle, 1 dtm settings, 1 clip proj, 1 main obj, 1 draw obj and 512 bytes of additional mem for geometries and index buffer
 		// this is the ABSOLUTE MINIMUM (if this value is used rendering will probably be as slow as CPU drawing :D)
-		return core::alignUp(sizeof(LineStyle) + sizeof(LineStyle) * DTMSettings::MaxContourSettings + sizeof(DTMSettings) + sizeof(WorldClipRect) + sizeof(float64_t3x3) + sizeof(MainObject) + sizeof(DrawObject) + 512ull, ResourcesMaxNaturalAlignment);
+		return core::alignUp(sizeof(LineStyle) + sizeof(LineStyle) * DTMSettings::MaxContourSettings + sizeof(DTMSettings) + sizeof(WorldClipRect) + sizeof(float64_t3x3) + sizeof(MainObject) + sizeof(DrawObject) + 512ull, GPUStructsMaxNaturalAlignment);
 	}
 
-	void allocateResourcesBuffer(ILogicalDevice* logicalDevice, size_t size);
+	/**
+	 * @brief Attempts to allocate a single contiguous device-local memory block for draw resources, divided into image and buffer sections.
+	 * 
+	 * The function allocates a single memory block and splits it into image and buffer arenas.
+	 * 
+	 * @param logicalDevice Pointer to the logical device used for memory allocation and resource creation.
+	 * @param requiredImageMemorySize The size in bytes of the memory required for images.
+	 * @param requiredBufferMemorySize The size in bytes of the memory required for buffers.
+	 * 
+	 * @return true if the memory allocation and resource setup succeeded; false otherwise.
+	 */
+	bool allocateDrawResources(ILogicalDevice* logicalDevice, size_t requiredImageMemorySize, size_t requiredBufferMemorySize);
+	
+	/**
+	 * @brief Attempts to allocate draw resources within a given VRAM budget, retrying with progressively smaller sizes on failure.
+	 * 
+	 * This function preserves the initial image-to-buffer memory ratio. If the initial sizes are too small,
+	 * it scales them up to meet a minimum required threshold. On allocation failure, it reduces the memory
+	 * sizes by a specified percentage and retries, until it either succeeds or the number of attempts exceeds `maxTries`.
+	 * 
+	 * @param logicalDevice Pointer to the logical device used for allocation.
+	 * @param maxImageMemorySize Initial image memory size (in bytes) to attempt allocation with.
+	 * @param maxBufferMemorySize Initial buffer memory size (in bytes) to attempt allocation with.
+	 * @param reductionPercent The percentage by which to reduce the memory sizes after each failed attempt (e.g., 10 means reduce by 10%).
+	 * @param maxTries Maximum number of attempts to try reducing and allocating memory.
+	 * 
+	 * @return true if the allocation succeeded at any iteration; false if all attempts failed.
+	 */
+	bool allocateDrawResourcesWithinAvailableVRAM(ILogicalDevice* logicalDevice, size_t maxImageMemorySize, size_t maxBufferMemorySize, uint32_t reductionPercent = 10u, uint32_t maxTries = 32u);
 
-	void allocateMSDFTextures(ILogicalDevice* logicalDevice, uint32_t maxMSDFs, uint32_t2 msdfsExtent);
+	bool allocateMSDFTextures(ILogicalDevice* logicalDevice, uint32_t maxMSDFs, uint32_t2 msdfsExtent);
 
 	// functions that user should set to get MSDF texture if it's not available in cache.
 	// it's up to user to return cached or generate on the fly.
@@ -723,6 +752,7 @@ struct DrawResourcesFiller
 
 	// ResourcesCollection and packed into GPUBuffer
 	ResourcesCollection resourcesCollection;
+	IDeviceMemoryAllocator::SAllocation buffersMemoryArena;
 	nbl::core::smart_refctd_ptr<IGPUBuffer> resourcesGPUBuffer;
 	size_t copiedResourcesSize;
 
diff --git a/62_CAD/main.cpp b/62_CAD/main.cpp
index 41d8fbfd3..7d8ccb67d 100644
--- a/62_CAD/main.cpp
+++ b/62_CAD/main.cpp
@@ -374,8 +374,9 @@ class ComputerAidedDesign final : public examples::SimpleWindowedApplication, pu
 	{
 		drawResourcesFiller = DrawResourcesFiller(core::smart_refctd_ptr(m_utils), getGraphicsQueue(), core::smart_refctd_ptr(m_logger));
 		
-		size_t bufferSize = 512u * 1024u * 1024u; // 512 MB
-		drawResourcesFiller.allocateResourcesBuffer(m_device.get(), bufferSize);
+		size_t maxImagesMemSize = 1024ull * 1024ull * 1024ull; // 1024 MB
+		size_t maxBufferMemSize = 1024ull * 1024ull * 1024ull; // 1024 MB
+		drawResourcesFiller.allocateDrawResourcesWithinAvailableVRAM(m_device.get(), maxImagesMemSize, maxBufferMemSize);
 		drawResourcesFiller.allocateMSDFTextures(m_device.get(), 256u, uint32_t2(MSDFSize, MSDFSize));
 
 		{

From 6c78e29707f3af7cba1ca67781bd71f8d7e35189 Mon Sep 17 00:00:00 2001
From: Arkadiusz Lachowicz <areklachowicz@gmail.com>
Date: Wed, 11 Jun 2025 16:01:03 +0200
Subject: [PATCH 349/529] rework PCH for examples, save work and test with 01 -
 successfully

---
 01_HelloCoreSystemAsset/main.cpp           |  4 +--
 CMakeLists.txt                             | 33 ++++++++--------------
 common/CMakeLists.txt                      | 18 +++---------
 common/include/nbl/examples/PCH.hpp        | 20 ++++++-------
 common/include/nbl/examples/api.hpp        | 31 ++++++++++++++++++++
 common/src/nbl/examples/CMakeLists.txt     |  3 ++
 common/src/nbl/examples/pch.cpp            |  1 +
 common/src/nbl/examples/pch/CMakeLists.txt | 18 ------------
 common/src/nbl/examples/pch/main.cpp       |  9 ------
 9 files changed, 60 insertions(+), 77 deletions(-)
 create mode 100644 common/include/nbl/examples/api.hpp
 create mode 100644 common/src/nbl/examples/pch.cpp
 delete mode 100644 common/src/nbl/examples/pch/CMakeLists.txt
 delete mode 100644 common/src/nbl/examples/pch/main.cpp

diff --git a/01_HelloCoreSystemAsset/main.cpp b/01_HelloCoreSystemAsset/main.cpp
index 6a9188344..96e4a0d4e 100644
--- a/01_HelloCoreSystemAsset/main.cpp
+++ b/01_HelloCoreSystemAsset/main.cpp
@@ -2,8 +2,8 @@
 // This file is part of the "Nabla Engine".
 // For conditions of distribution and use, see copyright notice in nabla.h
 
-// always include nabla first before std:: headers
-#include "nabla.h"
+// <nabla.h> public interface and common examples API, always include first before std:: headers
+#include "nbl/examples/api.hpp"
 
 #include "nbl/system/IApplicationFramework.h"
 
diff --git a/CMakeLists.txt b/CMakeLists.txt
index 789e96937..a9d9d046c 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -2,26 +2,6 @@
 # This file is part of the "Nabla Engine".
 # For conditions of distribution and use, see copyright notice in nabla.h
 
-function(NBL_HOOK_COMMON_API NBL_EXCLUDE_TARGETS_LIST)
-	if(NOT TARGET nblExamplesAPI)
-		message(FATAL_ERROR "nblExamplesAPI not defined!")
-	endif()
-
-    NBL_GET_ALL_TARGETS(NBL_TARGETS)
-
-    foreach(NBL_TARGET IN LISTS NBL_TARGETS)
-		# TODO: exclude builtin targets created by examples as well - doesn't impact anything at all now
-		if(NOT ${NBL_TARGET} IN_LIST NBL_EXCLUDE_TARGETS_LIST)
-
-			target_include_directories(${NBL_TARGET} PRIVATE $<TARGET_PROPERTY:nblExamplesAPI,INTERFACE_INCLUDE_DIRECTORIES>)
-			target_link_libraries(${NBL_TARGET} PRIVATE nblExamplesAPI)
-		endif()
-    endforeach()
-endfunction()
-
-# PCH & CommonAPI library for Nabla framework examples
-add_subdirectory(common EXCLUDE_FROM_ALL)
-
 if(NBL_BUILD_EXAMPLES)
 	if(NBL_BUILD_ANDROID)
 		nbl_android_create_media_storage_apk()
@@ -44,7 +24,7 @@ if(NBL_BUILD_EXAMPLES)
 	# showcase the set-up of a swapchain and picking of a matching device
 	add_subdirectory(08_HelloSwapchain EXCLUDE_FROM_ALL)
 	add_subdirectory(09_GeometryCreator EXCLUDE_FROM_ALL)
-  # demonstrate the counting sort utility
+  	# demonstrate the counting sort utility
 	add_subdirectory(10_CountingSort EXCLUDE_FROM_ALL)
 	# showcase use of FFT for post-FX Bloom  effect
 	add_subdirectory(11_FFT EXCLUDE_FROM_ALL)
@@ -92,5 +72,14 @@ if(NBL_BUILD_EXAMPLES)
   	add_subdirectory(70_FLIPFluids EXCLUDE_FROM_ALL)
 	add_subdirectory(71_RayTracingPipeline EXCLUDE_FROM_ALL)
 
-	NBL_HOOK_COMMON_API("${NBL_EXAMPLES_API_TARGETS}")
+	NBL_GET_ALL_TARGETS(TARGETS)
+
+	# PCH & CommonAPI library for Nabla framework examples
+	add_subdirectory(common EXCLUDE_FROM_ALL)
+
+	foreach(T IN LISTS TARGETS)
+        target_link_libraries(${T} PUBLIC ${NBL_EXAMPLES_API_TARGET})
+		target_include_directories(${T} PUBLIC $<TARGET_PROPERTY:${NBL_EXAMPLES_API_TARGET},INCLUDE_DIRECTORIES>)
+		target_precompile_headers(${T} REUSE_FROM "${NBL_EXAMPLES_API_TARGET}")
+    endforeach()
 endif()
diff --git a/common/CMakeLists.txt b/common/CMakeLists.txt
index 9560a8f42..3cdcce82d 100644
--- a/common/CMakeLists.txt
+++ b/common/CMakeLists.txt
@@ -1,17 +1,7 @@
-###########################################
-# TODO: the way it should work is following (remove the comment once all done!)
-# - one top PCH which includes <nabla.h> -> currently not done
-# - sources used only within examples splitted into "common libraries" (optional -> with options to toggle if include them to build tree), each common library should reuse the above top PCH
-# - examples_tests CMake loop over example targets and hook the interface library with NBL_HOOK_COMMON_API [done]
-# - each common library should declare ONLY interface and never expose source definition into headers nor any 3rdparty stuff!
-##
+nbl_create_ext_library_project(ExamplesAPI "" "${CMAKE_CURRENT_SOURCE_DIR}/src/nbl/examples/pch.cpp" "${CMAKE_CURRENT_SOURCE_DIR}/include" "" "")
 
-# interface libraries don't have build rules (except custom commands however it doesn't matter here) but properties
-add_library(nblExamplesAPI INTERFACE)
-set(NBL_EXAMPLES_API_INCLUDE_DIRECTORY "${CMAKE_CURRENT_SOURCE_DIR}/include")
-target_include_directories(nblExamplesAPI INTERFACE "${NBL_EXAMPLES_API_INCLUDE_DIRECTORY}")
+set_target_properties(${LIB_NAME} PROPERTIES DISABLE_PRECOMPILE_HEADERS OFF)
+target_precompile_headers(${LIB_NAME} PUBLIC "${CMAKE_CURRENT_SOURCE_DIR}/include/nbl/examples/PCH.hpp")
 
 add_subdirectory("src/nbl/examples" EXCLUDE_FROM_ALL)
-
-# TODO: Arek what was `NBL_EXECUTABLE_COMMON_API_TARGET` ? I removed it.
-set(NBL_EXAMPLES_API_TARGETS nblExamplesAPI ${NBL_EXAMPLES_API_TARGETS} PARENT_SCOPE)
+set(NBL_EXAMPLES_API_TARGET ${LIB_NAME} PARENT_SCOPE)
\ No newline at end of file
diff --git a/common/include/nbl/examples/PCH.hpp b/common/include/nbl/examples/PCH.hpp
index 7a1b6bdc6..de686b26f 100644
--- a/common/include/nbl/examples/PCH.hpp
+++ b/common/include/nbl/examples/PCH.hpp
@@ -4,19 +4,15 @@
 #ifndef _NBL_EXAMPLES_PCH_HPP_
 #define _NBL_EXAMPLES_PCH_HPP_
 
-
+//! public declarations
+/*
+    NOTE: currently our whole public and private interface is broken
+    and private headers leak to public includes
+*/
 #include <nabla.h>
 
-// #include "nbl/ui/CGraphicalApplicationAndroid.h"
-// #include "nbl/ui/CWindowManagerAndroid.h"
-
-#include "nbl/examples/common/SimpleWindowedApplication.hpp"
-#include "nbl/examples/common/InputSystem.hpp"
-#include "nbl/examples/common/CEventCallback.hpp"
-
-#include "nbl/examples/cameras/CCamera.hpp"
-
-#include "nbl/examples/geometry/CGeometryCreatorScene.hpp"
-
+//! note: one can add common std headers here not present in nabla.h or 
+//! any headers shared between examples, you cannot put there include
+//! files which require extra preprocessor definitions
 
 #endif // _NBL_EXAMPLES_COMMON_PCH_HPP_
\ No newline at end of file
diff --git a/common/include/nbl/examples/api.hpp b/common/include/nbl/examples/api.hpp
new file mode 100644
index 000000000..0cb2278cb
--- /dev/null
+++ b/common/include/nbl/examples/api.hpp
@@ -0,0 +1,31 @@
+// Copyright (C) 2018-2025 - DevSH Graphics Programming Sp. z O.O.
+// This file is part of the "Nabla Engine".
+// For conditions of distribution and use, see copyright notice in nabla.h
+#ifndef _NBL_EXAMPLES_API_HPP_
+#define _NBL_EXAMPLES_API_HPP_
+
+//! PCH for examples
+/*
+    PCH is compiled only once *if* an example can be promoted to use it, it is
+    when its compile options & definitions set is the same as nblExamplesAPI's
+    each example links to, otherwise it compiles its own PCH
+*/
+#include "nbl/examples/PCH.hpp"
+
+//! common headers used across examples
+/*
+    NOTE: those *cannot* be used in PCH since they use unique 
+    preprocessor definitions per example which would change
+    content of PCH
+*/
+#include "nbl/examples/common/SimpleWindowedApplication.hpp"
+#include "nbl/examples/common/InputSystem.hpp"
+#include "nbl/examples/common/CEventCallback.hpp"
+
+#include "nbl/examples/cameras/CCamera.hpp"
+
+// broken? probably to refactor or even remove?
+// #include "nbl/examples/geometry/CGeometryCreatorScene.hpp"
+
+
+#endif // _NBL_EXAMPLES_API_HPP_
\ No newline at end of file
diff --git a/common/src/nbl/examples/CMakeLists.txt b/common/src/nbl/examples/CMakeLists.txt
index 96ccaabea..4fceed571 100644
--- a/common/src/nbl/examples/CMakeLists.txt
+++ b/common/src/nbl/examples/CMakeLists.txt
@@ -1,3 +1,6 @@
+
+
+
 # TODO: @AnastaZluk redo the PCH
 # add_subdirectory(pch EXCLUDE_FROM_ALL)
 
diff --git a/common/src/nbl/examples/pch.cpp b/common/src/nbl/examples/pch.cpp
new file mode 100644
index 000000000..39a146f1d
--- /dev/null
+++ b/common/src/nbl/examples/pch.cpp
@@ -0,0 +1 @@
+#include "nbl/examples/PCH.hpp"
\ No newline at end of file
diff --git a/common/src/nbl/examples/pch/CMakeLists.txt b/common/src/nbl/examples/pch/CMakeLists.txt
deleted file mode 100644
index 34f16c2d2..000000000
--- a/common/src/nbl/examples/pch/CMakeLists.txt
+++ /dev/null
@@ -1,18 +0,0 @@
-# TODO: let arek figure out how to redo the PCH
-#[===[
-include(common RESULT_VARIABLE RES)
-if(NOT RES)
-	message(FATAL_ERROR "common.cmake not found. Should be in '${NBL_ROOT_PATH}/cmake' directory")
-endif()
-
-nbl_create_executable_project("" "" "" "" "")
-
-set(NBL_EXECUTABLE_PROJECT_CREATION_PCH_TARGET "${EXECUTABLE_NAME}" CACHE INTERNAL "")
-get_target_property(NBL_NABLA_TARGET_SOURCE_DIR Nabla SOURCE_DIR)
-set_target_properties("${EXECUTABLE_NAME}" PROPERTIES DISABLE_PRECOMPILE_HEADERS OFF)
-target_precompile_headers("${EXECUTABLE_NAME}" PUBLIC 
-	"${CMAKE_CURRENT_SOURCE_DIR}/PCH.hpp" # Common PCH for examples
-	"${NBL_NABLA_TARGET_SOURCE_DIR}/pch.h" # Nabla's PCH
-)
-unset(NBL_NABLA_TARGET_SOURCE_DIR)
-]===]
\ No newline at end of file
diff --git a/common/src/nbl/examples/pch/main.cpp b/common/src/nbl/examples/pch/main.cpp
deleted file mode 100644
index c19ee3c45..000000000
--- a/common/src/nbl/examples/pch/main.cpp
+++ /dev/null
@@ -1,9 +0,0 @@
-// Copyright (C) 2018-2022 - DevSH Graphics Programming Sp. z O.O.
-// This file is part of the "Nabla Engine".
-// For conditions of distribution and use, see copyright notice in nabla.h
-
-int main(int argc, char** argv)
-{
-	return 0;
-}
-

From fdae6f916057d539a02f1329bb90e687f5ca70d0 Mon Sep 17 00:00:00 2001
From: Arkadiusz Lachowicz <areklachowicz@gmail.com>
Date: Wed, 11 Jun 2025 16:34:37 +0200
Subject: [PATCH 350/529] actually some example headers I removed can be used
 in PCH with small tmp trick

#include "nbl/examples/common/SimpleWindowedApplication.hpp"
#include "nbl/examples/common/InputSystem.hpp"
#include "nbl/examples/common/CEventCallback.hpp"
#include "nbl/examples/cameras/CCamera.hpp"
---
 common/include/nbl/examples/PCH.hpp |  6 ++++++
 common/include/nbl/examples/api.hpp | 11 ++---------
 2 files changed, 8 insertions(+), 9 deletions(-)

diff --git a/common/include/nbl/examples/PCH.hpp b/common/include/nbl/examples/PCH.hpp
index de686b26f..f4fbe377c 100644
--- a/common/include/nbl/examples/PCH.hpp
+++ b/common/include/nbl/examples/PCH.hpp
@@ -11,6 +11,12 @@
 */
 #include <nabla.h>
 
+#include "nbl/examples/common/SimpleWindowedApplication.hpp"
+#include "nbl/examples/common/InputSystem.hpp"
+#include "nbl/examples/common/CEventCallback.hpp"
+
+#include "nbl/examples/cameras/CCamera.hpp"
+
 //! note: one can add common std headers here not present in nabla.h or 
 //! any headers shared between examples, you cannot put there include
 //! files which require extra preprocessor definitions
diff --git a/common/include/nbl/examples/api.hpp b/common/include/nbl/examples/api.hpp
index 0cb2278cb..9b809b8ea 100644
--- a/common/include/nbl/examples/api.hpp
+++ b/common/include/nbl/examples/api.hpp
@@ -12,17 +12,10 @@
 */
 #include "nbl/examples/PCH.hpp"
 
-//! common headers used across examples
+//! common headers used across examples which cannot be part of PCH
 /*
-    NOTE: those *cannot* be used in PCH since they use unique 
-    preprocessor definitions per example which would change
-    content of PCH
+    NOTE: put here if a header requires defines which may be differ
 */
-#include "nbl/examples/common/SimpleWindowedApplication.hpp"
-#include "nbl/examples/common/InputSystem.hpp"
-#include "nbl/examples/common/CEventCallback.hpp"
-
-#include "nbl/examples/cameras/CCamera.hpp"
 
 // broken? probably to refactor or even remove?
 // #include "nbl/examples/geometry/CGeometryCreatorScene.hpp"

From ed51dee2d394663dad3e0d0adfad10fd6df120ca Mon Sep 17 00:00:00 2001
From: Arkadiusz Lachowicz <areklachowicz@gmail.com>
Date: Wed, 11 Jun 2025 16:49:45 +0200
Subject: [PATCH 351/529] remove old header references from
 29_MeshLoaders/main.cpp which are included in PCH now though the example
 doesn't compile anyway (misses imgui link and more)

---
 29_MeshLoaders/main.cpp             | 7 -------
 common/include/nbl/examples/PCH.hpp | 2 +-
 2 files changed, 1 insertion(+), 8 deletions(-)

diff --git a/29_MeshLoaders/main.cpp b/29_MeshLoaders/main.cpp
index feb52936a..6afb74a5c 100644
--- a/29_MeshLoaders/main.cpp
+++ b/29_MeshLoaders/main.cpp
@@ -6,13 +6,6 @@
 #include "nbl/asset/utils/CGeometryCreator.h"
 #include "nbl/application_templates/MonoAssetManagerAndBuiltinResourceApplication.hpp"
 
-#include "SimpleWindowedApplication.hpp"
-
-#include "InputSystem.hpp"
-#include "CEventCallback.hpp"
-
-#include "CCamera.hpp"
-
 #include <nbl/builtin/hlsl/cpp_compat.hlsl>
 #include <nbl/builtin/hlsl/cpp_compat/matrix.hlsl>
 
diff --git a/common/include/nbl/examples/PCH.hpp b/common/include/nbl/examples/PCH.hpp
index f4fbe377c..58269d652 100644
--- a/common/include/nbl/examples/PCH.hpp
+++ b/common/include/nbl/examples/PCH.hpp
@@ -19,6 +19,6 @@
 
 //! note: one can add common std headers here not present in nabla.h or 
 //! any headers shared between examples, you cannot put there include
-//! files which require extra preprocessor definitions
+//! files which require unique preprocessor definitions for each example
 
 #endif // _NBL_EXAMPLES_COMMON_PCH_HPP_
\ No newline at end of file

From 971ee343256efa218a76297887d17d471ef6d414 Mon Sep 17 00:00:00 2001
From: Arkadiusz Lachowicz <areklachowicz@gmail.com>
Date: Wed, 11 Jun 2025 17:08:34 +0200
Subject: [PATCH 352/529] wipe all EXCLUDE_FROM_ALL except common directory,
 group all examples into project to allow build all at once

---
 CMakeLists.txt | 84 ++++++++++++++++++++++++++------------------------
 1 file changed, 43 insertions(+), 41 deletions(-)

diff --git a/CMakeLists.txt b/CMakeLists.txt
index a9d9d046c..a6164dbfd 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -3,74 +3,76 @@
 # For conditions of distribution and use, see copyright notice in nabla.h
 
 if(NBL_BUILD_EXAMPLES)
+	project(NablaExamples)
+
 	if(NBL_BUILD_ANDROID)
 		nbl_android_create_media_storage_apk()
 	endif()
 
 	# showcase the use of `nbl::core`,`nbl::system` and `nbl::asset`
-	add_subdirectory(01_HelloCoreSystemAsset EXCLUDE_FROM_ALL)
+	add_subdirectory(01_HelloCoreSystemAsset)
 	# showcase the use of `system::IApplicationFramework` and `nbl::video`
-	add_subdirectory(02_HelloCompute EXCLUDE_FROM_ALL)
+	add_subdirectory(02_HelloCompute)
 	# showcase physical device selection, resource embedding and the use of identical headers in HLSL and C++
-	add_subdirectory(03_DeviceSelectionAndSharedSources EXCLUDE_FROM_ALL)
+	add_subdirectory(03_DeviceSelectionAndSharedSources)
 	# showcase the creation of windows and polling for input
-	add_subdirectory(04_HelloUI EXCLUDE_FROM_ALL)
+	add_subdirectory(04_HelloUI)
 	# showcase the semi-advanced use of Nabla's Streaming Buffers and BDA
-	add_subdirectory(05_StreamingAndBufferDeviceAddressApp EXCLUDE_FROM_ALL)
+	add_subdirectory(05_StreamingAndBufferDeviceAddressApp)
 	# showcase the use of a graphics queue
-	add_subdirectory(06_HelloGraphicsQueue EXCLUDE_FROM_ALL)
+	add_subdirectory(06_HelloGraphicsQueue)
 	# showcase the set-up of multiple queues
-	add_subdirectory(07_StagingAndMultipleQueues EXCLUDE_FROM_ALL)
+	add_subdirectory(07_StagingAndMultipleQueues)
 	# showcase the set-up of a swapchain and picking of a matching device
-	add_subdirectory(08_HelloSwapchain EXCLUDE_FROM_ALL)
-	add_subdirectory(09_GeometryCreator EXCLUDE_FROM_ALL)
+	add_subdirectory(08_HelloSwapchain)
+	add_subdirectory(09_GeometryCreator)
   	# demonstrate the counting sort utility
-	add_subdirectory(10_CountingSort EXCLUDE_FROM_ALL)
+	add_subdirectory(10_CountingSort)
 	# showcase use of FFT for post-FX Bloom  effect
-	add_subdirectory(11_FFT EXCLUDE_FROM_ALL)
+	add_subdirectory(11_FFT)
 
 
 	# Waiting for a refactor
-	#add_subdirectory(27_PLYSTLDemo EXCLUDE_FROM_ALL)
-	#add_subdirectory(33_Draw3DLine EXCLUDE_FROM_ALL)
+	#add_subdirectory(27_PLYSTLDemo)
+	#add_subdirectory(33_Draw3DLine)
 
 	# Unit Test Examples
-	add_subdirectory(20_AllocatorTest EXCLUDE_FROM_ALL)
-	add_subdirectory(21_LRUCacheUnitTest EXCLUDE_FROM_ALL)
-	add_subdirectory(22_CppCompat EXCLUDE_FROM_ALL)
-	add_subdirectory(23_ArithmeticUnitTest EXCLUDE_FROM_ALL)
-	add_subdirectory(24_ColorSpaceTest EXCLUDE_FROM_ALL)
-	add_subdirectory(25_FilterTest EXCLUDE_FROM_ALL)
-	add_subdirectory(26_Blur EXCLUDE_FROM_ALL)
-	add_subdirectory(27_MPMCScheduler EXCLUDE_FROM_ALL)	
-	add_subdirectory(28_FFTBloom EXCLUDE_FROM_ALL)
-	add_subdirectory(29_MeshLoaders EXCLUDE_FROM_ALL)
-	# add_subdirectory(36_CUDAInterop EXCLUDE_FROM_ALL)
+	add_subdirectory(20_AllocatorTest)
+	add_subdirectory(21_LRUCacheUnitTest)
+	add_subdirectory(22_CppCompat)
+	add_subdirectory(23_ArithmeticUnitTest)
+	add_subdirectory(24_ColorSpaceTest)
+	add_subdirectory(25_FilterTest)
+	add_subdirectory(26_Blur)
+	add_subdirectory(27_MPMCScheduler)	
+	add_subdirectory(28_FFTBloom)
+	add_subdirectory(29_MeshLoaders)
+	# add_subdirectory(36_CUDAInterop)
 
 	# Showcase compute pathtracing
-	add_subdirectory(30_ComputeShaderPathTracer EXCLUDE_FROM_ALL)
+	add_subdirectory(30_ComputeShaderPathTracer)
 
-	add_subdirectory(38_EXRSplit EXCLUDE_FROM_ALL)
+	add_subdirectory(38_EXRSplit)
 	# if (NBL_BUILD_MITSUBA_LOADER AND NBL_BUILD_OPTIX)
-	#	add_subdirectory(39_DenoiserTonemapper EXCLUDE_FROM_ALL)
+	#	add_subdirectory(39_DenoiserTonemapper)
 	# endif()
 
-	#add_subdirectory(43_SumAndCDFFilters EXCLUDE_FROM_ALL)
-	add_subdirectory(47_DerivMapTest EXCLUDE_FROM_ALL)
-	add_subdirectory(54_Transformations EXCLUDE_FROM_ALL)
-	add_subdirectory(55_RGB18E7S3 EXCLUDE_FROM_ALL)
-	add_subdirectory(61_UI EXCLUDE_FROM_ALL)
-	add_subdirectory(62_CAD EXCLUDE_FROM_ALL)
-	add_subdirectory(62_SchusslerTest EXCLUDE_FROM_ALL)
-	add_subdirectory(64_EmulatedFloatTest EXCLUDE_FROM_ALL)
-	add_subdirectory(0_ImportanceSamplingEnvMaps EXCLUDE_FROM_ALL) #TODO: integrate back into 42
+	#add_subdirectory(43_SumAndCDFFilters)
+	add_subdirectory(47_DerivMapTest)
+	add_subdirectory(54_Transformations)
+	add_subdirectory(55_RGB18E7S3)
+	add_subdirectory(61_UI)
+	add_subdirectory(62_CAD)
+	add_subdirectory(62_SchusslerTest)
+	add_subdirectory(64_EmulatedFloatTest)
+	add_subdirectory(0_ImportanceSamplingEnvMaps) #TODO: integrate back into 42
 
-	add_subdirectory(66_HLSLBxDFTests EXCLUDE_FROM_ALL)
-	add_subdirectory(67_RayQueryGeometry EXCLUDE_FROM_ALL)
-	add_subdirectory(68_JpegLoading EXCLUDE_FROM_ALL)
+	add_subdirectory(66_HLSLBxDFTests)
+	add_subdirectory(67_RayQueryGeometry)
+	add_subdirectory(68_JpegLoading)
 
-  	add_subdirectory(70_FLIPFluids EXCLUDE_FROM_ALL)
-	add_subdirectory(71_RayTracingPipeline EXCLUDE_FROM_ALL)
+  	add_subdirectory(70_FLIPFluids)
+	add_subdirectory(71_RayTracingPipeline)
 
 	NBL_GET_ALL_TARGETS(TARGETS)
 

From c9f610f7adbed4f572cf834c79ffd6d09b7c47bf Mon Sep 17 00:00:00 2001
From: Arkadiusz Lachowicz <areklachowicz@gmail.com>
Date: Wed, 11 Jun 2025 17:51:41 +0200
Subject: [PATCH 353/529] use NBL_ADJUST_FOLDERS with examples namespace

---
 CMakeLists.txt | 4 +++-
 1 file changed, 3 insertions(+), 1 deletion(-)

diff --git a/CMakeLists.txt b/CMakeLists.txt
index a6164dbfd..82daaccb3 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -84,4 +84,6 @@ if(NBL_BUILD_EXAMPLES)
 		target_include_directories(${T} PUBLIC $<TARGET_PROPERTY:${NBL_EXAMPLES_API_TARGET},INCLUDE_DIRECTORIES>)
 		target_precompile_headers(${T} REUSE_FROM "${NBL_EXAMPLES_API_TARGET}")
     endforeach()
-endif()
+
+	NBL_ADJUST_FOLDERS(examples)
+endif()
\ No newline at end of file

From 307f7f9b6e2c42737511eff831ad747cc5ba8de2 Mon Sep 17 00:00:00 2001
From: Erfan Ahmadi <ahmadierfan99@gmail.com>
Date: Thu, 12 Jun 2025 13:08:32 +0400
Subject: [PATCH 354/529] un-premultiply alpha

---
 62_CAD/shaders/main_pipeline/fragment_shader.hlsl | 5 ++---
 1 file changed, 2 insertions(+), 3 deletions(-)

diff --git a/62_CAD/shaders/main_pipeline/fragment_shader.hlsl b/62_CAD/shaders/main_pipeline/fragment_shader.hlsl
index 10a2348a7..b6a1e7078 100644
--- a/62_CAD/shaders/main_pipeline/fragment_shader.hlsl
+++ b/62_CAD/shaders/main_pipeline/fragment_shader.hlsl
@@ -198,9 +198,8 @@ float4 fragMain(PSInput input) : SV_TARGET
         }
         if (dtmSettings.drawHeightShadingEnabled())
             dtmColor = dtm::blendUnder(dtmColor, dtm::calculateDTMHeightColor(dtmSettings.heightShadingSettings, v, heightDeriv, input.position.xy, height));
-        
 
-        textureColor = dtmColor.rgb;
+        textureColor = dtmColor.rgb / dtmColor.a;
         localAlpha = dtmColor.a;
 
         gammaUncorrect(textureColor); // want to output to SRGB without gamma correction
@@ -583,7 +582,7 @@ float4 fragMain(PSInput input) : SV_TARGET
             if (dtmSettings.drawHeightShadingEnabled() && !outOfBoundsUV)
                 dtmColor = dtm::blendUnder(dtmColor, dtm::calculateDTMHeightColor(dtmSettings.heightShadingSettings, v, heightDeriv, input.position.xy, height));
 
-            textureColor = dtmColor.rgb;
+            textureColor = dtmColor.rgb / dtmColor.a;
             localAlpha = dtmColor.a;
 
             // test out of bounds draw

From f0c2d5ef74bb8f0b9c1e0d00ff66505e75753a50 Mon Sep 17 00:00:00 2001
From: Przemek <minikers21@gmail.com>
Date: Thu, 12 Jun 2025 16:26:03 +0200
Subject: [PATCH 355/529] Fixed DTM diagonal info flushing

---
 62_CAD/DrawResourcesFiller.cpp                | 12 ++++--
 62_CAD/DrawResourcesFiller.h                  |  6 ++-
 62_CAD/Images.h                               |  1 +
 62_CAD/main.cpp                               | 33 ++++++++++----
 62_CAD/shaders/globals.hlsl                   |  8 +++-
 62_CAD/shaders/main_pipeline/common.hlsl      |  1 +
 .../main_pipeline/fragment_shader.hlsl        | 43 +++++++------------
 .../shaders/main_pipeline/vertex_shader.hlsl  |  3 +-
 8 files changed, 61 insertions(+), 46 deletions(-)

diff --git a/62_CAD/DrawResourcesFiller.cpp b/62_CAD/DrawResourcesFiller.cpp
index 517334ad9..5ea9d3adf 100644
--- a/62_CAD/DrawResourcesFiller.cpp
+++ b/62_CAD/DrawResourcesFiller.cpp
@@ -456,7 +456,7 @@ bool DrawResourcesFiller::ensureStaticImageAvailability(const StaticImageInfo& s
 			}
 
 			// Attempt to create a GPU image and image view for this texture.
-			ImageAllocateResults allocResults = tryCreateAndAllocateImage_SubmitIfNeeded(imageParams, intendedNextSubmit, std::to_string(staticImage.imageID));
+			ImageAllocateResults allocResults = tryCreateAndAllocateImage_SubmitIfNeeded(imageParams, staticImage.imageViewFormatOverride, intendedNextSubmit, std::to_string(staticImage.imageID));
 
 			if (allocResults.isValid())
 			{
@@ -603,7 +603,7 @@ bool DrawResourcesFiller::ensureGeoreferencedImageAvailability_AllocateIfNeeded(
 		if (cachedImageRecord->arrayIndex != video::SubAllocatedDescriptorSet::AddressAllocator::invalid_address)
 		{
 			// Attempt to create a GPU image and image view for this texture.
-			ImageAllocateResults allocResults = tryCreateAndAllocateImage_SubmitIfNeeded(imageCreationParams, intendedNextSubmit, std::to_string(imageID));
+			ImageAllocateResults allocResults = tryCreateAndAllocateImage_SubmitIfNeeded(imageCreationParams, asset::E_FORMAT::EF_COUNT, intendedNextSubmit, std::to_string(imageID));
 
 			if (allocResults.isValid())
 			{
@@ -2171,7 +2171,11 @@ void DrawResourcesFiller::evictImage_SubmitIfNeeded(image_id imageID, const Cach
 	}
 }
 
-DrawResourcesFiller::ImageAllocateResults DrawResourcesFiller::tryCreateAndAllocateImage_SubmitIfNeeded(const nbl::asset::IImage::SCreationParams& imageParams, nbl::video::SIntendedSubmitInfo& intendedNextSubmit, std::string imageDebugName)
+DrawResourcesFiller::ImageAllocateResults DrawResourcesFiller::tryCreateAndAllocateImage_SubmitIfNeeded(
+	const nbl::asset::IImage::SCreationParams& imageParams,
+	const asset::E_FORMAT imageViewFormatOverride,
+	nbl::video::SIntendedSubmitInfo& intendedNextSubmit,
+	std::string imageDebugName)
 {
 	ImageAllocateResults ret = {};
 
@@ -2218,7 +2222,7 @@ DrawResourcesFiller::ImageAllocateResults DrawResourcesFiller::tryCreateAndAlloc
 						IGPUImageView::SCreationParams viewParams = {
 							.image = gpuImage,
 							.viewType = IGPUImageView::ET_2D,
-							.format = gpuImage->getCreationParameters().format
+							.format = (imageViewFormatOverride == asset::E_FORMAT::EF_COUNT) ? gpuImage->getCreationParameters().format : EF_R32G32B32A32_UINT
 						};
 						ret.gpuImageView = device->createImageView(std::move(viewParams));
 						if (ret.gpuImageView)
diff --git a/62_CAD/DrawResourcesFiller.h b/62_CAD/DrawResourcesFiller.h
index f482d8435..f8e4bee67 100644
--- a/62_CAD/DrawResourcesFiller.h
+++ b/62_CAD/DrawResourcesFiller.h
@@ -560,6 +560,7 @@ struct DrawResourcesFiller
 	 * This is primarily used by the draw resource filler to manage GPU image memory for streamed or cached images.
 	 *
 	 * @param imageParams Creation parameters for the image. Should match `nbl::asset::IImage::SCreationParams`.
+	 * @param imageViewFormatOverride Specifies whether the image view format should differ from the image format. If set to asset::E_FORMAT_ET_COUNT, the image view uses the same format as the image
 	 * @param intendedNextSubmit Reference to the current intended submit info. Used for synchronizing evictions.
 	 * @param imageDebugName Debug name assigned to the image and its view for easier profiling/debugging.
 	 *
@@ -568,7 +569,10 @@ struct DrawResourcesFiller
 	 * - `allocationSize`: Size of the allocated memory region.
 	 * - `gpuImageView`: The created GPU image view (nullptr if creation failed).
 	 */
-	ImageAllocateResults tryCreateAndAllocateImage_SubmitIfNeeded(const nbl::asset::IImage::SCreationParams& imageParams, nbl::video::SIntendedSubmitInfo& intendedNextSubmit, std::string debugName = "UnnamedNablaImage");
+	ImageAllocateResults tryCreateAndAllocateImage_SubmitIfNeeded(const nbl::asset::IImage::SCreationParams& imageParams,
+		const asset::E_FORMAT imageViewFormatOverride,
+		nbl::video::SIntendedSubmitInfo& intendedNextSubmit,
+		std::string imageDebugName);
 
 	/**
 	 * @brief Determines creation parameters for a georeferenced image based on heuristics.
diff --git a/62_CAD/Images.h b/62_CAD/Images.h
index bb7b7d3ae..a341eadd6 100644
--- a/62_CAD/Images.h
+++ b/62_CAD/Images.h
@@ -215,4 +215,5 @@ struct StaticImageInfo
 	image_id imageID = ~0ull;
 	core::smart_refctd_ptr<ICPUImage> cpuImage = nullptr;
 	bool forceUpdate = false; // If true, bypasses the existing GPU-side cache and forces an update of the image data; Useful when replacing the contents of a static image that may already be resident.
+	asset::E_FORMAT imageViewFormatOverride = asset::E_FORMAT::EF_COUNT; // if asset::E_FORMAT::EF_COUNT then image view will have the same format as `cpuImage`
 };
diff --git a/62_CAD/main.cpp b/62_CAD/main.cpp
index 41d8fbfd3..82f70f8e6 100644
--- a/62_CAD/main.cpp
+++ b/62_CAD/main.cpp
@@ -1153,6 +1153,9 @@ class ComputerAidedDesign final : public examples::SimpleWindowedApplication, pu
 			"../../media/color_space_test/R8G8B8A8_1.png",
 		};
 
+		/**
+		* @param formatOverride override format of an image view, use special argument asset::E_FORMAT::EF_COUNT to don't override image view format and use one retrieved from the loaded image
+		*/
 		auto loadImage = [&](const std::string& imagePath) -> smart_refctd_ptr<ICPUImage>
 		{
 			constexpr auto cachingFlags = static_cast<IAssetLoader::E_CACHING_FLAGS>(IAssetLoader::ECF_DONT_CACHE_REFERENCES & IAssetLoader::ECF_DONT_CACHE_TOP_LEVEL);
@@ -1172,6 +1175,9 @@ class ComputerAidedDesign final : public examples::SimpleWindowedApplication, pu
 			case IAsset::ET_IMAGE:
 			{
 				auto image = smart_refctd_ptr_static_cast<ICPUImage>(asset);
+				auto& flags = image->getCreationParameters().flags;
+				// assert if asset is mutable
+				const_cast<core::bitflag<asset::IImage::E_CREATE_FLAGS>&>(flags) |= asset::IImage::E_CREATE_FLAGS::ECF_MUTABLE_FORMAT_BIT;
 				const auto format = image->getCreationParameters().format;
 
 				ICPUImageView::SCreationParams viewParams = {
@@ -1199,7 +1205,6 @@ class ComputerAidedDesign final : public examples::SimpleWindowedApplication, pu
 				return nullptr;
 			}
 
-
 			const auto loadedCPUImage = cpuImgView->getCreationParameters().image;
 			const auto loadedCPUImageCreationParams = loadedCPUImage->getCreationParameters();
 
@@ -1262,7 +1267,7 @@ class ComputerAidedDesign final : public examples::SimpleWindowedApplication, pu
 
 		gridDTMHeightMap = loadImage("../../media/gridDTMHeightMap.exr");
 
-		// set diagonals of even cells to TOP_LEFT_TO_BOTTOM_RIGHT and diagonals of odd cells to BOTTOM_LEFT_TO_TOP_RIGHT
+		// set diagonals of cells to TOP_LEFT_TO_BOTTOM_RIGHT or BOTTOM_LEFT_TO_TOP_RIGHT randomly
 		{
 			// assumption is that format of the grid DTM height map is *_SRGB, I don't think we need any code to ensure that
 
@@ -1273,17 +1278,19 @@ class ComputerAidedDesign final : public examples::SimpleWindowedApplication, pu
 			const size_t imageByteSize = gridDTMHeightMap->getImageDataSizeInBytes();
 			assert(imageByteSize % sizeof(float) == 0);
 
+			std::random_device rd;
+			std::mt19937 mt(rd());
+			std::uniform_int_distribution<int> dist(0, 1);
+
 			for (int i = 0; i < imageByteSize; i += sizeof(float))
 			{
-				const bool isCellEven = i % (2 * sizeof(float)) == 0;
-				E_CELL_DIAGONAL diagonal = isCellEven ? TOP_LEFT_TO_BOTTOM_RIGHT : BOTTOM_LEFT_TO_TOP_RIGHT;
-
-				// test
-				diagonal = BOTTOM_LEFT_TO_TOP_RIGHT;
+				const bool isTexelEven = static_cast<bool>(dist(mt));
+				E_CELL_DIAGONAL diagonal = isTexelEven ? TOP_LEFT_TO_BOTTOM_RIGHT : BOTTOM_LEFT_TO_TOP_RIGHT;
 
 				setDiagonalModeBit(imageData, diagonal);
 				imageData++;
 			}
+
 		}
 
 		assert(gridDTMHeightMap);
@@ -3572,7 +3579,7 @@ class ComputerAidedDesign final : public examples::SimpleWindowedApplication, pu
 			dtmInfo.outlineStyleInfo.worldSpaceLineWidth = 2.0f;
 			dtmInfo.outlineStyleInfo.color = float32_t4(0.0f, 0.39f, 0.0f, 1.0f);
 			std::array<double, 4> outlineStipplePattern = { 0.0f, -5.0f, 20.0f, -5.0f };
-			dtmInfo.outlineStyleInfo.setStipplePatternData(outlineStipplePattern);
+			//dtmInfo.outlineStyleInfo.setStipplePatternData(outlineStipplePattern);
 
 			dtmInfo.contourSettingsCount = 2u;
 			dtmInfo.contourSettings[0u].startHeight = 20;
@@ -3645,7 +3652,15 @@ class ComputerAidedDesign final : public examples::SimpleWindowedApplication, pu
 			worldSpaceExtents.x = (heightMapExtent.width - 1) * HeightMapCellWidth;
 			worldSpaceExtents.y = (heightMapExtent.height - 1) * HeightMapCellWidth;
 			const uint64_t heightMapTextureID = 0ull;
-			if (!drawResourcesFiller.ensureStaticImageAvailability({ heightMapTextureID, gridDTMHeightMap }, intendedNextSubmit))
+
+			StaticImageInfo heightMapStaticImageInfo = {
+				.imageID = heightMapTextureID,
+				.cpuImage = gridDTMHeightMap,
+				.forceUpdate = false,
+				.imageViewFormatOverride = asset::E_FORMAT::EF_R32G32B32A32_UINT // for now we use only R32G32B32A32_* anyway
+			};
+
+			if (!drawResourcesFiller.ensureStaticImageAvailability(heightMapStaticImageInfo, intendedNextSubmit))
 				m_logger->log("Grid DTM height map texture unavailable!", ILogger::ELL_ERROR);
 			drawResourcesFiller.drawGridDTM(topLeft, worldSpaceExtents, HeightMapCellWidth, heightMapTextureID,  dtmInfo, intendedNextSubmit);
 
diff --git a/62_CAD/shaders/globals.hlsl b/62_CAD/shaders/globals.hlsl
index 2361de5e2..cae5210b8 100644
--- a/62_CAD/shaders/globals.hlsl
+++ b/62_CAD/shaders/globals.hlsl
@@ -279,16 +279,20 @@ static void setDiagonalModeBit(float* data, E_CELL_DIAGONAL diagonalMode)
         return;
 
     uint32_t dataAsUint = reinterpret_cast<uint32_t&>(*data);
+    constexpr uint32_t HEIGHT_VALUE_MASK = 0xFFFFFFFEu;
+    dataAsUint &= HEIGHT_VALUE_MASK;
     dataAsUint |= static_cast<uint32_t>(diagonalMode);
     *data = reinterpret_cast<float&>(dataAsUint);
+
+    uint32_t dataAsUintDbg = reinterpret_cast<uint32_t&>(*data);
 }
 
 #endif
 
 // Top left corner holds diagonal mode info of a cell 
-static E_CELL_DIAGONAL getDiagonalModeFromCellCornerData(float cellCornerData)
+static E_CELL_DIAGONAL getDiagonalModeFromCellCornerData(uint32_t cellCornerData)
 {
-    return (nbl::hlsl::bit_cast<uint32_t, float>(cellCornerData) & 0x1u) ? BOTTOM_LEFT_TO_TOP_RIGHT : TOP_LEFT_TO_BOTTOM_RIGHT;
+    return (cellCornerData & 0x1u) ? BOTTOM_LEFT_TO_TOP_RIGHT : TOP_LEFT_TO_BOTTOM_RIGHT;
 }
 
 static uint32_t packR11G11B10_UNORM(float32_t3 color)
diff --git a/62_CAD/shaders/main_pipeline/common.hlsl b/62_CAD/shaders/main_pipeline/common.hlsl
index 79dbc0bd1..69f9a8ec8 100644
--- a/62_CAD/shaders/main_pipeline/common.hlsl
+++ b/62_CAD/shaders/main_pipeline/common.hlsl
@@ -258,6 +258,7 @@ struct PSInput
 
 [[vk::binding(2, 0)]] SamplerState textureSampler : register(s5);
 [[vk::binding(3, 0)]] Texture2D textures[ImagesBindingArraySize] : register(t5);
+[[vk::binding(3, 0)]] Texture2D<uint32_t> texturesU32[ImagesBindingArraySize] : register(t5);
 
 // Set 1 - Window dependant data which has higher update frequency due to multiple windows and resize need image recreation and descriptor writes
 [[vk::binding(0, 1)]] globallycoherent RWTexture2D<uint> pseudoStencil : register(u0);
diff --git a/62_CAD/shaders/main_pipeline/fragment_shader.hlsl b/62_CAD/shaders/main_pipeline/fragment_shader.hlsl
index 10a2348a7..63ed5e915 100644
--- a/62_CAD/shaders/main_pipeline/fragment_shader.hlsl
+++ b/62_CAD/shaders/main_pipeline/fragment_shader.hlsl
@@ -117,8 +117,10 @@ float32_t4 calculateFinalColor<true>(const uint2 fragCoord, const float localAlp
     return color;
 }
 
-E_CELL_DIAGONAL resolveGridDTMCellDiagonal(in float4 cellHeights)
+E_CELL_DIAGONAL resolveGridDTMCellDiagonal(in uint32_t4 cellData)
 {
+    float4 cellHeights = asfloat(cellData);
+
     const bool4 invalidHeights = bool4(
         isnan(cellHeights.x),
         isnan(cellHeights.y),
@@ -132,24 +134,8 @@ E_CELL_DIAGONAL resolveGridDTMCellDiagonal(in float4 cellHeights)
 
     if (invalidHeightsCount == 0)
     {
-        E_CELL_DIAGONAL a = getDiagonalModeFromCellCornerData(cellHeights.w);
-
-        if (a == TOP_LEFT_TO_BOTTOM_RIGHT)
-        {
-            uint32_t asdf = nbl::hlsl::bit_cast<uint32_t, float>(cellHeights.w);
-            printf("a %f %u", cellHeights.w, asdf);
-        }
-        else if (a == BOTTOM_LEFT_TO_TOP_RIGHT)
-        {
-            uint32_t asdf = nbl::hlsl::bit_cast<uint32_t, float>(cellHeights.w);
-            printf("b %f %u", cellHeights.w, asdf);
-        }
-        else
-        {
-            printf("wtf");
-        }
-
-        return getDiagonalModeFromCellCornerData(cellHeights.w);
+        E_CELL_DIAGONAL a = getDiagonalModeFromCellCornerData(cellData.w);
+        return getDiagonalModeFromCellCornerData(cellData.w);
     }
 
     if (invalidHeightsCount > 1)
@@ -203,6 +189,9 @@ float4 fragMain(PSInput input) : SV_TARGET
         textureColor = dtmColor.rgb;
         localAlpha = dtmColor.a;
 
+        // because final color is premultiplied by alpha
+        textureColor = dtmColor.rgb / dtmColor.a;
+
         gammaUncorrect(textureColor); // want to output to SRGB without gamma correction
         return calculateFinalColor<DeviceConfigCaps::fragmentShaderPixelInterlock>(uint2(input.position.xy), localAlpha, currentMainObjectIdx, textureColor, true);
     }
@@ -487,24 +476,21 @@ float4 fragMain(PSInput input) : SV_TARGET
                 float2 insideCellCoord = gridSpacePos - float2(cellWidth, cellWidth) * cellCoords; // TODO: use fmod instead?
                 
                 const float InvalidHeightValue = asfloat(0x7FC00000);
+                uint32_t4 cellData;
                 float4 cellHeights = float4(InvalidHeightValue, InvalidHeightValue, InvalidHeightValue, InvalidHeightValue);
                 if (textureId != InvalidTextureIndex)
                 {
                     const float2 maxCellCoords = float2(round(gridExtents.x / cellWidth), round(gridExtents.y / cellWidth));
                     const float2 location = (cellCoords + float2(0.5f, 0.5f)) / maxCellCoords;
 
-                    cellHeights = textures[NonUniformResourceIndex(textureId)].Gather(textureSampler, float2(location.x, location.y), 0);
+                    cellData = texturesU32[NonUniformResourceIndex(textureId)].Gather(textureSampler, float2(location.x, location.y), 0);
+                    cellHeights = asfloat(cellData);
                 }
 
 
-                const E_CELL_DIAGONAL cellDiagonal = resolveGridDTMCellDiagonal(cellHeights);
+                const E_CELL_DIAGONAL cellDiagonal = resolveGridDTMCellDiagonal(cellData);
                 const bool diagonalFromTopLeftToBottomRight = cellDiagonal == E_CELL_DIAGONAL::TOP_LEFT_TO_BOTTOM_RIGHT;
 
-                /*if (!diagonalFromTopLeftToBottomRight)
-                    printf("a");
-                else
-                    printf("b");*/
-
                 if (cellDiagonal == E_CELL_DIAGONAL::INVALID)
                     discard;
 
@@ -515,8 +501,6 @@ float4 fragMain(PSInput input) : SV_TARGET
 
                 float2 gridSpaceCellTopLeftCoords = cellCoords * cellWidth;
 
-                //printf("uv = { %f, %f } diagonalTLtoBR = %i triangleA = %i, insiceCellCoords = { %f, %f }", uv.x, uv.y, int(diagonalFromTopLeftToBottomRight), int(triangleA), insideCellCoord.x / cellWidth, insideCellCoord.y / cellWidth);
-
                 if (diagonalFromTopLeftToBottomRight)
                 {
                     v[0] = float3(gridSpaceCellTopLeftCoords.x, gridSpaceCellTopLeftCoords.y, cellHeights.w);
@@ -586,6 +570,9 @@ float4 fragMain(PSInput input) : SV_TARGET
             textureColor = dtmColor.rgb;
             localAlpha = dtmColor.a;
 
+            // because final color is premultiplied by alpha
+            textureColor = dtmColor.rgb / dtmColor.a;
+
             // test out of bounds draw
             /*if (outOfBoundsUV)
                 textureColor = float3(0.0f, 1.0f, 0.0f);
diff --git a/62_CAD/shaders/main_pipeline/vertex_shader.hlsl b/62_CAD/shaders/main_pipeline/vertex_shader.hlsl
index 11c8f8e22..1cc75c570 100644
--- a/62_CAD/shaders/main_pipeline/vertex_shader.hlsl
+++ b/62_CAD/shaders/main_pipeline/vertex_shader.hlsl
@@ -654,8 +654,7 @@ PSInput main(uint vertexID : SV_VertexID)
             float thicknessOfTheThickestLine = vk::RawBufferLoad<float>(globals.pointers.geometryBuffer + drawObj.geometryAddress + 2 * sizeof(pfloat64_t2) + sizeof(uint32_t) + 2u * sizeof(float), 8u);
 
             // for testing purpose
-            //thicknessOfTheThickestLine += 200.0f;
-            thicknessOfTheThickestLine = 0.0f;
+            thicknessOfTheThickestLine += 200.0f;
 
             const float2 corner = float2(bool2(vertexIdx & 0x1u, vertexIdx >> 1));
             worldSpaceExtents.y = ieee754::flipSign(worldSpaceExtents.y);

From b9f3f13bc42167243bdab2edb624ab0d76b8a878 Mon Sep 17 00:00:00 2001
From: kevyuu <kevin.kayu@gmail.com>
Date: Fri, 13 Jun 2025 17:16:05 +0700
Subject: [PATCH 356/529] Fix example to use reworked shader spec itnerface

---
 02_HelloCompute/main.cpp                      |  1 -
 03_DeviceSelectionAndSharedSources/Testers.h  |  5 ++-
 03_DeviceSelectionAndSharedSources/main.cpp   |  5 ++-
 .../main.cpp                                  |  1 -
 07_StagingAndMultipleQueues/main.cpp          |  1 -
 10_CountingSort/main.cpp                      |  5 ++-
 11_FFT/main.cpp                               |  5 ++-
 22_CppCompat/ITester.h                        |  1 -
 22_CppCompat/main.cpp                         |  1 -
 23_ArithmeticUnitTest/main.cpp                |  5 ++-
 24_ColorSpaceTest/main.cpp                    |  3 +-
 26_Blur/main.cpp                              |  3 +-
 27_MPMCScheduler/main.cpp                     |  3 +-
 30_ComputeShaderPathTracer/main.cpp           |  8 ++---
 67_RayQueryGeometry/main.cpp                  |  1 -
 common/include/CGeomtryCreatorScene.hpp       | 34 +++++++++++--------
 16 files changed, 36 insertions(+), 46 deletions(-)

diff --git a/02_HelloCompute/main.cpp b/02_HelloCompute/main.cpp
index 63a9f8832..32812fb1a 100644
--- a/02_HelloCompute/main.cpp
+++ b/02_HelloCompute/main.cpp
@@ -169,7 +169,6 @@ class HelloComputeApp final : public nbl::application_templates::MonoSystemMonoL
 					// Theoretically a blob of SPIR-V can contain multiple named entry points and one has to be chosen, in practice most compilers only support outputting one (and glslang used to require it be called "main")
 					params.shader.entryPoint = "main";
 					params.shader.shader = shader.get();
-					params.shader.stage = hlsl::ESS_COMPUTE;
 					// we'll cover the specialization constant API in another example
 					if (!device->createComputePipelines(nullptr,{&params,1},&pipeline))
 						return logFail("Failed to create pipelines (compile & link shaders)!\n");
diff --git a/03_DeviceSelectionAndSharedSources/Testers.h b/03_DeviceSelectionAndSharedSources/Testers.h
index b21da71c4..9a4016d20 100644
--- a/03_DeviceSelectionAndSharedSources/Testers.h
+++ b/03_DeviceSelectionAndSharedSources/Testers.h
@@ -254,10 +254,9 @@ class PredefinedLayoutTester final : public IntrospectionTesterBase
 		bool pplnCreationSuccess[MERGE_TEST_SHADERS_CNT];
 		for (uint32_t i = 0u; i < MERGE_TEST_SHADERS_CNT; ++i)
 		{
-			IPipelineBase::SShaderSpecInfo specInfo;
+			ICPUPipelineBase::SShaderSpecInfo specInfo;
 			specInfo.entryPoint = "main";
-			specInfo.shader = sources[i].get();
-			specInfo.stage = hlsl::ShaderStage::ESS_COMPUTE;
+			specInfo.shader = sources[i];
 			pplnCreationSuccess[i] = static_cast<bool>(introspector[i].createApproximateComputePipelineFromIntrospection(specInfo, core::smart_refctd_ptr<ICPUPipelineLayout>(predefinedPplnLayout)));
 		}
 
diff --git a/03_DeviceSelectionAndSharedSources/main.cpp b/03_DeviceSelectionAndSharedSources/main.cpp
index 3712b5719..5fb584e4d 100644
--- a/03_DeviceSelectionAndSharedSources/main.cpp
+++ b/03_DeviceSelectionAndSharedSources/main.cpp
@@ -61,10 +61,9 @@ class DeviceSelectionAndSharedSourcesApp final : public application_templates::M
 		//shaderIntrospection->debugPrint(m_logger.get());
 
 		// We've now skipped the manual creation of a descriptor set layout, pipeline layout
-		IPipelineBase::SShaderSpecInfo specInfo;
+		ICPUPipelineBase::SShaderSpecInfo specInfo;
 		specInfo.entryPoint = "main";
-		specInfo.shader = source.get();
-		specInfo.stage = hlsl::ShaderStage::ESS_COMPUTE;
+		specInfo.shader = source;
 
 		smart_refctd_ptr<nbl::asset::ICPUComputePipeline> cpuPipeline = introspector.createApproximateComputePipelineFromIntrospection(specInfo);
 
diff --git a/05_StreamingAndBufferDeviceAddressApp/main.cpp b/05_StreamingAndBufferDeviceAddressApp/main.cpp
index 96ccce9f5..c6c537363 100644
--- a/05_StreamingAndBufferDeviceAddressApp/main.cpp
+++ b/05_StreamingAndBufferDeviceAddressApp/main.cpp
@@ -135,7 +135,6 @@ class StreamingAndBufferDeviceAddressApp final : public application_templates::M
 				params.layout = layout.get();
 				params.shader.shader = shader.get();
 				params.shader.entryPoint = "main";
-				params.shader.stage = hlsl::ShaderStage::ESS_COMPUTE;
 				if (!m_device->createComputePipelines(nullptr,{&params,1},&m_pipeline))
 					return logFail("Failed to create compute pipeline!\n");
 			}
diff --git a/07_StagingAndMultipleQueues/main.cpp b/07_StagingAndMultipleQueues/main.cpp
index 23f2246bc..3e79bdfed 100644
--- a/07_StagingAndMultipleQueues/main.cpp
+++ b/07_StagingAndMultipleQueues/main.cpp
@@ -311,7 +311,6 @@ class StagingAndMultipleQueuesApp final : public application_templates::BasicMul
 			// Theoretically a blob of SPIR-V can contain multiple named entry points and one has to be chosen, in practice most compilers only support outputting one (and glslang used to require it be called "main")
 			params.shader.entryPoint = "main";
 			params.shader.shader = shader.get();
-			params.shader.stage = hlsl::ShaderStage::ESS_COMPUTE;
 			// we'll cover the specialization constant API in another example
 			if (!m_device->createComputePipelines(nullptr,{&params,1},&pipeline))
 				logFailAndTerminate("Failed to create pipelines (compile & link shaders)!\n");
diff --git a/10_CountingSort/main.cpp b/10_CountingSort/main.cpp
index 1fd789ad1..de2ffca8b 100644
--- a/10_CountingSort/main.cpp
+++ b/10_CountingSort/main.cpp
@@ -92,10 +92,9 @@ class CountingSortApp final : public application_templates::MonoDeviceApplicatio
 				params.layout = layout.get();
 				params.shader.shader = prefixSumShader.get();
 				params.shader.entryPoint = "main";
-				params.shader.stage = hlsl::ShaderStage::ESS_COMPUTE;
 				params.shader.entries = nullptr;
-				params.shader.requireFullSubgroups = true;
-				params.shader.requiredSubgroupSize = static_cast<IPipelineBase::SShaderSpecInfo::SUBGROUP_SIZE>(5);
+				params.shader.requiredSubgroupSize = static_cast<IPipelineBase::SUBGROUP_SIZE>(5);
+				params.cached.requireFullSubgroups = true;
 				if (!m_device->createComputePipelines(nullptr, { &params,1 }, &prefixSumPipeline))
 					return logFail("Failed to create compute pipeline!\n");
 				params.shader.shader = scatterShader.get();
diff --git a/11_FFT/main.cpp b/11_FFT/main.cpp
index 1cac98b1f..ad9bbfd47 100644
--- a/11_FFT/main.cpp
+++ b/11_FFT/main.cpp
@@ -133,9 +133,8 @@ class FFT_Test final : public application_templates::MonoDeviceApplication, publ
 			params.layout = layout.get();
 			params.shader.shader = shader.get();
 			params.shader.entryPoint = "main";
-			params.shader.stage = hlsl::ESS_COMPUTE;
-			params.shader.requiredSubgroupSize = static_cast<IPipelineBase::SShaderSpecInfo::SUBGROUP_SIZE>(hlsl::findMSB(m_physicalDevice->getLimits().maxSubgroupSize));
-			params.shader.requireFullSubgroups = true;
+			params.shader.requiredSubgroupSize = static_cast<IPipelineBase::SUBGROUP_SIZE>(hlsl::findMSB(m_physicalDevice->getLimits().maxSubgroupSize));
+			params.cached.requireFullSubgroups = true;
 			if (!m_device->createComputePipelines(nullptr, { &params,1 }, &m_pipeline))
 				return logFail("Failed to create compute pipeline!\n");
 		}
diff --git a/22_CppCompat/ITester.h b/22_CppCompat/ITester.h
index 273f51663..32138f198 100644
--- a/22_CppCompat/ITester.h
+++ b/22_CppCompat/ITester.h
@@ -113,7 +113,6 @@ class ITester
             params.layout = m_pplnLayout.get();
             params.shader.entryPoint = "main";
             params.shader.shader = shader.get();
-            params.shader.stage = shaderStage;
             if (!m_device->createComputePipelines(nullptr, { &params,1 }, &m_pipeline))
                 logFail("Failed to create pipelines (compile & link shaders)!\n");
         }
diff --git a/22_CppCompat/main.cpp b/22_CppCompat/main.cpp
index 877831c55..a5a819d49 100644
--- a/22_CppCompat/main.cpp
+++ b/22_CppCompat/main.cpp
@@ -128,7 +128,6 @@ class CompatibilityTest final : public MonoDeviceApplication, public MonoAssetMa
             params.layout = layout.get();
             params.shader.shader = shader.get();
             params.shader.entryPoint = "main";
-            params.shader.stage = hlsl::ShaderStage::ESS_COMPUTE;
             if (!m_device->createComputePipelines(nullptr, { &params,1 }, &m_pipeline))
                 return logFail("Failed to create compute pipeline!\n");
         }
diff --git a/23_ArithmeticUnitTest/main.cpp b/23_ArithmeticUnitTest/main.cpp
index e2d7d3cfe..12b9d3c2d 100644
--- a/23_ArithmeticUnitTest/main.cpp
+++ b/23_ArithmeticUnitTest/main.cpp
@@ -284,11 +284,10 @@ class ArithmeticUnitTestApp final : public application_templates::BasicMultiQueu
 		params.shader = {
 			.shader = shader.get(),
 			.entryPoint = "main",
-			.stage = hlsl::ESS_COMPUTE,
-			.requiredSubgroupSize = static_cast<IPipelineBase::SShaderSpecInfo::SUBGROUP_SIZE>(subgroupSizeLog2),
-			.requireFullSubgroups = true,
+			.requiredSubgroupSize = static_cast<IPipelineBase::SUBGROUP_SIZE>(subgroupSizeLog2),
 			.entries = nullptr,
 		};
+		params.cached.requireFullSubgroups = true;
 		core::smart_refctd_ptr<IGPUComputePipeline> pipeline;
 		if (!m_device->createComputePipelines(m_spirv_isa_cache.get(),{&params,1},&pipeline))
 			return nullptr;
diff --git a/24_ColorSpaceTest/main.cpp b/24_ColorSpaceTest/main.cpp
index 1c23a3f2f..fae93cf45 100644
--- a/24_ColorSpaceTest/main.cpp
+++ b/24_ColorSpaceTest/main.cpp
@@ -260,10 +260,9 @@ class ColorSpaceTestSampleApp final : public examples::SimpleWindowedApplication
 						.size = sizeof(push_constants_t)
 					};
 					auto layout = m_device->createPipelineLayout({ &range,1 }, nullptr, nullptr, nullptr, core::smart_refctd_ptr(dsLayout));
-					const IPipelineBase::SShaderSpecInfo fragSpec = {
+					const IGPUPipelineBase::SShaderSpecInfo fragSpec = {
 						.shader = fragmentShader.get(),
 						.entryPoint = "main",
-						.stage = ESS_FRAGMENT,
 					};
 					m_pipeline = fsTriProtoPPln.createPipeline(fragSpec, layout.get(), scResources->getRenderpass()/*,default is subpass 0*/);
 					if (!m_pipeline)
diff --git a/26_Blur/main.cpp b/26_Blur/main.cpp
index 4910ba5f0..bd4b6dedc 100644
--- a/26_Blur/main.cpp
+++ b/26_Blur/main.cpp
@@ -282,8 +282,7 @@ class BlurApp final : public examples::SimpleWindowedApplication, public applica
 				params.layout = layout.get();
 				params.shader.shader = shader.get();
 				params.shader.entryPoint = "main";
-				params.shader.stage = hlsl::ShaderStage::ESS_COMPUTE;
-				params.shader.requireFullSubgroups = true;
+				params.cached.requireFullSubgroups = true;
 				if (!m_device->createComputePipelines(nullptr, { &params, 1 }, &m_ppln))
 					return logFail("Failed to create Pipeline");
 			}
diff --git a/27_MPMCScheduler/main.cpp b/27_MPMCScheduler/main.cpp
index 03275d114..33768c981 100644
--- a/27_MPMCScheduler/main.cpp
+++ b/27_MPMCScheduler/main.cpp
@@ -115,8 +115,7 @@ class MPMCSchedulerApp final : public examples::SimpleWindowedApplication, publi
 				params.layout = layout.get();
 				params.shader.shader = shader.get();
 				params.shader.entryPoint = "main";
-				params.shader.stage = hlsl::ShaderStage::ESS_COMPUTE;
-				params.shader.requireFullSubgroups = true;
+				params.cached.requireFullSubgroups = true;
 				if (!m_device->createComputePipelines(nullptr, { &params, 1 }, &m_ppln))
 					return logFail("Failed to create Pipeline");
 			}
diff --git a/30_ComputeShaderPathTracer/main.cpp b/30_ComputeShaderPathTracer/main.cpp
index 44a4dd6ef..2fa7bfc0b 100644
--- a/30_ComputeShaderPathTracer/main.cpp
+++ b/30_ComputeShaderPathTracer/main.cpp
@@ -351,10 +351,9 @@ class ComputeShaderPathtracer final : public examples::SimpleWindowedApplication
 						params.layout = ptPipelineLayout.get();
 						params.shader.shader = ptShader.get();
 						params.shader.entryPoint = "main";
-						params.shader.stage = ESS_COMPUTE;
 						params.shader.entries = nullptr;
-						params.shader.requireFullSubgroups = true;
-						params.shader.requiredSubgroupSize = static_cast<IPipelineBase::SShaderSpecInfo::SUBGROUP_SIZE>(5);
+						params.cached.requireFullSubgroups = true;
+						params.shader.requiredSubgroupSize = static_cast<IPipelineBase::SUBGROUP_SIZE>(5);
 						if (!m_device->createComputePipelines(nullptr, { &params, 1 }, m_PTPipelines.data() + index)) {
 							return logFail("Failed to create compute pipeline!\n");
 						}
@@ -373,10 +372,9 @@ class ComputeShaderPathtracer final : public examples::SimpleWindowedApplication
 					if (!fragmentShader)
 						return logFail("Failed to Load and Compile Fragment Shader: lumaMeterShader!");
 
-					const IPipelineBase::SShaderSpecInfo fragSpec = {
+					const IGPUPipelineBase::SShaderSpecInfo fragSpec = {
 						.shader = fragmentShader.get(),
 						.entryPoint = "main",
-						.stage = ESS_FRAGMENT,
 					};
 
 					auto presentLayout = m_device->createPipelineLayout(
diff --git a/67_RayQueryGeometry/main.cpp b/67_RayQueryGeometry/main.cpp
index 4c09da5da..f4fa38aa1 100644
--- a/67_RayQueryGeometry/main.cpp
+++ b/67_RayQueryGeometry/main.cpp
@@ -203,7 +203,6 @@ class RayQueryGeometryApp final : public examples::SimpleWindowedApplication, pu
 				params.layout = pipelineLayout.get();
 				params.shader.shader = shader.get();
 				params.shader.entryPoint = "main";
-				params.shader.stage = ESS_COMPUTE;
 				if (!m_device->createComputePipelines(nullptr, { &params, 1 }, &renderPipeline))
 					return logFail("Failed to create compute pipeline");
 			}
diff --git a/common/include/CGeomtryCreatorScene.hpp b/common/include/CGeomtryCreatorScene.hpp
index 6ffad2c73..8cbce35c4 100644
--- a/common/include/CGeomtryCreatorScene.hpp
+++ b/common/include/CGeomtryCreatorScene.hpp
@@ -824,7 +824,6 @@ class ResourceBuilder
 			{
 				SBlendParams blend;
 				SRasterizationParams rasterization;
-				typename Types::graphics_pipeline_t::SCreationParams pipeline;
 			} params;
 				
 			{
@@ -842,16 +841,6 @@ class ResourceBuilder
 
 			params.rasterization.faceCullingMode = EFCM_NONE;
 			{
-				const IPipelineBase::SShaderSpecInfo info [] =
-				{
-					{.shader = scratch.shaders[inGeometry.shadersType].vertex.get(), .entryPoint = "VSMain", .stage = hlsl::ShaderStage::ESS_VERTEX},
-					{.shader = scratch.shaders[inGeometry.shadersType].fragment.get(), .entryPoint = "PSMain", .stage = hlsl::ShaderStage::ESS_FRAGMENT},
-				};
-
-				params.pipeline.layout = scratch.pipelineLayout.get();
-				params.pipeline.shaders = info;
-				params.pipeline.renderpass = scratch.renderpass.get();
-				params.pipeline.cached = { .vertexInput = inGeometry.data.inputParams, .primitiveAssembly = inGeometry.data.assemblyParams, .rasterization = params.rasterization, .blend = params.blend, .subpassIx = 0u };
 
 				obj.indexCount = inGeometry.data.indexCount;
 				obj.indexType = inGeometry.data.indexType;
@@ -859,11 +848,28 @@ class ResourceBuilder
 				// TODO: cache pipeline & try lookup for existing one first maybe
 
 				// similar issue like with shaders again, in this case gpu contructor allows for extra cache parameters + there is no constructor you can use to fire make_smart_refctd_ptr yourself for cpu
-				if constexpr (withAssetConverter)
-					obj.pipeline = ICPUGraphicsPipeline::create(params.pipeline);
+				if constexpr (withAssetConverter) {
+
+					obj.pipeline = ICPUGraphicsPipeline::create(scratch.pipelineLayout.get(), scratch.renderpass.get());
+					obj.pipeline->getCachedCreationParams() = {
+            .vertexInput = inGeometry.data.inputParams, 
+						.primitiveAssembly = inGeometry.data.assemblyParams, 
+						.rasterization = params.rasterization, 
+						.blend = params.blend, 
+						.subpassIx = 0u 
+					};
+					*obj.pipeline->getSpecInfo(hlsl::ESS_VERTEX) = { .shader = scratch.shaders[inGeometry.shadersType].vertex, .entryPoint = "VSMain" };
+					*obj.pipeline->getSpecInfo(hlsl::ESS_FRAGMENT) = { .shader = scratch.shaders[inGeometry.shadersType].fragment, .entryPoint = "PSMain" };
+				}
 				else
 				{
-					const std::array<const IGPUGraphicsPipeline::SCreationParams,1> info = { { params.pipeline } };
+					IGPUGraphicsPipeline::SCreationParams createParams = {};
+          createParams.layout = scratch.pipelineLayout.get();
+          createParams.vertexShader = {.shader = scratch.shaders[inGeometry.shadersType].vertex.get(), .entryPoint = "VSMain" };
+          createParams.fragmentShader = { .shader = scratch.shaders[inGeometry.shadersType].fragment.get(), .entryPoint = "PSMain" };
+          createParams.renderpass = scratch.renderpass.get();
+          createParams.cached = { .vertexInput = inGeometry.data.inputParams, .primitiveAssembly = inGeometry.data.assemblyParams, .rasterization = params.rasterization, .blend = params.blend, .subpassIx = 0u };
+					const std::array<const IGPUGraphicsPipeline::SCreationParams,1> info = { { createParams } };
 					utilities->getLogicalDevice()->createGraphicsPipelines(nullptr, info, &obj.pipeline);
 				}
 

From 60319f442bacf210404035cc5daef042f169d2ff Mon Sep 17 00:00:00 2001
From: kevyuu <kevin.kayu@gmail.com>
Date: Fri, 13 Jun 2025 17:28:21 +0700
Subject: [PATCH 357/529] Fix picking the wrong diff when merging with master

---
 67_RayQueryGeometry/main.cpp | 830 ++++++++++++++---------------------
 1 file changed, 330 insertions(+), 500 deletions(-)

diff --git a/67_RayQueryGeometry/main.cpp b/67_RayQueryGeometry/main.cpp
index f4fa38aa1..7371cf1ea 100644
--- a/67_RayQueryGeometry/main.cpp
+++ b/67_RayQueryGeometry/main.cpp
@@ -1,7 +1,6 @@
 // Copyright (C) 2018-2024 - DevSH Graphics Programming Sp. z O.O.
 // This file is part of the "Nabla Engine".
 // For conditions of distribution and use, see copyright notice in nabla.h
-
 #include "common.hpp"
 
 class RayQueryGeometryApp final : public examples::SimpleWindowedApplication, public application_templates::MonoAssetManagerAndBuiltinResourceApplication
@@ -126,11 +125,8 @@ class RayQueryGeometryApp final : public examples::SimpleWindowedApplication, pu
 
 			auto cQueue = getComputeQueue();
 
-			// create geometry objects
-			if (!createGeometries(gQueue, geometryCreator))
-				return logFail("Could not create geometries from geometry creator");
-
 			// create blas/tlas
+			renderDs = 
 //#define TRY_BUILD_FOR_NGFX // Validation errors on the fake Acquire-Presents, TODO fix
 #ifdef TRY_BUILD_FOR_NGFX
 			// Nsight is special and can't do debugger delay so you can debug your CPU stuff during a capture
@@ -142,11 +138,12 @@ class RayQueryGeometryApp final : public examples::SimpleWindowedApplication, pu
 					std::this_thread::yield();
 			}
 			// Nsight is special and can't capture anything not on the queue that performs the swapchain acquire/release
-			if (!createAccelerationStructures(gQueue))
+			createAccelerationStructureDS(gQueue,geometryCreator);
 #else
-			if (!createAccelerationStructures(cQueue))
+			createAccelerationStructureDS(cQueue,geometryCreator);
 #endif
-				return logFail("Could not create acceleration structures");
+			if (!renderDs)
+				return logFail("Could not create acceleration structures and descriptor set");
 
 			// create pipelines
 			{
@@ -164,67 +161,38 @@ class RayQueryGeometryApp final : public examples::SimpleWindowedApplication, pu
 
 				const auto assets = bundle.getContents();
 				assert(assets.size() == 1);
-				smart_refctd_ptr<IShader> shaderSrc = IAsset::castDown<IShader>(assets[0]);
-				auto shader = m_device->compileShader({ shaderSrc.get() });
+				smart_refctd_ptr<ICPUShader> shaderSrc = IAsset::castDown<ICPUShader>(assets[0]);
+				shaderSrc->setShaderStage(IShader::E_SHADER_STAGE::ESS_COMPUTE);
+				auto shader = m_device->createShader(shaderSrc.get());
 				if (!shader)
 					return logFail("Failed to create shader!");
 
-				// descriptors
-				IGPUDescriptorSetLayout::SBinding bindings[] = {
-					{
-						.binding = 0,
-						.type = asset::IDescriptor::E_TYPE::ET_ACCELERATION_STRUCTURE,
-						.createFlags = IGPUDescriptorSetLayout::SBinding::E_CREATE_FLAGS::ECF_NONE,
-						.stageFlags = asset::IShader::E_SHADER_STAGE::ESS_COMPUTE,
-						.count = 1,
-					},
-					{
-						.binding = 1,
-						.type = asset::IDescriptor::E_TYPE::ET_STORAGE_IMAGE,
-						.createFlags = IGPUDescriptorSetLayout::SBinding::E_CREATE_FLAGS::ECF_NONE,
-						.stageFlags = asset::IShader::E_SHADER_STAGE::ESS_COMPUTE,
-						.count = 1,
-					}
-				};
-				auto descriptorSetLayout = m_device->createDescriptorSetLayout(bindings);
-
-				const std::array<IGPUDescriptorSetLayout*, ICPUPipelineLayout::DESCRIPTOR_SET_COUNT> dsLayoutPtrs = { descriptorSetLayout.get() };
-				renderPool = m_device->createDescriptorPoolForDSLayouts(IDescriptorPool::ECF_UPDATE_AFTER_BIND_BIT, std::span(dsLayoutPtrs.begin(), dsLayoutPtrs.end()));
-				if (!renderPool)
-					return logFail("Could not create descriptor pool");
-				renderDs = renderPool->createDescriptorSet(descriptorSetLayout);
-				if (!renderDs)
-					return logFail("Could not create descriptor set");
-
 				SPushConstantRange pcRange = { .stageFlags = IShader::E_SHADER_STAGE::ESS_COMPUTE, .offset = 0u, .size = sizeof(SPushConstants)};
-				auto pipelineLayout = m_device->createPipelineLayout({ &pcRange, 1 }, smart_refctd_ptr(descriptorSetLayout), nullptr, nullptr, nullptr);
+				auto pipelineLayout = m_device->createPipelineLayout({ &pcRange, 1 }, smart_refctd_ptr<const IGPUDescriptorSetLayout>(renderDs->getLayout()), nullptr, nullptr, nullptr);
 
 				IGPUComputePipeline::SCreationParams params = {};
 				params.layout = pipelineLayout.get();
 				params.shader.shader = shader.get();
-				params.shader.entryPoint = "main";
 				if (!m_device->createComputePipelines(nullptr, { &params, 1 }, &renderPipeline))
 					return logFail("Failed to create compute pipeline");
 			}
 
 			// write descriptors
-			IGPUDescriptorSet::SDescriptorInfo infos[2];
-			infos[0].desc = gpuTlas;
-			infos[1].desc = m_device->createImageView({
-				.flags = IGPUImageView::ECF_NONE,
-				.subUsages = IGPUImage::E_USAGE_FLAGS::EUF_STORAGE_BIT,
-				.image = outHDRImage,
-				.viewType = IGPUImageView::E_TYPE::ET_2D,
-				.format = asset::EF_R16G16B16A16_SFLOAT
-			});
-			if (!infos[1].desc)
-				return logFail("Failed to create image view");
-			infos[1].info.image.imageLayout = IImage::LAYOUT::GENERAL;
-			IGPUDescriptorSet::SWriteDescriptorSet writes[3] = {
-				{.dstSet = renderDs.get(), .binding = 0, .arrayElement = 0, .count = 1, .info = &infos[0]},
-				{.dstSet = renderDs.get(), .binding = 1, .arrayElement = 0, .count = 1, .info = &infos[1]}
-			};
-			m_device->updateDescriptorSets(std::span(writes, 2), {});
+			{
+				IGPUDescriptorSet::SDescriptorInfo info = {};
+				info.desc = m_device->createImageView({
+					.flags = IGPUImageView::ECF_NONE,
+					.subUsages = IGPUImage::E_USAGE_FLAGS::EUF_STORAGE_BIT,
+					.image = outHDRImage,
+					.viewType = IGPUImageView::E_TYPE::ET_2D,
+					.format = asset::EF_R16G16B16A16_SFLOAT
+				});
+				if (!info.desc)
+					return logFail("Failed to create image view");
+				info.info.image.imageLayout = IImage::LAYOUT::GENERAL;
+				const IGPUDescriptorSet::SWriteDescriptorSet write = {.dstSet=renderDs.get(), .binding=1, .arrayElement=0, .count=1, .info=&info};
+				m_device->updateDescriptorSets({&write,1}, {});
+			}
 
 			// camera
 			{
@@ -281,7 +249,6 @@ class RayQueryGeometryApp final : public examples::SimpleWindowedApplication, pu
 			static bool first = true;
 			if (first)
 			{
-				m_api->startCapture();
 				first = false;
 			}
 
@@ -520,82 +487,9 @@ class RayQueryGeometryApp final : public examples::SimpleWindowedApplication, pu
 			return (dim + size - 1) / size;
 		}
 
-		smart_refctd_ptr<IGPUBuffer> createBuffer(IGPUBuffer::SCreationParams& params)
-		{
-			smart_refctd_ptr<IGPUBuffer> buffer;
-			buffer = m_device->createBuffer(std::move(params));
-			auto bufReqs = buffer->getMemoryReqs();
-			bufReqs.memoryTypeBits &= m_physicalDevice->getDeviceLocalMemoryTypeBits();
-			m_device->allocate(bufReqs, buffer.get(), IDeviceMemoryAllocation::EMAF_DEVICE_ADDRESS_BIT);
-
-			return buffer;
-		}
-
-		smart_refctd_ptr<IGPUCommandBuffer> getSingleUseCommandBufferAndBegin(smart_refctd_ptr<IGPUCommandPool> pool)
-		{
-			smart_refctd_ptr<IGPUCommandBuffer> cmdbuf;
-			if (!pool->createCommandBuffers(IGPUCommandPool::BUFFER_LEVEL::PRIMARY, 1u, &cmdbuf))
-				return nullptr;
-
-			cmdbuf->reset(IGPUCommandBuffer::RESET_FLAGS::RELEASE_RESOURCES_BIT);
-			cmdbuf->begin(IGPUCommandBuffer::USAGE::ONE_TIME_SUBMIT_BIT);
-
-			return cmdbuf;
-		}
-
-		void cmdbufSubmitAndWait(smart_refctd_ptr<IGPUCommandBuffer> cmdbuf, CThreadSafeQueueAdapter* queue, uint64_t startValue)
-		{
-			cmdbuf->end();
-
-			uint64_t finishedValue = startValue + 1;
-
-			// submit builds
-			{
-				auto completed = m_device->createSemaphore(startValue);
-
-				std::array<IQueue::SSubmitInfo::SSemaphoreInfo, 1u> signals;
-				{
-					auto& signal = signals.front();
-					signal.value = finishedValue;
-					signal.stageMask = bitflag(PIPELINE_STAGE_FLAGS::ALL_TRANSFER_BITS);
-					signal.semaphore = completed.get();
-				}
-
-				const IQueue::SSubmitInfo::SCommandBufferInfo commandBuffers[1] = { {
-					.cmdbuf = cmdbuf.get()
-				} };
-
-				const IQueue::SSubmitInfo infos[] =
-				{
-					{
-						.waitSemaphores = {},
-						.commandBuffers = commandBuffers,
-						.signalSemaphores = signals
-					}
-				};
-
-				if (queue->submit(infos) != IQueue::RESULT::SUCCESS)
-				{
-					m_logger->log("Failed to submit geometry transfer upload operations!", ILogger::ELL_ERROR);
-					return;
-				}
-
-				const ISemaphore::SWaitInfo info[] =
-				{ {
-					.semaphore = completed.get(),
-					.value = finishedValue
-				} };
-
-				m_device->blockForSemaphores(info);
-			}
-		}
-
-		bool createGeometries(video::CThreadSafeQueueAdapter* queue, const IGeometryCreator* gc)
+		smart_refctd_ptr<IGPUDescriptorSet> createAccelerationStructureDS(video::CThreadSafeQueueAdapter* queue, const IGeometryCreator* gc)
 		{
-			auto pool = m_device->createCommandPool(queue->getFamilyIndex(), IGPUCommandPool::CREATE_FLAGS::RESET_COMMAND_BUFFER_BIT);
-			if (!pool)
-				return logFail("Couldn't create Command Pool for geometry creation!");
-
+			// get geometries in ICPUBuffers
 			std::array<ReferenceObjectCpu, OT_COUNT> objectsCpu;
 			objectsCpu[OT_CUBE] = ReferenceObjectCpu{ .meta = {.type = OT_CUBE, .name = "Cube Mesh" }, .shadersType = GP_BASIC, .data = gc->createCubeMesh(nbl::core::vector3df(1.f, 1.f, 1.f)) };
 			objectsCpu[OT_SPHERE] = ReferenceObjectCpu{ .meta = {.type = OT_SPHERE, .name = "Sphere Mesh" }, .shadersType = GP_BASIC, .data = gc->createSphereMesh(2, 16, 16) };
@@ -606,163 +500,213 @@ class RayQueryGeometryApp final : public examples::SimpleWindowedApplication, pu
 			objectsCpu[OT_CONE] = ReferenceObjectCpu{ .meta = {.type = OT_CONE, .name = "Cone Mesh" }, .shadersType = GP_CONE, .data = gc->createConeMesh(2, 3, 10) };
 			objectsCpu[OT_ICOSPHERE] = ReferenceObjectCpu{ .meta = {.type = OT_ICOSPHERE, .name = "Icosphere Mesh" }, .shadersType = GP_ICO, .data = gc->createIcoSphere(1, 3, true) };
 
-			struct ScratchVIBindings
-			{
-				nbl::asset::SBufferBinding<ICPUBuffer> vertex, index;
-			};
-			std::array<ScratchVIBindings, OT_COUNT> scratchBuffers;
-			//std::array<SGeomInfo, OT_COUNT> geomInfos;
 			auto geomInfoBuffer = ICPUBuffer::create({ OT_COUNT * sizeof(SGeomInfo) });
-			
+
 			SGeomInfo* geomInfos = reinterpret_cast<SGeomInfo*>(geomInfoBuffer->getPointer());
 			const uint32_t byteOffsets[OT_COUNT] = { 18, 24, 24, 20, 20, 24, 16, 12 };	// based on normals data position
 			const uint32_t smoothNormals[OT_COUNT] = { 0, 1, 1, 0, 0, 1, 1, 1 };
 
-			for (uint32_t i = 0; i < objectsCpu.size(); i++)
+			// get ICPUBuffers into ICPUBottomLevelAccelerationStructures
+			std::array<smart_refctd_ptr<ICPUBottomLevelAccelerationStructure>, OT_COUNT> cpuBlas;
+			for (uint32_t i = 0; i < cpuBlas.size(); i++)
 			{
+				auto triangles = make_refctd_dynamic_array<smart_refctd_dynamic_array<ICPUBottomLevelAccelerationStructure::Triangles<ICPUBuffer>>>(1u);
+				auto primitiveCounts = make_refctd_dynamic_array<smart_refctd_dynamic_array<uint32_t>>(1u);
+
+				auto& tri = triangles->front();
+				auto& primCount = primitiveCounts->front();
 				const auto& geom = objectsCpu[i];
-				auto& obj = objectsGpu[i];
-				auto& scratchObj = scratchBuffers[i];
 
-				obj.meta.name = geom.meta.name;
-				obj.meta.type = geom.meta.type;
+				const bool useIndex = geom.data.indexType != EIT_UNKNOWN;
+				const uint32_t vertexStride = geom.data.inputParams.bindings[0].stride;
+				const uint32_t numVertices = (geom.data.bindings[0].buffer->getSize()-geom.data.bindings[0].offset) / vertexStride;
 
-				obj.indexCount = geom.data.indexCount;
-				obj.indexType = geom.data.indexType;
-				obj.vertexStride = geom.data.inputParams.bindings[0].stride;
+				if (useIndex)
+					primCount = geom.data.indexCount / 3;
+				else
+					primCount = numVertices / 3;
 
-				geomInfos[i].indexType = obj.indexType;
-				geomInfos[i].vertexStride = obj.vertexStride;
+				geomInfos[i].indexType = geom.data.indexType;
+				geomInfos[i].vertexStride = vertexStride;
 				geomInfos[i].smoothNormals = smoothNormals[i];
 
-				auto vBuffer = smart_refctd_ptr(geom.data.bindings[0].buffer); // no offset
-				auto vUsage = bitflag(IGPUBuffer::EUF_STORAGE_BUFFER_BIT) | IGPUBuffer::EUF_TRANSFER_DST_BIT | IGPUBuffer::EUF_INLINE_UPDATE_VIA_CMDBUF | 
-					IGPUBuffer::EUF_ACCELERATION_STRUCTURE_BUILD_INPUT_READ_ONLY_BIT | IGPUBuffer::EUF_SHADER_DEVICE_ADDRESS_BIT;
-				obj.bindings.vertex.offset = 0u;
-
-				auto iBuffer = smart_refctd_ptr(geom.data.indexBuffer.buffer); // no offset
-				auto iUsage = bitflag(IGPUBuffer::EUF_STORAGE_BUFFER_BIT) | IGPUBuffer::EUF_TRANSFER_DST_BIT | IGPUBuffer::EUF_INLINE_UPDATE_VIA_CMDBUF |
-					IGPUBuffer::EUF_ACCELERATION_STRUCTURE_BUILD_INPUT_READ_ONLY_BIT | IGPUBuffer::EUF_SHADER_DEVICE_ADDRESS_BIT;
-				obj.bindings.index.offset = 0u;
+				geom.data.bindings[0].buffer->setContentHash(geom.data.bindings[0].buffer->computeContentHash());
+				tri.vertexData[0] = geom.data.bindings[0];
+				if (useIndex)
+				{
+					geom.data.indexBuffer.buffer->setContentHash(geom.data.indexBuffer.buffer->computeContentHash());
+					tri.indexData = geom.data.indexBuffer;
+				}
+				tri.maxVertex = numVertices - 1;
+				tri.vertexStride = vertexStride;
+				tri.vertexFormat = static_cast<E_FORMAT>(geom.data.inputParams.attributes[0].format);
+				tri.indexType = geom.data.indexType;
+				tri.geometryFlags = IGPUBottomLevelAccelerationStructure::GEOMETRY_FLAGS::OPAQUE_BIT;
+
+				auto& blas = cpuBlas[i];
+				blas = make_smart_refctd_ptr<ICPUBottomLevelAccelerationStructure>();
+				blas->setGeometries(std::move(triangles), std::move(primitiveCounts));
+
+				auto blasFlags = bitflag(IGPUBottomLevelAccelerationStructure::BUILD_FLAGS::PREFER_FAST_TRACE_BIT) | IGPUBottomLevelAccelerationStructure::BUILD_FLAGS::ALLOW_COMPACTION_BIT;
+				if (m_physicalDevice->getProperties().limits.rayTracingPositionFetch)
+					blasFlags |= IGPUBottomLevelAccelerationStructure::BUILD_FLAGS::ALLOW_DATA_ACCESS;
+
+				blas->setBuildFlags(blasFlags);
+				blas->setContentHash(blas->computeContentHash());
+			}
 
-				vBuffer->addUsageFlags(vUsage);
-				vBuffer->setContentHash(vBuffer->computeContentHash());
-				scratchObj.vertex = { .offset = 0, .buffer = vBuffer };
+			// get ICPUBottomLevelAccelerationStructure into ICPUTopLevelAccelerationStructure
+			auto geomInstances = make_refctd_dynamic_array<smart_refctd_dynamic_array<ICPUTopLevelAccelerationStructure::PolymorphicInstance>>(OT_COUNT);
+			{
+				uint32_t i = 0;
+				for (auto instance = geomInstances->begin(); instance != geomInstances->end(); instance++, i++)
+				{
+					ICPUTopLevelAccelerationStructure::StaticInstance inst;
+					inst.base.blas = cpuBlas[i];
+					inst.base.flags = static_cast<uint32_t>(IGPUTopLevelAccelerationStructure::INSTANCE_FLAGS::TRIANGLE_FACING_CULL_DISABLE_BIT);
+					inst.base.instanceCustomIndex = i;
+					inst.base.instanceShaderBindingTableRecordOffset = 0;
+					inst.base.mask = 0xFF;
 
-				if (geom.data.indexType != EIT_UNKNOWN)
-					if (iBuffer)
-					{
-						iBuffer->addUsageFlags(iUsage);
-						iBuffer->setContentHash(iBuffer->computeContentHash());
-					}
-				scratchObj.index = { .offset = 0, .buffer = iBuffer };
+					core::matrix3x4SIMD transform;
+					transform.setTranslation(nbl::core::vectorSIMDf(5.f * i, 0, 0, 0));
+					inst.transform = transform;
+					
+					instance->instance = inst;
+				}
 			}
 
-			auto cmdbuf = getSingleUseCommandBufferAndBegin(pool);
-			cmdbuf->beginDebugMarker("Build geometry vertex and index buffers");
+			auto cpuTlas = make_smart_refctd_ptr<ICPUTopLevelAccelerationStructure>();
+			cpuTlas->setInstances(std::move(geomInstances));
+			cpuTlas->setBuildFlags(IGPUTopLevelAccelerationStructure::BUILD_FLAGS::PREFER_FAST_TRACE_BIT);
+			
+			// descriptor set and layout
+			ICPUDescriptorSetLayout::SBinding bindings[] = {
+				{
+					.binding = 0,
+					.type = asset::IDescriptor::E_TYPE::ET_ACCELERATION_STRUCTURE,
+					.createFlags = IDescriptorSetLayoutBase::SBindingBase::E_CREATE_FLAGS::ECF_NONE,
+					.stageFlags = asset::IShader::E_SHADER_STAGE::ESS_COMPUTE,
+					.count = 1,
+				},
+				{
+					.binding = 1,
+					.type = asset::IDescriptor::E_TYPE::ET_STORAGE_IMAGE,
+					.createFlags = IDescriptorSetLayoutBase::SBindingBase::E_CREATE_FLAGS::ECF_NONE,
+					.stageFlags = asset::IShader::E_SHADER_STAGE::ESS_COMPUTE,
+					.count = 1,
+				}
+			};
+			auto descriptorSet = core::make_smart_refctd_ptr<ICPUDescriptorSet>(core::make_smart_refctd_ptr<ICPUDescriptorSetLayout>(bindings));
+			descriptorSet->getDescriptorInfos(IDescriptorSetLayoutBase::CBindingRedirect::binding_number_t{0},IDescriptor::E_TYPE::ET_ACCELERATION_STRUCTURE).front().desc = cpuTlas;
 
+//#define TEST_REBAR_FALLBACK
+			// convert with asset converter
 			smart_refctd_ptr<CAssetConverter> converter = CAssetConverter::create({ .device = m_device.get(), .optimizer = {} });
-			CAssetConverter::SInputs inputs = {};
+			struct MyInputs : CAssetConverter::SInputs
+			{
+#ifndef TEST_REBAR_FALLBACK
+				inline uint32_t constrainMemoryTypeBits(const size_t groupCopyID, const IAsset* canonicalAsset, const blake3_hash_t& contentHash, const IDeviceMemoryBacked* memoryBacked) const override
+				{
+					assert(memoryBacked);
+					return memoryBacked->getObjectType()!=IDeviceMemoryBacked::EOT_BUFFER ? (~0u):rebarMemoryTypes;
+				}
+#endif
+				uint32_t rebarMemoryTypes;
+			} inputs = {};
 			inputs.logger = m_logger.get();
+			inputs.rebarMemoryTypes = m_physicalDevice->getDirectVRAMAccessMemoryTypeBits();
+#ifndef TEST_REBAR_FALLBACK
+			struct MyAllocator final : public IDeviceMemoryAllocator
+			{
+				ILogicalDevice* getDeviceForAllocations() const override {return device;}
 
-			std::array<ICPUBuffer*, OT_COUNT * 2u> tmpBuffers;
+				SAllocation allocate(const SAllocateInfo& info) override
+				{
+					auto retval = device->allocate(info);
+					// map what is mappable by default so ReBAR checks succeed
+					if (retval.isValid() && retval.memory->isMappable())
+						retval.memory->map({.offset=0,.length=info.size});
+					return retval;
+				}
+
+				ILogicalDevice* device;
+			} myalloc;
+			myalloc.device = m_device.get();
+			inputs.allocator = &myalloc;
+#endif
+			
+			CAssetConverter::patch_t<ICPUTopLevelAccelerationStructure> tlasPatch = {};
+			tlasPatch.compactAfterBuild = true;
+			std::array<CAssetConverter::patch_t<ICPUBottomLevelAccelerationStructure>,OT_COUNT> tmpBLASPatches = {};
+			std::array<const ICPUBuffer*, OT_COUNT * 2u> tmpBuffers;
+			std::array<CAssetConverter::patch_t<ICPUBuffer>, OT_COUNT * 2u> tmpBufferPatches;
 			{
+				tmpBLASPatches.front().compactAfterBuild = true;
+				std::fill(tmpBLASPatches.begin(),tmpBLASPatches.end(),tmpBLASPatches.front());
+				//
 				for (uint32_t i = 0; i < objectsCpu.size(); i++)
 				{
-					tmpBuffers[2 * i + 0] = scratchBuffers[i].vertex.buffer.get();
-					tmpBuffers[2 * i + 1] = scratchBuffers[i].index.buffer.get();
+					tmpBuffers[2 * i + 0] = cpuBlas[i]->getTriangleGeometries().front().vertexData[0].buffer.get();
+					tmpBuffers[2 * i + 1] = cpuBlas[i]->getTriangleGeometries().front().indexData.buffer.get();
 				}
-
+				// make sure all buffers are BDA-readable
+				for (auto& patch : tmpBufferPatches)
+					patch.usage |= asset::IBuffer::E_USAGE_FLAGS::EUF_SHADER_DEVICE_ADDRESS_BIT;
+
+				std::get<CAssetConverter::SInputs::asset_span_t<ICPUDescriptorSet>>(inputs.assets) = {&descriptorSet.get(),1};
+				std::get<CAssetConverter::SInputs::asset_span_t<ICPUTopLevelAccelerationStructure>>(inputs.assets) = {&cpuTlas.get(),1};
+				std::get<CAssetConverter::SInputs::patch_span_t<ICPUTopLevelAccelerationStructure>>(inputs.patches) = {&tlasPatch,1};
+				std::get<CAssetConverter::SInputs::asset_span_t<ICPUBottomLevelAccelerationStructure>>(inputs.assets) = {&cpuBlas.data()->get(),cpuBlas.size()};
+				std::get<CAssetConverter::SInputs::patch_span_t<ICPUBottomLevelAccelerationStructure>>(inputs.patches) = tmpBLASPatches;
 				std::get<CAssetConverter::SInputs::asset_span_t<ICPUBuffer>>(inputs.assets) = tmpBuffers;
+				std::get<CAssetConverter::SInputs::patch_span_t<ICPUBuffer>>(inputs.patches) = tmpBufferPatches;
 			}
 
 			auto reservation = converter->reserve(inputs);
-			{
-				auto prepass = [&]<typename asset_type_t>(const auto & references) -> bool
-				{
-					auto objects = reservation.getGPUObjects<asset_type_t>();
-					uint32_t counter = {};
-					for (auto& object : objects)
-					{
-						auto gpu = object.value;
-						auto* reference = references[counter];
 
-						if (reference)
-						{
-							if (!gpu)
-							{
-								m_logger->log("Failed to convert a CPU object to GPU!", ILogger::ELL_ERROR);
-								return false;
-							}
-						}
-						counter++;
-					}
-					return true;
-				};
-
-				prepass.template operator() < ICPUBuffer > (tmpBuffers);
+			constexpr auto XferBufferCount = 2;
+			std::array<smart_refctd_ptr<IGPUCommandBuffer>,XferBufferCount> xferBufs = {};
+			std::array<IQueue::SSubmitInfo::SCommandBufferInfo,XferBufferCount> xferBufInfos = {};
+			{
+				auto pool = m_device->createCommandPool(getTransferUpQueue()->getFamilyIndex(),IGPUCommandPool::CREATE_FLAGS::RESET_COMMAND_BUFFER_BIT | IGPUCommandPool::CREATE_FLAGS::TRANSIENT_BIT);
+				pool->createCommandBuffers(IGPUCommandPool::BUFFER_LEVEL::PRIMARY,xferBufs);
+				xferBufs.front()->begin(IGPUCommandBuffer::USAGE::ONE_TIME_SUBMIT_BIT);
+				for (auto i=0; i<XferBufferCount; i++)
+					xferBufInfos[i].cmdbuf = xferBufs[i].get();
 			}
-
-			// not sure if need this (probably not, originally for transition img view)
-			auto semaphore = m_device->createSemaphore(0u);
-
-			std::array<IQueue::SSubmitInfo::SCommandBufferInfo, 1> cmdbufs = {};
-			cmdbufs.front().cmdbuf = cmdbuf.get();
-
+			auto xferSema = m_device->createSemaphore(0u);
+			xferSema->setObjectDebugName("Transfer Semaphore");
 			SIntendedSubmitInfo transfer = {};
-			transfer.queue = queue;
-			transfer.scratchCommandBuffers = cmdbufs;
+			transfer.queue = getTransferUpQueue();
+			transfer.scratchCommandBuffers = xferBufInfos;
 			transfer.scratchSemaphore = {
-				.semaphore = semaphore.get(),
+				.semaphore = xferSema.get(),
 				.value = 0u,
 				.stageMask = PIPELINE_STAGE_FLAGS::ALL_TRANSFER_BITS
 			};
-			// convert
-			{
-				CAssetConverter::SConvertParams params = {};
-				params.utilities = m_utils.get();
-				params.transfer = &transfer;
-
-				auto future = reservation.convert(params);
-				if (future.copy() != IQueue::RESULT::SUCCESS)
-				{
-					m_logger->log("Failed to await submission feature!", ILogger::ELL_ERROR);
-					return false;
-				}
-
-				// assign gpu objects to output
-				auto&& buffers = reservation.getGPUObjects<ICPUBuffer>();
-				for (uint32_t i = 0; i < objectsCpu.size(); i++)
-				{
-					auto& obj = objectsGpu[i];
-					obj.bindings.vertex = { .offset = 0, .buffer = buffers[2 * i + 0].value };
-					obj.bindings.index = { .offset = 0, .buffer = buffers[2 * i + 1].value };
-
-					geomInfos[i].vertexBufferAddress = obj.bindings.vertex.buffer->getDeviceAddress() + byteOffsets[i];
-					geomInfos[i].indexBufferAddress = obj.useIndex() ? obj.bindings.index.buffer->getDeviceAddress() : geomInfos[i].vertexBufferAddress;
-				}
-			}
-
+			
+			constexpr auto CompBufferCount = 2;
+			std::array<smart_refctd_ptr<IGPUCommandBuffer>,CompBufferCount> compBufs = {};
+			std::array<IQueue::SSubmitInfo::SCommandBufferInfo,CompBufferCount> compBufInfos = {};
 			{
-				IGPUBuffer::SCreationParams params;
-				params.usage = IGPUBuffer::EUF_STORAGE_BUFFER_BIT | IGPUBuffer::EUF_TRANSFER_DST_BIT | IGPUBuffer::EUF_INLINE_UPDATE_VIA_CMDBUF | IGPUBuffer::EUF_SHADER_DEVICE_ADDRESS_BIT;
-				params.size = OT_COUNT * sizeof(SGeomInfo);
-				m_utils->createFilledDeviceLocalBufferOnDedMem(SIntendedSubmitInfo{.queue = queue}, std::move(params), geomInfos).move_into(geometryInfoBuffer);
+				auto pool = m_device->createCommandPool(getComputeQueue()->getFamilyIndex(),IGPUCommandPool::CREATE_FLAGS::RESET_COMMAND_BUFFER_BIT|IGPUCommandPool::CREATE_FLAGS::TRANSIENT_BIT);
+				pool->createCommandBuffers(IGPUCommandPool::BUFFER_LEVEL::PRIMARY,compBufs);
+				compBufs.front()->begin(IGPUCommandBuffer::USAGE::ONE_TIME_SUBMIT_BIT);
+				for (auto i=0; i<CompBufferCount; i++)
+					compBufInfos[i].cmdbuf = compBufs[i].get();
 			}
-
-			return true;
-		}
-
-		bool createAccelerationStructures(video::CThreadSafeQueueAdapter* queue)
-		{
-			IQueryPool::SCreationParams qParams{ .queryCount = OT_COUNT, .queryType = IQueryPool::ACCELERATION_STRUCTURE_COMPACTED_SIZE };
-			smart_refctd_ptr<IQueryPool> queryPool = m_device->createQueryPool(std::move(qParams));
-
-			auto pool = m_device->createCommandPool(queue->getFamilyIndex(), IGPUCommandPool::CREATE_FLAGS::RESET_COMMAND_BUFFER_BIT | IGPUCommandPool::CREATE_FLAGS::TRANSIENT_BIT);
-			if (!pool)
-				return logFail("Couldn't create Command Pool for blas/tlas creation!");
-			
-			m_api->startCapture();
+			auto compSema = m_device->createSemaphore(0u);
+			compSema->setObjectDebugName("Compute Semaphore");
+			SIntendedSubmitInfo compute = {};
+			compute.queue = getComputeQueue();
+			compute.scratchCommandBuffers = compBufInfos;
+			compute.scratchSemaphore = {
+				.semaphore = compSema.get(),
+				.value = 0u,
+				.stageMask = PIPELINE_STAGE_FLAGS::ACCELERATION_STRUCTURE_BUILD_BIT|PIPELINE_STAGE_FLAGS::ACCELERATION_STRUCTURE_COPY_BIT
+			};
+			// convert
 #ifdef TRY_BUILD_FOR_NGFX // NSight is "debugger-challenged" it can't capture anything not happenning "during a frame", so we need to trick it
 			m_currentImageAcquire = m_surface->acquireNextImage();
 			{
@@ -775,274 +719,166 @@ class RayQueryGeometryApp final : public examples::SimpleWindowedApplication, pu
 			}
 			m_currentImageAcquire = m_surface->acquireNextImage();
 #endif
-			size_t totalScratchSize = 0;
-			const auto scratchOffsetAlignment = m_device->getPhysicalDevice()->getLimits().minAccelerationStructureScratchOffsetAlignment;
-
-			// build bottom level ASes
+			m_api->startCapture();
+			auto gQueue = getGraphicsQueue();
 			{
-				IGPUBottomLevelAccelerationStructure::DeviceBuildInfo blasBuildInfos[OT_COUNT];
-				uint32_t primitiveCounts[OT_COUNT];
-				using Geometry = IGPUBottomLevelAccelerationStructure::Triangles<const IGPUBuffer>;
-				Geometry triangles[OT_COUNT];
-				uint32_t scratchSizes[OT_COUNT];
-
-				for (uint32_t i = 0; i < objectsGpu.size(); i++)
+				smart_refctd_ptr<CAssetConverter::SConvertParams::scratch_for_device_AS_build_t> scratchAlloc;
 				{
-					const auto& obj = objectsGpu[i];
-
-					const uint32_t vertexStride = obj.vertexStride;
-					const uint32_t numVertices = obj.bindings.vertex.buffer->getSize() / vertexStride;
-					if (obj.useIndex())
-						primitiveCounts[i] = obj.indexCount / 3;
-					else
-						primitiveCounts[i] = numVertices / 3;
-
-					triangles[i].vertexData[0] = obj.bindings.vertex;
-					triangles[i].indexData = obj.useIndex() ? obj.bindings.index : obj.bindings.vertex;
-					triangles[i].maxVertex = numVertices - 1;
-					triangles[i].vertexStride = vertexStride;
-					triangles[i].vertexFormat = EF_R32G32B32_SFLOAT;
-					triangles[i].indexType = obj.indexType;
-					triangles[i].geometryFlags = IGPUBottomLevelAccelerationStructure::GEOMETRY_FLAGS::OPAQUE_BIT;
-
-					auto blasFlags = bitflag(IGPUBottomLevelAccelerationStructure::BUILD_FLAGS::PREFER_FAST_TRACE_BIT) | IGPUBottomLevelAccelerationStructure::BUILD_FLAGS::ALLOW_COMPACTION_BIT;
-					if (m_physicalDevice->getProperties().limits.rayTracingPositionFetch)
-						blasFlags |= IGPUBottomLevelAccelerationStructure::BUILD_FLAGS::ALLOW_DATA_ACCESS_KHR;
-
-					blasBuildInfos[i].buildFlags = blasFlags;
-					blasBuildInfos[i].geometryCount = 1;	// only 1 geometry object per blas
-					blasBuildInfos[i].srcAS = nullptr;
-					blasBuildInfos[i].dstAS = nullptr;
-					blasBuildInfos[i].triangles = &triangles[i];
-					blasBuildInfos[i].scratch = {};
-
-					ILogicalDevice::AccelerationStructureBuildSizes buildSizes;
-					{
-						const uint32_t maxPrimCount[1] = { primitiveCounts[i] };
-						buildSizes = m_device->getAccelerationStructureBuildSizes(blasFlags, false, std::span<const Geometry>{&triangles[i], 1}, maxPrimCount);
-						if (!buildSizes)
-							return logFail("Failed to get BLAS build sizes");
-					}
-
-					scratchSizes[i] = buildSizes.buildScratchSize;
-					totalScratchSize = core::alignUp(totalScratchSize, scratchOffsetAlignment);
-					totalScratchSize += buildSizes.buildScratchSize;
-
-					{
-						IGPUBuffer::SCreationParams params;
-						params.usage = bitflag(IGPUBuffer::EUF_SHADER_DEVICE_ADDRESS_BIT) | IGPUBuffer::EUF_ACCELERATION_STRUCTURE_STORAGE_BIT;
-						params.size = buildSizes.accelerationStructureSize;
-						smart_refctd_ptr<IGPUBuffer> asBuffer = createBuffer(params);
-
-						IGPUBottomLevelAccelerationStructure::SCreationParams blasParams;
-						blasParams.bufferRange.buffer = asBuffer;
-						blasParams.bufferRange.offset = 0u;
-						blasParams.bufferRange.size = buildSizes.accelerationStructureSize;
-						blasParams.flags = IGPUBottomLevelAccelerationStructure::SCreationParams::FLAGS::NONE;
-						gpuBlas[i] = m_device->createBottomLevelAccelerationStructure(std::move(blasParams));
-						if (!gpuBlas[i])
-							return logFail("Could not create BLAS");
-					}
-				}
-
-				auto cmdbufBlas = getSingleUseCommandBufferAndBegin(pool);
-				cmdbufBlas->beginDebugMarker("Build BLAS");
+					constexpr auto MaxAlignment = 256;
+					constexpr auto MinAllocationSize = 1024;
+					const auto scratchSize = core::alignUp(reservation.getMinASBuildScratchSize(false),MaxAlignment);
+					
+
+					IGPUBuffer::SCreationParams creationParams = {};
+					creationParams.size = scratchSize;
+					creationParams.usage = IGPUBuffer::EUF_ACCELERATION_STRUCTURE_BUILD_INPUT_READ_ONLY_BIT|IGPUBuffer::EUF_SHADER_DEVICE_ADDRESS_BIT|IGPUBuffer::EUF_STORAGE_BUFFER_BIT;
+#ifdef TEST_REBAR_FALLBACK
+					creationParams.usage |= IGPUBuffer::EUF_TRANSFER_DST_BIT;
+					core::unordered_set<uint32_t> sharingSet = {compute.queue->getFamilyIndex(),transfer.queue->getFamilyIndex()};
+					core::vector<uint32_t> sharingIndices(sharingSet.begin(),sharingSet.end());
+					if (sharingIndices.size()>1)
+						creationParams.queueFamilyIndexCount = sharingIndices.size();
+					creationParams.queueFamilyIndices = sharingIndices.data();
+#endif
+					auto scratchBuffer = m_device->createBuffer(std::move(creationParams));
 
-				cmdbufBlas->resetQueryPool(queryPool.get(), 0, objectsGpu.size());
+					auto reqs = scratchBuffer->getMemoryReqs();
+#ifndef TEST_REBAR_FALLBACK
+					reqs.memoryTypeBits &= m_physicalDevice->getDirectVRAMAccessMemoryTypeBits();
+#endif
+					auto allocation = m_device->allocate(reqs,scratchBuffer.get(),IDeviceMemoryAllocation::EMAF_DEVICE_ADDRESS_BIT);
+#ifndef TEST_REBAR_FALLBACK
+					allocation.memory->map({.offset=0,.length=reqs.size});
+#endif
 
-				smart_refctd_ptr<IGPUBuffer> scratchBuffer;
-				{
-					IGPUBuffer::SCreationParams params;
-					params.usage = bitflag(IGPUBuffer::EUF_SHADER_DEVICE_ADDRESS_BIT) | IGPUBuffer::EUF_STORAGE_BUFFER_BIT;
-					params.size = totalScratchSize;
-					scratchBuffer = createBuffer(params);
+					scratchAlloc = make_smart_refctd_ptr<CAssetConverter::SConvertParams::scratch_for_device_AS_build_t>(
+						SBufferRange<video::IGPUBuffer>{0ull,scratchSize,std::move(scratchBuffer)},
+						core::allocator<uint8_t>(),MaxAlignment,MinAllocationSize
+					);
 				}
 
-				uint32_t queryCount = 0;
-				IGPUBottomLevelAccelerationStructure::BuildRangeInfo buildRangeInfos[OT_COUNT];
-				IGPUBottomLevelAccelerationStructure::BuildRangeInfo* pRangeInfos[OT_COUNT];
-				for (uint32_t i = 0; i < objectsGpu.size(); i++)
+				struct MyParams final : CAssetConverter::SConvertParams
 				{
-					blasBuildInfos[i].dstAS = gpuBlas[i].get();
-					blasBuildInfos[i].scratch.buffer = scratchBuffer;
-					if (i == 0)
+					inline uint32_t getFinalOwnerQueueFamily(const IGPUBuffer* buffer, const core::blake3_hash_t& createdFrom) override
 					{
-						blasBuildInfos[i].scratch.offset = 0u;
+						return finalUser;
 					}
-					else
+					inline uint32_t getFinalOwnerQueueFamily(const IGPUAccelerationStructure* image, const core::blake3_hash_t& createdFrom) override
 					{
-						const auto unalignedOffset = blasBuildInfos[i - 1].scratch.offset + scratchSizes[i - 1];
-						blasBuildInfos[i].scratch.offset = core::alignUp(unalignedOffset, scratchOffsetAlignment);
+						return finalUser;
 					}
 
-					buildRangeInfos[i].primitiveCount = primitiveCounts[i];
-					buildRangeInfos[i].primitiveByteOffset = 0u;
-					buildRangeInfos[i].firstVertex = 0u;
-					buildRangeInfos[i].transformByteOffset = 0u;
-
-					pRangeInfos[i] = &buildRangeInfos[i];
-				}
-
-				if (!cmdbufBlas->buildAccelerationStructures({ blasBuildInfos, OT_COUNT }, pRangeInfos))
-					return logFail("Failed to build BLAS");
+					uint8_t finalUser;
+				} params = {};
+				params.utilities = m_utils.get();
+				params.transfer = &transfer;
+				params.compute = &compute;
+				params.scratchForDeviceASBuild = scratchAlloc.get();
+				params.finalUser = gQueue->getFamilyIndex();
 
+				auto future = reservation.convert(params);
+				if (future.copy() != IQueue::RESULT::SUCCESS)
 				{
-					SMemoryBarrier memBarrier;
-					memBarrier.srcStageMask = PIPELINE_STAGE_FLAGS::ACCELERATION_STRUCTURE_BUILD_BIT;
-					memBarrier.srcAccessMask = ACCESS_FLAGS::ACCELERATION_STRUCTURE_WRITE_BIT;
-					memBarrier.dstStageMask = PIPELINE_STAGE_FLAGS::ACCELERATION_STRUCTURE_BUILD_BIT;
-					memBarrier.dstAccessMask = ACCESS_FLAGS::ACCELERATION_STRUCTURE_READ_BIT;
-					cmdbufBlas->pipelineBarrier(E_DEPENDENCY_FLAGS::EDF_NONE, { .memBarriers = {&memBarrier, 1} });
+					m_logger->log("Failed to await submission feature!", ILogger::ELL_ERROR);
+					return {};
 				}
 
-				const IGPUAccelerationStructure* ases[OT_COUNT];
-				for (uint32_t i = 0; i < objectsGpu.size(); i++)
-					ases[i] = gpuBlas[i].get();
-				if (!cmdbufBlas->writeAccelerationStructureProperties({ ases, OT_COUNT }, IQueryPool::ACCELERATION_STRUCTURE_COMPACTED_SIZE,
-					queryPool.get(), queryCount++))
-					return logFail("Failed to write acceleration structure properties!");
-
-				cmdbufBlas->endDebugMarker();
-				cmdbufSubmitAndWait(cmdbufBlas, queue, 39);
-			}
-
-			auto cmdbufCompact = getSingleUseCommandBufferAndBegin(pool);
-			cmdbufCompact->beginDebugMarker("Compact BLAS");
-
-			// compact blas
-			{
-				std::array<size_t, OT_COUNT> asSizes{ 0 };
-				if (!m_device->getQueryPoolResults(queryPool.get(), 0, objectsGpu.size(), asSizes.data(), sizeof(size_t), IQueryPool::WAIT_BIT))
-					return logFail("Could not get query pool results for AS sizes");
-
-				std::array<smart_refctd_ptr<IGPUBottomLevelAccelerationStructure>, OT_COUNT> cleanupBlas;
-				for (uint32_t i = 0; i < objectsGpu.size(); i++)
+				// assign gpu objects to output
+				for (const auto& buffer : reservation.getGPUObjects<ICPUBuffer>())
+					retainedBuffers.push_back(buffer.value);
+				for (uint32_t i = 0; i < objectsCpu.size(); i++)
 				{
-					cleanupBlas[i] = gpuBlas[i];
-					{
-						IGPUBuffer::SCreationParams params;
-						params.usage = bitflag(IGPUBuffer::EUF_SHADER_DEVICE_ADDRESS_BIT) | IGPUBuffer::EUF_ACCELERATION_STRUCTURE_STORAGE_BIT;
-						params.size = asSizes[i];
-						smart_refctd_ptr<IGPUBuffer> asBuffer = createBuffer(params);
-
-						IGPUBottomLevelAccelerationStructure::SCreationParams blasParams;
-						blasParams.bufferRange.buffer = asBuffer;
-						blasParams.bufferRange.offset = 0u;
-						blasParams.bufferRange.size = asSizes[i];
-						blasParams.flags = IGPUBottomLevelAccelerationStructure::SCreationParams::FLAGS::NONE;
-						gpuBlas[i] = m_device->createBottomLevelAccelerationStructure(std::move(blasParams));
-						if (!gpuBlas[i])
-							return logFail("Could not create compacted BLAS");
-					}
+					auto vBuffer = retainedBuffers[2 * i + 0].get();
+					auto iBuffer = retainedBuffers[2 * i + 1].get();
+					const auto& geom = objectsCpu[i];
+					const bool useIndex = geom.data.indexType != EIT_UNKNOWN;
 
-					IGPUBottomLevelAccelerationStructure::CopyInfo copyInfo;
-					copyInfo.src = cleanupBlas[i].get();
-					copyInfo.dst = gpuBlas[i].get();
-					copyInfo.mode = IGPUBottomLevelAccelerationStructure::COPY_MODE::COMPACT;
-					if (!cmdbufCompact->copyAccelerationStructure(copyInfo))
-						return logFail("Failed to copy AS to compact");
+					geomInfos[i].vertexBufferAddress = vBuffer->getDeviceAddress() + byteOffsets[i];
+					geomInfos[i].indexBufferAddress = useIndex ? iBuffer->getDeviceAddress():0x0ull;
 				}
 			}
 
-			cmdbufCompact->endDebugMarker();
-			cmdbufSubmitAndWait(cmdbufCompact, queue, 40);
-
-			auto cmdbufTlas = getSingleUseCommandBufferAndBegin(pool);
-			cmdbufTlas->beginDebugMarker("Build TLAS");
-
-			// build top level AS
+			//
 			{
-				const uint32_t instancesCount = objectsGpu.size();
-				IGPUTopLevelAccelerationStructure::DeviceStaticInstance instances[OT_COUNT];
-				for (uint32_t i = 0; i < instancesCount; i++)
-				{
-					core::matrix3x4SIMD transform;
-					transform.setTranslation(nbl::core::vectorSIMDf(5.f * i, 0, 0, 0));
-					instances[i].base.blas.deviceAddress = gpuBlas[i]->getReferenceForDeviceOperations().deviceAddress;
-					instances[i].base.mask = 0xFF;
-					instances[i].base.instanceCustomIndex = i;
-					instances[i].base.instanceShaderBindingTableRecordOffset = 0;
-					instances[i].base.flags = static_cast<uint32_t>(IGPUTopLevelAccelerationStructure::INSTANCE_FLAGS::TRIANGLE_FACING_CULL_DISABLE_BIT);
-					instances[i].transform = transform;
-				}
-
-				{
-					size_t bufSize = instancesCount * sizeof(IGPUTopLevelAccelerationStructure::DeviceStaticInstance);
-					IGPUBuffer::SCreationParams params;
-					params.usage = bitflag(IGPUBuffer::EUF_ACCELERATION_STRUCTURE_BUILD_INPUT_READ_ONLY_BIT) | IGPUBuffer::EUF_STORAGE_BUFFER_BIT |
-						IGPUBuffer::EUF_INLINE_UPDATE_VIA_CMDBUF | IGPUBuffer::EUF_TRANSFER_DST_BIT | IGPUBuffer::EUF_SHADER_DEVICE_ADDRESS_BIT;
-					params.size = bufSize;
-					instancesBuffer = createBuffer(params);
-
-					SBufferRange<IGPUBuffer> range = { .offset = 0u, .size = bufSize, .buffer = instancesBuffer };
-					cmdbufTlas->updateBuffer(range, instances);
-				}
-
-				// make sure instances upload complete first
-				{
-					SMemoryBarrier memBarrier;
-					memBarrier.srcStageMask = PIPELINE_STAGE_FLAGS::ALL_TRANSFER_BITS;
-					memBarrier.srcAccessMask = ACCESS_FLAGS::TRANSFER_WRITE_BIT;
-					memBarrier.dstStageMask = PIPELINE_STAGE_FLAGS::ACCELERATION_STRUCTURE_BUILD_BIT;
-					memBarrier.dstAccessMask = ACCESS_FLAGS::ACCELERATION_STRUCTURE_WRITE_BIT;
-					cmdbufTlas->pipelineBarrier(E_DEPENDENCY_FLAGS::EDF_NONE, { .memBarriers = {&memBarrier, 1} });
-				}
-
-				auto tlasFlags = bitflag(IGPUTopLevelAccelerationStructure::BUILD_FLAGS::PREFER_FAST_TRACE_BIT);
-
-				IGPUTopLevelAccelerationStructure::DeviceBuildInfo tlasBuildInfo;
-				tlasBuildInfo.buildFlags = tlasFlags;
-				tlasBuildInfo.srcAS = nullptr;
-				tlasBuildInfo.dstAS = nullptr;
-				tlasBuildInfo.instanceData.buffer = instancesBuffer;
-				tlasBuildInfo.instanceData.offset = 0u;
-				tlasBuildInfo.scratch = {};
-
-				auto buildSizes = m_device->getAccelerationStructureBuildSizes(tlasFlags, false, instancesCount);
-				if (!buildSizes)
-					return logFail("Failed to get TLAS build sizes");
+				IGPUBuffer::SCreationParams params;
+				params.usage = IGPUBuffer::EUF_STORAGE_BUFFER_BIT | IGPUBuffer::EUF_TRANSFER_DST_BIT | IGPUBuffer::EUF_SHADER_DEVICE_ADDRESS_BIT;
+				params.size = OT_COUNT * sizeof(SGeomInfo);
+				m_utils->createFilledDeviceLocalBufferOnDedMem(SIntendedSubmitInfo{ .queue = gQueue }, std::move(params), geomInfos).move_into(geometryInfoBuffer);
+			}
 
+			// acquire ownership
+			{
+				smart_refctd_ptr<IGPUCommandBuffer> cmdbuf;
 				{
-					IGPUBuffer::SCreationParams params;
-					params.usage = bitflag(IGPUBuffer::EUF_SHADER_DEVICE_ADDRESS_BIT) | IGPUBuffer::EUF_ACCELERATION_STRUCTURE_STORAGE_BIT;
-					params.size = buildSizes.accelerationStructureSize;
-					smart_refctd_ptr<IGPUBuffer> asBuffer = createBuffer(params);
-
-					IGPUTopLevelAccelerationStructure::SCreationParams tlasParams;
-					tlasParams.bufferRange.buffer = asBuffer;
-					tlasParams.bufferRange.offset = 0u;
-					tlasParams.bufferRange.size = buildSizes.accelerationStructureSize;
-					tlasParams.flags = IGPUTopLevelAccelerationStructure::SCreationParams::FLAGS::NONE;
-					gpuTlas = m_device->createTopLevelAccelerationStructure(std::move(tlasParams));
-					if (!gpuTlas)
-						return logFail("Could not create TLAS");
+					const auto gQFI = gQueue->getFamilyIndex();
+					m_device->createCommandPool(gQFI,IGPUCommandPool::CREATE_FLAGS::TRANSIENT_BIT)->createCommandBuffers(IGPUCommandPool::BUFFER_LEVEL::PRIMARY,{&cmdbuf,1});
+					cmdbuf->begin(IGPUCommandBuffer::USAGE::ONE_TIME_SUBMIT_BIT);
+					{
+						core::vector<IGPUCommandBuffer::SBufferMemoryBarrier<IGPUCommandBuffer::SOwnershipTransferBarrier>> bufBarriers;
+						auto acquireBufferRange = [&bufBarriers](const uint8_t otherQueueFamilyIndex, const SBufferRange<IGPUBuffer>& bufferRange)
+						{
+							bufBarriers.push_back({
+								.barrier = {
+									.dep = {
+										.srcStageMask = PIPELINE_STAGE_FLAGS::NONE,
+										.srcAccessMask = ACCESS_FLAGS::NONE,
+										.dstStageMask = PIPELINE_STAGE_FLAGS::COMPUTE_SHADER_BIT,
+										// we don't care what exactly, uncomplex our code
+										.dstAccessMask = ACCESS_FLAGS::SHADER_READ_BITS
+									},
+									.ownershipOp = IGPUCommandBuffer::SOwnershipTransferBarrier::OWNERSHIP_OP::ACQUIRE,
+									.otherQueueFamilyIndex = otherQueueFamilyIndex
+								},
+								.range = bufferRange
+							});
+						};
+#ifdef TEST_REBAR_FALLBACK
+						if (const auto otherQueueFamilyIndex=transfer.queue->getFamilyIndex(); gQFI!=otherQueueFamilyIndex)
+						for (const auto& buffer : reservation.getGPUObjects<ICPUBuffer>())
+						{
+							const auto& buff = buffer.value;
+							if (buff)
+								acquireBufferRange(otherQueueFamilyIndex,{.offset=0,.size=buff->getSize(),.buffer=buff});
+						}
+#endif
+						if (const auto otherQueueFamilyIndex=compute.queue->getFamilyIndex(); gQFI!=otherQueueFamilyIndex)
+						{
+							auto acquireAS = [&acquireBufferRange,otherQueueFamilyIndex](const IGPUAccelerationStructure* as)
+							{
+								acquireBufferRange(otherQueueFamilyIndex,as->getCreationParams().bufferRange);
+							};
+							for (const auto& blas : reservation.getGPUObjects<ICPUBottomLevelAccelerationStructure>())
+								acquireAS(blas.value.get());
+							acquireAS(reservation.getGPUObjects<ICPUTopLevelAccelerationStructure>().front().value.get());
+						}
+						if (!bufBarriers.empty())
+							cmdbuf->pipelineBarrier(asset::E_DEPENDENCY_FLAGS::EDF_NONE,{.memBarriers={},.bufBarriers=bufBarriers});
+					}
+					cmdbuf->end();
 				}
-
-				smart_refctd_ptr<IGPUBuffer> scratchBuffer;
+				if (!cmdbuf->empty())
 				{
-					IGPUBuffer::SCreationParams params;
-					params.usage = bitflag(IGPUBuffer::EUF_SHADER_DEVICE_ADDRESS_BIT) | IGPUBuffer::EUF_STORAGE_BUFFER_BIT;
-					params.size = buildSizes.buildScratchSize;
-					scratchBuffer = createBuffer(params);
+					const IQueue::SSubmitInfo::SCommandBufferInfo cmdbufInfo = {
+						.cmdbuf = cmdbuf.get()
+					};
+					const IQueue::SSubmitInfo::SSemaphoreInfo signal = {
+						.semaphore = compute.scratchSemaphore.semaphore,
+						.value = compute.getFutureScratchSemaphore().value,
+						.stageMask = asset::PIPELINE_STAGE_FLAGS::ALL_COMMANDS_BITS
+					};
+					auto wait = signal;
+					wait.value--;
+					const IQueue::SSubmitInfo info = {
+						.waitSemaphores = {&wait,1}, // we already waited with the host on the AS build
+						.commandBuffers = {&cmdbufInfo,1},
+						.signalSemaphores = {&signal,1}
+					};
+					if (const auto retval=gQueue->submit({&info,1}); retval!=IQueue::RESULT::SUCCESS)
+						m_logger->log("Failed to transfer ownership with code %d!",system::ILogger::ELL_ERROR,retval);
 				}
-
-				tlasBuildInfo.dstAS = gpuTlas.get();
-				tlasBuildInfo.scratch.buffer = scratchBuffer;
-				tlasBuildInfo.scratch.offset = 0u;
-
-				IGPUTopLevelAccelerationStructure::BuildRangeInfo buildRangeInfo[1u];
-				buildRangeInfo[0].instanceCount = instancesCount;
-				buildRangeInfo[0].instanceByteOffset = 0u;
-				IGPUTopLevelAccelerationStructure::BuildRangeInfo* pRangeInfos;
-				pRangeInfos = &buildRangeInfo[0];
-
-				if (!cmdbufTlas->buildAccelerationStructures({ &tlasBuildInfo, 1 }, pRangeInfos))
-					return logFail("Failed to build TLAS");
 			}
-
-			cmdbufTlas->endDebugMarker();
-			cmdbufSubmitAndWait(cmdbufTlas, queue, 45);
-
+#undef TEST_REBAR_FALLBACK
+			
 #ifdef TRY_BUILD_FOR_NGFX
 			{
 				const IQueue::SSubmitInfo::SSemaphoreInfo acquired[] = { {
@@ -1055,7 +891,7 @@ class RayQueryGeometryApp final : public examples::SimpleWindowedApplication, pu
 #endif
 			m_api->endCapture();
 
-			return true;
+			return reservation.getGPUObjects<ICPUDescriptorSet>().front().value;
 		}
 
 
@@ -1073,18 +909,12 @@ class RayQueryGeometryApp final : public examples::SimpleWindowedApplication, pu
 		Camera camera = Camera(core::vectorSIMDf(0, 0, 0), core::vectorSIMDf(0, 0, 0), core::matrix4SIMD());
 		video::CDumbPresentationOracle oracle;
 
-		std::array<ReferenceObjectGpu, OT_COUNT> objectsGpu;
-
-		std::array<smart_refctd_ptr<IGPUBottomLevelAccelerationStructure>, OT_COUNT> gpuBlas;
-		smart_refctd_ptr<IGPUTopLevelAccelerationStructure> gpuTlas;
-		smart_refctd_ptr<IGPUBuffer> instancesBuffer;
-
 		smart_refctd_ptr<IGPUBuffer> geometryInfoBuffer;
+		core::vector<smart_refctd_ptr<IGPUBuffer>> retainedBuffers;
 		smart_refctd_ptr<IGPUImage> outHDRImage;
 
 		smart_refctd_ptr<IGPUComputePipeline> renderPipeline;
 		smart_refctd_ptr<IGPUDescriptorSet> renderDs;
-		smart_refctd_ptr<IDescriptorPool> renderPool;
 
 		uint16_t gcIndex = {};
 

From fbf674031e2f16b2ee79305094ad3a45b6051c6c Mon Sep 17 00:00:00 2001
From: Erfan Ahmadi <ahmadierfan99@gmail.com>
Date: Fri, 13 Jun 2025 14:05:09 +0400
Subject: [PATCH 358/529] Bringing Francisco's changes to DrawResourcesFiller

---
 62_CAD/DrawResourcesFiller.cpp                | 87 +++++++++++++++----
 62_CAD/DrawResourcesFiller.h                  | 39 ++++++++-
 .../main_pipeline/fragment_shader_debug.hlsl  |  3 -
 3 files changed, 107 insertions(+), 22 deletions(-)

diff --git a/62_CAD/DrawResourcesFiller.cpp b/62_CAD/DrawResourcesFiller.cpp
index ed46600e6..b540d9257 100644
--- a/62_CAD/DrawResourcesFiller.cpp
+++ b/62_CAD/DrawResourcesFiller.cpp
@@ -369,6 +369,71 @@ void DrawResourcesFiller::drawHatch(
 		const float32_t4& color,
 		const HatchFillPattern fillPattern,
 		SIntendedSubmitInfo& intendedNextSubmit)
+{
+	drawHatch_impl(hatch, color, fillPattern, intendedNextSubmit);
+}
+
+void DrawResourcesFiller::drawHatch(const Hatch& hatch, const float32_t4& color, SIntendedSubmitInfo& intendedNextSubmit)
+{
+	drawHatch(hatch, color, HatchFillPattern::SOLID_FILL, intendedNextSubmit);
+}
+
+void DrawResourcesFiller::drawFixedGeometryHatch(
+		const en::nabla2d::Hatch& hatch,
+		const float32_t4& foregroundColor,
+		const float32_t4& backgroundColor,
+		const en::nabla2d::HatchFillPattern fillPattern,
+		const float64_t3x3& transformation,
+		en::nabla2d::TransformationType transformationType, 
+		SIntendedSubmitInfo& intendedNextSubmit)
+{
+	// TODO[Optimization Idea]: don't draw hatch twice, we now have color storage buffer and we can treat rendering hatches like a procedural texture (requires 2 colors so no more abusing of linestyle for hatches)
+
+	// if backgroundColor is visible
+	drawFixedGeometryHatch(hatch, backgroundColor, transformation, transformationType, intendedNextSubmit);
+	// if foregroundColor is visible
+	drawFixedGeometryHatch(hatch, foregroundColor, fillPattern, transformation, transformationType, intendedNextSubmit);
+}
+
+void DrawResourcesFiller::drawFixedGeometryHatch(
+	const Hatch& hatch,
+	const float32_t4& color,
+	const HatchFillPattern fillPattern,
+	const float64_t3x3& transformation,
+	en::nabla2d::TransformationType transformationType,
+	SIntendedSubmitInfo& intendedNextSubmit)
+{
+	if (!activeProjections.empty())
+	{
+		// if there is already an active custom projection, it should be considered into the transformation of the fixed geometry polyline
+		float64_t3x3 newTransformation = nbl::hlsl::mul(activeProjections.back(), transformation);
+		pushCustomProjection(newTransformation);
+	}
+	else
+	{
+		// will be multiplied by the default projection matrix from the left (in shader), no need to consider it here
+		pushCustomProjection(transformation);
+	}
+	drawHatch_impl(hatch, color, fillPattern, intendedNextSubmit, transformationType);
+	popCustomProjection();
+}
+
+void DrawResourcesFiller::drawFixedGeometryHatch(
+	const Hatch& hatch,
+	const float32_t4& color,
+	const float64_t3x3& transformation,
+	en::nabla2d::TransformationType transformationType,
+	SIntendedSubmitInfo& intendedNextSubmit)
+{
+	drawFixedGeometryHatch(hatch, color, HatchFillPattern::SOLID_FILL, transformation, transformationType, intendedNextSubmit);
+}
+
+void DrawResourcesFiller::drawHatch_impl(
+	const Hatch& hatch,
+	const float32_t4& color,
+	const HatchFillPattern fillPattern,
+	SIntendedSubmitInfo& intendedNextSubmit,
+	en::nabla2d::TransformationType transformationType)
 {
 	if (color.a == 0.0f) // not visible
 		return;
@@ -380,26 +445,17 @@ void DrawResourcesFiller::drawHatch(
 		textureIdx = getMSDFIndexFromInputInfo(msdfInfo, intendedNextSubmit);
 		if (textureIdx == InvalidTextureIndex)
 			textureIdx = addMSDFTexture(msdfInfo, getHatchFillPatternMSDF(fillPattern), intendedNextSubmit);
-
-		if (textureIdx == InvalidTextureIndex)
-			m_logger.log("drawHatch: textureIdx returned invalid index", nbl::system::ILogger::ELL_ERROR);
+		_NBL_DEBUG_BREAK_IF(textureIdx == InvalidTextureIndex); // probably getHatchFillPatternMSDF returned nullptr
 	}
 
 	LineStyleInfo lineStyle = {};
 	lineStyle.color = color;
 	lineStyle.screenSpaceLineWidth = nbl::hlsl::bit_cast<float, uint32_t>(textureIdx);
-	
+
 	setActiveLineStyle(lineStyle);
-	beginMainObject(MainObjectType::HATCH);
-	
-	uint32_t mainObjectIdx = acquireActiveMainObjectIndex_SubmitIfNeeded(intendedNextSubmit);
-	if (mainObjectIdx == InvalidMainObjectIdx)
-	{
-		m_logger.log("drawHatch: acquireActiveMainObjectIndex returned invalid index", nbl::system::ILogger::ELL_ERROR);
-		assert(false);
-		return;
-	}
+	beginMainObject(MainObjectType::HATCH, transformationType);
 
+	uint32_t mainObjectIdx = acquireActiveMainObjectIndex_SubmitIfNeeded(intendedNextSubmit);
 	uint32_t currentObjectInSection = 0u; // Object here refers to DrawObject. You can think of it as a Cage.
 	while (currentObjectInSection < hatch.getHatchBoxCount())
 	{
@@ -411,11 +467,6 @@ void DrawResourcesFiller::drawHatch(
 	endMainObject();
 }
 
-void DrawResourcesFiller::drawHatch(const Hatch& hatch, const float32_t4& color, SIntendedSubmitInfo& intendedNextSubmit)
-{
-	drawHatch(hatch, color, HatchFillPattern::SOLID_FILL, intendedNextSubmit);
-}
-
 void DrawResourcesFiller::drawFontGlyph(
 		nbl::ext::TextRendering::FontFace* fontFace,
 		uint32_t glyphIdx,
diff --git a/62_CAD/DrawResourcesFiller.h b/62_CAD/DrawResourcesFiller.h
index dd24ea2e9..1babd7d7a 100644
--- a/62_CAD/DrawResourcesFiller.h
+++ b/62_CAD/DrawResourcesFiller.h
@@ -180,7 +180,7 @@ struct DrawResourcesFiller
 
 	// Must be called at the end of each frame.
 	// right before submitting the main draw that uses the currently queued geometry, images, or other objects/resources.
-	// Registers the semaphore/value that will signal completion of this frame�s draw,
+	// Registers the semaphore/value that will signal completion of this frame�s draw,
 	// This allows future frames to safely deallocate or evict resources used in the current frame by waiting on this signal before reuse or destruction.
 	// `drawSubmitWaitValue` should reference the wait value of the draw submission finishing this frame using the `intendedNextSubmit`; 
 	void markFrameUsageComplete(uint64_t drawSubmitWaitValue);
@@ -227,6 +227,33 @@ struct DrawResourcesFiller
 		const float32_t4& color,
 		SIntendedSubmitInfo& intendedNextSubmit);
 	
+	//! Convinience function for fixed-geometry Hatch with MSDF Pattern and a solid background
+	void drawFixedGeometryHatch(
+		const en::nabla2d::Hatch& hatch,
+		const float32_t4& foregroundColor,
+		const float32_t4& backgroundColor,
+		const en::nabla2d::HatchFillPattern fillPattern,
+		const float64_t3x3& transformation,
+		en::nabla2d::TransformationType transformationType,
+		SIntendedSubmitInfo& intendedNextSubmit);
+
+	// ! Fixed-geometry Hatch with MSDF Pattern
+	void drawFixedGeometryHatch(
+		const Hatch& hatch,
+		const float32_t4& color,
+		const HatchFillPattern fillPattern,
+		const float64_t3x3& transformation,
+		en::nabla2d::TransformationType transformationType,
+		SIntendedSubmitInfo& intendedNextSubmit);
+
+	// ! Solid Fill Fixed-geometry Hatch
+	void drawFixedGeometryHatch(
+		const Hatch& hatch,
+		const float32_t4& color,
+		const float64_t3x3& transformation,
+		en::nabla2d::TransformationType transformationType,
+		SIntendedSubmitInfo& intendedNextSubmit);
+	
 	/// Used by SingleLineText, Issue drawing a font glyph
 	/// WARNING: make sure this function  is called within begin/endMainObject scope
 	void drawFontGlyph(
@@ -616,6 +643,16 @@ struct DrawResourcesFiller
 	*/
 	void determineGeoreferencedImageCreationParams(nbl::asset::IImage::SCreationParams& outImageParams, ImageType& outImageType, const GeoreferencedImageParams& georeferencedImageParams);
 
+	/**
+	 * @brief Used to implement both `drawHatch` and `drawFixedGeometryHatch` without exposing the transformation type parameter
+	*/
+	void drawHatch_impl(
+		const Hatch& hatch,
+		const float32_t4& color,
+		const HatchFillPattern fillPattern,
+		SIntendedSubmitInfo& intendedNextSubmit,
+		en::nabla2d::TransformationType transformationType = en::nabla2d::TransformationType::TT_NORMAL);
+
 	void resetMainObjects()
 	{
 		resourcesCollection.mainObjects.vector.clear();
diff --git a/62_CAD/shaders/main_pipeline/fragment_shader_debug.hlsl b/62_CAD/shaders/main_pipeline/fragment_shader_debug.hlsl
index 7dba46dd0..2955d22fe 100644
--- a/62_CAD/shaders/main_pipeline/fragment_shader_debug.hlsl
+++ b/62_CAD/shaders/main_pipeline/fragment_shader_debug.hlsl
@@ -1,9 +1,6 @@
 struct PSInputDebug
 {
     float4 position : SV_Position;
-    [[vk::location(0)]] float4 color : COLOR;
-    [[vk::location(1)]] nointerpolation float4 start_end : COLOR1;
-    [[vk::location(2)]] nointerpolation uint3 lineWidth_eccentricity_objType : COLOR2;
 };
 
 [shader("pixel")]

From bb0e4fd1a0064ee8e50c65051fbf3bac8e50b460 Mon Sep 17 00:00:00 2001
From: kevyuu <kevin.kayu@gmail.com>
Date: Sat, 14 Jun 2025 09:28:48 +0700
Subject: [PATCH 359/529] Fix merge by using master code

---
 71_RayTracingPipeline/main.cpp | 3442 +++++++++++++++-----------------
 1 file changed, 1602 insertions(+), 1840 deletions(-)

diff --git a/71_RayTracingPipeline/main.cpp b/71_RayTracingPipeline/main.cpp
index c9ee0eafb..42aaa2233 100644
--- a/71_RayTracingPipeline/main.cpp
+++ b/71_RayTracingPipeline/main.cpp
@@ -6,787 +6,778 @@
 #include "nbl/ext/FullScreenTriangle/FullScreenTriangle.h"
 #include "nbl/builtin/hlsl/indirect_commands.hlsl"
 
+
 class RaytracingPipelineApp final : public examples::SimpleWindowedApplication, public application_templates::MonoAssetManagerAndBuiltinResourceApplication
 {
-  using device_base_t = examples::SimpleWindowedApplication;
-  using asset_base_t = application_templates::MonoAssetManagerAndBuiltinResourceApplication;
-  using clock_t = std::chrono::steady_clock;
-
-  constexpr static inline uint32_t WIN_W = 1280, WIN_H = 720;
-  constexpr static inline uint32_t MaxFramesInFlight = 3u;
-  constexpr static inline uint8_t MaxUITextureCount = 1u;
-  constexpr static inline uint32_t NumberOfProceduralGeometries = 5;
-
-  static constexpr const char* s_lightTypeNames[E_LIGHT_TYPE::ELT_COUNT] = {
-    "Directional",
-    "Point",
-    "Spot"
-  };
-
-  constexpr static inline clock_t::duration DisplayImageDuration = std::chrono::milliseconds(900);
-
-  struct ShaderBindingTable
-  {
-    SBufferRange<IGPUBuffer> raygenGroupRange;
-    SBufferRange<IGPUBuffer> hitGroupsRange;
-    uint32_t hitGroupsStride;
-    SBufferRange<IGPUBuffer> missGroupsRange;
-    uint32_t missGroupsStride;
-    SBufferRange<IGPUBuffer> callableGroupsRange;
-    uint32_t callableGroupsStride;
-  };
+	using device_base_t = examples::SimpleWindowedApplication;
+	using asset_base_t = application_templates::MonoAssetManagerAndBuiltinResourceApplication;
+	using clock_t = std::chrono::steady_clock;
+
+	constexpr static inline uint32_t WIN_W = 1280, WIN_H = 720;
+	constexpr static inline uint32_t MaxFramesInFlight = 3u;
+	constexpr static inline uint8_t MaxUITextureCount = 1u;
+	constexpr static inline uint32_t NumberOfProceduralGeometries = 5;
+
+	static constexpr const char* s_lightTypeNames[E_LIGHT_TYPE::ELT_COUNT] = {
+	  "Directional",
+	  "Point",
+	  "Spot"
+	};
+
+	struct ShaderBindingTable
+	{
+		SBufferRange<IGPUBuffer> raygenGroupRange;
+		SBufferRange<IGPUBuffer> hitGroupsRange;
+		uint32_t hitGroupsStride;
+		SBufferRange<IGPUBuffer> missGroupsRange;
+		uint32_t missGroupsStride;
+		SBufferRange<IGPUBuffer> callableGroupsRange;
+		uint32_t callableGroupsStride;
+	};
 
 
 public:
-  inline RaytracingPipelineApp(const path& _localInputCWD, const path& _localOutputCWD, const path& _sharedInputCWD, const path& _sharedOutputCWD)
-    : IApplicationFramework(_localInputCWD, _localOutputCWD, _sharedInputCWD, _sharedOutputCWD)
-  {
-  }
-
-  inline SPhysicalDeviceFeatures getRequiredDeviceFeatures() const override
-  {
-    auto retval = device_base_t::getRequiredDeviceFeatures();
-    retval.rayTracingPipeline = true;
-    retval.accelerationStructure = true;
-    retval.rayQuery = true;
-    return retval;
-  }
-
-  inline SPhysicalDeviceFeatures getPreferredDeviceFeatures() const override
-  {
-    auto retval = device_base_t::getPreferredDeviceFeatures();
-    retval.accelerationStructureHostCommands = true;
-    return retval;
-  }
-
-  inline core::vector<video::SPhysicalDeviceFilter::SurfaceCompatibility> getSurfaces() const override
-  {
-    if (!m_surface)
-    {
-      {
-        auto windowCallback = core::make_smart_refctd_ptr<CEventCallback>(smart_refctd_ptr(m_inputSystem), smart_refctd_ptr(m_logger));
-        IWindow::SCreationParams params = {};
-        params.callback = core::make_smart_refctd_ptr<ISimpleManagedSurface::ICallback>();
-        params.width = WIN_W;
-        params.height = WIN_H;
-        params.x = 32;
-        params.y = 32;
-        params.flags = ui::IWindow::ECF_HIDDEN | IWindow::ECF_BORDERLESS | IWindow::ECF_RESIZABLE;
-        params.windowCaption = "RaytracingPipelineApp";
-        params.callback = windowCallback;
-        const_cast<std::remove_const_t<decltype(m_window)>&>(m_window) = m_winMgr->createWindow(std::move(params));
-      }
-
-      auto surface = CSurfaceVulkanWin32::create(smart_refctd_ptr(m_api), smart_refctd_ptr_static_cast<IWindowWin32>(m_window));
-      const_cast<std::remove_const_t<decltype(m_surface)>&>(m_surface) = CSimpleResizeSurface<ISimpleManagedSurface::ISwapchainResources>::create(std::move(surface));
-    }
-
-    if (m_surface)
-      return { {m_surface->getSurface()/*,EQF_NONE*/} };
-
-    return {};
-  }
-
-  // so that we can use the same queue for asset converter and rendering
-  inline core::vector<queue_req_t> getQueueRequirements() const override
-  {
-    auto reqs = device_base_t::getQueueRequirements();
-    reqs.front().requiredFlags |= IQueue::FAMILY_FLAGS::TRANSFER_BIT;
-    return reqs;
-  }
-
-  inline bool onAppInitialized(smart_refctd_ptr<ISystem>&& system) override
-  {
-    m_inputSystem = make_smart_refctd_ptr<InputSystem>(logger_opt_smart_ptr(smart_refctd_ptr(m_logger)));
-
-    if (!device_base_t::onAppInitialized(smart_refctd_ptr(system)))
-      return false;
-
-    if (!asset_base_t::onAppInitialized(smart_refctd_ptr(system)))
-      return false;
-
-    smart_refctd_ptr<IShaderCompiler::CCache> shaderReadCache = nullptr;
-    smart_refctd_ptr<IShaderCompiler::CCache> shaderWriteCache = core::make_smart_refctd_ptr<IShaderCompiler::CCache>();
-    auto shaderCachePath = localOutputCWD / "main_pipeline_shader_cache.bin";
-
-    {
-        core::smart_refctd_ptr<system::IFile> shaderReadCacheFile;
-        {
-            system::ISystem::future_t<core::smart_refctd_ptr<system::IFile>> future;
-            m_system->createFile(future, shaderCachePath.c_str(), system::IFile::ECF_READ);
-            if (future.wait())
-            {
-                future.acquire().move_into(shaderReadCacheFile);
-                if (shaderReadCacheFile)
-                {
-                    const size_t size = shaderReadCacheFile->getSize();
-                    if (size > 0ull)
-                    {
-                        std::vector<uint8_t> contents(size);
-                        system::IFile::success_t succ;
-                        shaderReadCacheFile->read(succ, contents.data(), 0, size);
-                        if (succ)
-                            shaderReadCache = IShaderCompiler::CCache::deserialize(contents);
-                    }
-                }
-            }
-            else
-                m_logger->log("Failed Openning Shader Cache File.", ILogger::ELL_ERROR);
-        }
-
-    }
-
-    // Load Custom Shader
-    auto loadCompileAndCreateShader = [&](const std::string& relPath) -> smart_refctd_ptr<IShader>
-        {
-            IAssetLoader::SAssetLoadParams lp = {};
-            lp.logger = m_logger.get();
-            lp.workingDirectory = ""; // virtual root
-            auto assetBundle = m_assetMgr->getAsset(relPath, lp);
-            const auto assets = assetBundle.getContents();
-            if (assets.empty())
-            {
-                assert(false);
-                return nullptr;
-            }
-
-            // lets go straight from ICPUSpecializedShader to IGPUSpecializedShader
-            auto sourceRaw = IAsset::castDown<IShader>(assets[0]);
-            if (!sourceRaw)
-            {
-                assert(false);
-                return nullptr;
-            }
-
-            return m_device->compileShader({ sourceRaw.get(), nullptr, shaderReadCache.get(), shaderWriteCache.get() });
-        };
-
-    // load shaders
-    const auto raygenShader = loadCompileAndCreateShader("app_resources/raytrace.rgen.hlsl");
-    const auto closestHitShader = loadCompileAndCreateShader("app_resources/raytrace.rchit.hlsl");
-    const auto proceduralClosestHitShader = loadCompileAndCreateShader("app_resources/raytrace_procedural.rchit.hlsl");
-    const auto intersectionHitShader = loadCompileAndCreateShader("app_resources/raytrace.rint.hlsl");
-    const auto anyHitShaderColorPayload = loadCompileAndCreateShader("app_resources/raytrace.rahit.hlsl");
-    const auto anyHitShaderShadowPayload = loadCompileAndCreateShader("app_resources/raytrace_shadow.rahit.hlsl");
-    const auto missShader = loadCompileAndCreateShader("app_resources/raytrace.rmiss.hlsl");
-    const auto missShadowShader = loadCompileAndCreateShader("app_resources/raytrace_shadow.rmiss.hlsl");
-    const auto directionalLightCallShader = loadCompileAndCreateShader("app_resources/light_directional.rcall.hlsl");
-    const auto pointLightCallShader = loadCompileAndCreateShader("app_resources/light_point.rcall.hlsl");
-    const auto spotLightCallShader = loadCompileAndCreateShader("app_resources/light_spot.rcall.hlsl");
-    const auto fragmentShader = loadCompileAndCreateShader("app_resources/present.frag.hlsl");
-
-    core::smart_refctd_ptr<system::IFile> shaderWriteCacheFile;
-    {
-        system::ISystem::future_t<core::smart_refctd_ptr<system::IFile>> future;
-        m_system->deleteFile(shaderCachePath); // temp solution instead of trimming, to make sure we won't have corrupted json
-        m_system->createFile(future, shaderCachePath.c_str(), system::IFile::ECF_WRITE);
-        if (future.wait())
-        {
-            future.acquire().move_into(shaderWriteCacheFile);
-            if (shaderWriteCacheFile)
-            {
-                auto serializedCache = shaderWriteCache->serialize();
-                if (shaderWriteCacheFile)
-                {
-                    system::IFile::success_t succ;
-                    shaderWriteCacheFile->write(succ, serializedCache->getPointer(), 0, serializedCache->getSize());
-                    if (!succ)
-                        m_logger->log("Failed Writing To Shader Cache File.", ILogger::ELL_ERROR);
-                }
-            }
-            else
-                m_logger->log("Failed Creating Shader Cache File.", ILogger::ELL_ERROR);
-        }
-        else
-            m_logger->log("Failed Creating Shader Cache File.", ILogger::ELL_ERROR);
-    }
-
-    m_semaphore = m_device->createSemaphore(m_realFrameIx);
-    if (!m_semaphore)
-      return logFail("Failed to Create a Semaphore!");
-
-    auto gQueue = getGraphicsQueue();
-
-    // Create renderpass and init surface
-    nbl::video::IGPURenderpass* renderpass;
-    {
-      ISwapchain::SCreationParams swapchainParams = { .surface = smart_refctd_ptr<ISurface>(m_surface->getSurface()) };
-      if (!swapchainParams.deduceFormat(m_physicalDevice))
-        return logFail("Could not choose a Surface Format for the Swapchain!");
-
-      const static IGPURenderpass::SCreationParams::SSubpassDependency dependencies[] =
-      {
-        {
-          .srcSubpass = IGPURenderpass::SCreationParams::SSubpassDependency::External,
-          .dstSubpass = 0,
-          .memoryBarrier =
-          {
-            .srcStageMask = asset::PIPELINE_STAGE_FLAGS::COPY_BIT,
-            .srcAccessMask = asset::ACCESS_FLAGS::TRANSFER_WRITE_BIT,
-            .dstStageMask = asset::PIPELINE_STAGE_FLAGS::COLOR_ATTACHMENT_OUTPUT_BIT,
-            .dstAccessMask = asset::ACCESS_FLAGS::COLOR_ATTACHMENT_WRITE_BIT
-          }
-        },
-        {
-          .srcSubpass = 0,
-          .dstSubpass = IGPURenderpass::SCreationParams::SSubpassDependency::External,
-          .memoryBarrier =
-          {
-            .srcStageMask = asset::PIPELINE_STAGE_FLAGS::COLOR_ATTACHMENT_OUTPUT_BIT,
-            .srcAccessMask = asset::ACCESS_FLAGS::COLOR_ATTACHMENT_WRITE_BIT
-          }
-        },
-        IGPURenderpass::SCreationParams::DependenciesEnd
-      };
-
-      auto scResources = std::make_unique<CDefaultSwapchainFramebuffers>(m_device.get(), swapchainParams.surfaceFormat.format, dependencies);
-      renderpass = scResources->getRenderpass();
-
-      if (!renderpass)
-        return logFail("Failed to create Renderpass!");
-
-      if (!m_surface || !m_surface->init(gQueue, std::move(scResources), swapchainParams.sharedParams))
-        return logFail("Could not create Window & Surface or initialize the Surface!");
-    }
-
-    auto pool = m_device->createCommandPool(gQueue->getFamilyIndex(), IGPUCommandPool::CREATE_FLAGS::RESET_COMMAND_BUFFER_BIT);
-
-    m_converter = CAssetConverter::create({ .device = m_device.get(), .optimizer = {} });
-
-    for (auto i = 0u; i < MaxFramesInFlight; i++)
-    {
-      if (!pool)
-        return logFail("Couldn't create Command Pool!");
-      if (!pool->createCommandBuffers(IGPUCommandPool::BUFFER_LEVEL::PRIMARY, { m_cmdBufs.data() + i, 1 }))
-        return logFail("Couldn't create Command Buffer!");
-    }
-
-    m_winMgr->setWindowSize(m_window.get(), WIN_W, WIN_H);
-    m_surface->recreateSwapchain();
-
-
-    // create output images
-    m_hdrImage = m_device->createImage({
-        {
-          .type = IGPUImage::ET_2D,
-          .samples = ICPUImage::ESCF_1_BIT,
-          .format = EF_R16G16B16A16_SFLOAT,
-          .extent = {WIN_W, WIN_H, 1},
-          .mipLevels = 1,
-          .arrayLayers = 1,
-          .flags = IImage::ECF_NONE,
-          .usage = bitflag(IImage::EUF_STORAGE_BIT) | IImage::EUF_TRANSFER_SRC_BIT | IImage::EUF_SAMPLED_BIT
-        }
-      });
-
-    if (!m_hdrImage || !m_device->allocate(m_hdrImage->getMemoryReqs(), m_hdrImage.get()).isValid())
-      return logFail("Could not create HDR Image");
-
-    m_hdrImageView = m_device->createImageView({
-      .flags = IGPUImageView::ECF_NONE,
-      .subUsages = IGPUImage::E_USAGE_FLAGS::EUF_STORAGE_BIT | IGPUImage::E_USAGE_FLAGS::EUF_SAMPLED_BIT,
-      .image = m_hdrImage,
-      .viewType = IGPUImageView::E_TYPE::ET_2D,
-      .format = asset::EF_R16G16B16A16_SFLOAT
-    });
-
-
-
-    // ray trace pipeline and descriptor set layout setup
-    {
-      const IGPUDescriptorSetLayout::SBinding bindings[] = {
-        {
-          .binding = 0,
-          .type = asset::IDescriptor::E_TYPE::ET_ACCELERATION_STRUCTURE,
-          .createFlags = IGPUDescriptorSetLayout::SBinding::E_CREATE_FLAGS::ECF_NONE,
-          .stageFlags = asset::IShader::E_SHADER_STAGE::ESS_RAYGEN,
-          .count = 1,
-        },
-        {
-          .binding = 1,
-          .type = asset::IDescriptor::E_TYPE::ET_STORAGE_IMAGE,
-          .createFlags = IGPUDescriptorSetLayout::SBinding::E_CREATE_FLAGS::ECF_NONE,
-          .stageFlags = asset::IShader::E_SHADER_STAGE::ESS_RAYGEN,
-          .count = 1,
-        }
-      };
-      const auto descriptorSetLayout = m_device->createDescriptorSetLayout(bindings);
-
-      const std::array<IGPUDescriptorSetLayout*, ICPUPipelineLayout::DESCRIPTOR_SET_COUNT> dsLayoutPtrs = { descriptorSetLayout.get() };
-      m_rayTracingDsPool = m_device->createDescriptorPoolForDSLayouts(IDescriptorPool::ECF_UPDATE_AFTER_BIND_BIT, std::span(dsLayoutPtrs.begin(), dsLayoutPtrs.end()));
-      m_rayTracingDs = m_rayTracingDsPool->createDescriptorSet(descriptorSetLayout);
-
-      const SPushConstantRange pcRange = {
-        .stageFlags = IShader::E_SHADER_STAGE::ESS_ALL_RAY_TRACING,
-        .offset = 0u,
-        .size = sizeof(SPushConstants),
-      };
-      const auto pipelineLayout = m_device->createPipelineLayout({ &pcRange, 1 }, smart_refctd_ptr(descriptorSetLayout), nullptr, nullptr, nullptr);
-
-      IGPURayTracingPipeline::SCreationParams params = {};
-
-      enum RtDemoShader
-      {
-        RTDS_RAYGEN,
-        RTDS_MISS,
-        RTDS_MISS_SHADOW,
-        RTDS_CLOSEST_HIT,
-        RTDS_SPHERE_CLOSEST_HIT,
-        RTDS_ANYHIT_PRIMARY,
-        RTDS_ANYHIT_SHADOW,
-        RTDS_INTERSECTION,
-        RTDS_DIRECTIONAL_CALL,
-        RTDS_POINT_CALL,
-        RTDS_SPOT_CALL,
-        RTDS_COUNT
-      };
-
-      IPipelineBase::SShaderSpecInfo shaders[RTDS_COUNT];
-      shaders[RTDS_RAYGEN] = {.shader = raygenShader.get(), .entryPoint = "main", .stage = ESS_RAYGEN};
-      shaders[RTDS_MISS] = {.shader = missShader.get(), .entryPoint = "main", .stage = ESS_MISS};
-      shaders[RTDS_MISS_SHADOW] = { .shader = missShadowShader.get(), .entryPoint = "main", .stage = ESS_MISS};
-      shaders[RTDS_CLOSEST_HIT] = {.shader = closestHitShader.get(), .entryPoint = "main", .stage = ESS_CLOSEST_HIT};
-      shaders[RTDS_SPHERE_CLOSEST_HIT] = {.shader = proceduralClosestHitShader.get(), .entryPoint = "main", .stage = ESS_CLOSEST_HIT};
-      shaders[RTDS_ANYHIT_PRIMARY] = {.shader = anyHitShaderColorPayload.get(), .entryPoint = "main", .stage = ESS_ANY_HIT};
-      shaders[RTDS_ANYHIT_SHADOW] = {.shader = anyHitShaderShadowPayload.get(), .entryPoint = "main", .stage = ESS_ANY_HIT};
-      shaders[RTDS_INTERSECTION] = {.shader = intersectionHitShader.get(), .entryPoint = "main", .stage = ESS_INTERSECTION };
-      shaders[RTDS_DIRECTIONAL_CALL] = {.shader = directionalLightCallShader.get(), .entryPoint = "main", .stage = ESS_CALLABLE};
-      shaders[RTDS_POINT_CALL] = {.shader = pointLightCallShader.get(), .entryPoint = "main", .stage = ESS_CALLABLE};
-      shaders[RTDS_SPOT_CALL] = {.shader = spotLightCallShader.get(), .entryPoint = "main", .stage = ESS_CALLABLE};
-
-      params.layout = pipelineLayout.get();
-      params.shaders = std::span(shaders);
-      using RayTracingFlags = IGPURayTracingPipeline::SCreationParams::FLAGS;
-      params.flags = core::bitflag(RayTracingFlags::NO_NULL_MISS_SHADERS) |
-        RayTracingFlags::NO_NULL_INTERSECTION_SHADERS | 
-        RayTracingFlags::NO_NULL_ANY_HIT_SHADERS;
-
-      auto& shaderGroups = params.shaderGroups;
-
-      shaderGroups.raygen = { .index = RTDS_RAYGEN };
-
-      IRayTracingPipelineBase::SGeneralShaderGroup missGroups[EMT_COUNT];
-      missGroups[EMT_PRIMARY] = { .index = RTDS_MISS };
-      missGroups[EMT_OCCLUSION] = { .index = RTDS_MISS_SHADOW };
-      shaderGroups.misses = missGroups;
-
-      auto getHitGroupIndex = [](E_GEOM_TYPE geomType, E_RAY_TYPE rayType)
-        {
-          return geomType * ERT_COUNT + rayType;
-        };
-      IRayTracingPipelineBase::SHitShaderGroup hitGroups[E_RAY_TYPE::ERT_COUNT * E_GEOM_TYPE::EGT_COUNT];
-      hitGroups[getHitGroupIndex(EGT_TRIANGLES, ERT_PRIMARY)] = {
-        .closestHit = RTDS_CLOSEST_HIT,
-        .anyHit = RTDS_ANYHIT_PRIMARY,
-      };
-      hitGroups[getHitGroupIndex(EGT_TRIANGLES, ERT_OCCLUSION)] = {
-        .closestHit = IGPURayTracingPipeline::SGeneralShaderGroup::Unused,
-        .anyHit = RTDS_ANYHIT_SHADOW,
-      };
-      hitGroups[getHitGroupIndex(EGT_PROCEDURAL, ERT_PRIMARY)] = {
-        .closestHit = RTDS_SPHERE_CLOSEST_HIT,
-        .anyHit = RTDS_ANYHIT_PRIMARY,
-        .intersection = RTDS_INTERSECTION,
-      };
-      hitGroups[getHitGroupIndex(EGT_PROCEDURAL, ERT_OCCLUSION)] = {
-        .closestHit = IGPURayTracingPipeline::SGeneralShaderGroup::Unused,
-        .anyHit = RTDS_ANYHIT_SHADOW,
-        .intersection = RTDS_INTERSECTION,
-      };
-      shaderGroups.hits = hitGroups;
-
-      IRayTracingPipelineBase::SGeneralShaderGroup callableGroups[ELT_COUNT];
-      callableGroups[ELT_DIRECTIONAL] = { .index = RTDS_DIRECTIONAL_CALL };
-      callableGroups[ELT_POINT] = { .index = RTDS_POINT_CALL };
-      callableGroups[ELT_SPOT] = { .index = RTDS_SPOT_CALL };
-      shaderGroups.callables = callableGroups;
-
-      params.cached.maxRecursionDepth = 1;
-      params.cached.dynamicStackSize = true;
-
-      if (!m_device->createRayTracingPipelines(nullptr, { &params, 1 }, &m_rayTracingPipeline))
-        return logFail("Failed to create ray tracing pipeline");
-
-      calculateRayTracingStackSize(m_rayTracingPipeline);
-      
-      if (!createShaderBindingTable(gQueue, m_rayTracingPipeline))
-        return logFail("Could not create shader binding table");
-
-    }
-
-    auto assetManager = make_smart_refctd_ptr<nbl::asset::IAssetManager>(smart_refctd_ptr(system));
-    auto* geometryCreator = assetManager->getGeometryCreator();
-
-    if (!createIndirectBuffer(gQueue))
-      return logFail("Could not create indirect buffer");
-
-    // create geometry objects
-    if (!createGeometries(gQueue, geometryCreator))
-      return logFail("Could not create geometries from geometry creator");
-
-    if (!createAccelerationStructures(getComputeQueue()))
-      return logFail("Could not create acceleration structures");
-
-    ISampler::SParams samplerParams = {
-      .AnisotropicFilter = 0
-    };
-    auto defaultSampler = m_device->createSampler(samplerParams);
-
-    {
-      const IGPUDescriptorSetLayout::SBinding bindings[] = {
-        {
-          .binding = 0u,
-          .type = nbl::asset::IDescriptor::E_TYPE::ET_COMBINED_IMAGE_SAMPLER,
-          .createFlags = ICPUDescriptorSetLayout::SBinding::E_CREATE_FLAGS::ECF_NONE,
-          .stageFlags = IShader::E_SHADER_STAGE::ESS_FRAGMENT,
-          .count = 1u,
-          .immutableSamplers = &defaultSampler
-        }
-      };
-      auto gpuPresentDescriptorSetLayout = m_device->createDescriptorSetLayout(bindings);
-      const video::IGPUDescriptorSetLayout* const layouts[] = { gpuPresentDescriptorSetLayout.get() };
-      const uint32_t setCounts[] = { 1u };
-      m_presentDsPool = m_device->createDescriptorPoolForDSLayouts(IDescriptorPool::E_CREATE_FLAGS::ECF_NONE, layouts, setCounts);
-      m_presentDs = m_presentDsPool->createDescriptorSet(gpuPresentDescriptorSetLayout);
-
-      auto scRes = static_cast<CDefaultSwapchainFramebuffers*>(m_surface->getSwapchainResources());
-      ext::FullScreenTriangle::ProtoPipeline fsTriProtoPPln(m_assetMgr.get(), m_device.get(), m_logger.get());
-      if (!fsTriProtoPPln)
-        return logFail("Failed to create Full Screen Triangle protopipeline or load its vertex shader!");
-
-      const IPipelineBase::SShaderSpecInfo fragSpec = {
-        .shader = fragmentShader.get(),
-        .entryPoint = "main",
-        .stage = ESS_FRAGMENT,
-      };
-
-      auto presentLayout = m_device->createPipelineLayout(
-        {},
-        core::smart_refctd_ptr(gpuPresentDescriptorSetLayout),
-        nullptr,
-        nullptr,
-        nullptr
-      );
-      m_presentPipeline = fsTriProtoPPln.createPipeline(fragSpec, presentLayout.get(), scRes->getRenderpass());
-      if (!m_presentPipeline)
-        return logFail("Could not create Graphics Pipeline!");
-    }
-
-    // write descriptors
-    IGPUDescriptorSet::SDescriptorInfo infos[3];
-    infos[0].desc = m_gpuTlas;
-
-    infos[1].desc = m_hdrImageView;
-    if (!infos[1].desc)
-      return logFail("Failed to create image view");
-    infos[1].info.image.imageLayout = IImage::LAYOUT::GENERAL;
-
-    infos[2].desc = m_hdrImageView;
-    infos[2].info.image.imageLayout = IImage::LAYOUT::READ_ONLY_OPTIMAL;
-
-    IGPUDescriptorSet::SWriteDescriptorSet writes[] = {
-        {.dstSet = m_rayTracingDs.get(), .binding = 0, .arrayElement = 0, .count = 1, .info = &infos[0]},
-        {.dstSet = m_rayTracingDs.get(), .binding = 1, .arrayElement = 0, .count = 1, .info = &infos[1]},
-        {.dstSet = m_presentDs.get(), .binding = 0, .arrayElement = 0, .count = 1, .info = &infos[2] },
-    };
-    m_device->updateDescriptorSets(std::span(writes), {});
-
-    // gui descriptor setup
-    {
-      using binding_flags_t = IGPUDescriptorSetLayout::SBinding::E_CREATE_FLAGS;
-      {
-        IGPUSampler::SParams params;
-        params.AnisotropicFilter = 1u;
-        params.TextureWrapU = ETC_REPEAT;
-        params.TextureWrapV = ETC_REPEAT;
-        params.TextureWrapW = ETC_REPEAT;
-
-        m_ui.samplers.gui = m_device->createSampler(params);
-        m_ui.samplers.gui->setObjectDebugName("Nabla IMGUI UI Sampler");
-      }
-
-      std::array<core::smart_refctd_ptr<IGPUSampler>, 69u> immutableSamplers;
-      for (auto& it : immutableSamplers)
-        it = smart_refctd_ptr(m_ui.samplers.scene);
-
-      immutableSamplers[nbl::ext::imgui::UI::FontAtlasTexId] = smart_refctd_ptr(m_ui.samplers.gui);
-
-      nbl::ext::imgui::UI::SCreationParameters params;
-
-      params.resources.texturesInfo = { .setIx = 0u, .bindingIx = 0u };
-      params.resources.samplersInfo = { .setIx = 0u, .bindingIx = 1u };
-      params.assetManager = m_assetMgr;
-      params.pipelineCache = nullptr;
-      params.pipelineLayout = nbl::ext::imgui::UI::createDefaultPipelineLayout(m_utils->getLogicalDevice(), params.resources.texturesInfo, params.resources.samplersInfo, MaxUITextureCount);
-      params.renderpass = smart_refctd_ptr<IGPURenderpass>(renderpass);
-      params.streamingBuffer = nullptr;
-      params.subpassIx = 0u;
-      params.transfer = getTransferUpQueue();
-      params.utilities = m_utils;
-      {
-        m_ui.manager = ext::imgui::UI::create(std::move(params));
-
-        // note that we use default layout provided by our extension, but you are free to create your own by filling nbl::ext::imgui::UI::S_CREATION_PARAMETERS::resources
-        const auto* descriptorSetLayout = m_ui.manager->getPipeline()->getLayout()->getDescriptorSetLayout(0u);
-        const auto& params = m_ui.manager->getCreationParameters();
-
-        IDescriptorPool::SCreateInfo descriptorPoolInfo = {};
-        descriptorPoolInfo.maxDescriptorCount[static_cast<uint32_t>(asset::IDescriptor::E_TYPE::ET_SAMPLER)] = (uint32_t)nbl::ext::imgui::UI::DefaultSamplerIx::COUNT;
-        descriptorPoolInfo.maxDescriptorCount[static_cast<uint32_t>(asset::IDescriptor::E_TYPE::ET_SAMPLED_IMAGE)] = MaxUITextureCount;
-        descriptorPoolInfo.maxSets = 1u;
-        descriptorPoolInfo.flags = IDescriptorPool::E_CREATE_FLAGS::ECF_UPDATE_AFTER_BIND_BIT;
-
-        m_guiDescriptorSetPool = m_device->createDescriptorPool(std::move(descriptorPoolInfo));
-        assert(m_guiDescriptorSetPool);
-
-        m_guiDescriptorSetPool->createDescriptorSets(1u, &descriptorSetLayout, &m_ui.descriptorSet);
-        assert(m_ui.descriptorSet);
-      }
-    }
-
-    m_ui.manager->registerListener(
-      [this]() -> void {
-        ImGuiIO& io = ImGui::GetIO();
-
-        m_camera.setProjectionMatrix([&]()
-        {
-          static matrix4SIMD projection;
-
-          projection = matrix4SIMD::buildProjectionMatrixPerspectiveFovRH(
-            core::radians(m_cameraSetting.fov), 
-            io.DisplaySize.x / io.DisplaySize.y, 
-            m_cameraSetting.zNear, 
-            m_cameraSetting.zFar);
-
-          return projection;
-        }());
-
-        ImGui::SetNextWindowPos(ImVec2(1024, 100), ImGuiCond_Appearing);
-        ImGui::SetNextWindowSize(ImVec2(256, 256), ImGuiCond_Appearing);
-
-        // create a window and insert the inspector
-        ImGui::SetNextWindowPos(ImVec2(10, 10), ImGuiCond_Appearing);
-        ImGui::SetNextWindowSize(ImVec2(320, 340), ImGuiCond_Appearing);
-        ImGui::Begin("Controls");
-
-        ImGui::SameLine();
-
-        ImGui::Text("Camera");
-
-        ImGui::SliderFloat("Move speed", &m_cameraSetting.moveSpeed, 0.1f, 10.f);
-        ImGui::SliderFloat("Rotate speed", &m_cameraSetting.rotateSpeed, 0.1f, 10.f);
-        ImGui::SliderFloat("Fov", &m_cameraSetting.fov, 20.f, 150.f);
-        ImGui::SliderFloat("zNear", &m_cameraSetting.zNear, 0.1f, 100.f);
-        ImGui::SliderFloat("zFar", &m_cameraSetting.zFar, 110.f, 10000.f);
-        Light m_oldLight = m_light;
-        int light_type = m_light.type;
-        ImGui::ListBox("LightType", &light_type, s_lightTypeNames, ELT_COUNT);
-        m_light.type = static_cast<E_LIGHT_TYPE>(light_type);
-        if (m_light.type == ELT_DIRECTIONAL)
-        {
-          ImGui::SliderFloat3("Light Direction", &m_light.direction.x, -1.f, 1.f);
-        } else if (m_light.type == ELT_POINT)
-        {
-          ImGui::SliderFloat3("Light Position", &m_light.position.x, -20.f, 20.f);
-        } else if (m_light.type == ELT_SPOT)
-        {
-          ImGui::SliderFloat3("Light Direction", &m_light.direction.x, -1.f, 1.f);
-          ImGui::SliderFloat3("Light Position", &m_light.position.x, -20.f, 20.f);
-
-          float32_t dOuterCutoff = hlsl::degrees(acos(m_light.outerCutoff));
-          if (ImGui::SliderFloat("Light Outer Cutoff", &dOuterCutoff, 0.0f, 45.0f))
-          {
-            m_light.outerCutoff = cos(hlsl::radians(dOuterCutoff));
-          }
-        }
-        ImGui::Checkbox("Use Indirect Command", &m_useIndirectCommand);
-        if (m_light != m_oldLight)
-        {
-          m_frameAccumulationCounter = 0;
-        }
-
-        ImGui::Text("X: %f Y: %f", io.MousePos.x, io.MousePos.y);
-
-        ImGui::End();
-      }
-    );
-
-    // Set Camera
-    {
-      core::vectorSIMDf cameraPosition(0, 5, -10);
-      matrix4SIMD proj = matrix4SIMD::buildProjectionMatrixPerspectiveFovRH(
-        core::radians(60.0f),
-        WIN_W / WIN_H,
-        0.01f,
-        500.0f
-      );
-      m_camera = Camera(cameraPosition, core::vectorSIMDf(0, 0, 0), proj);
-    }
-
-    m_winMgr->setWindowSize(m_window.get(), WIN_W, WIN_H);
-    m_surface->recreateSwapchain();
-    m_winMgr->show(m_window.get());
-    m_oracle.reportBeginFrameRecord();
-    m_camera.mapKeysToWASD();
-
-    return true;
-  }
-
-  bool updateGUIDescriptorSet()
-  {
-    // texture atlas, note we don't create info & write pair for the font sampler because UI extension's is immutable and baked into DS layout
-    static std::array<IGPUDescriptorSet::SDescriptorInfo, MaxUITextureCount> descriptorInfo;
-    static IGPUDescriptorSet::SWriteDescriptorSet writes[MaxUITextureCount];
-
-    descriptorInfo[nbl::ext::imgui::UI::FontAtlasTexId].info.image.imageLayout = IImage::LAYOUT::READ_ONLY_OPTIMAL;
-    descriptorInfo[nbl::ext::imgui::UI::FontAtlasTexId].desc = smart_refctd_ptr<IGPUImageView>(m_ui.manager->getFontAtlasView());
-
-    for (uint32_t i = 0; i < descriptorInfo.size(); ++i)
-    {
-      writes[i].dstSet = m_ui.descriptorSet.get();
-      writes[i].binding = 0u;
-      writes[i].arrayElement = i;
-      writes[i].count = 1u;
-    }
-    writes[nbl::ext::imgui::UI::FontAtlasTexId].info = descriptorInfo.data() + nbl::ext::imgui::UI::FontAtlasTexId;
-
-    return m_device->updateDescriptorSets(writes, {});
-  }
-
-  inline void workLoopBody() override
-  {
-    // framesInFlight: ensuring safe execution of command buffers and acquires, `framesInFlight` only affect semaphore waits, don't use this to index your resources because it can change with swapchain recreation.
-    const uint32_t framesInFlight = core::min(MaxFramesInFlight, m_surface->getMaxAcquiresInFlight());
-    // We block for semaphores for 2 reasons here:
-      // A) Resource: Can't use resource like a command buffer BEFORE previous use is finished! [MaxFramesInFlight]
-      // B) Acquire: Can't have more acquires in flight than a certain threshold returned by swapchain or your surface helper class. [MaxAcquiresInFlight]
-    if (m_realFrameIx >= framesInFlight)
-    {
-      const ISemaphore::SWaitInfo cbDonePending[] = 
-      {
-        {
-          .semaphore = m_semaphore.get(),
-          .value = m_realFrameIx + 1 - framesInFlight
-        }
-      };
-      if (m_device->blockForSemaphores(cbDonePending) != ISemaphore::WAIT_RESULT::SUCCESS)
-        return;
-    }
-    const auto resourceIx = m_realFrameIx % MaxFramesInFlight;
-
-    m_api->startCapture();
-
-    update();
-
-    auto queue = getGraphicsQueue();
-    auto cmdbuf = m_cmdBufs[resourceIx].get();
-
-    if (!keepRunning())
-      return;
-
-    cmdbuf->reset(IGPUCommandBuffer::RESET_FLAGS::RELEASE_RESOURCES_BIT);
-    cmdbuf->begin(IGPUCommandBuffer::USAGE::ONE_TIME_SUBMIT_BIT);
-    cmdbuf->beginDebugMarker("RaytracingPipelineApp Frame");
-
-    const auto viewMatrix = m_camera.getViewMatrix();
-    const auto projectionMatrix = m_camera.getProjectionMatrix();
-    const auto viewProjectionMatrix = m_camera.getConcatenatedMatrix();
-
-    core::matrix3x4SIMD modelMatrix;
-    modelMatrix.setTranslation(nbl::core::vectorSIMDf(0, 0, 0, 0));
-    modelMatrix.setRotation(quaternion(0, 0, 0));
-
-    core::matrix4SIMD modelViewProjectionMatrix = core::concatenateBFollowedByA(viewProjectionMatrix, modelMatrix);
-    if (m_cachedModelViewProjectionMatrix != modelViewProjectionMatrix)
-    {
-      m_frameAccumulationCounter = 0;
-      m_cachedModelViewProjectionMatrix = modelViewProjectionMatrix;
-    }
-    core::matrix4SIMD invModelViewProjectionMatrix;
-    modelViewProjectionMatrix.getInverseTransform(invModelViewProjectionMatrix);
-
-    {
-      IGPUCommandBuffer::SPipelineBarrierDependencyInfo::image_barrier_t imageBarriers[1];
-      imageBarriers[0].barrier = {
-         .dep = {
-           .srcStageMask = PIPELINE_STAGE_FLAGS::FRAGMENT_SHADER_BIT, // previous frame read from framgent shader
-           .srcAccessMask = ACCESS_FLAGS::SHADER_READ_BITS,
-           .dstStageMask = PIPELINE_STAGE_FLAGS::RAY_TRACING_SHADER_BIT,
-           .dstAccessMask = ACCESS_FLAGS::SHADER_WRITE_BITS
-        }
-      };
-      imageBarriers[0].image = m_hdrImage.get();
-      imageBarriers[0].subresourceRange = {
-        .aspectMask = IImage::EAF_COLOR_BIT,
-        .baseMipLevel = 0u,
-        .levelCount = 1u,
-        .baseArrayLayer = 0u,
-        .layerCount = 1u
-      };
-      imageBarriers[0].oldLayout = m_frameAccumulationCounter == 0 ? IImage::LAYOUT::UNDEFINED : IImage::LAYOUT::READ_ONLY_OPTIMAL;
-      imageBarriers[0].newLayout = IImage::LAYOUT::GENERAL;
-      cmdbuf->pipelineBarrier(E_DEPENDENCY_FLAGS::EDF_NONE, { .imgBarriers = imageBarriers });
-    }
-
-    // Trace Rays Pass
-    {
-      SPushConstants pc;
-      pc.light = m_light;
-      pc.proceduralGeomInfoBuffer = m_proceduralGeomInfoBuffer->getDeviceAddress();
-      pc.triangleGeomInfoBuffer = m_triangleGeomInfoBuffer->getDeviceAddress();
-      pc.frameCounter = m_frameAccumulationCounter;
-      const core::vector3df camPos = m_camera.getPosition().getAsVector3df();
-      pc.camPos = { camPos.X, camPos.Y, camPos.Z };
-      memcpy(&pc.invMVP, invModelViewProjectionMatrix.pointer(), sizeof(pc.invMVP));
-
-      cmdbuf->bindRayTracingPipeline(m_rayTracingPipeline.get());
-      cmdbuf->setRayTracingPipelineStackSize(m_rayTracingStackSize);
-      cmdbuf->pushConstants(m_rayTracingPipeline->getLayout(), IShader::E_SHADER_STAGE::ESS_ALL_RAY_TRACING, 0, sizeof(SPushConstants), &pc);
-      cmdbuf->bindDescriptorSets(EPBP_RAY_TRACING, m_rayTracingPipeline->getLayout(), 0, 1, &m_rayTracingDs.get());
-      if (m_useIndirectCommand)
-      {
-        cmdbuf->traceRaysIndirect(
-          SBufferBinding<const IGPUBuffer>{
-            .offset = 0,
-            .buffer = m_indirectBuffer,
-          });
-      }else
-      {
-        cmdbuf->traceRays(
-          m_shaderBindingTable.raygenGroupRange,
-          m_shaderBindingTable.missGroupsRange, m_shaderBindingTable.missGroupsStride,
-          m_shaderBindingTable.hitGroupsRange, m_shaderBindingTable.hitGroupsStride,
-          m_shaderBindingTable.callableGroupsRange, m_shaderBindingTable.callableGroupsStride,
-          WIN_W, WIN_H, 1);
-      }
-    }
-
-    // pipeline barrier
-    {
-      IGPUCommandBuffer::SPipelineBarrierDependencyInfo::image_barrier_t imageBarriers[1];
-      imageBarriers[0].barrier = {
-        .dep = {
-          .srcStageMask = PIPELINE_STAGE_FLAGS::RAY_TRACING_SHADER_BIT,
-          .srcAccessMask = ACCESS_FLAGS::SHADER_WRITE_BITS,
-          .dstStageMask = PIPELINE_STAGE_FLAGS::COLOR_ATTACHMENT_OUTPUT_BIT,
-          .dstAccessMask = ACCESS_FLAGS::COLOR_ATTACHMENT_WRITE_BIT
-        }
-      };
-      imageBarriers[0].image = m_hdrImage.get();
-      imageBarriers[0].subresourceRange = {
-        .aspectMask = IImage::EAF_COLOR_BIT,
-        .baseMipLevel = 0u,
-        .levelCount = 1u,
-        .baseArrayLayer = 0u,
-        .layerCount = 1u
-      };
-      imageBarriers[0].oldLayout = IImage::LAYOUT::GENERAL;
-      imageBarriers[0].newLayout = IImage::LAYOUT::READ_ONLY_OPTIMAL;
-
-      cmdbuf->pipelineBarrier(E_DEPENDENCY_FLAGS::EDF_NONE, { .imgBarriers = imageBarriers });
-    }
-
-    {
+	inline RaytracingPipelineApp(const path& _localInputCWD, const path& _localOutputCWD, const path& _sharedInputCWD, const path& _sharedOutputCWD)
+		: IApplicationFramework(_localInputCWD, _localOutputCWD, _sharedInputCWD, _sharedOutputCWD)
+	{
+	}
+
+	inline SPhysicalDeviceFeatures getRequiredDeviceFeatures() const override
+	{
+		auto retval = device_base_t::getRequiredDeviceFeatures();
+		retval.rayTracingPipeline = true;
+		retval.accelerationStructure = true;
+		retval.rayQuery = true;
+		return retval;
+	}
+
+	inline SPhysicalDeviceFeatures getPreferredDeviceFeatures() const override
+	{
+		auto retval = device_base_t::getPreferredDeviceFeatures();
+		retval.accelerationStructureHostCommands = true;
+		return retval;
+	}
+
+	inline core::vector<video::SPhysicalDeviceFilter::SurfaceCompatibility> getSurfaces() const override
+	{
+		if (!m_surface)
+		{
+			{
+				auto windowCallback = core::make_smart_refctd_ptr<CEventCallback>(smart_refctd_ptr(m_inputSystem), smart_refctd_ptr(m_logger));
+				IWindow::SCreationParams params = {};
+				params.callback = core::make_smart_refctd_ptr<ISimpleManagedSurface::ICallback>();
+				params.width = WIN_W;
+				params.height = WIN_H;
+				params.x = 32;
+				params.y = 32;
+				params.flags = ui::IWindow::ECF_HIDDEN | IWindow::ECF_BORDERLESS | IWindow::ECF_RESIZABLE;
+				params.windowCaption = "RaytracingPipelineApp";
+				params.callback = windowCallback;
+				const_cast<std::remove_const_t<decltype(m_window)>&>(m_window) = m_winMgr->createWindow(std::move(params));
+			}
+
+			auto surface = CSurfaceVulkanWin32::create(smart_refctd_ptr(m_api), smart_refctd_ptr_static_cast<IWindowWin32>(m_window));
+			const_cast<std::remove_const_t<decltype(m_surface)>&>(m_surface) = CSimpleResizeSurface<ISimpleManagedSurface::ISwapchainResources>::create(std::move(surface));
+		}
+
+		if (m_surface)
+			return { {m_surface->getSurface()/*,EQF_NONE*/} };
+
+		return {};
+	}
+
+	// so that we can use the same queue for asset converter and rendering
+	inline core::vector<queue_req_t> getQueueRequirements() const override
+	{
+		auto reqs = device_base_t::getQueueRequirements();
+		reqs.front().requiredFlags |= IQueue::FAMILY_FLAGS::COMPUTE_BIT;
+		return reqs;
+	}
+
+	inline bool onAppInitialized(smart_refctd_ptr<ISystem>&& system) override
+	{
+		m_inputSystem = make_smart_refctd_ptr<InputSystem>(logger_opt_smart_ptr(smart_refctd_ptr(m_logger)));
+
+		if (!device_base_t::onAppInitialized(smart_refctd_ptr(system)))
+			return false;
+
+		if (!asset_base_t::onAppInitialized(smart_refctd_ptr(system)))
+			return false;
+
+		smart_refctd_ptr<IShaderCompiler::CCache> shaderReadCache = nullptr;
+		smart_refctd_ptr<IShaderCompiler::CCache> shaderWriteCache = core::make_smart_refctd_ptr<IShaderCompiler::CCache>();
+		auto shaderCachePath = localOutputCWD / "main_pipeline_shader_cache.bin";
+
+		{
+			core::smart_refctd_ptr<system::IFile> shaderReadCacheFile;
+			{
+				system::ISystem::future_t<core::smart_refctd_ptr<system::IFile>> future;
+				m_system->createFile(future, shaderCachePath.c_str(), system::IFile::ECF_READ);
+				if (future.wait())
+				{
+					future.acquire().move_into(shaderReadCacheFile);
+					if (shaderReadCacheFile)
+					{
+						const size_t size = shaderReadCacheFile->getSize();
+						if (size > 0ull)
+						{
+							std::vector<uint8_t> contents(size);
+							system::IFile::success_t succ;
+							shaderReadCacheFile->read(succ, contents.data(), 0, size);
+							if (succ)
+								shaderReadCache = IShaderCompiler::CCache::deserialize(contents);
+						}
+					}
+				}
+				else
+					m_logger->log("Failed Openning Shader Cache File.", ILogger::ELL_ERROR);
+			}
+
+		}
+
+		// Load Custom Shader
+		auto loadCompileAndCreateShader = [&](const std::string& relPath) -> smart_refctd_ptr<IGPUShader>
+			{
+				IAssetLoader::SAssetLoadParams lp = {};
+				lp.logger = m_logger.get();
+				lp.workingDirectory = ""; // virtual root
+				auto assetBundle = m_assetMgr->getAsset(relPath, lp);
+				const auto assets = assetBundle.getContents();
+				if (assets.empty())
+					return nullptr;
+
+				// lets go straight from ICPUSpecializedShader to IGPUSpecializedShader
+				auto sourceRaw = IAsset::castDown<ICPUShader>(assets[0]);
+				if (!sourceRaw)
+					return nullptr;
+
+				return m_device->createShader({ sourceRaw.get(), nullptr, shaderReadCache.get(), shaderWriteCache.get() });
+			};
+
+		// load shaders
+		const auto raygenShader = loadCompileAndCreateShader("app_resources/raytrace.rgen.hlsl");
+		const auto closestHitShader = loadCompileAndCreateShader("app_resources/raytrace.rchit.hlsl");
+		const auto proceduralClosestHitShader = loadCompileAndCreateShader("app_resources/raytrace_procedural.rchit.hlsl");
+		const auto intersectionHitShader = loadCompileAndCreateShader("app_resources/raytrace.rint.hlsl");
+		const auto anyHitShaderColorPayload = loadCompileAndCreateShader("app_resources/raytrace.rahit.hlsl");
+		const auto anyHitShaderShadowPayload = loadCompileAndCreateShader("app_resources/raytrace_shadow.rahit.hlsl");
+		const auto missShader = loadCompileAndCreateShader("app_resources/raytrace.rmiss.hlsl");
+		const auto missShadowShader = loadCompileAndCreateShader("app_resources/raytrace_shadow.rmiss.hlsl");
+		const auto directionalLightCallShader = loadCompileAndCreateShader("app_resources/light_directional.rcall.hlsl");
+		const auto pointLightCallShader = loadCompileAndCreateShader("app_resources/light_point.rcall.hlsl");
+		const auto spotLightCallShader = loadCompileAndCreateShader("app_resources/light_spot.rcall.hlsl");
+		const auto fragmentShader = loadCompileAndCreateShader("app_resources/present.frag.hlsl");
+
+		core::smart_refctd_ptr<system::IFile> shaderWriteCacheFile;
+		{
+			system::ISystem::future_t<core::smart_refctd_ptr<system::IFile>> future;
+			m_system->deleteFile(shaderCachePath); // temp solution instead of trimming, to make sure we won't have corrupted json
+			m_system->createFile(future, shaderCachePath.c_str(), system::IFile::ECF_WRITE);
+			if (future.wait())
+			{
+				future.acquire().move_into(shaderWriteCacheFile);
+				if (shaderWriteCacheFile)
+				{
+					auto serializedCache = shaderWriteCache->serialize();
+					if (shaderWriteCacheFile)
+					{
+						system::IFile::success_t succ;
+						shaderWriteCacheFile->write(succ, serializedCache->getPointer(), 0, serializedCache->getSize());
+						if (!succ)
+							m_logger->log("Failed Writing To Shader Cache File.", ILogger::ELL_ERROR);
+					}
+				}
+				else
+					m_logger->log("Failed Creating Shader Cache File.", ILogger::ELL_ERROR);
+			}
+			else
+				m_logger->log("Failed Creating Shader Cache File.", ILogger::ELL_ERROR);
+		}
+
+		m_semaphore = m_device->createSemaphore(m_realFrameIx);
+		if (!m_semaphore)
+			return logFail("Failed to Create a Semaphore!");
+
+		auto gQueue = getGraphicsQueue();
+
+		// Create renderpass and init surface
+		nbl::video::IGPURenderpass* renderpass;
+		{
+			ISwapchain::SCreationParams swapchainParams = { .surface = smart_refctd_ptr<ISurface>(m_surface->getSurface()) };
+			if (!swapchainParams.deduceFormat(m_physicalDevice))
+				return logFail("Could not choose a Surface Format for the Swapchain!");
+
+			const static IGPURenderpass::SCreationParams::SSubpassDependency dependencies[] =
+			{
+			  {
+				.srcSubpass = IGPURenderpass::SCreationParams::SSubpassDependency::External,
+				.dstSubpass = 0,
+				.memoryBarrier =
+				{
+				  .srcStageMask = asset::PIPELINE_STAGE_FLAGS::COPY_BIT,
+				  .srcAccessMask = asset::ACCESS_FLAGS::TRANSFER_WRITE_BIT,
+				  .dstStageMask = asset::PIPELINE_STAGE_FLAGS::COLOR_ATTACHMENT_OUTPUT_BIT,
+				  .dstAccessMask = asset::ACCESS_FLAGS::COLOR_ATTACHMENT_WRITE_BIT
+				}
+			  },
+			  {
+				.srcSubpass = 0,
+				.dstSubpass = IGPURenderpass::SCreationParams::SSubpassDependency::External,
+				.memoryBarrier =
+				{
+				  .srcStageMask = asset::PIPELINE_STAGE_FLAGS::COLOR_ATTACHMENT_OUTPUT_BIT,
+				  .srcAccessMask = asset::ACCESS_FLAGS::COLOR_ATTACHMENT_WRITE_BIT
+				}
+			  },
+			  IGPURenderpass::SCreationParams::DependenciesEnd
+			};
+
+			auto scResources = std::make_unique<CDefaultSwapchainFramebuffers>(m_device.get(), swapchainParams.surfaceFormat.format, dependencies);
+			renderpass = scResources->getRenderpass();
+
+			if (!renderpass)
+				return logFail("Failed to create Renderpass!");
+
+			if (!m_surface || !m_surface->init(gQueue, std::move(scResources), swapchainParams.sharedParams))
+				return logFail("Could not create Window & Surface or initialize the Surface!");
+		}
+
+		auto pool = m_device->createCommandPool(gQueue->getFamilyIndex(), IGPUCommandPool::CREATE_FLAGS::RESET_COMMAND_BUFFER_BIT);
+
+		m_converter = CAssetConverter::create({ .device = m_device.get(), .optimizer = {} });
+
+		for (auto i = 0u; i < MaxFramesInFlight; i++)
+		{
+			if (!pool)
+				return logFail("Couldn't create Command Pool!");
+			if (!pool->createCommandBuffers(IGPUCommandPool::BUFFER_LEVEL::PRIMARY, { m_cmdBufs.data() + i, 1 }))
+				return logFail("Couldn't create Command Buffer!");
+		}
+
+		m_winMgr->setWindowSize(m_window.get(), WIN_W, WIN_H);
+		m_surface->recreateSwapchain();
+
+
+		// create output images
+		m_hdrImage = m_device->createImage({
+			{
+			  .type = IGPUImage::ET_2D,
+			  .samples = ICPUImage::ESCF_1_BIT,
+			  .format = EF_R16G16B16A16_SFLOAT,
+			  .extent = {WIN_W, WIN_H, 1},
+			  .mipLevels = 1,
+			  .arrayLayers = 1,
+			  .flags = IImage::ECF_NONE,
+			  .usage = bitflag(IImage::EUF_STORAGE_BIT) | IImage::EUF_TRANSFER_SRC_BIT | IImage::EUF_SAMPLED_BIT
+			}
+			});
+
+		if (!m_hdrImage || !m_device->allocate(m_hdrImage->getMemoryReqs(), m_hdrImage.get()).isValid())
+			return logFail("Could not create HDR Image");
+
+		m_hdrImageView = m_device->createImageView({
+		  .flags = IGPUImageView::ECF_NONE,
+		  .subUsages = IGPUImage::E_USAGE_FLAGS::EUF_STORAGE_BIT | IGPUImage::E_USAGE_FLAGS::EUF_SAMPLED_BIT,
+		  .image = m_hdrImage,
+		  .viewType = IGPUImageView::E_TYPE::ET_2D,
+		  .format = asset::EF_R16G16B16A16_SFLOAT
+			});
+
+
+
+		// ray trace pipeline and descriptor set layout setup
+		{
+			const IGPUDescriptorSetLayout::SBinding bindings[] = {
+			  {
+				.binding = 0,
+				.type = asset::IDescriptor::E_TYPE::ET_ACCELERATION_STRUCTURE,
+				.createFlags = IGPUDescriptorSetLayout::SBinding::E_CREATE_FLAGS::ECF_NONE,
+				.stageFlags = asset::IShader::E_SHADER_STAGE::ESS_RAYGEN,
+				.count = 1,
+			  },
+			  {
+				.binding = 1,
+				.type = asset::IDescriptor::E_TYPE::ET_STORAGE_IMAGE,
+				.createFlags = IGPUDescriptorSetLayout::SBinding::E_CREATE_FLAGS::ECF_NONE,
+				.stageFlags = asset::IShader::E_SHADER_STAGE::ESS_RAYGEN,
+				.count = 1,
+			  }
+			};
+			const auto descriptorSetLayout = m_device->createDescriptorSetLayout(bindings);
+
+			const std::array<IGPUDescriptorSetLayout*, ICPUPipelineLayout::DESCRIPTOR_SET_COUNT> dsLayoutPtrs = { descriptorSetLayout.get() };
+			m_rayTracingDsPool = m_device->createDescriptorPoolForDSLayouts(IDescriptorPool::ECF_UPDATE_AFTER_BIND_BIT, std::span(dsLayoutPtrs.begin(), dsLayoutPtrs.end()));
+			m_rayTracingDs = m_rayTracingDsPool->createDescriptorSet(descriptorSetLayout);
+
+			const SPushConstantRange pcRange = {
+			  .stageFlags = IShader::E_SHADER_STAGE::ESS_ALL_RAY_TRACING,
+			  .offset = 0u,
+			  .size = sizeof(SPushConstants),
+			};
+			const auto pipelineLayout = m_device->createPipelineLayout({ &pcRange, 1 }, smart_refctd_ptr(descriptorSetLayout), nullptr, nullptr, nullptr);
+
+			IGPURayTracingPipeline::SCreationParams params = {};
+
+			enum RtDemoShader
+			{
+				RTDS_RAYGEN,
+				RTDS_MISS,
+				RTDS_MISS_SHADOW,
+				RTDS_CLOSEST_HIT,
+				RTDS_SPHERE_CLOSEST_HIT,
+				RTDS_ANYHIT_PRIMARY,
+				RTDS_ANYHIT_SHADOW,
+				RTDS_INTERSECTION,
+				RTDS_DIRECTIONAL_CALL,
+				RTDS_POINT_CALL,
+				RTDS_SPOT_CALL,
+				RTDS_COUNT
+			};
+
+			IGPUShader::SSpecInfo shaders[RTDS_COUNT];
+			shaders[RTDS_RAYGEN] = { .shader = raygenShader.get() };
+			shaders[RTDS_MISS] = { .shader = missShader.get() };
+			shaders[RTDS_MISS_SHADOW] = { .shader = missShadowShader.get() };
+			shaders[RTDS_CLOSEST_HIT] = { .shader = closestHitShader.get() };
+			shaders[RTDS_SPHERE_CLOSEST_HIT] = { .shader = proceduralClosestHitShader.get() };
+			shaders[RTDS_ANYHIT_PRIMARY] = { .shader = anyHitShaderColorPayload.get() };
+			shaders[RTDS_ANYHIT_SHADOW] = { .shader = anyHitShaderShadowPayload.get() };
+			shaders[RTDS_INTERSECTION] = { .shader = intersectionHitShader.get() };
+			shaders[RTDS_DIRECTIONAL_CALL] = { .shader = directionalLightCallShader.get() };
+			shaders[RTDS_POINT_CALL] = { .shader = pointLightCallShader.get() };
+			shaders[RTDS_SPOT_CALL] = { .shader = spotLightCallShader.get() };
+
+			params.layout = pipelineLayout.get();
+			params.shaders = std::span(shaders);
+			using RayTracingFlags = IGPURayTracingPipeline::SCreationParams::FLAGS;
+			params.flags = core::bitflag(RayTracingFlags::NO_NULL_MISS_SHADERS) |
+				RayTracingFlags::NO_NULL_INTERSECTION_SHADERS |
+				RayTracingFlags::NO_NULL_ANY_HIT_SHADERS;
+
+			auto& shaderGroups = params.shaderGroups;
+
+			shaderGroups.raygen = { .index = RTDS_RAYGEN };
+
+			IRayTracingPipelineBase::SGeneralShaderGroup missGroups[EMT_COUNT];
+			missGroups[EMT_PRIMARY] = { .index = RTDS_MISS };
+			missGroups[EMT_OCCLUSION] = { .index = RTDS_MISS_SHADOW };
+			shaderGroups.misses = missGroups;
+
+			auto getHitGroupIndex = [](E_GEOM_TYPE geomType, E_RAY_TYPE rayType)
+				{
+					return geomType * ERT_COUNT + rayType;
+				};
+			IRayTracingPipelineBase::SHitShaderGroup hitGroups[E_RAY_TYPE::ERT_COUNT * E_GEOM_TYPE::EGT_COUNT];
+			hitGroups[getHitGroupIndex(EGT_TRIANGLES, ERT_PRIMARY)] = {
+			  .closestHit = RTDS_CLOSEST_HIT,
+			  .anyHit = RTDS_ANYHIT_PRIMARY,
+			};
+			hitGroups[getHitGroupIndex(EGT_TRIANGLES, ERT_OCCLUSION)] = {
+			  .closestHit = IGPURayTracingPipeline::SGeneralShaderGroup::Unused,
+			  .anyHit = RTDS_ANYHIT_SHADOW,
+			};
+			hitGroups[getHitGroupIndex(EGT_PROCEDURAL, ERT_PRIMARY)] = {
+			  .closestHit = RTDS_SPHERE_CLOSEST_HIT,
+			  .anyHit = RTDS_ANYHIT_PRIMARY,
+			  .intersection = RTDS_INTERSECTION,
+			};
+			hitGroups[getHitGroupIndex(EGT_PROCEDURAL, ERT_OCCLUSION)] = {
+			  .closestHit = IGPURayTracingPipeline::SGeneralShaderGroup::Unused,
+			  .anyHit = RTDS_ANYHIT_SHADOW,
+			  .intersection = RTDS_INTERSECTION,
+			};
+			shaderGroups.hits = hitGroups;
+
+			IRayTracingPipelineBase::SGeneralShaderGroup callableGroups[ELT_COUNT];
+			callableGroups[ELT_DIRECTIONAL] = { .index = RTDS_DIRECTIONAL_CALL };
+			callableGroups[ELT_POINT] = { .index = RTDS_POINT_CALL };
+			callableGroups[ELT_SPOT] = { .index = RTDS_SPOT_CALL };
+			shaderGroups.callables = callableGroups;
+
+			params.cached.maxRecursionDepth = 1;
+			params.cached.dynamicStackSize = true;
+
+			if (!m_device->createRayTracingPipelines(nullptr, { &params, 1 }, &m_rayTracingPipeline))
+				return logFail("Failed to create ray tracing pipeline");
+
+			calculateRayTracingStackSize(m_rayTracingPipeline);
+
+			if (!createShaderBindingTable(m_rayTracingPipeline))
+				return logFail("Could not create shader binding table");
+
+		}
+
+		auto assetManager = make_smart_refctd_ptr<nbl::asset::IAssetManager>(smart_refctd_ptr(system));
+		auto* geometryCreator = assetManager->getGeometryCreator();
+
+		if (!createIndirectBuffer())
+			return logFail("Could not create indirect buffer");
+
+		if (!createAccelerationStructuresFromGeometry(geometryCreator))
+			return logFail("Could not create acceleration structures from geometry creator");
+
+		ISampler::SParams samplerParams = {
+		  .AnisotropicFilter = 0
+		};
+		auto defaultSampler = m_device->createSampler(samplerParams);
+
+		{
+			const IGPUDescriptorSetLayout::SBinding bindings[] = {
+			  {
+				.binding = 0u,
+				.type = nbl::asset::IDescriptor::E_TYPE::ET_COMBINED_IMAGE_SAMPLER,
+				.createFlags = ICPUDescriptorSetLayout::SBinding::E_CREATE_FLAGS::ECF_NONE,
+				.stageFlags = IShader::E_SHADER_STAGE::ESS_FRAGMENT,
+				.count = 1u,
+				.immutableSamplers = &defaultSampler
+			  }
+			};
+			auto gpuPresentDescriptorSetLayout = m_device->createDescriptorSetLayout(bindings);
+			const video::IGPUDescriptorSetLayout* const layouts[] = { gpuPresentDescriptorSetLayout.get() };
+			const uint32_t setCounts[] = { 1u };
+			m_presentDsPool = m_device->createDescriptorPoolForDSLayouts(IDescriptorPool::E_CREATE_FLAGS::ECF_NONE, layouts, setCounts);
+			m_presentDs = m_presentDsPool->createDescriptorSet(gpuPresentDescriptorSetLayout);
+
+			auto scRes = static_cast<CDefaultSwapchainFramebuffers*>(m_surface->getSwapchainResources());
+			ext::FullScreenTriangle::ProtoPipeline fsTriProtoPPln(m_assetMgr.get(), m_device.get(), m_logger.get());
+			if (!fsTriProtoPPln)
+				return logFail("Failed to create Full Screen Triangle protopipeline or load its vertex shader!");
+
+			const IGPUShader::SSpecInfo fragSpec = {
+			  .entryPoint = "main",
+			  .shader = fragmentShader.get()
+			};
+
+			auto presentLayout = m_device->createPipelineLayout(
+				{},
+				core::smart_refctd_ptr(gpuPresentDescriptorSetLayout),
+				nullptr,
+				nullptr,
+				nullptr
+			);
+			m_presentPipeline = fsTriProtoPPln.createPipeline(fragSpec, presentLayout.get(), scRes->getRenderpass());
+			if (!m_presentPipeline)
+				return logFail("Could not create Graphics Pipeline!");
+		}
+
+		// write descriptors
+		IGPUDescriptorSet::SDescriptorInfo infos[3];
+		infos[0].desc = m_gpuTlas;
+
+		infos[1].desc = m_hdrImageView;
+		if (!infos[1].desc)
+			return logFail("Failed to create image view");
+		infos[1].info.image.imageLayout = IImage::LAYOUT::GENERAL;
+
+		infos[2].desc = m_hdrImageView;
+		infos[2].info.image.imageLayout = IImage::LAYOUT::READ_ONLY_OPTIMAL;
+
+		IGPUDescriptorSet::SWriteDescriptorSet writes[] = {
+			{.dstSet = m_rayTracingDs.get(), .binding = 0, .arrayElement = 0, .count = 1, .info = &infos[0]},
+			{.dstSet = m_rayTracingDs.get(), .binding = 1, .arrayElement = 0, .count = 1, .info = &infos[1]},
+			{.dstSet = m_presentDs.get(), .binding = 0, .arrayElement = 0, .count = 1, .info = &infos[2] },
+		};
+		m_device->updateDescriptorSets(std::span(writes), {});
+
+		// gui descriptor setup
+		{
+			using binding_flags_t = IGPUDescriptorSetLayout::SBinding::E_CREATE_FLAGS;
+			{
+				IGPUSampler::SParams params;
+				params.AnisotropicFilter = 1u;
+				params.TextureWrapU = ETC_REPEAT;
+				params.TextureWrapV = ETC_REPEAT;
+				params.TextureWrapW = ETC_REPEAT;
+
+				m_ui.samplers.gui = m_device->createSampler(params);
+				m_ui.samplers.gui->setObjectDebugName("Nabla IMGUI UI Sampler");
+			}
+
+			std::array<core::smart_refctd_ptr<IGPUSampler>, 69u> immutableSamplers;
+			for (auto& it : immutableSamplers)
+				it = smart_refctd_ptr(m_ui.samplers.scene);
+
+			immutableSamplers[nbl::ext::imgui::UI::FontAtlasTexId] = smart_refctd_ptr(m_ui.samplers.gui);
+
+			nbl::ext::imgui::UI::SCreationParameters params;
+
+			params.resources.texturesInfo = { .setIx = 0u, .bindingIx = 0u };
+			params.resources.samplersInfo = { .setIx = 0u, .bindingIx = 1u };
+			params.assetManager = m_assetMgr;
+			params.pipelineCache = nullptr;
+			params.pipelineLayout = nbl::ext::imgui::UI::createDefaultPipelineLayout(m_utils->getLogicalDevice(), params.resources.texturesInfo, params.resources.samplersInfo, MaxUITextureCount);
+			params.renderpass = smart_refctd_ptr<IGPURenderpass>(renderpass);
+			params.streamingBuffer = nullptr;
+			params.subpassIx = 0u;
+			params.transfer = getGraphicsQueue();
+			params.utilities = m_utils;
+			{
+				m_ui.manager = ext::imgui::UI::create(std::move(params));
+
+				// note that we use default layout provided by our extension, but you are free to create your own by filling nbl::ext::imgui::UI::S_CREATION_PARAMETERS::resources
+				const auto* descriptorSetLayout = m_ui.manager->getPipeline()->getLayout()->getDescriptorSetLayout(0u);
+				const auto& params = m_ui.manager->getCreationParameters();
+
+				IDescriptorPool::SCreateInfo descriptorPoolInfo = {};
+				descriptorPoolInfo.maxDescriptorCount[static_cast<uint32_t>(asset::IDescriptor::E_TYPE::ET_SAMPLER)] = (uint32_t)nbl::ext::imgui::UI::DefaultSamplerIx::COUNT;
+				descriptorPoolInfo.maxDescriptorCount[static_cast<uint32_t>(asset::IDescriptor::E_TYPE::ET_SAMPLED_IMAGE)] = MaxUITextureCount;
+				descriptorPoolInfo.maxSets = 1u;
+				descriptorPoolInfo.flags = IDescriptorPool::E_CREATE_FLAGS::ECF_UPDATE_AFTER_BIND_BIT;
+
+				m_guiDescriptorSetPool = m_device->createDescriptorPool(std::move(descriptorPoolInfo));
+				assert(m_guiDescriptorSetPool);
+
+				m_guiDescriptorSetPool->createDescriptorSets(1u, &descriptorSetLayout, &m_ui.descriptorSet);
+				assert(m_ui.descriptorSet);
+			}
+		}
+
+		m_ui.manager->registerListener(
+			[this]() -> void {
+				ImGuiIO& io = ImGui::GetIO();
+
+				m_camera.setProjectionMatrix([&]()
+					{
+						static matrix4SIMD projection;
+
+						projection = matrix4SIMD::buildProjectionMatrixPerspectiveFovRH(
+							core::radians(m_cameraSetting.fov),
+							io.DisplaySize.x / io.DisplaySize.y,
+							m_cameraSetting.zNear,
+							m_cameraSetting.zFar);
+
+						return projection;
+					}());
+
+				ImGui::SetNextWindowPos(ImVec2(1024, 100), ImGuiCond_Appearing);
+				ImGui::SetNextWindowSize(ImVec2(256, 256), ImGuiCond_Appearing);
+
+				// create a window and insert the inspector
+				ImGui::SetNextWindowPos(ImVec2(10, 10), ImGuiCond_Appearing);
+				ImGui::SetNextWindowSize(ImVec2(320, 340), ImGuiCond_Appearing);
+				ImGui::Begin("Controls");
+
+				ImGui::SameLine();
+
+				ImGui::Text("Camera");
+
+				ImGui::SliderFloat("Move speed", &m_cameraSetting.moveSpeed, 0.1f, 10.f);
+				ImGui::SliderFloat("Rotate speed", &m_cameraSetting.rotateSpeed, 0.1f, 10.f);
+				ImGui::SliderFloat("Fov", &m_cameraSetting.fov, 20.f, 150.f);
+				ImGui::SliderFloat("zNear", &m_cameraSetting.zNear, 0.1f, 100.f);
+				ImGui::SliderFloat("zFar", &m_cameraSetting.zFar, 110.f, 10000.f);
+				Light m_oldLight = m_light;
+				int light_type = m_light.type;
+				ImGui::ListBox("LightType", &light_type, s_lightTypeNames, ELT_COUNT);
+				m_light.type = static_cast<E_LIGHT_TYPE>(light_type);
+				if (m_light.type == ELT_DIRECTIONAL)
+				{
+					ImGui::SliderFloat3("Light Direction", &m_light.direction.x, -1.f, 1.f);
+				}
+				else if (m_light.type == ELT_POINT)
+				{
+					ImGui::SliderFloat3("Light Position", &m_light.position.x, -20.f, 20.f);
+				}
+				else if (m_light.type == ELT_SPOT)
+				{
+					ImGui::SliderFloat3("Light Direction", &m_light.direction.x, -1.f, 1.f);
+					ImGui::SliderFloat3("Light Position", &m_light.position.x, -20.f, 20.f);
+
+					float32_t dOuterCutoff = hlsl::degrees(acos(m_light.outerCutoff));
+					if (ImGui::SliderFloat("Light Outer Cutoff", &dOuterCutoff, 0.0f, 45.0f))
+					{
+						m_light.outerCutoff = cos(hlsl::radians(dOuterCutoff));
+					}
+				}
+				ImGui::Checkbox("Use Indirect Command", &m_useIndirectCommand);
+				if (m_light != m_oldLight)
+				{
+					m_frameAccumulationCounter = 0;
+				}
+
+				ImGui::Text("X: %f Y: %f", io.MousePos.x, io.MousePos.y);
+
+				ImGui::End();
+			}
+		);
+
+		// Set Camera
+		{
+			core::vectorSIMDf cameraPosition(0, 5, -10);
+			matrix4SIMD proj = matrix4SIMD::buildProjectionMatrixPerspectiveFovRH(
+				core::radians(60.0f),
+				WIN_W / WIN_H,
+				0.01f,
+				500.0f
+			);
+			m_camera = Camera(cameraPosition, core::vectorSIMDf(0, 0, 0), proj);
+		}
+
+		m_winMgr->setWindowSize(m_window.get(), WIN_W, WIN_H);
+		m_surface->recreateSwapchain();
+		m_winMgr->show(m_window.get());
+		m_oracle.reportBeginFrameRecord();
+		m_camera.mapKeysToWASD();
+
+		return true;
+	}
+
+	bool updateGUIDescriptorSet()
+	{
+		// texture atlas, note we don't create info & write pair for the font sampler because UI extension's is immutable and baked into DS layout
+		static std::array<IGPUDescriptorSet::SDescriptorInfo, MaxUITextureCount> descriptorInfo;
+		static IGPUDescriptorSet::SWriteDescriptorSet writes[MaxUITextureCount];
+
+		descriptorInfo[nbl::ext::imgui::UI::FontAtlasTexId].info.image.imageLayout = IImage::LAYOUT::READ_ONLY_OPTIMAL;
+		descriptorInfo[nbl::ext::imgui::UI::FontAtlasTexId].desc = smart_refctd_ptr<IGPUImageView>(m_ui.manager->getFontAtlasView());
+
+		for (uint32_t i = 0; i < descriptorInfo.size(); ++i)
+		{
+			writes[i].dstSet = m_ui.descriptorSet.get();
+			writes[i].binding = 0u;
+			writes[i].arrayElement = i;
+			writes[i].count = 1u;
+		}
+		writes[nbl::ext::imgui::UI::FontAtlasTexId].info = descriptorInfo.data() + nbl::ext::imgui::UI::FontAtlasTexId;
+
+		return m_device->updateDescriptorSets(writes, {});
+	}
+
+	inline void workLoopBody() override
+	{
+		// framesInFlight: ensuring safe execution of command buffers and acquires, `framesInFlight` only affect semaphore waits, don't use this to index your resources because it can change with swapchain recreation.
+		const uint32_t framesInFlight = core::min(MaxFramesInFlight, m_surface->getMaxAcquiresInFlight());
+		// We block for semaphores for 2 reasons here:
+		  // A) Resource: Can't use resource like a command buffer BEFORE previous use is finished! [MaxFramesInFlight]
+		  // B) Acquire: Can't have more acquires in flight than a certain threshold returned by swapchain or your surface helper class. [MaxAcquiresInFlight]
+		if (m_realFrameIx >= framesInFlight)
+		{
+			const ISemaphore::SWaitInfo cbDonePending[] =
+			{
+			  {
+				.semaphore = m_semaphore.get(),
+				.value = m_realFrameIx + 1 - framesInFlight
+			  }
+			};
+			if (m_device->blockForSemaphores(cbDonePending) != ISemaphore::WAIT_RESULT::SUCCESS)
+				return;
+		}
+		const auto resourceIx = m_realFrameIx % MaxFramesInFlight;
+
+		m_api->startCapture();
+
+		update();
+
+		auto queue = getGraphicsQueue();
+		auto cmdbuf = m_cmdBufs[resourceIx].get();
+
+		if (!keepRunning())
+			return;
+
+		cmdbuf->reset(IGPUCommandBuffer::RESET_FLAGS::RELEASE_RESOURCES_BIT);
+		cmdbuf->begin(IGPUCommandBuffer::USAGE::ONE_TIME_SUBMIT_BIT);
+		cmdbuf->beginDebugMarker("RaytracingPipelineApp Frame");
+
+		const auto viewMatrix = m_camera.getViewMatrix();
+		const auto projectionMatrix = m_camera.getProjectionMatrix();
+		const auto viewProjectionMatrix = m_camera.getConcatenatedMatrix();
+
+		core::matrix3x4SIMD modelMatrix;
+		modelMatrix.setTranslation(nbl::core::vectorSIMDf(0, 0, 0, 0));
+		modelMatrix.setRotation(quaternion(0, 0, 0));
+
+		core::matrix4SIMD modelViewProjectionMatrix = core::concatenateBFollowedByA(viewProjectionMatrix, modelMatrix);
+		if (m_cachedModelViewProjectionMatrix != modelViewProjectionMatrix)
+		{
+			m_frameAccumulationCounter = 0;
+			m_cachedModelViewProjectionMatrix = modelViewProjectionMatrix;
+		}
+		core::matrix4SIMD invModelViewProjectionMatrix;
+		modelViewProjectionMatrix.getInverseTransform(invModelViewProjectionMatrix);
+
+		{
+			IGPUCommandBuffer::SPipelineBarrierDependencyInfo::image_barrier_t imageBarriers[1];
+			imageBarriers[0].barrier = {
+			   .dep = {
+				 .srcStageMask = PIPELINE_STAGE_FLAGS::FRAGMENT_SHADER_BIT, // previous frame read from framgent shader
+				 .srcAccessMask = ACCESS_FLAGS::SHADER_READ_BITS,
+				 .dstStageMask = PIPELINE_STAGE_FLAGS::RAY_TRACING_SHADER_BIT,
+				 .dstAccessMask = ACCESS_FLAGS::SHADER_WRITE_BITS
+			  }
+			};
+			imageBarriers[0].image = m_hdrImage.get();
+			imageBarriers[0].subresourceRange = {
+			  .aspectMask = IImage::EAF_COLOR_BIT,
+			  .baseMipLevel = 0u,
+			  .levelCount = 1u,
+			  .baseArrayLayer = 0u,
+			  .layerCount = 1u
+			};
+			imageBarriers[0].oldLayout = m_frameAccumulationCounter == 0 ? IImage::LAYOUT::UNDEFINED : IImage::LAYOUT::READ_ONLY_OPTIMAL;
+			imageBarriers[0].newLayout = IImage::LAYOUT::GENERAL;
+			cmdbuf->pipelineBarrier(E_DEPENDENCY_FLAGS::EDF_NONE, { .imgBarriers = imageBarriers });
+		}
+
+		// Trace Rays Pass
+		{
+			SPushConstants pc;
+			pc.light = m_light;
+			pc.proceduralGeomInfoBuffer = m_proceduralGeomInfoBuffer->getDeviceAddress();
+			pc.triangleGeomInfoBuffer = m_triangleGeomInfoBuffer->getDeviceAddress();
+			pc.frameCounter = m_frameAccumulationCounter;
+			const core::vector3df camPos = m_camera.getPosition().getAsVector3df();
+			pc.camPos = { camPos.X, camPos.Y, camPos.Z };
+			memcpy(&pc.invMVP, invModelViewProjectionMatrix.pointer(), sizeof(pc.invMVP));
+
+			cmdbuf->bindRayTracingPipeline(m_rayTracingPipeline.get());
+			cmdbuf->setRayTracingPipelineStackSize(m_rayTracingStackSize);
+			cmdbuf->pushConstants(m_rayTracingPipeline->getLayout(), IShader::E_SHADER_STAGE::ESS_ALL_RAY_TRACING, 0, sizeof(SPushConstants), &pc);
+			cmdbuf->bindDescriptorSets(EPBP_RAY_TRACING, m_rayTracingPipeline->getLayout(), 0, 1, &m_rayTracingDs.get());
+			if (m_useIndirectCommand)
+			{
+				cmdbuf->traceRaysIndirect(
+					SBufferBinding<const IGPUBuffer>{
+					.offset = 0,
+						.buffer = m_indirectBuffer,
+				});
+			}
+			else
+			{
+				cmdbuf->traceRays(
+					m_shaderBindingTable.raygenGroupRange,
+					m_shaderBindingTable.missGroupsRange, m_shaderBindingTable.missGroupsStride,
+					m_shaderBindingTable.hitGroupsRange, m_shaderBindingTable.hitGroupsStride,
+					m_shaderBindingTable.callableGroupsRange, m_shaderBindingTable.callableGroupsStride,
+					WIN_W, WIN_H, 1);
+			}
+		}
+
+		// pipeline barrier
+		{
+			IGPUCommandBuffer::SPipelineBarrierDependencyInfo::image_barrier_t imageBarriers[1];
+			imageBarriers[0].barrier = {
+			  .dep = {
+				.srcStageMask = PIPELINE_STAGE_FLAGS::RAY_TRACING_SHADER_BIT,
+				.srcAccessMask = ACCESS_FLAGS::SHADER_WRITE_BITS,
+				.dstStageMask = PIPELINE_STAGE_FLAGS::COLOR_ATTACHMENT_OUTPUT_BIT,
+				.dstAccessMask = ACCESS_FLAGS::COLOR_ATTACHMENT_WRITE_BIT
+			  }
+			};
+			imageBarriers[0].image = m_hdrImage.get();
+			imageBarriers[0].subresourceRange = {
+			  .aspectMask = IImage::EAF_COLOR_BIT,
+			  .baseMipLevel = 0u,
+			  .levelCount = 1u,
+			  .baseArrayLayer = 0u,
+			  .layerCount = 1u
+			};
+			imageBarriers[0].oldLayout = IImage::LAYOUT::GENERAL;
+			imageBarriers[0].newLayout = IImage::LAYOUT::READ_ONLY_OPTIMAL;
+
+			cmdbuf->pipelineBarrier(E_DEPENDENCY_FLAGS::EDF_NONE, { .imgBarriers = imageBarriers });
+		}
+
+		{
 			asset::SViewport viewport;
 			{
 				viewport.minDepth = 1.f;
@@ -802,1071 +793,842 @@ class RaytracingPipelineApp final : public examples::SimpleWindowedApplication,
 			VkRect2D defaultScisors[] = { {.offset = {(int32_t)viewport.x, (int32_t)viewport.y}, .extent = {(uint32_t)viewport.width, (uint32_t)viewport.height}} };
 			cmdbuf->setScissor(defaultScisors);
 
-      auto scRes = static_cast<CDefaultSwapchainFramebuffers*>(m_surface->getSwapchainResources());
-      const VkRect2D currentRenderArea =
-      {
-        .offset = {0,0},
-        .extent = {m_window->getWidth(),m_window->getHeight()}
-      };
-      const IGPUCommandBuffer::SClearColorValue clearColor = { .float32 = {0.f,0.f,0.f,1.f} };
-      const IGPUCommandBuffer::SRenderpassBeginInfo info =
-      {
-        .framebuffer = scRes->getFramebuffer(m_currentImageAcquire.imageIndex),
-        .colorClearValues = &clearColor,
-        .depthStencilClearValues = nullptr,
-        .renderArea = currentRenderArea
-      };
-      nbl::video::ISemaphore::SWaitInfo waitInfo = { .semaphore = m_semaphore.get(), .value = m_realFrameIx + 1u };
-
-      cmdbuf->beginRenderPass(info, IGPUCommandBuffer::SUBPASS_CONTENTS::INLINE);
-
-      cmdbuf->bindGraphicsPipeline(m_presentPipeline.get());
-      cmdbuf->bindDescriptorSets(EPBP_GRAPHICS, m_presentPipeline->getLayout(), 0, 1u, &m_presentDs.get());
-      ext::FullScreenTriangle::recordDrawCall(cmdbuf);
-
-      const auto uiParams = m_ui.manager->getCreationParameters();
-      auto* uiPipeline = m_ui.manager->getPipeline();
-      cmdbuf->bindGraphicsPipeline(uiPipeline);
-      cmdbuf->bindDescriptorSets(EPBP_GRAPHICS, uiPipeline->getLayout(), uiParams.resources.texturesInfo.setIx, 1u, &m_ui.descriptorSet.get());
-      m_ui.manager->render(cmdbuf, waitInfo);
-
-      cmdbuf->endRenderPass();
-
-    }
-
-    cmdbuf->endDebugMarker();
-    cmdbuf->end();
-
-    {
-      const IQueue::SSubmitInfo::SSemaphoreInfo rendered[] =
-      {
-        {
-          .semaphore = m_semaphore.get(),
-          .value = ++m_realFrameIx,
-          .stageMask = PIPELINE_STAGE_FLAGS::ALL_TRANSFER_BITS
-        }
-      };
-      {
-        {
-          const IQueue::SSubmitInfo::SCommandBufferInfo commandBuffers[] =
-          {
-            {.cmdbuf = cmdbuf }
-          };
-
-          const IQueue::SSubmitInfo::SSemaphoreInfo acquired[] =
-          {
-            {
-              .semaphore = m_currentImageAcquire.semaphore,
-              .value = m_currentImageAcquire.acquireCount,
-              .stageMask = PIPELINE_STAGE_FLAGS::NONE
-            }
-          };
-          const IQueue::SSubmitInfo infos[] =
-          {
-            {
-              .waitSemaphores = acquired,
-              .commandBuffers = commandBuffers,
-              .signalSemaphores = rendered
-            }
-          };
-
-          updateGUIDescriptorSet();
-
-          if (queue->submit(infos) != IQueue::RESULT::SUCCESS)
-            m_realFrameIx--;
-        }
-      }
-
-      m_window->setCaption("[Nabla Engine] Ray Tracing Pipeline");
-      m_surface->present(m_currentImageAcquire.imageIndex, rendered);
-    }
-    m_api->endCapture();
-    m_frameAccumulationCounter++;
-  }
-
-  inline void update()
-  {
-    m_camera.setMoveSpeed(m_cameraSetting.moveSpeed);
-    m_camera.setRotateSpeed(m_cameraSetting.rotateSpeed);
-
-    static std::chrono::microseconds previousEventTimestamp{};
-
-    m_inputSystem->getDefaultMouse(&m_mouse);
-    m_inputSystem->getDefaultKeyboard(&m_keyboard);
-
-    auto updatePresentationTimestamp = [&]()
-      {
-        m_currentImageAcquire = m_surface->acquireNextImage();
-
-        m_oracle.reportEndFrameRecord();
-        const auto timestamp = m_oracle.getNextPresentationTimeStamp();
-        m_oracle.reportBeginFrameRecord();
-
-        return timestamp;
-      };
-
-    const auto nextPresentationTimestamp = updatePresentationTimestamp();
-
-    struct
-    {
-      std::vector<SMouseEvent> mouse{};
-      std::vector<SKeyboardEvent> keyboard{};
-    } capturedEvents;
-
-    m_camera.beginInputProcessing(nextPresentationTimestamp);
-    {
-      m_mouse.consumeEvents([&](const IMouseEventChannel::range_t& events) -> void
-        {
-          m_camera.mouseProcess(events); // don't capture the events, only let camera handle them with its impl
-
-          for (const auto& e : events) // here capture
-          {
-            if (e.timeStamp < previousEventTimestamp)
-              continue;
-
-            previousEventTimestamp = e.timeStamp;
-            capturedEvents.mouse.emplace_back(e);
-
-          }
-        }, m_logger.get());
-
-      m_keyboard.consumeEvents([&](const IKeyboardEventChannel::range_t& events) -> void
-        {
-          m_camera.keyboardProcess(events); // don't capture the events, only let camera handle them with its impl
-
-          for (const auto& e : events) // here capture
-          {
-            if (e.timeStamp < previousEventTimestamp)
-              continue;
-
-            previousEventTimestamp = e.timeStamp;
-            capturedEvents.keyboard.emplace_back(e);
-          }
-        }, m_logger.get());
-
-    }
-    m_camera.endInputProcessing(nextPresentationTimestamp);
-
-    const core::SRange<const nbl::ui::SMouseEvent> mouseEvents(capturedEvents.mouse.data(), capturedEvents.mouse.data() + capturedEvents.mouse.size());
-    const core::SRange<const nbl::ui::SKeyboardEvent> keyboardEvents(capturedEvents.keyboard.data(), capturedEvents.keyboard.data() + capturedEvents.keyboard.size());
-    const auto cursorPosition = m_window->getCursorControl()->getPosition();
-    const auto mousePosition = float32_t2(cursorPosition.x, cursorPosition.y) - float32_t2(m_window->getX(), m_window->getY());
-
-    const ext::imgui::UI::SUpdateParameters params =
-    {
-      .mousePosition = mousePosition,
-      .displaySize = { m_window->getWidth(), m_window->getHeight() },
-      .mouseEvents = mouseEvents,
-      .keyboardEvents = keyboardEvents
-    };
-
-    m_ui.manager->update(params);
-  }
-
-  inline bool keepRunning() override
-  {
-    if (m_surface->irrecoverable())
-      return false;
-
-    return true;
-  }
-
-  inline bool onAppTerminated() override
-  {
-    return device_base_t::onAppTerminated();
-  }
+			auto scRes = static_cast<CDefaultSwapchainFramebuffers*>(m_surface->getSwapchainResources());
+			const VkRect2D currentRenderArea =
+			{
+			  .offset = {0,0},
+			  .extent = {m_window->getWidth(),m_window->getHeight()}
+			};
+			const IGPUCommandBuffer::SClearColorValue clearColor = { .float32 = {0.f,0.f,0.f,1.f} };
+			const IGPUCommandBuffer::SRenderpassBeginInfo info =
+			{
+			  .framebuffer = scRes->getFramebuffer(m_currentImageAcquire.imageIndex),
+			  .colorClearValues = &clearColor,
+			  .depthStencilClearValues = nullptr,
+			  .renderArea = currentRenderArea
+			};
+			nbl::video::ISemaphore::SWaitInfo waitInfo = { .semaphore = m_semaphore.get(), .value = m_realFrameIx + 1u };
+
+			cmdbuf->beginRenderPass(info, IGPUCommandBuffer::SUBPASS_CONTENTS::INLINE);
+
+			cmdbuf->bindGraphicsPipeline(m_presentPipeline.get());
+			cmdbuf->bindDescriptorSets(EPBP_GRAPHICS, m_presentPipeline->getLayout(), 0, 1u, &m_presentDs.get());
+			ext::FullScreenTriangle::recordDrawCall(cmdbuf);
+
+			const auto uiParams = m_ui.manager->getCreationParameters();
+			auto* uiPipeline = m_ui.manager->getPipeline();
+			cmdbuf->bindGraphicsPipeline(uiPipeline);
+			cmdbuf->bindDescriptorSets(EPBP_GRAPHICS, uiPipeline->getLayout(), uiParams.resources.texturesInfo.setIx, 1u, &m_ui.descriptorSet.get());
+			m_ui.manager->render(cmdbuf, waitInfo);
+
+			cmdbuf->endRenderPass();
+
+		}
+
+		cmdbuf->endDebugMarker();
+		cmdbuf->end();
+
+		{
+			const IQueue::SSubmitInfo::SSemaphoreInfo rendered[] =
+			{
+			  {
+				.semaphore = m_semaphore.get(),
+				.value = ++m_realFrameIx,
+				.stageMask = PIPELINE_STAGE_FLAGS::ALL_TRANSFER_BITS
+			  }
+			};
+			{
+				{
+					const IQueue::SSubmitInfo::SCommandBufferInfo commandBuffers[] =
+					{
+					  {.cmdbuf = cmdbuf }
+					};
+
+					const IQueue::SSubmitInfo::SSemaphoreInfo acquired[] =
+					{
+					  {
+						.semaphore = m_currentImageAcquire.semaphore,
+						.value = m_currentImageAcquire.acquireCount,
+						.stageMask = PIPELINE_STAGE_FLAGS::NONE
+					  }
+					};
+					const IQueue::SSubmitInfo infos[] =
+					{
+					  {
+						.waitSemaphores = acquired,
+						.commandBuffers = commandBuffers,
+						.signalSemaphores = rendered
+					  }
+					};
+
+					updateGUIDescriptorSet();
+
+					if (queue->submit(infos) != IQueue::RESULT::SUCCESS)
+						m_realFrameIx--;
+				}
+			}
+
+			m_window->setCaption("[Nabla Engine] Ray Tracing Pipeline");
+			m_surface->present(m_currentImageAcquire.imageIndex, rendered);
+		}
+		m_api->endCapture();
+		m_frameAccumulationCounter++;
+	}
+
+	inline void update()
+	{
+		m_camera.setMoveSpeed(m_cameraSetting.moveSpeed);
+		m_camera.setRotateSpeed(m_cameraSetting.rotateSpeed);
+
+		static std::chrono::microseconds previousEventTimestamp{};
+
+		m_inputSystem->getDefaultMouse(&m_mouse);
+		m_inputSystem->getDefaultKeyboard(&m_keyboard);
+
+		auto updatePresentationTimestamp = [&]()
+			{
+				m_currentImageAcquire = m_surface->acquireNextImage();
+
+				m_oracle.reportEndFrameRecord();
+				const auto timestamp = m_oracle.getNextPresentationTimeStamp();
+				m_oracle.reportBeginFrameRecord();
+
+				return timestamp;
+			};
+
+		const auto nextPresentationTimestamp = updatePresentationTimestamp();
+
+		struct
+		{
+			std::vector<SMouseEvent> mouse{};
+			std::vector<SKeyboardEvent> keyboard{};
+		} capturedEvents;
+
+		m_camera.beginInputProcessing(nextPresentationTimestamp);
+		{
+			const auto& io = ImGui::GetIO();
+			m_mouse.consumeEvents([&](const IMouseEventChannel::range_t& events) -> void
+				{
+					if (!io.WantCaptureMouse)
+						m_camera.mouseProcess(events); // don't capture the events, only let camera handle them with its impl
+
+					for (const auto& e : events) // here capture
+					{
+						if (e.timeStamp < previousEventTimestamp)
+							continue;
+
+						previousEventTimestamp = e.timeStamp;
+						capturedEvents.mouse.emplace_back(e);
+
+					}
+				}, m_logger.get());
+
+			m_keyboard.consumeEvents([&](const IKeyboardEventChannel::range_t& events) -> void
+				{
+					if (!io.WantCaptureKeyboard)
+						m_camera.keyboardProcess(events); // don't capture the events, only let camera handle them with its impl
+
+					for (const auto& e : events) // here capture
+					{
+						if (e.timeStamp < previousEventTimestamp)
+							continue;
+
+						previousEventTimestamp = e.timeStamp;
+						capturedEvents.keyboard.emplace_back(e);
+					}
+				}, m_logger.get());
+
+		}
+		m_camera.endInputProcessing(nextPresentationTimestamp);
+
+		const core::SRange<const nbl::ui::SMouseEvent> mouseEvents(capturedEvents.mouse.data(), capturedEvents.mouse.data() + capturedEvents.mouse.size());
+		const core::SRange<const nbl::ui::SKeyboardEvent> keyboardEvents(capturedEvents.keyboard.data(), capturedEvents.keyboard.data() + capturedEvents.keyboard.size());
+		const auto cursorPosition = m_window->getCursorControl()->getPosition();
+		const auto mousePosition = float32_t2(cursorPosition.x, cursorPosition.y) - float32_t2(m_window->getX(), m_window->getY());
+
+		const ext::imgui::UI::SUpdateParameters params =
+		{
+		  .mousePosition = mousePosition,
+		  .displaySize = { m_window->getWidth(), m_window->getHeight() },
+		  .mouseEvents = mouseEvents,
+		  .keyboardEvents = keyboardEvents
+		};
+
+		m_ui.manager->update(params);
+	}
+
+	inline bool keepRunning() override
+	{
+		if (m_surface->irrecoverable())
+			return false;
+
+		return true;
+	}
+
+	inline bool onAppTerminated() override
+	{
+		return device_base_t::onAppTerminated();
+	}
 
 private:
-  uint32_t getWorkgroupCount(uint32_t dim, uint32_t size)
-  {
-    return (dim + size - 1) / size;
-  }
-
-  smart_refctd_ptr<IGPUBuffer> createBuffer(IGPUBuffer::SCreationParams& params)
-  {
-    smart_refctd_ptr<IGPUBuffer> buffer;
-    buffer = m_device->createBuffer(std::move(params));
-    auto bufReqs = buffer->getMemoryReqs();
-    bufReqs.memoryTypeBits &= m_physicalDevice->getDeviceLocalMemoryTypeBits();
-    m_device->allocate(bufReqs, buffer.get(), IDeviceMemoryAllocation::EMAF_DEVICE_ADDRESS_BIT);
-
-    return buffer;
-  }
-
-  smart_refctd_ptr<IGPUCommandBuffer> getSingleUseCommandBufferAndBegin(smart_refctd_ptr<IGPUCommandPool> pool)
-  {
-    smart_refctd_ptr<IGPUCommandBuffer> cmdbuf;
-    if (!pool->createCommandBuffers(IGPUCommandPool::BUFFER_LEVEL::PRIMARY, 1u, &cmdbuf))
-      return nullptr;
-
-    cmdbuf->reset(IGPUCommandBuffer::RESET_FLAGS::RELEASE_RESOURCES_BIT);
-    cmdbuf->begin(IGPUCommandBuffer::USAGE::ONE_TIME_SUBMIT_BIT);
-
-    return cmdbuf;
-  }
-
-  void cmdbufSubmitAndWait(smart_refctd_ptr<IGPUCommandBuffer> cmdbuf, CThreadSafeQueueAdapter* queue, uint64_t startValue)
-  {
-    cmdbuf->end();
-
-    uint64_t finishedValue = startValue + 1;
-
-    // submit builds
-    {
-      auto completed = m_device->createSemaphore(startValue);
-
-      std::array<IQueue::SSubmitInfo::SSemaphoreInfo, 1u> signals;
-      {
-        auto& signal = signals.front();
-        signal.value = finishedValue;
-        signal.stageMask = bitflag(PIPELINE_STAGE_FLAGS::ALL_TRANSFER_BITS);
-        signal.semaphore = completed.get();
-      }
-
-      const IQueue::SSubmitInfo::SCommandBufferInfo commandBuffers[1] = { {
-        .cmdbuf = cmdbuf.get()
-      } };
-
-      const IQueue::SSubmitInfo infos[] =
-      {
-        {
-          .waitSemaphores = {},
-          .commandBuffers = commandBuffers,
-          .signalSemaphores = signals
-        }
-      };
-
-      if (queue->submit(infos) != IQueue::RESULT::SUCCESS)
-      {
-        m_logger->log("Failed to submit geometry transfer upload operations!", ILogger::ELL_ERROR);
-        return;
-      }
-
-      const ISemaphore::SWaitInfo info[] =
-      { {
-        .semaphore = completed.get(),
-        .value = finishedValue
-      } };
-
-      m_device->blockForSemaphores(info);
-    }
-  }
-
-  bool createIndirectBuffer(video::CThreadSafeQueueAdapter* queue)
-  {
-    const auto getBufferRangeAddress = [](const SBufferRange<IGPUBuffer>& range)
-      {
-        return range.buffer->getDeviceAddress() + range.offset;
-      };
-    const auto command = TraceRaysIndirectCommand_t{
-      .raygenShaderRecordAddress = getBufferRangeAddress(m_shaderBindingTable.raygenGroupRange),
-      .raygenShaderRecordSize = m_shaderBindingTable.raygenGroupRange.size,
-      .missShaderBindingTableAddress = getBufferRangeAddress(m_shaderBindingTable.missGroupsRange),
-      .missShaderBindingTableSize = m_shaderBindingTable.missGroupsRange.size,
-      .missShaderBindingTableStride = m_shaderBindingTable.missGroupsStride,
-      .hitShaderBindingTableAddress = getBufferRangeAddress(m_shaderBindingTable.hitGroupsRange),
-      .hitShaderBindingTableSize = m_shaderBindingTable.hitGroupsRange.size,
-      .hitShaderBindingTableStride = m_shaderBindingTable.hitGroupsStride,
-      .callableShaderBindingTableAddress = getBufferRangeAddress(m_shaderBindingTable.callableGroupsRange),
-      .callableShaderBindingTableSize = m_shaderBindingTable.callableGroupsRange.size,
-      .callableShaderBindingTableStride = m_shaderBindingTable.callableGroupsStride,
-      .width = WIN_W,
-      .height = WIN_H,
-      .depth = 1,
-    };
-    IGPUBuffer::SCreationParams params;
-    params.usage = IGPUBuffer::EUF_TRANSFER_DST_BIT | IGPUBuffer::EUF_INDIRECT_BUFFER_BIT | IGPUBuffer::EUF_SHADER_DEVICE_ADDRESS_BIT;
-    params.size = sizeof(TraceRaysIndirectCommand_t);
-    m_utils->createFilledDeviceLocalBufferOnDedMem(SIntendedSubmitInfo{ .queue = queue }, std::move(params), &command).move_into(m_indirectBuffer);
-    return true;
-  }
-
-  bool createGeometries(video::CThreadSafeQueueAdapter* queue, const IGeometryCreator* gc)
-  {
-    auto pool = m_device->createCommandPool(queue->getFamilyIndex(), IGPUCommandPool::CREATE_FLAGS::RESET_COMMAND_BUFFER_BIT);
-    if (!pool)
-      return logFail("Couldn't create Command Pool for geometry creation!");
-
-    const auto defaultMaterial = Material{
-      .ambient = {0.2, 0.1, 0.1},
-      .diffuse = {0.8, 0.3, 0.3},
-      .specular = {0.8, 0.8, 0.8},
-      .shininess = 1.0f,
-      .alpha = 1.0f,
-    };
-
-    auto getTranslationMatrix = [](float32_t x, float32_t y, float32_t z)
-      {
-        core::matrix3x4SIMD transform;
-        transform.setTranslation(nbl::core::vectorSIMDf(x, y, z, 0));
-        return transform;
-      };
-
-    core::matrix3x4SIMD planeTransform;
-    planeTransform.setRotation(quaternion::fromAngleAxis(core::radians(-90.0f), vector3df_SIMD{ 1, 0, 0 }));
-
-    const auto cpuObjects = std::array{
-      ReferenceObjectCpu {
-        .meta = {.type = OT_RECTANGLE, .name = "Plane Mesh"},
-        .data = gc->createRectangleMesh(nbl::core::vector2df_SIMD(10, 10)),
-        .material = defaultMaterial,
-        .transform = planeTransform,
-      },
-      ReferenceObjectCpu {
-        .meta = {.type = OT_CUBE, .name = "Cube Mesh"},
-        .data = gc->createCubeMesh(nbl::core::vector3df(1, 1, 1)),
-        .material = defaultMaterial,
-        .transform = getTranslationMatrix(0, 0.5f, 0),
-      },
-      ReferenceObjectCpu {
-        .meta = {.type = OT_CUBE, .name = "Cube Mesh 2"},
-        .data = gc->createCubeMesh(nbl::core::vector3df(1.5, 1.5, 1.5)),
-        .material = Material{
-          .ambient = {0.1, 0.1, 0.2},
-          .diffuse = {0.2, 0.2, 0.8},
-          .specular = {0.8, 0.8, 0.8},
-          .shininess = 1.0f,
-        },
-        .transform = getTranslationMatrix(-5.0f, 1.0f, 0),
-      },
-      ReferenceObjectCpu {
-        .meta = {.type = OT_CUBE, .name = "Transparent Cube Mesh"},
-        .data = gc->createCubeMesh(nbl::core::vector3df(1.5, 1.5, 1.5)),
-        .material = Material{
-          .ambient = {0.1, 0.2, 0.1},
-          .diffuse = {0.2, 0.8, 0.2},
-          .specular = {0.8, 0.8, 0.8},
-          .shininess = 1.0f,
-          .alpha = 0.2,
-        },
-        .transform = getTranslationMatrix(5.0f, 1.0f, 0),
-      },
-    };
-
-    struct ScratchVIBindings
-    {
-      nbl::asset::SBufferBinding<ICPUBuffer> vertex, index;
-    };
-    std::array<ScratchVIBindings, std::size(cpuObjects)> scratchBuffers;
-
-    for (uint32_t i = 0; i < cpuObjects.size(); i++)
-    {
-      const auto& cpuObject = cpuObjects[i];
-
-      auto vBuffer = smart_refctd_ptr(cpuObject.data.bindings[0].buffer); // no offset
-      auto vUsage = bitflag(IGPUBuffer::EUF_STORAGE_BUFFER_BIT) | IGPUBuffer::EUF_TRANSFER_DST_BIT | IGPUBuffer::EUF_INLINE_UPDATE_VIA_CMDBUF |
-        IGPUBuffer::EUF_ACCELERATION_STRUCTURE_BUILD_INPUT_READ_ONLY_BIT | IGPUBuffer::EUF_SHADER_DEVICE_ADDRESS_BIT;
-      vBuffer->addUsageFlags(vUsage);
-      vBuffer->setContentHash(vBuffer->computeContentHash());
-
-      auto iBuffer = smart_refctd_ptr(cpuObject.data.indexBuffer.buffer); // no offset
-      auto iUsage = bitflag(IGPUBuffer::EUF_STORAGE_BUFFER_BIT) | IGPUBuffer::EUF_TRANSFER_DST_BIT | IGPUBuffer::EUF_INLINE_UPDATE_VIA_CMDBUF |
-        IGPUBuffer::EUF_ACCELERATION_STRUCTURE_BUILD_INPUT_READ_ONLY_BIT | IGPUBuffer::EUF_SHADER_DEVICE_ADDRESS_BIT;
-
-      if (cpuObject.data.indexType != EIT_UNKNOWN)
-        if (iBuffer)
-        {
-          iBuffer->addUsageFlags(iUsage);
-          iBuffer->setContentHash(iBuffer->computeContentHash());
-        }
-
-      scratchBuffers[i] = {
-        .vertex = {.offset = 0, .buffer = vBuffer},
-        .index = {.offset = 0, .buffer = iBuffer},
-      };
-
-    }
-
-    auto cmdbuf = getSingleUseCommandBufferAndBegin(pool);
-    cmdbuf->beginDebugMarker("Build geometry vertex and index buffers");
-
-    CAssetConverter::SInputs inputs = {};
-    inputs.logger = m_logger.get();
-    std::array<ICPUBuffer*, std::size(cpuObjects) * 2u> tmpBuffers;
-    {
-      for (uint32_t i = 0; i < cpuObjects.size(); i++)
-      {
-        tmpBuffers[2 * i + 0] = scratchBuffers[i].vertex.buffer.get();
-        tmpBuffers[2 * i + 1] = scratchBuffers[i].index.buffer.get();
-      }
-
-      std::get<CAssetConverter::SInputs::asset_span_t<ICPUBuffer>>(inputs.assets) = tmpBuffers;
-    }
-
-    auto reservation = m_converter->reserve(inputs);
-    {
-      auto prepass = [&]<typename asset_type_t>(const auto & references) -> bool
-      {
-        auto objects = reservation.getGPUObjects<asset_type_t>();
-        uint32_t counter = {};
-        for (auto& object : objects)
-        {
-          auto gpu = object.value;
-          auto* reference = references[counter];
-
-          if (reference)
-          {
-            if (!gpu)
-            {
-              m_logger->log("Failed to convert a CPU object to GPU!", ILogger::ELL_ERROR);
-              return false;
-            }
-          }
-          counter++;
-        }
-        return true;
-      };
-
-      prepass.template operator() < ICPUBuffer > (tmpBuffers);
-    }
-
-    auto geomInfoBuffer = ICPUBuffer::create({ std::size(cpuObjects) * sizeof(STriangleGeomInfo) });
-    STriangleGeomInfo* geomInfos = reinterpret_cast<STriangleGeomInfo*>(geomInfoBuffer->getPointer());
-
-    m_gpuTriangleGeometries.reserve(std::size(cpuObjects));
-    // convert
-    {
-      // not sure if need this (probably not, originally for transition img view)
-      auto semaphore = m_device->createSemaphore(0u);
-
-      std::array<IQueue::SSubmitInfo::SCommandBufferInfo, 1> cmdbufs = {};
-      cmdbufs.front().cmdbuf = cmdbuf.get();
-
-      SIntendedSubmitInfo transfer = {};
-      transfer.queue = queue;
-      transfer.scratchCommandBuffers = cmdbufs;
-      transfer.scratchSemaphore = {
-        .semaphore = semaphore.get(),
-        .value = 0u,
-        .stageMask = PIPELINE_STAGE_FLAGS::ALL_TRANSFER_BITS
-      };
-
-      CAssetConverter::SConvertParams params = {};
-      params.utilities = m_utils.get();
-      params.transfer = &transfer;
-
-      auto future = reservation.convert(params);
-      if (future.copy() != IQueue::RESULT::SUCCESS)
-      {
-        m_logger->log("Failed to await submission feature!", ILogger::ELL_ERROR);
-        return false;
-      }
-
-      auto&& buffers = reservation.getGPUObjects<ICPUBuffer>();
-      for (uint32_t i = 0; i < cpuObjects.size(); i++)
-      {
-        auto& cpuObject = cpuObjects[i];
-
-        m_gpuTriangleGeometries.push_back(ReferenceObjectGpu{
-          .meta = cpuObject.meta,
-          .bindings = {
-            .vertex = {.offset = 0, .buffer = buffers[2 * i + 0].value },
-            .index = {.offset = 0, .buffer = buffers[2 * i + 1].value },
-          },
-          .vertexStride = cpuObject.data.inputParams.bindings[0].stride,
-          .indexType = cpuObject.data.indexType,
-          .indexCount = cpuObject.data.indexCount,
-          .material = hlsl::_static_cast<MaterialPacked>(cpuObject.material),
-          .transform = cpuObject.transform,
-          });
-      }
-
-      for (uint32_t i = 0; i < m_gpuTriangleGeometries.size(); i++)
-      {
-        const auto& gpuObject = m_gpuTriangleGeometries[i];
-        const uint64_t vertexBufferAddress = gpuObject.bindings.vertex.buffer->getDeviceAddress();
-        geomInfos[i] = {
-          .material = gpuObject.material,
-          .vertexBufferAddress = vertexBufferAddress,
-          .indexBufferAddress = gpuObject.useIndex() ? gpuObject.bindings.index.buffer->getDeviceAddress() : vertexBufferAddress,
-          .vertexStride = gpuObject.vertexStride,
-          .objType = gpuObject.meta.type,
-          .indexType = gpuObject.indexType,
-          .smoothNormals = s_smoothNormals[gpuObject.meta.type],
-        };
-      }
-    }
-
-    {
-      IGPUBuffer::SCreationParams params;
-      params.usage = IGPUBuffer::EUF_STORAGE_BUFFER_BIT | IGPUBuffer::EUF_TRANSFER_DST_BIT | IGPUBuffer::EUF_INLINE_UPDATE_VIA_CMDBUF | IGPUBuffer::EUF_SHADER_DEVICE_ADDRESS_BIT;
-      params.size = geomInfoBuffer->getSize();
-      m_utils->createFilledDeviceLocalBufferOnDedMem(SIntendedSubmitInfo{ .queue = queue }, std::move(params), geomInfos).move_into(m_triangleGeomInfoBuffer);
-    }
-
-    // intersection geometries setup
-    {
-      core::vector<SProceduralGeomInfo> proceduralGeoms;
-      proceduralGeoms.reserve(NumberOfProceduralGeometries);
-      using Aabb = IGPUBottomLevelAccelerationStructure::AABB_t;
-      core::vector<Aabb> aabbs;
-      aabbs.reserve(NumberOfProceduralGeometries);
-      for (int32_t i = 0; i < NumberOfProceduralGeometries; i++)
-      {
-        const auto middle_i = NumberOfProceduralGeometries / 2.0;
-        SProceduralGeomInfo sphere = {
-          .material = hlsl::_static_cast<MaterialPacked>(Material{
-            .ambient = {0.1, 0.05 * i, 0.1},
-            .diffuse = {0.3, 0.2 * i, 0.3},
-            .specular = {0.8, 0.8, 0.8},
-            .shininess = 1.0f,
-          }),
-          .center = float32_t3((i - middle_i) * 4.0, 2, 5.0),
-          .radius = 1,
-        };
-
-        proceduralGeoms.push_back(sphere);
-        const auto sphereMin = sphere.center - sphere.radius;
-        const auto sphereMax = sphere.center + sphere.radius;
-        aabbs.emplace_back(
-          vector3d(sphereMin.x, sphereMin.y, sphereMin.z), 
-          vector3d(sphereMax.x, sphereMax.y, sphereMax.z));
-      }
-
-      {
-        IGPUBuffer::SCreationParams params;
-        params.usage = IGPUBuffer::EUF_STORAGE_BUFFER_BIT | IGPUBuffer::EUF_TRANSFER_DST_BIT | IGPUBuffer::EUF_INLINE_UPDATE_VIA_CMDBUF | IGPUBuffer::EUF_SHADER_DEVICE_ADDRESS_BIT;
-        params.size = proceduralGeoms.size() * sizeof(SProceduralGeomInfo);
-        m_utils->createFilledDeviceLocalBufferOnDedMem(SIntendedSubmitInfo{ .queue = queue }, std::move(params), proceduralGeoms.data()).move_into(m_proceduralGeomInfoBuffer);
-      }
-
-      {
-        IGPUBuffer::SCreationParams params;
-        params.usage = IGPUBuffer::EUF_TRANSFER_DST_BIT | IGPUBuffer::EUF_SHADER_DEVICE_ADDRESS_BIT | IGPUBuffer::EUF_ACCELERATION_STRUCTURE_BUILD_INPUT_READ_ONLY_BIT;
-        params.size = aabbs.size() * sizeof(Aabb);
-        m_utils->createFilledDeviceLocalBufferOnDedMem(SIntendedSubmitInfo{ .queue = queue }, std::move(params), aabbs.data()).move_into(m_proceduralAabbBuffer);
-      }
-    }
-
-    return true;
-  }
-
-  void calculateRayTracingStackSize(const smart_refctd_ptr<video::IGPURayTracingPipeline>& pipeline)
-  {
-    const auto raygenStackSize = pipeline->getRaygenStackSize();
-    auto getMaxSize = [&](auto ranges, auto valProj) -> uint16_t
-      {
-        auto maxValue = 0;
-        for (const auto& val : ranges)
-        {
-          maxValue = std::max<uint16_t>(maxValue, std::invoke(valProj, val));
-        }
-        return maxValue;
-      };
-
-    const auto closestHitStackMax = getMaxSize(pipeline->getHitStackSizes(), &IGPURayTracingPipeline::SHitGroupStackSize::closestHit);
-    const auto anyHitStackMax = getMaxSize(pipeline->getHitStackSizes(), &IGPURayTracingPipeline::SHitGroupStackSize::anyHit);
-    const auto intersectionStackMax = getMaxSize(pipeline->getHitStackSizes(), &IGPURayTracingPipeline::SHitGroupStackSize::intersection);
-    const auto missStackMax = getMaxSize(pipeline->getMissStackSizes(), std::identity{});
-    const auto callableStackMax = getMaxSize(pipeline->getCallableStackSizes(), std::identity{});
-    auto firstDepthStackSizeMax = std::max(closestHitStackMax, missStackMax);
-    firstDepthStackSizeMax = std::max<uint16_t>(firstDepthStackSizeMax, intersectionStackMax + anyHitStackMax);
-    m_rayTracingStackSize = raygenStackSize + std::max(firstDepthStackSizeMax, callableStackMax);
-  }
-
-  bool createShaderBindingTable(video::CThreadSafeQueueAdapter* queue, const smart_refctd_ptr<video::IGPURayTracingPipeline>& pipeline)
-  {
-    const auto& limits = m_device->getPhysicalDevice()->getLimits();
-    const auto handleSize = SPhysicalDeviceLimits::ShaderGroupHandleSize;
-    const auto handleSizeAligned = nbl::core::alignUp(handleSize, limits.shaderGroupHandleAlignment);
-
-    auto& raygenRange = m_shaderBindingTable.raygenGroupRange;
-
-    auto& hitRange = m_shaderBindingTable.hitGroupsRange;
-    const auto hitHandles = pipeline->getHitHandles();
-
-    auto& missRange = m_shaderBindingTable.missGroupsRange;
-    const auto missHandles = pipeline->getMissHandles();
-
-    auto& callableRange = m_shaderBindingTable.callableGroupsRange;
-    const auto callableHandles = pipeline->getCallableHandles();
-
-    raygenRange = {
-      .offset = 0,
-      .size = core::alignUp(handleSizeAligned, limits.shaderGroupBaseAlignment)
-    };
-
-    missRange = {
-      .offset = raygenRange.size,
-      .size = core::alignUp(missHandles.size() * handleSizeAligned, limits.shaderGroupBaseAlignment),
-    };
-    m_shaderBindingTable.missGroupsStride = handleSizeAligned;
-
-    hitRange = {
-      .offset = missRange.offset + missRange.size,
-      .size = core::alignUp(hitHandles.size() * handleSizeAligned, limits.shaderGroupBaseAlignment),
-    };
-    m_shaderBindingTable.hitGroupsStride = handleSizeAligned;
-
-    callableRange = {
-      .offset = hitRange.offset + hitRange.size,
-      .size = core::alignUp(callableHandles.size() * handleSizeAligned, limits.shaderGroupBaseAlignment),
-    };
-    m_shaderBindingTable.callableGroupsStride = handleSizeAligned;
-
-    const auto bufferSize = raygenRange.size + missRange.size + hitRange.size + callableRange.size;
-
-    ICPUBuffer::SCreationParams cpuBufferParams;
-    cpuBufferParams.size = bufferSize;
-    auto cpuBuffer = ICPUBuffer::create(std::move(cpuBufferParams));
-    uint8_t* pData = reinterpret_cast<uint8_t*>(cpuBuffer->getPointer());
-
-    // copy raygen region
-    memcpy(pData, &pipeline->getRaygen(), handleSize);
-
-    // copy miss region
-    uint8_t* pMissData = pData + missRange.offset;
-    for (const auto& handle : missHandles)
-    {
-      memcpy(pMissData, &handle, handleSize);
-      pMissData += m_shaderBindingTable.missGroupsStride;
-    }
-
-    // copy hit region
-    uint8_t* pHitData = pData + hitRange.offset;
-    for (const auto& handle : hitHandles)
-    {
-      memcpy(pHitData, &handle, handleSize);
-      pHitData += m_shaderBindingTable.hitGroupsStride;
-    }
-
-    // copy callable region
-    uint8_t* pCallableData = pData + callableRange.offset;
-    for (const auto& handle : callableHandles)
-    {
-      memcpy(pCallableData, &handle, handleSize);
-      pCallableData += m_shaderBindingTable.callableGroupsStride;
-    }
-
-    {
-      IGPUBuffer::SCreationParams params;
-      params.usage = IGPUBuffer::EUF_TRANSFER_DST_BIT | IGPUBuffer::EUF_INLINE_UPDATE_VIA_CMDBUF | IGPUBuffer::EUF_SHADER_DEVICE_ADDRESS_BIT | IGPUBuffer::EUF_SHADER_BINDING_TABLE_BIT;
-      params.size = bufferSize;
-      m_utils->createFilledDeviceLocalBufferOnDedMem(SIntendedSubmitInfo{ .queue = queue }, std::move(params), pData).move_into(raygenRange.buffer);
-      missRange.buffer = core::smart_refctd_ptr(raygenRange.buffer);
-      hitRange.buffer = core::smart_refctd_ptr(raygenRange.buffer);
-      callableRange.buffer = core::smart_refctd_ptr(raygenRange.buffer);
-    }
-
-    return true;
-  }
-
-  bool createAccelerationStructures(video::CThreadSafeQueueAdapter* queue)
-  {
-    // plus 1 blas for procedural geometry contains {{var::NumberOfProcedural}}
-    // spheres. Each sphere is a primitive instead one instance or geometry
-    const auto blasCount = m_gpuTriangleGeometries.size() + 1;
-    const auto proceduralBlasIdx = m_gpuTriangleGeometries.size();
-
-    IQueryPool::SCreationParams qParams{ .queryCount = static_cast<uint32_t>(blasCount), .queryType = IQueryPool::ACCELERATION_STRUCTURE_COMPACTED_SIZE };
-    smart_refctd_ptr<IQueryPool> queryPool = m_device->createQueryPool(std::move(qParams));
-
-    auto pool = m_device->createCommandPool(queue->getFamilyIndex(), IGPUCommandPool::CREATE_FLAGS::RESET_COMMAND_BUFFER_BIT | IGPUCommandPool::CREATE_FLAGS::TRANSIENT_BIT);
-    if (!pool)
-      return logFail("Couldn't create Command Pool for blas/tlas creation!");
-
-    m_api->startCapture();
-#ifdef TRY_BUILD_FOR_NGFX // NSight is "debugger-challenged" it can't capture anything not happenning "during a frame", so we need to trick it
-    m_currentImageAcquire = m_surface->acquireNextImage();
-    {
-      const IQueue::SSubmitInfo::SSemaphoreInfo acquired[] = { {
-        .semaphore = m_currentImageAcquire.semaphore,
-        .value = m_currentImageAcquire.acquireCount,
-        .stageMask = PIPELINE_STAGE_FLAGS::ALL_COMMANDS_BITS
-      } };
-      m_surface->present(m_currentImageAcquire.imageIndex, acquired);
-    }
-    m_currentImageAcquire = m_surface->acquireNextImage();
-#endif
-    size_t totalScratchSize = 0;
-    const auto scratchOffsetAlignment = m_device->getPhysicalDevice()->getLimits().minAccelerationStructureScratchOffsetAlignment;
-
-    // build bottom level ASes
-    {
-      core::vector<uint32_t> primitiveCounts(blasCount);
-      core::vector<IGPUBottomLevelAccelerationStructure::Triangles<const IGPUBuffer>> triangles(m_gpuTriangleGeometries.size());
-      core::vector<uint32_t> scratchSizes(blasCount);
-      IGPUBottomLevelAccelerationStructure::AABBs<const IGPUBuffer> aabbs;
-
-      auto blasFlags = bitflag(IGPUBottomLevelAccelerationStructure::BUILD_FLAGS::PREFER_FAST_TRACE_BIT) | IGPUBottomLevelAccelerationStructure::BUILD_FLAGS::ALLOW_COMPACTION_BIT;
-      if (m_physicalDevice->getProperties().limits.rayTracingPositionFetch)
-        blasFlags |= IGPUBottomLevelAccelerationStructure::BUILD_FLAGS::ALLOW_DATA_ACCESS_KHR;
-
-      IGPUBottomLevelAccelerationStructure::DeviceBuildInfo initBuildInfo;
-      initBuildInfo.buildFlags = blasFlags;
-      initBuildInfo.geometryCount = 1;	// only 1 geometry object per blas
-      initBuildInfo.srcAS = nullptr;
-      initBuildInfo.dstAS = nullptr;
-      initBuildInfo.scratch = {};
-
-      auto blasBuildInfos = core::vector(blasCount, initBuildInfo);
-
-      m_gpuBlasList.resize(blasCount);
-      // setup blas info for triangle geometries
-      for (uint32_t i = 0; i < blasCount; i++)
-      {
-        const auto isProcedural = i == proceduralBlasIdx;
-        if (isProcedural)
-        {
-          aabbs.data.buffer = smart_refctd_ptr(m_proceduralAabbBuffer);
-          aabbs.data.offset = 0;
-          aabbs.stride = sizeof(IGPUBottomLevelAccelerationStructure::AABB_t);
-          aabbs.geometryFlags = IGPUBottomLevelAccelerationStructure::GEOMETRY_FLAGS::OPAQUE_BIT; // only allow opaque for now
-
-          primitiveCounts[proceduralBlasIdx] = NumberOfProceduralGeometries;
-          blasBuildInfos[proceduralBlasIdx].aabbs = &aabbs;
-          blasBuildInfos[proceduralBlasIdx].buildFlags |= IGPUBottomLevelAccelerationStructure::BUILD_FLAGS::GEOMETRY_TYPE_IS_AABB_BIT;
-        } else
-        {
-          const auto& gpuObject = m_gpuTriangleGeometries[i];
-
-          const uint32_t vertexStride = gpuObject.vertexStride;
-          const uint32_t numVertices = gpuObject.bindings.vertex.buffer->getSize() / vertexStride;
-          if (gpuObject.useIndex())
-            primitiveCounts[i] = gpuObject.indexCount / 3;
-          else
-            primitiveCounts[i] = numVertices / 3;
-
-          triangles[i].vertexData[0] = gpuObject.bindings.vertex;
-          triangles[i].indexData = gpuObject.useIndex() ? gpuObject.bindings.index : gpuObject.bindings.vertex;
-          triangles[i].maxVertex = numVertices - 1;
-          triangles[i].vertexStride = vertexStride;
-          triangles[i].vertexFormat = EF_R32G32B32_SFLOAT;
-          triangles[i].indexType = gpuObject.indexType;
-          triangles[i].geometryFlags = gpuObject.material.isTransparent() ?
-            IGPUBottomLevelAccelerationStructure::GEOMETRY_FLAGS::NO_DUPLICATE_ANY_HIT_INVOCATION_BIT :
-            IGPUBottomLevelAccelerationStructure::GEOMETRY_FLAGS::OPAQUE_BIT;
-
-          blasBuildInfos[i].triangles = &triangles[i];
-        }
-        ILogicalDevice::AccelerationStructureBuildSizes buildSizes;
-        {
-          const uint32_t maxPrimCount[1] = { primitiveCounts[i] };
-          if (isProcedural)
-          {
-            buildSizes = m_device->getAccelerationStructureBuildSizes(blasBuildInfos[i].buildFlags, false, std::span{&aabbs, 1}, maxPrimCount);
-          } else
-          {
-            buildSizes = m_device->getAccelerationStructureBuildSizes(blasBuildInfos[i].buildFlags, false, std::span{&triangles[i], 1}, maxPrimCount);
-          }
-          if (!buildSizes)
-            return logFail("Failed to get BLAS build sizes");
-        }
-
-        scratchSizes[i] = buildSizes.buildScratchSize;
-        totalScratchSize = core::alignUp(totalScratchSize, scratchOffsetAlignment);
-        totalScratchSize += buildSizes.buildScratchSize;
-
-        {
-          IGPUBuffer::SCreationParams params;
-          params.usage = bitflag(IGPUBuffer::EUF_SHADER_DEVICE_ADDRESS_BIT) | IGPUBuffer::EUF_ACCELERATION_STRUCTURE_STORAGE_BIT;
-          params.size = buildSizes.accelerationStructureSize;
-          smart_refctd_ptr<IGPUBuffer> asBuffer = createBuffer(params);
-
-          IGPUBottomLevelAccelerationStructure::SCreationParams blasParams;
-          blasParams.bufferRange.buffer = asBuffer;
-          blasParams.bufferRange.offset = 0u;
-          blasParams.bufferRange.size = buildSizes.accelerationStructureSize;
-          blasParams.flags = IGPUBottomLevelAccelerationStructure::SCreationParams::FLAGS::NONE;
-          m_gpuBlasList[i] = m_device->createBottomLevelAccelerationStructure(std::move(blasParams));
-          if (!m_gpuBlasList[i])
-            return logFail("Could not create BLAS");
-        }
-      }
-
-
-      auto cmdbufBlas = getSingleUseCommandBufferAndBegin(pool);
-      cmdbufBlas->beginDebugMarker("Build BLAS");
-
-      cmdbufBlas->resetQueryPool(queryPool.get(), 0, blasCount);
-
-      smart_refctd_ptr<IGPUBuffer> scratchBuffer;
-      {
-        IGPUBuffer::SCreationParams params;
-        params.usage = bitflag(IGPUBuffer::EUF_SHADER_DEVICE_ADDRESS_BIT) | IGPUBuffer::EUF_STORAGE_BUFFER_BIT;
-        params.size = totalScratchSize;
-        scratchBuffer = createBuffer(params);
-      }
-
-      core::vector<IGPUBottomLevelAccelerationStructure::BuildRangeInfo> buildRangeInfos(blasCount);
-      core::vector<IGPUBottomLevelAccelerationStructure::BuildRangeInfo*> pRangeInfos(blasCount);
-      for (uint32_t i = 0; i < blasCount; i++)
-      {
-        blasBuildInfos[i].dstAS = m_gpuBlasList[i].get();
-        blasBuildInfos[i].scratch.buffer = scratchBuffer;
-        if (i == 0)
-        {
-          blasBuildInfos[i].scratch.offset = 0u;
-        } else
-        {
-          const auto unalignedOffset = blasBuildInfos[i - 1].scratch.offset + scratchSizes[i - 1];
-          blasBuildInfos[i].scratch.offset = core::alignUp(unalignedOffset, scratchOffsetAlignment);
-        }
-
-        buildRangeInfos[i].primitiveCount = primitiveCounts[i];
-        buildRangeInfos[i].primitiveByteOffset = 0u;
-        buildRangeInfos[i].firstVertex = 0u;
-        buildRangeInfos[i].transformByteOffset = 0u;
-
-        pRangeInfos[i] = &buildRangeInfos[i];
-      }
-
-      if (!cmdbufBlas->buildAccelerationStructures(std::span(blasBuildInfos), pRangeInfos.data()))
-        return logFail("Failed to build BLAS");
-
-      {
-        SMemoryBarrier memBarrier;
-        memBarrier.srcStageMask = PIPELINE_STAGE_FLAGS::ACCELERATION_STRUCTURE_BUILD_BIT;
-        memBarrier.srcAccessMask = ACCESS_FLAGS::ACCELERATION_STRUCTURE_WRITE_BIT;
-        memBarrier.dstStageMask = PIPELINE_STAGE_FLAGS::ACCELERATION_STRUCTURE_BUILD_BIT;
-        memBarrier.dstAccessMask = ACCESS_FLAGS::ACCELERATION_STRUCTURE_READ_BIT;
-        cmdbufBlas->pipelineBarrier(E_DEPENDENCY_FLAGS::EDF_NONE, { .memBarriers = {&memBarrier, 1} });
-      }
-
-
-      core::vector<const IGPUAccelerationStructure*> ases(blasCount);
-      for (uint32_t i = 0; i < blasCount; i++)
-        ases[i] = m_gpuBlasList[i].get();
-      if (!cmdbufBlas->writeAccelerationStructureProperties(std::span(ases), IQueryPool::ACCELERATION_STRUCTURE_COMPACTED_SIZE,
-        queryPool.get(), 0))
-        return logFail("Failed to write acceleration structure properties!");
-
-      cmdbufBlas->endDebugMarker();
-      cmdbufSubmitAndWait(cmdbufBlas, queue, 39);
-    }
-
-    auto cmdbufCompact = getSingleUseCommandBufferAndBegin(pool);
-    cmdbufCompact->beginDebugMarker("Compact BLAS");
-
-    // compact blas
-    {
-      core::vector<size_t> asSizes(blasCount);
-      if (!m_device->getQueryPoolResults(queryPool.get(), 0, blasCount, asSizes.data(), sizeof(size_t), bitflag(IQueryPool::WAIT_BIT) | IQueryPool::_64_BIT))
-        return logFail("Could not get query pool results for AS sizes");
-
-      core::vector<smart_refctd_ptr<IGPUBottomLevelAccelerationStructure>> cleanupBlas(blasCount);
-      for (uint32_t i = 0; i < blasCount; i++)
-      {
-        if (asSizes[i] == 0) continue;
-        cleanupBlas[i] = m_gpuBlasList[i];
-        {
-          IGPUBuffer::SCreationParams params;
-          params.usage = bitflag(IGPUBuffer::EUF_SHADER_DEVICE_ADDRESS_BIT) | IGPUBuffer::EUF_ACCELERATION_STRUCTURE_STORAGE_BIT;
-          params.size = asSizes[i];
-          smart_refctd_ptr<IGPUBuffer> asBuffer = createBuffer(params);
-
-          IGPUBottomLevelAccelerationStructure::SCreationParams blasParams;
-          blasParams.bufferRange.buffer = asBuffer;
-          blasParams.bufferRange.offset = 0u;
-          blasParams.bufferRange.size = asSizes[i];
-          blasParams.flags = IGPUBottomLevelAccelerationStructure::SCreationParams::FLAGS::NONE;
-          m_gpuBlasList[i] = m_device->createBottomLevelAccelerationStructure(std::move(blasParams));
-          if (!m_gpuBlasList[i])
-            return logFail("Could not create compacted BLAS");
-        }
-
-        IGPUBottomLevelAccelerationStructure::CopyInfo copyInfo;
-        copyInfo.src = cleanupBlas[i].get();
-        copyInfo.dst = m_gpuBlasList[i].get();
-        copyInfo.mode = IGPUBottomLevelAccelerationStructure::COPY_MODE::COMPACT;
-        if (!cmdbufCompact->copyAccelerationStructure(copyInfo))
-          return logFail("Failed to copy AS to compact");
-      }
-    }
-
-    cmdbufCompact->endDebugMarker();
-    cmdbufSubmitAndWait(cmdbufCompact, queue, 40);
-
-    auto cmdbufTlas = getSingleUseCommandBufferAndBegin(pool);
-    cmdbufTlas->beginDebugMarker("Build TLAS");
-
-    // build top level AS
-    {
-      const uint32_t instancesCount = blasCount;
-      core::vector<IGPUTopLevelAccelerationStructure::DeviceStaticInstance> instances(instancesCount);
-      for (uint32_t i = 0; i < instancesCount; i++)
-      {
-        const auto isProceduralInstance = i == proceduralBlasIdx;
-        instances[i].base.blas.deviceAddress = m_gpuBlasList[i]->getReferenceForDeviceOperations().deviceAddress;
-        instances[i].base.mask = 0xFF;
-        instances[i].base.instanceCustomIndex = i;
-        instances[i].base.instanceShaderBindingTableRecordOffset = isProceduralInstance ? 2 : 0;
-        instances[i].base.flags = static_cast<uint32_t>(IGPUTopLevelAccelerationStructure::INSTANCE_FLAGS::TRIANGLE_FACING_CULL_DISABLE_BIT);
-        instances[i].transform = isProceduralInstance ? matrix3x4SIMD() : m_gpuTriangleGeometries[i].transform;
-      }
-
-      {
-        size_t bufSize = instancesCount * sizeof(IGPUTopLevelAccelerationStructure::DeviceStaticInstance);
-        IGPUBuffer::SCreationParams params;
-        params.usage = bitflag(IGPUBuffer::EUF_ACCELERATION_STRUCTURE_BUILD_INPUT_READ_ONLY_BIT) | IGPUBuffer::EUF_STORAGE_BUFFER_BIT |
-          IGPUBuffer::EUF_INLINE_UPDATE_VIA_CMDBUF | IGPUBuffer::EUF_TRANSFER_DST_BIT | IGPUBuffer::EUF_SHADER_DEVICE_ADDRESS_BIT;
-        params.size = bufSize;
-        m_instanceBuffer = createBuffer(params);
-
-        SBufferRange<IGPUBuffer> range = { .offset = 0u, .size = bufSize, .buffer = m_instanceBuffer };
-        cmdbufTlas->updateBuffer(range, instances.data());
-      }
-
-      // make sure instances upload complete first
-      {
-        SMemoryBarrier memBarrier;
-        memBarrier.srcStageMask = PIPELINE_STAGE_FLAGS::ALL_TRANSFER_BITS;
-        memBarrier.srcAccessMask = ACCESS_FLAGS::TRANSFER_WRITE_BIT;
-        memBarrier.dstStageMask = PIPELINE_STAGE_FLAGS::ACCELERATION_STRUCTURE_BUILD_BIT;
-        memBarrier.dstAccessMask = ACCESS_FLAGS::ACCELERATION_STRUCTURE_WRITE_BIT;
-        cmdbufTlas->pipelineBarrier(E_DEPENDENCY_FLAGS::EDF_NONE, { .memBarriers = {&memBarrier, 1} });
-      }
-
-      auto tlasFlags = bitflag(IGPUTopLevelAccelerationStructure::BUILD_FLAGS::PREFER_FAST_TRACE_BIT);
-
-      IGPUTopLevelAccelerationStructure::DeviceBuildInfo tlasBuildInfo;
-      tlasBuildInfo.buildFlags = tlasFlags;
-      tlasBuildInfo.srcAS = nullptr;
-      tlasBuildInfo.dstAS = nullptr;
-      tlasBuildInfo.instanceData.buffer = m_instanceBuffer;
-      tlasBuildInfo.instanceData.offset = 0u;
-      tlasBuildInfo.scratch = {};
-
-      auto buildSizes = m_device->getAccelerationStructureBuildSizes(tlasFlags, false, instancesCount);
-      if (!buildSizes)
-        return logFail("Failed to get TLAS build sizes");
-
-      {
-        IGPUBuffer::SCreationParams params;
-        params.usage = bitflag(IGPUBuffer::EUF_SHADER_DEVICE_ADDRESS_BIT) | IGPUBuffer::EUF_ACCELERATION_STRUCTURE_STORAGE_BIT;
-        params.size = buildSizes.accelerationStructureSize;
-        smart_refctd_ptr<IGPUBuffer> asBuffer = createBuffer(params);
-
-        IGPUTopLevelAccelerationStructure::SCreationParams tlasParams;
-        tlasParams.bufferRange.buffer = asBuffer;
-        tlasParams.bufferRange.offset = 0u;
-        tlasParams.bufferRange.size = buildSizes.accelerationStructureSize;
-        tlasParams.flags = IGPUTopLevelAccelerationStructure::SCreationParams::FLAGS::NONE;
-        m_gpuTlas = m_device->createTopLevelAccelerationStructure(std::move(tlasParams));
-        if (!m_gpuTlas)
-          return logFail("Could not create TLAS");
-      }
-
-      smart_refctd_ptr<IGPUBuffer> scratchBuffer;
-      {
-        IGPUBuffer::SCreationParams params;
-        params.usage = bitflag(IGPUBuffer::EUF_SHADER_DEVICE_ADDRESS_BIT) | IGPUBuffer::EUF_STORAGE_BUFFER_BIT;
-        params.size = buildSizes.buildScratchSize;
-        scratchBuffer = createBuffer(params);
-      }
-
-      tlasBuildInfo.dstAS = m_gpuTlas.get();
-      tlasBuildInfo.scratch.buffer = scratchBuffer;
-      tlasBuildInfo.scratch.offset = 0u;
-
-      IGPUTopLevelAccelerationStructure::BuildRangeInfo buildRangeInfo[1u];
-      buildRangeInfo[0].instanceCount = instancesCount;
-      buildRangeInfo[0].instanceByteOffset = 0u;
-      IGPUTopLevelAccelerationStructure::BuildRangeInfo* pRangeInfos;
-      pRangeInfos = &buildRangeInfo[0];
-
-      if (!cmdbufTlas->buildAccelerationStructures({ &tlasBuildInfo, 1 }, pRangeInfos))
-        return logFail("Failed to build TLAS");
-    }
-
-    cmdbufTlas->endDebugMarker();
-    cmdbufSubmitAndWait(cmdbufTlas, queue, 45);
-
-#ifdef TRY_BUILD_FOR_NGFX
-    {
-      const IQueue::SSubmitInfo::SSemaphoreInfo acquired[] = { {
-        .semaphore = m_currentImageAcquire.semaphore,
-        .value = m_currentImageAcquire.acquireCount,
-        .stageMask = PIPELINE_STAGE_FLAGS::ALL_COMMANDS_BITS
-      } };
-      m_surface->present(m_currentImageAcquire.imageIndex, acquired);
-    }
-#endif
-    m_api->endCapture();
-
-    return true;
-  }
-
-
-  smart_refctd_ptr<IWindow> m_window;
-  smart_refctd_ptr<CSimpleResizeSurface<ISimpleManagedSurface::ISwapchainResources>> m_surface;
-  smart_refctd_ptr<ISemaphore> m_semaphore;
-  uint64_t m_realFrameIx = 0;
-  uint32_t m_frameAccumulationCounter = 0;
-  std::array<smart_refctd_ptr<IGPUCommandBuffer>, MaxFramesInFlight> m_cmdBufs;
-  ISimpleManagedSurface::SAcquireResult m_currentImageAcquire = {};
-
-  core::smart_refctd_ptr<InputSystem> m_inputSystem;
-  InputSystem::ChannelReader<IMouseEventChannel> m_mouse;
-  InputSystem::ChannelReader<IKeyboardEventChannel> m_keyboard;
-
-  struct CameraSetting
-  {
-    float fov = 60.f;
-    float zNear = 0.1f;
-    float zFar = 10000.f;
-    float moveSpeed = 1.f;
-    float rotateSpeed = 1.f;
-    float viewWidth = 10.f;
-    float camYAngle = 165.f / 180.f * 3.14159f;
-    float camXAngle = 32.f / 180.f * 3.14159f;
-    
-  } m_cameraSetting;
-  Camera m_camera = Camera(core::vectorSIMDf(0, 0, 0), core::vectorSIMDf(0, 0, 0), core::matrix4SIMD());
-
-  Light m_light = {
-    .direction = {-1.0f, -1.0f, -0.4f},
-    .position = {10.0f, 15.0f, 8.0f},
-    .outerCutoff = 0.866025404f, // {cos(radians(30.0f))}, 
-    .type = ELT_DIRECTIONAL
-  };
-
-  video::CDumbPresentationOracle m_oracle;
-
-  struct C_UI
-  {
-    nbl::core::smart_refctd_ptr<nbl::ext::imgui::UI> manager;
-
-    struct
-    {
-      core::smart_refctd_ptr<video::IGPUSampler> gui, scene;
-    } samplers;
-
-    core::smart_refctd_ptr<IGPUDescriptorSet> descriptorSet;
-  } m_ui;
-  core::smart_refctd_ptr<IDescriptorPool> m_guiDescriptorSetPool;
-
-  core::vector<ReferenceObjectGpu> m_gpuTriangleGeometries;
-  core::vector<SProceduralGeomInfo> m_gpuIntersectionSpheres;
-  uint32_t m_intersectionHitGroupIdx;
-
-  std::vector<smart_refctd_ptr<IGPUBottomLevelAccelerationStructure>> m_gpuBlasList;
-  smart_refctd_ptr<IGPUTopLevelAccelerationStructure> m_gpuTlas;
-  smart_refctd_ptr<IGPUBuffer> m_instanceBuffer;
-
-  smart_refctd_ptr<IGPUBuffer> m_triangleGeomInfoBuffer;
-  smart_refctd_ptr<IGPUBuffer> m_proceduralGeomInfoBuffer;
-  smart_refctd_ptr<IGPUBuffer> m_proceduralAabbBuffer;
-  smart_refctd_ptr<IGPUBuffer> m_indirectBuffer;
-
-  smart_refctd_ptr<IGPUImage> m_hdrImage;
-  smart_refctd_ptr<IGPUImageView> m_hdrImageView;
-
-  smart_refctd_ptr<IDescriptorPool> m_rayTracingDsPool;
-  smart_refctd_ptr<IGPUDescriptorSet> m_rayTracingDs;
-  smart_refctd_ptr<IGPURayTracingPipeline> m_rayTracingPipeline;
-  uint64_t m_rayTracingStackSize;
-  ShaderBindingTable m_shaderBindingTable;
-
-  smart_refctd_ptr<IGPUDescriptorSet> m_presentDs;
-  smart_refctd_ptr<IDescriptorPool> m_presentDsPool;
-  smart_refctd_ptr<IGPUGraphicsPipeline> m_presentPipeline;
-
-  smart_refctd_ptr<CAssetConverter> m_converter;
-
-
-  core::matrix4SIMD m_cachedModelViewProjectionMatrix;
-  bool m_useIndirectCommand = false;
+	uint32_t getWorkgroupCount(uint32_t dim, uint32_t size)
+	{
+		return (dim + size - 1) / size;
+	}
+
+	bool createIndirectBuffer()
+	{
+		const auto getBufferRangeAddress = [](const SBufferRange<IGPUBuffer>& range)
+			{
+				return range.buffer->getDeviceAddress() + range.offset;
+			};
+		const auto command = TraceRaysIndirectCommand_t{
+		  .raygenShaderRecordAddress = getBufferRangeAddress(m_shaderBindingTable.raygenGroupRange),
+		  .raygenShaderRecordSize = m_shaderBindingTable.raygenGroupRange.size,
+		  .missShaderBindingTableAddress = getBufferRangeAddress(m_shaderBindingTable.missGroupsRange),
+		  .missShaderBindingTableSize = m_shaderBindingTable.missGroupsRange.size,
+		  .missShaderBindingTableStride = m_shaderBindingTable.missGroupsStride,
+		  .hitShaderBindingTableAddress = getBufferRangeAddress(m_shaderBindingTable.hitGroupsRange),
+		  .hitShaderBindingTableSize = m_shaderBindingTable.hitGroupsRange.size,
+		  .hitShaderBindingTableStride = m_shaderBindingTable.hitGroupsStride,
+		  .callableShaderBindingTableAddress = getBufferRangeAddress(m_shaderBindingTable.callableGroupsRange),
+		  .callableShaderBindingTableSize = m_shaderBindingTable.callableGroupsRange.size,
+		  .callableShaderBindingTableStride = m_shaderBindingTable.callableGroupsStride,
+		  .width = WIN_W,
+		  .height = WIN_H,
+		  .depth = 1,
+		};
+		IGPUBuffer::SCreationParams params;
+		params.usage = IGPUBuffer::EUF_TRANSFER_DST_BIT | IGPUBuffer::EUF_INDIRECT_BUFFER_BIT | IGPUBuffer::EUF_SHADER_DEVICE_ADDRESS_BIT;
+		params.size = sizeof(TraceRaysIndirectCommand_t);
+		m_utils->createFilledDeviceLocalBufferOnDedMem(SIntendedSubmitInfo{ .queue = getGraphicsQueue() }, std::move(params), &command).move_into(m_indirectBuffer);
+		return true;
+	}
+
+	void calculateRayTracingStackSize(const smart_refctd_ptr<video::IGPURayTracingPipeline>& pipeline)
+	{
+		const auto raygenStackSize = pipeline->getRaygenStackSize();
+		auto getMaxSize = [&](auto ranges, auto valProj) -> uint16_t
+			{
+				auto maxValue = 0;
+				for (const auto& val : ranges)
+				{
+					maxValue = std::max<uint16_t>(maxValue, std::invoke(valProj, val));
+				}
+				return maxValue;
+			};
+
+		const auto closestHitStackMax = getMaxSize(pipeline->getHitStackSizes(), &IGPURayTracingPipeline::SHitGroupStackSize::closestHit);
+		const auto anyHitStackMax = getMaxSize(pipeline->getHitStackSizes(), &IGPURayTracingPipeline::SHitGroupStackSize::anyHit);
+		const auto intersectionStackMax = getMaxSize(pipeline->getHitStackSizes(), &IGPURayTracingPipeline::SHitGroupStackSize::intersection);
+		const auto missStackMax = getMaxSize(pipeline->getMissStackSizes(), std::identity{});
+		const auto callableStackMax = getMaxSize(pipeline->getCallableStackSizes(), std::identity{});
+		auto firstDepthStackSizeMax = std::max(closestHitStackMax, missStackMax);
+		firstDepthStackSizeMax = std::max<uint16_t>(firstDepthStackSizeMax, intersectionStackMax + anyHitStackMax);
+		m_rayTracingStackSize = raygenStackSize + std::max(firstDepthStackSizeMax, callableStackMax);
+	}
+
+	bool createShaderBindingTable(const smart_refctd_ptr<video::IGPURayTracingPipeline>& pipeline)
+	{
+		const auto& limits = m_device->getPhysicalDevice()->getLimits();
+		const auto handleSize = SPhysicalDeviceLimits::ShaderGroupHandleSize;
+		const auto handleSizeAligned = nbl::core::alignUp(handleSize, limits.shaderGroupHandleAlignment);
+
+		auto& raygenRange = m_shaderBindingTable.raygenGroupRange;
+
+		auto& hitRange = m_shaderBindingTable.hitGroupsRange;
+		const auto hitHandles = pipeline->getHitHandles();
+
+		auto& missRange = m_shaderBindingTable.missGroupsRange;
+		const auto missHandles = pipeline->getMissHandles();
+
+		auto& callableRange = m_shaderBindingTable.callableGroupsRange;
+		const auto callableHandles = pipeline->getCallableHandles();
+
+		raygenRange = {
+		  .offset = 0,
+		  .size = core::alignUp(handleSizeAligned, limits.shaderGroupBaseAlignment)
+		};
+
+		missRange = {
+		  .offset = raygenRange.size,
+		  .size = core::alignUp(missHandles.size() * handleSizeAligned, limits.shaderGroupBaseAlignment),
+		};
+		m_shaderBindingTable.missGroupsStride = handleSizeAligned;
+
+		hitRange = {
+		  .offset = missRange.offset + missRange.size,
+		  .size = core::alignUp(hitHandles.size() * handleSizeAligned, limits.shaderGroupBaseAlignment),
+		};
+		m_shaderBindingTable.hitGroupsStride = handleSizeAligned;
+
+		callableRange = {
+		  .offset = hitRange.offset + hitRange.size,
+		  .size = core::alignUp(callableHandles.size() * handleSizeAligned, limits.shaderGroupBaseAlignment),
+		};
+		m_shaderBindingTable.callableGroupsStride = handleSizeAligned;
+
+		const auto bufferSize = raygenRange.size + missRange.size + hitRange.size + callableRange.size;
+
+		ICPUBuffer::SCreationParams cpuBufferParams;
+		cpuBufferParams.size = bufferSize;
+		auto cpuBuffer = ICPUBuffer::create(std::move(cpuBufferParams));
+		uint8_t* pData = reinterpret_cast<uint8_t*>(cpuBuffer->getPointer());
+
+		// copy raygen region
+		memcpy(pData, &pipeline->getRaygen(), handleSize);
+
+		// copy miss region
+		uint8_t* pMissData = pData + missRange.offset;
+		for (const auto& handle : missHandles)
+		{
+			memcpy(pMissData, &handle, handleSize);
+			pMissData += m_shaderBindingTable.missGroupsStride;
+		}
+
+		// copy hit region
+		uint8_t* pHitData = pData + hitRange.offset;
+		for (const auto& handle : hitHandles)
+		{
+			memcpy(pHitData, &handle, handleSize);
+			pHitData += m_shaderBindingTable.hitGroupsStride;
+		}
+
+		// copy callable region
+		uint8_t* pCallableData = pData + callableRange.offset;
+		for (const auto& handle : callableHandles)
+		{
+			memcpy(pCallableData, &handle, handleSize);
+			pCallableData += m_shaderBindingTable.callableGroupsStride;
+		}
+
+		{
+			IGPUBuffer::SCreationParams params;
+			params.usage = IGPUBuffer::EUF_TRANSFER_DST_BIT | IGPUBuffer::EUF_INLINE_UPDATE_VIA_CMDBUF | IGPUBuffer::EUF_SHADER_DEVICE_ADDRESS_BIT | IGPUBuffer::EUF_SHADER_BINDING_TABLE_BIT;
+			params.size = bufferSize;
+			m_utils->createFilledDeviceLocalBufferOnDedMem(SIntendedSubmitInfo{ .queue = getGraphicsQueue() }, std::move(params), pData).move_into(raygenRange.buffer);
+			missRange.buffer = core::smart_refctd_ptr(raygenRange.buffer);
+			hitRange.buffer = core::smart_refctd_ptr(raygenRange.buffer);
+			callableRange.buffer = core::smart_refctd_ptr(raygenRange.buffer);
+		}
+
+		return true;
+	}
+
+	bool createAccelerationStructuresFromGeometry(const IGeometryCreator* gc)
+	{
+		auto queue = getGraphicsQueue();
+		// get geometries into ICPUBuffers
+		auto pool = m_device->createCommandPool(queue->getFamilyIndex(), IGPUCommandPool::CREATE_FLAGS::RESET_COMMAND_BUFFER_BIT);
+		if (!pool)
+			return logFail("Couldn't create Command Pool for geometry creation!");
+
+		const auto defaultMaterial = Material{
+		  .ambient = {0.2, 0.1, 0.1},
+		  .diffuse = {0.8, 0.3, 0.3},
+		  .specular = {0.8, 0.8, 0.8},
+		  .shininess = 1.0f,
+		  .alpha = 1.0f,
+		};
+
+		auto getTranslationMatrix = [](float32_t x, float32_t y, float32_t z)
+			{
+				core::matrix3x4SIMD transform;
+				transform.setTranslation(nbl::core::vectorSIMDf(x, y, z, 0));
+				return transform;
+			};
+
+		core::matrix3x4SIMD planeTransform;
+		planeTransform.setRotation(quaternion::fromAngleAxis(core::radians(-90.0f), vector3df_SIMD{ 1, 0, 0 }));
+
+		// triangles geometries
+		const auto cpuObjects = std::array{
+			ReferenceObjectCpu {
+				.meta = {.type = OT_RECTANGLE, .name = "Plane Mesh"},
+				.data = gc->createRectangleMesh(nbl::core::vector2df_SIMD(10, 10)),
+				.material = defaultMaterial,
+				.transform = planeTransform,
+			},
+			ReferenceObjectCpu {
+				.meta = {.type = OT_CUBE, .name = "Cube Mesh"},
+				.data = gc->createCubeMesh(nbl::core::vector3df(1, 1, 1)),
+				.material = defaultMaterial,
+				.transform = getTranslationMatrix(0, 0.5f, 0),
+			},
+			ReferenceObjectCpu {
+				.meta = {.type = OT_CUBE, .name = "Cube Mesh 2"},
+				.data = gc->createCubeMesh(nbl::core::vector3df(1.5, 1.5, 1.5)),
+				.material = Material{
+					.ambient = {0.1, 0.1, 0.2},
+					.diffuse = {0.2, 0.2, 0.8},
+					.specular = {0.8, 0.8, 0.8},
+					.shininess = 1.0f,
+				},
+				.transform = getTranslationMatrix(-5.0f, 1.0f, 0),
+			},
+			ReferenceObjectCpu {
+				.meta = {.type = OT_CUBE, .name = "Transparent Cube Mesh"},
+				.data = gc->createCubeMesh(nbl::core::vector3df(1.5, 1.5, 1.5)),
+				.material = Material{
+					.ambient = {0.1, 0.2, 0.1},
+					.diffuse = {0.2, 0.8, 0.2},
+					.specular = {0.8, 0.8, 0.8},
+					.shininess = 1.0f,
+					.alpha = 0.2,
+				},
+				.transform = getTranslationMatrix(5.0f, 1.0f, 0),
+			},
+		};
+
+		struct CPUTriBufferBindings
+		{
+			nbl::asset::SBufferBinding<ICPUBuffer> vertex, index;
+		};
+		std::array<CPUTriBufferBindings, std::size(cpuObjects)> cpuTriBuffers;
+
+		for (uint32_t i = 0; i < cpuObjects.size(); i++)
+		{
+			const auto& cpuObject = cpuObjects[i];
+
+			auto vBuffer = smart_refctd_ptr(cpuObject.data.bindings[0].buffer); // no offset
+			auto vUsage = bitflag(IGPUBuffer::EUF_STORAGE_BUFFER_BIT) | IGPUBuffer::EUF_TRANSFER_DST_BIT | IGPUBuffer::EUF_INLINE_UPDATE_VIA_CMDBUF |
+				IGPUBuffer::EUF_ACCELERATION_STRUCTURE_BUILD_INPUT_READ_ONLY_BIT | IGPUBuffer::EUF_SHADER_DEVICE_ADDRESS_BIT;
+			vBuffer->addUsageFlags(vUsage);
+			vBuffer->setContentHash(vBuffer->computeContentHash());
+
+			auto iBuffer = smart_refctd_ptr(cpuObject.data.indexBuffer.buffer); // no offset
+			auto iUsage = bitflag(IGPUBuffer::EUF_STORAGE_BUFFER_BIT) | IGPUBuffer::EUF_TRANSFER_DST_BIT | IGPUBuffer::EUF_INLINE_UPDATE_VIA_CMDBUF |
+				IGPUBuffer::EUF_ACCELERATION_STRUCTURE_BUILD_INPUT_READ_ONLY_BIT | IGPUBuffer::EUF_SHADER_DEVICE_ADDRESS_BIT;
+
+			if (cpuObject.data.indexType != EIT_UNKNOWN)
+				if (iBuffer)
+				{
+					iBuffer->addUsageFlags(iUsage);
+					iBuffer->setContentHash(iBuffer->computeContentHash());
+				}
+
+			cpuTriBuffers[i] = {
+			  .vertex = {.offset = 0, .buffer = vBuffer},
+			  .index = {.offset = 0, .buffer = iBuffer},
+			};
+
+		}
+
+		// procedural geometries
+		using Aabb = IGPUBottomLevelAccelerationStructure::AABB_t;
+
+		smart_refctd_ptr<ICPUBuffer> cpuProcBuffer;
+		{
+			ICPUBuffer::SCreationParams params;
+			params.size = NumberOfProceduralGeometries * sizeof(Aabb);
+			cpuProcBuffer = ICPUBuffer::create(std::move(params));
+		}
+
+		core::vector<SProceduralGeomInfo> proceduralGeoms;
+		proceduralGeoms.reserve(NumberOfProceduralGeometries);
+		auto proceduralGeometries = reinterpret_cast<Aabb*>(cpuProcBuffer->getPointer());
+		for (int32_t i = 0; i < NumberOfProceduralGeometries; i++)
+		{
+			const auto middle_i = NumberOfProceduralGeometries / 2.0;
+			SProceduralGeomInfo sphere = {
+					.material = hlsl::_static_cast<MaterialPacked>(Material{
+					.ambient = {0.1, 0.05 * i, 0.1},
+					.diffuse = {0.3, 0.2 * i, 0.3},
+					.specular = {0.8, 0.8, 0.8},
+					.shininess = 1.0f,
+				}),
+				.center = float32_t3((i - middle_i) * 4.0, 2, 5.0),
+				.radius = 1,
+			};
+
+			proceduralGeoms.push_back(sphere);
+			const auto sphereMin = sphere.center - sphere.radius;
+			const auto sphereMax = sphere.center + sphere.radius;
+			proceduralGeometries[i] = {
+				vector3d(sphereMin.x, sphereMin.y, sphereMin.z),
+				vector3d(sphereMax.x, sphereMax.y, sphereMax.z)
+			};
+		}
+
+		{
+			IGPUBuffer::SCreationParams params;
+			params.usage = IGPUBuffer::EUF_STORAGE_BUFFER_BIT | IGPUBuffer::EUF_TRANSFER_DST_BIT | IGPUBuffer::EUF_INLINE_UPDATE_VIA_CMDBUF | IGPUBuffer::EUF_SHADER_DEVICE_ADDRESS_BIT;
+			params.size = proceduralGeoms.size() * sizeof(SProceduralGeomInfo);
+			m_utils->createFilledDeviceLocalBufferOnDedMem(SIntendedSubmitInfo{ .queue = queue }, std::move(params), proceduralGeoms.data()).move_into(m_proceduralGeomInfoBuffer);
+		}
+
+		// get ICPUBuffers into ICPUBLAS
+		// TODO use one BLAS and multiple triangles/aabbs in one
+		const auto blasCount = std::size(cpuObjects) + 1;
+		const auto proceduralBlasIdx = std::size(cpuObjects);
+
+		std::array<smart_refctd_ptr<ICPUBottomLevelAccelerationStructure>, std::size(cpuObjects)+1u> cpuBlas;
+		for (uint32_t i = 0; i < blasCount; i++)
+		{
+			auto& blas = cpuBlas[i];
+			blas = make_smart_refctd_ptr<ICPUBottomLevelAccelerationStructure>();
+
+			if (i == proceduralBlasIdx)
+			{
+				auto aabbs = make_refctd_dynamic_array<smart_refctd_dynamic_array<ICPUBottomLevelAccelerationStructure::AABBs<ICPUBuffer>>>(1u);
+				auto primitiveCounts = make_refctd_dynamic_array<smart_refctd_dynamic_array<uint32_t>>(1u);
+
+				auto& aabb = aabbs->front();
+				auto& primCount = primitiveCounts->front();
+				
+				primCount = NumberOfProceduralGeometries;
+				aabb.data = { .offset = 0, .buffer = cpuProcBuffer };
+				aabb.stride = sizeof(IGPUBottomLevelAccelerationStructure::AABB_t);
+				aabb.geometryFlags = IGPUBottomLevelAccelerationStructure::GEOMETRY_FLAGS::OPAQUE_BIT; // only allow opaque for now
+
+				blas->setGeometries(std::move(aabbs), std::move(primitiveCounts));
+			}
+			else
+			{
+				auto triangles = make_refctd_dynamic_array<smart_refctd_dynamic_array<ICPUBottomLevelAccelerationStructure::Triangles<ICPUBuffer>>>(1u);
+				auto primitiveCounts = make_refctd_dynamic_array<smart_refctd_dynamic_array<uint32_t>>(1u);
+
+				auto& tri = triangles->front();
+				auto& primCount = primitiveCounts->front();
+				const auto& geom = cpuObjects[i];
+				const auto& cpuBuf = cpuTriBuffers[i];
+
+				const bool useIndex = geom.data.indexType != EIT_UNKNOWN;
+				const uint32_t vertexStride = geom.data.inputParams.bindings[0].stride;
+				const uint32_t numVertices = cpuBuf.vertex.buffer->getSize() / vertexStride;
+
+				if (useIndex)
+					primCount = geom.data.indexCount / 3;
+				else
+					primCount = numVertices / 3;
+
+				tri.vertexData[0] = cpuBuf.vertex;
+				tri.indexData = useIndex ? cpuBuf.index : cpuBuf.vertex;
+				tri.maxVertex = numVertices - 1;
+				tri.vertexStride = vertexStride;
+				tri.vertexFormat = EF_R32G32B32_SFLOAT;
+				tri.indexType = geom.data.indexType;
+				tri.geometryFlags = geom.material.isTransparent() ?
+					IGPUBottomLevelAccelerationStructure::GEOMETRY_FLAGS::NO_DUPLICATE_ANY_HIT_INVOCATION_BIT :
+					IGPUBottomLevelAccelerationStructure::GEOMETRY_FLAGS::OPAQUE_BIT;
+
+				blas->setGeometries(std::move(triangles), std::move(primitiveCounts));
+			}
+
+			auto blasFlags = bitflag(IGPUBottomLevelAccelerationStructure::BUILD_FLAGS::PREFER_FAST_TRACE_BIT) | IGPUBottomLevelAccelerationStructure::BUILD_FLAGS::ALLOW_COMPACTION_BIT;
+			if (i == proceduralBlasIdx)
+				blasFlags |= IGPUBottomLevelAccelerationStructure::BUILD_FLAGS::GEOMETRY_TYPE_IS_AABB_BIT;
+
+			blas->setBuildFlags(blasFlags);
+			blas->setContentHash(blas->computeContentHash());
+		}
+
+		auto geomInfoBuffer = ICPUBuffer::create({ std::size(cpuObjects) * sizeof(STriangleGeomInfo) });
+		STriangleGeomInfo* geomInfos = reinterpret_cast<STriangleGeomInfo*>(geomInfoBuffer->getPointer());
+
+		// get ICPUBLAS into ICPUTLAS
+		auto geomInstances = make_refctd_dynamic_array<smart_refctd_dynamic_array<ICPUTopLevelAccelerationStructure::PolymorphicInstance>>(blasCount);
+		{
+			uint32_t i = 0;
+			for (auto instance = geomInstances->begin(); instance != geomInstances->end(); instance++, i++)
+			{
+				const auto isProceduralInstance = i == proceduralBlasIdx;
+				ICPUTopLevelAccelerationStructure::StaticInstance inst;
+				inst.base.blas = cpuBlas[i];
+				inst.base.flags = static_cast<uint32_t>(IGPUTopLevelAccelerationStructure::INSTANCE_FLAGS::TRIANGLE_FACING_CULL_DISABLE_BIT);
+				inst.base.instanceCustomIndex = i;
+				inst.base.instanceShaderBindingTableRecordOffset = isProceduralInstance ? 2 : 0;;
+				inst.base.mask = 0xFF;
+				inst.transform = isProceduralInstance ? matrix3x4SIMD() : cpuObjects[i].transform;
+
+				instance->instance = inst;
+			}
+		}
+
+		auto cpuTlas = make_smart_refctd_ptr<ICPUTopLevelAccelerationStructure>();
+		cpuTlas->setInstances(std::move(geomInstances));
+		cpuTlas->setBuildFlags(IGPUTopLevelAccelerationStructure::BUILD_FLAGS::PREFER_FAST_TRACE_BIT);
+
+		// convert with asset converter
+		smart_refctd_ptr<CAssetConverter> converter = CAssetConverter::create({ .device = m_device.get(), .optimizer = {} });
+		struct MyInputs : CAssetConverter::SInputs
+		{
+			// For the GPU Buffers to be directly writeable and so that we don't need a Transfer Queue submit at all
+			inline uint32_t constrainMemoryTypeBits(const size_t groupCopyID, const IAsset* canonicalAsset, const blake3_hash_t& contentHash, const IDeviceMemoryBacked* memoryBacked) const override
+			{
+				assert(memoryBacked);
+				return memoryBacked->getObjectType() != IDeviceMemoryBacked::EOT_BUFFER ? (~0u) : rebarMemoryTypes;
+			}
+
+			uint32_t rebarMemoryTypes;
+		} inputs = {};
+		inputs.logger = m_logger.get();
+		inputs.rebarMemoryTypes = m_physicalDevice->getDirectVRAMAccessMemoryTypeBits();
+		// the allocator needs to be overriden to hand out memory ranges which have already been mapped so that the ReBAR fast-path can kick in
+		// (multiple buffers can be bound to same memory, but memory can only be mapped once at one place, so Asset Converter can't do it)
+		struct MyAllocator final : public IDeviceMemoryAllocator
+		{
+			ILogicalDevice* getDeviceForAllocations() const override { return device; }
+
+			SAllocation allocate(const SAllocateInfo& info) override
+			{
+				auto retval = device->allocate(info);
+				// map what is mappable by default so ReBAR checks succeed
+				if (retval.isValid() && retval.memory->isMappable())
+					retval.memory->map({ .offset = 0,.length = info.size });
+				return retval;
+			}
+
+			ILogicalDevice* device;
+		} myalloc;
+		myalloc.device = m_device.get();
+		inputs.allocator = &myalloc;
+
+		std::array<ICPUTopLevelAccelerationStructure*, 1u> tmpTlas;
+		std::array<ICPUBuffer*, 2 * std::size(cpuObjects) + 1u> tmpBuffers;
+		{
+			tmpTlas[0] = cpuTlas.get();
+			for (uint32_t i = 0; i < cpuObjects.size(); i++)
+			{
+				tmpBuffers[2 * i + 0] = cpuTriBuffers[i].vertex.buffer.get();
+				tmpBuffers[2 * i + 1] = cpuTriBuffers[i].index.buffer.get();
+			}
+			tmpBuffers[2 * proceduralBlasIdx] = cpuProcBuffer.get();
+
+			std::get<CAssetConverter::SInputs::asset_span_t<ICPUTopLevelAccelerationStructure>>(inputs.assets) = tmpTlas;
+			std::get<CAssetConverter::SInputs::asset_span_t<ICPUBuffer>>(inputs.assets) = tmpBuffers;
+		}
+
+		auto reservation = converter->reserve(inputs);
+		{
+			auto prepass = [&]<typename asset_type_t>(const auto & references) -> bool
+			{
+				auto objects = reservation.getGPUObjects<asset_type_t>();
+				uint32_t counter = {};
+				for (auto& object : objects)
+				{
+					auto gpu = object.value;
+					auto* reference = references[counter];
+
+					if (reference)
+					{
+						if (!gpu)
+						{
+							m_logger->log("Failed to convert a CPU object to GPU!", ILogger::ELL_ERROR);
+							return false;
+						}
+					}
+					counter++;
+				}
+				return true;
+			};
+
+			prepass.template operator() < ICPUTopLevelAccelerationStructure > (tmpTlas);
+			prepass.template operator() < ICPUBuffer > (tmpBuffers);
+		}
+
+		constexpr auto CompBufferCount = 2;
+		std::array<smart_refctd_ptr<IGPUCommandBuffer>, CompBufferCount> compBufs = {};
+		std::array<IQueue::SSubmitInfo::SCommandBufferInfo, CompBufferCount> compBufInfos = {};
+		{
+			auto pool = m_device->createCommandPool(queue->getFamilyIndex(), IGPUCommandPool::CREATE_FLAGS::RESET_COMMAND_BUFFER_BIT | IGPUCommandPool::CREATE_FLAGS::TRANSIENT_BIT);
+			pool->createCommandBuffers(IGPUCommandPool::BUFFER_LEVEL::PRIMARY, compBufs);
+			compBufs.front()->begin(IGPUCommandBuffer::USAGE::ONE_TIME_SUBMIT_BIT);
+			for (auto i = 0; i < CompBufferCount; i++)
+				compBufInfos[i].cmdbuf = compBufs[i].get();
+		}
+		auto compSema = m_device->createSemaphore(0u);
+		SIntendedSubmitInfo compute = {};
+		compute.queue = queue;
+		compute.scratchCommandBuffers = compBufInfos;
+		compute.scratchSemaphore = {
+			.semaphore = compSema.get(),
+			.value = 0u,
+			.stageMask = PIPELINE_STAGE_FLAGS::ACCELERATION_STRUCTURE_BUILD_BIT | PIPELINE_STAGE_FLAGS::ACCELERATION_STRUCTURE_COPY_BIT
+		};
+		// convert
+		{
+			smart_refctd_ptr<CAssetConverter::SConvertParams::scratch_for_device_AS_build_t> scratchAlloc;
+			{
+				constexpr auto MaxAlignment = 256;
+				constexpr auto MinAllocationSize = 1024;
+				const auto scratchSize = core::alignUp(reservation.getMaxASBuildScratchSize(false), MaxAlignment);
+
+
+				IGPUBuffer::SCreationParams creationParams = {};
+				creationParams.size = scratchSize;
+				creationParams.usage = IGPUBuffer::EUF_ACCELERATION_STRUCTURE_BUILD_INPUT_READ_ONLY_BIT | IGPUBuffer::EUF_SHADER_DEVICE_ADDRESS_BIT | IGPUBuffer::EUF_STORAGE_BUFFER_BIT;
+				auto scratchBuffer = m_device->createBuffer(std::move(creationParams));
+
+				auto reqs = scratchBuffer->getMemoryReqs();
+				reqs.memoryTypeBits &= m_physicalDevice->getDirectVRAMAccessMemoryTypeBits();
+
+				auto allocation = m_device->allocate(reqs, scratchBuffer.get(), IDeviceMemoryAllocation::EMAF_DEVICE_ADDRESS_BIT);
+				allocation.memory->map({ .offset = 0,.length = reqs.size });
+
+				scratchAlloc = make_smart_refctd_ptr<CAssetConverter::SConvertParams::scratch_for_device_AS_build_t>(
+					SBufferRange<video::IGPUBuffer>{0ull, scratchSize, std::move(scratchBuffer)},
+					core::allocator<uint8_t>(), MaxAlignment, MinAllocationSize
+				);
+			}
+
+			struct MyParams final : CAssetConverter::SConvertParams
+			{
+				inline uint32_t getFinalOwnerQueueFamily(const IGPUBuffer* buffer, const core::blake3_hash_t& createdFrom) override
+				{
+					return finalUser;
+				}
+				inline uint32_t getFinalOwnerQueueFamily(const IGPUAccelerationStructure* image, const core::blake3_hash_t& createdFrom) override
+				{
+					return finalUser;
+				}
+
+				uint8_t finalUser;
+			} params = {};
+			params.utilities = m_utils.get();
+			params.compute = &compute;
+			params.scratchForDeviceASBuild = scratchAlloc.get();
+			params.finalUser = queue->getFamilyIndex();
+
+			auto future = reservation.convert(params);
+			if (future.copy() != IQueue::RESULT::SUCCESS)
+			{
+				m_logger->log("Failed to await submission feature!", ILogger::ELL_ERROR);
+				return false;
+			}
+			// 2 submits, BLAS build, TLAS build, DO NOT ADD COMPACTIONS IN THIS EXAMPLE!
+			if (compute.getFutureScratchSemaphore().value>3)
+				m_logger->log("Overflow submitted on Compute Queue despite using ReBAR (no transfer submits or usage of staging buffer) and providing a AS Build Scratch Buffer of correctly queried max size!",system::ILogger::ELL_ERROR);
+
+			// assign gpu objects to output
+			auto&& tlases = reservation.getGPUObjects<ICPUTopLevelAccelerationStructure>();
+			m_gpuTlas = tlases[0].value;
+			auto&& buffers = reservation.getGPUObjects<ICPUBuffer>();
+			for (uint32_t i = 0; i < cpuObjects.size(); i++)
+			{
+				auto& cpuObject = cpuObjects[i];
+
+				m_gpuTriangleGeometries.push_back(ReferenceObjectGpu{
+				  .meta = cpuObject.meta,
+				  .bindings = {
+					.vertex = {.offset = 0, .buffer = buffers[2 * i + 0].value },
+					.index = {.offset = 0, .buffer = buffers[2 * i + 1].value },
+				  },
+				  .vertexStride = cpuObject.data.inputParams.bindings[0].stride,
+				  .indexType = cpuObject.data.indexType,
+				  .indexCount = cpuObject.data.indexCount,
+				  .material = hlsl::_static_cast<MaterialPacked>(cpuObject.material),
+				  .transform = cpuObject.transform,
+					});
+			}
+			m_proceduralAabbBuffer = buffers[2 * proceduralBlasIdx].value;
+
+			for (uint32_t i = 0; i < m_gpuTriangleGeometries.size(); i++)
+			{
+				const auto& gpuObject = m_gpuTriangleGeometries[i];
+				const uint64_t vertexBufferAddress = gpuObject.bindings.vertex.buffer->getDeviceAddress();
+				geomInfos[i] = {
+				  .material = gpuObject.material,
+				  .vertexBufferAddress = vertexBufferAddress,
+				  .indexBufferAddress = gpuObject.useIndex() ? gpuObject.bindings.index.buffer->getDeviceAddress() : vertexBufferAddress,
+				  .vertexStride = gpuObject.vertexStride,
+				  .objType = gpuObject.meta.type,
+				  .indexType = gpuObject.indexType,
+				  .smoothNormals = s_smoothNormals[gpuObject.meta.type],
+				};
+			}
+		}
+
+		{
+			IGPUBuffer::SCreationParams params;
+			params.usage = IGPUBuffer::EUF_STORAGE_BUFFER_BIT | IGPUBuffer::EUF_TRANSFER_DST_BIT | IGPUBuffer::EUF_INLINE_UPDATE_VIA_CMDBUF | IGPUBuffer::EUF_SHADER_DEVICE_ADDRESS_BIT;
+			params.size = geomInfoBuffer->getSize();
+			m_utils->createFilledDeviceLocalBufferOnDedMem(SIntendedSubmitInfo{ .queue = queue }, std::move(params), geomInfos).move_into(m_triangleGeomInfoBuffer);
+		}
+
+		return true;
+	}
+
+
+
+	smart_refctd_ptr<IWindow> m_window;
+	smart_refctd_ptr<CSimpleResizeSurface<ISimpleManagedSurface::ISwapchainResources>> m_surface;
+	smart_refctd_ptr<ISemaphore> m_semaphore;
+	uint64_t m_realFrameIx = 0;
+	uint32_t m_frameAccumulationCounter = 0;
+	std::array<smart_refctd_ptr<IGPUCommandBuffer>, MaxFramesInFlight> m_cmdBufs;
+	ISimpleManagedSurface::SAcquireResult m_currentImageAcquire = {};
+
+	core::smart_refctd_ptr<InputSystem> m_inputSystem;
+	InputSystem::ChannelReader<IMouseEventChannel> m_mouse;
+	InputSystem::ChannelReader<IKeyboardEventChannel> m_keyboard;
+
+	struct CameraSetting
+	{
+		float fov = 60.f;
+		float zNear = 0.1f;
+		float zFar = 10000.f;
+		float moveSpeed = 1.f;
+		float rotateSpeed = 1.f;
+		float viewWidth = 10.f;
+		float camYAngle = 165.f / 180.f * 3.14159f;
+		float camXAngle = 32.f / 180.f * 3.14159f;
+
+	} m_cameraSetting;
+	Camera m_camera = Camera(core::vectorSIMDf(0, 0, 0), core::vectorSIMDf(0, 0, 0), core::matrix4SIMD());
+
+	Light m_light = {
+	  .direction = {-1.0f, -1.0f, -0.4f},
+	  .position = {10.0f, 15.0f, 8.0f},
+	  .outerCutoff = 0.866025404f, // {cos(radians(30.0f))}, 
+	  .type = ELT_DIRECTIONAL
+	};
+
+	video::CDumbPresentationOracle m_oracle;
+
+	struct C_UI
+	{
+		nbl::core::smart_refctd_ptr<nbl::ext::imgui::UI> manager;
+
+		struct
+		{
+			core::smart_refctd_ptr<video::IGPUSampler> gui, scene;
+		} samplers;
+
+		core::smart_refctd_ptr<IGPUDescriptorSet> descriptorSet;
+	} m_ui;
+	core::smart_refctd_ptr<IDescriptorPool> m_guiDescriptorSetPool;
+
+	core::vector<ReferenceObjectGpu> m_gpuTriangleGeometries;
+	core::vector<SProceduralGeomInfo> m_gpuIntersectionSpheres;
+	uint32_t m_intersectionHitGroupIdx;
+
+	smart_refctd_ptr<IGPUTopLevelAccelerationStructure> m_gpuTlas;
+	smart_refctd_ptr<IGPUBuffer> m_instanceBuffer;
+
+	smart_refctd_ptr<IGPUBuffer> m_triangleGeomInfoBuffer;
+	smart_refctd_ptr<IGPUBuffer> m_proceduralGeomInfoBuffer;
+	smart_refctd_ptr<IGPUBuffer> m_proceduralAabbBuffer;
+	smart_refctd_ptr<IGPUBuffer> m_indirectBuffer;
+
+	smart_refctd_ptr<IGPUImage> m_hdrImage;
+	smart_refctd_ptr<IGPUImageView> m_hdrImageView;
+
+	smart_refctd_ptr<IDescriptorPool> m_rayTracingDsPool;
+	smart_refctd_ptr<IGPUDescriptorSet> m_rayTracingDs;
+	smart_refctd_ptr<IGPURayTracingPipeline> m_rayTracingPipeline;
+	uint64_t m_rayTracingStackSize;
+	ShaderBindingTable m_shaderBindingTable;
+
+	smart_refctd_ptr<IGPUDescriptorSet> m_presentDs;
+	smart_refctd_ptr<IDescriptorPool> m_presentDsPool;
+	smart_refctd_ptr<IGPUGraphicsPipeline> m_presentPipeline;
+
+	smart_refctd_ptr<CAssetConverter> m_converter;
+
+
+	core::matrix4SIMD m_cachedModelViewProjectionMatrix;
+	bool m_useIndirectCommand = false;
 
 };
-NBL_MAIN_FUNC(RaytracingPipelineApp)
+NBL_MAIN_FUNC(RaytracingPipelineApp)
\ No newline at end of file

From 04e32adc077f87f2fe854e9cf03172ed7da7a35e Mon Sep 17 00:00:00 2001
From: kevyuu <kevin.kayu@gmail.com>
Date: Sat, 14 Jun 2025 09:47:52 +0700
Subject: [PATCH 360/529] Fix example 64,67,70,71

---
 64_EmulatedFloatTest/main.cpp  | 20 ++++-----
 67_RayQueryGeometry/main.cpp   |  6 +--
 70_FLIPFluids/main.cpp         | 11 +----
 71_RayTracingPipeline/main.cpp | 78 ++++++++++------------------------
 4 files changed, 36 insertions(+), 79 deletions(-)

diff --git a/64_EmulatedFloatTest/main.cpp b/64_EmulatedFloatTest/main.cpp
index a9ff5fde6..b44cb2b4e 100644
--- a/64_EmulatedFloatTest/main.cpp
+++ b/64_EmulatedFloatTest/main.cpp
@@ -255,7 +255,7 @@ class CompatibilityTest final : public MonoDeviceApplication, public MonoAssetMa
 
             // Load shaders, set up pipeline
             {
-                smart_refctd_ptr<IGPUShader> shader;
+                smart_refctd_ptr<IShader> shader;
                 {
                     IAssetLoader::SAssetLoadParams lp = {};
                     lp.logger = base.m_logger.get();
@@ -271,12 +271,12 @@ class CompatibilityTest final : public MonoDeviceApplication, public MonoAssetMa
 
                     // It would be super weird if loading a shader from a file produced more than 1 asset
                     assert(assets.size() == 1);
-                    smart_refctd_ptr<ICPUShader> source = IAsset::castDown<ICPUShader>(assets[0]);
+                    smart_refctd_ptr<IShader> source = IAsset::castDown<IShader>(assets[0]);
 
                     auto* compilerSet = base.m_assetMgr->getCompilerSet();
 
                     nbl::asset::IShaderCompiler::SCompilerOptions options = {};
-                    options.stage = source->getStage();
+                    options.stage = ESS_COMPUTE;
                     options.targetSpirvVersion = base.m_device->getPhysicalDevice()->getLimits().spirvVersion;
                     options.spirvOptimizer = nullptr;
                     options.debugInfoFlags |= IShaderCompiler::E_DEBUG_INFO_FLAGS::EDIF_SOURCE_BIT;
@@ -286,9 +286,7 @@ class CompatibilityTest final : public MonoDeviceApplication, public MonoAssetMa
 
                     auto spirv = compilerSet->compileToSPIRV(source.get(), options);
 
-                    ILogicalDevice::SShaderCreationParameters params{};
-                    params.cpushader = spirv.get();
-                    shader = base.m_device->createShader(params);
+                    shader = base.m_device->compileShader({spirv.get()});
                 }
 
                 if (!shader)
@@ -923,7 +921,7 @@ class CompatibilityTest final : public MonoDeviceApplication, public MonoAssetMa
 
             // Load shaders, set up pipeline
             {
-                smart_refctd_ptr<IGPUShader> shader;
+                smart_refctd_ptr<IShader> shader;
                 {
                     IAssetLoader::SAssetLoadParams lp = {};
                     lp.logger = base.m_logger.get();
@@ -939,12 +937,12 @@ class CompatibilityTest final : public MonoDeviceApplication, public MonoAssetMa
 
                     // It would be super weird if loading a shader from a file produced more than 1 asset
                     assert(assets.size() == 1);
-                    smart_refctd_ptr<ICPUShader> source = IAsset::castDown<ICPUShader>(assets[0]);
+                    smart_refctd_ptr<IShader> source = IAsset::castDown<IShader>(assets[0]);
 
                     auto* compilerSet = base.m_assetMgr->getCompilerSet();
 
                     IShaderCompiler::SCompilerOptions options = {};
-                    options.stage = source->getStage();
+                    options.stage = ESS_COMPUTE;
                     options.targetSpirvVersion = base.m_device->getPhysicalDevice()->getLimits().spirvVersion;
                     options.spirvOptimizer = nullptr;
                     options.debugInfoFlags |= IShaderCompiler::E_DEBUG_INFO_FLAGS::EDIF_SOURCE_BIT;
@@ -954,9 +952,7 @@ class CompatibilityTest final : public MonoDeviceApplication, public MonoAssetMa
 
                     auto spirv = compilerSet->compileToSPIRV(source.get(), options);
 
-                    ILogicalDevice::SShaderCreationParameters params{};
-                    params.cpushader = spirv.get();
-                    shader = base.m_device->createShader(params);
+                    shader = base.m_device->compileShader({spirv.get()});
                 }
 
                 if (!shader)
diff --git a/67_RayQueryGeometry/main.cpp b/67_RayQueryGeometry/main.cpp
index 7371cf1ea..fdee5c5a1 100644
--- a/67_RayQueryGeometry/main.cpp
+++ b/67_RayQueryGeometry/main.cpp
@@ -161,9 +161,8 @@ class RayQueryGeometryApp final : public examples::SimpleWindowedApplication, pu
 
 				const auto assets = bundle.getContents();
 				assert(assets.size() == 1);
-				smart_refctd_ptr<ICPUShader> shaderSrc = IAsset::castDown<ICPUShader>(assets[0]);
-				shaderSrc->setShaderStage(IShader::E_SHADER_STAGE::ESS_COMPUTE);
-				auto shader = m_device->createShader(shaderSrc.get());
+				smart_refctd_ptr<IShader> shaderSrc = IAsset::castDown<IShader>(assets[0]);
+				auto shader = m_device->compileShader({shaderSrc.get()});
 				if (!shader)
 					return logFail("Failed to create shader!");
 
@@ -173,6 +172,7 @@ class RayQueryGeometryApp final : public examples::SimpleWindowedApplication, pu
 				IGPUComputePipeline::SCreationParams params = {};
 				params.layout = pipelineLayout.get();
 				params.shader.shader = shader.get();
+				params.shader.entryPoint = "main";
 				if (!m_device->createComputePipelines(nullptr, { &params, 1 }, &renderPipeline))
 					return logFail("Failed to create compute pipeline");
 			}
diff --git a/70_FLIPFluids/main.cpp b/70_FLIPFluids/main.cpp
index a0d2ad95d..c0f68ca49 100644
--- a/70_FLIPFluids/main.cpp
+++ b/70_FLIPFluids/main.cpp
@@ -374,7 +374,6 @@ class FLIPFluidsApp final : public examples::SimpleWindowedApplication, public a
                 params.layout = pipelineLayout.get();
                 params.shader.entryPoint = entryPoint;
                 params.shader.shader = shader.get();
-                params.shader.stage = ESS_COMPUTE;
                 
                 m_device->createComputePipelines(nullptr, { &params,1 }, &pipeline);
             };
@@ -631,7 +630,6 @@ class FLIPFluidsApp final : public examples::SimpleWindowedApplication, public a
                 params.layout = pipelineLayout.get();
                 params.shader.entryPoint = iterateKernel;
                 params.shader.shader = iterateShader.get();
-                params.shader.stage = ESS_COMPUTE;
 
                 m_device->createComputePipelines(nullptr, { &params,1 }, &m_iterateDiffusionPipeline);
             }
@@ -640,7 +638,6 @@ class FLIPFluidsApp final : public examples::SimpleWindowedApplication, public a
                 params.layout = pipelineLayout.get();
                 params.shader.entryPoint = applyKernel;
                 params.shader.shader = applyShader.get();
-                params.shader.stage = ESS_COMPUTE;
 
                 m_device->createComputePipelines(nullptr, { &params,1 }, &m_diffusionPipeline);
             }
@@ -1635,11 +1632,6 @@ class FLIPFluidsApp final : public examples::SimpleWindowedApplication, public a
         blendParams.blendParams[0u].colorWriteMask = (1u << 0u) | (1u << 1u) | (1u << 2u) | (1u << 3u);
 
         {
-            IPipelineBase::SShaderSpecInfo specInfo[] = {
-                {.shader = vs.get(), .entryPoint = "main", .stage = ESS_VERTEX, },
-                {.shader = fs.get(), .entryPoint = "main", .stage = ESS_FRAGMENT, },
-            };
-
             const asset::SPushConstantRange pcRange = { .stageFlags = IShader::E_SHADER_STAGE::ESS_VERTEX, .offset = 0, .size = sizeof(uint64_t) };
             const auto pipelineLayout = m_device->createPipelineLayout({ &pcRange , 1 }, nullptr, smart_refctd_ptr(descriptorSetLayout1), nullptr, nullptr);
 
@@ -1649,7 +1641,8 @@ class FLIPFluidsApp final : public examples::SimpleWindowedApplication, public a
 
             IGPUGraphicsPipeline::SCreationParams params[1] = {};
             params[0].layout = pipelineLayout.get();
-            params[0].shaders = specInfo;
+            params[0].vertexShader = { .shader = vs.get(), .entryPoint = "main", };
+            params[0].fragmentShader = { .shader = fs.get(), .entryPoint = "main", };
             params[0].cached = {
                 .vertexInput = {
                 },
diff --git a/71_RayTracingPipeline/main.cpp b/71_RayTracingPipeline/main.cpp
index 42aaa2233..0642220ba 100644
--- a/71_RayTracingPipeline/main.cpp
+++ b/71_RayTracingPipeline/main.cpp
@@ -136,7 +136,7 @@ class RaytracingPipelineApp final : public examples::SimpleWindowedApplication,
 		}
 
 		// Load Custom Shader
-		auto loadCompileAndCreateShader = [&](const std::string& relPath) -> smart_refctd_ptr<IGPUShader>
+		auto loadCompileAndCreateShader = [&](const std::string& relPath) -> smart_refctd_ptr<IShader>
 			{
 				IAssetLoader::SAssetLoadParams lp = {};
 				lp.logger = m_logger.get();
@@ -147,11 +147,11 @@ class RaytracingPipelineApp final : public examples::SimpleWindowedApplication,
 					return nullptr;
 
 				// lets go straight from ICPUSpecializedShader to IGPUSpecializedShader
-				auto sourceRaw = IAsset::castDown<ICPUShader>(assets[0]);
+				auto sourceRaw = IAsset::castDown<IShader>(assets[0]);
 				if (!sourceRaw)
 					return nullptr;
 
-				return m_device->createShader({ sourceRaw.get(), nullptr, shaderReadCache.get(), shaderWriteCache.get() });
+				return m_device->compileShader({ sourceRaw.get(), nullptr, shaderReadCache.get(), shaderWriteCache.get() });
 			};
 
 		// load shaders
@@ -317,38 +317,7 @@ class RaytracingPipelineApp final : public examples::SimpleWindowedApplication,
 			const auto pipelineLayout = m_device->createPipelineLayout({ &pcRange, 1 }, smart_refctd_ptr(descriptorSetLayout), nullptr, nullptr, nullptr);
 
 			IGPURayTracingPipeline::SCreationParams params = {};
-
-			enum RtDemoShader
-			{
-				RTDS_RAYGEN,
-				RTDS_MISS,
-				RTDS_MISS_SHADOW,
-				RTDS_CLOSEST_HIT,
-				RTDS_SPHERE_CLOSEST_HIT,
-				RTDS_ANYHIT_PRIMARY,
-				RTDS_ANYHIT_SHADOW,
-				RTDS_INTERSECTION,
-				RTDS_DIRECTIONAL_CALL,
-				RTDS_POINT_CALL,
-				RTDS_SPOT_CALL,
-				RTDS_COUNT
-			};
-
-			IGPUShader::SSpecInfo shaders[RTDS_COUNT];
-			shaders[RTDS_RAYGEN] = { .shader = raygenShader.get() };
-			shaders[RTDS_MISS] = { .shader = missShader.get() };
-			shaders[RTDS_MISS_SHADOW] = { .shader = missShadowShader.get() };
-			shaders[RTDS_CLOSEST_HIT] = { .shader = closestHitShader.get() };
-			shaders[RTDS_SPHERE_CLOSEST_HIT] = { .shader = proceduralClosestHitShader.get() };
-			shaders[RTDS_ANYHIT_PRIMARY] = { .shader = anyHitShaderColorPayload.get() };
-			shaders[RTDS_ANYHIT_SHADOW] = { .shader = anyHitShaderShadowPayload.get() };
-			shaders[RTDS_INTERSECTION] = { .shader = intersectionHitShader.get() };
-			shaders[RTDS_DIRECTIONAL_CALL] = { .shader = directionalLightCallShader.get() };
-			shaders[RTDS_POINT_CALL] = { .shader = pointLightCallShader.get() };
-			shaders[RTDS_SPOT_CALL] = { .shader = spotLightCallShader.get() };
-
 			params.layout = pipelineLayout.get();
-			params.shaders = std::span(shaders);
 			using RayTracingFlags = IGPURayTracingPipeline::SCreationParams::FLAGS;
 			params.flags = core::bitflag(RayTracingFlags::NO_NULL_MISS_SHADERS) |
 				RayTracingFlags::NO_NULL_INTERSECTION_SHADERS |
@@ -356,42 +325,40 @@ class RaytracingPipelineApp final : public examples::SimpleWindowedApplication,
 
 			auto& shaderGroups = params.shaderGroups;
 
-			shaderGroups.raygen = { .index = RTDS_RAYGEN };
+			shaderGroups.raygen = { .shader = raygenShader.get(), .entryPoint = "main" };
 
-			IRayTracingPipelineBase::SGeneralShaderGroup missGroups[EMT_COUNT];
-			missGroups[EMT_PRIMARY] = { .index = RTDS_MISS };
-			missGroups[EMT_OCCLUSION] = { .index = RTDS_MISS_SHADOW };
+			IGPUPipelineBase::SShaderSpecInfo missGroups[EMT_COUNT];
+			missGroups[EMT_PRIMARY] = { .shader = missShader.get(), .entryPoint = "main" };
+			missGroups[EMT_OCCLUSION] = { .shader = missShadowShader.get(), .entryPoint = "main" };
 			shaderGroups.misses = missGroups;
 
 			auto getHitGroupIndex = [](E_GEOM_TYPE geomType, E_RAY_TYPE rayType)
 				{
 					return geomType * ERT_COUNT + rayType;
 				};
-			IRayTracingPipelineBase::SHitShaderGroup hitGroups[E_RAY_TYPE::ERT_COUNT * E_GEOM_TYPE::EGT_COUNT];
+			IGPURayTracingPipeline::SHitGroup hitGroups[E_RAY_TYPE::ERT_COUNT * E_GEOM_TYPE::EGT_COUNT];
 			hitGroups[getHitGroupIndex(EGT_TRIANGLES, ERT_PRIMARY)] = {
-			  .closestHit = RTDS_CLOSEST_HIT,
-			  .anyHit = RTDS_ANYHIT_PRIMARY,
+				.closestHit = {.shader = closestHitShader.get(), .entryPoint = "main" },
+				.anyHit = { .shader = anyHitShaderColorPayload.get(), .entryPoint = "main" },
 			};
 			hitGroups[getHitGroupIndex(EGT_TRIANGLES, ERT_OCCLUSION)] = {
-			  .closestHit = IGPURayTracingPipeline::SGeneralShaderGroup::Unused,
-			  .anyHit = RTDS_ANYHIT_SHADOW,
+			  .anyHit = { .shader = anyHitShaderShadowPayload.get(), .entryPoint = "main" },
 			};
 			hitGroups[getHitGroupIndex(EGT_PROCEDURAL, ERT_PRIMARY)] = {
-			  .closestHit = RTDS_SPHERE_CLOSEST_HIT,
-			  .anyHit = RTDS_ANYHIT_PRIMARY,
-			  .intersection = RTDS_INTERSECTION,
+			  .closestHit = { .shader = proceduralClosestHitShader.get(), .entryPoint = "main" },
+			  .anyHit = { .shader = anyHitShaderColorPayload.get(), .entryPoint = "main" },
+			  .intersection = { .shader = intersectionHitShader.get(), .entryPoint = "main" },
 			};
 			hitGroups[getHitGroupIndex(EGT_PROCEDURAL, ERT_OCCLUSION)] = {
-			  .closestHit = IGPURayTracingPipeline::SGeneralShaderGroup::Unused,
-			  .anyHit = RTDS_ANYHIT_SHADOW,
-			  .intersection = RTDS_INTERSECTION,
+			  .anyHit = { .shader = anyHitShaderShadowPayload.get(), .entryPoint = "main" },
+			  .intersection = { .shader = intersectionHitShader.get(), .entryPoint = "main" },
 			};
 			shaderGroups.hits = hitGroups;
 
-			IRayTracingPipelineBase::SGeneralShaderGroup callableGroups[ELT_COUNT];
-			callableGroups[ELT_DIRECTIONAL] = { .index = RTDS_DIRECTIONAL_CALL };
-			callableGroups[ELT_POINT] = { .index = RTDS_POINT_CALL };
-			callableGroups[ELT_SPOT] = { .index = RTDS_SPOT_CALL };
+			IGPUPipelineBase::SShaderSpecInfo callableGroups[ELT_COUNT];
+			callableGroups[ELT_DIRECTIONAL] = { .shader = directionalLightCallShader.get(), .entryPoint = "main" };
+			callableGroups[ELT_POINT] = { .shader = pointLightCallShader.get(), .entryPoint = "main" };
+			callableGroups[ELT_SPOT] = { .shader = spotLightCallShader.get(), .entryPoint = "main" };
 			shaderGroups.callables = callableGroups;
 
 			params.cached.maxRecursionDepth = 1;
@@ -443,9 +410,9 @@ class RaytracingPipelineApp final : public examples::SimpleWindowedApplication,
 			if (!fsTriProtoPPln)
 				return logFail("Failed to create Full Screen Triangle protopipeline or load its vertex shader!");
 
-			const IGPUShader::SSpecInfo fragSpec = {
+			const IGPUPipelineBase::SShaderSpecInfo fragSpec = {
+			  .shader = fragmentShader.get(),
 			  .entryPoint = "main",
-			  .shader = fragmentShader.get()
 			};
 
 			auto presentLayout = m_device->createPipelineLayout(
@@ -1163,6 +1130,7 @@ class RaytracingPipelineApp final : public examples::SimpleWindowedApplication,
 					.diffuse = {0.2, 0.2, 0.8},
 					.specular = {0.8, 0.8, 0.8},
 					.shininess = 1.0f,
+					.alpha = 1.0f,
 				},
 				.transform = getTranslationMatrix(-5.0f, 1.0f, 0),
 			},

From 3866e2dc1da9b3ac3d1a0770c2724931abd5af61 Mon Sep 17 00:00:00 2001
From: Przemek <minikers21@gmail.com>
Date: Sat, 14 Jun 2025 13:14:58 +0200
Subject: [PATCH 361/529] Saving work

---
 62_CAD/DrawResourcesFiller.cpp                |   6 -
 62_CAD/main.cpp                               |   2 +-
 62_CAD/shaders/globals.hlsl                   |  14 +-
 62_CAD/shaders/main_pipeline/common.hlsl      |   2 -
 62_CAD/shaders/main_pipeline/dtm.hlsl         |  33 ++++
 .../main_pipeline/fragment_shader.hlsl        | 142 +++++++-----------
 .../shaders/main_pipeline/vertex_shader.hlsl  |   4 +-
 7 files changed, 101 insertions(+), 102 deletions(-)

diff --git a/62_CAD/DrawResourcesFiller.cpp b/62_CAD/DrawResourcesFiller.cpp
index ed46600e6..3935e26d3 100644
--- a/62_CAD/DrawResourcesFiller.cpp
+++ b/62_CAD/DrawResourcesFiller.cpp
@@ -772,12 +772,6 @@ void DrawResourcesFiller::drawGridDTM(
 	}
 	gridDTMInfo.thicknessOfTheThickestLine = thickestLineThickness;
 
-	if (dtmSettingsInfo.mode & E_DTM_MODE::OUTLINE)
-	{
-		const bool isOutlineStippled = dtmSettingsInfo.outlineStyleInfo.stipplePatternSize > 0;
-		gridDTMInfo.outlineStipplePatternLengthReciprocal = isOutlineStippled ? dtmSettingsInfo.outlineStyleInfo.reciprocalStipplePatternLen : 0.0f;
-	}
-
 	setActiveDTMSettings(dtmSettingsInfo);
 	beginMainObject(MainObjectType::GRID_DTM);
 
diff --git a/62_CAD/main.cpp b/62_CAD/main.cpp
index cd53d402c..49f6090e7 100644
--- a/62_CAD/main.cpp
+++ b/62_CAD/main.cpp
@@ -3580,7 +3580,7 @@ class ComputerAidedDesign final : public examples::SimpleWindowedApplication, pu
 			dtmInfo.outlineStyleInfo.worldSpaceLineWidth = 2.0f;
 			dtmInfo.outlineStyleInfo.color = float32_t4(0.0f, 0.39f, 0.0f, 1.0f);
 			std::array<double, 4> outlineStipplePattern = { 0.0f, -5.0f, 20.0f, -5.0f };
-			//dtmInfo.outlineStyleInfo.setStipplePatternData(outlineStipplePattern);
+			dtmInfo.outlineStyleInfo.setStipplePatternData(outlineStipplePattern);
 
 			dtmInfo.contourSettingsCount = 2u;
 			dtmInfo.contourSettings[0u].startHeight = 20;
diff --git a/62_CAD/shaders/globals.hlsl b/62_CAD/shaders/globals.hlsl
index cae5210b8..255c46d8a 100644
--- a/62_CAD/shaders/globals.hlsl
+++ b/62_CAD/shaders/globals.hlsl
@@ -43,6 +43,16 @@ struct PushConstants
     uint32_t isDTMRendering;
 };
 
+#ifdef __HLSL_VERSION
+NBL_CONSTEXPR float InvalidGridDTMHeightValue = asfloat(0x7FC00000);
+
+bool isInvalidGridDtmHeightValue(float value)
+{
+    return isnan(value);
+}
+
+#endif
+
 struct WorldClipRect
 {
     pfloat64_t2 minClip; // min clip of a rect in worldspace coordinates of the original space (globals.defaultProjectionToNDC)
@@ -259,8 +269,8 @@ struct GridDTMInfo
     pfloat64_t2 worldSpaceExtents; // 16 bytes (32)
     uint32_t textureID; // 4 bytes (36)
     float gridCellWidth; // 4 bytes (40)
-    float outlineStipplePatternLengthReciprocal; // 4 bytes (44)
-    float thicknessOfTheThickestLine; // 4 bytes (48)
+    float thicknessOfTheThickestLine; // 4 bytes (44)
+    float _padding; // 4 bytes (48)
 };
 
 enum E_CELL_DIAGONAL : uint32_t
diff --git a/62_CAD/shaders/main_pipeline/common.hlsl b/62_CAD/shaders/main_pipeline/common.hlsl
index 69f9a8ec8..f378c44db 100644
--- a/62_CAD/shaders/main_pipeline/common.hlsl
+++ b/62_CAD/shaders/main_pipeline/common.hlsl
@@ -236,14 +236,12 @@ struct PSInput
     float2 getGridDTMScreenSpaceTopLeft() { return data2.xy; }
     float2 getGridDTMScreenSpaceGridExtents() { return data2.zw; }
     float getGridDTMScreenSpaceCellWidth() { return data3.x; }
-    float getGridDTMOutlineStipplePatternLengthReciprocal() { return data3.y; }
     float2 getGridDTMScreenSpacePosition() { return interp_data5.zw; }
 
     void setGridDTMHeightTextureID(uint textureID) { data1.z = textureID; }
     void setGridDTMScreenSpaceTopLeft(float2 screenSpaceTopLeft) { data2.xy = screenSpaceTopLeft; }
     void setGridDTMScreenSpaceGridExtents(float2 screenSpaceGridExtends) { data2.zw = screenSpaceGridExtends; }
     void setGridDTMScreenSpaceCellWidth(float screenSpaceGridWidth) { data3.x = screenSpaceGridWidth; }
-    void setGridDTMOutlineStipplePatternLengthReciprocal(float outlineStipplePatternLength) { data3.y = outlineStipplePatternLength; }
     void setGridDTMScreenSpacePosition(float2 screenSpacePosition) { interp_data5.zw = screenSpacePosition; }
 };
 
diff --git a/62_CAD/shaders/main_pipeline/dtm.hlsl b/62_CAD/shaders/main_pipeline/dtm.hlsl
index 839b5483e..0aced1b89 100644
--- a/62_CAD/shaders/main_pipeline/dtm.hlsl
+++ b/62_CAD/shaders/main_pipeline/dtm.hlsl
@@ -417,6 +417,39 @@ float4 blendUnder(in float4 dstColor, in float4 srcColor)
 
     return dstColor;
 }
+
+E_CELL_DIAGONAL resolveGridDTMCellDiagonal(in uint32_t4 cellData)
+{
+    float4 cellHeights = asfloat(cellData);
+
+    const bool4 invalidHeights = bool4(
+        isInvalidGridDtmHeightValue(cellHeights.x),
+        isInvalidGridDtmHeightValue(cellHeights.y),
+        isInvalidGridDtmHeightValue(cellHeights.z),
+        isInvalidGridDtmHeightValue(cellHeights.w)
+    );
+
+    int invalidHeightsCount = 0;
+    for (int i = 0; i < 4; ++i)
+        invalidHeightsCount += int(invalidHeights[i]);
+
+    if (invalidHeightsCount == 0)
+    {
+        E_CELL_DIAGONAL a = getDiagonalModeFromCellCornerData(cellData.w);
+        return getDiagonalModeFromCellCornerData(cellData.w);
+    }
+
+    if (invalidHeightsCount > 1)
+        return INVALID;
+
+    if (invalidHeights.x || invalidHeights.z)
+        return TOP_LEFT_TO_BOTTOM_RIGHT;
+    else if (invalidHeights.y || invalidHeights.w)
+        return BOTTOM_LEFT_TO_TOP_RIGHT;
+
+    return INVALID;
+}
+
 }
 
 #endif
\ No newline at end of file
diff --git a/62_CAD/shaders/main_pipeline/fragment_shader.hlsl b/62_CAD/shaders/main_pipeline/fragment_shader.hlsl
index 3bb6d0dad..aca52e937 100644
--- a/62_CAD/shaders/main_pipeline/fragment_shader.hlsl
+++ b/62_CAD/shaders/main_pipeline/fragment_shader.hlsl
@@ -117,38 +117,6 @@ float32_t4 calculateFinalColor<true>(const uint2 fragCoord, const float localAlp
     return color;
 }
 
-E_CELL_DIAGONAL resolveGridDTMCellDiagonal(in uint32_t4 cellData)
-{
-    float4 cellHeights = asfloat(cellData);
-
-    const bool4 invalidHeights = bool4(
-        isnan(cellHeights.x),
-        isnan(cellHeights.y),
-        isnan(cellHeights.z),
-        isnan(cellHeights.w)
-    );
-
-    int invalidHeightsCount = 0;
-    for (int i = 0; i < 4; ++i)
-        invalidHeightsCount += int(invalidHeights[i]);
-
-    if (invalidHeightsCount == 0)
-    {
-        E_CELL_DIAGONAL a = getDiagonalModeFromCellCornerData(cellData.w);
-        return getDiagonalModeFromCellCornerData(cellData.w);
-    }
-
-    if (invalidHeightsCount > 1)
-        return INVALID;
-
-    if (invalidHeights.x || invalidHeights.z)
-        return TOP_LEFT_TO_BOTTOM_RIGHT;
-    else if (invalidHeights.y || invalidHeights.w)
-        return BOTTOM_LEFT_TO_TOP_RIGHT;
-
-    return INVALID;
-}
-
 [[vk::spvexecutionmode(spv::ExecutionModePixelInterlockOrderedEXT)]]
 [shader("pixel")]
 float4 fragMain(PSInput input) : SV_TARGET
@@ -439,10 +407,27 @@ float4 fragMain(PSInput input) : SV_TARGET
 
             DTMSettings dtmSettings = loadDTMSettings(mainObj.dtmSettingsIdx);
 
+            if (!dtmSettings.drawContourEnabled() && !dtmSettings.drawOutlineEnabled() && !dtmSettings.drawHeightShadingEnabled())
+                discard;
+
             float2 pos = input.getGridDTMScreenSpacePosition();
             float2 uv = input.getImageUV();
             const uint32_t textureId = input.getGridDTMHeightTextureID();
 
+            float2 topLeft = input.getGridDTMScreenSpaceTopLeft();
+            float2 gridExtents = input.getGridDTMScreenSpaceGridExtents();
+            const float cellWidth = input.getGridDTMScreenSpaceCellWidth();
+
+            float2 gridSpacePos = uv * gridExtents;
+            float2 cellCoords;
+            {
+                float2 gridSpacePosDivGridCellWidth = gridSpacePos / cellWidth;
+                cellCoords.x = uint32_t(gridSpacePosDivGridCellWidth.x);
+                cellCoords.y = uint32_t(gridSpacePosDivGridCellWidth.y);
+            }
+
+            float2 gridSpaceCellTopLeftCoords = cellCoords * cellWidth;
+
             // grid consists of square cells and cells are divided into two triangles:
             // depending on mode it is
             // either:        or:
@@ -453,30 +438,17 @@ float4 fragMain(PSInput input) : SV_TARGET
             // v0-------v2b   v2a-------v1
             // 
 
-            // calculate screen space coordinates of vertices of t
-            // he current tiranlge within the grid
-            float3 v[3];
-            nbl::hlsl::shapes::Line<float> outlineLineSegments[2];
-            float outlinePhaseShift;
+            // calculate screen space coordinates of vertices of the current tiranlge within the grid
+            float3 currentTriangleVertices[3];
             {
-                float2 topLeft = input.getGridDTMScreenSpaceTopLeft();
-                float2 gridExtents = input.getGridDTMScreenSpaceGridExtents();
-                float cellWidth = input.getGridDTMScreenSpaceCellWidth();
-
-                float2 gridSpacePos = uv * gridExtents;
-
-                float2 cellCoords;
-                {
-                    float2 gridSpacePosDivGridCellWidth = gridSpacePos / cellWidth;
-                    cellCoords.x = uint32_t(gridSpacePosDivGridCellWidth.x);
-                    cellCoords.y = uint32_t(gridSpacePosDivGridCellWidth.y);
-                }
-
                 float2 insideCellCoord = gridSpacePos - float2(cellWidth, cellWidth) * cellCoords; // TODO: use fmod instead?
                 
-                const float InvalidHeightValue = asfloat(0x7FC00000);
                 uint32_t4 cellData;
-                float4 cellHeights = float4(InvalidHeightValue, InvalidHeightValue, InvalidHeightValue, InvalidHeightValue);
+                // cellHeihts.x - bottom left texel
+                // cellHeihts.y - bottom right texel
+                // cellHeihts.z - top right texel
+                // cellHeihts.w - top left texel
+                float4 cellHeights = float4(InvalidGridDTMHeightValue, InvalidGridDTMHeightValue, InvalidGridDTMHeightValue, InvalidGridDTMHeightValue);
                 if (textureId != InvalidTextureIndex)
                 {
                     const float2 maxCellCoords = float2(round(gridExtents.x / cellWidth), round(gridExtents.y / cellWidth));
@@ -486,8 +458,7 @@ float4 fragMain(PSInput input) : SV_TARGET
                     cellHeights = asfloat(cellData);
                 }
 
-
-                const E_CELL_DIAGONAL cellDiagonal = resolveGridDTMCellDiagonal(cellData);
+                const E_CELL_DIAGONAL cellDiagonal = dtm::resolveGridDTMCellDiagonal(cellData);
                 const bool diagonalFromTopLeftToBottomRight = cellDiagonal == E_CELL_DIAGONAL::TOP_LEFT_TO_BOTTOM_RIGHT;
 
                 if (cellDiagonal == E_CELL_DIAGONAL::INVALID)
@@ -498,22 +469,20 @@ float4 fragMain(PSInput input) : SV_TARGET
                     insideCellCoord.x < insideCellCoord.y :
                     insideCellCoord.x < cellWidth - insideCellCoord.y;
 
-                float2 gridSpaceCellTopLeftCoords = cellCoords * cellWidth;
-
                 if (diagonalFromTopLeftToBottomRight)
                 {
-                    v[0] = float3(gridSpaceCellTopLeftCoords.x, gridSpaceCellTopLeftCoords.y, cellHeights.w);
-                    v[1] = float3(gridSpaceCellTopLeftCoords.x + cellWidth, gridSpaceCellTopLeftCoords.y + cellWidth, cellHeights.y);
-                    v[2] = triangleA ? float3(gridSpaceCellTopLeftCoords.x, gridSpaceCellTopLeftCoords.y + cellWidth, cellHeights.x) : float3(gridSpaceCellTopLeftCoords.x + cellWidth, gridSpaceCellTopLeftCoords.y, cellHeights.z);
+                    currentTriangleVertices[0] = float3(gridSpaceCellTopLeftCoords.x, gridSpaceCellTopLeftCoords.y, cellHeights.w);
+                    currentTriangleVertices[1] = float3(gridSpaceCellTopLeftCoords.x + cellWidth, gridSpaceCellTopLeftCoords.y + cellWidth, cellHeights.y);
+                    currentTriangleVertices[2] = triangleA ? float3(gridSpaceCellTopLeftCoords.x, gridSpaceCellTopLeftCoords.y + cellWidth, cellHeights.x) : float3(gridSpaceCellTopLeftCoords.x + cellWidth, gridSpaceCellTopLeftCoords.y, cellHeights.z);
                 }
                 else
                 {
-                    v[0] = float3(gridSpaceCellTopLeftCoords.x, gridSpaceCellTopLeftCoords.y + cellWidth, cellHeights.x);
-                    v[1] = float3(gridSpaceCellTopLeftCoords.x + cellWidth, gridSpaceCellTopLeftCoords.y, cellHeights.z);
-                    v[2] = triangleA ? float3(gridSpaceCellTopLeftCoords.x, gridSpaceCellTopLeftCoords.y, cellHeights.w) : float3(gridSpaceCellTopLeftCoords.x + cellWidth, gridSpaceCellTopLeftCoords.y + cellWidth, cellHeights.y);
+                    currentTriangleVertices[0] = float3(gridSpaceCellTopLeftCoords.x, gridSpaceCellTopLeftCoords.y + cellWidth, cellHeights.x);
+                    currentTriangleVertices[1] = float3(gridSpaceCellTopLeftCoords.x + cellWidth, gridSpaceCellTopLeftCoords.y, cellHeights.z);
+                    currentTriangleVertices[2] = triangleA ? float3(gridSpaceCellTopLeftCoords.x, gridSpaceCellTopLeftCoords.y, cellHeights.w) : float3(gridSpaceCellTopLeftCoords.x + cellWidth, gridSpaceCellTopLeftCoords.y + cellWidth, cellHeights.y);
                 }
 
-                bool isTriangleInvalid = isnan(v[0].z) || isnan(v[1].z) || isnan(v[2].z);
+                bool isTriangleInvalid = isnan(currentTriangleVertices[0].z) || isnan(currentTriangleVertices[1].z) || isnan(currentTriangleVertices[2].z);
                 bool isCellPartiallyInvalid = isnan(cellHeights.x) || isnan(cellHeights.y) || isnan(cellHeights.z) || isnan(cellHeights.w);
 
                 if (isTriangleInvalid)
@@ -522,36 +491,33 @@ float4 fragMain(PSInput input) : SV_TARGET
                 // move from grid space to screen space
                 [unroll]
                 for (int i = 0; i < 3; ++i)
-                    v[i].xy += topLeft;
-
-                if (triangleA)
-                {
-                    outlineLineSegments[0] = nbl::hlsl::shapes::Line<float>::construct(v[2].xy, v[0].xy);
-                    outlineLineSegments[1] = nbl::hlsl::shapes::Line<float>::construct(v[2].xy, v[1].xy);
-                }
-                else
-                {
-                    outlineLineSegments[0] = nbl::hlsl::shapes::Line<float>::construct(v[1].xy, v[2].xy);
-                    outlineLineSegments[1] = nbl::hlsl::shapes::Line<float>::construct(v[0].xy, v[2].xy);
-                }
-
-                // test diagonal draw
-                //outlineLineSegments[0] = nbl::hlsl::shapes::Line<float>::construct(v[0].xy, v[1].xy);
-                //outlineLineSegments[1] = nbl::hlsl::shapes::Line<float>::construct(v[0].xy, v[1].xy);
-
+                    currentTriangleVertices[i].xy += topLeft;
 
                 float distancesToVerticalCellSides = min(insideCellCoord.x, cellWidth - insideCellCoord.x);
                 float distancesToHorizontalCellSides = min(insideCellCoord.y, cellWidth - insideCellCoord.y);
 
                 float patternCellCoord = distancesToVerticalCellSides >= distancesToHorizontalCellSides ? cellCoords.x : cellCoords.y;
+            }
 
-                float reciprocalPatternLength = input.getGridDTMOutlineStipplePatternLengthReciprocal();
-                if(reciprocalPatternLength > 0.0f)
-                    outlinePhaseShift = (cellWidth * (1.0f / globals.screenToWorldRatio) * patternCellCoord) * reciprocalPatternLength;
+            // find the nearest horizontal and vertical line to the fragment
+            nbl::hlsl::shapes::Line<float> outlineLineSegments[2];
+            {
+                const float halfCellWidth = cellWidth * 0.5f;
+                const float2 nearestLineRemainingCoords = int2((gridSpacePos + halfCellWidth) / cellWidth) * cellWidth + topLeft;
+
+                // find the nearest horizontal line
+                outlineLineSegments[0].P0 = float32_t2(topLeft.x, nearestLineRemainingCoords.y);
+                outlineLineSegments[0].P1 = float32_t2(topLeft.x + gridExtents.x, nearestLineRemainingCoords.y);
+                outlineLineSegments[1].P0 = float32_t2(nearestLineRemainingCoords.x, topLeft.y);
+                outlineLineSegments[1].P1 = float32_t2(nearestLineRemainingCoords.x, topLeft.y + gridExtents.y);
+
+                // test diagonal draw (to draw diagonals height or contour shading must be enabled)
+                outlineLineSegments[0] = nbl::hlsl::shapes::Line<float>::construct(currentTriangleVertices[0].xy, currentTriangleVertices[1].xy);
+                outlineLineSegments[1] = nbl::hlsl::shapes::Line<float>::construct(currentTriangleVertices[0].xy, currentTriangleVertices[1].xy);
             }
 
-            const float3 baryCoord = dtm::calculateDTMTriangleBarycentrics(v[0], v[1], v[2], input.position.xy);
-            float height = baryCoord.x * v[0].z + baryCoord.y * v[1].z + baryCoord.z * v[2].z;
+            const float3 baryCoord = dtm::calculateDTMTriangleBarycentrics(currentTriangleVertices[0], currentTriangleVertices[1], currentTriangleVertices[2], input.position.xy);
+            float height = baryCoord.x * currentTriangleVertices[0].z + baryCoord.y * currentTriangleVertices[1].z + baryCoord.z * currentTriangleVertices[2].z;
             float2 heightDeriv = fwidth(height);
 
             const bool outOfBoundsUV = uv.x < 0.0f || uv.y < 0.0f || uv.x > 1.0f || uv.y > 1.0f;
@@ -559,12 +525,12 @@ float4 fragMain(PSInput input) : SV_TARGET
             if (dtmSettings.drawContourEnabled())
             {
                 for (int i = dtmSettings.contourSettingsCount-1u; i >= 0; --i) 
-                    dtmColor = dtm::blendUnder(dtmColor, dtm::calculateDTMContourColor(dtmSettings.contourSettings[i], v, input.position.xy, height));
+                    dtmColor = dtm::blendUnder(dtmColor, dtm::calculateDTMContourColor(dtmSettings.contourSettings[i], currentTriangleVertices, input.position.xy, height));
             }
             if (dtmSettings.drawOutlineEnabled())
-                dtmColor = dtm::blendUnder(dtmColor, dtm::calculateGridDTMOutlineColor(dtmSettings.outlineLineStyleIdx, outlineLineSegments, input.position.xy, outlinePhaseShift));
+                dtmColor = dtm::blendUnder(dtmColor, dtm::calculateGridDTMOutlineColor(dtmSettings.outlineLineStyleIdx, outlineLineSegments, input.position.xy, 0.0f));
             if (dtmSettings.drawHeightShadingEnabled() && !outOfBoundsUV)
-                dtmColor = dtm::blendUnder(dtmColor, dtm::calculateDTMHeightColor(dtmSettings.heightShadingSettings, v, heightDeriv, input.position.xy, height));
+                dtmColor = dtm::blendUnder(dtmColor, dtm::calculateDTMHeightColor(dtmSettings.heightShadingSettings, currentTriangleVertices, heightDeriv, input.position.xy, height));
 
             textureColor = dtmColor.rgb / dtmColor.a;
             localAlpha = dtmColor.a;
diff --git a/62_CAD/shaders/main_pipeline/vertex_shader.hlsl b/62_CAD/shaders/main_pipeline/vertex_shader.hlsl
index 1cc75c570..6aa43cdf6 100644
--- a/62_CAD/shaders/main_pipeline/vertex_shader.hlsl
+++ b/62_CAD/shaders/main_pipeline/vertex_shader.hlsl
@@ -650,8 +650,7 @@ PSInput main(uint vertexID : SV_VertexID)
             pfloat64_t2 worldSpaceExtents = vk::RawBufferLoad<pfloat64_t2>(globals.pointers.geometryBuffer + drawObj.geometryAddress + sizeof(pfloat64_t2), 8u);
             uint32_t textureID = vk::RawBufferLoad<uint32_t>(globals.pointers.geometryBuffer + drawObj.geometryAddress + 2 * sizeof(pfloat64_t2), 8u);
             float gridCellWidth = vk::RawBufferLoad<float>(globals.pointers.geometryBuffer + drawObj.geometryAddress + 2 * sizeof(pfloat64_t2) + sizeof(uint32_t), 8u);
-            float reciprocalOutlineStipplePatternLength = vk::RawBufferLoad<float>(globals.pointers.geometryBuffer + drawObj.geometryAddress + 2 * sizeof(pfloat64_t2) + sizeof(uint32_t) + sizeof(float), 8u);
-            float thicknessOfTheThickestLine = vk::RawBufferLoad<float>(globals.pointers.geometryBuffer + drawObj.geometryAddress + 2 * sizeof(pfloat64_t2) + sizeof(uint32_t) + 2u * sizeof(float), 8u);
+            float thicknessOfTheThickestLine = vk::RawBufferLoad<float>(globals.pointers.geometryBuffer + drawObj.geometryAddress + 2 * sizeof(pfloat64_t2) + sizeof(uint32_t) + sizeof(float), 8u);
 
             // for testing purpose
             thicknessOfTheThickestLine += 200.0f;
@@ -669,7 +668,6 @@ PSInput main(uint vertexID : SV_VertexID)
             outV.setGridDTMScreenSpacePosition(transformPointScreenSpace(clipProjectionData.projectionToNDC, globals.resolution, vtxPos));
             outV.setGridDTMScreenSpaceTopLeft(transformPointScreenSpace(clipProjectionData.projectionToNDC, globals.resolution, topLeft));
             outV.setGridDTMScreenSpaceGridExtents(_static_cast<float2>(worldSpaceExtents) * globals.screenToWorldRatio);
-            outV.setGridDTMOutlineStipplePatternLengthReciprocal(reciprocalOutlineStipplePatternLength);
 
             static const float SquareRootOfTwo = 1.4142135f;
             const pfloat64_t dilationFactor = SquareRootOfTwo * thicknessOfTheThickestLine;

From 9ed92e5875006f4a9fb10598e1325a1dad89af91 Mon Sep 17 00:00:00 2001
From: Erfan Ahmadi <ahmadierfan99@gmail.com>
Date: Mon, 16 Jun 2025 11:13:22 +0400
Subject: [PATCH 362/529] Fixed Geometry Fixes

---
 62_CAD/DrawResourcesFiller.cpp | 71 +++++++++++++++++++++++-----------
 62_CAD/DrawResourcesFiller.h   | 20 ++++++++++
 2 files changed, 68 insertions(+), 23 deletions(-)

diff --git a/62_CAD/DrawResourcesFiller.cpp b/62_CAD/DrawResourcesFiller.cpp
index b540d9257..6ecbc4771 100644
--- a/62_CAD/DrawResourcesFiller.cpp
+++ b/62_CAD/DrawResourcesFiller.cpp
@@ -230,18 +230,7 @@ void DrawResourcesFiller::drawFixedGeometryPolyline(const CPolylineBase& polylin
 
 	setActiveLineStyle(lineStyleInfo);
 	
-	if (!activeProjections.empty())
-	{
-		// if there is already an active custom projection, it should be considered into the transformation of the fixed geometry polyline
-		float64_t3x3 newTransformation = nbl::hlsl::mul(activeProjections.back(), transformation);
-		pushCustomProjection(newTransformation);
-	}
-	else
-	{
-		// will be multiplied by the default projection matrix from the left (in shader), no need to consider it here
-		pushCustomProjection(transformation);
-	}
-
+	pushCustomProjection(getFixedGeometryFinalTransformationMatrix(transformation, transformationType));
 	beginMainObject(MainObjectType::POLYLINE, transformationType);
 	drawPolyline(polyline, intendedNextSubmit);
 	endMainObject();
@@ -403,17 +392,7 @@ void DrawResourcesFiller::drawFixedGeometryHatch(
 	en::nabla2d::TransformationType transformationType,
 	SIntendedSubmitInfo& intendedNextSubmit)
 {
-	if (!activeProjections.empty())
-	{
-		// if there is already an active custom projection, it should be considered into the transformation of the fixed geometry polyline
-		float64_t3x3 newTransformation = nbl::hlsl::mul(activeProjections.back(), transformation);
-		pushCustomProjection(newTransformation);
-	}
-	else
-	{
-		// will be multiplied by the default projection matrix from the left (in shader), no need to consider it here
-		pushCustomProjection(transformation);
-	}
+	pushCustomProjection(getFixedGeometryFinalTransformationMatrix(transformation, transformationType));
 	drawHatch_impl(hatch, color, fillPattern, intendedNextSubmit, transformationType);
 	popCustomProjection();
 }
@@ -1718,6 +1697,52 @@ uint32_t DrawResourcesFiller::addDTMSettings_Internal(const DTMSettingsInfo& dtm
 	return resourcesCollection.dtmSettings.addAndGetOffset(dtmSettings); // this will implicitly increase total resource consumption and reduce remaining size --> no need for mem size trackers
 }
 
+float64_t3x3 DrawResourcesFiller::getFixedGeometryFinalTransformationMatrix(const float64_t3x3& transformation, TransformationType transformationType) const
+{
+	if (!activeProjections.empty())
+	{
+		float64_t3x3 newTransformation = nbl::hlsl::mul(activeProjections.back(), transformation);
+
+		if (transformationType == TransformationType::TT_NORMAL)
+		{
+			return newTransformation;
+		}
+		else if (transformationType == TransformationType::TT_FIXED_SCREENSPACE_SIZE)
+		{
+			// Extract normalized rotation columns
+			float64_t2 column0 = nbl::hlsl::normalize(float64_t2(newTransformation[0][0], newTransformation[1][0]));
+			float64_t2 column1 = nbl::hlsl::normalize(float64_t2(newTransformation[0][1], newTransformation[1][1]));
+
+			// Extract fixed screen-space scale from the original transformation
+			float64_t2 fixedScale = float64_t2(
+				nbl::hlsl::length(float64_t2(transformation[0][0], transformation[1][0])),
+				nbl::hlsl::length(float64_t2(transformation[0][1], transformation[1][1])));
+
+			// Apply fixed scale to normalized directions
+			column0 *= fixedScale.x;
+			column1 *= fixedScale.y;
+
+			// Compose final matrix with adjusted columns
+			newTransformation[0][0] = column0[0];
+			newTransformation[1][0] = column0[1];
+			newTransformation[0][1] = column1[0];
+			newTransformation[1][1] = column1[1];
+
+			return newTransformation;
+		}
+		else
+		{
+			// Fallback if transformationType is unrecognized, shouldn't happen
+			return newTransformation;
+		}
+	}
+	else
+	{
+		// Within no active projection scope, return transformation directly
+		return transformation;
+	}
+}
+
 uint32_t DrawResourcesFiller::acquireActiveLineStyleIndex_SubmitIfNeeded(SIntendedSubmitInfo& intendedNextSubmit)
 {
 	if (activeLineStyleIndex == InvalidStyleIdx)
diff --git a/62_CAD/DrawResourcesFiller.h b/62_CAD/DrawResourcesFiller.h
index 1babd7d7a..747a225a9 100644
--- a/62_CAD/DrawResourcesFiller.h
+++ b/62_CAD/DrawResourcesFiller.h
@@ -551,6 +551,26 @@ struct DrawResourcesFiller
 	/// returns index to added DTMSettingsInfo, returns Invalid index if it exceeds resource limitations
 	uint32_t addDTMSettings_Internal(const DTMSettingsInfo& dtmSettings, SIntendedSubmitInfo& intendedNextSubmit);
 	
+	/**
+	 * @brief Computes the final transformation matrix for fixed geometry rendering,
+	 *        considering any active custom projections and the transformation type.
+	 *
+	 * This function handles how a given transformation should be applied depending on the
+	 * current transformation type and the presence of any active projection matrices.
+	 *
+	 * - If no active projection exists, the input transformation is returned unmodified.
+	 *
+	 * - If an active projection exists:
+	 *   - For TT_NORMAL, the input transformation is simply multiplied by the top of the projection stack.
+	 * - For TT_FIXED_SCREENSPACE_SIZE, the input transformation is multiplied by the top of the projection stack,
+	 *	 but the resulting scale is replaced with the screen-space scale from the original input `transformation`.
+	 *
+	 * @param transformation The input 3x3 transformation matrix to apply.
+	 * @param transformationType The type of transformation to apply (e.g., TT_NORMAL or TT_FIXED_SCREENSPACE_SIZE).
+	 *
+	 */
+	float64_t3x3 getFixedGeometryFinalTransformationMatrix(const float64_t3x3& transformation, TransformationType transformationType) const;
+
 	/// Attempts to upload as many draw objects as possible within the given polyline section considering resource limitations
 	void addPolylineObjects_Internal(const CPolylineBase& polyline, const CPolylineBase::SectionInfo& section, uint32_t& currentObjectInSection, uint32_t mainObjIdx);
 	

From 4c10dc1cdba4ab12dfedef97768aa4a10e606213 Mon Sep 17 00:00:00 2001
From: keptsecret <sorchon@gmail.com>
Date: Mon, 16 Jun 2025 15:09:09 +0700
Subject: [PATCH 363/529] use config header file for workgroup sizes

---
 23_Arithmetic2UnitTest/main.cpp | 25 +++----------------------
 29_Arithmetic2Bench/main.cpp    | 27 ++++-----------------------
 2 files changed, 7 insertions(+), 45 deletions(-)

diff --git a/23_Arithmetic2UnitTest/main.cpp b/23_Arithmetic2UnitTest/main.cpp
index 85d6e610f..158fc5c4c 100644
--- a/23_Arithmetic2UnitTest/main.cpp
+++ b/23_Arithmetic2UnitTest/main.cpp
@@ -1,6 +1,7 @@
 #include "nbl/application_templates/BasicMultiQueueApplication.hpp"
 #include "nbl/application_templates/MonoAssetManagerAndBuiltinResourceApplication.hpp"
 #include "app_resources/common.hlsl"
+#include "nbl/builtin/hlsl/workgroup2/arithmetic_config.hlsl"
 
 using namespace nbl;
 using namespace core;
@@ -214,7 +215,8 @@ class Workgroup2ScanTestApp final : public application_templates::BasicMultiQueu
 						passed = runTest<emulatedScanExclusive, false>(subgroupTestSource, elementCount, subgroupSizeLog2, workgroupSize, b_useNative, itemsPerWG, itemsPerInvocation) && passed;
 						logTestOutcome(passed, itemsPerWG);
 
-						itemsPerWG = calculateItemsPerWorkgroup(workgroupSize, subgroupSize, itemsPerInvocation);
+						hlsl::workgroup2::SArithmeticConfiguration wgConfig = hlsl::workgroup2::SArithmeticConfiguration::create(hlsl::findMSB(workgroupSize), subgroupSizeLog2, itemsPerInvocation);
+						itemsPerWG = wgConfig.VirtualWorkgroupSize * wgConfig.ItemsPerInvocation_0;
 						m_logger->log("Testing Item Count %u", ILogger::ELL_INFO, itemsPerWG);
 						passed = runTest<emulatedReduction, true>(workgroupTestSource, elementCount, subgroupSizeLog2, workgroupSize, b_useNative, itemsPerWG, itemsPerInvocation) && passed;
 						logTestOutcome(passed, itemsPerWG);
@@ -268,27 +270,6 @@ class Workgroup2ScanTestApp final : public application_templates::BasicMultiQueu
 		}
 	}
 
-	// reflects calculations in workgroup2::ArithmeticConfiguration
-	uint32_t calculateItemsPerWorkgroup(const uint32_t workgroupSize, const uint32_t subgroupSize, const uint32_t itemsPerInvocation)
-	{
-		if (workgroupSize <= subgroupSize)
-			return workgroupSize * itemsPerInvocation;
-		
-		const uint8_t subgroupSizeLog2 = hlsl::findMSB(subgroupSize);
-		const uint8_t workgroupSizeLog2 = hlsl::findMSB(workgroupSize);
-
-		const uint16_t levels = (workgroupSizeLog2 == subgroupSizeLog2) ? 1 :
-			(workgroupSizeLog2 > subgroupSizeLog2 * 2 + 2) ? 3 : 2;
-
-		const uint16_t itemsPerInvocationProductLog2 = max(workgroupSizeLog2 - subgroupSizeLog2 * levels, 0);
-		uint16_t itemsPerInvocation1 = (levels == 3) ? min(itemsPerInvocationProductLog2, 2) : itemsPerInvocationProductLog2;
-		itemsPerInvocation1 = uint16_t(1u) << itemsPerInvocation1;
-
-		uint32_t virtualWorkgroupSize = 1u << max(subgroupSizeLog2 * levels, workgroupSizeLog2);
-
-		return itemsPerInvocation * virtualWorkgroupSize;
-	}
-
 	// create pipeline (specialized every test) [TODO: turn into a future/async]
 	smart_refctd_ptr<IGPUComputePipeline> createPipeline(const ICPUShader* overridenUnspecialized, const uint8_t subgroupSizeLog2)
 	{
diff --git a/29_Arithmetic2Bench/main.cpp b/29_Arithmetic2Bench/main.cpp
index d317f07df..98ff65e05 100644
--- a/29_Arithmetic2Bench/main.cpp
+++ b/29_Arithmetic2Bench/main.cpp
@@ -2,6 +2,7 @@
 #include "CEventCallback.hpp"
 #include "nbl/application_templates/MonoAssetManagerAndBuiltinResourceApplication.hpp"
 #include "app_resources/common.hlsl"
+#include "nbl/builtin/hlsl/workgroup2/arithmetic_config.hlsl"
 
 using namespace nbl;
 using namespace core;
@@ -508,27 +509,6 @@ class ArithmeticBenchApp final : public examples::SimpleWindowedApplication, pub
 	bool keepRunning() override { return numSubmits < MaxNumSubmits; }
 
 private:
-	// reflects calculations in workgroup2::ArithmeticConfiguration
-	uint32_t calculateItemsPerWorkgroup(const uint32_t workgroupSize, const uint32_t subgroupSize, const uint32_t itemsPerInvocation)
-	{
-		if (workgroupSize <= subgroupSize)
-			return workgroupSize * itemsPerInvocation;
-
-		const uint8_t subgroupSizeLog2 = hlsl::findMSB(subgroupSize);
-		const uint8_t workgroupSizeLog2 = hlsl::findMSB(workgroupSize);
-
-		const uint16_t levels = (workgroupSizeLog2 == subgroupSizeLog2) ? 1 :
-			(workgroupSizeLog2 > subgroupSizeLog2 * 2 + 2) ? 3 : 2;
-
-		const uint16_t itemsPerInvocationProductLog2 = max(workgroupSizeLog2 - subgroupSizeLog2 * levels, 0);
-		uint16_t itemsPerInvocation1 = (levels == 3) ? min(itemsPerInvocationProductLog2, 2) : itemsPerInvocationProductLog2;
-		itemsPerInvocation1 = uint16_t(1u) << itemsPerInvocation1;
-
-		uint32_t virtualWorkgroupSize = 1u << max(subgroupSizeLog2 * levels, workgroupSizeLog2);
-
-		return itemsPerInvocation * virtualWorkgroupSize;
-	}
-
 	// create pipeline (specialized every test) [TODO: turn into a future/async]
 	smart_refctd_ptr<IGPUComputePipeline> createPipeline(const ICPUShader* overridenUnspecialized, const IGPUPipelineLayout* layout, const uint8_t subgroupSizeLog2)
 	{
@@ -577,11 +557,12 @@ class ArithmeticBenchApp final : public examples::SimpleWindowedApplication, pub
 		options.preprocessorOptions.includeFinder = includeFinder;
 
 		const uint32_t subgroupSize = 0x1u << subgroupSizeLog2;
-		const uint32_t itemsPerWG = calculateItemsPerWorkgroup(workgroupSize, subgroupSize, itemsPerInvoc);
+		const uint32_t workgroupSizeLog2 = hlsl::findMSB(workgroupSize);
+		hlsl::workgroup2::SArithmeticConfiguration wgConfig = hlsl::workgroup2::SArithmeticConfiguration::create(workgroupSizeLog2, subgroupSizeLog2, itemsPerInvoc);
+		const uint32_t itemsPerWG = wgConfig.VirtualWorkgroupSize * wgConfig.ItemsPerInvocation_0;
 		smart_refctd_ptr<ICPUShader> overriddenUnspecialized;
 		if constexpr (WorkgroupBench)
 		{
-			const uint32_t workgroupSizeLog2 = hlsl::findMSB(workgroupSize);
 			const std::string definitions[7] = {
 				"workgroup2::" + arith_name,
 				std::to_string(workgroupSizeLog2),

From 6c6c6451fd31bbd4debaef22158fd4e0e9d819f2 Mon Sep 17 00:00:00 2001
From: keptsecret <sorchon@gmail.com>
Date: Mon, 16 Jun 2025 16:04:55 +0700
Subject: [PATCH 364/529] simplified some cpp code, write all benchmark
 descriptors at beginning

---
 23_Arithmetic2UnitTest/main.cpp | 24 ++++---------
 29_Arithmetic2Bench/main.cpp    | 62 +++++++++++++++++++--------------
 2 files changed, 43 insertions(+), 43 deletions(-)

diff --git a/23_Arithmetic2UnitTest/main.cpp b/23_Arithmetic2UnitTest/main.cpp
index 158fc5c4c..35983ef08 100644
--- a/23_Arithmetic2UnitTest/main.cpp
+++ b/23_Arithmetic2UnitTest/main.cpp
@@ -176,22 +176,12 @@ class Workgroup2ScanTestApp final : public application_templates::BasicMultiQueu
 		const auto MaxWorkgroupSize = m_physicalDevice->getLimits().maxComputeWorkGroupInvocations;
 		const auto MinSubgroupSize = m_physicalDevice->getLimits().minSubgroupSize;
 		const auto MaxSubgroupSize = m_physicalDevice->getLimits().maxSubgroupSize;
-		for (uint32_t useNative = 0; useNative < 2; useNative++)
+		for (uint32_t useNative = 0; useNative <= uint32_t(m_physicalDevice->getProperties().limits.shaderSubgroupArithmetic); useNative++)
 		{
-			bool b_useNative = false;
-			if (!m_physicalDevice->getProperties().limits.shaderSubgroupArithmetic && useNative == 0)
-			{
-				m_logger->log("Device property shaderSubgroupArithmetic is false! Skipping to emulated arithmetic...", ILogger::ELL_INFO);
-				continue;
-			}
-
 			if (useNative)
 				m_logger->log("Testing with emulated subgroup arithmetic", ILogger::ELL_INFO);
 			else
-			{
 				m_logger->log("Testing with native subgroup arithmetic", ILogger::ELL_INFO);
-				b_useNative = true;
-			}
 
 			for (auto subgroupSize = MinSubgroupSize; subgroupSize <= MaxSubgroupSize; subgroupSize *= 2u)
 			{
@@ -208,21 +198,21 @@ class Workgroup2ScanTestApp final : public application_templates::BasicMultiQueu
 						uint32_t itemsPerWG = workgroupSize * itemsPerInvocation;
 						m_logger->log("Testing Items per Invocation %u", ILogger::ELL_INFO, itemsPerInvocation);
 						bool passed = true;
-						passed = runTest<emulatedReduction, false>(subgroupTestSource, elementCount, subgroupSizeLog2, workgroupSize, b_useNative, itemsPerWG, itemsPerInvocation) && passed;
+						passed = runTest<emulatedReduction, false>(subgroupTestSource, elementCount, subgroupSizeLog2, workgroupSize, bool(useNative), itemsPerWG, itemsPerInvocation) && passed;
 						logTestOutcome(passed, itemsPerWG);
-						passed = runTest<emulatedScanInclusive, false>(subgroupTestSource, elementCount, subgroupSizeLog2, workgroupSize, b_useNative, itemsPerWG, itemsPerInvocation) && passed;
+						passed = runTest<emulatedScanInclusive, false>(subgroupTestSource, elementCount, subgroupSizeLog2, workgroupSize, bool(useNative), itemsPerWG, itemsPerInvocation) && passed;
 						logTestOutcome(passed, itemsPerWG);
-						passed = runTest<emulatedScanExclusive, false>(subgroupTestSource, elementCount, subgroupSizeLog2, workgroupSize, b_useNative, itemsPerWG, itemsPerInvocation) && passed;
+						passed = runTest<emulatedScanExclusive, false>(subgroupTestSource, elementCount, subgroupSizeLog2, workgroupSize, bool(useNative), itemsPerWG, itemsPerInvocation) && passed;
 						logTestOutcome(passed, itemsPerWG);
 
 						hlsl::workgroup2::SArithmeticConfiguration wgConfig = hlsl::workgroup2::SArithmeticConfiguration::create(hlsl::findMSB(workgroupSize), subgroupSizeLog2, itemsPerInvocation);
 						itemsPerWG = wgConfig.VirtualWorkgroupSize * wgConfig.ItemsPerInvocation_0;
 						m_logger->log("Testing Item Count %u", ILogger::ELL_INFO, itemsPerWG);
-						passed = runTest<emulatedReduction, true>(workgroupTestSource, elementCount, subgroupSizeLog2, workgroupSize, b_useNative, itemsPerWG, itemsPerInvocation) && passed;
+						passed = runTest<emulatedReduction, true>(workgroupTestSource, elementCount, subgroupSizeLog2, workgroupSize, bool(useNative), itemsPerWG, itemsPerInvocation) && passed;
 						logTestOutcome(passed, itemsPerWG);
-						passed = runTest<emulatedScanInclusive, true>(workgroupTestSource, elementCount, subgroupSizeLog2, workgroupSize, b_useNative, itemsPerWG, itemsPerInvocation) && passed;
+						passed = runTest<emulatedScanInclusive, true>(workgroupTestSource, elementCount, subgroupSizeLog2, workgroupSize, bool(useNative), itemsPerWG, itemsPerInvocation) && passed;
 						logTestOutcome(passed, itemsPerWG);
-						passed = runTest<emulatedScanExclusive, true>(workgroupTestSource, elementCount, subgroupSizeLog2, workgroupSize, b_useNative, itemsPerWG, itemsPerInvocation) && passed;
+						passed = runTest<emulatedScanExclusive, true>(workgroupTestSource, elementCount, subgroupSizeLog2, workgroupSize, bool(useNative), itemsPerWG, itemsPerInvocation) && passed;
 						logTestOutcome(passed, itemsPerWG);
 					}
 					m_api->endCapture();
diff --git a/29_Arithmetic2Bench/main.cpp b/29_Arithmetic2Bench/main.cpp
index 98ff65e05..9e98cfe5b 100644
--- a/29_Arithmetic2Bench/main.cpp
+++ b/29_Arithmetic2Bench/main.cpp
@@ -292,18 +292,45 @@ class ArithmeticBenchApp final : public examples::SimpleWindowedApplication, pub
 				benchLayout = m_device->createDescriptorSetLayout(binding);
 			}
 
-			benchPool = m_device->createDescriptorPoolForDSLayouts(IDescriptorPool::ECF_UPDATE_AFTER_BIND_BIT, { &benchLayout.get(),1 });
-			benchDs = benchPool->createDescriptorSet(smart_refctd_ptr(benchLayout));
+			const uint32_t setCount = ISwapchain::MaxImages;
+			benchPool = m_device->createDescriptorPoolForDSLayouts(IDescriptorPool::ECF_UPDATE_AFTER_BIND_BIT, { &benchLayout.get(),1 }, &setCount);
+			for (auto i = 0u; i < ISwapchain::MaxImages; i++)
+			{
+			    benchDs[i] = benchPool->createDescriptorSet(smart_refctd_ptr(benchLayout));
+				if (!benchDs[i])
+					return logFail("Could not create Descriptor Set!");
+			}
 
 			SPushConstantRange pcRange = { .stageFlags = IShader::E_SHADER_STAGE::ESS_COMPUTE, .offset = 0,.size = sizeof(PushConstantData) };
 			benchPplnLayout = m_device->createPipelineLayout({ &pcRange, 1 }, std::move(benchLayout));
 		}
 		if (UseNativeArithmetic && !m_physicalDevice->getProperties().limits.shaderSubgroupArithmetic)
 		{
-			m_logger->log("UseNativeArithmetic is true but device does not support shaderSubgroupArithmetic!", ILogger::ELL_ERROR);
-			exit(-1);
+			logFail("UseNativeArithmetic is true but device does not support shaderSubgroupArithmetic!");
+			return false;
+		}
+
+		IGPUDescriptorSet::SWriteDescriptorSet dsWrites[ISwapchain::MaxImages];
+		for (auto i = 0u; i < ISwapchain::MaxImages; i++)
+		{
+			if (swapchainImageViews[i].get() == nullptr)
+				continue;
+
+			video::IGPUDescriptorSet::SDescriptorInfo dsInfo;
+			dsInfo.info.image.imageLayout = IImage::LAYOUT::GENERAL;
+			dsInfo.desc = swapchainImageViews[i];
+
+			dsWrites[i] =
+			{
+				.dstSet = benchDs[i].get(),
+				.binding = 2u,
+				.arrayElement = 0u,
+				.count = 1u,
+				.info = &dsInfo,
+			};
+			m_device->updateDescriptorSets(1u, &dsWrites[i], 0u, nullptr);
 		}
-			
+
 
 		// load shader source from file
 		auto getShaderSource = [&](const char* filePath) -> auto
@@ -396,31 +423,14 @@ class ArithmeticBenchApp final : public examples::SimpleWindowedApplication, pub
 			cmdbuf->pipelineBarrier(E_DEPENDENCY_FLAGS::EDF_NONE, { .imgBarriers = imageBarriers });
 		}
 
-		video::IGPUDescriptorSet::SDescriptorInfo dsInfo;
-		dsInfo.info.image.imageLayout = IImage::LAYOUT::GENERAL;
-		dsInfo.desc = swapchainImageViews[m_currentImageAcquire.imageIndex];
-
-		IGPUDescriptorSet::SWriteDescriptorSet dsWrites[1u] =
-		{
-			{
-				.dstSet = benchDs.get(),
-				.binding = 2u,
-				.arrayElement = 0u,
-				.count = 1u,
-				.info = &dsInfo,
-			}
-		};
-		m_device->updateDescriptorSets(1u, dsWrites, 0u, nullptr);
-
-		const uint32_t elementCount = 1024*1024;
 		const auto MaxSubgroupSize = m_physicalDevice->getLimits().maxSubgroupSize;
 		const auto SubgroupSizeLog2 = hlsl::findMSB(MaxSubgroupSize);
 
-		cmdbuf->bindDescriptorSets(EPBP_COMPUTE, benchSets[0].pipeline->getLayout(), 0u, 1u, &benchDs.get());
+		cmdbuf->bindDescriptorSets(EPBP_COMPUTE, benchSets[0].pipeline->getLayout(), 0u, 1u, &benchDs[m_currentImageAcquire.imageIndex].get());
 		cmdbuf->pushConstants(benchSets[0].pipeline->getLayout(), IShader::E_SHADER_STAGE::ESS_COMPUTE, 0, sizeof(PushConstantData), &pc);
 
 		for (uint32_t i = 0; i < benchSets.size(); i++)
-			runBenchmark<DoWorkgroupBenchmarks>(cmdbuf, benchSets[i], elementCount, SubgroupSizeLog2);
+			runBenchmark<DoWorkgroupBenchmarks>(cmdbuf, benchSets[i], ElementCount, SubgroupSizeLog2);
 
 		// barrier transition to PRESENT
 		{
@@ -688,13 +698,13 @@ class ArithmeticBenchApp final : public examples::SimpleWindowedApplication, pub
 	uint32_t ItemsPerInvocation = 4u;
 	constexpr static inline uint32_t NumLoops = 1000u;
 	constexpr static inline uint32_t NumBenchmarks = 6u;
-	std::array<uint32_t, NumBenchmarks> workgroupSizes = { 32, 64, 128, 256, 512, 1024 };
+	std::array<uint32_t, NumBenchmarks> workgroupSizes = { 32, 64,  128, 256, 512, 1024 };
 	std::array<std::string, 3u> arithmeticOperations = { "reduction", "inclusive_scan", "exclusive_scan" };
 
 
 	std::array<BenchmarkSet, NumBenchmarks*3u> benchSets;
 	smart_refctd_ptr<IDescriptorPool> benchPool;
-	smart_refctd_ptr<IGPUDescriptorSet> benchDs;
+	std::array<smart_refctd_ptr<IGPUDescriptorSet>, ISwapchain::MaxImages> benchDs;
 
 	constexpr static inline uint32_t OutputBufferCount = 2u;
 	smart_refctd_ptr<IGPUBuffer> outputBuffers[OutputBufferCount];

From 638846ead247d596a7bbf75fe014e0a38001671d Mon Sep 17 00:00:00 2001
From: kevyuu <kevin.kayu@gmail.com>
Date: Mon, 16 Jun 2025 16:09:41 +0700
Subject: [PATCH 365/529] Fix ray tracing pipeline demo alpha

---
 71_RayTracingPipeline/main.cpp | 1 +
 1 file changed, 1 insertion(+)

diff --git a/71_RayTracingPipeline/main.cpp b/71_RayTracingPipeline/main.cpp
index 18f15a488..5ee6789ae 100644
--- a/71_RayTracingPipeline/main.cpp
+++ b/71_RayTracingPipeline/main.cpp
@@ -1131,6 +1131,7 @@ class RaytracingPipelineApp final : public examples::SimpleWindowedApplication,
 					.diffuse = {0.2, 0.2, 0.8},
 					.specular = {0.8, 0.8, 0.8},
 					.shininess = 1.0f,
+					.alpha = 1.0f,
 				},
 				.transform = getTranslationMatrix(-5.0f, 1.0f, 0),
 			},

From 6c251d10bb54af6cafa09c53ba9c95fa61ae0115 Mon Sep 17 00:00:00 2001
From: kevyuu <kevin.kayu@gmail.com>
Date: Mon, 16 Jun 2025 16:12:05 +0700
Subject: [PATCH 366/529] Remove test code in ray query geometry shaders

---
 .../app_resources/render.comp.hlsl            | 135 ------------------
 1 file changed, 135 deletions(-)

diff --git a/67_RayQueryGeometry/app_resources/render.comp.hlsl b/67_RayQueryGeometry/app_resources/render.comp.hlsl
index 7e8a7dad6..657d0bbf0 100644
--- a/67_RayQueryGeometry/app_resources/render.comp.hlsl
+++ b/67_RayQueryGeometry/app_resources/render.comp.hlsl
@@ -16,16 +16,6 @@ using namespace nbl::hlsl;
 [[vk::binding(1, 0)]] RWTexture2D<float4> outImage;
 [[vk::constant_id(0)]] const float shader_variant = 1.0;
 
-struct SGeomInfo2
-{
-    uint64_t vertexBufferAddress;
-    uint64_t indexBufferAddress;
-
-    uint32_t vertexStride : 29;
-    uint32_t indexType : 2; // 16 bit, 32 bit or none
-    uint32_t smoothNormals : 1;	// flat for cube, rectangle, disk
-};
-
 float3 unpackNormals3x10(uint32_t v)
 {
     // host side changes float32_t3 to EF_A2B10G10R10_SNORM_PACK32
@@ -35,77 +25,6 @@ float3 unpackNormals3x10(uint32_t v)
     return clamp(float3(pn) / 511.0, -1.0, 1.0);
 }
 
-float3 calculateSmoothNormals2(int instID, int primID, SGeomInfo2 geom, float2 bary)
-{
-    const uint indexType = geom.indexType;
-    const uint vertexStride = geom.vertexStride;
-
-    const uint64_t vertexBufferAddress = geom.vertexBufferAddress;
-    const uint64_t indexBufferAddress = geom.indexBufferAddress;
-
-    uint32_t3 indices;
-    switch (indexType)
-    {
-        case 0: // EIT_16BIT
-            indices = uint32_t3((nbl::hlsl::bda::__ptr<uint16_t3>::create(indexBufferAddress)+primID).deref().load());
-            break;
-        case 1: // EIT_32BIT
-            indices = uint32_t3((nbl::hlsl::bda::__ptr<uint32_t3>::create(indexBufferAddress)+primID).deref().load());
-            break;
-        default:    // EIT_NONE
-        {
-            indices[0] = primID * 3;
-            indices[1] = indices[0] + 1;
-            indices[2] = indices[0] + 2;
-        }
-    }
-
-    float3 n0, n1, n2;
-    switch (instID)
-    {
-        case OT_CUBE:
-        {
-            // TODO: document why the alignment is 2 here and nowhere else? isnt the `vertexStride` aligned to more than 2 anyway?
-            uint32_t v0 = vk::RawBufferLoad<uint32_t>(vertexBufferAddress + indices[0] * vertexStride, 2u);
-            uint32_t v1 = vk::RawBufferLoad<uint32_t>(vertexBufferAddress + indices[1] * vertexStride, 2u);
-            uint32_t v2 = vk::RawBufferLoad<uint32_t>(vertexBufferAddress + indices[2] * vertexStride, 2u);
-
-            n0 = normalize(nbl::hlsl::spirv::unpackSnorm4x8(v0).xyz);
-            n1 = normalize(nbl::hlsl::spirv::unpackSnorm4x8(v1).xyz);
-            n2 = normalize(nbl::hlsl::spirv::unpackSnorm4x8(v2).xyz);
-        }
-        break;
-        case OT_SPHERE:
-        case OT_CYLINDER:
-        case OT_ARROW:
-        case OT_CONE:
-        {
-            uint32_t v0 = vk::RawBufferLoad<uint32_t>(vertexBufferAddress + indices[0] * vertexStride);
-            uint32_t v1 = vk::RawBufferLoad<uint32_t>(vertexBufferAddress + indices[1] * vertexStride);
-            uint32_t v2 = vk::RawBufferLoad<uint32_t>(vertexBufferAddress + indices[2] * vertexStride);
-
-            n0 = normalize(unpackNormals3x10(v0));
-            n1 = normalize(unpackNormals3x10(v1));
-            n2 = normalize(unpackNormals3x10(v2));
-        }
-        break;
-        case OT_RECTANGLE:
-        case OT_DISK:
-        case OT_ICOSPHERE:
-        default:
-        {
-            n0 = normalize(vk::RawBufferLoad<float3>(vertexBufferAddress + indices[0] * vertexStride));
-            n1 = normalize(vk::RawBufferLoad<float3>(vertexBufferAddress + indices[1] * vertexStride));
-            n2 = normalize(vk::RawBufferLoad<float3>(vertexBufferAddress + indices[2] * vertexStride));
-        }
-    }
-
-    float3 barycentrics = float3(0.0, bary);
-    barycentrics.x = 1.0 - barycentrics.y - barycentrics.z;        
-
-    return barycentrics.x * n0 + barycentrics.y * n1 + barycentrics.z * n2;
-}
-
 float3 calculateSmoothNormals(int instID, int primID, SGeomInfo geom, float2 bary)
 {
     const uint indexType = geom.indexType;
@@ -221,57 +140,3 @@ void main(uint32_t3 threadID : SV_DispatchThreadID)
 
     outImage[threadID.xy] = color;
 }
-    
-[numthreads(WorkgroupSize, WorkgroupSize, 1)]
-[shader("compute")]
-void main2(uint32_t3 threadID : SV_DispatchThreadID)
-{
-    uint2 coords = threadID.xy;
-    coords.y = nbl::hlsl::glsl::gl_NumWorkGroups().y * WorkgroupSize - coords.y;    // need to invert it
-    
-
-    float4 NDC;
-    NDC.xy = float2(coords) * pc.scaleNDC;
-    NDC.xy += pc.offsetNDC;
-    NDC.zw = float2(0, 1.0);
-    float3 targetPos;
-    {
-        float4 tmp = mul(pc.invMVP, NDC);
-        targetPos = tmp.xyz / tmp.w;
-    }
-
-    float3 direction = normalize(targetPos - pc.camPos);
-
-    spirv::RayQueryKHR query;
-    spirv::rayQueryInitializeKHR(query, topLevelAS, spv::RayFlagsOpaqueKHRMask, 0xFF, pc.camPos, 0.01, direction, 1000.0);
-
-    while (spirv::rayQueryProceedKHR(query)) {}
-
-    float4 color = float4(0, 0, 0, 1);
-
-    if (spirv::rayQueryGetIntersectionTypeKHR(query, true) == spv::RayQueryCommittedIntersectionTypeRayQueryCommittedIntersectionTriangleKHR)
-    {
-        const int instID = spirv::rayQueryGetIntersectionInstanceIdKHR(query, true);
-        const int primID = spirv::rayQueryGetIntersectionPrimitiveIndexKHR(query, true);
-
-        // TODO: candidate for `bda::__ptr<SGeomInfo>`
-        const SGeomInfo2 geom = vk::RawBufferLoad<SGeomInfo2>(pc.geometryInfoBuffer + instID * sizeof(SGeomInfo2));
-        
-        float3 normals;
-        if (geom.smoothNormals)
-        {
-            float2 barycentrics = spirv::rayQueryGetIntersectionBarycentricsKHR(query, true);
-            normals = calculateSmoothNormals2(instID, primID, geom, barycentrics);
-        }
-        else
-        {
-            float3 pos[3] = spirv::rayQueryGetIntersectionTriangleVertexPositionsKHR(query, true);
-            normals = cross(pos[1] - pos[0], pos[2] - pos[0]);
-        }
-
-        normals = normalize(normals) * 0.5 + 0.5;
-        color = float4(normals, shader_variant);
-    }
-
-    outImage[threadID.xy] = color;
-}

From 2076b666c2fb8a86390b0e49e16290b6c1ed7483 Mon Sep 17 00:00:00 2001
From: keptsecret <sorchon@gmail.com>
Date: Mon, 16 Jun 2025 17:15:30 +0700
Subject: [PATCH 367/529] simplified data accessor template + usage

---
 .../app_resources/testSubgroup.comp.hlsl      |  5 +--
 .../app_resources/testWorkgroup.comp.hlsl     | 10 ++---
 23_Arithmetic2UnitTest/main.cpp               |  9 ++--
 .../app_resources/benchmarkSubgroup.comp.hlsl |  2 +-
 .../benchmarkWorkgroup.comp.hlsl              | 33 ++++++++-------
 29_Arithmetic2Bench/app_resources/common.hlsl |  1 -
 29_Arithmetic2Bench/main.cpp                  |  2 +-
 common/include/WorkgroupDataAccessors.hlsl    | 41 ++++++++++---------
 8 files changed, 53 insertions(+), 50 deletions(-)

diff --git a/23_Arithmetic2UnitTest/app_resources/testSubgroup.comp.hlsl b/23_Arithmetic2UnitTest/app_resources/testSubgroup.comp.hlsl
index 6cd496648..e079e5e63 100644
--- a/23_Arithmetic2UnitTest/app_resources/testSubgroup.comp.hlsl
+++ b/23_Arithmetic2UnitTest/app_resources/testSubgroup.comp.hlsl
@@ -24,13 +24,12 @@ static void subtest(NBL_CONST_REF_ARG(type_t) sourceVal)
 
     const uint64_t outputBufAddr = pc.pOutputBuf[Binop::BindingIndex];
 
-    if (glsl::gl_SubgroupSize()!=1u<<SUBGROUP_SIZE_LOG2)
-        vk::RawBufferStore<uint32_t>(outputBufAddr, glsl::gl_SubgroupSize());
+    assert(glsl::gl_SubgroupSize() == 1u<<SUBGROUP_SIZE_LOG2)
 
     operation_t<params_t> func;
     type_t val = func(sourceVal);
 
-    vk::RawBufferStore<type_t>(outputBufAddr + sizeof(uint32_t) + sizeof(type_t) * globalIndex(), val, sizeof(uint32_t));
+    vk::RawBufferStore<type_t>(outputBufAddr + sizeof(type_t) * globalIndex(), val, sizeof(uint32_t));
 }
 
 type_t test()
diff --git a/23_Arithmetic2UnitTest/app_resources/testWorkgroup.comp.hlsl b/23_Arithmetic2UnitTest/app_resources/testWorkgroup.comp.hlsl
index 97ff31481..4b30526a6 100644
--- a/23_Arithmetic2UnitTest/app_resources/testWorkgroup.comp.hlsl
+++ b/23_Arithmetic2UnitTest/app_resources/testWorkgroup.comp.hlsl
@@ -30,17 +30,18 @@ struct operation_t
     // workgroup scans do no return anything, but use the data accessor to do the storing directly
     void operator()()
     {
-        PreloadedDataProxy<config_t,Binop> dataAccessor = PreloadedDataProxy<config_t,Binop>::create();
+        using data_proxy_t = PreloadedDataProxy<config_t::WorkgroupSizeLog2,config_t::ItemsPerInvocation_0,config_t::VirtualWorkgroupSize/config_t::WorkgroupSize>;
+        data_proxy_t dataAccessor = data_proxy_t::create(pc.pInputBuf, pc.pOutputBuf[Binop::BindingIndex]);
         dataAccessor.preload();
 #if IS_REDUCTION
         otype_t value =
 #endif
-        OPERATION<config_t,binop_base_t,device_capabilities>::template __call<PreloadedDataProxy<config_t,Binop>, ScratchProxy>(dataAccessor,arithmeticAccessor);
+        OPERATION<config_t,binop_base_t,device_capabilities>::template __call<data_proxy_t, ScratchProxy>(dataAccessor,arithmeticAccessor);
         // we barrier before because we alias the accessors for Binop
         arithmeticAccessor.workgroupExecutionAndMemoryBarrier();
 #if IS_REDUCTION
         [unroll]
-        for (uint32_t i = 0; i < PreloadedDataProxy<config_t,Binop>::PreloadedDataCount; i++)
+        for (uint32_t i = 0; i < data_proxy_t::PreloadedDataCount; i++)
             dataAccessor.preloaded[i] = value;
 #endif
         dataAccessor.unload();
@@ -51,8 +52,7 @@ struct operation_t
 template<class Binop>
 static void subtest()
 {
-    if (glsl::gl_SubgroupSize()!=1u<<SUBGROUP_SIZE_LOG2)
-        vk::RawBufferStore<uint32_t>(pc.pOutputBuf[Binop::BindingIndex], glsl::gl_SubgroupSize());
+    assert(glsl::gl_SubgroupSize() == 1u<<SUBGROUP_SIZE_LOG2)
 
     operation_t<Binop,device_capabilities> func;
     func();
diff --git a/23_Arithmetic2UnitTest/main.cpp b/23_Arithmetic2UnitTest/main.cpp
index 35983ef08..6c979d7e5 100644
--- a/23_Arithmetic2UnitTest/main.cpp
+++ b/23_Arithmetic2UnitTest/main.cpp
@@ -89,7 +89,7 @@ class Workgroup2ScanTestApp final : public application_templates::BasicMultiQueu
 		for (auto i=0u; i<OutputBufferCount; i++)
 		{
 			IGPUBuffer::SCreationParams params = {};
-			params.size = sizeof(uint32_t) + gpuinputDataBuffer->getSize();
+			params.size = gpuinputDataBuffer->getSize();
 			params.usage = bitflag(IGPUBuffer::EUF_STORAGE_BUFFER_BIT) | IGPUBuffer::EUF_TRANSFER_SRC_BIT | IGPUBuffer::EUF_SHADER_DEVICE_ADDRESS_BIT;
 
 			outputBuffers[i] = m_device->createBuffer(std::move(params));
@@ -179,9 +179,9 @@ class Workgroup2ScanTestApp final : public application_templates::BasicMultiQueu
 		for (uint32_t useNative = 0; useNative <= uint32_t(m_physicalDevice->getProperties().limits.shaderSubgroupArithmetic); useNative++)
 		{
 			if (useNative)
-				m_logger->log("Testing with emulated subgroup arithmetic", ILogger::ELL_INFO);
-			else
 				m_logger->log("Testing with native subgroup arithmetic", ILogger::ELL_INFO);
+			else
+				m_logger->log("Testing with emulated subgroup arithmetic", ILogger::ELL_INFO);
 
 			for (auto subgroupSize = MinSubgroupSize; subgroupSize <= MaxSubgroupSize; subgroupSize *= 2u)
 			{
@@ -417,9 +417,8 @@ class Workgroup2ScanTestApp final : public application_templates::BasicMultiQueu
 		m_utils->downloadBufferRangeViaStagingBufferAutoSubmit(SIntendedSubmitInfo{.queue=transferDownQueue},bufferRange,resultsBuffer->getPointer());
 
 		using type_t = typename Binop::type_t;
-		const auto dataFromBuffer = reinterpret_cast<const uint32_t*>(resultsBuffer->getPointer());
+		const auto testData = reinterpret_cast<const uint32_t*>(resultsBuffer->getPointer());
 
-		const auto testData = reinterpret_cast<const type_t*>(dataFromBuffer + 1);
 		// TODO: parallel for (the temporary values need to be threadlocal or what?)
 		// now check if the data obtained has valid values
 		type_t* tmp = new type_t[itemsPerWG];
diff --git a/29_Arithmetic2Bench/app_resources/benchmarkSubgroup.comp.hlsl b/29_Arithmetic2Bench/app_resources/benchmarkSubgroup.comp.hlsl
index 2da7de38f..9141ade55 100644
--- a/29_Arithmetic2Bench/app_resources/benchmarkSubgroup.comp.hlsl
+++ b/29_Arithmetic2Bench/app_resources/benchmarkSubgroup.comp.hlsl
@@ -31,7 +31,7 @@ static void subbench(NBL_CONST_REF_ARG(type_t) sourceVal)
     for (uint32_t i = 0; i < NUM_LOOPS; i++)
         value = func(value);
 
-    vk::RawBufferStore<type_t>(outputBufAddr + sizeof(uint32_t) + sizeof(type_t) * globalIndex(), value, sizeof(uint32_t));
+    vk::RawBufferStore<type_t>(outputBufAddr + sizeof(type_t) * globalIndex(), value, sizeof(uint32_t));
 }
 
 void benchmark()
diff --git a/29_Arithmetic2Bench/app_resources/benchmarkWorkgroup.comp.hlsl b/29_Arithmetic2Bench/app_resources/benchmarkWorkgroup.comp.hlsl
index ad861a30d..561aadc56 100644
--- a/29_Arithmetic2Bench/app_resources/benchmarkWorkgroup.comp.hlsl
+++ b/29_Arithmetic2Bench/app_resources/benchmarkWorkgroup.comp.hlsl
@@ -19,29 +19,30 @@ groupshared uint32_t scratch[mpl::max_v<int16_t,config_t::SharedScratchElementCo
 
 #include "../../common/include/WorkgroupDataAccessors.hlsl"
 
-template<class Config, class Binop>
+template<uint16_t WorkgroupSizeLog2, uint16_t ItemsPerInvocation, uint16_t _PreloadedDataCount>
 struct RandomizedInputDataProxy
 {
-    using dtype_t = vector<uint32_t, Config::ItemsPerInvocation_0>;
+    using dtype_t = vector<uint32_t, ItemsPerInvocation>;
 
-    NBL_CONSTEXPR_STATIC_INLINE uint32_t PreloadedDataCount = Config::VirtualWorkgroupSize / Config::WorkgroupSize;
+    NBL_CONSTEXPR_STATIC_INLINE uint16_t PreloadedDataCount = _PreloadedDataCount;
+    NBL_CONSTEXPR_STATIC_INLINE uint16_t WorkgroupSize = uint16_t(1u) << WorkgroupSizeLog2;
 
-    static RandomizedInputDataProxy<Config, Binop> create()
+    static RandomizedInputDataProxy<WorkgroupSizeLog2, ItemsPerInvocation, PreloadedDataCount> create(uint64_t inputBuf, uint64_t outputBuf)
     {
-        RandomizedInputDataProxy<Config, Binop> retval;
-        retval.data = DataProxy<Config, Binop>::create();
+        RandomizedInputDataProxy<WorkgroupSizeLog2, ItemsPerInvocation, PreloadedDataCount> retval;
+        retval.data = DataProxy<WorkgroupSize*PreloadedDataCount, ItemsPerInvocation>::create(inputBuf, outputBuf);
         return retval;
     }
 
     template<typename AccessType, typename IndexType>
     void get(const IndexType ix, NBL_REF_ARG(AccessType) value)
     {
-        value = preloaded[ix>>Config::WorkgroupSizeLog2];
+        value = preloaded[ix>>WorkgroupSizeLog2];
     }
     template<typename AccessType, typename IndexType>
     void set(const IndexType ix, const AccessType value)
     {
-        preloaded[ix>>Config::WorkgroupSizeLog2] = value;
+        preloaded[ix>>WorkgroupSizeLog2] = value;
     }
 
     void preload()
@@ -51,7 +52,7 @@ struct RandomizedInputDataProxy
         [unroll]
         for (uint16_t idx = 0; idx < PreloadedDataCount; idx++)
             [unroll]
-            for (uint16_t i = 0; i < Config::ItemsPerInvocation_0; i++)
+            for (uint16_t i = 0; i < ItemsPerInvocation; i++)
                preloaded[idx][i] = xoroshiro();
     }
     void unload()
@@ -59,7 +60,7 @@ struct RandomizedInputDataProxy
         const uint16_t invocationIndex = workgroup::SubgroupContiguousIndex();
         [unroll]
         for (uint16_t idx = 0; idx < PreloadedDataCount; idx++)
-            data.template set<dtype_t, uint16_t>(idx * Config::WorkgroupSize + invocationIndex, preloaded[idx]);
+            data.template set<dtype_t, uint16_t>(idx * WorkgroupSize + invocationIndex, preloaded[idx]);
     }
 
     void workgroupExecutionAndMemoryBarrier()
@@ -68,29 +69,31 @@ struct RandomizedInputDataProxy
         //glsl::memoryBarrierShared(); implied by the above
     }
 
-    DataProxy<Config, Binop> data;
+    DataProxy<WorkgroupSize*PreloadedDataCount, ItemsPerInvocation> data;
     dtype_t preloaded[PreloadedDataCount];
 };
 
 static ScratchProxy arithmeticAccessor;
 
+using data_proxy_t = RandomizedInputDataProxy<config_t::WorkgroupSizeLog2,config_t::ItemsPerInvocation_0,config_t::VirtualWorkgroupSize/config_t::WorkgroupSize>;
+
 template<class Binop, class device_capabilities>
 struct operation_t
 {
     using binop_base_t = typename Binop::base_t;
     using otype_t = typename Binop::type_t;
 
-    void operator()(RandomizedInputDataProxy<config_t,Binop> dataAccessor)
+    void operator()(data_proxy_t dataAccessor)
     {
 #if IS_REDUCTION
         otype_t value = 
 #endif
-        OPERATION<config_t,binop_base_t,device_capabilities>::template __call<RandomizedInputDataProxy<config_t,Binop>, ScratchProxy>(dataAccessor,arithmeticAccessor);
+        OPERATION<config_t,binop_base_t,device_capabilities>::template __call<data_proxy_t, ScratchProxy>(dataAccessor,arithmeticAccessor);
         // we barrier before because we alias the accessors for Binop
         arithmeticAccessor.workgroupExecutionAndMemoryBarrier();
 #if IS_REDUCTION
         [unroll]
-        for (uint32_t i = 0; i < RandomizedInputDataProxy<config_t,Binop>::PreloadedDataCount; i++)
+        for (uint32_t i = 0; i < data_proxy_t::PreloadedDataCount; i++)
             dataAccessor.preloaded[i] = value;
 #endif
     }
@@ -99,7 +102,7 @@ struct operation_t
 template<class Binop>
 static void subbench()
 {
-    RandomizedInputDataProxy<config_t,Binop> dataAccessor = RandomizedInputDataProxy<config_t,Binop>::create();
+    data_proxy_t dataAccessor = data_proxy_t::create(0, pc.pOutputBuf[Binop::BindingIndex]);
     dataAccessor.preload();
 
     operation_t<Binop,device_capabilities> func;
diff --git a/29_Arithmetic2Bench/app_resources/common.hlsl b/29_Arithmetic2Bench/app_resources/common.hlsl
index 0cdcd7dad..cca5af987 100644
--- a/29_Arithmetic2Bench/app_resources/common.hlsl
+++ b/29_Arithmetic2Bench/app_resources/common.hlsl
@@ -3,7 +3,6 @@
 
 struct PushConstantData
 {
-    uint64_t pInputBuf;
     uint64_t pOutputBuf[2];
 };
 
diff --git a/29_Arithmetic2Bench/main.cpp b/29_Arithmetic2Bench/main.cpp
index 9e98cfe5b..945749320 100644
--- a/29_Arithmetic2Bench/main.cpp
+++ b/29_Arithmetic2Bench/main.cpp
@@ -698,7 +698,7 @@ class ArithmeticBenchApp final : public examples::SimpleWindowedApplication, pub
 	uint32_t ItemsPerInvocation = 4u;
 	constexpr static inline uint32_t NumLoops = 1000u;
 	constexpr static inline uint32_t NumBenchmarks = 6u;
-	std::array<uint32_t, NumBenchmarks> workgroupSizes = { 32, 64,  128, 256, 512, 1024 };
+	std::array<uint32_t, NumBenchmarks> workgroupSizes = { 32, 64, 128, 256, 512, 1024 };
 	std::array<std::string, 3u> arithmeticOperations = { "reduction", "inclusive_scan", "exclusive_scan" };
 
 
diff --git a/common/include/WorkgroupDataAccessors.hlsl b/common/include/WorkgroupDataAccessors.hlsl
index 6beadfbc9..e1774fad6 100644
--- a/common/include/WorkgroupDataAccessors.hlsl
+++ b/common/include/WorkgroupDataAccessors.hlsl
@@ -31,28 +31,29 @@ struct ScratchProxy
     }
 };
 
-template<class Config, class Binop>
+template<uint16_t WorkgroupSize, uint16_t ItemsPerInvocation>
 struct DataProxy
 {
-    using dtype_t = vector<uint32_t, Config::ItemsPerInvocation_0>;
+    using dtype_t = vector<uint32_t, ItemsPerInvocation>;
 
-    static DataProxy<Config, Binop> create()
+    static DataProxy<WorkgroupSize, ItemsPerInvocation> create(uint64_t inputBuf, uint64_t outputBuf)
     {
-        DataProxy<Config, Binop> retval;
-        retval.workgroupOffset = glsl::gl_WorkGroupID().x * Config::VirtualWorkgroupSize;
-        retval.outputBufAddr = sizeof(uint32_t) + pc.pOutputBuf[Binop::BindingIndex];
+        DataProxy<WorkgroupSize, ItemsPerInvocation> retval;
+        retval.workgroupOffset = glsl::gl_WorkGroupID().x * WorkgroupSize;
+        retval.inputBufAddr = inputBuf;
+        retval.outputBufAddr = outputBuf;
         return retval;
     }
 
     template<typename AccessType, typename IndexType>
     void get(const IndexType ix, NBL_REF_ARG(AccessType) value)
     {
-        value = vk::RawBufferLoad<AccessType>(pc.pInputBuf + (workgroupOffset + ix) * sizeof(AccessType));
+        value = vk::RawBufferLoad<AccessType>(inputBufAddr + (workgroupOffset + ix) * sizeof(AccessType));
     }
     template<typename AccessType, typename IndexType>
     void set(const IndexType ix, const AccessType value)
     {
-        vk::RawBufferStore<AccessType>(outputBufAddr + sizeof(AccessType) * (workgroupOffset+ix), value, sizeof(uint32_t));
+        vk::RawBufferStore<AccessType>(outputBufAddr + (workgroupOffset + ix) * sizeof(AccessType), value, sizeof(uint32_t));
     }
 
     void workgroupExecutionAndMemoryBarrier()
@@ -62,32 +63,34 @@ struct DataProxy
     }
 
     uint32_t workgroupOffset;
+    uint64_t inputBufAddr;
     uint64_t outputBufAddr;
 };
 
-template<class Config, class Binop>
+template<uint16_t WorkgroupSizeLog2, uint16_t ItemsPerInvocation, uint16_t _PreloadedDataCount>
 struct PreloadedDataProxy
 {
-    using dtype_t = vector<uint32_t, Config::ItemsPerInvocation_0>;
+    using dtype_t = vector<uint32_t, ItemsPerInvocation>;
 
-    NBL_CONSTEXPR_STATIC_INLINE uint32_t PreloadedDataCount = Config::VirtualWorkgroupSize / Config::WorkgroupSize;
+    NBL_CONSTEXPR_STATIC_INLINE uint16_t PreloadedDataCount = _PreloadedDataCount;
+    NBL_CONSTEXPR_STATIC_INLINE uint16_t WorkgroupSize = uint16_t(1u) << WorkgroupSizeLog2;
 
-    static PreloadedDataProxy<Config, Binop> create()
+    static PreloadedDataProxy<WorkgroupSizeLog2, ItemsPerInvocation, PreloadedDataCount> create(uint64_t inputBuf, uint64_t outputBuf)
     {
-        PreloadedDataProxy<Config, Binop> retval;
-        retval.data = DataProxy<Config, Binop>::create();
+        PreloadedDataProxy<WorkgroupSizeLog2, ItemsPerInvocation, PreloadedDataCount> retval;
+        retval.data = DataProxy<WorkgroupSize*PreloadedDataCount, ItemsPerInvocation>::create(inputBuf, outputBuf);
         return retval;
     }
 
     template<typename AccessType, typename IndexType>
     void get(const IndexType ix, NBL_REF_ARG(AccessType) value)
     {
-        value = preloaded[ix>>Config::WorkgroupSizeLog2];
+        value = preloaded[ix>>WorkgroupSizeLog2];
     }
     template<typename AccessType, typename IndexType>
     void set(const IndexType ix, const AccessType value)
     {
-        preloaded[ix>>Config::WorkgroupSizeLog2] = value;
+        preloaded[ix>>WorkgroupSizeLog2] = value;
     }
 
     void preload()
@@ -95,14 +98,14 @@ struct PreloadedDataProxy
         const uint16_t invocationIndex = workgroup::SubgroupContiguousIndex();
         [unroll]
         for (uint16_t idx = 0; idx < PreloadedDataCount; idx++)
-            data.template get<dtype_t, uint16_t>(idx * Config::WorkgroupSize + invocationIndex, preloaded[idx]);
+            data.template get<dtype_t, uint16_t>(idx * WorkgroupSize + invocationIndex, preloaded[idx]);
     }
     void unload()
     {
         const uint16_t invocationIndex = workgroup::SubgroupContiguousIndex();
         [unroll]
         for (uint16_t idx = 0; idx < PreloadedDataCount; idx++)
-            data.template set<dtype_t, uint16_t>(idx * Config::WorkgroupSize + invocationIndex, preloaded[idx]);
+            data.template set<dtype_t, uint16_t>(idx * WorkgroupSize + invocationIndex, preloaded[idx]);
     }
 
     void workgroupExecutionAndMemoryBarrier()
@@ -111,7 +114,7 @@ struct PreloadedDataProxy
         //glsl::memoryBarrierShared(); implied by the above
     }
 
-    DataProxy<Config, Binop> data;
+    DataProxy<WorkgroupSize*PreloadedDataCount, ItemsPerInvocation> data;
     dtype_t preloaded[PreloadedDataCount];
 };
 

From c111500a1f572b082e8fbb340c71bb8955d68244 Mon Sep 17 00:00:00 2001
From: Przemek <minikers21@gmail.com>
Date: Mon, 16 Jun 2025 15:30:42 +0200
Subject: [PATCH 368/529] Added option to draw grid without height texture
 provided

---
 62_CAD/DrawResourcesFiller.cpp                | 16 +++++++++------
 62_CAD/DrawResourcesFiller.h                  |  8 ++++++--
 62_CAD/main.cpp                               | 20 ++++++++++++++-----
 .../main_pipeline/fragment_shader.hlsl        | 14 +++++++++++--
 4 files changed, 43 insertions(+), 15 deletions(-)

diff --git a/62_CAD/DrawResourcesFiller.cpp b/62_CAD/DrawResourcesFiller.cpp
index 3935e26d3..fcf271383 100644
--- a/62_CAD/DrawResourcesFiller.cpp
+++ b/62_CAD/DrawResourcesFiller.cpp
@@ -746,22 +746,23 @@ void DrawResourcesFiller::drawGridDTM(
 	float gridCellWidth,
 	uint64_t textureID,
 	const DTMSettingsInfo& dtmSettingsInfo,
-	SIntendedSubmitInfo& intendedNextSubmit)
+	SIntendedSubmitInfo& intendedNextSubmit,
+	const bool drawGridOnly/* = false*/)
 {
 	GridDTMInfo gridDTMInfo;
 	gridDTMInfo.topLeft = topLeft;
 	gridDTMInfo.worldSpaceExtents = worldSpaceExtents;
 	gridDTMInfo.gridCellWidth = gridCellWidth;
-	gridDTMInfo.textureID = getImageIndexFromID(textureID, intendedNextSubmit); // for this to be valid and safe, this function needs to be called immediately after `addStaticImage` function to make sure image is in memory
+	if(!drawGridOnly)
+		gridDTMInfo.textureID = getImageIndexFromID(textureID, intendedNextSubmit); // for this to be valid and safe, this function needs to be called immediately after `addStaticImage` function to make sure image is in memory
 
 	// determine the thickes line
 	float thickestLineThickness = 0.0f;
-
 	if (dtmSettingsInfo.mode & E_DTM_MODE::OUTLINE)
 	{
 		thickestLineThickness = dtmSettingsInfo.outlineStyleInfo.worldSpaceLineWidth + dtmSettingsInfo.outlineStyleInfo.screenSpaceLineWidth;
 	}
-	else if (dtmSettingsInfo.mode & E_DTM_MODE::CONTOUR)
+	else if (dtmSettingsInfo.mode & E_DTM_MODE::CONTOUR && !drawGridOnly)
 	{
 		for (int i = 0; i < dtmSettingsInfo.contourSettingsCount; ++i)
 		{
@@ -772,7 +773,7 @@ void DrawResourcesFiller::drawGridDTM(
 	}
 	gridDTMInfo.thicknessOfTheThickestLine = thickestLineThickness;
 
-	setActiveDTMSettings(dtmSettingsInfo);
+	setActiveDTMSettings(dtmSettingsInfo, drawGridOnly);
 	beginMainObject(MainObjectType::GRID_DTM);
 
 	uint32_t mainObjectIdx = acquireActiveMainObjectIndex_SubmitIfNeeded(intendedNextSubmit);
@@ -1001,10 +1002,13 @@ void DrawResourcesFiller::setActiveLineStyle(const LineStyleInfo& lineStyle)
 	activeLineStyleIndex = InvalidStyleIdx;
 }
 
-void DrawResourcesFiller::setActiveDTMSettings(const DTMSettingsInfo& dtmSettingsInfo)
+void DrawResourcesFiller::setActiveDTMSettings(const DTMSettingsInfo& dtmSettingsInfo, const bool disableHeightRelatedDTMModes/* = false*/)
 {
 	activeDTMSettings = dtmSettingsInfo;
 	activeDTMSettingsIndex = InvalidDTMSettingsIdx;
+
+	if (disableHeightRelatedDTMModes)
+		activeDTMSettings.mode &= E_DTM_MODE::OUTLINE;
 }
 
 void DrawResourcesFiller::beginMainObject(MainObjectType type, TransformationType transformationType)
diff --git a/62_CAD/DrawResourcesFiller.h b/62_CAD/DrawResourcesFiller.h
index dd24ea2e9..d1128d556 100644
--- a/62_CAD/DrawResourcesFiller.h
+++ b/62_CAD/DrawResourcesFiller.h
@@ -243,7 +243,8 @@ struct DrawResourcesFiller
 		float gridCellWidth,
 		uint64_t textureID,
 		const DTMSettingsInfo& dtmSettingsInfo,
-		SIntendedSubmitInfo& intendedNextSubmit);
+		SIntendedSubmitInfo& intendedNextSubmit,
+		const bool drawGridOnly = false);
 
 	/**
 	 * @brief Adds a static 2D image to the draw resource set for rendering.
@@ -355,7 +356,10 @@ struct DrawResourcesFiller
 
 	// Setting Active Resources:
 	void setActiveLineStyle(const LineStyleInfo& lineStyle);
-	void setActiveDTMSettings(const DTMSettingsInfo& dtmSettingsInfo);
+	/**
+	* @param disableHeightRelatedDTMModes disables E_DTM_MODE::CONTOUR and E_DTOM_MODE::HEIGHT_SHADING, necessary when we want to draw a grid DTM without using a height map texture
+	*/
+	void setActiveDTMSettings(const DTMSettingsInfo& dtmSettingsInfo, const bool disableHeightRelatedDTMModes = false);
 
 	void beginMainObject(MainObjectType type, TransformationType transformationType = TransformationType::TT_NORMAL);
 	void endMainObject();
diff --git a/62_CAD/main.cpp b/62_CAD/main.cpp
index 49f6090e7..3195c7964 100644
--- a/62_CAD/main.cpp
+++ b/62_CAD/main.cpp
@@ -3654,16 +3654,26 @@ class ComputerAidedDesign final : public examples::SimpleWindowedApplication, pu
 			worldSpaceExtents.y = (heightMapExtent.height - 1) * HeightMapCellWidth;
 			const uint64_t heightMapTextureID = 0ull;
 
-			StaticImageInfo heightMapStaticImageInfo = {
+			constexpr bool DrawGridOnly = true;
+			
+			if(DrawGridOnly)
+			{
+				dtmInfo.mode = E_DTM_MODE::OUTLINE;
+				drawResourcesFiller.drawGridDTM(topLeft, worldSpaceExtents, HeightMapCellWidth, heightMapTextureID, dtmInfo, intendedNextSubmit, DrawGridOnly);
+			}
+			else
+			{
+				StaticImageInfo heightMapStaticImageInfo = {
 				.imageID = heightMapTextureID,
 				.cpuImage = gridDTMHeightMap,
 				.forceUpdate = false,
 				.imageViewFormatOverride = asset::E_FORMAT::EF_R32G32B32A32_UINT // for now we use only R32G32B32A32_* anyway
-			};
+				};
 
-			if (!drawResourcesFiller.ensureStaticImageAvailability(heightMapStaticImageInfo, intendedNextSubmit))
-				m_logger->log("Grid DTM height map texture unavailable!", ILogger::ELL_ERROR);
-			drawResourcesFiller.drawGridDTM(topLeft, worldSpaceExtents, HeightMapCellWidth, heightMapTextureID,  dtmInfo, intendedNextSubmit);
+				if (!drawResourcesFiller.ensureStaticImageAvailability(heightMapStaticImageInfo, intendedNextSubmit))
+					m_logger->log("Grid DTM height map texture unavailable!", ILogger::ELL_ERROR);
+				drawResourcesFiller.drawGridDTM(topLeft, worldSpaceExtents, HeightMapCellWidth, heightMapTextureID, dtmInfo, intendedNextSubmit);
+			}
 
 			// draw test polyline
 #if 0
diff --git a/62_CAD/shaders/main_pipeline/fragment_shader.hlsl b/62_CAD/shaders/main_pipeline/fragment_shader.hlsl
index aca52e937..f91b2ab51 100644
--- a/62_CAD/shaders/main_pipeline/fragment_shader.hlsl
+++ b/62_CAD/shaders/main_pipeline/fragment_shader.hlsl
@@ -474,12 +474,22 @@ float4 fragMain(PSInput input) : SV_TARGET
                     currentTriangleVertices[0] = float3(gridSpaceCellTopLeftCoords.x, gridSpaceCellTopLeftCoords.y, cellHeights.w);
                     currentTriangleVertices[1] = float3(gridSpaceCellTopLeftCoords.x + cellWidth, gridSpaceCellTopLeftCoords.y + cellWidth, cellHeights.y);
                     currentTriangleVertices[2] = triangleA ? float3(gridSpaceCellTopLeftCoords.x, gridSpaceCellTopLeftCoords.y + cellWidth, cellHeights.x) : float3(gridSpaceCellTopLeftCoords.x + cellWidth, gridSpaceCellTopLeftCoords.y, cellHeights.z);
+
+                    // TODO: use cell space instead https://github.com/Devsh-Graphics-Programming/Nabla-Examples-and-Tests/pull/186#discussion_r2133699055
+                    //currentTriangleVertices[0] = float3(0.0f, 0.0f, cellHeights.w);
+                    //currentTriangleVertices[1] = float3(cellWidth, cellWidth, cellHeights.y);
+                    //currentTriangleVertices[2] = triangleA ? float3(0.0f, cellWidth, cellHeights.x) : float3(cellWidth, 0.0f, cellHeights.z);
                 }
                 else
                 {
                     currentTriangleVertices[0] = float3(gridSpaceCellTopLeftCoords.x, gridSpaceCellTopLeftCoords.y + cellWidth, cellHeights.x);
                     currentTriangleVertices[1] = float3(gridSpaceCellTopLeftCoords.x + cellWidth, gridSpaceCellTopLeftCoords.y, cellHeights.z);
                     currentTriangleVertices[2] = triangleA ? float3(gridSpaceCellTopLeftCoords.x, gridSpaceCellTopLeftCoords.y, cellHeights.w) : float3(gridSpaceCellTopLeftCoords.x + cellWidth, gridSpaceCellTopLeftCoords.y + cellWidth, cellHeights.y);
+
+                    // TODO: use cell space instead https://github.com/Devsh-Graphics-Programming/Nabla-Examples-and-Tests/pull/186#discussion_r2133699055
+                    //currentTriangleVertices[0] = float3(0.0f, 0.0f + cellWidth, cellHeights.x);
+                    //currentTriangleVertices[1] = float3(0.0f + cellWidth, 0.0f, cellHeights.z);
+                    //currentTriangleVertices[2] = triangleA ? float3(0.0f, 0.0f, cellHeights.w) : float3(cellWidth, cellWidth, cellHeights.y);
                 }
 
                 bool isTriangleInvalid = isnan(currentTriangleVertices[0].z) || isnan(currentTriangleVertices[1].z) || isnan(currentTriangleVertices[2].z);
@@ -512,8 +522,8 @@ float4 fragMain(PSInput input) : SV_TARGET
                 outlineLineSegments[1].P1 = float32_t2(nearestLineRemainingCoords.x, topLeft.y + gridExtents.y);
 
                 // test diagonal draw (to draw diagonals height or contour shading must be enabled)
-                outlineLineSegments[0] = nbl::hlsl::shapes::Line<float>::construct(currentTriangleVertices[0].xy, currentTriangleVertices[1].xy);
-                outlineLineSegments[1] = nbl::hlsl::shapes::Line<float>::construct(currentTriangleVertices[0].xy, currentTriangleVertices[1].xy);
+                //outlineLineSegments[0] = nbl::hlsl::shapes::Line<float>::construct(currentTriangleVertices[0].xy, currentTriangleVertices[1].xy);
+                //outlineLineSegments[1] = nbl::hlsl::shapes::Line<float>::construct(currentTriangleVertices[0].xy, currentTriangleVertices[1].xy);
             }
 
             const float3 baryCoord = dtm::calculateDTMTriangleBarycentrics(currentTriangleVertices[0], currentTriangleVertices[1], currentTriangleVertices[2], input.position.xy);

From a7cfeeb63e7891912124ab1746150c768a5bbcc9 Mon Sep 17 00:00:00 2001
From: Przemek <minikers21@gmail.com>
Date: Mon, 16 Jun 2025 16:52:41 +0200
Subject: [PATCH 369/529] Fix

---
 62_CAD/DrawResourcesFiller.cpp | 12 ++++++------
 62_CAD/DrawResourcesFiller.h   | 12 ++++++------
 2 files changed, 12 insertions(+), 12 deletions(-)

diff --git a/62_CAD/DrawResourcesFiller.cpp b/62_CAD/DrawResourcesFiller.cpp
index 8362addef..cc9e513e8 100644
--- a/62_CAD/DrawResourcesFiller.cpp
+++ b/62_CAD/DrawResourcesFiller.cpp
@@ -368,12 +368,12 @@ void DrawResourcesFiller::drawHatch(const Hatch& hatch, const float32_t4& color,
 }
 
 void DrawResourcesFiller::drawFixedGeometryHatch(
-		const en::nabla2d::Hatch& hatch,
+		const Hatch& hatch,
 		const float32_t4& foregroundColor,
 		const float32_t4& backgroundColor,
-		const en::nabla2d::HatchFillPattern fillPattern,
+		const HatchFillPattern fillPattern,
 		const float64_t3x3& transformation,
-		en::nabla2d::TransformationType transformationType, 
+		TransformationType transformationType, 
 		SIntendedSubmitInfo& intendedNextSubmit)
 {
 	// TODO[Optimization Idea]: don't draw hatch twice, we now have color storage buffer and we can treat rendering hatches like a procedural texture (requires 2 colors so no more abusing of linestyle for hatches)
@@ -389,7 +389,7 @@ void DrawResourcesFiller::drawFixedGeometryHatch(
 	const float32_t4& color,
 	const HatchFillPattern fillPattern,
 	const float64_t3x3& transformation,
-	en::nabla2d::TransformationType transformationType,
+	TransformationType transformationType,
 	SIntendedSubmitInfo& intendedNextSubmit)
 {
 	pushCustomProjection(getFixedGeometryFinalTransformationMatrix(transformation, transformationType));
@@ -401,7 +401,7 @@ void DrawResourcesFiller::drawFixedGeometryHatch(
 	const Hatch& hatch,
 	const float32_t4& color,
 	const float64_t3x3& transformation,
-	en::nabla2d::TransformationType transformationType,
+	TransformationType transformationType,
 	SIntendedSubmitInfo& intendedNextSubmit)
 {
 	drawFixedGeometryHatch(hatch, color, HatchFillPattern::SOLID_FILL, transformation, transformationType, intendedNextSubmit);
@@ -412,7 +412,7 @@ void DrawResourcesFiller::drawHatch_impl(
 	const float32_t4& color,
 	const HatchFillPattern fillPattern,
 	SIntendedSubmitInfo& intendedNextSubmit,
-	en::nabla2d::TransformationType transformationType)
+	TransformationType transformationType)
 {
 	if (color.a == 0.0f) // not visible
 		return;
diff --git a/62_CAD/DrawResourcesFiller.h b/62_CAD/DrawResourcesFiller.h
index 6a41849d2..1a74338e7 100644
--- a/62_CAD/DrawResourcesFiller.h
+++ b/62_CAD/DrawResourcesFiller.h
@@ -229,12 +229,12 @@ struct DrawResourcesFiller
 	
 	//! Convinience function for fixed-geometry Hatch with MSDF Pattern and a solid background
 	void drawFixedGeometryHatch(
-		const en::nabla2d::Hatch& hatch,
+		const Hatch& hatch,
 		const float32_t4& foregroundColor,
 		const float32_t4& backgroundColor,
-		const en::nabla2d::HatchFillPattern fillPattern,
+		const HatchFillPattern fillPattern,
 		const float64_t3x3& transformation,
-		en::nabla2d::TransformationType transformationType,
+		TransformationType transformationType,
 		SIntendedSubmitInfo& intendedNextSubmit);
 
 	// ! Fixed-geometry Hatch with MSDF Pattern
@@ -243,7 +243,7 @@ struct DrawResourcesFiller
 		const float32_t4& color,
 		const HatchFillPattern fillPattern,
 		const float64_t3x3& transformation,
-		en::nabla2d::TransformationType transformationType,
+		TransformationType transformationType,
 		SIntendedSubmitInfo& intendedNextSubmit);
 
 	// ! Solid Fill Fixed-geometry Hatch
@@ -251,7 +251,7 @@ struct DrawResourcesFiller
 		const Hatch& hatch,
 		const float32_t4& color,
 		const float64_t3x3& transformation,
-		en::nabla2d::TransformationType transformationType,
+		TransformationType transformationType,
 		SIntendedSubmitInfo& intendedNextSubmit);
 	
 	/// Used by SingleLineText, Issue drawing a font glyph
@@ -675,7 +675,7 @@ struct DrawResourcesFiller
 		const float32_t4& color,
 		const HatchFillPattern fillPattern,
 		SIntendedSubmitInfo& intendedNextSubmit,
-		en::nabla2d::TransformationType transformationType = en::nabla2d::TransformationType::TT_NORMAL);
+		TransformationType transformationType = TransformationType::TT_NORMAL);
 
 	void resetMainObjects()
 	{

From d1a8113db65c3cbd2be2d7ccf804c054f4aff1e2 Mon Sep 17 00:00:00 2001
From: devsh <devsh@devsh.eu>
Date: Mon, 16 Jun 2025 17:35:42 +0200
Subject: [PATCH 370/529] prep the push constants a little and move onto scene
 conversion

---
 09_GeometryCreator/main.cpp                   |   9 +-
 .../examples/common/SBasicViewParameters.hlsl |  28 +-
 .../geometry/CGeometryCreatorScene.hpp        | 411 ++++++------------
 .../nbl/examples/geometry/SPushConstants.hlsl |  33 ++
 4 files changed, 185 insertions(+), 296 deletions(-)
 create mode 100644 common/include/nbl/examples/geometry/SPushConstants.hlsl

diff --git a/09_GeometryCreator/main.cpp b/09_GeometryCreator/main.cpp
index 2a3a1553e..f246b5c79 100644
--- a/09_GeometryCreator/main.cpp
+++ b/09_GeometryCreator/main.cpp
@@ -37,6 +37,12 @@ class GeometryCreatorApp final : public examples::MonoWindowApplication
 					return logFail("Couldn't create Command Buffer!");
 			}
 
+//			auto scRes = static_cast<CDefaultSwapchainFramebuffers*>(m_surface->getSwapchainResources());
+//			.renderpass = core::smart_refctd_ptr<video::IGPURenderpass>(scRes->getRenderpass())
+			auto scene = CGeometryCreatorScene::create({
+				.utilities = m_utils,
+				.logger = m_logger
+			});
 #if 0
 			//using Builder = typename CScene::CreateResourcesDirectlyWithDevice::Builder;
 			using Builder = typename CScene::CreateResourcesWithAssetConverter::Builder;
@@ -92,9 +98,6 @@ class GeometryCreatorApp final : public examples::MonoWindowApplication
 			const auto viewMatrix = camera.getViewMatrix();
 			const auto viewProjectionMatrix = camera.getConcatenatedMatrix();
 
-			core::matrix3x4SIMD modelMatrix;
-			modelMatrix.setTranslation(nbl::core::vectorSIMDf(0, 0, 0, 0));
-			modelMatrix.setRotation(quaternion(0, 0, 0));
 
 			core::matrix3x4SIMD modelViewMatrix = core::concatenateBFollowedByA(viewMatrix, modelMatrix);
 			core::matrix4SIMD modelViewProjectionMatrix = core::concatenateBFollowedByA(viewProjectionMatrix, modelMatrix);
diff --git a/common/include/nbl/examples/common/SBasicViewParameters.hlsl b/common/include/nbl/examples/common/SBasicViewParameters.hlsl
index 0d0990186..b7ad31cb6 100644
--- a/common/include/nbl/examples/common/SBasicViewParameters.hlsl
+++ b/common/include/nbl/examples/common/SBasicViewParameters.hlsl
@@ -1,15 +1,27 @@
-#ifndef _S_BASIC_VIEW_PARAMETERS_COMMON_HLSL_
-#define _S_BASIC_VIEW_PARAMETERS_COMMON_HLSL_
+#ifndef _NBL_EXAMPLES_S_BASIC_VIEW_PARAMETERS_HLSL_
+#define _NBL_EXAMPLES_S_BASIC_VIEW_PARAMETERS_HLSL_
 
-#ifdef __HLSL_VERSION
-struct SBasicViewParameters //! matches CPU version size & alignment (160, 4)
+
+#include "nbl/builtin/hlsl/cpp_compat/matrix.hlsl"
+
+
+namespace nbl
+{
+namespace hlsl
+{
+namespace examples
+{
+
+struct SBasicViewParameters
 {
-	float4x4 MVP;
-	float3x4 MV;
-	float3x3 normalMat;
+	float32_t4x4 MVP;
+	float32_t3x4 MV;
+	float32_t3x3 normalMat;
 };
-#endif // _S_BASIC_VIEW_PARAMETERS_COMMON_HLSL_
 
+}
+}
+}
 #endif
 
 /*
diff --git a/common/include/nbl/examples/geometry/CGeometryCreatorScene.hpp b/common/include/nbl/examples/geometry/CGeometryCreatorScene.hpp
index e68441ffe..e39e536b0 100644
--- a/common/include/nbl/examples/geometry/CGeometryCreatorScene.hpp
+++ b/common/include/nbl/examples/geometry/CGeometryCreatorScene.hpp
@@ -3,11 +3,10 @@
 
 
 #include <nabla.h>
-
+#include "nbl/builtin/hlsl/math/linalg/fast_affine.hlsl"
 #include "nbl/asset/utils/CGeometryCreator.h"
 
-// soon to be deprecated!
-#include "nbl/examples/common/SBasicViewParameters.hlsl"
+#include "nbl/examples/geometry/SPushConstants.hlsl"
 
 // TODO: Arek bring back
 //#include "nbl/examples/geometry/spirv/builtin/CArchive.h"
@@ -17,9 +16,11 @@
 namespace nbl::examples
 {
 
-class CGeometryCreatorScene
+class CGeometryCreatorScene : public core::IReferenceCounted
 {
 	public:
+		using SPushConstants = hlsl::geometry_creator_scene::SPushConstants;
+		//
 		enum ObjectType : uint8_t
 		{
 			OT_CUBE,
@@ -32,144 +33,145 @@ class CGeometryCreatorScene
 			OT_ICOSPHERE,
 
 			OT_COUNT,
-			OT_UNKNOWN = std::numeric_limits<uint8_t>::max()
+			OT_UNKNOWN = OT_COUNT
 		};
-};
-#if 0
 
-struct ObjectMeta
-{
-	ObjectType type = OT_UNKNOWN;
-	std::string_view name = "Unknown";
-};
+#define EXPOSE_NABLA_NAMESPACES using namespace nbl::core; \
+using namespace nbl::system; \
+using namespace nbl::asset; \
+using namespace nbl::video
 
-constexpr static inline struct ClearValues
-{
-	nbl::video::IGPUCommandBuffer::SClearColorValue color = { .float32 = {0.f,0.f,0.f,1.f} };
-	nbl::video::IGPUCommandBuffer::SClearDepthStencilValue depth = { .depth = 0.f };
-} clear;
-
-#define TYPES_IMPL_BOILERPLATE(WithConverter) struct Types \
-{ \
-	using descriptor_set_layout_t = std::conditional_t<WithConverter, nbl::asset::ICPUDescriptorSetLayout, nbl::video::IGPUDescriptorSetLayout>; \
-	using pipeline_layout_t = std::conditional_t<WithConverter, nbl::asset::ICPUPipelineLayout, nbl::video::IGPUPipelineLayout>; \
-	using renderpass_t = std::conditional_t<WithConverter, nbl::asset::ICPURenderpass, nbl::video::IGPURenderpass>; \
-	using image_view_t = std::conditional_t<WithConverter, nbl::asset::ICPUImageView, nbl::video::IGPUImageView>; \
-	using image_t = std::conditional_t<WithConverter, nbl::asset::ICPUImage, nbl::video::IGPUImage>; \
-	using buffer_t = std::conditional_t<WithConverter, nbl::asset::ICPUBuffer, nbl::video::IGPUBuffer>; \
-	using shader_t = std::conditional_t<WithConverter, nbl::asset::ICPUShader, nbl::video::IGPUShader>; \
-	using graphics_pipeline_t = std::conditional_t<WithConverter, nbl::asset::ICPUGraphicsPipeline, nbl::video::IGPUGraphicsPipeline>; \
-	using descriptor_set = std::conditional_t<WithConverter, nbl::asset::ICPUDescriptorSet, nbl::video::IGPUDescriptorSet>; \
-}
-
-template<bool withAssetConverter>
-struct ResourcesBundleBase
-{
-	TYPES_IMPL_BOILERPLATE(withAssetConverter);
-
-	struct ReferenceObject
-	{
-		struct Bindings
+		//
+		struct SCreateParams
 		{
-			nbl::asset::SBufferBinding<typename Types::buffer_t> vertex, index;
+			core::smart_refctd_ptr<video::IUtilities> utilities;
+			core::smart_refctd_ptr<system::ILogger> logger;
 		};
+		static inline core::smart_refctd_ptr<CGeometryCreatorScene> create(SCreateParams&& params)
+		{
+			EXPOSE_NABLA_NAMESPACES;
+			auto* logger = params.logger.get();
+			assert(logger);
+			if (!params.utilities)
+			{
+				logger->log("Pass a non-null `IUtilities`!",ILogger::ELL_ERROR);
+				return nullptr;
+			}
+			auto device = params.utilities->getLogicalDevice();
 
-		nbl::core::smart_refctd_ptr<typename Types::graphics_pipeline_t> pipeline = nullptr;
-
-		Bindings bindings;
-		nbl::asset::E_INDEX_TYPE indexType = nbl::asset::E_INDEX_TYPE::EIT_UNKNOWN;
-		uint32_t indexCount = {};
-	};
-
-	using ReferenceDrawHook = std::pair<ReferenceObject, ObjectMeta>;
-
-	nbl::core::smart_refctd_ptr<typename Types::renderpass_t> renderpass;
-	std::array<ReferenceDrawHook, OT_COUNT> objects;
-	nbl::asset::SBufferBinding<typename Types::buffer_t> ubo;
-
-	struct
-	{
-		nbl::core::smart_refctd_ptr<typename Types::image_view_t> color, depth;
-	} attachments;
-
-	nbl::core::smart_refctd_ptr<typename Types::descriptor_set> descriptorSet;
-};
-
-struct ResourcesBundle : public ResourcesBundleBase<false>
-{
-	using base_t = ResourcesBundleBase<false>;
-};
+			constexpr auto DescriptorCount = 255;
+			smart_refctd_ptr<ICPUDescriptorSet> cpuDS;
+			{
+				// create Descriptor Set Layout
+				smart_refctd_ptr<ICPUDescriptorSetLayout> dsLayout;
+				{
+					const ICPUDescriptorSetLayout::SBinding bindings[] =
+					{
+						{
+							.binding = 0,
+							.type = IDescriptor::E_TYPE::ET_UNIFORM_TEXEL_BUFFER,
+							// some geometries may not have particular attributes
+							.createFlags = IGPUDescriptorSetLayout::SBinding::E_CREATE_FLAGS::ECF_PARTIALLY_BOUND_BIT,
+							.stageFlags = IShader::E_SHADER_STAGE::ESS_VERTEX|IShader::E_SHADER_STAGE::ESS_FRAGMENT,
+							.count = DescriptorCount
+						}
+					};
+					dsLayout = core::make_smart_refctd_ptr<ICPUDescriptorSetLayout>(bindings);
+					if (!dsLayout)
+					{
+						logger->log("Could not create descriptor set layout!", ILogger::ELL_ERROR);
+						return nullptr;
+					}
+				}
 
-#define EXPOSE_NABLA_NAMESPACES() using namespace nbl; \
-using namespace core; \
-using namespace asset; \
-using namespace video; \
-using namespace scene; \
-using namespace system
+				// create Descriptor Set
+				cpuDS = core::make_smart_refctd_ptr<ICPUDescriptorSet>(std::move(dsLayout));
+				if (!cpuDS)
+				{
+					logger->log("Could not descriptor set!", ILogger::ELL_ERROR);
+					return nullptr;
+				}
+			}
 
-template<bool withAssetConverter>
-class ResourceBuilder
-{
-public:
-	TYPES_IMPL_BOILERPLATE(withAssetConverter);
+			SInitParams init;
+			// create out geometries
+			{
+				auto* const outDescs = cpuDS->getDescriptorInfoStorage(IDescriptor::E_TYPE::ET_UNIFORM_TEXEL_BUFFER).data();
+				uint8_t nextDesc = 0;
+				auto allocateUTB = [DescriptorCount,outDescs,&nextDesc](const IGeometry<ICPUBuffer>::SDataView& view)->uint8_t
+				{
+					if (!view)
+						return DescriptorCount;
+					outDescs[nextDesc].desc = core::make_smart_refctd_ptr<ICPUBufferView>(view.src,view.composed.format);
+					return nextDesc++;
+				};
 
-	using this_t = ResourceBuilder<withAssetConverter>;
+				auto addGeometry = [&allocateUTB,&init](const ICPUPolygonGeometry* geom)->void
+				{
+					auto& out = init.geoms.emplace_back();
+					out.elementCount = geom->getPrimitiveCount()*geom->getIndexingCallback()->degree();
+					out.positionView = allocateUTB(geom->getPositionView());
+					out.normalView = allocateUTB(geom->getNormalView());
+					// the first view is usually the UV
+					if (const auto& auxViews = geom->getAuxAttributeViews(); !auxViews.empty())
+						out.uvView = allocateUTB(auxViews.front());
+				};
 
-	ResourceBuilder(nbl::video::IUtilities* const _utilities, nbl::video::IGPUCommandBuffer* const _commandBuffer, nbl::system::ILogger* const _logger, const nbl::asset::IGeometryCreator* const _geometryCreator)
-		: utilities(_utilities), commandBuffer(_commandBuffer), logger(_logger), geometries(_geometryCreator)
-	{
-		assert(utilities);
-		assert(logger);
-	}
+				auto creator = core::make_smart_refctd_ptr<CGeometryCreator>();
+				addGeometry(creator->createCube().get());
+			}
 
-	/*
-		if (withAssetConverter) then
-			-> .build cpu objects
-		else
-			-> .build gpu objects & record any resource update upload transfers into command buffer
-	*/
+			// convert the geometries
+			{
+				init.ds = nullptr;
+			}
 
-	inline bool build()
-	{
-		EXPOSE_NABLA_NAMESPACES();
+			return smart_refctd_ptr<CGeometryCreatorScene>(new CGeometryCreatorScene(std::move(init)),dont_grab);
+		}
 
-		if constexpr (!withAssetConverter)
+		//
+		struct SPackedGeometry
 		{
-			commandBuffer->reset(IGPUCommandBuffer::RESET_FLAGS::RELEASE_RESOURCES_BIT);
-			commandBuffer->begin(IGPUCommandBuffer::USAGE::ONE_TIME_SUBMIT_BIT);
-			commandBuffer->beginDebugMarker("Resources builder's buffers upload [manual]");
-		}
+			inline SPushConstants convert(const hlsl::float32_t3x4& model, const hlsl::float32_t3x4& view, const hlsl::float32_t4x4& viewProj)
+			{
+				return {
+					.basic = {
+						.MVP = hlsl::math::linalg::promoted_mul(viewProj,model),
+						.MV = hlsl::math::linalg::promoted_mul(view,model),
+						.normalMat = hlsl::inverse(hlsl::transpose(hlsl::float32_t3x3(view)))
+					},
+					.positionView = positionView,
+					.normalView = normalView,
+					.uvView = uvView
+				};
+			}
 
-		using functor_t = std::function<bool(void)>;
-
-		auto work = std::to_array
-		({
-			functor_t(std::bind(&this_t::createDescriptorSetLayout, this)),
-			functor_t(std::bind(&this_t::createPipelineLayout, this)),
-			functor_t(std::bind(&this_t::createRenderpass, this)),
-			functor_t(std::bind(&this_t::createFramebufferAttachments, this)),
-			functor_t(std::bind(&this_t::createShaders, this)),
-			functor_t(std::bind(&this_t::createGeometries, this)),
-			functor_t(std::bind(&this_t::createViewParametersUboBuffer, this)),
-			functor_t(std::bind(&this_t::createDescriptorSet, this))
-		});
-
-		for (auto& task : work)
-			if (!task())
-				return false;
+			core::smart_refctd_ptr<video::IGPUBuffer> indexBuffer = nullptr;
+			uint32_t elementCount = 0;
+			// indices into the descriptor set
+			uint8_t positionView = 0;
+			uint8_t normalView = 0;
+			uint8_t uvView = 0;
+			uint8_t indexType = EIT_UNKNOWN;
+			ObjectType type : 6 = ObjectType::OT_UNKNOWN;
+		};
+		std::span<const SPackedGeometry> getGeometries() const {return m_params.geoms;}
 
-		if constexpr (!withAssetConverter)
-			commandBuffer->end();
+	protected:
+		struct SInitParams
+		{
+			core::smart_refctd_ptr<IGPUDescriptorSet> ds;
+			core::vector<SPackedGeometry> geoms;
+		} m_params;
+		inline CGeometryCreatorScene(SInitParams&& _params) : m_params(std::move(_params)) {}
 
-		return true;
-	}
+#undef EXPOSE_NABLA_NAMESPACES
+};
 
-	/*
-		if (withAssetConverter) then
-			-> .convert cpu objects to gpu & update gpu buffers
-		else
-			-> update gpu buffers
-	*/
+#if 0
+class ResourceBuilder
+{
+public:
 
 	inline bool finalize(ResourcesBundle& output, nbl::video::CThreadSafeQueueAdapter* transferCapableQueue)
 	{
@@ -181,7 +183,6 @@ class ResourceBuilder
 			commandBuffers.front().cmdbuf = commandBuffer;
 		}
 
-		if constexpr (withAssetConverter)
 		{
 			// note that asset converter records basic transfer uploads itself, we only begin the recording with ONE_TIME_SUBMIT_BIT
 			commandBuffer->reset(IGPUCommandBuffer::RESET_FLAGS::RELEASE_RESOURCES_BIT);
@@ -401,43 +402,6 @@ class ResourceBuilder
 				}
 			}
 		}
-		else
-		{
-			auto completed = utilities->getLogicalDevice()->createSemaphore(0u);
-
-			std::array<IQueue::SSubmitInfo::SSemaphoreInfo, 1u> signals;
-			{
-				auto& signal = signals.front();
-				signal.value = 1;
-				signal.stageMask = bitflag(PIPELINE_STAGE_FLAGS::ALL_TRANSFER_BITS);
-				signal.semaphore = completed.get();
-			}
-
-			const IQueue::SSubmitInfo infos [] =
-			{
-				{
-					.waitSemaphores = {},
-					.commandBuffers = commandBuffers, // note that here our command buffer is already recorded!
-					.signalSemaphores = signals
-				}
-			};
-
-			if (transferCapableQueue->submit(infos) != IQueue::RESULT::SUCCESS)
-			{
-				logger->log("Failed to submit transfer upload operations!", ILogger::ELL_ERROR);
-				return false;
-			}
-
-			const ISemaphore::SWaitInfo info [] =
-			{ {
-				.semaphore = completed.get(),
-				.value = 1
-			} };
-
-			utilities->getLogicalDevice()->blockForSemaphores(info);
-
-			static_cast<ResourcesBundle::base_t&>(output) = static_cast<ResourcesBundle::base_t&>(scratch); // scratch has all ready to use allocated gpu resources with uploaded memory so now just assign resources to base output
-		}
 
 		// write the descriptor set
 		{
@@ -468,86 +432,7 @@ class ResourceBuilder
 	}
 
 private:
-	bool createDescriptorSetLayout()
-	{
-		EXPOSE_NABLA_NAMESPACES();
-
-		typename Types::descriptor_set_layout_t::SBinding bindings[] =
-		{
-			{
-				.binding = 0u,
-				.type = IDescriptor::E_TYPE::ET_UNIFORM_BUFFER,
-				.createFlags = Types::descriptor_set_layout_t::SBinding::E_CREATE_FLAGS::ECF_NONE,
-				.stageFlags = IShader::E_SHADER_STAGE::ESS_VERTEX | IShader::E_SHADER_STAGE::ESS_FRAGMENT,
-				.count = 1u,
-			}
-		};
-
-		if constexpr (withAssetConverter)
-			scratch.descriptorSetLayout = make_smart_refctd_ptr<ICPUDescriptorSetLayout>(bindings);
-		else
-			scratch.descriptorSetLayout = utilities->getLogicalDevice()->createDescriptorSetLayout(bindings);
-
-		if (!scratch.descriptorSetLayout)
-		{
-			logger->log("Could not descriptor set layout!", ILogger::ELL_ERROR);
-			return false;
-		}
 
-		return true;
-	}
-
-	bool createDescriptorSet()
-	{
-		EXPOSE_NABLA_NAMESPACES();
-
-		if constexpr (withAssetConverter)
-			scratch.descriptorSet = make_smart_refctd_ptr<ICPUDescriptorSet>(smart_refctd_ptr(scratch.descriptorSetLayout));
-		else
-		{
-			const IGPUDescriptorSetLayout* const layouts[] = { scratch.descriptorSetLayout.get()};
-			const uint32_t setCounts[] = { 1u };
-
-			// note descriptor set has back smart pointer to its pool, so we dont need to keep it explicitly
-			auto pool = utilities->getLogicalDevice()->createDescriptorPoolForDSLayouts(IDescriptorPool::E_CREATE_FLAGS::ECF_NONE, layouts, setCounts);
-
-			if (!pool)
-			{
-				logger->log("Could not create Descriptor Pool!", ILogger::ELL_ERROR);
-				return false;
-			}
-
-			pool->createDescriptorSets(layouts, &scratch.descriptorSet);
-		}
-
-		if (!scratch.descriptorSet)
-		{
-			logger->log("Could not create Descriptor Set!", ILogger::ELL_ERROR);
-			return false;
-		}
-
-		return true;
-	}
-
-	bool createPipelineLayout()
-	{
-		EXPOSE_NABLA_NAMESPACES();
-
-		const std::span<const SPushConstantRange> range = {};
-
-		if constexpr (withAssetConverter)
-			scratch.pipelineLayout = make_smart_refctd_ptr<ICPUPipelineLayout>(range, nullptr, smart_refctd_ptr(scratch.descriptorSetLayout), nullptr, nullptr);
-		else
-			scratch.pipelineLayout = utilities->getLogicalDevice()->createPipelineLayout(range, nullptr, smart_refctd_ptr(scratch.descriptorSetLayout), nullptr, nullptr);
-
-		if (!scratch.pipelineLayout)
-		{
-			logger->log("Could not create pipeline layout!", ILogger::ELL_ERROR);
-			return false;
-		}
-
-		return true;
-	}
 
 	bool createRenderpass()
 	{
@@ -646,8 +531,6 @@ class ResourceBuilder
 
 		if constexpr (withAssetConverter)
 			scratch.renderpass = ICPURenderpass::create(params);
-		else
-			scratch.renderpass = utilities->getLogicalDevice()->createRenderpass(params);
 
 		if (!scratch.renderpass)
 		{
@@ -747,8 +630,6 @@ class ResourceBuilder
 
 				if constexpr (withAssetConverter)
 					outView = make_smart_refctd_ptr<ICPUImageView>(std::move(params));
-				else
-					outView = utilities->getLogicalDevice()->createImageView(std::move(params));
  
 				if (!outView)
 				{
@@ -788,8 +669,6 @@ class ResourceBuilder
 				buffer->setContentHash(buffer->computeContentHash());
 				outShader = std::move(shader);
 			}
-			else
-				outShader = utilities->getLogicalDevice()->createShader(shader.get());
 
 			return outShader;
 		};
@@ -995,41 +874,6 @@ class ResourceBuilder
 		return true;
 	}
 
-	bool createViewParametersUboBuffer()
-	{
-		EXPOSE_NABLA_NAMESPACES();
-
-		using ibuffer_t = ::nbl::asset::IBuffer; // seems to be ambigous, both asset & core namespaces has IBuffer
-		constexpr static auto UboUsage = bitflag(ibuffer_t::EUF_UNIFORM_BUFFER_BIT) | ibuffer_t::EUF_TRANSFER_DST_BIT | ibuffer_t::EUF_INLINE_UPDATE_VIA_CMDBUF;
-
-		if constexpr (withAssetConverter)
-		{
-			auto uboBuffer = ICPUBuffer::create({ sizeof(SBasicViewParameters) });
-			uboBuffer->addUsageFlags(UboUsage);
-			uboBuffer->setContentHash(uboBuffer->computeContentHash());
-			scratch.ubo = { .offset = 0u, .buffer = std::move(uboBuffer) };
-		}
-		else
-		{
-			const auto mask = utilities->getLogicalDevice()->getPhysicalDevice()->getUpStreamingMemoryTypeBits();
-			auto uboBuffer = utilities->getLogicalDevice()->createBuffer(IGPUBuffer::SCreationParams({ .size = sizeof(SBasicViewParameters), .usage = UboUsage }));
-
-			if (!uboBuffer)
-				return false;
-
-			for (auto it : { uboBuffer })
-			{
-				IDeviceMemoryBacked::SDeviceMemoryRequirements reqs = it->getMemoryReqs();
-				reqs.memoryTypeBits &= mask;
-
-				utilities->getLogicalDevice()->allocate(reqs, it.get());
-			}
-
-			scratch.ubo = { .offset = 0u, .buffer = std::move(uboBuffer) };
-		}
-
-		return true;
-	}
 
 	struct GeometriesCpu
 	{
@@ -1099,9 +943,6 @@ class ResourceBuilder
 
 	ResourcesBundleScratch scratch;
 
-	nbl::video::IUtilities* const utilities;
-	nbl::video::IGPUCommandBuffer* const commandBuffer;
-	nbl::system::ILogger* const logger;
 	GeometriesCpu geometries;
 };
 
diff --git a/common/include/nbl/examples/geometry/SPushConstants.hlsl b/common/include/nbl/examples/geometry/SPushConstants.hlsl
new file mode 100644
index 000000000..f02ddea12
--- /dev/null
+++ b/common/include/nbl/examples/geometry/SPushConstants.hlsl
@@ -0,0 +1,33 @@
+#ifndef _NBL_EXAMPLES_S_PUSH_CONSTANTS_HLSL_
+#define _NBL_EXAMPLES_S_PUSH_CONSTANTS_HLSL_
+
+
+#include "nbl/examples/common/SBasicViewParameters.hlsl"
+
+
+namespace nbl
+{
+namespace hlsl
+{
+namespace examples
+{
+namespace geometry_creator_scene
+{
+
+struct SPushConstants
+{
+	SBasicViewParameters basic;
+	uint32_t positionView : 11;
+	uint32_t normalView : 10;
+	uint32_t uvView : 11;
+};
+
+}
+}
+}
+}
+#endif
+
+/*
+	do not remove this text, WAVE is so bad that you can get errors if no proper ending xD
+*/
\ No newline at end of file

From 9f5da5ca6dea5871981f8b014acde070b7372917 Mon Sep 17 00:00:00 2001
From: Przemek <minikers21@gmail.com>
Date: Mon, 16 Jun 2025 18:17:16 +0200
Subject: [PATCH 371/529] Fixed warnings

---
 62_CAD/DrawResourcesFiller.cpp                  | 17 +++++++++++++++--
 62_CAD/main.cpp                                 |  2 +-
 62_CAD/shaders/main_pipeline/dtm.hlsl           | 12 ++++++------
 .../shaders/main_pipeline/fragment_shader.hlsl  |  6 +++---
 4 files changed, 25 insertions(+), 12 deletions(-)

diff --git a/62_CAD/DrawResourcesFiller.cpp b/62_CAD/DrawResourcesFiller.cpp
index cc9e513e8..b40f6585c 100644
--- a/62_CAD/DrawResourcesFiller.cpp
+++ b/62_CAD/DrawResourcesFiller.cpp
@@ -777,8 +777,14 @@ void DrawResourcesFiller::drawGridDTM(
 	uint64_t textureID,
 	const DTMSettingsInfo& dtmSettingsInfo,
 	SIntendedSubmitInfo& intendedNextSubmit,
-	const bool drawGridOnly/* = false*/)
+	bool drawGridOnly/* = false*/)
 {
+	if (dtmSettingsInfo.mode == 0u)
+		return;
+
+	if (dtmSettingsInfo.mode == E_DTM_MODE::OUTLINE)
+		drawGridOnly = true;
+
 	GridDTMInfo gridDTMInfo;
 	gridDTMInfo.topLeft = topLeft;
 	gridDTMInfo.worldSpaceExtents = worldSpaceExtents;
@@ -2338,6 +2344,13 @@ DrawResourcesFiller::ImageAllocateResults DrawResourcesFiller::tryCreateAndAlloc
 		// Try creating the image and allocating memory for it:
 		nbl::video::IGPUImage::SCreationParams params = {};
 		params = imageParams;
+		
+		if (imageViewFormatOverride != asset::E_FORMAT::EF_COUNT && imageViewFormatOverride != imageParams.format)
+		{
+			// TODO: figure out why this crashes the app
+			//params.viewFormats.set(static_cast<size_t>(imageViewFormatOverride), true);
+			params.flags |= asset::IImage::E_CREATE_FLAGS::ECF_MUTABLE_FORMAT_BIT;
+		}
 		auto gpuImage = device->createImage(std::move(params));
 
 		if (gpuImage)
@@ -2368,7 +2381,7 @@ DrawResourcesFiller::ImageAllocateResults DrawResourcesFiller::tryCreateAndAlloc
 						IGPUImageView::SCreationParams viewParams = {
 							.image = gpuImage,
 							.viewType = IGPUImageView::ET_2D,
-							.format = (imageViewFormatOverride == asset::E_FORMAT::EF_COUNT) ? gpuImage->getCreationParameters().format : EF_R32G32B32A32_UINT
+							.format = (imageViewFormatOverride == asset::E_FORMAT::EF_COUNT) ? gpuImage->getCreationParameters().format : imageViewFormatOverride
 						};
 						ret.gpuImageView = device->createImageView(std::move(viewParams));
 						if (ret.gpuImageView)
diff --git a/62_CAD/main.cpp b/62_CAD/main.cpp
index 3195c7964..5cb4082bd 100644
--- a/62_CAD/main.cpp
+++ b/62_CAD/main.cpp
@@ -3654,7 +3654,7 @@ class ComputerAidedDesign final : public examples::SimpleWindowedApplication, pu
 			worldSpaceExtents.y = (heightMapExtent.height - 1) * HeightMapCellWidth;
 			const uint64_t heightMapTextureID = 0ull;
 
-			constexpr bool DrawGridOnly = true;
+			constexpr bool DrawGridOnly = false;
 			
 			if(DrawGridOnly)
 			{
diff --git a/62_CAD/shaders/main_pipeline/dtm.hlsl b/62_CAD/shaders/main_pipeline/dtm.hlsl
index 0aced1b89..68d58c3ad 100644
--- a/62_CAD/shaders/main_pipeline/dtm.hlsl
+++ b/62_CAD/shaders/main_pipeline/dtm.hlsl
@@ -118,14 +118,14 @@ float4 calculateDTMHeightColor(in DTMHeightShadingSettings settings, in float3 v
     if (heightMapSize > 0)
     {
         // partially based on https://www.shadertoy.com/view/XsXSz4 by Inigo Quilez
-        float2 e0 = v[1] - v[0];
-        float2 e1 = v[2] - v[1];
-        float2 e2 = v[0] - v[2];
+        float2 e0 = (v[1] - v[0]).xy;
+        float2 e1 = (v[2] - v[1]).xy;
+        float2 e2 = (v[0] - v[2]).xy;
 
         float triangleAreaSign = -sign(e0.x * e2.y - e0.y * e2.x);
-        float2 v0 = fragPos - v[0];
-        float2 v1 = fragPos - v[1];
-        float2 v2 = fragPos - v[2];
+        float2 v0 = fragPos - v[0].xy;
+        float2 v1 = fragPos - v[1].xy;
+        float2 v2 = fragPos - v[2].xy;
 
         float distanceToLine0 = sqrt(dot2(v0 - e0 * dot(v0, e0) / dot(e0, e0)));
         float distanceToLine1 = sqrt(dot2(v1 - e1 * dot(v1, e1) / dot(e1, e1)));
diff --git a/62_CAD/shaders/main_pipeline/fragment_shader.hlsl b/62_CAD/shaders/main_pipeline/fragment_shader.hlsl
index f91b2ab51..fb8e13673 100644
--- a/62_CAD/shaders/main_pipeline/fragment_shader.hlsl
+++ b/62_CAD/shaders/main_pipeline/fragment_shader.hlsl
@@ -137,7 +137,7 @@ float4 fragMain(PSInput input) : SV_TARGET
         v[1] = input.getScreenSpaceVertexAttribs(1);
         v[2] = input.getScreenSpaceVertexAttribs(2);
 
-        const float3 baryCoord = dtm::calculateDTMTriangleBarycentrics(v[0], v[1], v[2], input.position.xy);
+        const float3 baryCoord = dtm::calculateDTMTriangleBarycentrics(v[0].xy, v[1].xy, v[2].xy, input.position.xy);
         float height = baryCoord.x * v[0].z + baryCoord.y * v[1].z + baryCoord.z * v[2].z;
         float heightDeriv = fwidth(height);
 
@@ -526,9 +526,9 @@ float4 fragMain(PSInput input) : SV_TARGET
                 //outlineLineSegments[1] = nbl::hlsl::shapes::Line<float>::construct(currentTriangleVertices[0].xy, currentTriangleVertices[1].xy);
             }
 
-            const float3 baryCoord = dtm::calculateDTMTriangleBarycentrics(currentTriangleVertices[0], currentTriangleVertices[1], currentTriangleVertices[2], input.position.xy);
+            const float3 baryCoord = dtm::calculateDTMTriangleBarycentrics(currentTriangleVertices[0].xy, currentTriangleVertices[1].xy, currentTriangleVertices[2].xy, input.position.xy);
             float height = baryCoord.x * currentTriangleVertices[0].z + baryCoord.y * currentTriangleVertices[1].z + baryCoord.z * currentTriangleVertices[2].z;
-            float2 heightDeriv = fwidth(height);
+            float heightDeriv = fwidth(height);
 
             const bool outOfBoundsUV = uv.x < 0.0f || uv.y < 0.0f || uv.x > 1.0f || uv.y > 1.0f;
             float4 dtmColor = float4(0.0f, 0.0f, 0.0f, 0.0f);

From 629a0acb6110386e46a3a21fc7d0ece294c3c6d1 Mon Sep 17 00:00:00 2001
From: devsh <devsh@devsh.eu>
Date: Mon, 16 Jun 2025 21:28:10 +0200
Subject: [PATCH 372/529] correct small typos, get stuff to compile

---
 09_GeometryCreator/include/common.hpp               |  2 +-
 09_GeometryCreator/main.cpp                         | 11 ++---------
 .../nbl/examples/geometry/CGeometryCreatorScene.hpp | 13 +++++++------
 3 files changed, 10 insertions(+), 16 deletions(-)

diff --git a/09_GeometryCreator/include/common.hpp b/09_GeometryCreator/include/common.hpp
index 02197171d..d172e1959 100644
--- a/09_GeometryCreator/include/common.hpp
+++ b/09_GeometryCreator/include/common.hpp
@@ -12,6 +12,6 @@ using namespace asset;
 using namespace ui;
 using namespace video;
 using namespace scene;
-using namespace examples;
+using namespace nbl::examples;
 
 #endif // __NBL_THIS_EXAMPLE_COMMON_H_INCLUDED__
\ No newline at end of file
diff --git a/09_GeometryCreator/main.cpp b/09_GeometryCreator/main.cpp
index f246b5c79..af2c0ed93 100644
--- a/09_GeometryCreator/main.cpp
+++ b/09_GeometryCreator/main.cpp
@@ -4,9 +4,9 @@
 
 #include "common.hpp"
 
-class GeometryCreatorApp final : public examples::MonoWindowApplication
+class GeometryCreatorApp final : public MonoWindowApplication
 {
-		using base_t = examples::MonoWindowApplication;
+		using base_t = MonoWindowApplication;
 
 	public:
 		GeometryCreatorApp(const path& _localInputCWD, const path& _localOutputCWD, const path& _sharedInputCWD, const path& _sharedOutputCWD)
@@ -97,13 +97,6 @@ class GeometryCreatorApp final : public examples::MonoWindowApplication
 
 			const auto viewMatrix = camera.getViewMatrix();
 			const auto viewProjectionMatrix = camera.getConcatenatedMatrix();
-
-
-			core::matrix3x4SIMD modelViewMatrix = core::concatenateBFollowedByA(viewMatrix, modelMatrix);
-			core::matrix4SIMD modelViewProjectionMatrix = core::concatenateBFollowedByA(viewProjectionMatrix, modelMatrix);
-
-			core::matrix3x4SIMD normalMatrix;
-			modelViewMatrix.getSub3x3InverseTranspose(normalMatrix);
 #if 0
 			SBasicViewParameters uboData;
 			memcpy(uboData.MVP, modelViewProjectionMatrix.pointer(), sizeof(uboData.MVP));
diff --git a/common/include/nbl/examples/geometry/CGeometryCreatorScene.hpp b/common/include/nbl/examples/geometry/CGeometryCreatorScene.hpp
index e39e536b0..dbe3933d7 100644
--- a/common/include/nbl/examples/geometry/CGeometryCreatorScene.hpp
+++ b/common/include/nbl/examples/geometry/CGeometryCreatorScene.hpp
@@ -19,7 +19,7 @@ namespace nbl::examples
 class CGeometryCreatorScene : public core::IReferenceCounted
 {
 	public:
-		using SPushConstants = hlsl::geometry_creator_scene::SPushConstants;
+		using SPushConstants = hlsl::examples::geometry_creator_scene::SPushConstants;
 		//
 		enum ObjectType : uint8_t
 		{
@@ -134,11 +134,12 @@ using namespace nbl::video
 		{
 			inline SPushConstants convert(const hlsl::float32_t3x4& model, const hlsl::float32_t3x4& view, const hlsl::float32_t4x4& viewProj)
 			{
+				using namespace hlsl;
 				return {
 					.basic = {
-						.MVP = hlsl::math::linalg::promoted_mul(viewProj,model),
-						.MV = hlsl::math::linalg::promoted_mul(view,model),
-						.normalMat = hlsl::inverse(hlsl::transpose(hlsl::float32_t3x3(view)))
+						.MVP = math::linalg::promoted_mul<float32_t,4,4>(viewProj,model),
+						.MV = math::linalg::promoted_mul<float32_t,3,4>(view,model),
+						.normalMat = inverse(transpose(float32_t3x3(view)))
 					},
 					.positionView = positionView,
 					.normalView = normalView,
@@ -152,7 +153,7 @@ using namespace nbl::video
 			uint8_t positionView = 0;
 			uint8_t normalView = 0;
 			uint8_t uvView = 0;
-			uint8_t indexType = EIT_UNKNOWN;
+			uint8_t indexType = asset::EIT_UNKNOWN;
 			ObjectType type : 6 = ObjectType::OT_UNKNOWN;
 		};
 		std::span<const SPackedGeometry> getGeometries() const {return m_params.geoms;}
@@ -160,7 +161,7 @@ using namespace nbl::video
 	protected:
 		struct SInitParams
 		{
-			core::smart_refctd_ptr<IGPUDescriptorSet> ds;
+			core::smart_refctd_ptr<video::IGPUDescriptorSet> ds;
 			core::vector<SPackedGeometry> geoms;
 		} m_params;
 		inline CGeometryCreatorScene(SInitParams&& _params) : m_params(std::move(_params)) {}

From 37330ab2c7c0b4c2e49bdf4d8b4c64c724dd6f74 Mon Sep 17 00:00:00 2001
From: devsh <devsh@devsh.eu>
Date: Mon, 16 Jun 2025 22:40:22 +0200
Subject: [PATCH 373/529] add more test geometries

---
 .../examples/geometry/CGeometryCreatorScene.hpp  | 16 ++++++++++++++--
 1 file changed, 14 insertions(+), 2 deletions(-)

diff --git a/common/include/nbl/examples/geometry/CGeometryCreatorScene.hpp b/common/include/nbl/examples/geometry/CGeometryCreatorScene.hpp
index dbe3933d7..187d97768 100644
--- a/common/include/nbl/examples/geometry/CGeometryCreatorScene.hpp
+++ b/common/include/nbl/examples/geometry/CGeometryCreatorScene.hpp
@@ -109,7 +109,7 @@ using namespace nbl::video
 				auto addGeometry = [&allocateUTB,&init](const ICPUPolygonGeometry* geom)->void
 				{
 					auto& out = init.geoms.emplace_back();
-					out.elementCount = geom->getPrimitiveCount()*geom->getIndexingCallback()->degree();
+					out.elementCount = geom->getVertexReferenceCount();
 					out.positionView = allocateUTB(geom->getPositionView());
 					out.normalView = allocateUTB(geom->getNormalView());
 					// the first view is usually the UV
@@ -118,7 +118,19 @@ using namespace nbl::video
 				};
 
 				auto creator = core::make_smart_refctd_ptr<CGeometryCreator>();
-				addGeometry(creator->createCube().get());
+				/* TODO: others
+				ReferenceObjectCpu {.meta = {.type = OT_CUBE, .name = "Cube Mesh" }, .shadersType = GP_BASIC, .data = gc->createCubeMesh(nbl::core::vector3df(1.f, 1.f, 1.f)) },
+				ReferenceObjectCpu {.meta = {.type = OT_SPHERE, .name = "Sphere Mesh" }, .shadersType = GP_BASIC, .data = gc->createSphereMesh(2, 16, 16) },
+				ReferenceObjectCpu {.meta = {.type = OT_CYLINDER, .name = "Cylinder Mesh" }, .shadersType = GP_BASIC, .data = gc->createCylinderMesh(2, 2, 20) },
+				ReferenceObjectCpu {.meta = {.type = OT_RECTANGLE, .name = "Rectangle Mesh" }, .shadersType = GP_BASIC, .data = gc->createRectangleMesh(nbl::core::vector2df_SIMD(1.5, 3)) },
+				ReferenceObjectCpu {.meta = {.type = OT_DISK, .name = "Disk Mesh" }, .shadersType = GP_BASIC, .data = gc->createDiskMesh(2, 30) },
+				ReferenceObjectCpu {.meta = {.type = OT_ARROW, .name = "Arrow Mesh" }, .shadersType = GP_BASIC, .data = gc->createArrowMesh() },
+				ReferenceObjectCpu {.meta = {.type = OT_CONE, .name = "Cone Mesh" }, .shadersType = GP_CONE, .data = gc->createConeMesh(2, 3, 10) },
+				ReferenceObjectCpu {.meta = {.type = OT_ICOSPHERE, .name = "Icoshpere Mesh" }, .shadersType = GP_ICO, .data = gc->createIcoSphere(1, 3, true) }
+				*/
+				addGeometry(creator->createCube({1.f,1.f,1.f}).get());
+				addGeometry(creator->createRectangle({1.5f,3.f}).get());
+				addGeometry(creator->createDisk(2.f,30).get());
 			}
 
 			// convert the geometries

From 5200ea1cc42f94848f7daef4e70aaa4014743bce Mon Sep 17 00:00:00 2001
From: devsh <devsh@devsh.eu>
Date: Mon, 16 Jun 2025 23:39:45 +0200
Subject: [PATCH 374/529] fire up the converter and handle ownership between
 queues

---
 09_GeometryCreator/main.cpp                   |   8 +-
 .../geometry/CGeometryCreatorScene.hpp        | 470 +++++-------------
 2 files changed, 124 insertions(+), 354 deletions(-)

diff --git a/09_GeometryCreator/main.cpp b/09_GeometryCreator/main.cpp
index af2c0ed93..2e31c90dd 100644
--- a/09_GeometryCreator/main.cpp
+++ b/09_GeometryCreator/main.cpp
@@ -39,14 +39,16 @@ class GeometryCreatorApp final : public MonoWindowApplication
 
 //			auto scRes = static_cast<CDefaultSwapchainFramebuffers*>(m_surface->getSwapchainResources());
 //			.renderpass = core::smart_refctd_ptr<video::IGPURenderpass>(scRes->getRenderpass())
+			const uint32_t addtionalBufferOwnershipFamilies[] = {getGraphicsQueue()->getFamilyIndex()};
 			auto scene = CGeometryCreatorScene::create({
-				.utilities = m_utils,
-				.logger = m_logger
+				.transferQueue = getTransferUpQueue(),
+				.utilities = m_utils.get(),
+				.logger = m_logger.get(),
+				.addtionalBufferOwnershipFamilies = addtionalBufferOwnershipFamilies
 			});
 #if 0
 			//using Builder = typename CScene::CreateResourcesDirectlyWithDevice::Builder;
 			using Builder = typename CScene::CreateResourcesWithAssetConverter::Builder;
-			auto oneRunCmd = CScene::createCommandBuffer(m_utils->getLogicalDevice(), m_utils->getLogger(), gQueue->getFamilyIndex());
 			Builder builder(m_utils.get(), oneRunCmd.get(), m_logger.get(), geometry);
 
 			// gpu resources
diff --git a/common/include/nbl/examples/geometry/CGeometryCreatorScene.hpp b/common/include/nbl/examples/geometry/CGeometryCreatorScene.hpp
index 187d97768..dd462c03c 100644
--- a/common/include/nbl/examples/geometry/CGeometryCreatorScene.hpp
+++ b/common/include/nbl/examples/geometry/CGeometryCreatorScene.hpp
@@ -44,20 +44,26 @@ using namespace nbl::video
 		//
 		struct SCreateParams
 		{
-			core::smart_refctd_ptr<video::IUtilities> utilities;
-			core::smart_refctd_ptr<system::ILogger> logger;
+			video::IQueue* transferQueue;
+			video::IUtilities* utilities;
+			system::ILogger* logger;
+			std::span<const uint32_t> addtionalBufferOwnershipFamilies = {};
 		};
 		static inline core::smart_refctd_ptr<CGeometryCreatorScene> create(SCreateParams&& params)
 		{
 			EXPOSE_NABLA_NAMESPACES;
-			auto* logger = params.logger.get();
+			auto* logger = params.logger;
 			assert(logger);
+			if (!params.transferQueue)
+			{
+				logger->log("Pass a non-null `IQueue* transferQueue`!",ILogger::ELL_ERROR);
+				return nullptr;
+			}
 			if (!params.utilities)
 			{
-				logger->log("Pass a non-null `IUtilities`!",ILogger::ELL_ERROR);
+				logger->log("Pass a non-null `IUtilities* utilities`!",ILogger::ELL_ERROR);
 				return nullptr;
 			}
-			auto device = params.utilities->getLogicalDevice();
 
 			constexpr auto DescriptorCount = 255;
 			smart_refctd_ptr<ICPUDescriptorSet> cpuDS;
@@ -94,6 +100,8 @@ using namespace nbl::video
 			}
 
 			SInitParams init;
+			constexpr size_t NoIndexBufferMarker = 0xdeadbeefBADC0FFEull;
+			core::vector<smart_refctd_ptr<ICPUBuffer>> indexBuffers;
 			// create out geometries
 			{
 				auto* const outDescs = cpuDS->getDescriptorInfoStorage(IDescriptor::E_TYPE::ET_UNIFORM_TEXEL_BUFFER).data();
@@ -106,9 +114,16 @@ using namespace nbl::video
 					return nextDesc++;
 				};
 
-				auto addGeometry = [&allocateUTB,&init](const ICPUPolygonGeometry* geom)->void
+				auto addGeometry = [&allocateUTB,&indexBuffers,&init](const ICPUPolygonGeometry* geom)->void
 				{
 					auto& out = init.geoms.emplace_back();
+					if (const auto& view=geom->getIndexView(); view)
+					{
+						out.indexBuffer.offset = view.src.offset;
+						indexBuffers.push_back(view.src.buffer);
+					}
+					else
+						out.indexBuffer.offset = NoIndexBufferMarker;
 					out.elementCount = geom->getVertexReferenceCount();
 					out.positionView = allocateUTB(geom->getPositionView());
 					out.normalView = allocateUTB(geom->getNormalView());
@@ -135,7 +150,103 @@ using namespace nbl::video
 
 			// convert the geometries
 			{
-				init.ds = nullptr;
+				auto device = params.utilities->getLogicalDevice();
+				smart_refctd_ptr<CAssetConverter> converter = CAssetConverter::create({.device=device});
+
+
+				const auto transferFamily = params.transferQueue->getFamilyIndex();
+
+				struct SInputs : CAssetConverter::SInputs
+				{
+					virtual inline std::span<const uint32_t> getSharedOwnershipQueueFamilies(const size_t groupCopyID, const asset::ICPUBuffer* buffer, const CAssetConverter::patch_t<asset::ICPUBuffer>& patch) const
+					{
+						return sharedBufferOwnership;
+					}
+
+					core::vector<uint32_t> sharedBufferOwnership;
+				} inputs = {};
+				{
+					inputs.logger = logger;
+					// descriptor set should convert everthing downstream
+					std::get<CAssetConverter::SInputs::asset_span_t<ICPUDescriptorSet>>(inputs.assets) = {&cpuDS.get(),1};
+					// except index buffers
+					if (!indexBuffers.empty())
+						std::get<CAssetConverter::SInputs::asset_span_t<ICPUBuffer>>(inputs.assets) = {&indexBuffers.front().get(),indexBuffers.size()};
+					// set up shared ownership so we don't have to 
+					core::unordered_set<uint32_t> families;
+					families.insert(transferFamily);
+					families.insert(params.addtionalBufferOwnershipFamilies.begin(),params.addtionalBufferOwnershipFamilies.end());
+					if (families.size()>1)
+					for (const auto fam : families)
+						inputs.sharedBufferOwnership.push_back(fam);
+				}
+				
+				// reserve
+				auto reservation = converter->reserve(inputs);
+				if (!reservation)
+				{
+					logger->log("Failed to reserve GPU objects for CPU->GPU conversion!",ILogger::ELL_ERROR);
+					return nullptr;
+				}
+
+				// convert
+				{
+					auto semaphore = device->createSemaphore(0u);
+
+					constexpr auto MultiBuffering = 2;
+					std::array<smart_refctd_ptr<IGPUCommandBuffer>,MultiBuffering> commandBuffers = {};
+					{
+						auto pool = device->createCommandPool(transferFamily,IGPUCommandPool::CREATE_FLAGS::RESET_COMMAND_BUFFER_BIT|IGPUCommandPool::CREATE_FLAGS::TRANSIENT_BIT);
+						pool->createCommandBuffers(IGPUCommandPool::BUFFER_LEVEL::PRIMARY,commandBuffers,smart_refctd_ptr<ILogger>(logger));
+					}
+					commandBuffers.front()->begin(IGPUCommandBuffer::USAGE::ONE_TIME_SUBMIT_BIT);
+
+					std::array<IQueue::SSubmitInfo::SCommandBufferInfo,MultiBuffering> commandBufferSubmits;
+					for (auto i=0; i<MultiBuffering; i++)
+						commandBufferSubmits[i].cmdbuf = commandBuffers[i].get();
+
+					SIntendedSubmitInfo transfer = {};
+					transfer.queue = params.transferQueue;
+					transfer.scratchCommandBuffers = commandBufferSubmits;
+					transfer.scratchSemaphore = {
+						.semaphore = semaphore.get(),
+						.value = 0u,
+						.stageMask = PIPELINE_STAGE_FLAGS::ALL_TRANSFER_BITS
+					};
+
+					CAssetConverter::SConvertParams cpar = {};
+					cpar.utilities = params.utilities;
+					cpar.transfer = &transfer;
+
+					// basically it records all data uploads and submits them right away
+					auto future = reservation.convert(cpar);
+					if (future.copy()!=IQueue::RESULT::SUCCESS)
+					{
+						logger->log("Failed to await submission feature!", ILogger::ELL_ERROR);
+						return nullptr;
+					}
+				}
+
+				// assign outputs
+				{
+					auto assign = [logger](auto& out, const auto& in)->bool
+					{
+						if (!in.value)
+						{
+							logger->log("Failed to convert CPU object to GPU!",ILogger::ELL_ERROR);
+							return false;
+						}
+						out = in.value;
+						return true;
+					};
+					if (!assign(init.ds,reservation.getGPUObjects<ICPUDescriptorSet>().front()))
+						return nullptr;
+					auto indexBufIt = reservation.getGPUObjects<ICPUBuffer>().data();
+					for (auto& entry : init.geoms)
+					if (entry.indexBuffer.offset!=NoIndexBufferMarker)
+					if (!assign(entry.indexBuffer.buffer,*(indexBufIt++)))
+						return nullptr;
+				}
 			}
 
 			return smart_refctd_ptr<CGeometryCreatorScene>(new CGeometryCreatorScene(std::move(init)),dont_grab);
@@ -159,7 +270,7 @@ using namespace nbl::video
 				};
 			}
 
-			core::smart_refctd_ptr<video::IGPUBuffer> indexBuffer = nullptr;
+			asset::SBufferBinding<video::IGPUBuffer> indexBuffer = {};
 			uint32_t elementCount = 0;
 			// indices into the descriptor set
 			uint8_t positionView = 0;
@@ -184,266 +295,6 @@ using namespace nbl::video
 #if 0
 class ResourceBuilder
 {
-public:
-
-	inline bool finalize(ResourcesBundle& output, nbl::video::CThreadSafeQueueAdapter* transferCapableQueue)
-	{
-		EXPOSE_NABLA_NAMESPACES();
-
-		// TODO: use multiple command buffers
-		std::array<IQueue::SSubmitInfo::SCommandBufferInfo,1u> commandBuffers = {};
-		{
-			commandBuffers.front().cmdbuf = commandBuffer;
-		}
-
-		{
-			// note that asset converter records basic transfer uploads itself, we only begin the recording with ONE_TIME_SUBMIT_BIT
-			commandBuffer->reset(IGPUCommandBuffer::RESET_FLAGS::RELEASE_RESOURCES_BIT);
-			commandBuffer->begin(IGPUCommandBuffer::USAGE::ONE_TIME_SUBMIT_BIT);
-			commandBuffer->beginDebugMarker("Resources builder's buffers upload [asset converter]");
-
-			// asset converter - scratch at this point has ready to convert cpu resources
-			smart_refctd_ptr<CAssetConverter> converter = CAssetConverter::create({ .device = utilities->getLogicalDevice(),.optimizer = {} });
-			CAssetConverter::SInputs inputs = {};
-			inputs.logger = logger;
-
-			struct ProxyCpuHooks
-			{
-				using object_size_t = std::tuple_size<decltype(scratch.objects)>;
-
-				std::array<ICPURenderpass*, 1u> renderpass;
-				std::array<ICPUGraphicsPipeline*, object_size_t::value> pipelines;
-				std::array<ICPUBuffer*, object_size_t::value * 2u + 1u > buffers;
-				std::array<ICPUImageView*, 2u> attachments;
-				std::array<ICPUDescriptorSet*, 1u> descriptorSet;
-			} hooks;
-
-			enum AttachmentIx
-			{
-				AI_COLOR = 0u,
-				AI_DEPTH = 1u,
-
-				AI_COUNT
-			};
-			
-			// gather CPU assets into span memory views
-			{ 
-				hooks.renderpass.front() = scratch.renderpass.get();
-				for (uint32_t i = 0u; i < hooks.pipelines.size(); ++i)
-				{
-					auto& [reference, meta] = scratch.objects[static_cast<ObjectType>(i)];
-					hooks.pipelines[i] = reference.pipeline.get();
-
-					// [[ [vertex, index] [vertex, index] [vertex, index] ... [ubo] ]]
-					hooks.buffers[2u * i + 0u] = reference.bindings.vertex.buffer.get();
-					hooks.buffers[2u * i + 1u] = reference.bindings.index.buffer.get();
-				}
-				hooks.buffers.back() = scratch.ubo.buffer.get();
-				hooks.attachments[AI_COLOR] = scratch.attachments.color.get();
-				hooks.attachments[AI_DEPTH] = scratch.attachments.depth.get();
-				hooks.descriptorSet.front() = scratch.descriptorSet.get();
-			}
-
-			// assign the CPU hooks to converter's inputs
-			{
-				std::get<CAssetConverter::SInputs::asset_span_t<ICPURenderpass>>(inputs.assets) = hooks.renderpass;
-				std::get<CAssetConverter::SInputs::asset_span_t<ICPUGraphicsPipeline>>(inputs.assets) = hooks.pipelines;
-				std::get<CAssetConverter::SInputs::asset_span_t<ICPUBuffer>>(inputs.assets) = hooks.buffers;
-				// std::get<CAssetConverter::SInputs::asset_span_t<ICPUImageView>>(inputs.assets) = hooks.attachments; // NOTE: THIS IS NOT IMPLEMENTED YET IN CONVERTER!
-				std::get<CAssetConverter::SInputs::asset_span_t<ICPUDescriptorSet>>(inputs.assets) = hooks.descriptorSet;
-			}
-
-			// reserve and create the GPU object handles
-			auto reservation = converter->reserve(inputs);
-			{
-				auto prepass = [&]<typename asset_type_t>(const auto& references) -> bool
-				{
-					// retrieve the reserved handles
-					auto objects = reservation.getGPUObjects<asset_type_t>();
-
-					uint32_t counter = {};
-					for (auto& object : objects)
-					{
-						// anything that fails to be reserved is a nullptr in the span of GPU Objects
-						auto gpu = object.value;
-						auto* reference = references[counter];
-
-						if (reference)
-						{
-							// validate
-							if (!gpu) // throw errors only if corresponding cpu hook was VALID (eg. we may have nullptr for some index buffers in the span for converter but it's OK, I'm too lazy to filter them before passing to the converter inputs and don't want to deal with dynamic alloc)
-							{
-								logger->log("Failed to convert a CPU object to GPU!", ILogger::ELL_ERROR);
-								return false;
-							}
-						}
-						
-						++counter;
-					}
-
-					return true;
-				};
-				
-				prepass.template operator() < ICPURenderpass > (hooks.renderpass);
-				prepass.template operator() < ICPUGraphicsPipeline > (hooks.pipelines);
-				prepass.template operator() < ICPUBuffer > (hooks.buffers);
-				// validate.template operator() < ICPUImageView > (hooks.attachments);
-				prepass.template operator() < ICPUDescriptorSet > (hooks.descriptorSet);
-			}
-
-			auto semaphore = utilities->getLogicalDevice()->createSemaphore(0u);
-
-			// TODO: compute submit as well for the images' mipmaps
-			SIntendedSubmitInfo transfer = {};
-			transfer.queue = transferCapableQueue;
-			transfer.scratchCommandBuffers = commandBuffers;
-			transfer.scratchSemaphore = {
-				.semaphore = semaphore.get(),
-				.value = 0u,
-				.stageMask = PIPELINE_STAGE_FLAGS::ALL_TRANSFER_BITS
-			};
-			// issue the convert call
-			{
-				CAssetConverter::SConvertParams params = {};
-				params.utilities = utilities;
-				params.transfer = &transfer;
-
-				// basically it records all data uploads and submits them right away
-				auto future = reservation.convert(params);
-				if (future.copy()!=IQueue::RESULT::SUCCESS)
-				{
-					logger->log("Failed to await submission feature!", ILogger::ELL_ERROR);
-					return false;
-				}
-
-				// assign gpu objects to output
-				auto& base = static_cast<ResourcesBundle::base_t&>(output);
-				{
-					auto&& [renderpass, pipelines, buffers, descriptorSet] = std::make_tuple(reservation.getGPUObjects<ICPURenderpass>().front().value, reservation.getGPUObjects<ICPUGraphicsPipeline>(), reservation.getGPUObjects<ICPUBuffer>(), reservation.getGPUObjects<ICPUDescriptorSet>().front().value);
-					{
-						base.renderpass = renderpass;
-						for (uint32_t i = 0u; i < pipelines.size(); ++i)
-						{
-							const auto type = static_cast<ObjectType>(i);
-							const auto& [rcpu, rmeta] = scratch.objects[type];
-							auto& [gpu, meta] = base.objects[type];
-
-							gpu.pipeline = pipelines[i].value;
-							// [[ [vertex, index] [vertex, index] [vertex, index] ... [ubo] ]]
-							gpu.bindings.vertex = {.offset = 0u, .buffer = buffers[2u * i + 0u].value};
-							gpu.bindings.index = {.offset = 0u, .buffer = buffers[2u * i + 1u].value};
-
-							gpu.indexCount = rcpu.indexCount;
-							gpu.indexType = rcpu.indexType;
-							meta.name = rmeta.name;
-							meta.type = rmeta.type;
-						}
-						base.ubo = {.offset = 0u, .buffer = buffers.back().value};
-						base.descriptorSet = descriptorSet;
-					
-						/*
-							// base.attachments.color = attachments[AI_COLOR].value;
-							// base.attachments.depth = attachments[AI_DEPTH].value;
-
-							note conversion of image views is not yet supported by the asset converter 
-							- it's complicated, we have to kinda temporary ignore DRY a bit here to not break the design which is correct
-
-							TEMPORARY: we patch attachments by allocating them ourselves here given cpu instances & parameters
-							TODO: remove following code once asset converter works with image views & update stuff
-						*/
-
-						for (uint32_t i = 0u; i < AI_COUNT; ++i)
-						{
-							const auto* reference = hooks.attachments[i];
-							auto& out = (i == AI_COLOR ? base.attachments.color : base.attachments.depth);
-
-							const auto& viewParams = reference->getCreationParameters();
-							const auto& imageParams = viewParams.image->getCreationParameters();
-
-							auto image = utilities->getLogicalDevice()->createImage
-							(
-								IGPUImage::SCreationParams
-								({
-									.type = imageParams.type,
-									.samples = imageParams.samples,
-									.format = imageParams.format,
-									.extent = imageParams.extent,
-									.mipLevels = imageParams.mipLevels,
-									.arrayLayers = imageParams.arrayLayers,
-									.usage = imageParams.usage
-								})
-							);
-
-							if (!image)
-							{
-								logger->log("Could not create image!", ILogger::ELL_ERROR);
-								return false;
-							}
-
-							bool IS_DEPTH = isDepthOrStencilFormat(imageParams.format);
-							std::string_view DEBUG_NAME = IS_DEPTH ? "UI Scene Depth Attachment Image" : "UI Scene Color Attachment Image";
-							image->setObjectDebugName(DEBUG_NAME.data());
-
-							if (!utilities->getLogicalDevice()->allocate(image->getMemoryReqs(), image.get()).isValid())
-							{
-								logger->log("Could not allocate memory for an image!", ILogger::ELL_ERROR);
-								return false;
-							}
-						
-							out = utilities->getLogicalDevice()->createImageView
-							(
-								IGPUImageView::SCreationParams
-								({
-									.flags = viewParams.flags,
-									.subUsages = viewParams.subUsages,
-									.image = std::move(image),
-									.viewType = viewParams.viewType,
-									.format = viewParams.format,
-									.subresourceRange = viewParams.subresourceRange
-								})
-							);
-
-							if (!out)
-							{
-								logger->log("Could not create image view!", ILogger::ELL_ERROR);
-								return false;
-							}
-						}
-
-						logger->log("Image View attachments has been allocated by hand after asset converter successful submit becasuse it doesn't support converting them yet!", ILogger::ELL_WARNING);
-					}
-				}
-			}
-		}
-
-		// write the descriptor set
-		{
-			// descriptor write ubo
-			IGPUDescriptorSet::SWriteDescriptorSet write;
-			write.dstSet = output.descriptorSet.get();
-			write.binding = 0;
-			write.arrayElement = 0u;
-			write.count = 1u;
-
-			IGPUDescriptorSet::SDescriptorInfo info;
-			{
-				info.desc = smart_refctd_ptr(output.ubo.buffer);
-				info.info.buffer.offset = output.ubo.offset;
-				info.info.buffer.size = output.ubo.buffer->getSize();
-			}
-
-			write.info = &info;
-
-			if(!utilities->getLogicalDevice()->updateDescriptorSets(1u, &write, 0u, nullptr))
-			{
-				logger->log("Could not write descriptor set!", ILogger::ELL_ERROR);
-				return false;
-			}
-		}
-
-		return true;
-	}
-
 private:
 
 
@@ -988,49 +839,6 @@ class CScene final : public nbl::core::IReferenceCounted
 		nbl::core::smart_refctd_ptr<nbl::video::ISemaphore> progress;
 	} semaphore;
 
-	struct CreateResourcesDirectlyWithDevice { using Builder = ResourceBuilder<false>; };
-	struct CreateResourcesWithAssetConverter { using Builder = ResourceBuilder<true>; };
-
-	~CScene() {}
-
-	static inline nbl::core::smart_refctd_ptr<nbl::video::IGPUCommandBuffer> createCommandBuffer(nbl::video::ILogicalDevice* const device, nbl::system::ILogger* const logger, const uint32_t familyIx)
-	{
-		EXPOSE_NABLA_NAMESPACES();
-		auto pool = device->createCommandPool(familyIx, IGPUCommandPool::CREATE_FLAGS::RESET_COMMAND_BUFFER_BIT);
-
-		if (!pool)
-		{
-			logger->log("Couldn't create Command Pool!", ILogger::ELL_ERROR);
-			return nullptr;
-		}
-
-		nbl::core::smart_refctd_ptr<nbl::video::IGPUCommandBuffer> cmd;
-
-		if (!pool->createCommandBuffers(IGPUCommandPool::BUFFER_LEVEL::PRIMARY, { &cmd , 1 }))
-		{
-			logger->log("Couldn't create Command Buffer!", ILogger::ELL_ERROR);
-			return nullptr;
-		}
-
-		return cmd;
-	}
-
-	template<typename CreateWith, typename... Args>
-	static auto create(Args&&... args) -> decltype(auto)
-	{
-		EXPOSE_NABLA_NAMESPACES();
-
-		/*
-			user should call the constructor's args without last argument explicitly, this is a trick to make constructor templated, 
-			eg.create(smart_refctd_ptr(device), smart_refctd_ptr(logger), queuePointer, geometryPointer)
-		*/
-
-		auto* scene = new CScene(std::forward<Args>(args)..., CreateWith {});
-		smart_refctd_ptr<CScene> smart(scene, dont_grab);
-
-		return smart;
-	}
-
 	inline void begin()
 	{
 		EXPOSE_NABLA_NAMESPACES();
@@ -1109,46 +917,6 @@ class CScene final : public nbl::core::IReferenceCounted
 		m_commandBuffer->end();
 	}
 
-	inline bool submit()
-	{
-		EXPOSE_NABLA_NAMESPACES();
-
-		const IQueue::SSubmitInfo::SCommandBufferInfo buffers[] =
-		{
-			{ .cmdbuf = m_commandBuffer.get() }
-		};
-
-		const IQueue::SSubmitInfo::SSemaphoreInfo signals[] = { {.semaphore = semaphore.progress.get(),.value = semaphore.finishedValue,.stageMask = PIPELINE_STAGE_FLAGS::FRAMEBUFFER_SPACE_BITS} };
-
-		const IQueue::SSubmitInfo infos[] =
-		{
-			{
-				.waitSemaphores = {},
-				.commandBuffers = buffers,
-				.signalSemaphores = signals
-			}
-		};
-
-		return queue->submit(infos) == IQueue::RESULT::SUCCESS;
-	}
-
-	// note: must be updated outside render pass
-	inline void update()
-	{
-		EXPOSE_NABLA_NAMESPACES();
-
-		SBufferRange<IGPUBuffer> range;
-		range.buffer = smart_refctd_ptr(resources.ubo.buffer);
-		range.size = resources.ubo.buffer->getSize();
-
-		m_commandBuffer->updateBuffer(range, &object.viewParameters);
-	}
-
-	inline decltype(auto) getResources()
-	{
-		return (resources); // note: do not remove "()" - it makes the return type lvalue reference instead of copy 
-	}
-
 private:
 	template<typename CreateWith = CreateResourcesDirectlyWithDevice> // TODO: enforce constraints, only those 2 above are valid
 	CScene(nbl::core::smart_refctd_ptr<nbl::video::IUtilities> _utilities, nbl::core::smart_refctd_ptr<nbl::system::ILogger> _logger, nbl::video::CThreadSafeQueueAdapter* _graphicsQueue, const nbl::asset::IGeometryCreator* _geometryCreator, CreateWith createWith = {})

From a89ffbef02366cb06931ac3cbc9a4b05b8c15155 Mon Sep 17 00:00:00 2001
From: keptsecret <sorchon@gmail.com>
Date: Tue, 17 Jun 2025 15:43:04 +0700
Subject: [PATCH 375/529] make accessor template nicer to read

---
 .../app_resources/testWorkgroup.comp.hlsl     |  2 +-
 .../benchmarkWorkgroup.comp.hlsl              | 14 ++++++-------
 common/include/WorkgroupDataAccessors.hlsl    | 20 +++++++++----------
 3 files changed, 18 insertions(+), 18 deletions(-)

diff --git a/23_Arithmetic2UnitTest/app_resources/testWorkgroup.comp.hlsl b/23_Arithmetic2UnitTest/app_resources/testWorkgroup.comp.hlsl
index 4b30526a6..38e8b250f 100644
--- a/23_Arithmetic2UnitTest/app_resources/testWorkgroup.comp.hlsl
+++ b/23_Arithmetic2UnitTest/app_resources/testWorkgroup.comp.hlsl
@@ -30,7 +30,7 @@ struct operation_t
     // workgroup scans do no return anything, but use the data accessor to do the storing directly
     void operator()()
     {
-        using data_proxy_t = PreloadedDataProxy<config_t::WorkgroupSizeLog2,config_t::ItemsPerInvocation_0,config_t::VirtualWorkgroupSize/config_t::WorkgroupSize>;
+        using data_proxy_t = PreloadedDataProxy<config_t::WorkgroupSizeLog2,config_t::VirtualWorkgroupSize,config_t::ItemsPerInvocation_0>;
         data_proxy_t dataAccessor = data_proxy_t::create(pc.pInputBuf, pc.pOutputBuf[Binop::BindingIndex]);
         dataAccessor.preload();
 #if IS_REDUCTION
diff --git a/29_Arithmetic2Bench/app_resources/benchmarkWorkgroup.comp.hlsl b/29_Arithmetic2Bench/app_resources/benchmarkWorkgroup.comp.hlsl
index 561aadc56..50a9d912b 100644
--- a/29_Arithmetic2Bench/app_resources/benchmarkWorkgroup.comp.hlsl
+++ b/29_Arithmetic2Bench/app_resources/benchmarkWorkgroup.comp.hlsl
@@ -19,18 +19,18 @@ groupshared uint32_t scratch[mpl::max_v<int16_t,config_t::SharedScratchElementCo
 
 #include "../../common/include/WorkgroupDataAccessors.hlsl"
 
-template<uint16_t WorkgroupSizeLog2, uint16_t ItemsPerInvocation, uint16_t _PreloadedDataCount>
+template<uint16_t WorkgroupSizeLog2, uint16_t VirtualWorkgroupSize, uint16_t ItemsPerInvocation>
 struct RandomizedInputDataProxy
 {
     using dtype_t = vector<uint32_t, ItemsPerInvocation>;
 
-    NBL_CONSTEXPR_STATIC_INLINE uint16_t PreloadedDataCount = _PreloadedDataCount;
     NBL_CONSTEXPR_STATIC_INLINE uint16_t WorkgroupSize = uint16_t(1u) << WorkgroupSizeLog2;
+    NBL_CONSTEXPR_STATIC_INLINE uint16_t PreloadedDataCount = VirtualWorkgroupSize / WorkgroupSize;
 
-    static RandomizedInputDataProxy<WorkgroupSizeLog2, ItemsPerInvocation, PreloadedDataCount> create(uint64_t inputBuf, uint64_t outputBuf)
+    static RandomizedInputDataProxy<WorkgroupSizeLog2, VirtualWorkgroupSize, ItemsPerInvocation> create(uint64_t inputBuf, uint64_t outputBuf)
     {
-        RandomizedInputDataProxy<WorkgroupSizeLog2, ItemsPerInvocation, PreloadedDataCount> retval;
-        retval.data = DataProxy<WorkgroupSize*PreloadedDataCount, ItemsPerInvocation>::create(inputBuf, outputBuf);
+        RandomizedInputDataProxy<WorkgroupSizeLog2, VirtualWorkgroupSize, ItemsPerInvocation> retval;
+        retval.data = DataProxy<VirtualWorkgroupSize, ItemsPerInvocation>::create(inputBuf, outputBuf);
         return retval;
     }
 
@@ -69,13 +69,13 @@ struct RandomizedInputDataProxy
         //glsl::memoryBarrierShared(); implied by the above
     }
 
-    DataProxy<WorkgroupSize*PreloadedDataCount, ItemsPerInvocation> data;
+    DataProxy<VirtualWorkgroupSize, ItemsPerInvocation> data;
     dtype_t preloaded[PreloadedDataCount];
 };
 
 static ScratchProxy arithmeticAccessor;
 
-using data_proxy_t = RandomizedInputDataProxy<config_t::WorkgroupSizeLog2,config_t::ItemsPerInvocation_0,config_t::VirtualWorkgroupSize/config_t::WorkgroupSize>;
+using data_proxy_t = RandomizedInputDataProxy<config_t::WorkgroupSizeLog2,config_t::VirtualWorkgroupSize,config_t::ItemsPerInvocation_0>;
 
 template<class Binop, class device_capabilities>
 struct operation_t
diff --git a/common/include/WorkgroupDataAccessors.hlsl b/common/include/WorkgroupDataAccessors.hlsl
index e1774fad6..a274f5c08 100644
--- a/common/include/WorkgroupDataAccessors.hlsl
+++ b/common/include/WorkgroupDataAccessors.hlsl
@@ -31,15 +31,15 @@ struct ScratchProxy
     }
 };
 
-template<uint16_t WorkgroupSize, uint16_t ItemsPerInvocation>
+template<uint16_t VirtualWorkgroupSize, uint16_t ItemsPerInvocation>
 struct DataProxy
 {
     using dtype_t = vector<uint32_t, ItemsPerInvocation>;
 
-    static DataProxy<WorkgroupSize, ItemsPerInvocation> create(uint64_t inputBuf, uint64_t outputBuf)
+    static DataProxy<VirtualWorkgroupSize, ItemsPerInvocation> create(const uint64_t inputBuf, const uint64_t outputBuf)
     {
-        DataProxy<WorkgroupSize, ItemsPerInvocation> retval;
-        retval.workgroupOffset = glsl::gl_WorkGroupID().x * WorkgroupSize;
+        DataProxy<VirtualWorkgroupSize, ItemsPerInvocation> retval;
+        retval.workgroupOffset = glsl::gl_WorkGroupID().x * VirtualWorkgroupSize;
         retval.inputBufAddr = inputBuf;
         retval.outputBufAddr = outputBuf;
         return retval;
@@ -67,18 +67,18 @@ struct DataProxy
     uint64_t outputBufAddr;
 };
 
-template<uint16_t WorkgroupSizeLog2, uint16_t ItemsPerInvocation, uint16_t _PreloadedDataCount>
+template<uint16_t WorkgroupSizeLog2, uint16_t VirtualWorkgroupSize, uint16_t ItemsPerInvocation>
 struct PreloadedDataProxy
 {
     using dtype_t = vector<uint32_t, ItemsPerInvocation>;
 
-    NBL_CONSTEXPR_STATIC_INLINE uint16_t PreloadedDataCount = _PreloadedDataCount;
     NBL_CONSTEXPR_STATIC_INLINE uint16_t WorkgroupSize = uint16_t(1u) << WorkgroupSizeLog2;
+    NBL_CONSTEXPR_STATIC_INLINE uint16_t PreloadedDataCount = VirtualWorkgroupSize / WorkgroupSize;
 
-    static PreloadedDataProxy<WorkgroupSizeLog2, ItemsPerInvocation, PreloadedDataCount> create(uint64_t inputBuf, uint64_t outputBuf)
+    static PreloadedDataProxy<WorkgroupSizeLog2, VirtualWorkgroupSize, ItemsPerInvocation> create(const uint64_t inputBuf, const uint64_t outputBuf)
     {
-        PreloadedDataProxy<WorkgroupSizeLog2, ItemsPerInvocation, PreloadedDataCount> retval;
-        retval.data = DataProxy<WorkgroupSize*PreloadedDataCount, ItemsPerInvocation>::create(inputBuf, outputBuf);
+        PreloadedDataProxy<WorkgroupSizeLog2, VirtualWorkgroupSize, ItemsPerInvocation> retval;
+        retval.data = DataProxy<VirtualWorkgroupSize, ItemsPerInvocation>::create(inputBuf, outputBuf);
         return retval;
     }
 
@@ -114,7 +114,7 @@ struct PreloadedDataProxy
         //glsl::memoryBarrierShared(); implied by the above
     }
 
-    DataProxy<WorkgroupSize*PreloadedDataCount, ItemsPerInvocation> data;
+    DataProxy<VirtualWorkgroupSize, ItemsPerInvocation> data;
     dtype_t preloaded[PreloadedDataCount];
 };
 

From 272a26918cc32f15c7ec77acdde63fed7fdff921 Mon Sep 17 00:00:00 2001
From: devsh <devsh@devsh.eu>
Date: Tue, 17 Jun 2025 17:18:12 +0200
Subject: [PATCH 376/529] use Asset Converter for ICPUPolygonGeometry, and
 split into Scene and Renderer

---
 09_GeometryCreator/main.cpp                   |  57 +--
 .../common/CSwapchainFramebuffersAndDepth.hpp |   2 +-
 .../geometry/CGeometryCreatorScene.hpp        | 480 +++++-------------
 3 files changed, 163 insertions(+), 376 deletions(-)

diff --git a/09_GeometryCreator/main.cpp b/09_GeometryCreator/main.cpp
index 2e31c90dd..a98dcee5b 100644
--- a/09_GeometryCreator/main.cpp
+++ b/09_GeometryCreator/main.cpp
@@ -37,29 +37,25 @@ class GeometryCreatorApp final : public MonoWindowApplication
 					return logFail("Couldn't create Command Buffer!");
 			}
 
-//			auto scRes = static_cast<CDefaultSwapchainFramebuffers*>(m_surface->getSwapchainResources());
-//			.renderpass = core::smart_refctd_ptr<video::IGPURenderpass>(scRes->getRenderpass())
 			const uint32_t addtionalBufferOwnershipFamilies[] = {getGraphicsQueue()->getFamilyIndex()};
-			auto scene = CGeometryCreatorScene::create({
-				.transferQueue = getTransferUpQueue(),
-				.utilities = m_utils.get(),
-				.logger = m_logger.get(),
-				.addtionalBufferOwnershipFamilies = addtionalBufferOwnershipFamilies
-			});
-#if 0
-			//using Builder = typename CScene::CreateResourcesDirectlyWithDevice::Builder;
-			using Builder = typename CScene::CreateResourcesWithAssetConverter::Builder;
-			Builder builder(m_utils.get(), oneRunCmd.get(), m_logger.get(), geometry);
+			// we want to use the vertex data through UTBs
+			using usage_f = IGPUBuffer::E_USAGE_FLAGS;
+			CAssetConverter::patch_t<asset::ICPUPolygonGeometry> patch = {};
+			patch.positionBufferUsages = usage_f::EUF_UNIFORM_TEXEL_BUFFER_BIT;
+			patch.indexBufferUsages = usage_f::EUF_INDEX_BUFFER_BIT;
+			patch.otherBufferUsages = usage_f::EUF_UNIFORM_TEXEL_BUFFER_BIT;
+			auto scene = CGeometryCreatorScene::create(
+				{
+					.transferQueue = getTransferUpQueue(),
+					.utilities = m_utils.get(),
+					.logger = m_logger.get(),
+					.addtionalBufferOwnershipFamilies = addtionalBufferOwnershipFamilies
+				},patch
+			);
+			
+			auto scRes = static_cast<CDefaultSwapchainFramebuffers*>(m_surface->getSwapchainResources());
+			auto renderer = CSimpleDebugRenderer::create(scRes->getRenderpass(),0,scene.get());
 
-			// gpu resources
-			if (builder.build())
-			{
-				if (!builder.finalize(resources, gQueue))
-					m_logger->log("Could not finalize resource objects to gpu objects!", ILogger::ELL_ERROR);
-			}
-			else
-				m_logger->log("Could not build resource objects!", ILogger::ELL_ERROR);
-#endif
 			// camera
 			{
 				core::vectorSIMDf cameraPosition(-5.81655884, 2.58630896, -4.23974705);
@@ -139,7 +135,7 @@ class GeometryCreatorApp final : public MonoWindowApplication
 					.extent = {m_window->getWidth(),m_window->getHeight()}
 				};
 
-				const IGPUCommandBuffer::SClearColorValue clearValue = { .float32 = {0.f,0.f,0.f,1.f} };
+				const IGPUCommandBuffer::SClearColorValue clearValue = { .float32 = {1.f,0.f,1.f,1.f} };
 				const IGPUCommandBuffer::SClearDepthStencilValue depthValue = { .depth = 0.f };
 				auto scRes = static_cast<CDefaultSwapchainFramebuffers*>(m_surface->getSwapchainResources());
 				const IGPUCommandBuffer::SRenderpassBeginInfo info =
@@ -214,19 +210,20 @@ class GeometryCreatorApp final : public MonoWindowApplication
 		{
 			// Subsequent submits don't wait for each other, hence its important to have External Dependencies which prevent users of the depth attachment overlapping.
 			const static IGPURenderpass::SCreationParams::SSubpassDependency dependencies[] = {
-				// wipe-transition of Color to ATTACHMENT_OPTIMAL
+				// wipe-transition of Color to ATTACHMENT_OPTIMAL and depth
 				{
 					.srcSubpass = IGPURenderpass::SCreationParams::SSubpassDependency::External,
 					.dstSubpass = 0,
 					.memoryBarrier = {
-						// last place where the depth can get modified in previous frame
+						// last place where the depth can get modified in previous frame, `COLOR_ATTACHMENT_OUTPUT_BIT` is implicitly later
 						.srcStageMask = PIPELINE_STAGE_FLAGS::LATE_FRAGMENT_TESTS_BIT,
-						// only write ops, reads can't be made available
-						.srcAccessMask = ACCESS_FLAGS::DEPTH_STENCIL_ATTACHMENT_WRITE_BIT,
+						// don't want any writes to be available, we'll clear 
+						.srcAccessMask = ACCESS_FLAGS::NONE,
 						// destination needs to wait as early as possible
-						.dstStageMask = PIPELINE_STAGE_FLAGS::EARLY_FRAGMENT_TESTS_BIT,
-						// because of depth test needing a read and a write
-						.dstAccessMask = ACCESS_FLAGS::DEPTH_STENCIL_ATTACHMENT_WRITE_BIT | ACCESS_FLAGS::DEPTH_STENCIL_ATTACHMENT_READ_BIT
+						// TODO: `COLOR_ATTACHMENT_OUTPUT_BIT` shouldn't be needed, because its a logically later stage, see TODO in `ECommonEnums.h`
+						.dstStageMask = PIPELINE_STAGE_FLAGS::EARLY_FRAGMENT_TESTS_BIT | PIPELINE_STAGE_FLAGS::COLOR_ATTACHMENT_OUTPUT_BIT,
+						// because depth and color get cleared first no read mask
+						.dstAccessMask = ACCESS_FLAGS::DEPTH_STENCIL_ATTACHMENT_WRITE_BIT | ACCESS_FLAGS::COLOR_ATTACHMENT_WRITE_BIT
 					}
 					// leave view offsets and flags default
 				},
@@ -235,7 +232,7 @@ class GeometryCreatorApp final : public MonoWindowApplication
 					.srcSubpass = 0,
 					.dstSubpass = IGPURenderpass::SCreationParams::SSubpassDependency::External,
 					.memoryBarrier = {
-						// last place where the depth can get modified
+						// last place where the color can get modified, depth is implicitly earlier
 						.srcStageMask = PIPELINE_STAGE_FLAGS::COLOR_ATTACHMENT_OUTPUT_BIT,
 						// only write ops, reads can't be made available
 						.srcAccessMask = ACCESS_FLAGS::COLOR_ATTACHMENT_WRITE_BIT
diff --git a/common/include/nbl/examples/common/CSwapchainFramebuffersAndDepth.hpp b/common/include/nbl/examples/common/CSwapchainFramebuffersAndDepth.hpp
index a79d59730..ef88fb325 100644
--- a/common/include/nbl/examples/common/CSwapchainFramebuffersAndDepth.hpp
+++ b/common/include/nbl/examples/common/CSwapchainFramebuffersAndDepth.hpp
@@ -36,7 +36,7 @@ class CSwapchainFramebuffersAndDepth final : public video::CDefaultSwapchainFram
 					/*.loadOp = */{IGPURenderpass::LOAD_OP::CLEAR},
 					/*.storeOp = */{IGPURenderpass::STORE_OP::STORE},
 					/*.initialLayout = */{IGPUImage::LAYOUT::UNDEFINED}, // because we clear we don't care about contents
-					/*.finalLayout = */{IGPUImage::LAYOUT::ATTACHMENT_OPTIMAL} // transition to presentation right away so we can skip a barrier
+					/*.finalLayout = */{IGPUImage::LAYOUT::ATTACHMENT_OPTIMAL}
 				}},
 				IGPURenderpass::SCreationParams::DepthStencilAttachmentsEnd
 			};
diff --git a/common/include/nbl/examples/geometry/CGeometryCreatorScene.hpp b/common/include/nbl/examples/geometry/CGeometryCreatorScene.hpp
index dd462c03c..1f8d1ac6a 100644
--- a/common/include/nbl/examples/geometry/CGeometryCreatorScene.hpp
+++ b/common/include/nbl/examples/geometry/CGeometryCreatorScene.hpp
@@ -16,10 +16,14 @@
 namespace nbl::examples
 {
 
+#define EXPOSE_NABLA_NAMESPACES using namespace nbl::core; \
+using namespace nbl::system; \
+using namespace nbl::asset; \
+using namespace nbl::video
+
 class CGeometryCreatorScene : public core::IReferenceCounted
 {
 	public:
-		using SPushConstants = hlsl::examples::geometry_creator_scene::SPushConstants;
 		//
 		enum ObjectType : uint8_t
 		{
@@ -36,11 +40,6 @@ class CGeometryCreatorScene : public core::IReferenceCounted
 			OT_UNKNOWN = OT_COUNT
 		};
 
-#define EXPOSE_NABLA_NAMESPACES using namespace nbl::core; \
-using namespace nbl::system; \
-using namespace nbl::asset; \
-using namespace nbl::video
-
 		//
 		struct SCreateParams
 		{
@@ -49,7 +48,7 @@ using namespace nbl::video
 			system::ILogger* logger;
 			std::span<const uint32_t> addtionalBufferOwnershipFamilies = {};
 		};
-		static inline core::smart_refctd_ptr<CGeometryCreatorScene> create(SCreateParams&& params)
+		static inline core::smart_refctd_ptr<CGeometryCreatorScene> create(SCreateParams&& params, const video::CAssetConverter::patch_t<asset::ICPUPolygonGeometry>& geometryPatch)
 		{
 			EXPOSE_NABLA_NAMESPACES;
 			auto* logger = params.logger;
@@ -65,71 +64,15 @@ using namespace nbl::video
 				return nullptr;
 			}
 
-			constexpr auto DescriptorCount = 255;
-			smart_refctd_ptr<ICPUDescriptorSet> cpuDS;
-			{
-				// create Descriptor Set Layout
-				smart_refctd_ptr<ICPUDescriptorSetLayout> dsLayout;
-				{
-					const ICPUDescriptorSetLayout::SBinding bindings[] =
-					{
-						{
-							.binding = 0,
-							.type = IDescriptor::E_TYPE::ET_UNIFORM_TEXEL_BUFFER,
-							// some geometries may not have particular attributes
-							.createFlags = IGPUDescriptorSetLayout::SBinding::E_CREATE_FLAGS::ECF_PARTIALLY_BOUND_BIT,
-							.stageFlags = IShader::E_SHADER_STAGE::ESS_VERTEX|IShader::E_SHADER_STAGE::ESS_FRAGMENT,
-							.count = DescriptorCount
-						}
-					};
-					dsLayout = core::make_smart_refctd_ptr<ICPUDescriptorSetLayout>(bindings);
-					if (!dsLayout)
-					{
-						logger->log("Could not create descriptor set layout!", ILogger::ELL_ERROR);
-						return nullptr;
-					}
-				}
-
-				// create Descriptor Set
-				cpuDS = core::make_smart_refctd_ptr<ICPUDescriptorSet>(std::move(dsLayout));
-				if (!cpuDS)
-				{
-					logger->log("Could not descriptor set!", ILogger::ELL_ERROR);
-					return nullptr;
-				}
-			}
 
-			SInitParams init;
-			constexpr size_t NoIndexBufferMarker = 0xdeadbeefBADC0FFEull;
-			core::vector<smart_refctd_ptr<ICPUBuffer>> indexBuffers;
+			core::vector<SNamedGeometry> namedGeometries;
+			core::vector<smart_refctd_ptr<const ICPUPolygonGeometry>> geometries;
 			// create out geometries
 			{
-				auto* const outDescs = cpuDS->getDescriptorInfoStorage(IDescriptor::E_TYPE::ET_UNIFORM_TEXEL_BUFFER).data();
-				uint8_t nextDesc = 0;
-				auto allocateUTB = [DescriptorCount,outDescs,&nextDesc](const IGeometry<ICPUBuffer>::SDataView& view)->uint8_t
+				auto addGeometry = [&namedGeometries,&geometries](const std::string_view name, smart_refctd_ptr<const ICPUPolygonGeometry>&& geom)->void
 				{
-					if (!view)
-						return DescriptorCount;
-					outDescs[nextDesc].desc = core::make_smart_refctd_ptr<ICPUBufferView>(view.src,view.composed.format);
-					return nextDesc++;
-				};
-
-				auto addGeometry = [&allocateUTB,&indexBuffers,&init](const ICPUPolygonGeometry* geom)->void
-				{
-					auto& out = init.geoms.emplace_back();
-					if (const auto& view=geom->getIndexView(); view)
-					{
-						out.indexBuffer.offset = view.src.offset;
-						indexBuffers.push_back(view.src.buffer);
-					}
-					else
-						out.indexBuffer.offset = NoIndexBufferMarker;
-					out.elementCount = geom->getVertexReferenceCount();
-					out.positionView = allocateUTB(geom->getPositionView());
-					out.normalView = allocateUTB(geom->getNormalView());
-					// the first view is usually the UV
-					if (const auto& auxViews = geom->getAuxAttributeViews(); !auxViews.empty())
-						out.uvView = allocateUTB(auxViews.front());
+					namedGeometries.emplace_back().name = name;
+					geometries.push_back(std::move(geom));
 				};
 
 				auto creator = core::make_smart_refctd_ptr<CGeometryCreator>();
@@ -143,9 +86,9 @@ using namespace nbl::video
 				ReferenceObjectCpu {.meta = {.type = OT_CONE, .name = "Cone Mesh" }, .shadersType = GP_CONE, .data = gc->createConeMesh(2, 3, 10) },
 				ReferenceObjectCpu {.meta = {.type = OT_ICOSPHERE, .name = "Icoshpere Mesh" }, .shadersType = GP_ICO, .data = gc->createIcoSphere(1, 3, true) }
 				*/
-				addGeometry(creator->createCube({1.f,1.f,1.f}).get());
-				addGeometry(creator->createRectangle({1.5f,3.f}).get());
-				addGeometry(creator->createDisk(2.f,30).get());
+				addGeometry("Cube",creator->createCube({1.f,1.f,1.f}));
+				addGeometry("Rectangle",creator->createRectangle({1.5f,3.f}));
+				addGeometry("Disk",creator->createDisk(2.f,30));
 			}
 
 			// convert the geometries
@@ -165,13 +108,11 @@ using namespace nbl::video
 
 					core::vector<uint32_t> sharedBufferOwnership;
 				} inputs = {};
+				core::vector<CAssetConverter::patch_t<ICPUPolygonGeometry>> patches(geometries.size(),geometryPatch);
 				{
 					inputs.logger = logger;
-					// descriptor set should convert everthing downstream
-					std::get<CAssetConverter::SInputs::asset_span_t<ICPUDescriptorSet>>(inputs.assets) = {&cpuDS.get(),1};
-					// except index buffers
-					if (!indexBuffers.empty())
-						std::get<CAssetConverter::SInputs::asset_span_t<ICPUBuffer>>(inputs.assets) = {&indexBuffers.front().get(),indexBuffers.size()};
+					std::get<CAssetConverter::SInputs::asset_span_t<ICPUPolygonGeometry>>(inputs.assets) = {&geometries.front().get(),geometries.size()};
+					std::get<CAssetConverter::SInputs::patch_span_t<ICPUPolygonGeometry>>(inputs.patches) = patches;
 					// set up shared ownership so we don't have to 
 					core::unordered_set<uint32_t> families;
 					families.insert(transferFamily);
@@ -229,30 +170,44 @@ using namespace nbl::video
 
 				// assign outputs
 				{
-					auto assign = [logger](auto& out, const auto& in)->bool
+					auto inIt = reservation.getGPUObjects<ICPUPolygonGeometry>().data();
+					for (auto outIt=namedGeometries.begin(); outIt!=namedGeometries.end(); inIt++)
 					{
-						if (!in.value)
+						if (inIt->value)
+							(outIt++)->geom = inIt->value;
+						else
 						{
-							logger->log("Failed to convert CPU object to GPU!",ILogger::ELL_ERROR);
-							return false;
+							logger->log("Failed to convert ICPUPolygonGeometry %s to GPU!",ILogger::ELL_ERROR,outIt->name.data());
+							outIt = namedGeometries.erase(outIt);
 						}
-						out = in.value;
-						return true;
-					};
-					if (!assign(init.ds,reservation.getGPUObjects<ICPUDescriptorSet>().front()))
-						return nullptr;
-					auto indexBufIt = reservation.getGPUObjects<ICPUBuffer>().data();
-					for (auto& entry : init.geoms)
-					if (entry.indexBuffer.offset!=NoIndexBufferMarker)
-					if (!assign(entry.indexBuffer.buffer,*(indexBufIt++)))
-						return nullptr;
+					}
 				}
 			}
 
-			return smart_refctd_ptr<CGeometryCreatorScene>(new CGeometryCreatorScene(std::move(init)),dont_grab);
+			return smart_refctd_ptr<CGeometryCreatorScene>(new CGeometryCreatorScene(std::move(namedGeometries)),dont_grab);
 		}
 
 		//
+		struct SNamedGeometry
+		{
+			std::string_view name = {};
+			core::smart_refctd_ptr<video::IGPUPolygonGeometry> geom;
+		};
+		std::span<const SNamedGeometry> getGeometries() const {return m_geometries;}
+
+	protected:
+		inline CGeometryCreatorScene(core::vector<SNamedGeometry>&& _geometries) : m_geometries(std::move(_geometries)) {}
+
+		core::vector<SNamedGeometry> m_geometries;
+};
+
+class CSimpleDebugRenderer final : public core::IReferenceCounted
+{
+	public:
+		//
+		constexpr static inline auto DescriptorCount = 255;
+		//
+		using SPushConstants = hlsl::examples::geometry_creator_scene::SPushConstants;
 		struct SPackedGeometry
 		{
 			inline SPushConstants convert(const hlsl::float32_t3x4& model, const hlsl::float32_t3x4& view, const hlsl::float32_t4x4& viewProj)
@@ -270,251 +225,132 @@ using namespace nbl::video
 				};
 			}
 
-			asset::SBufferBinding<video::IGPUBuffer> indexBuffer = {};
+			asset::SBufferBinding<const video::IGPUBuffer> indexBuffer = {};
 			uint32_t elementCount = 0;
 			// indices into the descriptor set
 			uint8_t positionView = 0;
 			uint8_t normalView = 0;
 			uint8_t uvView = 0;
 			uint8_t indexType = asset::EIT_UNKNOWN;
-			ObjectType type : 6 = ObjectType::OT_UNKNOWN;
 		};
-		std::span<const SPackedGeometry> getGeometries() const {return m_params.geoms;}
 
-	protected:
-		struct SInitParams
+		static inline core::smart_refctd_ptr<CSimpleDebugRenderer> create(video::IGPURenderpass* renderpass, const uint32_t subpassIX, const CGeometryCreatorScene* scene)
 		{
-			core::smart_refctd_ptr<video::IGPUDescriptorSet> ds;
-			core::vector<SPackedGeometry> geoms;
-		} m_params;
-		inline CGeometryCreatorScene(SInitParams&& _params) : m_params(std::move(_params)) {}
-
-#undef EXPOSE_NABLA_NAMESPACES
-};
+			EXPOSE_NABLA_NAMESPACES;
 
-#if 0
-class ResourceBuilder
-{
-private:
+			if (!renderpass)
+				return nullptr;
+			auto device = const_cast<ILogicalDevice*>(renderpass->getOriginDevice());
+			auto logger = device->getLogger();
 
+			if (!scene)
+				return nullptr;
+			const auto namedGeoms = scene->getGeometries();
+			if (namedGeoms.empty())
+				return nullptr;
 
-	bool createRenderpass()
-	{
-		EXPOSE_NABLA_NAMESPACES();
+			// TODO: Load Shaders and Create Pipelines
 
-		static constexpr Types::renderpass_t::SCreationParams::SColorAttachmentDescription colorAttachments[] =
-		{
-			{
-				{
-					{
-						.format = ColorFboAttachmentFormat,
-						.samples = Samples,
-						.mayAlias = false
-					},
-					/* .loadOp = */ Types::renderpass_t::LOAD_OP::CLEAR,
-					/* .storeOp = */ Types::renderpass_t::STORE_OP::STORE,
-					/* .initialLayout = */ Types::image_t::LAYOUT::UNDEFINED,
-					/* .finalLayout = */ Types::image_t::LAYOUT::READ_ONLY_OPTIMAL
-				}
-			},
-			Types::renderpass_t::SCreationParams::ColorAttachmentsEnd
-		};
+			SInitParams init;
 
-		static constexpr Types::renderpass_t::SCreationParams::SDepthStencilAttachmentDescription depthAttachments[] =
-		{
+			// create descriptor set
 			{
+				// create Descriptor Set Layout
+				smart_refctd_ptr<IGPUDescriptorSetLayout> dsLayout;
 				{
+					const IGPUDescriptorSetLayout::SBinding bindings[] =
 					{
-						.format = DepthFboAttachmentFormat,
-						.samples = Samples,
-						.mayAlias = false
-					},
-					/* .loadOp = */ {Types::renderpass_t::LOAD_OP::CLEAR},
-					/* .storeOp = */ {Types::renderpass_t::STORE_OP::STORE},
-					/* .initialLayout = */ {Types::image_t::LAYOUT::UNDEFINED},
-					/* .finalLayout = */ {Types::image_t::LAYOUT::ATTACHMENT_OPTIMAL}
-				}
-			},
-			Types::renderpass_t::SCreationParams::DepthStencilAttachmentsEnd
-		};
-
-		typename Types::renderpass_t::SCreationParams::SSubpassDescription subpasses[] =
-		{
-			{},
-			Types::renderpass_t::SCreationParams::SubpassesEnd
-		};
-
-		subpasses[0].depthStencilAttachment.render = { .attachmentIndex = 0u,.layout = Types::image_t::LAYOUT::ATTACHMENT_OPTIMAL };
-		subpasses[0].colorAttachments[0] = { .render = {.attachmentIndex = 0u, .layout = Types::image_t::LAYOUT::ATTACHMENT_OPTIMAL } };
-
-		static constexpr Types::renderpass_t::SCreationParams::SSubpassDependency dependencies[] =
-		{
-			// wipe-transition of Color to ATTACHMENT_OPTIMAL
-			{
-				.srcSubpass = Types::renderpass_t::SCreationParams::SSubpassDependency::External,
-				.dstSubpass = 0,
-				.memoryBarrier =
-				{
-				// 
-				.srcStageMask = PIPELINE_STAGE_FLAGS::FRAGMENT_SHADER_BIT,
-				// only write ops, reads can't be made available
-				.srcAccessMask = ACCESS_FLAGS::SAMPLED_READ_BIT,
-				// destination needs to wait as early as possible
-				.dstStageMask = PIPELINE_STAGE_FLAGS::EARLY_FRAGMENT_TESTS_BIT | PIPELINE_STAGE_FLAGS::COLOR_ATTACHMENT_OUTPUT_BIT,
-				// because of depth test needing a read and a write
-				.dstAccessMask = ACCESS_FLAGS::DEPTH_STENCIL_ATTACHMENT_WRITE_BIT | ACCESS_FLAGS::DEPTH_STENCIL_ATTACHMENT_READ_BIT | ACCESS_FLAGS::COLOR_ATTACHMENT_READ_BIT | ACCESS_FLAGS::COLOR_ATTACHMENT_WRITE_BIT
-			}
-			// leave view offsets and flags default
-			},
-			// color from ATTACHMENT_OPTIMAL to PRESENT_SRC
-			{
-				.srcSubpass = 0,
-				.dstSubpass = Types::renderpass_t::SCreationParams::SSubpassDependency::External,
-				.memoryBarrier =
-				{
-				// last place where the depth can get modified
-				.srcStageMask = PIPELINE_STAGE_FLAGS::COLOR_ATTACHMENT_OUTPUT_BIT,
-				// only write ops, reads can't be made available
-				.srcAccessMask = ACCESS_FLAGS::COLOR_ATTACHMENT_WRITE_BIT,
-				// 
-				.dstStageMask = PIPELINE_STAGE_FLAGS::FRAGMENT_SHADER_BIT,
-				//
-				.dstAccessMask = ACCESS_FLAGS::SAMPLED_READ_BIT
-				// 
-				}
-			// leave view offsets and flags default
-			},
-			Types::renderpass_t::SCreationParams::DependenciesEnd
-		};
-
-		typename Types::renderpass_t::SCreationParams params = {};
-		params.colorAttachments = colorAttachments;
-		params.depthStencilAttachments = depthAttachments;
-		params.subpasses = subpasses;
-		params.dependencies = dependencies;
-
-		if constexpr (withAssetConverter)
-			scratch.renderpass = ICPURenderpass::create(params);
-
-		if (!scratch.renderpass)
-		{
-			logger->log("Could not create render pass!", ILogger::ELL_ERROR);
-			return false;
-		}
-
-		return true;
-	}
-
-	bool createFramebufferAttachments()
-	{
-		EXPOSE_NABLA_NAMESPACES();
-
-		auto createImageView = [&]<E_FORMAT format>(smart_refctd_ptr<typename Types::image_view_t>& outView) -> smart_refctd_ptr<typename Types::image_view_t>
-		{
-			constexpr bool IS_DEPTH = isDepthOrStencilFormat<format>();
-			constexpr auto USAGE = [](const bool isDepth)
-			{
-				bitflag<Types::image_t::E_USAGE_FLAGS> usage = Types::image_t::EUF_RENDER_ATTACHMENT_BIT;
-
-				if (!isDepth)
-					usage |= Types::image_t::EUF_SAMPLED_BIT;
-
-				return usage;
-			}(IS_DEPTH);
-			constexpr auto ASPECT = IS_DEPTH ? IImage::E_ASPECT_FLAGS::EAF_DEPTH_BIT : IImage::E_ASPECT_FLAGS::EAF_COLOR_BIT;
-			constexpr std::string_view DEBUG_NAME = IS_DEPTH ? "UI Scene Depth Attachment Image" : "UI Scene Color Attachment Image";
-			{
-				smart_refctd_ptr<typename Types::image_t> image;
-				{
-					auto params = typename Types::image_t::SCreationParams(
+						{
+							.binding = 0,
+							.type = IDescriptor::E_TYPE::ET_UNIFORM_TEXEL_BUFFER,
+							// some geometries may not have particular attributes
+							.createFlags = IGPUDescriptorSetLayout::SBinding::E_CREATE_FLAGS::ECF_PARTIALLY_BOUND_BIT,
+							.stageFlags = IShader::E_SHADER_STAGE::ESS_VERTEX|IShader::E_SHADER_STAGE::ESS_FRAGMENT,
+							.count = DescriptorCount
+						}
+					};
+					dsLayout = device->createDescriptorSetLayout(bindings);
+					if (!dsLayout)
 					{
-						.type = Types::image_t::ET_2D,
-						.samples = Samples,
-						.format = format,
-						.extent = { FramebufferW, FramebufferH, 1u },
-						.mipLevels = 1u,
-						.arrayLayers = 1u,
-						.usage = USAGE
-					});
-
-					if constexpr (withAssetConverter)
-						image = ICPUImage::create(params);
-					else
-						image = utilities->getLogicalDevice()->createImage(std::move(params));
+						logger->log("Could not create descriptor set layout!",ILogger::ELL_ERROR);
+						return nullptr;
+					}
 				}
 
-				if (!image)
+				// create Descriptor Set
+				auto pool = device->createDescriptorPoolForDSLayouts(IDescriptorPool::ECF_UPDATE_AFTER_BIND_BIT,{&dsLayout.get(),1});
+				init.ds = pool->createDescriptorSet(std::move(dsLayout));
+				if (!init.ds)
 				{
-					logger->log("Could not create image!", ILogger::ELL_ERROR);
+					logger->log("Could not descriptor set!",ILogger::ELL_ERROR);
 					return nullptr;
 				}
+			}
 
-				if constexpr (withAssetConverter)
+			// write geometries' attributes to descriptor set
+			{
+				core::vector<IGPUDescriptorSet::SDescriptorInfo> infos;
+				auto allocateUTB = [device,&infos](const IGeometry<const IGPUBuffer>::SDataView& view)->uint8_t
 				{
-					auto dummyBuffer = ICPUBuffer::create({ FramebufferW * FramebufferH * getTexelOrBlockBytesize<format>() });
-					dummyBuffer->setContentHash(dummyBuffer->computeContentHash());
-
-					auto regions = make_refctd_dynamic_array<smart_refctd_dynamic_array<ICPUImage::SBufferCopy>>(1u);
-					auto& region = regions->front();
-
-					region.imageSubresource = { .aspectMask = ASPECT, .mipLevel = 0u, .baseArrayLayer = 0u, .layerCount = 0u };
-					region.bufferOffset = 0u;
-					region.bufferRowLength = IImageAssetHandlerBase::calcPitchInBlocks(FramebufferW, getTexelOrBlockBytesize<format>());
-					region.bufferImageHeight = 0u;
-					region.imageOffset = { 0u, 0u, 0u };
-					region.imageExtent = { FramebufferW, FramebufferH, 1u };
+					if (!view)
+						return DescriptorCount;
+					const auto retval = infos.size();
+					infos.emplace_back().desc = device->createBufferView(view.src, view.composed.format);
+					return retval;
+				};
 
-					if (!image->setBufferAndRegions(std::move(dummyBuffer), regions))
-					{
-						logger->log("Could not set image's regions!", ILogger::ELL_ERROR);
-						return nullptr;
-					}
-					image->setContentHash(image->computeContentHash());
-				}
-				else
+				for (const auto& entry : namedGeoms)
 				{
-					image->setObjectDebugName(DEBUG_NAME.data());
-
-					if (!utilities->getLogicalDevice()->allocate(image->getMemoryReqs(), image.get()).isValid())
+					const auto* geom = entry.geom.get();
+					// could also check device origin on all buffers
+					if (!geom->valid())
+						continue;
+					auto& out = init.geoms.emplace_back();
+					if (const auto& view=geom->getIndexView(); view)
 					{
-						logger->log("Could not allocate memory for an image!", ILogger::ELL_ERROR);
-						return nullptr;
+						out.indexBuffer.offset = view.src.offset;
+						out.indexBuffer.buffer = view.src.buffer;
 					}
+					out.elementCount = geom->getVertexReferenceCount();
+					out.positionView = allocateUTB(geom->getPositionView());
+					out.normalView = allocateUTB(geom->getNormalView());
+					// the first view is usually the UV
+					if (const auto& auxViews = geom->getAuxAttributeViews(); !auxViews.empty())
+						out.uvView = allocateUTB(auxViews.front());
 				}
 
-				auto params = typename Types::image_view_t::SCreationParams
-				({
-					.flags = Types::image_view_t::ECF_NONE,
-					.subUsages = USAGE,
-					.image = std::move(image),
-					.viewType = Types::image_view_t::ET_2D,
-					.format = format,
-					.subresourceRange = { .aspectMask = ASPECT, .baseMipLevel = 0u, .levelCount = 1u, .baseArrayLayer = 0u, .layerCount = 1u }
-				});
-
-				if constexpr (withAssetConverter)
-					outView = make_smart_refctd_ptr<ICPUImageView>(std::move(params));
- 
-				if (!outView)
-				{
-					logger->log("Could not create image view!", ILogger::ELL_ERROR);
+				if (infos.empty())
+					return nullptr;
+				const IGPUDescriptorSet::SWriteDescriptorSet write = {
+					.dstSet = init.ds.get(),
+					.binding = 0,
+					.arrayElement = 0,
+					.count = static_cast<uint32_t>(infos.size()),
+					.info = infos.data()
+				};
+				if (!device->updateDescriptorSets({&write,1},{}))
 					return nullptr;
-				}
-
-				return smart_refctd_ptr(outView);
 			}
-		};
 
-		const bool allocated = createImageView.template operator() < ColorFboAttachmentFormat > (scratch.attachments.color) && createImageView.template operator() < DepthFboAttachmentFormat > (scratch.attachments.depth);
+			return smart_refctd_ptr<CSimpleDebugRenderer>(new CSimpleDebugRenderer(std::move(init)),dont_grab);
+		}
+
 
-		if (!allocated)
+	protected:
+		struct SInitParams
 		{
-			logger->log("Could not allocate frame buffer's attachments!", ILogger::ELL_ERROR);
-			return false;
-		}
+			core::smart_refctd_ptr<video::IGPUDescriptorSet> ds;
+			core::vector<SPackedGeometry> geoms;
+		} m_params;
 
-		return true;
-	}
+		inline CSimpleDebugRenderer(SInitParams&& _params) : m_params(std::move(_params)) {}
+};
+
+#undef EXPOSE_NABLA_NAMESPACES
+#if 0
+class ResourceBuilder
+{
+private:
 
 	bool createShaders()
 	{
@@ -852,45 +688,6 @@ class CScene final : public nbl::core::IReferenceCounted
 
 	inline void record()
 	{
-		EXPOSE_NABLA_NAMESPACES();
-
-		const struct 
-		{
-			const uint32_t width, height;
-		} fbo = { .width = m_frameBuffer->getCreationParameters().width, .height = m_frameBuffer->getCreationParameters().height };
-
-		SViewport viewport;
-		{
-			viewport.minDepth = 1.f;
-			viewport.maxDepth = 0.f;
-			viewport.x = 0u;
-			viewport.y = 0u;
-			viewport.width = fbo.width;
-			viewport.height = fbo.height;
-		}
-
-		m_commandBuffer->setViewport(0u, 1u, &viewport);
-		
-		VkRect2D scissor = {};
-		scissor.offset = { 0, 0 };
-		scissor.extent = { fbo.width, fbo.height };
-		m_commandBuffer->setScissor(0u, 1u, &scissor);
-
-		const VkRect2D renderArea =
-		{
-			.offset = { 0,0 },
-			.extent = { fbo.width, fbo.height }
-		};
-
-		const IGPUCommandBuffer::SRenderpassBeginInfo info =
-		{
-			.framebuffer = m_frameBuffer.get(),
-			.colorClearValues = &clear.color,
-			.depthStencilClearValues = &clear.depth,
-			.renderArea = renderArea
-		};
-
-		m_commandBuffer->beginRenderPass(info, IGPUCommandBuffer::SUBPASS_CONTENTS::INLINE);
 
 		const auto& [hook, meta] = resources.objects[object.meta.type];
 		auto* rawPipeline = hook.pipeline.get();
@@ -908,13 +705,6 @@ class CScene final : public nbl::core::IReferenceCounted
 		}
 		else
 			m_commandBuffer->draw(hook.indexCount, 1, 0, 0);
-
-		m_commandBuffer->endRenderPass();
-	}
-
-	inline void end()
-	{
-		m_commandBuffer->end();
 	}
 
 private:

From 8df4f585c92afc7a7541dd263b4dd13de2c6be6e Mon Sep 17 00:00:00 2001
From: Przemek <minikers21@gmail.com>
Date: Tue, 17 Jun 2025 21:16:50 +0200
Subject: [PATCH 377/529] Initial neighbouring cells drawing

---
 62_CAD/shaders/main_pipeline/dtm.hlsl         |  90 ++++++++++++
 .../main_pipeline/fragment_shader.hlsl        | 138 +++++++++++-------
 .../shaders/main_pipeline/vertex_shader.hlsl  |   4 +-
 3 files changed, 176 insertions(+), 56 deletions(-)

diff --git a/62_CAD/shaders/main_pipeline/dtm.hlsl b/62_CAD/shaders/main_pipeline/dtm.hlsl
index 68d58c3ad..e90f685ba 100644
--- a/62_CAD/shaders/main_pipeline/dtm.hlsl
+++ b/62_CAD/shaders/main_pipeline/dtm.hlsl
@@ -450,6 +450,96 @@ E_CELL_DIAGONAL resolveGridDTMCellDiagonal(in uint32_t4 cellData)
     return INVALID;
 }
 
+struct GridDTMTriangle
+{
+    float3 vertices[3];
+};
+
+/**
+* grid consists of square cells and cells are divided into two triangles:
+* depending on mode it is
+* either:        or:
+* v2a-------v1   v0-------v2b
+* |  A     / |   | \     B  |
+* |     /    |   |    \     |
+* |  /  B    |   |   A   \  |
+* v0-------v2b   v2a-------v1
+*/
+struct GridDTMCell
+{
+    GridDTMTriangle triangleA;
+    GridDTMTriangle triangleB;
+};
+
+struct GridDTMHeightMapData
+{
+    // heihts.x - bottom left texel
+    // heihts.y - bottom right texel
+    // heihts.z - top right texel
+    // heihts.w - top left texel
+    float4 heights;
+    E_CELL_DIAGONAL cellDiagonal;
+};
+
+GridDTMHeightMapData retrieveGridDTMCellDataFromHeightMap(in float2 gridExtents, in float2 cellCoords, const float cellWidth, in Texture2D<uint32_t> heightMap)
+{
+    GridDTMHeightMapData output;
+
+    const float2 maxCellCoords = float2(round(gridExtents.x / cellWidth), round(gridExtents.y / cellWidth));
+    const float2 location = (cellCoords + float2(0.5f, 0.5f)) / maxCellCoords;
+    uint32_t4 cellData = heightMap.Gather(textureSampler, float2(location.x, location.y), 0);
+
+    printf("%u %u %u %u", cellData.x, cellData.y, cellData.z, cellData.w);
+
+    output.heights = asfloat(cellData);
+    output.cellDiagonal = dtm::resolveGridDTMCellDiagonal(cellData);
+    return output;
+}
+
+GridDTMCell calculateCellTriangles(in float2 topLeft, in float2 gridExtents, in float2 cellCoords, const float cellWidth, in Texture2D<uint32_t> heightMap)
+{
+    GridDTMCell output;
+
+    // heightData.heihts.x - bottom left texel
+    // heightData.heihts.y - bottom right texel
+    // heightData.heihts.z - top right texel
+    // heightData.heihts.w - top left texel
+    dtm::GridDTMHeightMapData heightData = dtm::retrieveGridDTMCellDataFromHeightMap(gridExtents, cellCoords, cellWidth, heightMap);
+    const bool diagonalFromTopLeftToBottomRight = heightData.cellDiagonal == E_CELL_DIAGONAL::TOP_LEFT_TO_BOTTOM_RIGHT;
+    float2 gridSpaceCellTopLeftCoords = cellCoords * cellWidth;
+
+    if (diagonalFromTopLeftToBottomRight)
+    {
+        output.triangleA.vertices[0] = float3(gridSpaceCellTopLeftCoords.x, gridSpaceCellTopLeftCoords.y, heightData.heights.w);
+        output.triangleA.vertices[1] = float3(gridSpaceCellTopLeftCoords.x + cellWidth, gridSpaceCellTopLeftCoords.y + cellWidth, heightData.heights.y);
+        output.triangleA.vertices[2] = float3(gridSpaceCellTopLeftCoords.x, gridSpaceCellTopLeftCoords.y + cellWidth, heightData.heights.x);
+
+        output.triangleB.vertices[0] = float3(gridSpaceCellTopLeftCoords.x, gridSpaceCellTopLeftCoords.y, heightData.heights.w);
+        output.triangleB.vertices[1] = float3(gridSpaceCellTopLeftCoords.x + cellWidth, gridSpaceCellTopLeftCoords.y + cellWidth, heightData.heights.y);
+        output.triangleB.vertices[2] = float3(gridSpaceCellTopLeftCoords.x + cellWidth, gridSpaceCellTopLeftCoords.y, heightData.heights.z);
+    }
+    else
+    {
+        output.triangleA.vertices[0] = float3(gridSpaceCellTopLeftCoords.x, gridSpaceCellTopLeftCoords.y + cellWidth, heightData.heights.x);
+        output.triangleA.vertices[1] = float3(gridSpaceCellTopLeftCoords.x + cellWidth, gridSpaceCellTopLeftCoords.y, heightData.heights.z);
+        output.triangleA.vertices[2] = float3(gridSpaceCellTopLeftCoords.x, gridSpaceCellTopLeftCoords.y, heightData.heights.w);
+
+        output.triangleB.vertices[0] = float3(gridSpaceCellTopLeftCoords.x, gridSpaceCellTopLeftCoords.y + cellWidth, heightData.heights.x);
+        output.triangleB.vertices[1] = float3(gridSpaceCellTopLeftCoords.x + cellWidth, gridSpaceCellTopLeftCoords.y, heightData.heights.z);
+        output.triangleB.vertices[2] = float3(gridSpaceCellTopLeftCoords.x + cellWidth, gridSpaceCellTopLeftCoords.y + cellWidth, heightData.heights.y);
+    }
+
+    // move from grid space to screen space
+    [unroll]
+    for (int i = 0; i < 3; ++i)
+    {
+        output.triangleA.vertices[i].xy += topLeft;
+        output.triangleB.vertices[i].xy += topLeft;
+    }
+
+    return output;
+}
+
 }
 
 #endif
\ No newline at end of file
diff --git a/62_CAD/shaders/main_pipeline/fragment_shader.hlsl b/62_CAD/shaders/main_pipeline/fragment_shader.hlsl
index fb8e13673..25564a964 100644
--- a/62_CAD/shaders/main_pipeline/fragment_shader.hlsl
+++ b/62_CAD/shaders/main_pipeline/fragment_shader.hlsl
@@ -117,6 +117,14 @@ float32_t4 calculateFinalColor<true>(const uint2 fragCoord, const float localAlp
     return color;
 }
 
+bool isLineValid(in nbl::hlsl::shapes::Line<float> l)
+{
+    bool isAnyLineComponentNaN = any(bool4(isnan(l.P0.x), isnan(l.P0.y), isnan(l.P1.x), isnan(l.P1.y)));
+    if (isAnyLineComponentNaN)
+        return false;
+    return true;
+}
+
 [[vk::spvexecutionmode(spv::ExecutionModePixelInterlockOrderedEXT)]]
 [shader("pixel")]
 float4 fragMain(PSInput input) : SV_TARGET
@@ -422,8 +430,8 @@ float4 fragMain(PSInput input) : SV_TARGET
             float2 cellCoords;
             {
                 float2 gridSpacePosDivGridCellWidth = gridSpacePos / cellWidth;
-                cellCoords.x = uint32_t(gridSpacePosDivGridCellWidth.x);
-                cellCoords.y = uint32_t(gridSpacePosDivGridCellWidth.y);
+                cellCoords.x = int32_t(gridSpacePosDivGridCellWidth.x);
+                cellCoords.y = int32_t(gridSpacePosDivGridCellWidth.y);
             }
 
             float2 gridSpaceCellTopLeftCoords = cellCoords * cellWidth;
@@ -439,31 +447,24 @@ float4 fragMain(PSInput input) : SV_TARGET
             // 
 
             // calculate screen space coordinates of vertices of the current tiranlge within the grid
-            float3 currentTriangleVertices[3];
+            dtm::GridDTMTriangle currentTriangle;
+            dtm::GridDTMCell neighbouringCells[8];
+            if (dtmSettings.drawContourEnabled() || dtmSettings.drawHeightShadingEnabled())
             {
-                float2 insideCellCoord = gridSpacePos - float2(cellWidth, cellWidth) * cellCoords; // TODO: use fmod instead?
-                
-                uint32_t4 cellData;
-                // cellHeihts.x - bottom left texel
-                // cellHeihts.y - bottom right texel
-                // cellHeihts.z - top right texel
-                // cellHeihts.w - top left texel
-                float4 cellHeights = float4(InvalidGridDTMHeightValue, InvalidGridDTMHeightValue, InvalidGridDTMHeightValue, InvalidGridDTMHeightValue);
-                if (textureId != InvalidTextureIndex)
-                {
-                    const float2 maxCellCoords = float2(round(gridExtents.x / cellWidth), round(gridExtents.y / cellWidth));
-                    const float2 location = (cellCoords + float2(0.5f, 0.5f)) / maxCellCoords;
-
-                    cellData = texturesU32[NonUniformResourceIndex(textureId)].Gather(textureSampler, float2(location.x, location.y), 0);
-                    cellHeights = asfloat(cellData);
-                }
-
-                const E_CELL_DIAGONAL cellDiagonal = dtm::resolveGridDTMCellDiagonal(cellData);
-                const bool diagonalFromTopLeftToBottomRight = cellDiagonal == E_CELL_DIAGONAL::TOP_LEFT_TO_BOTTOM_RIGHT;
+                if (textureId == InvalidTextureIndex)
+                    discard;
 
-                if (cellDiagonal == E_CELL_DIAGONAL::INVALID)
+                // heightData.heihts.x - bottom left texel
+                // heightData.heihts.y - bottom right texel
+                // heightData.heihts.z - top right texel
+                // heightData.heihts.w - top left texel
+                dtm::GridDTMHeightMapData heightData = dtm::retrieveGridDTMCellDataFromHeightMap(gridExtents, cellCoords, cellWidth, texturesU32[NonUniformResourceIndex(textureId)]);
+                if (heightData.cellDiagonal == E_CELL_DIAGONAL::INVALID)
                     discard;
 
+                const bool diagonalFromTopLeftToBottomRight = heightData.cellDiagonal == E_CELL_DIAGONAL::TOP_LEFT_TO_BOTTOM_RIGHT;
+
+                float2 insideCellCoord = gridSpacePos - float2(cellWidth, cellWidth) * cellCoords; // TODO: use fmod instead?
                 // my ASCII art above explains which triangle is A and which is B
                 const bool triangleA = diagonalFromTopLeftToBottomRight ?
                     insideCellCoord.x < insideCellCoord.y :
@@ -471,29 +472,29 @@ float4 fragMain(PSInput input) : SV_TARGET
 
                 if (diagonalFromTopLeftToBottomRight)
                 {
-                    currentTriangleVertices[0] = float3(gridSpaceCellTopLeftCoords.x, gridSpaceCellTopLeftCoords.y, cellHeights.w);
-                    currentTriangleVertices[1] = float3(gridSpaceCellTopLeftCoords.x + cellWidth, gridSpaceCellTopLeftCoords.y + cellWidth, cellHeights.y);
-                    currentTriangleVertices[2] = triangleA ? float3(gridSpaceCellTopLeftCoords.x, gridSpaceCellTopLeftCoords.y + cellWidth, cellHeights.x) : float3(gridSpaceCellTopLeftCoords.x + cellWidth, gridSpaceCellTopLeftCoords.y, cellHeights.z);
+                    currentTriangle.vertices[0] = float3(gridSpaceCellTopLeftCoords.x, gridSpaceCellTopLeftCoords.y, heightData.heights.w);
+                    currentTriangle.vertices[1] = float3(gridSpaceCellTopLeftCoords.x + cellWidth, gridSpaceCellTopLeftCoords.y + cellWidth, heightData.heights.y);
+                    currentTriangle.vertices[2] = triangleA ? float3(gridSpaceCellTopLeftCoords.x, gridSpaceCellTopLeftCoords.y + cellWidth, heightData.heights.x) : float3(gridSpaceCellTopLeftCoords.x + cellWidth, gridSpaceCellTopLeftCoords.y, heightData.heights.z);
 
                     // TODO: use cell space instead https://github.com/Devsh-Graphics-Programming/Nabla-Examples-and-Tests/pull/186#discussion_r2133699055
-                    //currentTriangleVertices[0] = float3(0.0f, 0.0f, cellHeights.w);
-                    //currentTriangleVertices[1] = float3(cellWidth, cellWidth, cellHeights.y);
-                    //currentTriangleVertices[2] = triangleA ? float3(0.0f, cellWidth, cellHeights.x) : float3(cellWidth, 0.0f, cellHeights.z);
+                    //currentTriangle.vertices[0] = float3(0.0f, 0.0f, heightData.heights.w);
+                    //currentTriangle.vertices[1] = float3(cellWidth, cellWidth, heightData.heights.y);
+                    //currentTriangle.vertices[2] = triangleA ? float3(0.0f, cellWidth, heightData.heights.x) : float3(cellWidth, 0.0f, heightData.heights.z);
                 }
                 else
                 {
-                    currentTriangleVertices[0] = float3(gridSpaceCellTopLeftCoords.x, gridSpaceCellTopLeftCoords.y + cellWidth, cellHeights.x);
-                    currentTriangleVertices[1] = float3(gridSpaceCellTopLeftCoords.x + cellWidth, gridSpaceCellTopLeftCoords.y, cellHeights.z);
-                    currentTriangleVertices[2] = triangleA ? float3(gridSpaceCellTopLeftCoords.x, gridSpaceCellTopLeftCoords.y, cellHeights.w) : float3(gridSpaceCellTopLeftCoords.x + cellWidth, gridSpaceCellTopLeftCoords.y + cellWidth, cellHeights.y);
+                    currentTriangle.vertices[0] = float3(gridSpaceCellTopLeftCoords.x, gridSpaceCellTopLeftCoords.y + cellWidth, heightData.heights.x);
+                    currentTriangle.vertices[1] = float3(gridSpaceCellTopLeftCoords.x + cellWidth, gridSpaceCellTopLeftCoords.y, heightData.heights.z);
+                    currentTriangle.vertices[2] = triangleA ? float3(gridSpaceCellTopLeftCoords.x, gridSpaceCellTopLeftCoords.y, heightData.heights.w) : float3(gridSpaceCellTopLeftCoords.x + cellWidth, gridSpaceCellTopLeftCoords.y + cellWidth, heightData.heights.y);
 
                     // TODO: use cell space instead https://github.com/Devsh-Graphics-Programming/Nabla-Examples-and-Tests/pull/186#discussion_r2133699055
-                    //currentTriangleVertices[0] = float3(0.0f, 0.0f + cellWidth, cellHeights.x);
-                    //currentTriangleVertices[1] = float3(0.0f + cellWidth, 0.0f, cellHeights.z);
-                    //currentTriangleVertices[2] = triangleA ? float3(0.0f, 0.0f, cellHeights.w) : float3(cellWidth, cellWidth, cellHeights.y);
+                    //currentTriangle.vertices[0] = float3(0.0f, 0.0f + cellWidth, heightData.heights.x);
+                    //currentTriangle.vertices[1] = float3(0.0f + cellWidth, 0.0f, heightData.heights.z);
+                    //currentTriangle.vertices[2] = triangleA ? float3(0.0f, 0.0f, heightData.heights.w) : float3(cellWidth, cellWidth, heightData.heights.y);
                 }
 
-                bool isTriangleInvalid = isnan(currentTriangleVertices[0].z) || isnan(currentTriangleVertices[1].z) || isnan(currentTriangleVertices[2].z);
-                bool isCellPartiallyInvalid = isnan(cellHeights.x) || isnan(cellHeights.y) || isnan(cellHeights.z) || isnan(cellHeights.w);
+                bool isTriangleInvalid = isnan(currentTriangle.vertices[0].z) || isnan(currentTriangle.vertices[1].z) || isnan(currentTriangle.vertices[2].z);
+                bool isCellPartiallyInvalid = isnan(heightData.heights.x) || isnan(heightData.heights.y) || isnan(heightData.heights.z) || isnan(heightData.heights.w);
 
                 if (isTriangleInvalid)
                     discard;
@@ -501,46 +502,75 @@ float4 fragMain(PSInput input) : SV_TARGET
                 // move from grid space to screen space
                 [unroll]
                 for (int i = 0; i < 3; ++i)
-                    currentTriangleVertices[i].xy += topLeft;
-
-                float distancesToVerticalCellSides = min(insideCellCoord.x, cellWidth - insideCellCoord.x);
-                float distancesToHorizontalCellSides = min(insideCellCoord.y, cellWidth - insideCellCoord.y);
-
-                float patternCellCoord = distancesToVerticalCellSides >= distancesToHorizontalCellSides ? cellCoords.x : cellCoords.y;
+                    currentTriangle.vertices[i].xy += topLeft;
+
+                const float2 neighbouringCellsCellOffsets[8] = {
+                    float2(-1.0f, -1.0f),
+                    float2(0.0f, -1.0f),
+                    float2(1.0f, -1.0f),
+                    float2(-1.0f, 0.0f),
+                    float2(-1.0f, 0.0f),
+                    float2(-1.0f, 1.0f),
+                    float2(0.0f, 1.0f),
+                    float2(1.0f, 1.0f)
+                };
+
+                // construct triangles of neighbouring cells
+                for (int i = 0; i < 8; ++i)
+                {
+                    float2 neighbouringCellCoords = cellCoords + neighbouringCellsCellOffsets[i];
+                    neighbouringCells[i] = dtm::calculateCellTriangles(topLeft, gridExtents, neighbouringCellCoords, cellWidth, texturesU32[NonUniformResourceIndex(textureId)]);
+                }
             }
 
             // find the nearest horizontal and vertical line to the fragment
             nbl::hlsl::shapes::Line<float> outlineLineSegments[2];
             {
                 const float halfCellWidth = cellWidth * 0.5f;
-                const float2 nearestLineRemainingCoords = int2((gridSpacePos + halfCellWidth) / cellWidth) * cellWidth + topLeft;
+                const float2 horizontalBounds = float2(topLeft.y, topLeft.y + gridExtents.y);
+                const float2 verticalBounds = float2(topLeft.x, topLeft.x + gridExtents.x);
+                float2 nearestLineRemainingCoords = int2((gridSpacePos + halfCellWidth) / cellWidth) * cellWidth + topLeft;
+                // shift lines outside of the grid to a bound
+                nearestLineRemainingCoords.x = clamp(nearestLineRemainingCoords.x, verticalBounds.x, verticalBounds.y);
+                nearestLineRemainingCoords.y = clamp(nearestLineRemainingCoords.y, horizontalBounds.x, horizontalBounds.y);
 
                 // find the nearest horizontal line
-                outlineLineSegments[0].P0 = float32_t2(topLeft.x, nearestLineRemainingCoords.y);
-                outlineLineSegments[0].P1 = float32_t2(topLeft.x + gridExtents.x, nearestLineRemainingCoords.y);
-                outlineLineSegments[1].P0 = float32_t2(nearestLineRemainingCoords.x, topLeft.y);
-                outlineLineSegments[1].P1 = float32_t2(nearestLineRemainingCoords.x, topLeft.y + gridExtents.y);
+                outlineLineSegments[0].P0 = float32_t2(verticalBounds.x, nearestLineRemainingCoords.y);
+                outlineLineSegments[0].P1 = float32_t2(verticalBounds.y, nearestLineRemainingCoords.y);
+                // find the nearest vertical line
+                outlineLineSegments[1].P0 = float32_t2(nearestLineRemainingCoords.x, horizontalBounds.x);
+                outlineLineSegments[1].P1 = float32_t2(nearestLineRemainingCoords.x, horizontalBounds.y);
 
                 // test diagonal draw (to draw diagonals height or contour shading must be enabled)
                 //outlineLineSegments[0] = nbl::hlsl::shapes::Line<float>::construct(currentTriangleVertices[0].xy, currentTriangleVertices[1].xy);
                 //outlineLineSegments[1] = nbl::hlsl::shapes::Line<float>::construct(currentTriangleVertices[0].xy, currentTriangleVertices[1].xy);
             }
 
-            const float3 baryCoord = dtm::calculateDTMTriangleBarycentrics(currentTriangleVertices[0].xy, currentTriangleVertices[1].xy, currentTriangleVertices[2].xy, input.position.xy);
-            float height = baryCoord.x * currentTriangleVertices[0].z + baryCoord.y * currentTriangleVertices[1].z + baryCoord.z * currentTriangleVertices[2].z;
+            const float3 baryCoord = dtm::calculateDTMTriangleBarycentrics(currentTriangle.vertices[0].xy, currentTriangle.vertices[1].xy, currentTriangle.vertices[2].xy, input.position.xy);
+            float height = baryCoord.x * currentTriangle.vertices[0].z + baryCoord.y * currentTriangle.vertices[1].z + baryCoord.z * currentTriangle.vertices[2].z;
             float heightDeriv = fwidth(height);
 
             const bool outOfBoundsUV = uv.x < 0.0f || uv.y < 0.0f || uv.x > 1.0f || uv.y > 1.0f;
             float4 dtmColor = float4(0.0f, 0.0f, 0.0f, 0.0f);
-            if (dtmSettings.drawContourEnabled())
+            if (dtmSettings.drawContourEnabled() && !outOfBoundsUV)
             {
                 for (int i = dtmSettings.contourSettingsCount-1u; i >= 0; --i) 
-                    dtmColor = dtm::blendUnder(dtmColor, dtm::calculateDTMContourColor(dtmSettings.contourSettings[i], currentTriangleVertices, input.position.xy, height));
+                    dtmColor = dtm::blendUnder(dtmColor, dtm::calculateDTMContourColor(dtmSettings.contourSettings[i], currentTriangle.vertices, input.position.xy, height));
+
+                // draw shit form neighbouring cells
+                for (int i = 0; i < 8; ++i)
+                {
+                    for (int j = dtmSettings.contourSettingsCount - 1u; j >= 0; --j)
+                    {
+                        dtmColor = dtm::blendUnder(dtmColor, dtm::calculateDTMContourColor(dtmSettings.contourSettings[i], neighbouringCells[i].triangleA.vertices, input.position.xy, height));
+                        dtmColor = dtm::blendUnder(dtmColor, dtm::calculateDTMContourColor(dtmSettings.contourSettings[i], neighbouringCells[i].triangleB.vertices, input.position.xy, height));
+                    }
+                }
             }
             if (dtmSettings.drawOutlineEnabled())
                 dtmColor = dtm::blendUnder(dtmColor, dtm::calculateGridDTMOutlineColor(dtmSettings.outlineLineStyleIdx, outlineLineSegments, input.position.xy, 0.0f));
             if (dtmSettings.drawHeightShadingEnabled() && !outOfBoundsUV)
-                dtmColor = dtm::blendUnder(dtmColor, dtm::calculateDTMHeightColor(dtmSettings.heightShadingSettings, currentTriangleVertices, heightDeriv, input.position.xy, height));
+                dtmColor = dtm::blendUnder(dtmColor, dtm::calculateDTMHeightColor(dtmSettings.heightShadingSettings, currentTriangle.vertices, heightDeriv, input.position.xy, height));
 
             textureColor = dtmColor.rgb / dtmColor.a;
             localAlpha = dtmColor.a;
@@ -569,11 +599,11 @@ float4 fragMain(PSInput input) : SV_TARGET
             }
         }
 
-        uint2 fragCoord = uint2(input.position.xy);
         
         if (localAlpha <= 0)
             discard;
         
+        uint2 fragCoord = uint2(input.position.xy);
         const bool colorFromTexture = objType == ObjectType::STREAMED_IMAGE || objType == ObjectType::STATIC_IMAGE || objType == ObjectType::GRID_DTM;
 
         return calculateFinalColor<DeviceConfigCaps::fragmentShaderPixelInterlock>(fragCoord, localAlpha, currentMainObjectIdx, textureColor, colorFromTexture);
diff --git a/62_CAD/shaders/main_pipeline/vertex_shader.hlsl b/62_CAD/shaders/main_pipeline/vertex_shader.hlsl
index 6aa43cdf6..fd327e7fd 100644
--- a/62_CAD/shaders/main_pipeline/vertex_shader.hlsl
+++ b/62_CAD/shaders/main_pipeline/vertex_shader.hlsl
@@ -652,8 +652,8 @@ PSInput main(uint vertexID : SV_VertexID)
             float gridCellWidth = vk::RawBufferLoad<float>(globals.pointers.geometryBuffer + drawObj.geometryAddress + 2 * sizeof(pfloat64_t2) + sizeof(uint32_t), 8u);
             float thicknessOfTheThickestLine = vk::RawBufferLoad<float>(globals.pointers.geometryBuffer + drawObj.geometryAddress + 2 * sizeof(pfloat64_t2) + sizeof(uint32_t) + sizeof(float), 8u);
 
-            // for testing purpose
-            thicknessOfTheThickestLine += 200.0f;
+            // test large dilation
+            //thicknessOfTheThickestLine += 200.0f;
 
             const float2 corner = float2(bool2(vertexIdx & 0x1u, vertexIdx >> 1));
             worldSpaceExtents.y = ieee754::flipSign(worldSpaceExtents.y);

From 3a487ac9cc933b0866707611e45eb7615813adf2 Mon Sep 17 00:00:00 2001
From: devsh <devsh@devsh.eu>
Date: Tue, 17 Jun 2025 23:47:08 +0200
Subject: [PATCH 378/529] make example 30 run again

---
 30_ComputeShaderPathTracer/main.cpp | 3 +++
 1 file changed, 3 insertions(+)

diff --git a/30_ComputeShaderPathTracer/main.cpp b/30_ComputeShaderPathTracer/main.cpp
index ed93cf81f..201eacaf3 100644
--- a/30_ComputeShaderPathTracer/main.cpp
+++ b/30_ComputeShaderPathTracer/main.cpp
@@ -534,6 +534,9 @@ class ComputeShaderPathtracer final : public examples::SimpleWindowedApplication
 					region.imageExtent = scrambleMapCPU->getCreationParameters().extent;
 
 					scrambleMapCPU->setBufferAndRegions(std::move(texelBuffer), regions);
+
+					// programmatically user-created IPreHashed need to have their hash computed (loaders do it while loading)
+					scrambleMapCPU->setContentHash(scrambleMapCPU->computeContentHash());
 				}
 
 				std::array<ICPUImage*, 2> cpuImgs = { envMapCPU.get(), scrambleMapCPU.get()};

From 8fd7f5d9e8d75a14cbb17a623f60a916355a2e90 Mon Sep 17 00:00:00 2001
From: keptsecret <sorchon@gmail.com>
Date: Wed, 18 Jun 2025 09:52:41 +0700
Subject: [PATCH 379/529] fix ex 11 fft

---
 11_FFT/app_resources/shader.comp.hlsl | 12 ++++++------
 1 file changed, 6 insertions(+), 6 deletions(-)

diff --git a/11_FFT/app_resources/shader.comp.hlsl b/11_FFT/app_resources/shader.comp.hlsl
index ecbf4f092..63a85b0c4 100644
--- a/11_FFT/app_resources/shader.comp.hlsl
+++ b/11_FFT/app_resources/shader.comp.hlsl
@@ -14,13 +14,13 @@ uint32_t3 glsl::gl_WorkGroupSize() { return uint32_t3(uint32_t(ConstevalParamete
 
 struct SharedMemoryAccessor 
 {
-	template <typename IndexType, typename AccessType>
+	template <typename AccessType, typename IndexType>
 	void set(IndexType idx, AccessType value)
 	{
 		sharedmem[idx] = value;
 	}
 
-	template <typename IndexType, typename AccessType>
+	template <typename AccessType, typename IndexType>
 	void get(IndexType idx, NBL_REF_ARG(AccessType) value)
 	{
 		value = sharedmem[idx];
@@ -44,14 +44,14 @@ struct Accessor
     }
 
 	// TODO: can't use our own BDA yet, because it doesn't support the types `workgroup::FFT` will invoke these templates with
-	template <typename AccessType>
-	void get(const uint32_t index, NBL_REF_ARG(AccessType) value)
+	template <typename AccessType, typename IndexType>
+	void get(const IndexType index, NBL_REF_ARG(AccessType) value)
 	{
 		value = vk::RawBufferLoad<AccessType>(address + index * sizeof(AccessType));
 	}
 
-	template <typename AccessType>
-	void set(const uint32_t index, const AccessType value)
+	template <typename AccessType, typename IndexType>
+	void set(const IndexType index, const AccessType value)
 	{
 		vk::RawBufferStore<AccessType>(address + index * sizeof(AccessType), value);
 	}

From e5d4a354946afdb08db81cea3867b08ff2bd0a4b Mon Sep 17 00:00:00 2001
From: kevyuu <kevin.kayu@gmail.com>
Date: Wed, 18 Jun 2025 11:01:28 +0700
Subject: [PATCH 380/529] Fix fft bloom example to use the reworked shader spec
 info interface

---
 28_FFTBloom/main.cpp | 10 ++++------
 1 file changed, 4 insertions(+), 6 deletions(-)

diff --git a/28_FFTBloom/main.cpp b/28_FFTBloom/main.cpp
index 4718a4090..b528d3c41 100644
--- a/28_FFTBloom/main.cpp
+++ b/28_FFTBloom/main.cpp
@@ -723,10 +723,9 @@ class FFTBloomApp final : public examples::SimpleWindowedApplication, public app
 				params[i].layout = pipelineLayout.get();
 				params[i].shader.shader = shaders[i].get();
 				params[i].shader.entryPoint = "main";
-				params[i].shader.stage = hlsl::ShaderStage::ESS_COMPUTE;
 				// Normalization doesn't require full subgroups
-				params[i].shader.requireFullSubgroups = bool(2-i);
-				params[i].shader.requiredSubgroupSize = static_cast<IPipelineBase::SShaderSpecInfo::SUBGROUP_SIZE>(hlsl::findMSB(deviceLimits.maxSubgroupSize));
+				params[i].cached.requireFullSubgroups = bool(2-i);
+				params[i].shader.requiredSubgroupSize = static_cast<IPipelineBase::SUBGROUP_SIZE>(hlsl::findMSB(deviceLimits.maxSubgroupSize));
 			}
 			
 			smart_refctd_ptr<IGPUComputePipeline> pipelines[3];
@@ -928,9 +927,8 @@ class FFTBloomApp final : public examples::SimpleWindowedApplication, public app
 			params[i].layout = pipelineLayout.get();
 			params[i].shader.shader = shaders[i].get();
 			params[i].shader.entryPoint = "main";
-			params[i].shader.stage = hlsl::ShaderStage::ESS_COMPUTE;
-			params[i].shader.requiredSubgroupSize = static_cast<IPipelineBase::SShaderSpecInfo::SUBGROUP_SIZE>(hlsl::findMSB(deviceLimits.maxSubgroupSize));
-			params[i].shader.requireFullSubgroups = true;
+			params[i].shader.requiredSubgroupSize = static_cast<IPipelineBase::SUBGROUP_SIZE>(hlsl::findMSB(deviceLimits.maxSubgroupSize));
+			params[i].cached.requireFullSubgroups = true;
 		}
 
 		smart_refctd_ptr<IGPUComputePipeline> pipelines[3];

From 8ae32e1cbc991c31da7f75d55a9958f188ebba1d Mon Sep 17 00:00:00 2001
From: keptsecret <sorchon@gmail.com>
Date: Wed, 18 Jun 2025 12:06:09 +0700
Subject: [PATCH 381/529] removed redundant barrier

---
 29_Arithmetic2Bench/main.cpp | 30 +++---------------------------
 1 file changed, 3 insertions(+), 27 deletions(-)

diff --git a/29_Arithmetic2Bench/main.cpp b/29_Arithmetic2Bench/main.cpp
index 945749320..e88a59cae 100644
--- a/29_Arithmetic2Bench/main.cpp
+++ b/29_Arithmetic2Bench/main.cpp
@@ -398,31 +398,6 @@ class ArithmeticBenchApp final : public examples::SimpleWindowedApplication, pub
 		cmdbuf->reset(IGPUCommandBuffer::RESET_FLAGS::RELEASE_RESOURCES_BIT);
 		cmdbuf->begin(IGPUCommandBuffer::USAGE::ONE_TIME_SUBMIT_BIT);
 
-		// barrier transition to GENERAL
-		{
-			IGPUCommandBuffer::SPipelineBarrierDependencyInfo::image_barrier_t imageBarriers[1];
-			imageBarriers[0].barrier = {
-				   .dep = {
-					   .srcStageMask = PIPELINE_STAGE_FLAGS::NONE,
-					   .srcAccessMask = ACCESS_FLAGS::NONE,
-					   .dstStageMask = PIPELINE_STAGE_FLAGS::COMPUTE_SHADER_BIT,
-					   .dstAccessMask = ACCESS_FLAGS::SHADER_WRITE_BITS
-					}
-			};
-			imageBarriers[0].image = m_surface->getSwapchainResources()->getImage(m_currentImageAcquire.imageIndex);
-			imageBarriers[0].subresourceRange = {
-				.aspectMask = IImage::EAF_COLOR_BIT,
-				.baseMipLevel = 0u,
-				.levelCount = 1u,
-				.baseArrayLayer = 0u,
-				.layerCount = 1u
-			};
-			imageBarriers[0].oldLayout = IImage::LAYOUT::UNDEFINED;
-			imageBarriers[0].newLayout = IImage::LAYOUT::GENERAL;
-
-			cmdbuf->pipelineBarrier(E_DEPENDENCY_FLAGS::EDF_NONE, { .imgBarriers = imageBarriers });
-		}
-
 		const auto MaxSubgroupSize = m_physicalDevice->getLimits().maxSubgroupSize;
 		const auto SubgroupSizeLog2 = hlsl::findMSB(MaxSubgroupSize);
 
@@ -451,7 +426,7 @@ class ArithmeticBenchApp final : public examples::SimpleWindowedApplication, pub
 				.baseArrayLayer = 0u,
 				.layerCount = 1u
 			};
-			imageBarriers[0].oldLayout = IImage::LAYOUT::GENERAL;
+			imageBarriers[0].oldLayout = IImage::LAYOUT::UNDEFINED;
 			imageBarriers[0].newLayout = IImage::LAYOUT::PRESENT_SRC;
 
 			cmdbuf->pipelineBarrier(E_DEPENDENCY_FLAGS::EDF_NONE, { .imgBarriers = imageBarriers });
@@ -568,7 +543,8 @@ class ArithmeticBenchApp final : public examples::SimpleWindowedApplication, pub
 
 		const uint32_t subgroupSize = 0x1u << subgroupSizeLog2;
 		const uint32_t workgroupSizeLog2 = hlsl::findMSB(workgroupSize);
-		hlsl::workgroup2::SArithmeticConfiguration wgConfig = hlsl::workgroup2::SArithmeticConfiguration::create(workgroupSizeLog2, subgroupSizeLog2, itemsPerInvoc);
+		hlsl::workgroup2::SArithmeticConfiguration wgConfig;
+	    wgConfig.init(workgroupSizeLog2, subgroupSizeLog2, itemsPerInvoc);
 		const uint32_t itemsPerWG = wgConfig.VirtualWorkgroupSize * wgConfig.ItemsPerInvocation_0;
 		smart_refctd_ptr<ICPUShader> overriddenUnspecialized;
 		if constexpr (WorkgroupBench)

From 7121e8b266f1cc59b3d0e56db6248a84e26e26ad Mon Sep 17 00:00:00 2001
From: kevyuu <kevin.kayu@gmail.com>
Date: Wed, 18 Jun 2025 13:04:29 +0700
Subject: [PATCH 382/529] Fix example 05

---
 .../app_resources/shader.comp.hlsl                             | 1 +
 05_StreamingAndBufferDeviceAddressApp/main.cpp                 | 3 ++-
 2 files changed, 3 insertions(+), 1 deletion(-)

diff --git a/05_StreamingAndBufferDeviceAddressApp/app_resources/shader.comp.hlsl b/05_StreamingAndBufferDeviceAddressApp/app_resources/shader.comp.hlsl
index 4aeef0e0f..af38ffada 100644
--- a/05_StreamingAndBufferDeviceAddressApp/app_resources/shader.comp.hlsl
+++ b/05_StreamingAndBufferDeviceAddressApp/app_resources/shader.comp.hlsl
@@ -10,6 +10,7 @@ template<typename capability_traits=nbl::hlsl::jit::device_capabilities_traits>
 void dummyTraitTest() {}
 
 [numthreads(WorkgroupSize,1,1)]
+[shader("compute")]
 void main(uint32_t3 ID : SV_DispatchThreadID)
 {
 	dummyTraitTest();
diff --git a/05_StreamingAndBufferDeviceAddressApp/main.cpp b/05_StreamingAndBufferDeviceAddressApp/main.cpp
index c6c537363..f98e38f66 100644
--- a/05_StreamingAndBufferDeviceAddressApp/main.cpp
+++ b/05_StreamingAndBufferDeviceAddressApp/main.cpp
@@ -102,7 +102,8 @@ class StreamingAndBufferDeviceAddressApp final : public application_templates::M
 					return logFail("Could not load shader!");
 
 				// lets go straight from ICPUSpecializedShader to IGPUSpecializedShader
-				shader = IAsset::castDown<IShader>(assets[0]);
+				const auto shaderSource = IAsset::castDown<IShader>(assets[0]);
+				shader = m_device->compileShader({shaderSource.get()});
 				// The down-cast should not fail!
 				assert(shader);
 			}

From 683aa878ae5c9f252226955e240ab477524339e7 Mon Sep 17 00:00:00 2001
From: keptsecret <sorchon@gmail.com>
Date: Wed, 18 Jun 2025 14:03:41 +0700
Subject: [PATCH 383/529] use builtin bda accessor

---
 .../app_resources/testSubgroup.comp.hlsl         |  2 +-
 .../app_resources/benchmarkSubgroup.comp.hlsl    |  2 +-
 common/include/WorkgroupDataAccessors.hlsl       | 16 ++++++++--------
 3 files changed, 10 insertions(+), 10 deletions(-)

diff --git a/23_Arithmetic2UnitTest/app_resources/testSubgroup.comp.hlsl b/23_Arithmetic2UnitTest/app_resources/testSubgroup.comp.hlsl
index e079e5e63..3b99e5a79 100644
--- a/23_Arithmetic2UnitTest/app_resources/testSubgroup.comp.hlsl
+++ b/23_Arithmetic2UnitTest/app_resources/testSubgroup.comp.hlsl
@@ -7,7 +7,7 @@
 #include "nbl/builtin/hlsl/subgroup2/arithmetic_portability.hlsl"
 
 #include "shaderCommon.hlsl"
-#include "nbl/builtin/hlsl/workgroup/basic.hlsl"
+#include "nbl/builtin/hlsl/workgroup2/basic.hlsl"
 
 typedef vector<uint32_t, ITEMS_PER_INVOCATION> type_t;
 
diff --git a/29_Arithmetic2Bench/app_resources/benchmarkSubgroup.comp.hlsl b/29_Arithmetic2Bench/app_resources/benchmarkSubgroup.comp.hlsl
index 9141ade55..2c102c13d 100644
--- a/29_Arithmetic2Bench/app_resources/benchmarkSubgroup.comp.hlsl
+++ b/29_Arithmetic2Bench/app_resources/benchmarkSubgroup.comp.hlsl
@@ -8,7 +8,7 @@
 #include "nbl/builtin/hlsl/random/xoroshiro.hlsl"
 
 #include "shaderCommon.hlsl"
-#include "nbl/builtin/hlsl/workgroup/basic.hlsl"
+#include "nbl/builtin/hlsl/workgroup2/basic.hlsl"
 
 typedef vector<uint32_t, ITEMS_PER_INVOCATION> type_t;
 
diff --git a/common/include/WorkgroupDataAccessors.hlsl b/common/include/WorkgroupDataAccessors.hlsl
index a274f5c08..7287a4135 100644
--- a/common/include/WorkgroupDataAccessors.hlsl
+++ b/common/include/WorkgroupDataAccessors.hlsl
@@ -1,6 +1,8 @@
 #ifndef _WORKGROUP_DATA_ACCESSORS_HLSL_
 #define _WORKGROUP_DATA_ACCESSORS_HLSL_
 
+#include "nbl/builtin/hlsl/bda/legacy_bda_accessor.hlsl"
+
 namespace nbl
 {
 namespace hlsl
@@ -35,25 +37,25 @@ template<uint16_t VirtualWorkgroupSize, uint16_t ItemsPerInvocation>
 struct DataProxy
 {
     using dtype_t = vector<uint32_t, ItemsPerInvocation>;
+    // function template AccessType should be the same as dtype_t
 
     static DataProxy<VirtualWorkgroupSize, ItemsPerInvocation> create(const uint64_t inputBuf, const uint64_t outputBuf)
     {
         DataProxy<VirtualWorkgroupSize, ItemsPerInvocation> retval;
-        retval.workgroupOffset = glsl::gl_WorkGroupID().x * VirtualWorkgroupSize;
-        retval.inputBufAddr = inputBuf;
-        retval.outputBufAddr = outputBuf;
+        const uint32_t workgroupOffset = glsl::gl_WorkGroupID().x * VirtualWorkgroupSize * sizeof(dtype_t);
+        retval.accessor = DoubleLegacyBdaAccessor<dtype_t>::create(inputBuf + workgroupOffset, outputBuf + workgroupOffset);
         return retval;
     }
 
     template<typename AccessType, typename IndexType>
     void get(const IndexType ix, NBL_REF_ARG(AccessType) value)
     {
-        value = vk::RawBufferLoad<AccessType>(inputBufAddr + (workgroupOffset + ix) * sizeof(AccessType));
+        accessor.get(ix, value);
     }
     template<typename AccessType, typename IndexType>
     void set(const IndexType ix, const AccessType value)
     {
-        vk::RawBufferStore<AccessType>(outputBufAddr + (workgroupOffset + ix) * sizeof(AccessType), value, sizeof(uint32_t));
+        accessor.set(ix, value);
     }
 
     void workgroupExecutionAndMemoryBarrier()
@@ -62,9 +64,7 @@ struct DataProxy
         //glsl::memoryBarrierShared(); implied by the above
     }
 
-    uint32_t workgroupOffset;
-    uint64_t inputBufAddr;
-    uint64_t outputBufAddr;
+    DoubleLegacyBdaAccessor<dtype_t> accessor;
 };
 
 template<uint16_t WorkgroupSizeLog2, uint16_t VirtualWorkgroupSize, uint16_t ItemsPerInvocation>

From 1c56eb05453fb7c3ba3c03c7e8d130279c8f4873 Mon Sep 17 00:00:00 2001
From: keptsecret <sorchon@gmail.com>
Date: Wed, 18 Jun 2025 14:33:30 +0700
Subject: [PATCH 384/529] reduce workgroup macro definitions, use config string

---
 .../app_resources/shaderCommon.hlsl           | 11 -----
 .../app_resources/testWorkgroup.comp.hlsl     |  8 ++--
 23_Arithmetic2UnitTest/main.cpp               | 44 +++++++++----------
 3 files changed, 24 insertions(+), 39 deletions(-)

diff --git a/23_Arithmetic2UnitTest/app_resources/shaderCommon.hlsl b/23_Arithmetic2UnitTest/app_resources/shaderCommon.hlsl
index 6b9575ccd..3793b08f8 100644
--- a/23_Arithmetic2UnitTest/app_resources/shaderCommon.hlsl
+++ b/23_Arithmetic2UnitTest/app_resources/shaderCommon.hlsl
@@ -3,13 +3,6 @@
 using namespace nbl;
 using namespace hlsl;
 
-// https://github.com/microsoft/DirectXShaderCompiler/issues/6144
-uint32_t3 nbl::hlsl::glsl::gl_WorkGroupSize() {return uint32_t3(WORKGROUP_SIZE,1,1);}
-
-#ifndef ITEMS_PER_INVOCATION
-#error "Define ITEMS_PER_INVOCATION!"
-#endif
-
 [[vk::push_constant]] PushConstantData pc;
 
 struct device_capabilities
@@ -24,7 +17,3 @@ struct device_capabilities
 #ifndef OPERATION
 #error "Define OPERATION!"
 #endif
-
-#ifndef SUBGROUP_SIZE_LOG2
-#error "Define SUBGROUP_SIZE_LOG2!"
-#endif
diff --git a/23_Arithmetic2UnitTest/app_resources/testWorkgroup.comp.hlsl b/23_Arithmetic2UnitTest/app_resources/testWorkgroup.comp.hlsl
index 38e8b250f..2a32ed20e 100644
--- a/23_Arithmetic2UnitTest/app_resources/testWorkgroup.comp.hlsl
+++ b/23_Arithmetic2UnitTest/app_resources/testWorkgroup.comp.hlsl
@@ -5,12 +5,10 @@
 #include "nbl/builtin/hlsl/subgroup2/arithmetic_portability.hlsl"
 #include "nbl/builtin/hlsl/workgroup2/arithmetic.hlsl"
 
-static const uint32_t WORKGROUP_SIZE = 1u << WORKGROUP_SIZE_LOG2;
+using config_t = WORKGROUP_CONFIG_T;
 
 #include "shaderCommon.hlsl"
 
-using config_t = workgroup2::ArithmeticConfiguration<WORKGROUP_SIZE_LOG2, SUBGROUP_SIZE_LOG2, ITEMS_PER_INVOCATION>;
-
 typedef vector<uint32_t, config_t::ItemsPerInvocation_0> type_t;
 
 // final (level 1/2) scan needs to fit in one subgroup exactly
@@ -52,7 +50,7 @@ struct operation_t
 template<class Binop>
 static void subtest()
 {
-    assert(glsl::gl_SubgroupSize() == 1u<<SUBGROUP_SIZE_LOG2)
+    assert(glsl::gl_SubgroupSize() == config_t::SubgroupSize)
 
     operation_t<Binop,device_capabilities> func;
     func();
@@ -69,7 +67,7 @@ void test()
     subtest<arithmetic::maximum<uint32_t> >();
 }
 
-[numthreads(WORKGROUP_SIZE,1,1)]
+[numthreads(config_t::WorkgroupSize,1,1)]
 void main()
 {
     test();
diff --git a/23_Arithmetic2UnitTest/main.cpp b/23_Arithmetic2UnitTest/main.cpp
index 6c979d7e5..51847e710 100644
--- a/23_Arithmetic2UnitTest/main.cpp
+++ b/23_Arithmetic2UnitTest/main.cpp
@@ -186,7 +186,7 @@ class Workgroup2ScanTestApp final : public application_templates::BasicMultiQueu
 			for (auto subgroupSize = MinSubgroupSize; subgroupSize <= MaxSubgroupSize; subgroupSize *= 2u)
 			{
 				const uint8_t subgroupSizeLog2 = hlsl::findMSB(subgroupSize);
-				for (uint32_t workgroupSize = subgroupSize; workgroupSize <= MaxWorkgroupSize; workgroupSize *= 2u)
+				for (uint32_t workgroupSize = 64; workgroupSize <= MaxWorkgroupSize; workgroupSize *= 2u)
 				{
 					// make sure renderdoc captures everything for debugging
 					m_api->startCapture();
@@ -198,14 +198,15 @@ class Workgroup2ScanTestApp final : public application_templates::BasicMultiQueu
 						uint32_t itemsPerWG = workgroupSize * itemsPerInvocation;
 						m_logger->log("Testing Items per Invocation %u", ILogger::ELL_INFO, itemsPerInvocation);
 						bool passed = true;
-						passed = runTest<emulatedReduction, false>(subgroupTestSource, elementCount, subgroupSizeLog2, workgroupSize, bool(useNative), itemsPerWG, itemsPerInvocation) && passed;
-						logTestOutcome(passed, itemsPerWG);
-						passed = runTest<emulatedScanInclusive, false>(subgroupTestSource, elementCount, subgroupSizeLog2, workgroupSize, bool(useNative), itemsPerWG, itemsPerInvocation) && passed;
-						logTestOutcome(passed, itemsPerWG);
-						passed = runTest<emulatedScanExclusive, false>(subgroupTestSource, elementCount, subgroupSizeLog2, workgroupSize, bool(useNative), itemsPerWG, itemsPerInvocation) && passed;
-						logTestOutcome(passed, itemsPerWG);
-
-						hlsl::workgroup2::SArithmeticConfiguration wgConfig = hlsl::workgroup2::SArithmeticConfiguration::create(hlsl::findMSB(workgroupSize), subgroupSizeLog2, itemsPerInvocation);
+						//passed = runTest<emulatedReduction, false>(subgroupTestSource, elementCount, subgroupSizeLog2, workgroupSize, bool(useNative), itemsPerWG, itemsPerInvocation) && passed;
+						//logTestOutcome(passed, itemsPerWG);
+						//passed = runTest<emulatedScanInclusive, false>(subgroupTestSource, elementCount, subgroupSizeLog2, workgroupSize, bool(useNative), itemsPerWG, itemsPerInvocation) && passed;
+						//logTestOutcome(passed, itemsPerWG);
+						//passed = runTest<emulatedScanExclusive, false>(subgroupTestSource, elementCount, subgroupSizeLog2, workgroupSize, bool(useNative), itemsPerWG, itemsPerInvocation) && passed;
+						//logTestOutcome(passed, itemsPerWG);
+
+						hlsl::workgroup2::SArithmeticConfiguration wgConfig;
+					    wgConfig.init(hlsl::findMSB(workgroupSize), subgroupSizeLog2, itemsPerInvocation);
 						itemsPerWG = wgConfig.VirtualWorkgroupSize * wgConfig.ItemsPerInvocation_0;
 						m_logger->log("Testing Item Count %u", ILogger::ELL_INFO, itemsPerWG);
 						passed = runTest<emulatedReduction, true>(workgroupTestSource, elementCount, subgroupSizeLog2, workgroupSize, bool(useNative), itemsPerWG, itemsPerInvocation) && passed;
@@ -306,28 +307,25 @@ class Workgroup2ScanTestApp final : public application_templates::BasicMultiQueu
 		smart_refctd_ptr<ICPUShader> overriddenUnspecialized;
 		if constexpr (WorkgroupTest)
 		{
-			const std::string definitions[6] = {
+			hlsl::workgroup2::SArithmeticConfiguration wgConfig;
+			wgConfig.init(hlsl::findMSB(workgroupSize), subgroupSizeLog2, itemsPerInvoc);
+
+			const std::string definitions[3] = {
 				"workgroup2::" + arith_name,
-				std::to_string(workgroupSizeLog2),
-				std::to_string(itemsPerWG),
-				std::to_string(itemsPerInvoc),
-				std::to_string(subgroupSizeLog2),
+				wgConfig.getConfigTemplateStructString(),
 				std::to_string(arith_name=="reduction")
 			};
 
-			const IShaderCompiler::SMacroDefinition defines[7] = {
+			const IShaderCompiler::SMacroDefinition defines[4] = {
 				{ "OPERATION", definitions[0] },
-				{ "WORKGROUP_SIZE_LOG2", definitions[1] },
-				{ "ITEMS_PER_WG", definitions[2] },
-				{ "ITEMS_PER_INVOCATION", definitions[3] },
-				{ "SUBGROUP_SIZE_LOG2", definitions[4] },
-				{ "IS_REDUCTION", definitions[5] },
+				{ "WORKGROUP_CONFIG_T", definitions[1] },
+				{ "IS_REDUCTION", definitions[2] },
 				{ "TEST_NATIVE", "1" }
 			};
 			if (useNative)
-				options.preprocessorOptions.extraDefines = { defines, defines + 7 };
+				options.preprocessorOptions.extraDefines = { defines, defines + 4 };
 			else
-				options.preprocessorOptions.extraDefines = { defines, defines + 6 };
+				options.preprocessorOptions.extraDefines = { defines, defines + 3 };
 
 			overriddenUnspecialized = compiler->compileToSPIRV((const char*)source->getContent()->getPointer(), options);
 		}
@@ -358,7 +356,7 @@ class Workgroup2ScanTestApp final : public application_templates::BasicMultiQueu
 		auto pipeline = createPipeline(overriddenUnspecialized.get(),subgroupSizeLog2);
 
 		// TODO: overlap dispatches with memory readbacks (requires multiple copies of `buffers`)
-		uint32_t workgroupCount = min(elementCount / itemsPerWG, m_physicalDevice->getLimits().maxComputeWorkGroupCount[0]);
+		uint32_t workgroupCount = 1;// min(elementCount / itemsPerWG, m_physicalDevice->getLimits().maxComputeWorkGroupCount[0]);
 
 		cmdbuf->begin(IGPUCommandBuffer::USAGE::NONE);
 		cmdbuf->bindComputePipeline(pipeline.get());

From 3e910b55890aac6ccc3fcd9e1c43b5f5ee84b0df Mon Sep 17 00:00:00 2001
From: keptsecret <sorchon@gmail.com>
Date: Wed, 18 Jun 2025 15:16:51 +0700
Subject: [PATCH 385/529] similar config string thing but for subgroups

---
 .../app_resources/testSubgroup.comp.hlsl      | 29 +++++++++--------
 23_Arithmetic2UnitTest/main.cpp               | 32 ++++++++++---------
 2 files changed, 32 insertions(+), 29 deletions(-)

diff --git a/23_Arithmetic2UnitTest/app_resources/testSubgroup.comp.hlsl b/23_Arithmetic2UnitTest/app_resources/testSubgroup.comp.hlsl
index 3b99e5a79..3105aec56 100644
--- a/23_Arithmetic2UnitTest/app_resources/testSubgroup.comp.hlsl
+++ b/23_Arithmetic2UnitTest/app_resources/testSubgroup.comp.hlsl
@@ -5,28 +5,29 @@
 #include "nbl/builtin/hlsl/glsl_compat/core.hlsl"
 #include "nbl/builtin/hlsl/glsl_compat/subgroup_basic.hlsl"
 #include "nbl/builtin/hlsl/subgroup2/arithmetic_portability.hlsl"
+#include "nbl/builtin/hlsl/subgroup2/arithmetic_params.hlsl"
 
 #include "shaderCommon.hlsl"
 #include "nbl/builtin/hlsl/workgroup2/basic.hlsl"
 
-typedef vector<uint32_t, ITEMS_PER_INVOCATION> type_t;
+template<class Binop, class device_capabilities>
+using params_t = SUBGROUP_CONFIG_T;
+
+typedef vector<uint32_t, params_t<typename arithmetic::bit_and<uint32_t>::base_t, device_capabilities>::ItemsPerInvocation> type_t;
 
 uint32_t globalIndex()
 {
     return glsl::gl_WorkGroupID().x*WORKGROUP_SIZE+workgroup::SubgroupContiguousIndex();
 }
 
-template<class Binop, uint32_t N>
+template<class Binop>
 static void subtest(NBL_CONST_REF_ARG(type_t) sourceVal)
 {
-    using config_t = subgroup2::Configuration<SUBGROUP_SIZE_LOG2>;
-    using params_t = subgroup2::ArithmeticParams<config_t, typename Binop::base_t, N, device_capabilities>;
-
     const uint64_t outputBufAddr = pc.pOutputBuf[Binop::BindingIndex];
 
-    assert(glsl::gl_SubgroupSize() == 1u<<SUBGROUP_SIZE_LOG2)
+    assert(glsl::gl_SubgroupSize() == params_t<typename Binop::base_t, device_capabilities>::config_t::Size)
 
-    operation_t<params_t> func;
+    operation_t<params_t<typename Binop::base_t, device_capabilities> > func;
     type_t val = func(sourceVal);
 
     vk::RawBufferStore<type_t>(outputBufAddr + sizeof(type_t) * globalIndex(), val, sizeof(uint32_t));
@@ -37,13 +38,13 @@ type_t test()
     const uint32_t idx = globalIndex();
     type_t sourceVal = vk::RawBufferLoad<type_t>(pc.pInputBuf + idx * sizeof(type_t));
 
-    subtest<arithmetic::bit_and<uint32_t>, ITEMS_PER_INVOCATION>(sourceVal);
-    subtest<arithmetic::bit_xor<uint32_t>, ITEMS_PER_INVOCATION>(sourceVal);
-    subtest<arithmetic::bit_or<uint32_t>, ITEMS_PER_INVOCATION>(sourceVal);
-    subtest<arithmetic::plus<uint32_t>, ITEMS_PER_INVOCATION>(sourceVal);
-    subtest<arithmetic::multiplies<uint32_t>, ITEMS_PER_INVOCATION>(sourceVal);
-    subtest<arithmetic::minimum<uint32_t>, ITEMS_PER_INVOCATION>(sourceVal);
-    subtest<arithmetic::maximum<uint32_t>, ITEMS_PER_INVOCATION>(sourceVal);
+    subtest<arithmetic::bit_and<uint32_t> >(sourceVal);
+    subtest<arithmetic::bit_xor<uint32_t> >(sourceVal);
+    subtest<arithmetic::bit_or<uint32_t> >(sourceVal);
+    subtest<arithmetic::plus<uint32_t> >(sourceVal);
+    subtest<arithmetic::multiplies<uint32_t> >(sourceVal);
+    subtest<arithmetic::minimum<uint32_t> >(sourceVal);
+    subtest<arithmetic::maximum<uint32_t> >(sourceVal);
     return sourceVal;
 }
 
diff --git a/23_Arithmetic2UnitTest/main.cpp b/23_Arithmetic2UnitTest/main.cpp
index 51847e710..65ef126ad 100644
--- a/23_Arithmetic2UnitTest/main.cpp
+++ b/23_Arithmetic2UnitTest/main.cpp
@@ -2,6 +2,7 @@
 #include "nbl/application_templates/MonoAssetManagerAndBuiltinResourceApplication.hpp"
 #include "app_resources/common.hlsl"
 #include "nbl/builtin/hlsl/workgroup2/arithmetic_config.hlsl"
+#include "nbl/builtin/hlsl/subgroup2/arithmetic_params.hlsl"
 
 using namespace nbl;
 using namespace core;
@@ -186,7 +187,7 @@ class Workgroup2ScanTestApp final : public application_templates::BasicMultiQueu
 			for (auto subgroupSize = MinSubgroupSize; subgroupSize <= MaxSubgroupSize; subgroupSize *= 2u)
 			{
 				const uint8_t subgroupSizeLog2 = hlsl::findMSB(subgroupSize);
-				for (uint32_t workgroupSize = 64; workgroupSize <= MaxWorkgroupSize; workgroupSize *= 2u)
+				for (uint32_t workgroupSize = subgroupSize; workgroupSize <= MaxWorkgroupSize; workgroupSize *= 2u)
 				{
 					// make sure renderdoc captures everything for debugging
 					m_api->startCapture();
@@ -198,12 +199,12 @@ class Workgroup2ScanTestApp final : public application_templates::BasicMultiQueu
 						uint32_t itemsPerWG = workgroupSize * itemsPerInvocation;
 						m_logger->log("Testing Items per Invocation %u", ILogger::ELL_INFO, itemsPerInvocation);
 						bool passed = true;
-						//passed = runTest<emulatedReduction, false>(subgroupTestSource, elementCount, subgroupSizeLog2, workgroupSize, bool(useNative), itemsPerWG, itemsPerInvocation) && passed;
-						//logTestOutcome(passed, itemsPerWG);
-						//passed = runTest<emulatedScanInclusive, false>(subgroupTestSource, elementCount, subgroupSizeLog2, workgroupSize, bool(useNative), itemsPerWG, itemsPerInvocation) && passed;
-						//logTestOutcome(passed, itemsPerWG);
-						//passed = runTest<emulatedScanExclusive, false>(subgroupTestSource, elementCount, subgroupSizeLog2, workgroupSize, bool(useNative), itemsPerWG, itemsPerInvocation) && passed;
-						//logTestOutcome(passed, itemsPerWG);
+						passed = runTest<emulatedReduction, false>(subgroupTestSource, elementCount, subgroupSizeLog2, workgroupSize, bool(useNative), itemsPerWG, itemsPerInvocation) && passed;
+						logTestOutcome(passed, itemsPerWG);
+						passed = runTest<emulatedScanInclusive, false>(subgroupTestSource, elementCount, subgroupSizeLog2, workgroupSize, bool(useNative), itemsPerWG, itemsPerInvocation) && passed;
+						logTestOutcome(passed, itemsPerWG);
+						passed = runTest<emulatedScanExclusive, false>(subgroupTestSource, elementCount, subgroupSizeLog2, workgroupSize, bool(useNative), itemsPerWG, itemsPerInvocation) && passed;
+						logTestOutcome(passed, itemsPerWG);
 
 						hlsl::workgroup2::SArithmeticConfiguration wgConfig;
 					    wgConfig.init(hlsl::findMSB(workgroupSize), subgroupSizeLog2, itemsPerInvocation);
@@ -331,24 +332,25 @@ class Workgroup2ScanTestApp final : public application_templates::BasicMultiQueu
 		}
 		else
 		{
-			const std::string definitions[4] = { 
+			hlsl::subgroup2::SArithmeticParams sgParams;
+			sgParams.init(subgroupSizeLog2, itemsPerInvoc);
+
+			const std::string definitions[3] = { 
 				"subgroup2::" + arith_name,
 				std::to_string(workgroupSize),
-				std::to_string(itemsPerInvoc),
-				std::to_string(subgroupSizeLog2)
+				sgParams.getParamTemplateStructString()
 			};
 
-			const IShaderCompiler::SMacroDefinition defines[5] = {
+			const IShaderCompiler::SMacroDefinition defines[4] = {
 				{ "OPERATION", definitions[0] },
 				{ "WORKGROUP_SIZE", definitions[1] },
-				{ "ITEMS_PER_INVOCATION", definitions[2] },
-				{ "SUBGROUP_SIZE_LOG2", definitions[3] },
+				{ "SUBGROUP_CONFIG_T", definitions[2] },
 				{ "TEST_NATIVE", "1" }
 			};
 			if (useNative)
-				options.preprocessorOptions.extraDefines = { defines, defines + 5 };
-			else
 				options.preprocessorOptions.extraDefines = { defines, defines + 4 };
+			else
+				options.preprocessorOptions.extraDefines = { defines, defines + 3 };
 
 			overriddenUnspecialized = compiler->compileToSPIRV((const char*)source->getContent()->getPointer(), options);
 		}

From c446d7ef3cd2d6773fe2bf261d6d867341ebeff1 Mon Sep 17 00:00:00 2001
From: devsh <devsh@devsh.eu>
Date: Wed, 18 Jun 2025 10:18:01 +0200
Subject: [PATCH 386/529] prep for the final stretch in ex 09 (shaders)

---
 09_GeometryCreator/main.cpp                   |  71 ++---
 .../geometry/CGeometryCreatorScene.hpp        | 301 +++++++-----------
 .../nbl/examples/geometry/SPushConstants.hlsl |  10 +-
 3 files changed, 142 insertions(+), 240 deletions(-)

diff --git a/09_GeometryCreator/main.cpp b/09_GeometryCreator/main.cpp
index a98dcee5b..5bbe40f37 100644
--- a/09_GeometryCreator/main.cpp
+++ b/09_GeometryCreator/main.cpp
@@ -44,7 +44,7 @@ class GeometryCreatorApp final : public MonoWindowApplication
 			patch.positionBufferUsages = usage_f::EUF_UNIFORM_TEXEL_BUFFER_BIT;
 			patch.indexBufferUsages = usage_f::EUF_INDEX_BUFFER_BIT;
 			patch.otherBufferUsages = usage_f::EUF_UNIFORM_TEXEL_BUFFER_BIT;
-			auto scene = CGeometryCreatorScene::create(
+			m_scene = CGeometryCreatorScene::create(
 				{
 					.transferQueue = getTransferUpQueue(),
 					.utilities = m_utils.get(),
@@ -54,7 +54,9 @@ class GeometryCreatorApp final : public MonoWindowApplication
 			);
 			
 			auto scRes = static_cast<CDefaultSwapchainFramebuffers*>(m_surface->getSwapchainResources());
-			auto renderer = CSimpleDebugRenderer::create(scRes->getRenderpass(),0,scene.get());
+			m_renderer = CSimpleDebugRenderer::create(scRes->getRenderpass(),0,m_scene.get());
+			if (!m_renderer)
+				return logFail("Could not create Renderer!");
 
 			// camera
 			{
@@ -84,30 +86,8 @@ class GeometryCreatorApp final : public MonoWindowApplication
 				mouse.consumeEvents([&](const IMouseEventChannel::range_t& events) -> void { camera.mouseProcess(events); mouseProcess(events); }, m_logger.get());
 				keyboard.consumeEvents([&](const IKeyboardEventChannel::range_t& events) -> void { camera.keyboardProcess(events); }, m_logger.get());
 				camera.endInputProcessing(nextPresentationTimestamp);
-#if 0
-				const auto type = static_cast<ObjectType>(gcIndex);
-				const auto& [gpu, meta] = resources.objects[type];
-
-				object.meta.type = type;
-				object.meta.name = meta.name;
-#endif
 			}
 
-			const auto viewMatrix = camera.getViewMatrix();
-			const auto viewProjectionMatrix = camera.getConcatenatedMatrix();
-#if 0
-			SBasicViewParameters uboData;
-			memcpy(uboData.MVP, modelViewProjectionMatrix.pointer(), sizeof(uboData.MVP));
-			memcpy(uboData.MV, modelViewMatrix.pointer(), sizeof(uboData.MV));
-			memcpy(uboData.NormalMat, normalMatrix.pointer(), sizeof(uboData.NormalMat));
-			{
-				SBufferRange<IGPUBuffer> range;
-				range.buffer = core::smart_refctd_ptr(resources.ubo.buffer);
-				range.size = resources.ubo.buffer->getSize();
-
-				cb->updateBuffer(range, &uboData);
-			}
-#endif
 			auto* queue = getGraphicsQueue();
 
 			asset::SViewport viewport;
@@ -148,24 +128,17 @@ class GeometryCreatorApp final : public MonoWindowApplication
 
 				cb->beginRenderPass(info, IGPUCommandBuffer::SUBPASS_CONTENTS::INLINE);
 			}
-#if 0
-			const auto& [hook, meta] = resources.objects[object.meta.type];
-			auto* rawPipeline = hook.pipeline.get();
-
-			SBufferBinding<const IGPUBuffer> vertex = hook.bindings.vertex, index = hook.bindings.index;
 
-			cb->bindGraphicsPipeline(rawPipeline);
-			cb->bindDescriptorSets(EPBP_GRAPHICS, rawPipeline->getLayout(), 1, 1, &resources.descriptorSet.get());
-			cb->bindVertexBuffers(0, 1, &vertex);
-
-			if (index.buffer && hook.indexType != EIT_UNKNOWN)
+			float32_t3x4 viewMatrix;
+			float32_t4x4 viewProjMatrix;
+			// TODO: get rid of legacy matrices
 			{
-				cb->bindIndexBuffer(index, hook.indexType);
-				cb->drawIndexed(hook.indexCount, 1, 0, 0, 0);
+				memcpy(&viewMatrix,camera.getViewMatrix().pointer(),sizeof(viewMatrix));
+				memcpy(&viewProjMatrix,camera.getConcatenatedMatrix().pointer(),sizeof(viewMatrix));
 			}
-			else
-				cb->draw(hook.indexCount, 1, 0, 0);
-#endif
+			const auto viewParams = CSimpleDebugRenderer::SViewParams(viewMatrix,viewProjMatrix);
+			m_renderer->render(cb,viewParams);
+
 			cb->endRenderPass();
 			cb->end();
 
@@ -199,7 +172,9 @@ class GeometryCreatorApp final : public MonoWindowApplication
 
 			std::string caption = "[Nabla Engine] Geometry Creator";
 			{
-//					caption += ", displaying [" + std::string(object.meta.name.data()) + "]";
+				caption += ", displaying [" + 
+				caption += m_scene->getGeometries()[gcIndex].name;
+				caption += "]";
 				m_window->setCaption(caption);
 			}
 			return retval;
@@ -246,17 +221,20 @@ class GeometryCreatorApp final : public MonoWindowApplication
 		}
 
 	private:
+		//
+		smart_refctd_ptr<CGeometryCreatorScene> m_scene;
+		smart_refctd_ptr<CSimpleDebugRenderer> m_renderer;
+		//
 		smart_refctd_ptr<ISemaphore> m_semaphore;
 		uint64_t m_realFrameIx = 0;
 		std::array<smart_refctd_ptr<IGPUCommandBuffer>,base_t::MaxFramesInFlight> m_cmdBufs;
-
+		//
 		InputSystem::ChannelReader<IMouseEventChannel> mouse;
 		InputSystem::ChannelReader<IKeyboardEventChannel> keyboard;
 
+		//
 		Camera camera = Camera(core::vectorSIMDf(0, 0, 0), core::vectorSIMDf(0, 0, 0), core::matrix4SIMD());
 
-//		ResourcesBundle resources;
-//		ObjectDrawHookCpu object;
 		uint16_t gcIndex = {};
 
 		void mouseProcess(const nbl::ui::IMouseEventChannel::range_t& events)
@@ -265,8 +243,11 @@ class GeometryCreatorApp final : public MonoWindowApplication
 			{
 				auto ev = *eventIt;
 
-				if (ev.type == nbl::ui::SMouseEvent::EET_SCROLL)
-					gcIndex = std::clamp<uint16_t>(int16_t(gcIndex) + int16_t(core::sign(ev.scrollEvent.verticalScroll)), int64_t(0), int64_t(CGeometryCreatorScene::OT_COUNT - (uint8_t)1u));
+				if (ev.type==nbl::ui::SMouseEvent::EET_SCROLL && m_renderer)
+				{
+					gcIndex += int16_t(core::sign(ev.scrollEvent.verticalScroll));
+					gcIndex = core::clamp(gcIndex,0ull,m_renderer->getInitParams().geoms.size());
+				}
 			}
 		}
 };
diff --git a/common/include/nbl/examples/geometry/CGeometryCreatorScene.hpp b/common/include/nbl/examples/geometry/CGeometryCreatorScene.hpp
index 1f8d1ac6a..74b5d02d8 100644
--- a/common/include/nbl/examples/geometry/CGeometryCreatorScene.hpp
+++ b/common/include/nbl/examples/geometry/CGeometryCreatorScene.hpp
@@ -3,15 +3,8 @@
 
 
 #include <nabla.h>
-#include "nbl/builtin/hlsl/math/linalg/fast_affine.hlsl"
 #include "nbl/asset/utils/CGeometryCreator.h"
 
-#include "nbl/examples/geometry/SPushConstants.hlsl"
-
-// TODO: Arek bring back
-//#include "nbl/examples/geometry/spirv/builtin/CArchive.h"
-//#include "nbl/examples/geometry/spirv/builtin/builtinResources.h"
-
 
 namespace nbl::examples
 {
@@ -201,39 +194,84 @@ class CGeometryCreatorScene : public core::IReferenceCounted
 		core::vector<SNamedGeometry> m_geometries;
 };
 
+}
+//!
+
+
+#include "nbl/builtin/hlsl/math/linalg/fast_affine.hlsl"
+#include "nbl/examples/geometry/SPushConstants.hlsl"
+
+// TODO: Arek bring back
+//#include "nbl/examples/geometry/spirv/builtin/CArchive.h"
+//#include "nbl/examples/geometry/spirv/builtin/builtinResources.h"
+
+
+namespace nbl::examples
+{
+
 class CSimpleDebugRenderer final : public core::IReferenceCounted
 {
 	public:
 		//
 		constexpr static inline auto DescriptorCount = 255;
 		//
-		using SPushConstants = hlsl::examples::geometry_creator_scene::SPushConstants;
-		struct SPackedGeometry
+		struct SViewParams
 		{
-			inline SPushConstants convert(const hlsl::float32_t3x4& model, const hlsl::float32_t3x4& view, const hlsl::float32_t4x4& viewProj)
+			inline SViewParams(const hlsl::float32_t3x4& _view, const hlsl::float32_t4x4& _viewProj)
 			{
-				using namespace hlsl;
-				return {
-					.basic = {
-						.MVP = math::linalg::promoted_mul<float32_t,4,4>(viewProj,model),
-						.MV = math::linalg::promoted_mul<float32_t,3,4>(view,model),
-						.normalMat = inverse(transpose(float32_t3x3(view)))
-					},
-					.positionView = positionView,
-					.normalView = normalView,
-					.uvView = uvView
+				view = _view;
+				viewProj = _viewProj;
+				using namespace nbl::hlsl;
+				normal = transpose(inverse(float32_t3x3(view)));
+			}
+
+			inline auto computeForInstance(hlsl::float32_t3x4 world) const
+			{
+				using namespace nbl::hlsl;
+				hlsl::examples::geometry_creator_scene::SInstanceMatrices retval = {
+					.worldViewProj = float32_t4x4(math::linalg::promoted_mul(float64_t4x4(viewProj),float64_t3x4(world)))
 				};
+				const auto sub3x3 = mul(float64_t3x3(viewProj),float64_t3x3(world));
+				retval.normal = float32_t3x3(transpose(inverse(sub3x3)));
+				return retval;
 			}
 
+			hlsl::float32_t3x4 view;
+			hlsl::float32_t4x4 viewProj;
+			hlsl::float32_t3x3 normal;
+		};
+		//
+		struct SPackedGeometry
+		{
+			core::smart_refctd_ptr<const video::IGPUGraphicsPipeline> pipeline = {};
 			asset::SBufferBinding<const video::IGPUBuffer> indexBuffer = {};
 			uint32_t elementCount = 0;
 			// indices into the descriptor set
 			uint8_t positionView = 0;
 			uint8_t normalView = 0;
 			uint8_t uvView = 0;
-			uint8_t indexType = asset::EIT_UNKNOWN;
+			asset::E_INDEX_TYPE indexType = asset::EIT_UNKNOWN;
 		};
+		//
+		struct SInstance
+		{
+			using SPushConstants = hlsl::examples::geometry_creator_scene::SPushConstants;
+			inline SPushConstants computePushConstants(const SViewParams& viewParams) const
+			{
+				using namespace hlsl;
+				return {
+					.matrices = viewParams.computeForInstance(world),
+					.positionView = packedGeo->positionView,
+					.normalView = packedGeo->normalView,
+					.uvView = packedGeo->uvView
+				};
+			}
 
+			hlsl::float32_t3x4 world;
+			const SPackedGeometry* packedGeo;
+		};
+
+		//
 		static inline core::smart_refctd_ptr<CSimpleDebugRenderer> create(video::IGPURenderpass* renderpass, const uint32_t subpassIX, const CGeometryCreatorScene* scene)
 		{
 			EXPOSE_NABLA_NAMESPACES;
@@ -249,8 +287,6 @@ class CSimpleDebugRenderer final : public core::IReferenceCounted
 			if (namedGeoms.empty())
 				return nullptr;
 
-			// TODO: Load Shaders and Create Pipelines
-
 			SInitParams init;
 
 			// create descriptor set
@@ -287,6 +323,19 @@ class CSimpleDebugRenderer final : public core::IReferenceCounted
 				}
 			}
 
+			//
+			const SPushConstantRange ranges[] = {{
+				.stageFlags = hlsl::ShaderStage::ESS_VERTEX,
+				.offset = 0,
+				.size = sizeof(SInstance::SPushConstants),
+			}};
+			init.layout = device->createPipelineLayout(ranges,smart_refctd_ptr<const IGPUDescriptorSetLayout>(init.ds->getLayout()));
+
+			// TODO: Load Shaders and Create Pipelines
+			{
+				//
+			}
+
 			// write geometries' attributes to descriptor set
 			{
 				core::vector<IGPUDescriptorSet::SDescriptorInfo> infos;
@@ -335,15 +384,48 @@ class CSimpleDebugRenderer final : public core::IReferenceCounted
 			return smart_refctd_ptr<CSimpleDebugRenderer>(new CSimpleDebugRenderer(std::move(init)),dont_grab);
 		}
 
-
-	protected:
+		//
 		struct SInitParams
 		{
 			core::smart_refctd_ptr<video::IGPUDescriptorSet> ds;
+			core::smart_refctd_ptr<video::IGPUPipelineLayout> layout;
 			core::vector<SPackedGeometry> geoms;
-		} m_params;
+		};
+		inline const SInitParams& getInitParams() const {return m_params;}
 
+		//
+		inline void render(video::IGPUCommandBuffer* cmdbuf, const SViewParams& viewParams) const
+		{
+			EXPOSE_NABLA_NAMESPACES;
+
+			cmdbuf->beginDebugMarker("CSimpleDebugRenderer::render");
+
+			const auto* layout = m_params.layout.get();
+			cmdbuf->bindDescriptorSets(E_PIPELINE_BIND_POINT::EPBP_GRAPHICS,layout,0,1,&m_params.ds.get());
+
+			for (const auto& instance : m_instances)
+			{
+				const auto* geo = instance.packedGeo;
+				cmdbuf->bindGraphicsPipeline(geo->pipeline.get());
+				const auto pc = instance.computePushConstants(viewParams);
+				cmdbuf->pushConstants(layout,hlsl::ShaderStage::ESS_VERTEX,0,sizeof(pc),&pc);
+				if (geo->indexBuffer)
+				{
+					cmdbuf->bindIndexBuffer(geo->indexBuffer,geo->indexType);
+					cmdbuf->drawIndexed(geo->elementCount,1,0,0,0);
+				}
+				else
+					cmdbuf->draw(geo->elementCount,1,0,0);
+			}
+			cmdbuf->endDebugMarker();
+		}
+
+		core::vector<SInstance> m_instances;
+
+	protected:
 		inline CSimpleDebugRenderer(SInitParams&& _params) : m_params(std::move(_params)) {}
+
+		SInitParams m_params;
 };
 
 #undef EXPOSE_NABLA_NAMESPACES
@@ -586,182 +668,15 @@ class ResourceBuilder
 			GP_COUNT
 		};
 
-		struct ReferenceObjectCpu
-		{
-			ObjectMeta meta;
-			GeometryShader shadersType;
-			nbl::asset::CGeometryCreator::return_type data;
-		};
 
-		GeometriesCpu(const nbl::asset::IGeometryCreator* _gc)
-			: gc(_gc),
-			objects
-			({
-				ReferenceObjectCpu {.meta = {.type = OT_CUBE, .name = "Cube Mesh" }, .shadersType = GP_BASIC, .data = gc->createCubeMesh(nbl::core::vector3df(1.f, 1.f, 1.f)) },
-				ReferenceObjectCpu {.meta = {.type = OT_SPHERE, .name = "Sphere Mesh" }, .shadersType = GP_BASIC, .data = gc->createSphereMesh(2, 16, 16) },
-				ReferenceObjectCpu {.meta = {.type = OT_CYLINDER, .name = "Cylinder Mesh" }, .shadersType = GP_BASIC, .data = gc->createCylinderMesh(2, 2, 20) },
-				ReferenceObjectCpu {.meta = {.type = OT_RECTANGLE, .name = "Rectangle Mesh" }, .shadersType = GP_BASIC, .data = gc->createRectangleMesh(nbl::core::vector2df_SIMD(1.5, 3)) },
-				ReferenceObjectCpu {.meta = {.type = OT_DISK, .name = "Disk Mesh" }, .shadersType = GP_BASIC, .data = gc->createDiskMesh(2, 30) },
-				ReferenceObjectCpu {.meta = {.type = OT_ARROW, .name = "Arrow Mesh" }, .shadersType = GP_BASIC, .data = gc->createArrowMesh() },
-				ReferenceObjectCpu {.meta = {.type = OT_CONE, .name = "Cone Mesh" }, .shadersType = GP_CONE, .data = gc->createConeMesh(2, 3, 10) },
-				ReferenceObjectCpu {.meta = {.type = OT_ICOSPHERE, .name = "Icoshpere Mesh" }, .shadersType = GP_ICO, .data = gc->createIcoSphere(1, 3, true) }
-			})
-		{
-			gc = nullptr; // one shot
-		}
-
-	private:
-		const nbl::asset::IGeometryCreator* gc;
-
-	public:
-		const std::array<ReferenceObjectCpu, OT_COUNT> objects;
 	};
 
-	using resources_bundle_base_t = ResourcesBundleBase<withAssetConverter>;
-
-	struct ResourcesBundleScratch : public resources_bundle_base_t
-	{
-		using Types = resources_bundle_base_t::Types;
-
-		ResourcesBundleScratch()
-			: resources_bundle_base_t() {}
-
 		struct Shaders
 		{
 			nbl::core::smart_refctd_ptr<typename Types::shader_t> vertex = nullptr, fragment = nullptr;
 		};
 
-		nbl::core::smart_refctd_ptr<typename Types::descriptor_set_layout_t> descriptorSetLayout;
-		nbl::core::smart_refctd_ptr<typename Types::pipeline_layout_t> pipelineLayout;
 		std::array<Shaders, GeometriesCpu::GP_COUNT> shaders;
-	};
-
-	// TODO: we could make those params templated with default values like below
-	static constexpr auto FramebufferW = 1280u, FramebufferH = 720u;
-	static constexpr auto ColorFboAttachmentFormat = nbl::asset::EF_R8G8B8A8_SRGB, DepthFboAttachmentFormat = nbl::asset::EF_D16_UNORM;
-	static constexpr auto Samples = nbl::video::IGPUImage::ESCF_1_BIT;
-
-	ResourcesBundleScratch scratch;
-
-	GeometriesCpu geometries;
-};
-
-#undef TYPES_IMPL_BOILERPLATE
-
-struct ObjectDrawHookCpu
-{
-	nbl::core::matrix3x4SIMD model;
-	nbl::asset::SBasicViewParameters viewParameters;
-	ObjectMeta meta;
-};
-
-/*
-	Rendering to offline framebuffer which we don't present, color 
-	scene attachment texture we use for second UI renderpass 
-	sampling it & rendering into desired GUI area.
-
-	The scene can be created from simple geometry
-	using our Geomtry Creator class.
-*/
-
-class CScene final : public nbl::core::IReferenceCounted
-{
-public:
-	ObjectDrawHookCpu object; // TODO: this could be a vector (to not complicate the example I leave it single object), we would need a better system for drawing then to make only 1 max 2 indirect draw calls (indexed and not indexed objects)
-
-	struct
-	{
-		const uint32_t startedValue = 0, finishedValue = 0x45;
-		nbl::core::smart_refctd_ptr<nbl::video::ISemaphore> progress;
-	} semaphore;
-
-	inline void begin()
-	{
-		EXPOSE_NABLA_NAMESPACES();
-
-		m_commandBuffer->reset(IGPUCommandBuffer::RESET_FLAGS::RELEASE_RESOURCES_BIT);
-		m_commandBuffer->begin(IGPUCommandBuffer::USAGE::ONE_TIME_SUBMIT_BIT);
-		m_commandBuffer->beginDebugMarker("UISampleApp Offline Scene Frame");
-
-		semaphore.progress = m_utilities->getLogicalDevice()->createSemaphore(semaphore.startedValue);
-	}
-
-	inline void record()
-	{
-
-		const auto& [hook, meta] = resources.objects[object.meta.type];
-		auto* rawPipeline = hook.pipeline.get();
-
-		SBufferBinding<const IGPUBuffer> vertex = hook.bindings.vertex, index = hook.bindings.index;
-
-		m_commandBuffer->bindGraphicsPipeline(rawPipeline);
-		m_commandBuffer->bindDescriptorSets(EPBP_GRAPHICS, rawPipeline->getLayout(), 1, 1, &resources.descriptorSet.get());
-		m_commandBuffer->bindVertexBuffers(0, 1, &vertex);
-
-		if (index.buffer && hook.indexType != EIT_UNKNOWN)
-		{
-			m_commandBuffer->bindIndexBuffer(index, hook.indexType);
-			m_commandBuffer->drawIndexed(hook.indexCount, 1, 0, 0, 0);
-		}
-		else
-			m_commandBuffer->draw(hook.indexCount, 1, 0, 0);
-	}
-
-private:
-	template<typename CreateWith = CreateResourcesDirectlyWithDevice> // TODO: enforce constraints, only those 2 above are valid
-	CScene(nbl::core::smart_refctd_ptr<nbl::video::IUtilities> _utilities, nbl::core::smart_refctd_ptr<nbl::system::ILogger> _logger, nbl::video::CThreadSafeQueueAdapter* _graphicsQueue, const nbl::asset::IGeometryCreator* _geometryCreator, CreateWith createWith = {})
-		: m_utilities(nbl::core::smart_refctd_ptr(_utilities)), m_logger(nbl::core::smart_refctd_ptr(_logger)), queue(_graphicsQueue)
-	{
-		EXPOSE_NABLA_NAMESPACES();
-		using Builder = typename CreateWith::Builder;
-
-		m_commandBuffer = createCommandBuffer(m_utilities->getLogicalDevice(), m_utilities->getLogger(), queue->getFamilyIndex());
-		Builder builder(m_utilities.get(), m_commandBuffer.get(), m_logger.get(), _geometryCreator);
-
-		// gpu resources
-		if (builder.build())
-		{
-			if (!builder.finalize(resources, queue))
-				m_logger->log("Could not finalize resource objects to gpu objects!", ILogger::ELL_ERROR);
-		}
-		else
-			m_logger->log("Could not build resource objects!", ILogger::ELL_ERROR);
-
-		// frame buffer
-		{
-			const auto extent = resources.attachments.color->getCreationParameters().image->getCreationParameters().extent;
-
-			IGPUFramebuffer::SCreationParams params =
-			{
-				{
-					.renderpass = smart_refctd_ptr(resources.renderpass),
-					.depthStencilAttachments = &resources.attachments.depth.get(),
-					.colorAttachments = &resources.attachments.color.get(),
-					.width = extent.width,
-					.height = extent.height,
-					.layers = 1u
-				}
-			};
-
-			m_frameBuffer = m_utilities->getLogicalDevice()->createFramebuffer(std::move(params));
-
-			if (!m_frameBuffer)
-			{
-				m_logger->log("Could not create frame buffer!", ILogger::ELL_ERROR);
-				return;
-			}
-		}
-	}
-
-	nbl::core::smart_refctd_ptr<nbl::video::IUtilities> m_utilities;
-	nbl::core::smart_refctd_ptr<nbl::system::ILogger> m_logger;
-
-	nbl::video::CThreadSafeQueueAdapter* queue;
-	nbl::core::smart_refctd_ptr<nbl::video::IGPUCommandBuffer> m_commandBuffer;
-
-	nbl::core::smart_refctd_ptr<nbl::video::IGPUFramebuffer> m_frameBuffer;
-
-	ResourcesBundle resources;
 };
 #endif
 
diff --git a/common/include/nbl/examples/geometry/SPushConstants.hlsl b/common/include/nbl/examples/geometry/SPushConstants.hlsl
index f02ddea12..2048f1f3f 100644
--- a/common/include/nbl/examples/geometry/SPushConstants.hlsl
+++ b/common/include/nbl/examples/geometry/SPushConstants.hlsl
@@ -2,7 +2,7 @@
 #define _NBL_EXAMPLES_S_PUSH_CONSTANTS_HLSL_
 
 
-#include "nbl/examples/common/SBasicViewParameters.hlsl"
+#include "nbl/builtin/hlsl/cpp_compat.hlsl"
 
 
 namespace nbl
@@ -14,9 +14,15 @@ namespace examples
 namespace geometry_creator_scene
 {
 
+struct SInstanceMatrices
+{
+	float32_t4x4 worldViewProj;
+	float32_t3x3 normal;
+};
+
 struct SPushConstants
 {
-	SBasicViewParameters basic;
+	SInstanceMatrices matrices;
 	uint32_t positionView : 11;
 	uint32_t normalView : 10;
 	uint32_t uvView : 11;

From e76bfcc4f642c3c5f01f5b6fecfaa737307f1ea1 Mon Sep 17 00:00:00 2001
From: devsh <devsh@devsh.eu>
Date: Wed, 18 Jun 2025 10:22:52 +0200
Subject: [PATCH 387/529] move ex09 renderer to its own include

---
 common/include/nbl/examples/PCH.hpp           |   1 +
 .../geometry/CGeometryCreatorScene.hpp        | 511 +-----------------
 .../geometry/CSimpleDebugRenderer.hpp         | 493 +++++++++++++++++
 3 files changed, 499 insertions(+), 506 deletions(-)
 create mode 100644 common/include/nbl/examples/geometry/CSimpleDebugRenderer.hpp

diff --git a/common/include/nbl/examples/PCH.hpp b/common/include/nbl/examples/PCH.hpp
index 179c9f037..ed5da666e 100644
--- a/common/include/nbl/examples/PCH.hpp
+++ b/common/include/nbl/examples/PCH.hpp
@@ -18,6 +18,7 @@
 #include "nbl/examples/cameras/CCamera.hpp"
 
 #include "nbl/examples/geometry/CGeometryCreatorScene.hpp"
+#include "nbl/examples/geometry/CSimpleDebugRenderer.hpp"
 
 
 #endif // _NBL_EXAMPLES_COMMON_PCH_HPP_
\ No newline at end of file
diff --git a/common/include/nbl/examples/geometry/CGeometryCreatorScene.hpp b/common/include/nbl/examples/geometry/CGeometryCreatorScene.hpp
index 74b5d02d8..8a73f2e14 100644
--- a/common/include/nbl/examples/geometry/CGeometryCreatorScene.hpp
+++ b/common/include/nbl/examples/geometry/CGeometryCreatorScene.hpp
@@ -9,30 +9,14 @@
 namespace nbl::examples
 {
 
-#define EXPOSE_NABLA_NAMESPACES using namespace nbl::core; \
-using namespace nbl::system; \
-using namespace nbl::asset; \
-using namespace nbl::video
-
 class CGeometryCreatorScene : public core::IReferenceCounted
 {
+#define EXPOSE_NABLA_NAMESPACES \
+			using namespace nbl::core; \
+			using namespace nbl::system; \
+			using namespace nbl::asset; \
+			using namespace nbl::video
 	public:
-		//
-		enum ObjectType : uint8_t
-		{
-			OT_CUBE,
-			OT_SPHERE,
-			OT_CYLINDER,
-			OT_RECTANGLE,
-			OT_DISK,
-			OT_ARROW,
-			OT_CONE,
-			OT_ICOSPHERE,
-
-			OT_COUNT,
-			OT_UNKNOWN = OT_COUNT
-		};
-
 		//
 		struct SCreateParams
 		{
@@ -192,493 +176,8 @@ class CGeometryCreatorScene : public core::IReferenceCounted
 		inline CGeometryCreatorScene(core::vector<SNamedGeometry>&& _geometries) : m_geometries(std::move(_geometries)) {}
 
 		core::vector<SNamedGeometry> m_geometries;
-};
-
-}
-//!
-
-
-#include "nbl/builtin/hlsl/math/linalg/fast_affine.hlsl"
-#include "nbl/examples/geometry/SPushConstants.hlsl"
-
-// TODO: Arek bring back
-//#include "nbl/examples/geometry/spirv/builtin/CArchive.h"
-//#include "nbl/examples/geometry/spirv/builtin/builtinResources.h"
-
-
-namespace nbl::examples
-{
-
-class CSimpleDebugRenderer final : public core::IReferenceCounted
-{
-	public:
-		//
-		constexpr static inline auto DescriptorCount = 255;
-		//
-		struct SViewParams
-		{
-			inline SViewParams(const hlsl::float32_t3x4& _view, const hlsl::float32_t4x4& _viewProj)
-			{
-				view = _view;
-				viewProj = _viewProj;
-				using namespace nbl::hlsl;
-				normal = transpose(inverse(float32_t3x3(view)));
-			}
-
-			inline auto computeForInstance(hlsl::float32_t3x4 world) const
-			{
-				using namespace nbl::hlsl;
-				hlsl::examples::geometry_creator_scene::SInstanceMatrices retval = {
-					.worldViewProj = float32_t4x4(math::linalg::promoted_mul(float64_t4x4(viewProj),float64_t3x4(world)))
-				};
-				const auto sub3x3 = mul(float64_t3x3(viewProj),float64_t3x3(world));
-				retval.normal = float32_t3x3(transpose(inverse(sub3x3)));
-				return retval;
-			}
-
-			hlsl::float32_t3x4 view;
-			hlsl::float32_t4x4 viewProj;
-			hlsl::float32_t3x3 normal;
-		};
-		//
-		struct SPackedGeometry
-		{
-			core::smart_refctd_ptr<const video::IGPUGraphicsPipeline> pipeline = {};
-			asset::SBufferBinding<const video::IGPUBuffer> indexBuffer = {};
-			uint32_t elementCount = 0;
-			// indices into the descriptor set
-			uint8_t positionView = 0;
-			uint8_t normalView = 0;
-			uint8_t uvView = 0;
-			asset::E_INDEX_TYPE indexType = asset::EIT_UNKNOWN;
-		};
-		//
-		struct SInstance
-		{
-			using SPushConstants = hlsl::examples::geometry_creator_scene::SPushConstants;
-			inline SPushConstants computePushConstants(const SViewParams& viewParams) const
-			{
-				using namespace hlsl;
-				return {
-					.matrices = viewParams.computeForInstance(world),
-					.positionView = packedGeo->positionView,
-					.normalView = packedGeo->normalView,
-					.uvView = packedGeo->uvView
-				};
-			}
-
-			hlsl::float32_t3x4 world;
-			const SPackedGeometry* packedGeo;
-		};
-
-		//
-		static inline core::smart_refctd_ptr<CSimpleDebugRenderer> create(video::IGPURenderpass* renderpass, const uint32_t subpassIX, const CGeometryCreatorScene* scene)
-		{
-			EXPOSE_NABLA_NAMESPACES;
-
-			if (!renderpass)
-				return nullptr;
-			auto device = const_cast<ILogicalDevice*>(renderpass->getOriginDevice());
-			auto logger = device->getLogger();
-
-			if (!scene)
-				return nullptr;
-			const auto namedGeoms = scene->getGeometries();
-			if (namedGeoms.empty())
-				return nullptr;
-
-			SInitParams init;
-
-			// create descriptor set
-			{
-				// create Descriptor Set Layout
-				smart_refctd_ptr<IGPUDescriptorSetLayout> dsLayout;
-				{
-					const IGPUDescriptorSetLayout::SBinding bindings[] =
-					{
-						{
-							.binding = 0,
-							.type = IDescriptor::E_TYPE::ET_UNIFORM_TEXEL_BUFFER,
-							// some geometries may not have particular attributes
-							.createFlags = IGPUDescriptorSetLayout::SBinding::E_CREATE_FLAGS::ECF_PARTIALLY_BOUND_BIT,
-							.stageFlags = IShader::E_SHADER_STAGE::ESS_VERTEX|IShader::E_SHADER_STAGE::ESS_FRAGMENT,
-							.count = DescriptorCount
-						}
-					};
-					dsLayout = device->createDescriptorSetLayout(bindings);
-					if (!dsLayout)
-					{
-						logger->log("Could not create descriptor set layout!",ILogger::ELL_ERROR);
-						return nullptr;
-					}
-				}
-
-				// create Descriptor Set
-				auto pool = device->createDescriptorPoolForDSLayouts(IDescriptorPool::ECF_UPDATE_AFTER_BIND_BIT,{&dsLayout.get(),1});
-				init.ds = pool->createDescriptorSet(std::move(dsLayout));
-				if (!init.ds)
-				{
-					logger->log("Could not descriptor set!",ILogger::ELL_ERROR);
-					return nullptr;
-				}
-			}
-
-			//
-			const SPushConstantRange ranges[] = {{
-				.stageFlags = hlsl::ShaderStage::ESS_VERTEX,
-				.offset = 0,
-				.size = sizeof(SInstance::SPushConstants),
-			}};
-			init.layout = device->createPipelineLayout(ranges,smart_refctd_ptr<const IGPUDescriptorSetLayout>(init.ds->getLayout()));
-
-			// TODO: Load Shaders and Create Pipelines
-			{
-				//
-			}
-
-			// write geometries' attributes to descriptor set
-			{
-				core::vector<IGPUDescriptorSet::SDescriptorInfo> infos;
-				auto allocateUTB = [device,&infos](const IGeometry<const IGPUBuffer>::SDataView& view)->uint8_t
-				{
-					if (!view)
-						return DescriptorCount;
-					const auto retval = infos.size();
-					infos.emplace_back().desc = device->createBufferView(view.src, view.composed.format);
-					return retval;
-				};
-
-				for (const auto& entry : namedGeoms)
-				{
-					const auto* geom = entry.geom.get();
-					// could also check device origin on all buffers
-					if (!geom->valid())
-						continue;
-					auto& out = init.geoms.emplace_back();
-					if (const auto& view=geom->getIndexView(); view)
-					{
-						out.indexBuffer.offset = view.src.offset;
-						out.indexBuffer.buffer = view.src.buffer;
-					}
-					out.elementCount = geom->getVertexReferenceCount();
-					out.positionView = allocateUTB(geom->getPositionView());
-					out.normalView = allocateUTB(geom->getNormalView());
-					// the first view is usually the UV
-					if (const auto& auxViews = geom->getAuxAttributeViews(); !auxViews.empty())
-						out.uvView = allocateUTB(auxViews.front());
-				}
-
-				if (infos.empty())
-					return nullptr;
-				const IGPUDescriptorSet::SWriteDescriptorSet write = {
-					.dstSet = init.ds.get(),
-					.binding = 0,
-					.arrayElement = 0,
-					.count = static_cast<uint32_t>(infos.size()),
-					.info = infos.data()
-				};
-				if (!device->updateDescriptorSets({&write,1},{}))
-					return nullptr;
-			}
-
-			return smart_refctd_ptr<CSimpleDebugRenderer>(new CSimpleDebugRenderer(std::move(init)),dont_grab);
-		}
-
-		//
-		struct SInitParams
-		{
-			core::smart_refctd_ptr<video::IGPUDescriptorSet> ds;
-			core::smart_refctd_ptr<video::IGPUPipelineLayout> layout;
-			core::vector<SPackedGeometry> geoms;
-		};
-		inline const SInitParams& getInitParams() const {return m_params;}
-
-		//
-		inline void render(video::IGPUCommandBuffer* cmdbuf, const SViewParams& viewParams) const
-		{
-			EXPOSE_NABLA_NAMESPACES;
-
-			cmdbuf->beginDebugMarker("CSimpleDebugRenderer::render");
-
-			const auto* layout = m_params.layout.get();
-			cmdbuf->bindDescriptorSets(E_PIPELINE_BIND_POINT::EPBP_GRAPHICS,layout,0,1,&m_params.ds.get());
-
-			for (const auto& instance : m_instances)
-			{
-				const auto* geo = instance.packedGeo;
-				cmdbuf->bindGraphicsPipeline(geo->pipeline.get());
-				const auto pc = instance.computePushConstants(viewParams);
-				cmdbuf->pushConstants(layout,hlsl::ShaderStage::ESS_VERTEX,0,sizeof(pc),&pc);
-				if (geo->indexBuffer)
-				{
-					cmdbuf->bindIndexBuffer(geo->indexBuffer,geo->indexType);
-					cmdbuf->drawIndexed(geo->elementCount,1,0,0,0);
-				}
-				else
-					cmdbuf->draw(geo->elementCount,1,0,0);
-			}
-			cmdbuf->endDebugMarker();
-		}
-
-		core::vector<SInstance> m_instances;
-
-	protected:
-		inline CSimpleDebugRenderer(SInitParams&& _params) : m_params(std::move(_params)) {}
-
-		SInitParams m_params;
-};
-
 #undef EXPOSE_NABLA_NAMESPACES
-#if 0
-class ResourceBuilder
-{
-private:
-
-	bool createShaders()
-	{
-		EXPOSE_NABLA_NAMESPACES();
-
-		auto createShader = [&]<StringLiteral virtualPath>(IShader::E_SHADER_STAGE stage, smart_refctd_ptr<typename Types::shader_t>& outShader) -> smart_refctd_ptr<typename Types::shader_t>
-		{
-			// TODO: use SPIRV loader & our ::system ns to get those cpu shaders, do not create myself (shit I forgot it exists)
-
-			const SBuiltinFile& in = ::geometry::creator::spirv::builtin::get_resource<virtualPath>();
-			const auto buffer = ICPUBuffer::create({ { in.size }, (void*)in.contents, core::getNullMemoryResource() }, adopt_memory);
-			auto shader = make_smart_refctd_ptr<ICPUShader>(smart_refctd_ptr(buffer), stage, IShader::E_CONTENT_TYPE::ECT_SPIRV, ""); // must create cpu instance regardless underlying type
-
-			if constexpr (withAssetConverter)
-			{
-				buffer->setContentHash(buffer->computeContentHash());
-				outShader = std::move(shader);
-			}
-
-			return outShader;
-		};
-
-		typename ResourcesBundleScratch::Shaders& basic = scratch.shaders[GeometriesCpu::GP_BASIC];
-		createShader.template operator() < NBL_CORE_UNIQUE_STRING_LITERAL_TYPE("geometryCreator/spirv/gc.basic.vertex.spv") > (IShader::E_SHADER_STAGE::ESS_VERTEX, basic.vertex);
-		createShader.template operator() < NBL_CORE_UNIQUE_STRING_LITERAL_TYPE("geometryCreator/spirv/gc.basic.fragment.spv") > (IShader::E_SHADER_STAGE::ESS_FRAGMENT, basic.fragment);
-
-		typename ResourcesBundleScratch::Shaders& cone = scratch.shaders[GeometriesCpu::GP_CONE];
-		createShader.template operator() < NBL_CORE_UNIQUE_STRING_LITERAL_TYPE("geometryCreator/spirv/gc.cone.vertex.spv") > (IShader::E_SHADER_STAGE::ESS_VERTEX, cone.vertex);
-		createShader.template operator() < NBL_CORE_UNIQUE_STRING_LITERAL_TYPE("geometryCreator/spirv/gc.basic.fragment.spv") > (IShader::E_SHADER_STAGE::ESS_FRAGMENT, cone.fragment); // note we reuse fragment from basic!
-
-		typename ResourcesBundleScratch::Shaders& ico = scratch.shaders[GeometriesCpu::GP_ICO];
-		createShader.template operator() < NBL_CORE_UNIQUE_STRING_LITERAL_TYPE("geometryCreator/spirv/gc.ico.vertex.spv") > (IShader::E_SHADER_STAGE::ESS_VERTEX, ico.vertex);
-		createShader.template operator() < NBL_CORE_UNIQUE_STRING_LITERAL_TYPE("geometryCreator/spirv/gc.basic.fragment.spv") > (IShader::E_SHADER_STAGE::ESS_FRAGMENT, ico.fragment); // note we reuse fragment from basic!
-			
-		for (const auto& it : scratch.shaders)
-		{
-			if (!it.vertex || !it.fragment)
-			{
-				logger->log("Could not create shaders!", ILogger::ELL_ERROR);
-				return false;
-			}
-		}
-
-		return true;
-	}
-
-	bool createGeometries()
-	{
-		EXPOSE_NABLA_NAMESPACES();
-
-		for (uint32_t i = 0; i < geometries.objects.size(); ++i)
-		{
-			const auto& inGeometry = geometries.objects[i];
-			auto& [obj, meta] = scratch.objects[i];
-
-			bool status = true;
-
-			meta.name = inGeometry.meta.name;
-			meta.type = inGeometry.meta.type;
-
-			struct
-			{
-				SBlendParams blend;
-				SRasterizationParams rasterization;
-				typename Types::graphics_pipeline_t::SCreationParams pipeline;
-			} params;
-				
-			{
-				params.blend.logicOp = ELO_NO_OP;
-
-				auto& b = params.blend.blendParams[0];
-				b.srcColorFactor = EBF_SRC_ALPHA;
-				b.dstColorFactor = EBF_ONE_MINUS_SRC_ALPHA;
-				b.colorBlendOp = EBO_ADD;
-				b.srcAlphaFactor = EBF_SRC_ALPHA;
-				b.dstAlphaFactor = EBF_SRC_ALPHA;
-				b.alphaBlendOp = EBO_ADD;
-				b.colorWriteMask = (1u << 0u) | (1u << 1u) | (1u << 2u) | (1u << 3u);
-			}
-
-			params.rasterization.faceCullingMode = EFCM_NONE;
-			{
-				const typename Types::shader_t::SSpecInfo info [] =
-				{
-					{.entryPoint = "VSMain", .shader = scratch.shaders[inGeometry.shadersType].vertex.get() },
-					{.entryPoint = "PSMain", .shader = scratch.shaders[inGeometry.shadersType].fragment.get() }
-				};
-
-				params.pipeline.layout = scratch.pipelineLayout.get();
-				params.pipeline.shaders = info;
-				params.pipeline.renderpass = scratch.renderpass.get();
-				params.pipeline.cached = { .vertexInput = inGeometry.data.inputParams, .primitiveAssembly = inGeometry.data.assemblyParams, .rasterization = params.rasterization, .blend = params.blend, .subpassIx = 0u };
-
-				obj.indexCount = inGeometry.data.indexCount;
-				obj.indexType = inGeometry.data.indexType;
-
-				// TODO: cache pipeline & try lookup for existing one first maybe
-
-				// similar issue like with shaders again, in this case gpu contructor allows for extra cache parameters + there is no constructor you can use to fire make_smart_refctd_ptr yourself for cpu
-				if constexpr (withAssetConverter)
-					obj.pipeline = ICPUGraphicsPipeline::create(params.pipeline);
-				else
-				{
-					const std::array<const IGPUGraphicsPipeline::SCreationParams,1> info = { { params.pipeline } };
-					utilities->getLogicalDevice()->createGraphicsPipelines(nullptr, info, &obj.pipeline);
-				}
-
-				if (!obj.pipeline)
-				{
-					logger->log("Could not create graphics pipeline for [%s] object!", ILogger::ELL_ERROR, meta.name.data());
-					status = false;
-				}
-
-				// object buffers
-				auto createVIBuffers = [&]() -> bool
-				{
-					using ibuffer_t = ::nbl::asset::IBuffer; // seems to be ambigous, both asset & core namespaces has IBuffer
-
-					// note: similar issue like with shaders, this time with cpu-gpu constructors differing in arguments
-					auto vBuffer = smart_refctd_ptr(inGeometry.data.bindings[0].buffer); // no offset
-					constexpr static auto VERTEX_USAGE = bitflag(ibuffer_t::EUF_VERTEX_BUFFER_BIT) | ibuffer_t::EUF_TRANSFER_DST_BIT | ibuffer_t::EUF_INLINE_UPDATE_VIA_CMDBUF;
-					obj.bindings.vertex.offset = 0u;
-						
-					auto iBuffer = smart_refctd_ptr(inGeometry.data.indexBuffer.buffer); // no offset
-					constexpr static auto INDEX_USAGE = bitflag(ibuffer_t::EUF_INDEX_BUFFER_BIT) | ibuffer_t::EUF_VERTEX_BUFFER_BIT | ibuffer_t::EUF_TRANSFER_DST_BIT | ibuffer_t::EUF_INLINE_UPDATE_VIA_CMDBUF;
-					obj.bindings.index.offset = 0u;
-
-					if constexpr (withAssetConverter)
-					{
-						if (!vBuffer)
-							return false;
-
-						vBuffer->addUsageFlags(VERTEX_USAGE);
-						vBuffer->setContentHash(vBuffer->computeContentHash());
-						obj.bindings.vertex = { .offset = 0u, .buffer = vBuffer };
-
-						if (inGeometry.data.indexType != EIT_UNKNOWN)
-							if (iBuffer)
-							{
-								iBuffer->addUsageFlags(INDEX_USAGE);
-								iBuffer->setContentHash(iBuffer->computeContentHash());
-							}
-							else
-								return false;
-
-						obj.bindings.index = { .offset = 0u, .buffer = iBuffer };
-					}
-					else
-					{
-						auto vertexBuffer = utilities->getLogicalDevice()->createBuffer(IGPUBuffer::SCreationParams({ .size = vBuffer->getSize(), .usage = VERTEX_USAGE }));
-						auto indexBuffer = iBuffer ? utilities->getLogicalDevice()->createBuffer(IGPUBuffer::SCreationParams({ .size = iBuffer->getSize(), .usage = INDEX_USAGE })) : nullptr;
-
-						if (!vertexBuffer)
-							return false;
-
-						if (inGeometry.data.indexType != EIT_UNKNOWN)
-							if (!indexBuffer)
-								return false;
-
-						const auto mask = utilities->getLogicalDevice()->getPhysicalDevice()->getUpStreamingMemoryTypeBits();
-						for (auto it : { vertexBuffer , indexBuffer })
-						{
-							if (it)
-							{
-								auto reqs = it->getMemoryReqs();
-								reqs.memoryTypeBits &= mask;
-
-								utilities->getLogicalDevice()->allocate(reqs, it.get());
-							}
-						}
-
-						// record transfer uploads
-						obj.bindings.vertex = { .offset = 0u, .buffer = std::move(vertexBuffer) };
-						{
-							const SBufferRange<IGPUBuffer> range = { .offset = obj.bindings.vertex.offset, .size = obj.bindings.vertex.buffer->getSize(), .buffer = obj.bindings.vertex.buffer };
-							if (!commandBuffer->updateBuffer(range, vBuffer->getPointer()))
-							{
-								logger->log("Could not record vertex buffer transfer upload for [%s] object!", ILogger::ELL_ERROR, meta.name.data());
-								status = false;
-							}
-						}
-						obj.bindings.index = { .offset = 0u, .buffer = std::move(indexBuffer) };
-						{
-							if (iBuffer)
-							{
-								const SBufferRange<IGPUBuffer> range = { .offset = obj.bindings.index.offset, .size = obj.bindings.index.buffer->getSize(), .buffer = obj.bindings.index.buffer };
-
-								if (!commandBuffer->updateBuffer(range, iBuffer->getPointer()))
-								{
-									logger->log("Could not record index buffer transfer upload for [%s] object!", ILogger::ELL_ERROR, meta.name.data());
-									status = false;
-								}
-							}
-						}
-					}
-						
-					return true;
-				};
-
-				if (!createVIBuffers())
-				{
-					logger->log("Could not create buffers for [%s] object!", ILogger::ELL_ERROR, meta.name.data());
-					status = false;
-				}
-
-				if (!status)
-				{
-					logger->log("[%s] object will not be created!", ILogger::ELL_ERROR, meta.name.data());
-
-					obj.bindings.vertex = {};
-					obj.bindings.index = {};
-					obj.indexCount = 0u;
-					obj.indexType = E_INDEX_TYPE::EIT_UNKNOWN;
-					obj.pipeline = nullptr;
-
-					continue;
-				}
-			}
-		}
-
-		return true;
-	}
-
-
-	struct GeometriesCpu
-	{
-		enum GeometryShader
-		{
-			GP_BASIC = 0,
-			GP_CONE,
-			GP_ICO,
-
-			GP_COUNT
-		};
-
-
-	};
-
-		struct Shaders
-		{
-			nbl::core::smart_refctd_ptr<typename Types::shader_t> vertex = nullptr, fragment = nullptr;
-		};
-
-		std::array<Shaders, GeometriesCpu::GP_COUNT> shaders;
 };
-#endif
 
 }
 #endif
\ No newline at end of file
diff --git a/common/include/nbl/examples/geometry/CSimpleDebugRenderer.hpp b/common/include/nbl/examples/geometry/CSimpleDebugRenderer.hpp
new file mode 100644
index 000000000..b6f6b4aaf
--- /dev/null
+++ b/common/include/nbl/examples/geometry/CSimpleDebugRenderer.hpp
@@ -0,0 +1,493 @@
+#ifndef _NBL_EXAMPLES_C_SIMPLE_DEBUG_RENDERER_H_INCLUDED_
+#define _NBL_EXAMPLES_C_SIMPLE_DEBUG_RENDERER_H_INCLUDED_
+
+
+#include "nbl/builtin/hlsl/math/linalg/fast_affine.hlsl"
+#include "nbl/examples/geometry/SPushConstants.hlsl"
+
+// TODO: Arek bring back
+//#include "nbl/examples/geometry/spirv/builtin/CArchive.h"
+//#include "nbl/examples/geometry/spirv/builtin/builtinResources.h"
+
+
+namespace nbl::examples
+{
+
+class CSimpleDebugRenderer final : public core::IReferenceCounted
+{
+#define EXPOSE_NABLA_NAMESPACES \
+			using namespace nbl::core; \
+			using namespace nbl::system; \
+			using namespace nbl::asset; \
+			using namespace nbl::video
+	public:
+		//
+		constexpr static inline auto DescriptorCount = 255;
+		//
+		struct SViewParams
+		{
+			inline SViewParams(const hlsl::float32_t3x4& _view, const hlsl::float32_t4x4& _viewProj)
+			{
+				view = _view;
+				viewProj = _viewProj;
+				using namespace nbl::hlsl;
+				normal = transpose(inverse(float32_t3x3(view)));
+			}
+
+			inline auto computeForInstance(hlsl::float32_t3x4 world) const
+			{
+				using namespace nbl::hlsl;
+				hlsl::examples::geometry_creator_scene::SInstanceMatrices retval = {
+					.worldViewProj = float32_t4x4(math::linalg::promoted_mul(float64_t4x4(viewProj),float64_t3x4(world)))
+				};
+				const auto sub3x3 = mul(float64_t3x3(viewProj),float64_t3x3(world));
+				retval.normal = float32_t3x3(transpose(inverse(sub3x3)));
+				return retval;
+			}
+
+			hlsl::float32_t3x4 view;
+			hlsl::float32_t4x4 viewProj;
+			hlsl::float32_t3x3 normal;
+		};
+		//
+		struct SPackedGeometry
+		{
+			core::smart_refctd_ptr<const video::IGPUGraphicsPipeline> pipeline = {};
+			asset::SBufferBinding<const video::IGPUBuffer> indexBuffer = {};
+			uint32_t elementCount = 0;
+			// indices into the descriptor set
+			uint8_t positionView = 0;
+			uint8_t normalView = 0;
+			uint8_t uvView = 0;
+			asset::E_INDEX_TYPE indexType = asset::EIT_UNKNOWN;
+		};
+		//
+		struct SInstance
+		{
+			using SPushConstants = hlsl::examples::geometry_creator_scene::SPushConstants;
+			inline SPushConstants computePushConstants(const SViewParams& viewParams) const
+			{
+				using namespace hlsl;
+				return {
+					.matrices = viewParams.computeForInstance(world),
+					.positionView = packedGeo->positionView,
+					.normalView = packedGeo->normalView,
+					.uvView = packedGeo->uvView
+				};
+			}
+
+			hlsl::float32_t3x4 world;
+			const SPackedGeometry* packedGeo;
+		};
+
+		//
+		static inline core::smart_refctd_ptr<CSimpleDebugRenderer> create(video::IGPURenderpass* renderpass, const uint32_t subpassIX, const CGeometryCreatorScene* scene)
+		{
+			EXPOSE_NABLA_NAMESPACES;
+
+			if (!renderpass)
+				return nullptr;
+			auto device = const_cast<ILogicalDevice*>(renderpass->getOriginDevice());
+			auto logger = device->getLogger();
+
+			if (!scene)
+				return nullptr;
+			const auto namedGeoms = scene->getGeometries();
+			if (namedGeoms.empty())
+				return nullptr;
+
+			SInitParams init;
+
+			// create descriptor set
+			{
+				// create Descriptor Set Layout
+				smart_refctd_ptr<IGPUDescriptorSetLayout> dsLayout;
+				{
+					const IGPUDescriptorSetLayout::SBinding bindings[] =
+					{
+						{
+							.binding = 0,
+							.type = IDescriptor::E_TYPE::ET_UNIFORM_TEXEL_BUFFER,
+							// some geometries may not have particular attributes
+							.createFlags = IGPUDescriptorSetLayout::SBinding::E_CREATE_FLAGS::ECF_PARTIALLY_BOUND_BIT,
+							.stageFlags = IShader::E_SHADER_STAGE::ESS_VERTEX|IShader::E_SHADER_STAGE::ESS_FRAGMENT,
+							.count = DescriptorCount
+						}
+					};
+					dsLayout = device->createDescriptorSetLayout(bindings);
+					if (!dsLayout)
+					{
+						logger->log("Could not create descriptor set layout!",ILogger::ELL_ERROR);
+						return nullptr;
+					}
+				}
+
+				// create Descriptor Set
+				auto pool = device->createDescriptorPoolForDSLayouts(IDescriptorPool::ECF_UPDATE_AFTER_BIND_BIT,{&dsLayout.get(),1});
+				init.ds = pool->createDescriptorSet(std::move(dsLayout));
+				if (!init.ds)
+				{
+					logger->log("Could not descriptor set!",ILogger::ELL_ERROR);
+					return nullptr;
+				}
+			}
+
+			//
+			const SPushConstantRange ranges[] = {{
+				.stageFlags = hlsl::ShaderStage::ESS_VERTEX,
+				.offset = 0,
+				.size = sizeof(SInstance::SPushConstants),
+			}};
+			init.layout = device->createPipelineLayout(ranges,smart_refctd_ptr<const IGPUDescriptorSetLayout>(init.ds->getLayout()));
+
+			// TODO: Load Shaders and Create Pipelines
+			{
+				//
+			}
+
+			// write geometries' attributes to descriptor set
+			{
+				core::vector<IGPUDescriptorSet::SDescriptorInfo> infos;
+				auto allocateUTB = [device,&infos](const IGeometry<const IGPUBuffer>::SDataView& view)->uint8_t
+				{
+					if (!view)
+						return DescriptorCount;
+					const auto retval = infos.size();
+					infos.emplace_back().desc = device->createBufferView(view.src, view.composed.format);
+					return retval;
+				};
+
+				for (const auto& entry : namedGeoms)
+				{
+					const auto* geom = entry.geom.get();
+					// could also check device origin on all buffers
+					if (!geom->valid())
+						continue;
+					auto& out = init.geoms.emplace_back();
+					if (const auto& view=geom->getIndexView(); view)
+					{
+						out.indexBuffer.offset = view.src.offset;
+						out.indexBuffer.buffer = view.src.buffer;
+					}
+					out.elementCount = geom->getVertexReferenceCount();
+					out.positionView = allocateUTB(geom->getPositionView());
+					out.normalView = allocateUTB(geom->getNormalView());
+					// the first view is usually the UV
+					if (const auto& auxViews = geom->getAuxAttributeViews(); !auxViews.empty())
+						out.uvView = allocateUTB(auxViews.front());
+				}
+
+				if (infos.empty())
+					return nullptr;
+				const IGPUDescriptorSet::SWriteDescriptorSet write = {
+					.dstSet = init.ds.get(),
+					.binding = 0,
+					.arrayElement = 0,
+					.count = static_cast<uint32_t>(infos.size()),
+					.info = infos.data()
+				};
+				if (!device->updateDescriptorSets({&write,1},{}))
+					return nullptr;
+			}
+
+			return smart_refctd_ptr<CSimpleDebugRenderer>(new CSimpleDebugRenderer(std::move(init)),dont_grab);
+		}
+
+		//
+		struct SInitParams
+		{
+			core::smart_refctd_ptr<video::IGPUDescriptorSet> ds;
+			core::smart_refctd_ptr<video::IGPUPipelineLayout> layout;
+			core::vector<SPackedGeometry> geoms;
+		};
+		inline const SInitParams& getInitParams() const {return m_params;}
+
+		//
+		inline void render(video::IGPUCommandBuffer* cmdbuf, const SViewParams& viewParams) const
+		{
+			EXPOSE_NABLA_NAMESPACES;
+
+			cmdbuf->beginDebugMarker("CSimpleDebugRenderer::render");
+
+			const auto* layout = m_params.layout.get();
+			cmdbuf->bindDescriptorSets(E_PIPELINE_BIND_POINT::EPBP_GRAPHICS,layout,0,1,&m_params.ds.get());
+
+			for (const auto& instance : m_instances)
+			{
+				const auto* geo = instance.packedGeo;
+				cmdbuf->bindGraphicsPipeline(geo->pipeline.get());
+				const auto pc = instance.computePushConstants(viewParams);
+				cmdbuf->pushConstants(layout,hlsl::ShaderStage::ESS_VERTEX,0,sizeof(pc),&pc);
+				if (geo->indexBuffer)
+				{
+					cmdbuf->bindIndexBuffer(geo->indexBuffer,geo->indexType);
+					cmdbuf->drawIndexed(geo->elementCount,1,0,0,0);
+				}
+				else
+					cmdbuf->draw(geo->elementCount,1,0,0);
+			}
+			cmdbuf->endDebugMarker();
+		}
+
+		core::vector<SInstance> m_instances;
+
+	protected:
+		inline CSimpleDebugRenderer(SInitParams&& _params) : m_params(std::move(_params)) {}
+
+		SInitParams m_params;
+#undef EXPOSE_NABLA_NAMESPACES
+};
+
+#if 0
+class ResourceBuilder
+{
+private:
+
+	bool createShaders()
+	{
+		EXPOSE_NABLA_NAMESPACES();
+
+		auto createShader = [&]<StringLiteral virtualPath>(IShader::E_SHADER_STAGE stage, smart_refctd_ptr<typename Types::shader_t>& outShader) -> smart_refctd_ptr<typename Types::shader_t>
+		{
+			// TODO: use SPIRV loader & our ::system ns to get those cpu shaders, do not create myself (shit I forgot it exists)
+
+			const SBuiltinFile& in = ::geometry::creator::spirv::builtin::get_resource<virtualPath>();
+			const auto buffer = ICPUBuffer::create({ { in.size }, (void*)in.contents, core::getNullMemoryResource() }, adopt_memory);
+			auto shader = make_smart_refctd_ptr<ICPUShader>(smart_refctd_ptr(buffer), stage, IShader::E_CONTENT_TYPE::ECT_SPIRV, ""); // must create cpu instance regardless underlying type
+
+			if constexpr (withAssetConverter)
+			{
+				buffer->setContentHash(buffer->computeContentHash());
+				outShader = std::move(shader);
+			}
+
+			return outShader;
+		};
+
+		typename ResourcesBundleScratch::Shaders& basic = scratch.shaders[GeometriesCpu::GP_BASIC];
+		createShader.template operator() < NBL_CORE_UNIQUE_STRING_LITERAL_TYPE("geometryCreator/spirv/gc.basic.vertex.spv") > (IShader::E_SHADER_STAGE::ESS_VERTEX, basic.vertex);
+		createShader.template operator() < NBL_CORE_UNIQUE_STRING_LITERAL_TYPE("geometryCreator/spirv/gc.basic.fragment.spv") > (IShader::E_SHADER_STAGE::ESS_FRAGMENT, basic.fragment);
+
+		typename ResourcesBundleScratch::Shaders& cone = scratch.shaders[GeometriesCpu::GP_CONE];
+		createShader.template operator() < NBL_CORE_UNIQUE_STRING_LITERAL_TYPE("geometryCreator/spirv/gc.cone.vertex.spv") > (IShader::E_SHADER_STAGE::ESS_VERTEX, cone.vertex);
+		createShader.template operator() < NBL_CORE_UNIQUE_STRING_LITERAL_TYPE("geometryCreator/spirv/gc.basic.fragment.spv") > (IShader::E_SHADER_STAGE::ESS_FRAGMENT, cone.fragment); // note we reuse fragment from basic!
+
+		typename ResourcesBundleScratch::Shaders& ico = scratch.shaders[GeometriesCpu::GP_ICO];
+		createShader.template operator() < NBL_CORE_UNIQUE_STRING_LITERAL_TYPE("geometryCreator/spirv/gc.ico.vertex.spv") > (IShader::E_SHADER_STAGE::ESS_VERTEX, ico.vertex);
+		createShader.template operator() < NBL_CORE_UNIQUE_STRING_LITERAL_TYPE("geometryCreator/spirv/gc.basic.fragment.spv") > (IShader::E_SHADER_STAGE::ESS_FRAGMENT, ico.fragment); // note we reuse fragment from basic!
+			
+		for (const auto& it : scratch.shaders)
+		{
+			if (!it.vertex || !it.fragment)
+			{
+				logger->log("Could not create shaders!", ILogger::ELL_ERROR);
+				return false;
+			}
+		}
+
+		return true;
+	}
+
+	bool createGeometries()
+	{
+		EXPOSE_NABLA_NAMESPACES();
+
+		for (uint32_t i = 0; i < geometries.objects.size(); ++i)
+		{
+			const auto& inGeometry = geometries.objects[i];
+			auto& [obj, meta] = scratch.objects[i];
+
+			bool status = true;
+
+			meta.name = inGeometry.meta.name;
+			meta.type = inGeometry.meta.type;
+
+			struct
+			{
+				SBlendParams blend;
+				SRasterizationParams rasterization;
+				typename Types::graphics_pipeline_t::SCreationParams pipeline;
+			} params;
+				
+			{
+				params.blend.logicOp = ELO_NO_OP;
+
+				auto& b = params.blend.blendParams[0];
+				b.srcColorFactor = EBF_SRC_ALPHA;
+				b.dstColorFactor = EBF_ONE_MINUS_SRC_ALPHA;
+				b.colorBlendOp = EBO_ADD;
+				b.srcAlphaFactor = EBF_SRC_ALPHA;
+				b.dstAlphaFactor = EBF_SRC_ALPHA;
+				b.alphaBlendOp = EBO_ADD;
+				b.colorWriteMask = (1u << 0u) | (1u << 1u) | (1u << 2u) | (1u << 3u);
+			}
+
+			params.rasterization.faceCullingMode = EFCM_NONE;
+			{
+				const typename Types::shader_t::SSpecInfo info [] =
+				{
+					{.entryPoint = "VSMain", .shader = scratch.shaders[inGeometry.shadersType].vertex.get() },
+					{.entryPoint = "PSMain", .shader = scratch.shaders[inGeometry.shadersType].fragment.get() }
+				};
+
+				params.pipeline.layout = scratch.pipelineLayout.get();
+				params.pipeline.shaders = info;
+				params.pipeline.renderpass = scratch.renderpass.get();
+				params.pipeline.cached = { .vertexInput = inGeometry.data.inputParams, .primitiveAssembly = inGeometry.data.assemblyParams, .rasterization = params.rasterization, .blend = params.blend, .subpassIx = 0u };
+
+				obj.indexCount = inGeometry.data.indexCount;
+				obj.indexType = inGeometry.data.indexType;
+
+				// TODO: cache pipeline & try lookup for existing one first maybe
+
+				// similar issue like with shaders again, in this case gpu contructor allows for extra cache parameters + there is no constructor you can use to fire make_smart_refctd_ptr yourself for cpu
+				if constexpr (withAssetConverter)
+					obj.pipeline = ICPUGraphicsPipeline::create(params.pipeline);
+				else
+				{
+					const std::array<const IGPUGraphicsPipeline::SCreationParams,1> info = { { params.pipeline } };
+					utilities->getLogicalDevice()->createGraphicsPipelines(nullptr, info, &obj.pipeline);
+				}
+
+				if (!obj.pipeline)
+				{
+					logger->log("Could not create graphics pipeline for [%s] object!", ILogger::ELL_ERROR, meta.name.data());
+					status = false;
+				}
+
+				// object buffers
+				auto createVIBuffers = [&]() -> bool
+				{
+					using ibuffer_t = ::nbl::asset::IBuffer; // seems to be ambigous, both asset & core namespaces has IBuffer
+
+					// note: similar issue like with shaders, this time with cpu-gpu constructors differing in arguments
+					auto vBuffer = smart_refctd_ptr(inGeometry.data.bindings[0].buffer); // no offset
+					constexpr static auto VERTEX_USAGE = bitflag(ibuffer_t::EUF_VERTEX_BUFFER_BIT) | ibuffer_t::EUF_TRANSFER_DST_BIT | ibuffer_t::EUF_INLINE_UPDATE_VIA_CMDBUF;
+					obj.bindings.vertex.offset = 0u;
+						
+					auto iBuffer = smart_refctd_ptr(inGeometry.data.indexBuffer.buffer); // no offset
+					constexpr static auto INDEX_USAGE = bitflag(ibuffer_t::EUF_INDEX_BUFFER_BIT) | ibuffer_t::EUF_VERTEX_BUFFER_BIT | ibuffer_t::EUF_TRANSFER_DST_BIT | ibuffer_t::EUF_INLINE_UPDATE_VIA_CMDBUF;
+					obj.bindings.index.offset = 0u;
+
+					if constexpr (withAssetConverter)
+					{
+						if (!vBuffer)
+							return false;
+
+						vBuffer->addUsageFlags(VERTEX_USAGE);
+						vBuffer->setContentHash(vBuffer->computeContentHash());
+						obj.bindings.vertex = { .offset = 0u, .buffer = vBuffer };
+
+						if (inGeometry.data.indexType != EIT_UNKNOWN)
+							if (iBuffer)
+							{
+								iBuffer->addUsageFlags(INDEX_USAGE);
+								iBuffer->setContentHash(iBuffer->computeContentHash());
+							}
+							else
+								return false;
+
+						obj.bindings.index = { .offset = 0u, .buffer = iBuffer };
+					}
+					else
+					{
+						auto vertexBuffer = utilities->getLogicalDevice()->createBuffer(IGPUBuffer::SCreationParams({ .size = vBuffer->getSize(), .usage = VERTEX_USAGE }));
+						auto indexBuffer = iBuffer ? utilities->getLogicalDevice()->createBuffer(IGPUBuffer::SCreationParams({ .size = iBuffer->getSize(), .usage = INDEX_USAGE })) : nullptr;
+
+						if (!vertexBuffer)
+							return false;
+
+						if (inGeometry.data.indexType != EIT_UNKNOWN)
+							if (!indexBuffer)
+								return false;
+
+						const auto mask = utilities->getLogicalDevice()->getPhysicalDevice()->getUpStreamingMemoryTypeBits();
+						for (auto it : { vertexBuffer , indexBuffer })
+						{
+							if (it)
+							{
+								auto reqs = it->getMemoryReqs();
+								reqs.memoryTypeBits &= mask;
+
+								utilities->getLogicalDevice()->allocate(reqs, it.get());
+							}
+						}
+
+						// record transfer uploads
+						obj.bindings.vertex = { .offset = 0u, .buffer = std::move(vertexBuffer) };
+						{
+							const SBufferRange<IGPUBuffer> range = { .offset = obj.bindings.vertex.offset, .size = obj.bindings.vertex.buffer->getSize(), .buffer = obj.bindings.vertex.buffer };
+							if (!commandBuffer->updateBuffer(range, vBuffer->getPointer()))
+							{
+								logger->log("Could not record vertex buffer transfer upload for [%s] object!", ILogger::ELL_ERROR, meta.name.data());
+								status = false;
+							}
+						}
+						obj.bindings.index = { .offset = 0u, .buffer = std::move(indexBuffer) };
+						{
+							if (iBuffer)
+							{
+								const SBufferRange<IGPUBuffer> range = { .offset = obj.bindings.index.offset, .size = obj.bindings.index.buffer->getSize(), .buffer = obj.bindings.index.buffer };
+
+								if (!commandBuffer->updateBuffer(range, iBuffer->getPointer()))
+								{
+									logger->log("Could not record index buffer transfer upload for [%s] object!", ILogger::ELL_ERROR, meta.name.data());
+									status = false;
+								}
+							}
+						}
+					}
+						
+					return true;
+				};
+
+				if (!createVIBuffers())
+				{
+					logger->log("Could not create buffers for [%s] object!", ILogger::ELL_ERROR, meta.name.data());
+					status = false;
+				}
+
+				if (!status)
+				{
+					logger->log("[%s] object will not be created!", ILogger::ELL_ERROR, meta.name.data());
+
+					obj.bindings.vertex = {};
+					obj.bindings.index = {};
+					obj.indexCount = 0u;
+					obj.indexType = E_INDEX_TYPE::EIT_UNKNOWN;
+					obj.pipeline = nullptr;
+
+					continue;
+				}
+			}
+		}
+
+		return true;
+	}
+
+
+	struct GeometriesCpu
+	{
+		enum GeometryShader
+		{
+			GP_BASIC = 0,
+			GP_CONE,
+			GP_ICO,
+
+			GP_COUNT
+		};
+
+
+	};
+
+		struct Shaders
+		{
+			nbl::core::smart_refctd_ptr<typename Types::shader_t> vertex = nullptr, fragment = nullptr;
+		};
+
+		std::array<Shaders, GeometriesCpu::GP_COUNT> shaders;
+};
+#endif
+
+}
+#endif
\ No newline at end of file

From 0ba8eed179bd6a4d86a63625cb9254a6f4a9714c Mon Sep 17 00:00:00 2001
From: keptsecret <sorchon@gmail.com>
Date: Wed, 18 Jun 2025 15:53:44 +0700
Subject: [PATCH 388/529] apply reduced macro definitions to benchmark ex

---
 .../app_resources/benchmarkSubgroup.comp.hlsl | 19 +++++----
 .../benchmarkWorkgroup.comp.hlsl              |  6 +--
 .../app_resources/shaderCommon.hlsl           | 11 -----
 29_Arithmetic2Bench/main.cpp                  | 42 +++++++++----------
 4 files changed, 33 insertions(+), 45 deletions(-)

diff --git a/29_Arithmetic2Bench/app_resources/benchmarkSubgroup.comp.hlsl b/29_Arithmetic2Bench/app_resources/benchmarkSubgroup.comp.hlsl
index 2c102c13d..f6ad3e678 100644
--- a/29_Arithmetic2Bench/app_resources/benchmarkSubgroup.comp.hlsl
+++ b/29_Arithmetic2Bench/app_resources/benchmarkSubgroup.comp.hlsl
@@ -4,29 +4,33 @@
 
 #include "nbl/builtin/hlsl/glsl_compat/core.hlsl"
 #include "nbl/builtin/hlsl/glsl_compat/subgroup_basic.hlsl"
+#include "nbl/builtin/hlsl/subgroup2/arithmetic_params.hlsl"
 #include "nbl/builtin/hlsl/subgroup2/arithmetic_portability.hlsl"
 #include "nbl/builtin/hlsl/random/xoroshiro.hlsl"
 
 #include "shaderCommon.hlsl"
 #include "nbl/builtin/hlsl/workgroup2/basic.hlsl"
 
-typedef vector<uint32_t, ITEMS_PER_INVOCATION> type_t;
+template<class Binop, class device_capabilities>
+using params_t = SUBGROUP_CONFIG_T;
+
+NBL_CONSTEXPR_STATIC_INLINE uint32_t ItemsPerInvocation = params_t<typename arithmetic::plus<uint32_t>::base_t, device_capabilities>::ItemsPerInvocation;
+
+typedef vector<uint32_t, ItemsPerInvocation> type_t;
 
 uint32_t globalIndex()
 {
     return glsl::gl_WorkGroupID().x*WORKGROUP_SIZE+workgroup::SubgroupContiguousIndex();
 }
 
-template<class Binop, uint32_t N>
+template<class Binop>
 static void subbench(NBL_CONST_REF_ARG(type_t) sourceVal)
 {
-    using config_t = subgroup2::Configuration<SUBGROUP_SIZE_LOG2>;
-    using params_t = subgroup2::ArithmeticParams<config_t, typename Binop::base_t, N, device_capabilities>;
     type_t value = sourceVal;
 
     const uint64_t outputBufAddr = pc.pOutputBuf[Binop::BindingIndex];
 
-    operation_t<params_t> func;
+    operation_t<params_t<typename Binop::base_t, device_capabilities> > func;
     // [unroll]
     for (uint32_t i = 0; i < NUM_LOOPS; i++)
         value = func(value);
@@ -36,13 +40,14 @@ static void subbench(NBL_CONST_REF_ARG(type_t) sourceVal)
 
 void benchmark()
 {
+    const uint32_t invocationIndex = globalIndex();
     type_t sourceVal;
     Xoroshiro64Star xoroshiro = Xoroshiro64Star::construct(uint32_t2(invocationIndex,invocationIndex+1));
     [unroll]
-    for (uint16_t i = 0; i < Config::ItemsPerInvocation_0; i++)
+    for (uint16_t i = 0; i < ItemsPerInvocation; i++)
         sourceVal[i] = xoroshiro();
 
-    subbench<arithmetic::plus<uint32_t>, ITEMS_PER_INVOCATION>(sourceVal);
+    subbench<arithmetic::plus<uint32_t> >(sourceVal);
 }
 
 [numthreads(WORKGROUP_SIZE,1,1)]
diff --git a/29_Arithmetic2Bench/app_resources/benchmarkWorkgroup.comp.hlsl b/29_Arithmetic2Bench/app_resources/benchmarkWorkgroup.comp.hlsl
index 50a9d912b..a56945467 100644
--- a/29_Arithmetic2Bench/app_resources/benchmarkWorkgroup.comp.hlsl
+++ b/29_Arithmetic2Bench/app_resources/benchmarkWorkgroup.comp.hlsl
@@ -6,12 +6,10 @@
 #include "nbl/builtin/hlsl/workgroup2/arithmetic.hlsl"
 #include "nbl/builtin/hlsl/random/xoroshiro.hlsl"
 
-static const uint32_t WORKGROUP_SIZE = 1u << WORKGROUP_SIZE_LOG2;
+using config_t = WORKGROUP_CONFIG_T;
 
 #include "shaderCommon.hlsl"
 
-using config_t = workgroup2::ArithmeticConfiguration<WORKGROUP_SIZE_LOG2, SUBGROUP_SIZE_LOG2, ITEMS_PER_INVOCATION>;
-
 typedef vector<uint32_t, config_t::ItemsPerInvocation_0> type_t;
 
 // final (level 1/2) scan needs to fit in one subgroup exactly
@@ -119,7 +117,7 @@ void benchmark()
 }
 
 
-[numthreads(WORKGROUP_SIZE,1,1)]
+[numthreads(config_t::WorkgroupSize,1,1)]
 void main()
 {
     benchmark();
diff --git a/29_Arithmetic2Bench/app_resources/shaderCommon.hlsl b/29_Arithmetic2Bench/app_resources/shaderCommon.hlsl
index 4866efe81..242ededd8 100644
--- a/29_Arithmetic2Bench/app_resources/shaderCommon.hlsl
+++ b/29_Arithmetic2Bench/app_resources/shaderCommon.hlsl
@@ -3,13 +3,6 @@
 using namespace nbl;
 using namespace hlsl;
 
-// https://github.com/microsoft/DirectXShaderCompiler/issues/6144
-uint32_t3 nbl::hlsl::glsl::gl_WorkGroupSize() {return uint32_t3(WORKGROUP_SIZE,1,1);}
-
-#ifndef ITEMS_PER_INVOCATION
-#error "Define ITEMS_PER_INVOCATION!"
-#endif
-
 [[vk::push_constant]] PushConstantData pc;
 
 struct device_capabilities
@@ -25,10 +18,6 @@ struct device_capabilities
 #error "Define OPERATION!"
 #endif
 
-#ifndef SUBGROUP_SIZE_LOG2
-#error "Define SUBGROUP_SIZE_LOG2!"
-#endif
-
 #ifndef NUM_LOOPS
 #error "Define NUM_LOOPS!"
 #endif
diff --git a/29_Arithmetic2Bench/main.cpp b/29_Arithmetic2Bench/main.cpp
index e88a59cae..2d5afeb4c 100644
--- a/29_Arithmetic2Bench/main.cpp
+++ b/29_Arithmetic2Bench/main.cpp
@@ -3,6 +3,7 @@
 #include "nbl/application_templates/MonoAssetManagerAndBuiltinResourceApplication.hpp"
 #include "app_resources/common.hlsl"
 #include "nbl/builtin/hlsl/workgroup2/arithmetic_config.hlsl"
+#include "nbl/builtin/hlsl/subgroup2/arithmetic_params.hlsl"
 
 using namespace nbl;
 using namespace core;
@@ -549,55 +550,50 @@ class ArithmeticBenchApp final : public examples::SimpleWindowedApplication, pub
 		smart_refctd_ptr<ICPUShader> overriddenUnspecialized;
 		if constexpr (WorkgroupBench)
 		{
-			const std::string definitions[7] = {
+			const std::string definitions[4] = {
 				"workgroup2::" + arith_name,
-				std::to_string(workgroupSizeLog2),
-				std::to_string(itemsPerWG),
-				std::to_string(itemsPerInvoc),
-				std::to_string(subgroupSizeLog2),
+				wgConfig.getConfigTemplateStructString(),
 				std::to_string(numLoops),
 				std::to_string(arith_name=="reduction")
 			};
 
-			const IShaderCompiler::SMacroDefinition defines[8] = {
+			const IShaderCompiler::SMacroDefinition defines[5] = {
 				{ "OPERATION", definitions[0] },
-				{ "WORKGROUP_SIZE_LOG2", definitions[1] },
-				{ "ITEMS_PER_WG", definitions[2] },
-				{ "ITEMS_PER_INVOCATION", definitions[3] },
-				{ "SUBGROUP_SIZE_LOG2", definitions[4] },
-				{ "NUM_LOOPS", definitions[5] },
-				{ "IS_REDUCTION", definitions[6] },
+				{ "WORKGROUP_CONFIG_T", definitions[1] },
+				{ "NUM_LOOPS", definitions[2] },
+				{ "IS_REDUCTION", definitions[3] },
 				{ "TEST_NATIVE", "1" }
 			};
 			if (UseNativeArithmetic)
-				options.preprocessorOptions.extraDefines = { defines, defines + 8 };
+				options.preprocessorOptions.extraDefines = { defines, defines + 5 };
 			else
-				options.preprocessorOptions.extraDefines = { defines, defines + 7 };
+				options.preprocessorOptions.extraDefines = { defines, defines + 4 };
 
 			overriddenUnspecialized = compiler->compileToSPIRV((const char*)source->getContent()->getPointer(), options);
 		}
 		else
 		{
-			const std::string definitions[5] = { 
+			hlsl::subgroup2::SArithmeticParams sgParams;
+			sgParams.init(subgroupSizeLog2, itemsPerInvoc);
+
+			const std::string definitions[4] = { 
 				"subgroup2::" + arith_name,
 				std::to_string(workgroupSize),
-				std::to_string(itemsPerInvoc),
-				std::to_string(subgroupSizeLog2),
+				sgParams.getParamTemplateStructString(),
 				std::to_string(numLoops)
 			};
 
-			const IShaderCompiler::SMacroDefinition defines[6] = {
+			const IShaderCompiler::SMacroDefinition defines[5] = {
 				{ "OPERATION", definitions[0] },
 				{ "WORKGROUP_SIZE", definitions[1] },
-				{ "ITEMS_PER_INVOCATION", definitions[2] },
-				{ "SUBGROUP_SIZE_LOG2", definitions[3] },
-				{ "NUM_LOOPS", definitions[4] },
+				{ "SUBGROUP_CONFIG_T", definitions[2] },
+				{ "NUM_LOOPS", definitions[3] },
 				{ "TEST_NATIVE", "1" }
 			};
 			if (UseNativeArithmetic)
-				options.preprocessorOptions.extraDefines = { defines, defines + 6 };
-			else
 				options.preprocessorOptions.extraDefines = { defines, defines + 5 };
+			else
+				options.preprocessorOptions.extraDefines = { defines, defines + 4 };
 
 			overriddenUnspecialized = compiler->compileToSPIRV((const char*)source->getContent()->getPointer(), options);
 		}

From c3786dfe24dd3f4d9ff2b60c63496f40bd3238b5 Mon Sep 17 00:00:00 2001
From: devsh <devsh@devsh.eu>
Date: Wed, 18 Jun 2025 12:27:43 +0200
Subject: [PATCH 389/529] prep for shader loading

---
 09_GeometryCreator/main.cpp                   |  27 +-
 common/include/nbl/examples/PCH.hpp           |   2 +
 .../geometry/CGeometryCreatorScene.hpp        |  16 +
 .../geometry/CSimpleDebugRenderer.hpp         | 332 ++++--------------
 4 files changed, 110 insertions(+), 267 deletions(-)

diff --git a/09_GeometryCreator/main.cpp b/09_GeometryCreator/main.cpp
index 5bbe40f37..4c982e8f8 100644
--- a/09_GeometryCreator/main.cpp
+++ b/09_GeometryCreator/main.cpp
@@ -4,24 +4,27 @@
 
 #include "common.hpp"
 
-class GeometryCreatorApp final : public MonoWindowApplication
+class GeometryCreatorApp final : public MonoWindowApplication, public application_templates::MonoAssetManagerAndBuiltinResourceApplication
 {
-		using base_t = MonoWindowApplication;
+	using device_base_t = MonoWindowApplication;
+	using asset_base_t = application_templates::MonoAssetManagerAndBuiltinResourceApplication;
 
 	public:
 		GeometryCreatorApp(const path& _localInputCWD, const path& _localOutputCWD, const path& _sharedInputCWD, const path& _sharedOutputCWD)
-			: base_t({1280,720}, EF_D16_UNORM, _localInputCWD, _localOutputCWD, _sharedInputCWD, _sharedOutputCWD) {}
+			: device_base_t({1280,720}, EF_D16_UNORM, _localInputCWD, _localOutputCWD, _sharedInputCWD, _sharedOutputCWD) {}
 
 		SPhysicalDeviceFeatures getRequiredDeviceFeatures() const override
 		{
-			auto retval = base_t::getRequiredDeviceFeatures();
+			auto retval = device_base_t::getRequiredDeviceFeatures();
 			retval.geometryShader = true;
 			return retval;
 		}
 
 		inline bool onAppInitialized(smart_refctd_ptr<ISystem>&& system) override
 		{
-			if (!base_t::onAppInitialized(smart_refctd_ptr(system)))
+			if (!asset_base_t::onAppInitialized(smart_refctd_ptr(system)))
+				return false;
+			if (!device_base_t::onAppInitialized(smart_refctd_ptr(system)))
 				return false;
 
 			m_semaphore = m_device->createSemaphore(m_realFrameIx);
@@ -54,7 +57,7 @@ class GeometryCreatorApp final : public MonoWindowApplication
 			);
 			
 			auto scRes = static_cast<CDefaultSwapchainFramebuffers*>(m_surface->getSwapchainResources());
-			m_renderer = CSimpleDebugRenderer::create(scRes->getRenderpass(),0,m_scene.get());
+			m_renderer = CSimpleDebugRenderer::create(m_assetMgr.get(),scRes->getRenderpass(),0,m_scene.get());
 			if (!m_renderer)
 				return logFail("Could not create Renderer!");
 
@@ -75,7 +78,7 @@ class GeometryCreatorApp final : public MonoWindowApplication
 			m_inputSystem->getDefaultMouse(&mouse);
 			m_inputSystem->getDefaultKeyboard(&keyboard);
 
-			const auto resourceIx = m_realFrameIx % base_t::MaxFramesInFlight;
+			const auto resourceIx = m_realFrameIx % device_base_t::MaxFramesInFlight;
 
 			auto* const cb = m_cmdBufs.data()[resourceIx].get();
 			cb->reset(IGPUCommandBuffer::RESET_FLAGS::RELEASE_RESOURCES_BIT);
@@ -120,7 +123,7 @@ class GeometryCreatorApp final : public MonoWindowApplication
 				auto scRes = static_cast<CDefaultSwapchainFramebuffers*>(m_surface->getSwapchainResources());
 				const IGPUCommandBuffer::SRenderpassBeginInfo info =
 				{
-					.framebuffer = scRes->getFramebuffer(base_t::getCurrentAcquire().imageIndex),
+					.framebuffer = scRes->getFramebuffer(device_base_t::getCurrentAcquire().imageIndex),
 					.colorClearValues = &clearValue,
 					.depthStencilClearValues = &depthValue,
 					.renderArea = currentRenderArea
@@ -153,7 +156,11 @@ class GeometryCreatorApp final : public MonoWindowApplication
 				{.cmdbuf = cb }
 			};
 			const IQueue::SSubmitInfo::SSemaphoreInfo acquired[] = {
-				{.semaphore = base_t::getCurrentAcquire().semaphore, .value = base_t::getCurrentAcquire().acquireCount, .stageMask = PIPELINE_STAGE_FLAGS::NONE}
+				{
+					.semaphore = device_base_t::getCurrentAcquire().semaphore,
+					.value = device_base_t::getCurrentAcquire().acquireCount,
+					.stageMask = PIPELINE_STAGE_FLAGS::NONE
+				}
 			};
 			const IQueue::SSubmitInfo infos[] =
 			{
@@ -227,7 +234,7 @@ class GeometryCreatorApp final : public MonoWindowApplication
 		//
 		smart_refctd_ptr<ISemaphore> m_semaphore;
 		uint64_t m_realFrameIx = 0;
-		std::array<smart_refctd_ptr<IGPUCommandBuffer>,base_t::MaxFramesInFlight> m_cmdBufs;
+		std::array<smart_refctd_ptr<IGPUCommandBuffer>,device_base_t::MaxFramesInFlight> m_cmdBufs;
 		//
 		InputSystem::ChannelReader<IMouseEventChannel> mouse;
 		InputSystem::ChannelReader<IKeyboardEventChannel> keyboard;
diff --git a/common/include/nbl/examples/PCH.hpp b/common/include/nbl/examples/PCH.hpp
index ed5da666e..2c08a2d84 100644
--- a/common/include/nbl/examples/PCH.hpp
+++ b/common/include/nbl/examples/PCH.hpp
@@ -6,6 +6,8 @@
 
 
 #include <nabla.h>
+// why isnt this in `nabla.h` ?
+#include "nbl/application_templates/MonoAssetManagerAndBuiltinResourceApplication.hpp"
 
 // #include "nbl/ui/CGraphicalApplicationAndroid.h"
 // #include "nbl/ui/CWindowManagerAndroid.h"
diff --git a/common/include/nbl/examples/geometry/CGeometryCreatorScene.hpp b/common/include/nbl/examples/geometry/CGeometryCreatorScene.hpp
index 8a73f2e14..12c12e3f3 100644
--- a/common/include/nbl/examples/geometry/CGeometryCreatorScene.hpp
+++ b/common/include/nbl/examples/geometry/CGeometryCreatorScene.hpp
@@ -180,4 +180,20 @@ class CGeometryCreatorScene : public core::IReferenceCounted
 };
 
 }
+#endif
+
+
+
+#if 0
+		typename ResourcesBundleScratch::Shaders& basic = scratch.shaders[GeometriesCpu::GP_BASIC];
+		createShader.template operator() < NBL_CORE_UNIQUE_STRING_LITERAL_TYPE("geometryCreator/spirv/gc.basic.vertex.spv") > (IShader::E_SHADER_STAGE::ESS_VERTEX, basic.vertex);
+		createShader.template operator() < NBL_CORE_UNIQUE_STRING_LITERAL_TYPE("geometryCreator/spirv/gc.basic.fragment.spv") > (IShader::E_SHADER_STAGE::ESS_FRAGMENT, basic.fragment);
+
+		typename ResourcesBundleScratch::Shaders& cone = scratch.shaders[GeometriesCpu::GP_CONE];
+		createShader.template operator() < NBL_CORE_UNIQUE_STRING_LITERAL_TYPE("geometryCreator/spirv/gc.cone.vertex.spv") > (IShader::E_SHADER_STAGE::ESS_VERTEX, cone.vertex);
+		createShader.template operator() < NBL_CORE_UNIQUE_STRING_LITERAL_TYPE("geometryCreator/spirv/gc.basic.fragment.spv") > (IShader::E_SHADER_STAGE::ESS_FRAGMENT, cone.fragment); // note we reuse fragment from basic!
+
+		typename ResourcesBundleScratch::Shaders& ico = scratch.shaders[GeometriesCpu::GP_ICO];
+		createShader.template operator() < NBL_CORE_UNIQUE_STRING_LITERAL_TYPE("geometryCreator/spirv/gc.ico.vertex.spv") > (IShader::E_SHADER_STAGE::ESS_VERTEX, ico.vertex);
+		createShader.template operator() < NBL_CORE_UNIQUE_STRING_LITERAL_TYPE("geometryCreator/spirv/gc.basic.fragment.spv") > (IShader::E_SHADER_STAGE::ESS_FRAGMENT, ico.fragment); // note we reuse fragment from basic!
 #endif
\ No newline at end of file
diff --git a/common/include/nbl/examples/geometry/CSimpleDebugRenderer.hpp b/common/include/nbl/examples/geometry/CSimpleDebugRenderer.hpp
index b6f6b4aaf..e18c6664a 100644
--- a/common/include/nbl/examples/geometry/CSimpleDebugRenderer.hpp
+++ b/common/include/nbl/examples/geometry/CSimpleDebugRenderer.hpp
@@ -81,21 +81,33 @@ class CSimpleDebugRenderer final : public core::IReferenceCounted
 		};
 
 		//
-		static inline core::smart_refctd_ptr<CSimpleDebugRenderer> create(video::IGPURenderpass* renderpass, const uint32_t subpassIX, const CGeometryCreatorScene* scene)
+		static inline core::smart_refctd_ptr<CSimpleDebugRenderer> create(asset::IAssetManager* assMan, video::IGPURenderpass* renderpass, const uint32_t subpassIX, const CGeometryCreatorScene* scene)
 		{
 			EXPOSE_NABLA_NAMESPACES;
 
-			if (!renderpass)
+			if (!!renderpass)
 				return nullptr;
 			auto device = const_cast<ILogicalDevice*>(renderpass->getOriginDevice());
 			auto logger = device->getLogger();
 
-			if (!scene)
+			if (!assMan || !scene)
 				return nullptr;
 			const auto namedGeoms = scene->getGeometries();
 			if (namedGeoms.empty())
 				return nullptr;
 
+			// load shader
+			smart_refctd_ptr<ICPUShader> shader;
+			{
+				const auto bundle = assMan->getAsset("nbl/examples/geometry/spirv/unified.spv",{});
+				const auto contents = bundle.getContents();
+				if (bundle.getAssetType()!=IAsset::ET_SHADER || contents.empty())
+					return nullptr;
+				shader = IAsset::castDown<ICPUShader>(contents[0]);
+				if (!shader)
+					return nullptr;
+			}
+
 			SInitParams init;
 
 			// create descriptor set
@@ -132,7 +144,7 @@ class CSimpleDebugRenderer final : public core::IReferenceCounted
 				}
 			}
 
-			//
+			// create pipeline layout
 			const SPushConstantRange ranges[] = {{
 				.stageFlags = hlsl::ShaderStage::ESS_VERTEX,
 				.offset = 0,
@@ -140,9 +152,64 @@ class CSimpleDebugRenderer final : public core::IReferenceCounted
 			}};
 			init.layout = device->createPipelineLayout(ranges,smart_refctd_ptr<const IGPUDescriptorSetLayout>(init.ds->getLayout()));
 
-			// TODO: Load Shaders and Create Pipelines
+			// create pipelines
+			enum PipelineType : uint8_t
+			{
+				BasicTriangleList,
+				BasicTriangleFan,
+				Cone,
+				Count
+			};
+			smart_refctd_ptr<IGPUGraphicsPipeline> pipelines[PipelineType::Count] = {};
 			{
-				//
+				IGPUGraphicsPipeline::SCreationParams params[PipelineType::Count] = {};
+				for (auto i=0; i< PipelineType::Count; i++)
+				{
+					const auto type = static_cast<PipelineType>(i);
+					// no vertex input
+					{
+						auto& primitiveAssembly = params[i].cached.primitiveAssembly;
+						switch (type)
+						{
+							case PipelineType::BasicTriangleFan:
+								primitiveAssembly.primitiveType = E_PRIMITIVE_TOPOLOGY::EPT_TRIANGLE_FAN;
+								break;
+							default:
+								primitiveAssembly.primitiveType = E_PRIMITIVE_TOPOLOGY::EPT_TRIANGLE_LIST;
+								break;
+						}
+						primitiveAssembly.primitiveRestartEnable = false;
+						primitiveAssembly.tessPatchVertCount = 3;
+					}
+					{
+						auto& rasterization = params[i].cached.rasterization;
+						rasterization.faceCullingMode = EFCM_NONE;
+					}
+					{
+						auto& blend = params[i].cached.blend;
+						// everything as default
+					}
+					params[i].cached.subpassIx = subpassIX;
+					params[i].renderpass = renderpass;
+				}
+				/*
+		typename ResourcesBundleScratch::Shaders& basic = scratch.shaders[GeometriesCpu::GP_BASIC];
+		createShader.template operator() < NBL_CORE_UNIQUE_STRING_LITERAL_TYPE("geometryCreator/spirv/gc.basic.vertex.spv") > (IShader::E_SHADER_STAGE::ESS_VERTEX, basic.vertex);
+		createShader.template operator() < NBL_CORE_UNIQUE_STRING_LITERAL_TYPE("geometryCreator/spirv/gc.basic.fragment.spv") > (IShader::E_SHADER_STAGE::ESS_FRAGMENT, basic.fragment);
+
+		typename ResourcesBundleScratch::Shaders& cone = scratch.shaders[GeometriesCpu::GP_CONE];
+		createShader.template operator() < NBL_CORE_UNIQUE_STRING_LITERAL_TYPE("geometryCreator/spirv/gc.cone.vertex.spv") > (IShader::E_SHADER_STAGE::ESS_VERTEX, cone.vertex);
+		createShader.template operator() < NBL_CORE_UNIQUE_STRING_LITERAL_TYPE("geometryCreator/spirv/gc.basic.fragment.spv") > (IShader::E_SHADER_STAGE::ESS_FRAGMENT, cone.fragment); // note we reuse fragment from basic!
+
+		typename ResourcesBundleScratch::Shaders& ico = scratch.shaders[GeometriesCpu::GP_ICO];
+		createShader.template operator() < NBL_CORE_UNIQUE_STRING_LITERAL_TYPE("geometryCreator/spirv/gc.ico.vertex.spv") > (IShader::E_SHADER_STAGE::ESS_VERTEX, ico.vertex);
+		createShader.template operator() < NBL_CORE_UNIQUE_STRING_LITERAL_TYPE("geometryCreator/spirv/gc.basic.fragment.spv") > (IShader::E_SHADER_STAGE::ESS_FRAGMENT, ico.fragment); // note we reuse fragment from basic!
+				*/
+				if (!device->createGraphicsPipelines(nullptr,params,pipelines))
+				{
+					logger->log("Could not create Graphics Pipelines!",ILogger::ELL_ERROR);
+					return nullptr;
+				}
 			}
 
 			// write geometries' attributes to descriptor set
@@ -164,6 +231,8 @@ class CSimpleDebugRenderer final : public core::IReferenceCounted
 					if (!geom->valid())
 						continue;
 					auto& out = init.geoms.emplace_back();
+// TODO: handle special cases
+					out.pipeline = pipelines[PipelineType::BasicTriangleList];
 					if (const auto& view=geom->getIndexView(); view)
 					{
 						out.indexBuffer.offset = view.src.offset;
@@ -238,256 +307,5 @@ class CSimpleDebugRenderer final : public core::IReferenceCounted
 #undef EXPOSE_NABLA_NAMESPACES
 };
 
-#if 0
-class ResourceBuilder
-{
-private:
-
-	bool createShaders()
-	{
-		EXPOSE_NABLA_NAMESPACES();
-
-		auto createShader = [&]<StringLiteral virtualPath>(IShader::E_SHADER_STAGE stage, smart_refctd_ptr<typename Types::shader_t>& outShader) -> smart_refctd_ptr<typename Types::shader_t>
-		{
-			// TODO: use SPIRV loader & our ::system ns to get those cpu shaders, do not create myself (shit I forgot it exists)
-
-			const SBuiltinFile& in = ::geometry::creator::spirv::builtin::get_resource<virtualPath>();
-			const auto buffer = ICPUBuffer::create({ { in.size }, (void*)in.contents, core::getNullMemoryResource() }, adopt_memory);
-			auto shader = make_smart_refctd_ptr<ICPUShader>(smart_refctd_ptr(buffer), stage, IShader::E_CONTENT_TYPE::ECT_SPIRV, ""); // must create cpu instance regardless underlying type
-
-			if constexpr (withAssetConverter)
-			{
-				buffer->setContentHash(buffer->computeContentHash());
-				outShader = std::move(shader);
-			}
-
-			return outShader;
-		};
-
-		typename ResourcesBundleScratch::Shaders& basic = scratch.shaders[GeometriesCpu::GP_BASIC];
-		createShader.template operator() < NBL_CORE_UNIQUE_STRING_LITERAL_TYPE("geometryCreator/spirv/gc.basic.vertex.spv") > (IShader::E_SHADER_STAGE::ESS_VERTEX, basic.vertex);
-		createShader.template operator() < NBL_CORE_UNIQUE_STRING_LITERAL_TYPE("geometryCreator/spirv/gc.basic.fragment.spv") > (IShader::E_SHADER_STAGE::ESS_FRAGMENT, basic.fragment);
-
-		typename ResourcesBundleScratch::Shaders& cone = scratch.shaders[GeometriesCpu::GP_CONE];
-		createShader.template operator() < NBL_CORE_UNIQUE_STRING_LITERAL_TYPE("geometryCreator/spirv/gc.cone.vertex.spv") > (IShader::E_SHADER_STAGE::ESS_VERTEX, cone.vertex);
-		createShader.template operator() < NBL_CORE_UNIQUE_STRING_LITERAL_TYPE("geometryCreator/spirv/gc.basic.fragment.spv") > (IShader::E_SHADER_STAGE::ESS_FRAGMENT, cone.fragment); // note we reuse fragment from basic!
-
-		typename ResourcesBundleScratch::Shaders& ico = scratch.shaders[GeometriesCpu::GP_ICO];
-		createShader.template operator() < NBL_CORE_UNIQUE_STRING_LITERAL_TYPE("geometryCreator/spirv/gc.ico.vertex.spv") > (IShader::E_SHADER_STAGE::ESS_VERTEX, ico.vertex);
-		createShader.template operator() < NBL_CORE_UNIQUE_STRING_LITERAL_TYPE("geometryCreator/spirv/gc.basic.fragment.spv") > (IShader::E_SHADER_STAGE::ESS_FRAGMENT, ico.fragment); // note we reuse fragment from basic!
-			
-		for (const auto& it : scratch.shaders)
-		{
-			if (!it.vertex || !it.fragment)
-			{
-				logger->log("Could not create shaders!", ILogger::ELL_ERROR);
-				return false;
-			}
-		}
-
-		return true;
-	}
-
-	bool createGeometries()
-	{
-		EXPOSE_NABLA_NAMESPACES();
-
-		for (uint32_t i = 0; i < geometries.objects.size(); ++i)
-		{
-			const auto& inGeometry = geometries.objects[i];
-			auto& [obj, meta] = scratch.objects[i];
-
-			bool status = true;
-
-			meta.name = inGeometry.meta.name;
-			meta.type = inGeometry.meta.type;
-
-			struct
-			{
-				SBlendParams blend;
-				SRasterizationParams rasterization;
-				typename Types::graphics_pipeline_t::SCreationParams pipeline;
-			} params;
-				
-			{
-				params.blend.logicOp = ELO_NO_OP;
-
-				auto& b = params.blend.blendParams[0];
-				b.srcColorFactor = EBF_SRC_ALPHA;
-				b.dstColorFactor = EBF_ONE_MINUS_SRC_ALPHA;
-				b.colorBlendOp = EBO_ADD;
-				b.srcAlphaFactor = EBF_SRC_ALPHA;
-				b.dstAlphaFactor = EBF_SRC_ALPHA;
-				b.alphaBlendOp = EBO_ADD;
-				b.colorWriteMask = (1u << 0u) | (1u << 1u) | (1u << 2u) | (1u << 3u);
-			}
-
-			params.rasterization.faceCullingMode = EFCM_NONE;
-			{
-				const typename Types::shader_t::SSpecInfo info [] =
-				{
-					{.entryPoint = "VSMain", .shader = scratch.shaders[inGeometry.shadersType].vertex.get() },
-					{.entryPoint = "PSMain", .shader = scratch.shaders[inGeometry.shadersType].fragment.get() }
-				};
-
-				params.pipeline.layout = scratch.pipelineLayout.get();
-				params.pipeline.shaders = info;
-				params.pipeline.renderpass = scratch.renderpass.get();
-				params.pipeline.cached = { .vertexInput = inGeometry.data.inputParams, .primitiveAssembly = inGeometry.data.assemblyParams, .rasterization = params.rasterization, .blend = params.blend, .subpassIx = 0u };
-
-				obj.indexCount = inGeometry.data.indexCount;
-				obj.indexType = inGeometry.data.indexType;
-
-				// TODO: cache pipeline & try lookup for existing one first maybe
-
-				// similar issue like with shaders again, in this case gpu contructor allows for extra cache parameters + there is no constructor you can use to fire make_smart_refctd_ptr yourself for cpu
-				if constexpr (withAssetConverter)
-					obj.pipeline = ICPUGraphicsPipeline::create(params.pipeline);
-				else
-				{
-					const std::array<const IGPUGraphicsPipeline::SCreationParams,1> info = { { params.pipeline } };
-					utilities->getLogicalDevice()->createGraphicsPipelines(nullptr, info, &obj.pipeline);
-				}
-
-				if (!obj.pipeline)
-				{
-					logger->log("Could not create graphics pipeline for [%s] object!", ILogger::ELL_ERROR, meta.name.data());
-					status = false;
-				}
-
-				// object buffers
-				auto createVIBuffers = [&]() -> bool
-				{
-					using ibuffer_t = ::nbl::asset::IBuffer; // seems to be ambigous, both asset & core namespaces has IBuffer
-
-					// note: similar issue like with shaders, this time with cpu-gpu constructors differing in arguments
-					auto vBuffer = smart_refctd_ptr(inGeometry.data.bindings[0].buffer); // no offset
-					constexpr static auto VERTEX_USAGE = bitflag(ibuffer_t::EUF_VERTEX_BUFFER_BIT) | ibuffer_t::EUF_TRANSFER_DST_BIT | ibuffer_t::EUF_INLINE_UPDATE_VIA_CMDBUF;
-					obj.bindings.vertex.offset = 0u;
-						
-					auto iBuffer = smart_refctd_ptr(inGeometry.data.indexBuffer.buffer); // no offset
-					constexpr static auto INDEX_USAGE = bitflag(ibuffer_t::EUF_INDEX_BUFFER_BIT) | ibuffer_t::EUF_VERTEX_BUFFER_BIT | ibuffer_t::EUF_TRANSFER_DST_BIT | ibuffer_t::EUF_INLINE_UPDATE_VIA_CMDBUF;
-					obj.bindings.index.offset = 0u;
-
-					if constexpr (withAssetConverter)
-					{
-						if (!vBuffer)
-							return false;
-
-						vBuffer->addUsageFlags(VERTEX_USAGE);
-						vBuffer->setContentHash(vBuffer->computeContentHash());
-						obj.bindings.vertex = { .offset = 0u, .buffer = vBuffer };
-
-						if (inGeometry.data.indexType != EIT_UNKNOWN)
-							if (iBuffer)
-							{
-								iBuffer->addUsageFlags(INDEX_USAGE);
-								iBuffer->setContentHash(iBuffer->computeContentHash());
-							}
-							else
-								return false;
-
-						obj.bindings.index = { .offset = 0u, .buffer = iBuffer };
-					}
-					else
-					{
-						auto vertexBuffer = utilities->getLogicalDevice()->createBuffer(IGPUBuffer::SCreationParams({ .size = vBuffer->getSize(), .usage = VERTEX_USAGE }));
-						auto indexBuffer = iBuffer ? utilities->getLogicalDevice()->createBuffer(IGPUBuffer::SCreationParams({ .size = iBuffer->getSize(), .usage = INDEX_USAGE })) : nullptr;
-
-						if (!vertexBuffer)
-							return false;
-
-						if (inGeometry.data.indexType != EIT_UNKNOWN)
-							if (!indexBuffer)
-								return false;
-
-						const auto mask = utilities->getLogicalDevice()->getPhysicalDevice()->getUpStreamingMemoryTypeBits();
-						for (auto it : { vertexBuffer , indexBuffer })
-						{
-							if (it)
-							{
-								auto reqs = it->getMemoryReqs();
-								reqs.memoryTypeBits &= mask;
-
-								utilities->getLogicalDevice()->allocate(reqs, it.get());
-							}
-						}
-
-						// record transfer uploads
-						obj.bindings.vertex = { .offset = 0u, .buffer = std::move(vertexBuffer) };
-						{
-							const SBufferRange<IGPUBuffer> range = { .offset = obj.bindings.vertex.offset, .size = obj.bindings.vertex.buffer->getSize(), .buffer = obj.bindings.vertex.buffer };
-							if (!commandBuffer->updateBuffer(range, vBuffer->getPointer()))
-							{
-								logger->log("Could not record vertex buffer transfer upload for [%s] object!", ILogger::ELL_ERROR, meta.name.data());
-								status = false;
-							}
-						}
-						obj.bindings.index = { .offset = 0u, .buffer = std::move(indexBuffer) };
-						{
-							if (iBuffer)
-							{
-								const SBufferRange<IGPUBuffer> range = { .offset = obj.bindings.index.offset, .size = obj.bindings.index.buffer->getSize(), .buffer = obj.bindings.index.buffer };
-
-								if (!commandBuffer->updateBuffer(range, iBuffer->getPointer()))
-								{
-									logger->log("Could not record index buffer transfer upload for [%s] object!", ILogger::ELL_ERROR, meta.name.data());
-									status = false;
-								}
-							}
-						}
-					}
-						
-					return true;
-				};
-
-				if (!createVIBuffers())
-				{
-					logger->log("Could not create buffers for [%s] object!", ILogger::ELL_ERROR, meta.name.data());
-					status = false;
-				}
-
-				if (!status)
-				{
-					logger->log("[%s] object will not be created!", ILogger::ELL_ERROR, meta.name.data());
-
-					obj.bindings.vertex = {};
-					obj.bindings.index = {};
-					obj.indexCount = 0u;
-					obj.indexType = E_INDEX_TYPE::EIT_UNKNOWN;
-					obj.pipeline = nullptr;
-
-					continue;
-				}
-			}
-		}
-
-		return true;
-	}
-
-
-	struct GeometriesCpu
-	{
-		enum GeometryShader
-		{
-			GP_BASIC = 0,
-			GP_CONE,
-			GP_ICO,
-
-			GP_COUNT
-		};
-
-
-	};
-
-		struct Shaders
-		{
-			nbl::core::smart_refctd_ptr<typename Types::shader_t> vertex = nullptr, fragment = nullptr;
-		};
-
-		std::array<Shaders, GeometriesCpu::GP_COUNT> shaders;
-};
-#endif
-
 }
 #endif
\ No newline at end of file

From 173a3c960bcf10bf7705e1a0c30f49727db87daa Mon Sep 17 00:00:00 2001
From: kevyuu <kevin.kayu@gmail.com>
Date: Wed, 18 Jun 2025 18:17:11 +0700
Subject: [PATCH 390/529] Fix compile error for example 23 and 29

---
 23_Arithmetic2UnitTest/main.cpp | 16 ++++++++--------
 29_Arithmetic2Bench/main.cpp    | 18 +++++++++---------
 2 files changed, 17 insertions(+), 17 deletions(-)

diff --git a/23_Arithmetic2UnitTest/main.cpp b/23_Arithmetic2UnitTest/main.cpp
index 65ef126ad..da0d3de7d 100644
--- a/23_Arithmetic2UnitTest/main.cpp
+++ b/23_Arithmetic2UnitTest/main.cpp
@@ -157,7 +157,7 @@ class Workgroup2ScanTestApp final : public application_templates::BasicMultiQueu
 				exit(-1);
 			}
 			auto firstAssetInBundle = bundle.getContents()[0];
-			return smart_refctd_ptr_static_cast<ICPUShader>(firstAssetInBundle);
+			return smart_refctd_ptr_static_cast<IShader>(firstAssetInBundle);
 		};
 
 		auto subgroupTestSource = getShaderSource("app_resources/testSubgroup.comp.hlsl");
@@ -263,18 +263,18 @@ class Workgroup2ScanTestApp final : public application_templates::BasicMultiQueu
 	}
 
 	// create pipeline (specialized every test) [TODO: turn into a future/async]
-	smart_refctd_ptr<IGPUComputePipeline> createPipeline(const ICPUShader* overridenUnspecialized, const uint8_t subgroupSizeLog2)
+	smart_refctd_ptr<IGPUComputePipeline> createPipeline(const IShader* overridenUnspecialized, const uint8_t subgroupSizeLog2)
 	{
-		auto shader = m_device->createShader(overridenUnspecialized);
+		auto shader = m_device->compileShader({ overridenUnspecialized });
 		IGPUComputePipeline::SCreationParams params = {};
 		params.layout = pipelineLayout.get();
 		params.shader = {
-			.entryPoint = "main",
 			.shader = shader.get(),
+			.entryPoint = "main",
+			.requiredSubgroupSize = static_cast<IPipelineBase::SUBGROUP_SIZE>(subgroupSizeLog2),
 			.entries = nullptr,
-			.requiredSubgroupSize = static_cast<IGPUShader::SSpecInfo::SUBGROUP_SIZE>(subgroupSizeLog2),
-			.requireFullSubgroups = true
 		};
+		params.cached.requireFullSubgroups = true;
 		core::smart_refctd_ptr<IGPUComputePipeline> pipeline;
 		if (!m_device->createComputePipelines(m_spirv_isa_cache.get(),{&params,1},&pipeline))
 			return nullptr;
@@ -282,7 +282,7 @@ class Workgroup2ScanTestApp final : public application_templates::BasicMultiQueu
 	}
 
 	template<template<class> class Arithmetic, bool WorkgroupTest>
-	bool runTest(const smart_refctd_ptr<const ICPUShader>& source, const uint32_t elementCount, const uint8_t subgroupSizeLog2, const uint32_t workgroupSize, bool useNative, uint32_t itemsPerWG, uint32_t itemsPerInvoc = 1u)
+	bool runTest(const smart_refctd_ptr<const IShader>& source, const uint32_t elementCount, const uint8_t subgroupSizeLog2, const uint32_t workgroupSize, bool useNative, uint32_t itemsPerWG, uint32_t itemsPerInvoc = 1u)
 	{
 		std::string arith_name = Arithmetic<arithmetic::bit_xor<float>>::name;
 		const uint32_t workgroupSizeLog2 = hlsl::findMSB(workgroupSize);
@@ -305,7 +305,7 @@ class Workgroup2ScanTestApp final : public application_templates::BasicMultiQueu
 		auto* includeFinder = compiler->getDefaultIncludeFinder();
 		options.preprocessorOptions.includeFinder = includeFinder;
 
-		smart_refctd_ptr<ICPUShader> overriddenUnspecialized;
+		smart_refctd_ptr<IShader> overriddenUnspecialized;
 		if constexpr (WorkgroupTest)
 		{
 			hlsl::workgroup2::SArithmeticConfiguration wgConfig;
diff --git a/29_Arithmetic2Bench/main.cpp b/29_Arithmetic2Bench/main.cpp
index 2d5afeb4c..61e94607b 100644
--- a/29_Arithmetic2Bench/main.cpp
+++ b/29_Arithmetic2Bench/main.cpp
@@ -346,12 +346,12 @@ class ArithmeticBenchApp final : public examples::SimpleWindowedApplication, pub
 				exit(-1);
 			}
 			auto firstAssetInBundle = bundle.getContents()[0];
-			return smart_refctd_ptr_static_cast<ICPUShader>(firstAssetInBundle);
+			return smart_refctd_ptr_static_cast<IShader>(firstAssetInBundle);
 		};
 
 		// for each workgroup size (manually adjust items per invoc, operation else uses up a lot of ram)
 		const auto MaxSubgroupSize = m_physicalDevice->getLimits().maxSubgroupSize;
-		smart_refctd_ptr<ICPUShader> shaderSource;
+		smart_refctd_ptr<IShader> shaderSource;
 		if constexpr (DoWorkgroupBenchmarks)
 			shaderSource = getShaderSource("app_resources/benchmarkWorkgroup.comp.hlsl");
 		else
@@ -496,18 +496,18 @@ class ArithmeticBenchApp final : public examples::SimpleWindowedApplication, pub
 
 private:
 	// create pipeline (specialized every test) [TODO: turn into a future/async]
-	smart_refctd_ptr<IGPUComputePipeline> createPipeline(const ICPUShader* overridenUnspecialized, const IGPUPipelineLayout* layout, const uint8_t subgroupSizeLog2)
+	smart_refctd_ptr<IGPUComputePipeline> createPipeline(const IShader* overridenUnspecialized, const IGPUPipelineLayout* layout, const uint8_t subgroupSizeLog2)
 	{
-		auto shader = m_device->createShader(overridenUnspecialized);
+		auto shader = m_device->compileShader({ overridenUnspecialized });
 		IGPUComputePipeline::SCreationParams params = {};
 		params.layout = layout;
 		params.shader = {
-			.entryPoint = "main",
 			.shader = shader.get(),
+			.entryPoint = "main",
+			.requiredSubgroupSize = static_cast<IPipelineBase::SUBGROUP_SIZE>(subgroupSizeLog2),
 			.entries = nullptr,
-			.requiredSubgroupSize = static_cast<IGPUShader::SSpecInfo::SUBGROUP_SIZE>(subgroupSizeLog2),
-			.requireFullSubgroups = true
 		};
+		params.cached.requireFullSubgroups = true;
 		core::smart_refctd_ptr<IGPUComputePipeline> pipeline;
 		if (!m_device->createComputePipelines(nullptr,{&params,1},&pipeline))
 			return nullptr;
@@ -522,7 +522,7 @@ class ArithmeticBenchApp final : public examples::SimpleWindowedApplication, pub
 	};
 
 	template<bool WorkgroupBench>
-	BenchmarkSet createBenchmarkPipelines(const smart_refctd_ptr<const ICPUShader>&source, const IGPUPipelineLayout* layout, const uint32_t elementCount, const std::string& arith_name, const uint8_t subgroupSizeLog2, const uint32_t workgroupSize, uint32_t itemsPerInvoc = 1u, uint32_t numLoops = 8u)
+	BenchmarkSet createBenchmarkPipelines(const smart_refctd_ptr<const IShader>&source, const IGPUPipelineLayout* layout, const uint32_t elementCount, const std::string& arith_name, const uint8_t subgroupSizeLog2, const uint32_t workgroupSize, uint32_t itemsPerInvoc = 1u, uint32_t numLoops = 8u)
 	{
 		auto compiler = make_smart_refctd_ptr<asset::CHLSLCompiler>(smart_refctd_ptr(m_system));
 		CHLSLCompiler::SOptions options = {};
@@ -547,7 +547,7 @@ class ArithmeticBenchApp final : public examples::SimpleWindowedApplication, pub
 		hlsl::workgroup2::SArithmeticConfiguration wgConfig;
 	    wgConfig.init(workgroupSizeLog2, subgroupSizeLog2, itemsPerInvoc);
 		const uint32_t itemsPerWG = wgConfig.VirtualWorkgroupSize * wgConfig.ItemsPerInvocation_0;
-		smart_refctd_ptr<ICPUShader> overriddenUnspecialized;
+		smart_refctd_ptr<IShader> overriddenUnspecialized;
 		if constexpr (WorkgroupBench)
 		{
 			const std::string definitions[4] = {

From d03f31b1c623618cbfa43855c23183d7b20afd07 Mon Sep 17 00:00:00 2001
From: devsh <devsh@devsh.eu>
Date: Wed, 18 Jun 2025 23:00:49 +0200
Subject: [PATCH 391/529] kill a CMakeLists.txt not used in the example PCH
 branch

---
 .../geometry/CGeometryCreatorScene.hpp        | 16 ----
 .../src/nbl/examples/geometry/CMakeLists.txt  | 73 -------------------
 2 files changed, 89 deletions(-)
 delete mode 100644 common/src/nbl/examples/geometry/CMakeLists.txt

diff --git a/common/include/nbl/examples/geometry/CGeometryCreatorScene.hpp b/common/include/nbl/examples/geometry/CGeometryCreatorScene.hpp
index 12c12e3f3..8a73f2e14 100644
--- a/common/include/nbl/examples/geometry/CGeometryCreatorScene.hpp
+++ b/common/include/nbl/examples/geometry/CGeometryCreatorScene.hpp
@@ -180,20 +180,4 @@ class CGeometryCreatorScene : public core::IReferenceCounted
 };
 
 }
-#endif
-
-
-
-#if 0
-		typename ResourcesBundleScratch::Shaders& basic = scratch.shaders[GeometriesCpu::GP_BASIC];
-		createShader.template operator() < NBL_CORE_UNIQUE_STRING_LITERAL_TYPE("geometryCreator/spirv/gc.basic.vertex.spv") > (IShader::E_SHADER_STAGE::ESS_VERTEX, basic.vertex);
-		createShader.template operator() < NBL_CORE_UNIQUE_STRING_LITERAL_TYPE("geometryCreator/spirv/gc.basic.fragment.spv") > (IShader::E_SHADER_STAGE::ESS_FRAGMENT, basic.fragment);
-
-		typename ResourcesBundleScratch::Shaders& cone = scratch.shaders[GeometriesCpu::GP_CONE];
-		createShader.template operator() < NBL_CORE_UNIQUE_STRING_LITERAL_TYPE("geometryCreator/spirv/gc.cone.vertex.spv") > (IShader::E_SHADER_STAGE::ESS_VERTEX, cone.vertex);
-		createShader.template operator() < NBL_CORE_UNIQUE_STRING_LITERAL_TYPE("geometryCreator/spirv/gc.basic.fragment.spv") > (IShader::E_SHADER_STAGE::ESS_FRAGMENT, cone.fragment); // note we reuse fragment from basic!
-
-		typename ResourcesBundleScratch::Shaders& ico = scratch.shaders[GeometriesCpu::GP_ICO];
-		createShader.template operator() < NBL_CORE_UNIQUE_STRING_LITERAL_TYPE("geometryCreator/spirv/gc.ico.vertex.spv") > (IShader::E_SHADER_STAGE::ESS_VERTEX, ico.vertex);
-		createShader.template operator() < NBL_CORE_UNIQUE_STRING_LITERAL_TYPE("geometryCreator/spirv/gc.basic.fragment.spv") > (IShader::E_SHADER_STAGE::ESS_FRAGMENT, ico.fragment); // note we reuse fragment from basic!
 #endif
\ No newline at end of file
diff --git a/common/src/nbl/examples/geometry/CMakeLists.txt b/common/src/nbl/examples/geometry/CMakeLists.txt
deleted file mode 100644
index c402a2b8a..000000000
--- a/common/src/nbl/examples/geometry/CMakeLists.txt
+++ /dev/null
@@ -1,73 +0,0 @@
-# TODO: let arek figure out how to redo the shaders
-#[===[
-
-# shaders IO directories
-set(NBL_EXAMPLES_GEOMETRY_INPUT_SHADERS_DIRECTORY "${CMAKE_CURRENT_SOURCE_DIR}/shaders")
-get_filename_component(_EXAMPLES_GEOMETRY_SPIRV_BR_BUNDLE_SEARCH_DIRECTORY_ "${CMAKE_CURRENT_BINARY_DIR}/shaders/include" ABSOLUTE)
-get_filename_component(_EXAMPLES_GEOMETRY_SPIRV_BR_OUTPUT_DIRECTORY_HEADER_ "${CMAKE_CURRENT_BINARY_DIR}/builtin/include" ABSOLUTE)
-get_filename_component(_EXAMPLES_GEOMETRY_SPIRV_BR_OUTPUT_DIRECTORY_SOURCE_ "${CMAKE_CURRENT_BINARY_DIR}/builtin/src" ABSOLUTE)
-set(NBL_EXAMPLES_GEOMETRY_OUTPUT_SPIRV_DIRECTORY "${_EXAMPLES_GEOMETRY_SPIRV_BR_BUNDLE_SEARCH_DIRECTORY_}/nbl/examples/geometry/spirv")
-
-# list of input source shaders
-set(NBL_EXAMPLES_GEOMETRY_INPUT_SHADERS
-	# geometry creator
-	"${NBL_EXAMPLES_GEOMETRY_INPUT_SHADERS_DIRECTORY}/gc.basic.fragment.hlsl"
-	"${NBL_EXAMPLES_GEOMETRY_INPUT_SHADERS_DIRECTORY}/gc.basic.vertex.hlsl"
-	"${NBL_EXAMPLES_GEOMETRY_INPUT_SHADERS_DIRECTORY}/gc.cone.vertex.hlsl"
-	"${NBL_EXAMPLES_GEOMETRY_INPUT_SHADERS_DIRECTORY}/gc.ico.vertex.hlsl"
-	
-	# grid
-	"${NBL_EXAMPLES_GEOMETRY_INPUT_SHADERS_DIRECTORY}/grid.vertex.hlsl"
-	"${NBL_EXAMPLES_GEOMETRY_INPUT_SHADERS_DIRECTORY}/grid.fragment.hlsl"
-)
-
-file(GLOB_RECURSE NBL_EXAMPLES_GEOMETRY_INPUT_COMMONS CONFIGURE_DEPENDS "${NBL_EXAMPLES_GEOMETRY_INPUT_SHADERS_DIRECTORY}/template/*.hlsl")
-
-include("${NBL_ROOT_PATH}/src/nbl/builtin/utils.cmake")
-
-foreach(NBL_INPUT_SHADER IN LISTS NBL_EXAMPLES_GEOMETRY_INPUT_SHADERS)
-	cmake_path(GET NBL_INPUT_SHADER FILENAME NBL_INPUT_SHADER_FILENAME)
-	cmake_path(GET NBL_INPUT_SHADER_FILENAME STEM LAST_ONLY NBL_SHADER_STEM) # filename without .hlsl extension
-	cmake_path(GET NBL_SHADER_STEM EXTENSION LAST_ONLY NBL_SHADER_TYPE) # .<shader type>
-	
-	set(NBL_OUTPUT_SPIRV_FILENAME "${NBL_SHADER_STEM}.spv")
-	set(NBL_OUTPUT_SPIRV_PATH "${NBL_EXAMPLES_GEOMETRY_OUTPUT_SPIRV_DIRECTORY}/${NBL_OUTPUT_SPIRV_FILENAME}")
-
-	if(NBL_SHADER_TYPE STREQUAL .vertex)
-		set(NBL_NSC_COMPILE_OPTIONS -T vs_6_8 -E VSMain)
-	elseif(NBL_SHADER_TYPE STREQUAL .geometry)
-		set(NBL_NSC_COMPILE_OPTIONS -T gs_6_8 -E GSMain)
-	elseif(NBL_SHADER_TYPE STREQUAL .fragment)
-		set(NBL_NSC_COMPILE_OPTIONS -T ps_6_8 -E PSMain)
-	else()
-		message(FATAL_ERROR "Input shader is supposed to be <name>.<shader type>.hlsl!")
-	endif()
-	
-	set(NBL_NSC_COMPILE_COMMAND
-		"$<TARGET_FILE:nsc>"
-		-Fc "${NBL_OUTPUT_SPIRV_PATH}"
-		-I "${NBL_EXAMPLES_API_INCLUDE_DIRECTORY}"
-		${NBL_NSC_COMPILE_OPTIONS} # this should come from shader's [#pragma WAVE <compile options>] but our NSC doesn't seem to work properly currently
-		"${NBL_INPUT_SHADER}"
-	)
-	
-	set(NBL_DEPENDS
-		"${NBL_INPUT_SHADER}"
-		${NBL_EXAMPLES_GEOMETRY_INPUT_COMMONS}
-	)
-		
-	add_custom_command(OUTPUT "${NBL_OUTPUT_SPIRV_PATH}"
-	   COMMAND ${NBL_NSC_COMPILE_COMMAND}
-	   DEPENDS ${NBL_DEPENDS}
-	   WORKING_DIRECTORY "${NBL_EXAMPLES_GEOMETRY_INPUT_SHADERS_DIRECTORY}"
-	   COMMENT "Generating \"${NBL_OUTPUT_SPIRV_PATH}\""
-	   VERBATIM
-	   COMMAND_EXPAND_LISTS
-	)
-	
-	list(APPEND NBL_EXAMPLES_GEOMETRY_OUTPUT_SPIRV_BUILTINS "${NBL_OUTPUT_SPIRV_PATH}")
-	LIST_BUILTIN_RESOURCE(GEOMETRY_CREATOR_SPIRV_RESOURCES_TO_EMBED "geometry/spirv/${NBL_OUTPUT_SPIRV_FILENAME}")
-endforeach()
-
-ADD_CUSTOM_BUILTIN_RESOURCES(geometryCreatorSpirvBRD GEOMETRY_CREATOR_SPIRV_RESOURCES_TO_EMBED "${_EXAMPLES_GEOMETRY_SPIRV_BR_BUNDLE_SEARCH_DIRECTORY_}" "nbl" "geometry::spirv::builtin" "${_EXAMPLES_GEOMETRY_SPIRV_BR_OUTPUT_DIRECTORY_HEADER_}" "${_EXAMPLES_GEOMETRY_SPIRV_BR_OUTPUT_DIRECTORY_SOURCE_}" "STATIC" "INTERNAL")
-]===]
\ No newline at end of file

From f6aebbf086db6151f43798a622ab427ba7463142 Mon Sep 17 00:00:00 2001
From: devsh <devsh@devsh.eu>
Date: Wed, 18 Jun 2025 23:02:19 +0200
Subject: [PATCH 392/529] commit shaders somewhere for now

---
 common/src/nbl/examples/CMakeLists.txt        |  7 +-
 .../geometry/shaders/grid.vertex.hlsl         |  6 --
 .../template/gc.basic.vertex.input.hlsl       | 18 ++---
 .../geometry/shaders/template/gc.common.hlsl  | 22 ++++---
 .../template/gc.cone.vertex.input.hlsl        | 15 ++---
 .../shaders/template/gc.ico.vertex.input.hlsl | 16 ++---
 .../geometry/shaders/template/gc.vertex.hlsl  |  9 +--
 .../shaders/template/grid.common.hlsl         | 65 ++++++++++---------
 8 files changed, 70 insertions(+), 88 deletions(-)

diff --git a/common/src/nbl/examples/CMakeLists.txt b/common/src/nbl/examples/CMakeLists.txt
index 96ccaabea..032c038b4 100644
--- a/common/src/nbl/examples/CMakeLists.txt
+++ b/common/src/nbl/examples/CMakeLists.txt
@@ -1,9 +1,8 @@
-# TODO: @AnastaZluk redo the PCH
-# add_subdirectory(pch EXCLUDE_FROM_ALL)
-
 # we add common libraries
 # add_subdirectory(cameras EXCLUDE_FROM_ALL) # header only currently
-add_subdirectory(geometry EXCLUDE_FROM_ALL)
+
+# TODO builtin SPIR-V shaders
+# add_subdirectory(geometry EXCLUDE_FROM_ALL)
 
 # we get all available targets inclusive & below this directory
 NBL_GET_ALL_TARGETS(NBL_SUBDIRECTORY_TARGETS)
diff --git a/common/src/nbl/examples/geometry/shaders/grid.vertex.hlsl b/common/src/nbl/examples/geometry/shaders/grid.vertex.hlsl
index 167b981d3..389c37bf2 100644
--- a/common/src/nbl/examples/geometry/shaders/grid.vertex.hlsl
+++ b/common/src/nbl/examples/geometry/shaders/grid.vertex.hlsl
@@ -1,11 +1,5 @@
 #include "template/grid.common.hlsl"
 
-// set 1, binding 0
-[[vk::binding(0, 1)]]
-cbuffer CameraData
-{
-    SBasicViewParameters params;
-};
 
 PSInput VSMain(VSInput input)
 {
diff --git a/common/src/nbl/examples/geometry/shaders/template/gc.basic.vertex.input.hlsl b/common/src/nbl/examples/geometry/shaders/template/gc.basic.vertex.input.hlsl
index d9e2fa172..862d4508e 100644
--- a/common/src/nbl/examples/geometry/shaders/template/gc.basic.vertex.input.hlsl
+++ b/common/src/nbl/examples/geometry/shaders/template/gc.basic.vertex.input.hlsl
@@ -1,16 +1,12 @@
-#ifndef _THIS_EXAMPLE_GC_BASIC_VERTEX_INPUT_HLSL_
-#define _THIS_EXAMPLE_GC_BASIC_VERTEX_INPUT_HLSL_
+#ifndef _NBL_EXAMPLES_GC_BASIC_VERTEX_INPUT_HLSL_
+#define _NBL_EXAMPLES_GC_BASIC_VERTEX_INPUT_HLSL_
 
-struct VSInput
-{
-    [[vk::location(0)]] float3 position : POSITION;
-    [[vk::location(1)]] float4 color : COLOR;
-    [[vk::location(2)]] float2 uv : TEXCOORD;
-    [[vk::location(3)]] float3 normal : NORMAL;
-};
-
-#endif // _THIS_EXAMPLE_GC_BASIC_VERTEX_INPUT_HLSL_
+[[vk::binding(0)]] Buffer<float32_t3> position;
+[[vk::binding(1)]] Buffer<float32_t3> normal;
+[[vk::binding(2)]] Buffer<float32_t2> uv;
+[[vk::binding(3)]] Buffer<float32_t3> color;
 
+#endif // _NBL_EXAMPLES_GC_BASIC_VERTEX_INPUT_HLSL_
 /*
     do not remove this text, WAVE is so bad that you can get errors if no proper ending xD
 */
diff --git a/common/src/nbl/examples/geometry/shaders/template/gc.common.hlsl b/common/src/nbl/examples/geometry/shaders/template/gc.common.hlsl
index 26e2885f7..ff40fb3c8 100644
--- a/common/src/nbl/examples/geometry/shaders/template/gc.common.hlsl
+++ b/common/src/nbl/examples/geometry/shaders/template/gc.common.hlsl
@@ -1,17 +1,21 @@
-#ifndef _THIS_EXAMPLE_GC_COMMON_HLSL_
-#define _THIS_EXAMPLE_GC_COMMON_HLSL_
+#ifndef _NBL_EXAMPLES_GC_COMMON_HLSL_
+#define _NBL_EXAMPLES_GC_COMMON_HLSL_
+
+
+#include "common/SBasicViewParameters.hlsl"
 
 #ifdef __HLSL_VERSION
-	struct PSInput
-	{
-		float4 position : SV_Position;
-		float4 color : COLOR0;
-	};
+[[vk::push_constant]] SBasicViewParameters params;
+
+struct PSInput
+{
+	float4 position : SV_Position;
+	float3 color : COLOR0;
+};
 #endif // __HLSL_VERSION
 
-#include "common/SBasicViewParameters.hlsl"
 
-#endif // _THIS_EXAMPLE_GC_COMMON_HLSL_
+#endif // _NBL_EXAMPLES_GC_COMMON_HLSL_
 
 /*
 	do not remove this text, WAVE is so bad that you can get errors if no proper ending xD
diff --git a/common/src/nbl/examples/geometry/shaders/template/gc.cone.vertex.input.hlsl b/common/src/nbl/examples/geometry/shaders/template/gc.cone.vertex.input.hlsl
index 66221fef1..7c40f54ab 100644
--- a/common/src/nbl/examples/geometry/shaders/template/gc.cone.vertex.input.hlsl
+++ b/common/src/nbl/examples/geometry/shaders/template/gc.cone.vertex.input.hlsl
@@ -1,14 +1,11 @@
-#ifndef _THIS_EXAMPLE_GC_CONE_VERTEX_INPUT_HLSL_
-#define _THIS_EXAMPLE_GC_CONE_VERTEX_INPUT_HLSL_
+#ifndef _NBL_EXAMPLES_GEOMETRY_CONE_VERTEX_INPUT_HLSL_
+#define _NBL_EXAMPLES_GEOMETRY_CONE_VERTEX_INPUT_HLSL_
 
-struct VSInput
-{
-    [[vk::location(0)]] float3 position : POSITION;
-    [[vk::location(1)]] float4 color : COLOR;
-    [[vk::location(2)]] float3 normal : NORMAL;
-};
+[[vk::binding(0)]] Buffer<float32_t3> position;
+[[vk::binding(1)]] Buffer<float32_t3> normal;
+[[vk::binding(2)]] Buffer<float32_t3> color;
 
-#endif // _THIS_EXAMPLE_GC_CONE_VERTEX_INPUT_HLSL_
+#endif // _NBL_EXAMPLES_GEOMETRY_CONE_VERTEX_INPUT_HLSL_
 
 /*
     do not remove this text, WAVE is so bad that you can get errors if no proper ending xD
diff --git a/common/src/nbl/examples/geometry/shaders/template/gc.ico.vertex.input.hlsl b/common/src/nbl/examples/geometry/shaders/template/gc.ico.vertex.input.hlsl
index 6b85486d9..67092ccf0 100644
--- a/common/src/nbl/examples/geometry/shaders/template/gc.ico.vertex.input.hlsl
+++ b/common/src/nbl/examples/geometry/shaders/template/gc.ico.vertex.input.hlsl
@@ -1,15 +1,11 @@
-#ifndef _THIS_EXAMPLE_GC_ICO_VERTEX_INPUT_HLSL_
-#define _THIS_EXAMPLE_GC_ICO_VERTEX_INPUT_HLSL_
+#ifndef _NBL_EXAMPLES_GEOMETRY_ICO_VERTEX_INPUT_HLSL_
+#define _NBL_EXAMPLES_GEOMETRY_ICO_VERTEX_INPUT_HLSL_
 
-struct VSInput
-{
-    [[vk::location(0)]] float3 position : POSITION;
-    [[vk::location(1)]] float3 normal : NORMAL;
-    [[vk::location(2)]] float2 uv : TEXCOORD;
-};
-
-#endif // _THIS_EXAMPLE_GC_ICO_VERTEX_INPUT_HLSL_
+[[vk::binding(0)]] Buffer<float32_t3> position;
+[[vk::binding(1)]] Buffer<float32_t3> normal;
+[[vk::binding(2)]] Buffer<float32_t2> uv;
 
+#endif // _NBL_EXAMPLES_GEOMETRY_ICO_VERTEX_INPUT_HLSL_
 /*
     do not remove this text, WAVE is so bad that you can get errors if no proper ending xD
 */
diff --git a/common/src/nbl/examples/geometry/shaders/template/gc.vertex.hlsl b/common/src/nbl/examples/geometry/shaders/template/gc.vertex.hlsl
index 5a8f26722..e878bf7d7 100644
--- a/common/src/nbl/examples/geometry/shaders/template/gc.vertex.hlsl
+++ b/common/src/nbl/examples/geometry/shaders/template/gc.vertex.hlsl
@@ -1,13 +1,6 @@
 #include "gc.common.hlsl"
 
-// set 1, binding 0
-[[vk::binding(0, 1)]]
-cbuffer CameraData
-{
-    SBasicViewParameters params;
-};
-
-PSInput VSMain(VSInput input)
+PSInput VSMain()
 {
     PSInput output;
 
diff --git a/common/src/nbl/examples/geometry/shaders/template/grid.common.hlsl b/common/src/nbl/examples/geometry/shaders/template/grid.common.hlsl
index 616412245..7ec9017e9 100644
--- a/common/src/nbl/examples/geometry/shaders/template/grid.common.hlsl
+++ b/common/src/nbl/examples/geometry/shaders/template/grid.common.hlsl
@@ -1,40 +1,43 @@
-#ifndef _THIS_EXAMPLE_GRID_COMMON_HLSL_
-#define _THIS_EXAMPLE_GRID_COMMON_HLSL_
+#ifndef _NBL_EXAMPLES_GRID_COMMON_HLSL_
+#define _NBL_EXAMPLES_GRID_COMMON_HLSL_
+
+#include "common/SBasicViewParameters.hlsl"
 
 #ifdef __HLSL_VERSION
-    struct VSInput
-	{
-		[[vk::location(0)]] float3 position : POSITION;
-		[[vk::location(1)]] float4 color : COLOR;
-		[[vk::location(2)]] float2 uv : TEXCOORD;
-		[[vk::location(3)]] float3 normal : NORMAL;
-	};
-
-    struct PSInput
-    {
-        float4 position : SV_Position;
-        float2 uv : TEXCOORD0;
-    };
-
-    float gridTextureGradBox(float2 p, float2 ddx, float2 ddy)
-    {
-        float N = 30.0; // grid ratio
-        float2 w = max(abs(ddx), abs(ddy)) + 0.01; // filter kernel
-
-        // analytic (box) filtering
-        float2 a = p + 0.5 * w;
-        float2 b = p - 0.5 * w;
-        float2 i = (floor(a) + min(frac(a) * N, 1.0) - floor(b) - min(frac(b) * N, 1.0)) / (N * w);
-
-        // pattern
-        return (1.0 - i.x) * (1.0 - i.y);
-    }
+// TODO: why is there even a mesh with HW vertices for this?
+struct VSInput
+{
+	[[vk::location(0)]] float3 position : POSITION;
+	[[vk::location(1)]] float4 color : COLOR;
+	[[vk::location(2)]] float2 uv : TEXCOORD;
+	[[vk::location(3)]] float3 normal : NORMAL;
+};
+
+struct PSInput
+{
+    float4 position : SV_Position;
+    float2 uv : TEXCOORD0;
+};
+
+[[vk::push_constant]] SBasicViewParameters params;
 #endif // __HLSL_VERSION
 
-#include "common/SBasicViewParameters.hlsl"
 
-#endif // _THIS_EXAMPLE_GRID_COMMON_HLSL_
+float gridTextureGradBox(float2 p, float2 ddx, float2 ddy)
+{
+    float N = 30.0; // grid ratio
+    float2 w = max(abs(ddx), abs(ddy)) + 0.01; // filter kernel
+
+    // analytic (box) filtering
+    float2 a = p + 0.5 * w;
+    float2 b = p - 0.5 * w;
+    float2 i = (floor(a) + min(frac(a) * N, 1.0) - floor(b) - min(frac(b) * N, 1.0)) / (N * w);
+
+    // pattern
+    return (1.0 - i.x) * (1.0 - i.y);
+}
 
+#endif // _NBL_EXAMPLES_GRID_COMMON_HLSL_
 /*
     do not remove this text, WAVE is so bad that you can get errors if no proper ending xD
 */
\ No newline at end of file

From 8965fb33e3cff0b0cf1d05f5fc4124072c8898e3 Mon Sep 17 00:00:00 2001
From: devsh <devsh@devsh.eu>
Date: Thu, 19 Jun 2025 00:14:23 +0200
Subject: [PATCH 393/529] get ex 09 in line with Stageless Shaders

---
 .../geometry/CSimpleDebugRenderer.hpp         | 76 +++++++++----------
 1 file changed, 37 insertions(+), 39 deletions(-)

diff --git a/common/include/nbl/examples/geometry/CSimpleDebugRenderer.hpp b/common/include/nbl/examples/geometry/CSimpleDebugRenderer.hpp
index e18c6664a..bd190c082 100644
--- a/common/include/nbl/examples/geometry/CSimpleDebugRenderer.hpp
+++ b/common/include/nbl/examples/geometry/CSimpleDebugRenderer.hpp
@@ -97,13 +97,14 @@ class CSimpleDebugRenderer final : public core::IReferenceCounted
 				return nullptr;
 
 			// load shader
-			smart_refctd_ptr<ICPUShader> shader;
+			smart_refctd_ptr<IShader> shader;
 			{
-				const auto bundle = assMan->getAsset("nbl/examples/geometry/spirv/unified.spv",{});
+				const auto bundle = assMan->getAsset("nbl/examples/geometry/shaders/unified.hlsl",{});
+				//const auto bundle = assMan->getAsset("nbl/examples/geometry/shaders/unified.spv",{});
 				const auto contents = bundle.getContents();
 				if (bundle.getAssetType()!=IAsset::ET_SHADER || contents.empty())
 					return nullptr;
-				shader = IAsset::castDown<ICPUShader>(contents[0]);
+				shader = IAsset::castDown<IShader>(contents[0]);
 				if (!shader)
 					return nullptr;
 			}
@@ -163,48 +164,35 @@ class CSimpleDebugRenderer final : public core::IReferenceCounted
 			smart_refctd_ptr<IGPUGraphicsPipeline> pipelines[PipelineType::Count] = {};
 			{
 				IGPUGraphicsPipeline::SCreationParams params[PipelineType::Count] = {};
+				params[PipelineType::BasicTriangleList].vertexShader = {.shader=shader.get(),.entryPoint="BasicTriangleListVS"};
+				params[PipelineType::BasicTriangleList].fragmentShader = {.shader=shader.get(),.entryPoint="BasicFS"};
+				params[PipelineType::BasicTriangleFan].vertexShader = {.shader=shader.get(),.entryPoint="BasicTriangleFanVS"};
+				params[PipelineType::BasicTriangleFan].fragmentShader = {.shader=shader.get(),.entryPoint="BasicFS"};
+				params[PipelineType::Cone].vertexShader = {.shader=shader.get(),.entryPoint="ConeVS"};
+				params[PipelineType::Cone].fragmentShader = {.shader=shader.get(),.entryPoint="ConeFS"};
 				for (auto i=0; i< PipelineType::Count; i++)
 				{
-					const auto type = static_cast<PipelineType>(i);
+					params[i].layout = init.layout.get();
 					// no vertex input
+					auto& primitiveAssembly = params[i].cached.primitiveAssembly;
+					auto& rasterization = params[i].cached.rasterization;
+					auto& blend = params[i].cached.blend;
+					const auto type = static_cast<PipelineType>(i);
+					switch (type)
 					{
-						auto& primitiveAssembly = params[i].cached.primitiveAssembly;
-						switch (type)
-						{
-							case PipelineType::BasicTriangleFan:
-								primitiveAssembly.primitiveType = E_PRIMITIVE_TOPOLOGY::EPT_TRIANGLE_FAN;
-								break;
-							default:
-								primitiveAssembly.primitiveType = E_PRIMITIVE_TOPOLOGY::EPT_TRIANGLE_LIST;
-								break;
-						}
-						primitiveAssembly.primitiveRestartEnable = false;
-						primitiveAssembly.tessPatchVertCount = 3;
-					}
-					{
-						auto& rasterization = params[i].cached.rasterization;
-						rasterization.faceCullingMode = EFCM_NONE;
-					}
-					{
-						auto& blend = params[i].cached.blend;
-						// everything as default
+						case PipelineType::BasicTriangleFan:
+							primitiveAssembly.primitiveType = E_PRIMITIVE_TOPOLOGY::EPT_TRIANGLE_FAN;
+							break;
+						default:
+							primitiveAssembly.primitiveType = E_PRIMITIVE_TOPOLOGY::EPT_TRIANGLE_LIST;
+							break;
 					}
+					primitiveAssembly.primitiveRestartEnable = false;
+					primitiveAssembly.tessPatchVertCount = 3;
+					rasterization.faceCullingMode = EFCM_NONE;
 					params[i].cached.subpassIx = subpassIX;
 					params[i].renderpass = renderpass;
 				}
-				/*
-		typename ResourcesBundleScratch::Shaders& basic = scratch.shaders[GeometriesCpu::GP_BASIC];
-		createShader.template operator() < NBL_CORE_UNIQUE_STRING_LITERAL_TYPE("geometryCreator/spirv/gc.basic.vertex.spv") > (IShader::E_SHADER_STAGE::ESS_VERTEX, basic.vertex);
-		createShader.template operator() < NBL_CORE_UNIQUE_STRING_LITERAL_TYPE("geometryCreator/spirv/gc.basic.fragment.spv") > (IShader::E_SHADER_STAGE::ESS_FRAGMENT, basic.fragment);
-
-		typename ResourcesBundleScratch::Shaders& cone = scratch.shaders[GeometriesCpu::GP_CONE];
-		createShader.template operator() < NBL_CORE_UNIQUE_STRING_LITERAL_TYPE("geometryCreator/spirv/gc.cone.vertex.spv") > (IShader::E_SHADER_STAGE::ESS_VERTEX, cone.vertex);
-		createShader.template operator() < NBL_CORE_UNIQUE_STRING_LITERAL_TYPE("geometryCreator/spirv/gc.basic.fragment.spv") > (IShader::E_SHADER_STAGE::ESS_FRAGMENT, cone.fragment); // note we reuse fragment from basic!
-
-		typename ResourcesBundleScratch::Shaders& ico = scratch.shaders[GeometriesCpu::GP_ICO];
-		createShader.template operator() < NBL_CORE_UNIQUE_STRING_LITERAL_TYPE("geometryCreator/spirv/gc.ico.vertex.spv") > (IShader::E_SHADER_STAGE::ESS_VERTEX, ico.vertex);
-		createShader.template operator() < NBL_CORE_UNIQUE_STRING_LITERAL_TYPE("geometryCreator/spirv/gc.basic.fragment.spv") > (IShader::E_SHADER_STAGE::ESS_FRAGMENT, ico.fragment); // note we reuse fragment from basic!
-				*/
 				if (!device->createGraphicsPipelines(nullptr,params,pipelines))
 				{
 					logger->log("Could not create Graphics Pipelines!",ILogger::ELL_ERROR);
@@ -231,8 +219,18 @@ class CSimpleDebugRenderer final : public core::IReferenceCounted
 					if (!geom->valid())
 						continue;
 					auto& out = init.geoms.emplace_back();
-// TODO: handle special cases
-					out.pipeline = pipelines[PipelineType::BasicTriangleList];
+					switch (geom->getIndexingCallback()->knownTopology())
+					{
+						case E_PRIMITIVE_TOPOLOGY::EPT_TRIANGLE_FAN:
+							out.pipeline = pipelines[PipelineType::BasicTriangleFan];
+							break;
+						default:
+							out.pipeline = pipelines[PipelineType::BasicTriangleList];
+							break;
+					}
+					// special case
+					if (entry.name=="Cone")
+						out.pipeline = pipelines[PipelineType::Cone];
 					if (const auto& view=geom->getIndexView(); view)
 					{
 						out.indexBuffer.offset = view.src.offset;

From a860c432a1a05f8fb114fd8574d3eae6f60c4204 Mon Sep 17 00:00:00 2001
From: devsh <devsh@devsh.eu>
Date: Thu, 19 Jun 2025 01:18:24 +0200
Subject: [PATCH 394/529] fix up examples before the PCH PR

---
 03_DeviceSelectionAndSharedSources/main.cpp   |  5 ++-
 07_StagingAndMultipleQueues/main.cpp          |  7 +---
 08_HelloSwapchain/main.cpp                    |  2 +-
 10_CountingSort/main.cpp                      |  4 +--
 24_ColorSpaceTest/main.cpp                    |  9 +++--
 26_Blur/main.cpp                              | 15 ++++----
 27_MPMCScheduler/main.cpp                     |  9 +++--
 28_FFTBloom/main.cpp                          | 11 +++---
 .../include/nbl/this_example/common.hpp       | 16 +++------
 30_ComputeShaderPathTracer/main.cpp           | 10 +++---
 70_FLIPFluids/main.cpp                        | 34 ++++++++-----------
 common/include/nbl/examples/examples.hpp      | 11 ++++++
 12 files changed, 60 insertions(+), 73 deletions(-)
 create mode 100644 common/include/nbl/examples/examples.hpp

diff --git a/03_DeviceSelectionAndSharedSources/main.cpp b/03_DeviceSelectionAndSharedSources/main.cpp
index 5fb584e4d..6c99aff7f 100644
--- a/03_DeviceSelectionAndSharedSources/main.cpp
+++ b/03_DeviceSelectionAndSharedSources/main.cpp
@@ -2,10 +2,9 @@
 // This file is part of the "Nabla Engine".
 // For conditions of distribution and use, see copyright notice in nabla.h
 
-#include "nbl/application_templates/MonoDeviceApplication.hpp"
-#include "nbl/application_templates/MonoAssetManagerAndBuiltinResourceApplication.hpp"
+#include "nbl/examples/examples.hpp"
+// TODO: why isn't this in `nabla.h` ?
 #include "nbl/asset/metadata/CHLSLMetadata.h"
-#include "CommonPCH/PCH.hpp"
 
 using namespace nbl;
 using namespace core;
diff --git a/07_StagingAndMultipleQueues/main.cpp b/07_StagingAndMultipleQueues/main.cpp
index 17c64d30e..a1a06f4f4 100644
--- a/07_StagingAndMultipleQueues/main.cpp
+++ b/07_StagingAndMultipleQueues/main.cpp
@@ -3,12 +3,7 @@
 // For conditions of distribution and use, see copyright notice in nabla.h
 
 // I've moved out a tiny part of this example into a shared header for reuse, please open and read it.
-
-#include "nbl/application_templates/BasicMultiQueueApplication.hpp"
-#include "nbl/application_templates/MonoAssetManagerAndBuiltinResourceApplication.hpp"
-
-// get asset converter
-#include "CommonPCH/PCH.hpp"
+#include "nbl/examples/examples.hpp"
 
 using namespace nbl;
 using namespace core;
diff --git a/08_HelloSwapchain/main.cpp b/08_HelloSwapchain/main.cpp
index 9137fe77a..cd294b0d2 100644
--- a/08_HelloSwapchain/main.cpp
+++ b/08_HelloSwapchain/main.cpp
@@ -1,7 +1,7 @@
 // Copyright (C) 2018-2024 - DevSH Graphics Programming Sp. z O.O.
 // This file is part of the "Nabla Engine".
 // For conditions of distribution and use, see copyright notice in nabla.h
-#include "SimpleWindowedApplication.hpp"
+#include "nbl/examples/examples.hpp"
 
 //
 #include "nbl/video/surface/CSurfaceVulkan.h"
diff --git a/10_CountingSort/main.cpp b/10_CountingSort/main.cpp
index de2ffca8b..0efc0518e 100644
--- a/10_CountingSort/main.cpp
+++ b/10_CountingSort/main.cpp
@@ -1,6 +1,4 @@
-#include "nbl/application_templates/MonoDeviceApplication.hpp"
-#include "nbl/application_templates/MonoAssetManagerAndBuiltinResourceApplication.hpp"
-#include "CommonPCH/PCH.hpp"
+#include "nbl/examples/examples.hpp"
 
 using namespace nbl;
 using namespace core;
diff --git a/24_ColorSpaceTest/main.cpp b/24_ColorSpaceTest/main.cpp
index fae93cf45..56af4fc79 100644
--- a/24_ColorSpaceTest/main.cpp
+++ b/24_ColorSpaceTest/main.cpp
@@ -1,10 +1,8 @@
 ﻿// Copyright (C) 2018-2024 - DevSH Graphics Programming Sp. z O.O.
 // This file is part of the "Nabla Engine".
 // For conditions of distribution and use, see copyright notice in nabla.h
-#include "nbl/application_templates/MonoAssetManagerAndBuiltinResourceApplication.hpp"
-#include "SimpleWindowedApplication.hpp"
+#include "nbl/examples/examples.hpp"
 
-#include "nbl/video/surface/CSurfaceVulkan.h"
 #include "nbl/ext/FullScreenTriangle/FullScreenTriangle.h"
 
 #include "nlohmann/json.hpp"
@@ -19,13 +17,14 @@ using namespace system;
 using namespace asset;
 using namespace ui;
 using namespace video;
+using namespace nbl::examples;
 
 // defines for sampler tests can be found in the file below
 #include "app_resources/push_constants.hlsl"
 
-class ColorSpaceTestSampleApp final : public examples::SimpleWindowedApplication, public application_templates::MonoAssetManagerAndBuiltinResourceApplication
+class ColorSpaceTestSampleApp final : public SimpleWindowedApplication, public application_templates::MonoAssetManagerAndBuiltinResourceApplication
 {
-		using device_base_t = examples::SimpleWindowedApplication;
+		using device_base_t = SimpleWindowedApplication;
 		using asset_base_t = application_templates::MonoAssetManagerAndBuiltinResourceApplication;
 		using clock_t = std::chrono::steady_clock;
 		using perf_clock_resolution_t = std::chrono::milliseconds;
diff --git a/26_Blur/main.cpp b/26_Blur/main.cpp
index bd4b6dedc..e5105c778 100644
--- a/26_Blur/main.cpp
+++ b/26_Blur/main.cpp
@@ -1,27 +1,24 @@
 // Copyright (C) 2024-2025 - DevSH Graphics Programming Sp. z O.O.
 // This file is part of the "Nabla Engine".
 // For conditions of distribution and use, see copyright notice in nabla.h
+#include "nbl/examples/examples.hpp"
+
 #include <bit>
 #include <limits>
 
-#include "nabla.h"
-#include "SimpleWindowedApplication.hpp"
-#include "InputSystem.hpp"
-#include "CEventCallback.hpp"
-#include "nbl/application_templates/MonoAssetManagerAndBuiltinResourceApplication.hpp"
-
 using namespace nbl;
 using namespace nbl::core;
 using namespace nbl::system;
 using namespace nbl::asset;
 using namespace nbl::ui;
 using namespace nbl::video;
+using namespace nbl::examples;
 
 #include "app_resources/common.hlsl"
 
-class BlurApp final : public examples::SimpleWindowedApplication, public application_templates::MonoAssetManagerAndBuiltinResourceApplication
+class BlurApp final : public SimpleWindowedApplication, public application_templates::MonoAssetManagerAndBuiltinResourceApplication
 {
-		using device_base_t = examples::SimpleWindowedApplication;
+		using device_base_t = SimpleWindowedApplication;
 		using asset_base_t = application_templates::MonoAssetManagerAndBuiltinResourceApplication;
 		using clock_t = std::chrono::steady_clock;
 
@@ -262,7 +259,7 @@ class BlurApp final : public examples::SimpleWindowedApplication, public applica
 					ISPIRVOptimizer::EOP_LOCAL_MULTI_STORE_ELIM
 				};
 				auto opt = make_smart_refctd_ptr<ISPIRVOptimizer>(optPasses);
-				shader = m_device->createShader(source.get(), opt.get());
+				shader = m_device->compileShader({ source.get(),opt.get() });
 #else
 				shader = m_device->compileShader({ source.get() });
 #endif
diff --git a/27_MPMCScheduler/main.cpp b/27_MPMCScheduler/main.cpp
index 33768c981..18d396135 100644
--- a/27_MPMCScheduler/main.cpp
+++ b/27_MPMCScheduler/main.cpp
@@ -1,9 +1,7 @@
 // Copyright (C) 2024-2025 - DevSH Graphics Programming Sp. z O.O.
 // This file is part of the "Nabla Engine".
 // For conditions of distribution and use, see copyright notice in nabla.h
-#include "nabla.h"
-#include "nbl/application_templates/MonoAssetManagerAndBuiltinResourceApplication.hpp"
-#include "SimpleWindowedApplication.hpp"
+#include "nbl/examples/examples.hpp"
 
 using namespace nbl;
 using namespace nbl::core;
@@ -11,12 +9,13 @@ using namespace nbl::system;
 using namespace nbl::asset;
 using namespace nbl::ui;
 using namespace nbl::video;
+using namespace nbl::examples;
 
 #include "app_resources/common.hlsl"
 
-class MPMCSchedulerApp final : public examples::SimpleWindowedApplication, public application_templates::MonoAssetManagerAndBuiltinResourceApplication
+class MPMCSchedulerApp final : public SimpleWindowedApplication, public application_templates::MonoAssetManagerAndBuiltinResourceApplication
 {
-		using device_base_t = examples::SimpleWindowedApplication;
+		using device_base_t = SimpleWindowedApplication;
 		using asset_base_t = application_templates::MonoAssetManagerAndBuiltinResourceApplication;
 		using clock_t = std::chrono::steady_clock;
 
diff --git a/28_FFTBloom/main.cpp b/28_FFTBloom/main.cpp
index b528d3c41..16835ecf6 100644
--- a/28_FFTBloom/main.cpp
+++ b/28_FFTBloom/main.cpp
@@ -1,9 +1,7 @@
 // Copyright (C) 2018-2024 - DevSH Graphics Programming Sp. z O.O.
 // This file is part of the "Nabla Engine".
 // For conditions of distribution and use, see copyright notice in nabla.h
-
-#include "SimpleWindowedApplication.hpp"
-#include "nbl/application_templates/MonoAssetManagerAndBuiltinResourceApplication.hpp"
+#include "nbl/examples/examples.hpp"
 
 using namespace nbl;
 using namespace core;
@@ -11,6 +9,7 @@ using namespace system;
 using namespace asset;
 using namespace video;
 using namespace ui;
+using namespace nbl::examples;
 
 #include "app_resources/common.hlsl"
 #include "nbl/builtin/hlsl/bit.hlsl"
@@ -19,9 +18,9 @@ using namespace ui;
 constexpr uint32_t WIN_W = 1280;
 constexpr uint32_t WIN_H = 720;
 
-class FFTBloomApp final : public examples::SimpleWindowedApplication, public application_templates::MonoAssetManagerAndBuiltinResourceApplication
+class FFTBloomApp final : public SimpleWindowedApplication, public application_templates::MonoAssetManagerAndBuiltinResourceApplication
 {
-	using device_base_t = examples::SimpleWindowedApplication;
+	using device_base_t = SimpleWindowedApplication;
 	using asset_base_t = application_templates::MonoAssetManagerAndBuiltinResourceApplication;
 	using clock_t = std::chrono::steady_clock;
 
@@ -212,7 +211,7 @@ class FFTBloomApp final : public examples::SimpleWindowedApplication, public app
 		#ifndef _NBL_DEBUG
 		ISPIRVOptimizer::E_OPTIMIZER_PASS optPasses = ISPIRVOptimizer::EOP_STRIP_DEBUG_INFO;
 		auto opt = make_smart_refctd_ptr<ISPIRVOptimizer>(std::span<ISPIRVOptimizer::E_OPTIMIZER_PASS>(&optPasses, 1));
-		return m_device->createShader({ HLSLShader.get(), opt.get(), m_readCache.get(), m_writeCache.get()});
+		return m_device->compileShader({ HLSLShader.get(), opt.get(), m_readCache.get(), m_writeCache.get()});
 		#else 
 		return m_device->compileShader({ HLSLShader.get(), nullptr, m_readCache.get(), m_writeCache.get() });
 		#endif
diff --git a/30_ComputeShaderPathTracer/include/nbl/this_example/common.hpp b/30_ComputeShaderPathTracer/include/nbl/this_example/common.hpp
index ff3dd8095..3745ca512 100644
--- a/30_ComputeShaderPathTracer/include/nbl/this_example/common.hpp
+++ b/30_ComputeShaderPathTracer/include/nbl/this_example/common.hpp
@@ -1,17 +1,11 @@
-#ifndef __NBL_THIS_EXAMPLE_COMMON_H_INCLUDED__
-#define __NBL_THIS_EXAMPLE_COMMON_H_INCLUDED__
+#ifndef _NBL_THIS_EXAMPLE_COMMON_H_INCLUDED_
+#define _NBL_THIS_EXAMPLE_COMMON_H_INCLUDED_
 
-#include <nabla.h>
-
-// common api
-#include "CCamera.hpp"
-#include "SimpleWindowedApplication.hpp"
-#include "nbl/application_templates/MonoAssetManagerAndBuiltinResourceApplication.hpp"
-#include "CEventCallback.hpp"
+#include "nbl/examples/examples.hpp"
 
 // example's own headers
-#include "nbl/ui/ICursorControl.h"
+#include "nbl/ui/ICursorControl.h" // TODO: why not in nabla.h ?
 #include "nbl/ext/ImGui/ImGui.h"
 #include "imgui/imgui_internal.h"
 
-#endif // __NBL_THIS_EXAMPLE_COMMON_H_INCLUDED__
\ No newline at end of file
+#endif // _NBL_THIS_EXAMPLE_COMMON_H_INCLUDED_
\ No newline at end of file
diff --git a/30_ComputeShaderPathTracer/main.cpp b/30_ComputeShaderPathTracer/main.cpp
index 62602c7f9..487388ea0 100644
--- a/30_ComputeShaderPathTracer/main.cpp
+++ b/30_ComputeShaderPathTracer/main.cpp
@@ -1,12 +1,13 @@
 // Copyright (C) 2018-2020 - DevSH Graphics Programming Sp. z O.O.
 // This file is part of the "Nabla Engine".
 // For conditions of distribution and use, see copyright notice in nabla.h
+#include "nbl/ext/FullScreenTriangle/FullScreenTriangle.h"
 
 #include "nbl/this_example/common.hpp"
-#include "nbl/asset/interchange/IImageAssetHandlerBase.h"
-#include "nbl/ext/FullScreenTriangle/FullScreenTriangle.h"
+
 #include "nbl/builtin/hlsl/surface_transform.h"
 
+
 using namespace nbl;
 using namespace core;
 using namespace hlsl;
@@ -14,6 +15,7 @@ using namespace system;
 using namespace asset;
 using namespace ui;
 using namespace video;
+using namespace nbl::examples;
 
 // TODO: share push constants
 struct PTPushConstant {
@@ -24,9 +26,9 @@ struct PTPushConstant {
 
 // TODO: Add a QueryPool for timestamping once its ready (actually add IMGUI mspf plotter)
 // TODO: Do buffer creation using assConv
-class ComputeShaderPathtracer final : public examples::SimpleWindowedApplication, public application_templates::MonoAssetManagerAndBuiltinResourceApplication
+class ComputeShaderPathtracer final : public SimpleWindowedApplication, public application_templates::MonoAssetManagerAndBuiltinResourceApplication
 {
-		using device_base_t = examples::SimpleWindowedApplication;
+		using device_base_t = SimpleWindowedApplication;
 		using asset_base_t = application_templates::MonoAssetManagerAndBuiltinResourceApplication;
 		using clock_t = std::chrono::steady_clock;
 
diff --git a/70_FLIPFluids/main.cpp b/70_FLIPFluids/main.cpp
index c0f68ca49..d66b56811 100644
--- a/70_FLIPFluids/main.cpp
+++ b/70_FLIPFluids/main.cpp
@@ -1,30 +1,24 @@
-#include <nabla.h>
-
-#include "nbl/application_templates/MonoAssetManagerAndBuiltinResourceApplication.hpp"
-#include "SimpleWindowedApplication.hpp"
-#include "InputSystem.hpp"
-#include "CCamera.hpp"
-
-#include "glm/glm/glm.hpp"
-#include <nbl/builtin/hlsl/cpp_compat.hlsl>
-#include <nbl/builtin/hlsl/cpp_compat/matrix.hlsl>
-
+// Copyright (C) 2024-2025 - DevSH Graphics Programming Sp. z O.O.
+// This file is part of the "Nabla Engine".
+// For conditions of distribution and use, see copyright notice in nabla.h
+#include "nbl/examples/examples.hpp"
+// TODO: why is it not in nabla.h ?
 #include "nbl/asset/metadata/CHLSLMetadata.h"
 
+using namespace nbl::core;
 using namespace nbl::hlsl;
-using namespace nbl;
-using namespace core;
-using namespace hlsl;
-using namespace system;
-using namespace asset;
-using namespace ui;
-using namespace video;
+using namespace nbl::system;
+using namespace nbl::asset;
+using namespace nbl::ui;
+using namespace nbl::video;
+using namespace nbl::examples;
 
 #include "app_resources/common.hlsl"
 #include "app_resources/gridUtils.hlsl"
 #include "app_resources/render_common.hlsl"
 #include "app_resources/descriptor_bindings.hlsl"
 
+
 enum SimPresets
 {
     CENTER_DROP,
@@ -167,9 +161,9 @@ class CEventCallback : public ISimpleManagedSurface::ICallback
     nbl::system::logger_opt_smart_ptr m_logger = nullptr;
 };
 
-class FLIPFluidsApp final : public examples::SimpleWindowedApplication, public application_templates::MonoAssetManagerAndBuiltinResourceApplication
+class FLIPFluidsApp final : public SimpleWindowedApplication, public application_templates::MonoAssetManagerAndBuiltinResourceApplication
 {
-    using device_base_t = examples::SimpleWindowedApplication;
+    using device_base_t = SimpleWindowedApplication;
     using asset_base_t = application_templates::MonoAssetManagerAndBuiltinResourceApplication;
     using clock_t = std::chrono::steady_clock;
 
diff --git a/common/include/nbl/examples/examples.hpp b/common/include/nbl/examples/examples.hpp
new file mode 100644
index 000000000..a7d8f92e4
--- /dev/null
+++ b/common/include/nbl/examples/examples.hpp
@@ -0,0 +1,11 @@
+// Copyright (C) 2018-2025 - DevSH Graphics Programming Sp. z O.O.
+// This file is part of the "Nabla Engine".
+// For conditions of distribution and use, see copyright notice in nabla.h
+#ifndef _NBL_EXAMPLES_HPP_
+#define _NBL_EXAMPLES_HPP_
+
+
+#include "nbl/examples/PCH.hpp"
+
+
+#endif // _NBL_EXAMPLES_HPP_
\ No newline at end of file

From 2dc268211efc70e8319114d32ba5749e9fcd9a4e Mon Sep 17 00:00:00 2001
From: devsh <devsh@devsh.eu>
Date: Thu, 19 Jun 2025 01:21:02 +0200
Subject: [PATCH 395/529] prep example 71 for @kevinyu

Use `ICPUPolygonGeometry::exportForBLAS` to make the Triangle Geometries
https://github.com/Devsh-Graphics-Programming/Nabla/blob/1f52d2f48ff077cb430cc78285fb12dd7e093f74/include/nbl/asset/IPolygonGeometry.h#L209
---
 71_RayTracingPipeline/include/common.hpp | 99 +++---------------------
 71_RayTracingPipeline/main.cpp           | 20 +++--
 2 files changed, 24 insertions(+), 95 deletions(-)

diff --git a/71_RayTracingPipeline/include/common.hpp b/71_RayTracingPipeline/include/common.hpp
index 3b66fd3e9..c60e0c3e5 100644
--- a/71_RayTracingPipeline/include/common.hpp
+++ b/71_RayTracingPipeline/include/common.hpp
@@ -1,97 +1,22 @@
-#ifndef __NBL_THIS_EXAMPLE_COMMON_H_INCLUDED__
-#define __NBL_THIS_EXAMPLE_COMMON_H_INCLUDED__
+#ifndef _NBL_THIS_EXAMPLE_COMMON_H_INCLUDED_
+#define _NBL_THIS_EXAMPLE_COMMON_H_INCLUDED_
 
-#include <nabla.h>
-#include "nbl/asset/utils/CGeometryCreator.h"
-#include "nbl/application_templates/MonoAssetManagerAndBuiltinResourceApplication.hpp"
+#include "nbl/examples/examples.hpp"
 
-#include "SimpleWindowedApplication.hpp"
-
-#include "InputSystem.hpp"
-#include "CEventCallback.hpp"
-
-#include "CCamera.hpp"
-
-#include <nbl/builtin/hlsl/cpp_compat.hlsl>
-#include <nbl/builtin/hlsl/cpp_compat/matrix.hlsl>
-#include <nbl/asset/IRayTracingPipeline.h>
+using namespace nbl;
+using namespace nbl::core;
+using namespace nbl::hlsl;
+using namespace nbl::system;
+using namespace nbl::asset;
+using namespace nbl::ui;
+using namespace nbl::video;
+using namespace nbl::application_templates;
+using namespace nbl::examples;
 
 #include "nbl/ui/ICursorControl.h"
 #include "nbl/ext/ImGui/ImGui.h"
 #include "imgui/imgui_internal.h"
 
-using namespace nbl;
-using namespace core;
-using namespace hlsl;
-using namespace system;
-using namespace asset;
-using namespace ui;
-using namespace video;
-using namespace scene;
-
 #include "app_resources/common.hlsl"
 
-namespace nbl::scene
-{
-
-enum ObjectType : uint8_t
-{
-	OT_CUBE,
-	OT_SPHERE,
-	OT_CYLINDER,
-	OT_RECTANGLE,
-	OT_DISK,
-	OT_ARROW,
-	OT_CONE,
-	OT_ICOSPHERE,
-
-	OT_COUNT,
-	OT_UNKNOWN = std::numeric_limits<uint8_t>::max()
-};
-
-static constexpr uint32_t s_smoothNormals[OT_COUNT] = { 0, 1, 1, 0, 0, 1, 1, 1 };
-
-struct ObjectMeta
-{
-	ObjectType type = OT_UNKNOWN;
-	std::string_view name = "Unknown";
-};
-
-struct ObjectDrawHookCpu
-{
-	nbl::core::matrix3x4SIMD model;
-	nbl::asset::SBasicViewParameters viewParameters;
-	ObjectMeta meta;
-};
-
-struct ReferenceObjectCpu
-{
-	ObjectMeta meta;
-	nbl::asset::CGeometryCreator::return_type data;
-	Material material;
-  core::matrix3x4SIMD transform;
-};
-
-struct ReferenceObjectGpu
-{
-	struct Bindings
-	{
-		nbl::asset::SBufferBinding<IGPUBuffer> vertex, index;
-	};
-
-	ObjectMeta meta;
-	Bindings bindings;
-	uint32_t vertexStride;
-	nbl::asset::E_INDEX_TYPE indexType = nbl::asset::E_INDEX_TYPE::EIT_UNKNOWN;
-	uint32_t indexCount = {};
-	MaterialPacked material;
-  core::matrix3x4SIMD transform;
-
-	const bool useIndex() const
-	{
-		return bindings.index.buffer && (indexType != E_INDEX_TYPE::EIT_UNKNOWN);
-	}
-};
-}
-
 #endif // __NBL_THIS_EXAMPLE_COMMON_H_INCLUDED__
diff --git a/71_RayTracingPipeline/main.cpp b/71_RayTracingPipeline/main.cpp
index 519ff8473..453e9cf69 100644
--- a/71_RayTracingPipeline/main.cpp
+++ b/71_RayTracingPipeline/main.cpp
@@ -1,15 +1,15 @@
 // Copyright (C) 2018-2024 - DevSH Graphics Programming Sp. z O.O.
 // This file is part of the "Nabla Engine".
 // For conditions of distribution and use, see copyright notice in nabla.h
-
 #include "common.hpp"
+
 #include "nbl/ext/FullScreenTriangle/FullScreenTriangle.h"
 #include "nbl/builtin/hlsl/indirect_commands.hlsl"
 
 
-class RaytracingPipelineApp final : public examples::SimpleWindowedApplication, public application_templates::MonoAssetManagerAndBuiltinResourceApplication
+class RaytracingPipelineApp final : public SimpleWindowedApplication, public application_templates::MonoAssetManagerAndBuiltinResourceApplication
 {
-	using device_base_t = examples::SimpleWindowedApplication;
+	using device_base_t = SimpleWindowedApplication;
 	using asset_base_t = application_templates::MonoAssetManagerAndBuiltinResourceApplication;
 	using clock_t = std::chrono::steady_clock;
 
@@ -375,12 +375,11 @@ class RaytracingPipelineApp final : public examples::SimpleWindowedApplication,
 		}
 
 		auto assetManager = make_smart_refctd_ptr<nbl::asset::IAssetManager>(smart_refctd_ptr(system));
-		auto* geometryCreator = assetManager->getGeometryCreator();
 
 		if (!createIndirectBuffer())
 			return logFail("Could not create indirect buffer");
 
-		if (!createAccelerationStructuresFromGeometry(geometryCreator))
+		if (!createAccelerationStructuresFromGeometry())
 			return logFail("Could not create acceleration structures from geometry creator");
 
 		ISampler::SParams samplerParams = {
@@ -1082,7 +1081,7 @@ class RaytracingPipelineApp final : public examples::SimpleWindowedApplication,
 		return true;
 	}
 
-	bool createAccelerationStructuresFromGeometry(const IGeometryCreator* gc)
+	bool createAccelerationStructuresFromGeometry()
 	{
 		auto queue = getGraphicsQueue();
 		// get geometries into ICPUBuffers
@@ -1109,6 +1108,10 @@ class RaytracingPipelineApp final : public examples::SimpleWindowedApplication,
 		planeTransform.setRotation(quaternion::fromAngleAxis(core::radians(-90.0f), vector3df_SIMD{ 1, 0, 0 }));
 
 		// triangles geometries
+		auto geometryCreator = make_smart_refctd_ptr<CGeometryCreator>();
+#if 1
+		return false;
+#else
 		const auto cpuObjects = std::array{
 			ReferenceObjectCpu {
 				.meta = {.type = OT_RECTANGLE, .name = "Plane Mesh"},
@@ -1513,7 +1516,7 @@ class RaytracingPipelineApp final : public examples::SimpleWindowedApplication,
 			params.size = geomInfoBuffer->getSize();
 			m_utils->createFilledDeviceLocalBufferOnDedMem(SIntendedSubmitInfo{ .queue = queue }, std::move(params), geomInfos).move_into(m_triangleGeomInfoBuffer);
 		}
-
+#endif
 		return true;
 	}
 
@@ -1567,7 +1570,8 @@ class RaytracingPipelineApp final : public examples::SimpleWindowedApplication,
 	} m_ui;
 	core::smart_refctd_ptr<IDescriptorPool> m_guiDescriptorSetPool;
 
-	core::vector<ReferenceObjectGpu> m_gpuTriangleGeometries;
+	// TODO: how much of this do we actually have to keep ?
+//	core::vector<ReferenceObjectGpu> m_gpuTriangleGeometries;
 	core::vector<SProceduralGeomInfo> m_gpuIntersectionSpheres;
 	uint32_t m_intersectionHitGroupIdx;
 

From be46ec3d8ccb4dfca3768be03089f01912804c96 Mon Sep 17 00:00:00 2001
From: devsh <devsh@devsh.eu>
Date: Thu, 19 Jun 2025 01:28:21 +0200
Subject: [PATCH 396/529] prep example 67 for further work

---
 67_RayQueryGeometry/include/common.hpp | 101 +++----------------------
 67_RayQueryGeometry/main.cpp           |  34 +++------
 2 files changed, 23 insertions(+), 112 deletions(-)

diff --git a/67_RayQueryGeometry/include/common.hpp b/67_RayQueryGeometry/include/common.hpp
index 0595c7203..bcf896f55 100644
--- a/67_RayQueryGeometry/include/common.hpp
+++ b/67_RayQueryGeometry/include/common.hpp
@@ -1,95 +1,18 @@
-#ifndef __NBL_THIS_EXAMPLE_COMMON_H_INCLUDED__
-#define __NBL_THIS_EXAMPLE_COMMON_H_INCLUDED__
+#ifndef _NBL_THIS_EXAMPLE_COMMON_H_INCLUDED_
+#define _NBL_THIS_EXAMPLE_COMMON_H_INCLUDED_
 
-#include <nabla.h>
-#include "nbl/asset/utils/CGeometryCreator.h"
-#include "nbl/application_templates/MonoAssetManagerAndBuiltinResourceApplication.hpp"
-
-#include "SimpleWindowedApplication.hpp"
-
-#include "InputSystem.hpp"
-#include "CEventCallback.hpp"
-
-#include "CCamera.hpp"
-
-#include <nbl/builtin/hlsl/cpp_compat.hlsl>
-#include <nbl/builtin/hlsl/cpp_compat/matrix.hlsl>
+#include "nbl/examples/examples.hpp"
 
 using namespace nbl;
-using namespace core;
-using namespace hlsl;
-using namespace system;
-using namespace asset;
-using namespace ui;
-using namespace video;
-using namespace scene;
+using namespace nbl::core;
+using namespace nbl::hlsl;
+using namespace nbl::system;
+using namespace nbl::asset;
+using namespace nbl::ui;
+using namespace nbl::video;
+using namespace nbl::application_templates;
+using namespace nbl::examples;
 
 #include "app_resources/common.hlsl"
 
-namespace nbl::scene
-{
-enum ObjectType : uint8_t
-{
-	OT_CUBE,
-	OT_SPHERE,
-	OT_CYLINDER,
-	OT_RECTANGLE,
-	OT_DISK,
-	OT_ARROW,
-	OT_CONE,
-	OT_ICOSPHERE,
-
-	OT_COUNT,
-	OT_UNKNOWN = std::numeric_limits<uint8_t>::max()
-};
-
-struct ObjectMeta
-{
-	ObjectType type = OT_UNKNOWN;
-	std::string_view name = "Unknown";
-};
-
-struct ObjectDrawHookCpu
-{
-	nbl::core::matrix3x4SIMD model;
-	nbl::asset::SBasicViewParameters viewParameters;
-	ObjectMeta meta;
-};
-
-enum GeometryShader
-{
-	GP_BASIC = 0,
-	GP_CONE,
-	GP_ICO,
-
-	GP_COUNT
-};
-
-struct ReferenceObjectCpu
-{
-	ObjectMeta meta;
-	GeometryShader shadersType;
-	nbl::asset::CGeometryCreator::return_type data;
-};
-
-struct ReferenceObjectGpu
-{
-	struct Bindings
-	{
-		nbl::asset::SBufferBinding<IGPUBuffer> vertex, index;
-	};
-
-	ObjectMeta meta;
-	Bindings bindings;
-	uint32_t vertexStride;
-	nbl::asset::E_INDEX_TYPE indexType = nbl::asset::E_INDEX_TYPE::EIT_UNKNOWN;
-	uint32_t indexCount = {};
-
-	const bool useIndex() const
-	{
-		return bindings.index.buffer && (indexType != E_INDEX_TYPE::EIT_UNKNOWN);
-	}
-};
-}
-
-#endif // __NBL_THIS_EXAMPLE_COMMON_H_INCLUDED__
\ No newline at end of file
+#endif // _NBL_THIS_EXAMPLE_COMMON_H_INCLUDED_
\ No newline at end of file
diff --git a/67_RayQueryGeometry/main.cpp b/67_RayQueryGeometry/main.cpp
index fdee5c5a1..495f3a3e2 100644
--- a/67_RayQueryGeometry/main.cpp
+++ b/67_RayQueryGeometry/main.cpp
@@ -3,10 +3,10 @@
 // For conditions of distribution and use, see copyright notice in nabla.h
 #include "common.hpp"
 
-class RayQueryGeometryApp final : public examples::SimpleWindowedApplication, public application_templates::MonoAssetManagerAndBuiltinResourceApplication
+class RayQueryGeometryApp final : public SimpleWindowedApplication, public MonoAssetManagerAndBuiltinResourceApplication
 {
-		using device_base_t = examples::SimpleWindowedApplication;
-		using asset_base_t = application_templates::MonoAssetManagerAndBuiltinResourceApplication;
+		using device_base_t = SimpleWindowedApplication;
+		using asset_base_t = MonoAssetManagerAndBuiltinResourceApplication;
 		using clock_t = std::chrono::steady_clock;
 
 		constexpr static inline uint32_t WIN_W = 1280, WIN_H = 720;
@@ -121,7 +121,6 @@ class RayQueryGeometryApp final : public examples::SimpleWindowedApplication, pu
 				return logFail("Could not create HDR Image");
 
 			auto assetManager = make_smart_refctd_ptr<nbl::asset::IAssetManager>(smart_refctd_ptr(system));
-			auto* geometryCreator = assetManager->getGeometryCreator();
 
 			auto cQueue = getComputeQueue();
 
@@ -138,9 +137,9 @@ class RayQueryGeometryApp final : public examples::SimpleWindowedApplication, pu
 					std::this_thread::yield();
 			}
 			// Nsight is special and can't capture anything not on the queue that performs the swapchain acquire/release
-			createAccelerationStructureDS(gQueue,geometryCreator);
+			createAccelerationStructureDS(gQueue);
 #else
-			createAccelerationStructureDS(cQueue,geometryCreator);
+			createAccelerationStructureDS(cQueue);
 #endif
 			if (!renderDs)
 				return logFail("Could not create acceleration structures and descriptor set");
@@ -258,11 +257,9 @@ class RayQueryGeometryApp final : public examples::SimpleWindowedApplication, pu
 			cmdbuf->beginDebugMarker("RayQueryGeometryApp Frame");
 			{
 				camera.beginInputProcessing(nextPresentationTimestamp);
-				mouse.consumeEvents([&](const IMouseEventChannel::range_t& events) -> void { camera.mouseProcess(events); mouseProcess(events); }, m_logger.get());
+				mouse.consumeEvents([&](const IMouseEventChannel::range_t& events) -> void { camera.mouseProcess(events); }, m_logger.get());
 				keyboard.consumeEvents([&](const IKeyboardEventChannel::range_t& events) -> void { camera.keyboardProcess(events); }, m_logger.get());
 				camera.endInputProcessing(nextPresentationTimestamp);
-
-				const auto type = static_cast<ObjectType>(gcIndex);
 			}
 
 			const auto viewMatrix = camera.getViewMatrix();
@@ -487,9 +484,12 @@ class RayQueryGeometryApp final : public examples::SimpleWindowedApplication, pu
 			return (dim + size - 1) / size;
 		}
 
-		smart_refctd_ptr<IGPUDescriptorSet> createAccelerationStructureDS(video::CThreadSafeQueueAdapter* queue, const IGeometryCreator* gc)
+		smart_refctd_ptr<IGPUDescriptorSet> createAccelerationStructureDS(video::CThreadSafeQueueAdapter* queue)
 		{
 			// get geometries in ICPUBuffers
+#if 1
+			return nullptr;
+#else
 			std::array<ReferenceObjectCpu, OT_COUNT> objectsCpu;
 			objectsCpu[OT_CUBE] = ReferenceObjectCpu{ .meta = {.type = OT_CUBE, .name = "Cube Mesh" }, .shadersType = GP_BASIC, .data = gc->createCubeMesh(nbl::core::vector3df(1.f, 1.f, 1.f)) };
 			objectsCpu[OT_SPHERE] = ReferenceObjectCpu{ .meta = {.type = OT_SPHERE, .name = "Sphere Mesh" }, .shadersType = GP_BASIC, .data = gc->createSphereMesh(2, 16, 16) };
@@ -892,6 +892,7 @@ class RayQueryGeometryApp final : public examples::SimpleWindowedApplication, pu
 			m_api->endCapture();
 
 			return reservation.getGPUObjects<ICPUDescriptorSet>().front().value;
+#endif
 		}
 
 
@@ -915,19 +916,6 @@ class RayQueryGeometryApp final : public examples::SimpleWindowedApplication, pu
 
 		smart_refctd_ptr<IGPUComputePipeline> renderPipeline;
 		smart_refctd_ptr<IGPUDescriptorSet> renderDs;
-
-		uint16_t gcIndex = {};
-
-		void mouseProcess(const nbl::ui::IMouseEventChannel::range_t& events)
-		{
-			for (auto eventIt = events.begin(); eventIt != events.end(); eventIt++)
-			{
-				auto ev = *eventIt;
-
-				if (ev.type == nbl::ui::SMouseEvent::EET_SCROLL)
-					gcIndex = std::clamp<uint16_t>(int16_t(gcIndex) + int16_t(core::sign(ev.scrollEvent.verticalScroll)), int64_t(0), int64_t(OT_COUNT - (uint8_t)1u));
-			}
-		}
 };
 
 NBL_MAIN_FUNC(RayQueryGeometryApp)
\ No newline at end of file

From 9aa748347856f53e4847d0d17b4be521481e7857 Mon Sep 17 00:00:00 2001
From: devsh <devsh@devsh.eu>
Date: Thu, 19 Jun 2025 01:33:27 +0200
Subject: [PATCH 397/529] @AnastaZIuk I need your help for getting the mounting
 of `common/include/nbl/examples` under `nbl/examples` either as dir or
 builtin archive

---
 .../app_resources/testWorkgroup.comp.hlsl        |  3 ++-
 .../app_resources/benchmarkWorkgroup.comp.hlsl   |  3 ++-
 29_Arithmetic2Bench/main.cpp                     | 16 ++++++++--------
 .../examples/workgroup/DataAccessors.hlsl}       | 13 ++++++++++---
 4 files changed, 22 insertions(+), 13 deletions(-)
 rename common/include/{WorkgroupDataAccessors.hlsl => nbl/examples/workgroup/DataAccessors.hlsl} (96%)

diff --git a/23_Arithmetic2UnitTest/app_resources/testWorkgroup.comp.hlsl b/23_Arithmetic2UnitTest/app_resources/testWorkgroup.comp.hlsl
index 2a32ed20e..a3e70b8ff 100644
--- a/23_Arithmetic2UnitTest/app_resources/testWorkgroup.comp.hlsl
+++ b/23_Arithmetic2UnitTest/app_resources/testWorkgroup.comp.hlsl
@@ -14,7 +14,8 @@ typedef vector<uint32_t, config_t::ItemsPerInvocation_0> type_t;
 // final (level 1/2) scan needs to fit in one subgroup exactly
 groupshared uint32_t scratch[mpl::max_v<int16_t,config_t::SharedScratchElementCount,1>];
 
-#include "../../common/include/WorkgroupDataAccessors.hlsl"
+#include "nbl/examples/workgroup/DataAccessors.hlsl"
+using namespace nbl::hlsl::examples::workgroup;
 
 static ScratchProxy arithmeticAccessor;
 
diff --git a/29_Arithmetic2Bench/app_resources/benchmarkWorkgroup.comp.hlsl b/29_Arithmetic2Bench/app_resources/benchmarkWorkgroup.comp.hlsl
index a56945467..58912691f 100644
--- a/29_Arithmetic2Bench/app_resources/benchmarkWorkgroup.comp.hlsl
+++ b/29_Arithmetic2Bench/app_resources/benchmarkWorkgroup.comp.hlsl
@@ -15,7 +15,8 @@ typedef vector<uint32_t, config_t::ItemsPerInvocation_0> type_t;
 // final (level 1/2) scan needs to fit in one subgroup exactly
 groupshared uint32_t scratch[mpl::max_v<int16_t,config_t::SharedScratchElementCount,1>];
 
-#include "../../common/include/WorkgroupDataAccessors.hlsl"
+#include "nbl/examples/workgroup/DataAccessors.hlsl"
+using namespace nbl::hlsl::examples::workgroup;
 
 template<uint16_t WorkgroupSizeLog2, uint16_t VirtualWorkgroupSize, uint16_t ItemsPerInvocation>
 struct RandomizedInputDataProxy
diff --git a/29_Arithmetic2Bench/main.cpp b/29_Arithmetic2Bench/main.cpp
index 61e94607b..0a0e3b35f 100644
--- a/29_Arithmetic2Bench/main.cpp
+++ b/29_Arithmetic2Bench/main.cpp
@@ -1,16 +1,16 @@
-#include "SimpleWindowedApplication.hpp"
-#include "CEventCallback.hpp"
-#include "nbl/application_templates/MonoAssetManagerAndBuiltinResourceApplication.hpp"
+#include "nbl/examples/examples.hpp"
 #include "app_resources/common.hlsl"
 #include "nbl/builtin/hlsl/workgroup2/arithmetic_config.hlsl"
 #include "nbl/builtin/hlsl/subgroup2/arithmetic_params.hlsl"
 
 using namespace nbl;
-using namespace core;
-using namespace system;
-using namespace asset;
-using namespace ui;
-using namespace video;
+using namespace nbl::core;
+using namespace nbl::system;
+using namespace nbl::asset;
+using namespace nbl::ui;
+using namespace nbl::video;
+using namespace nbl::examples;
+
 
 template<typename SwapchainResources> requires std::is_base_of_v<ISimpleManagedSurface::ISwapchainResources, SwapchainResources>
 class CExplicitSurfaceFormatResizeSurface final : public ISimpleManagedSurface
diff --git a/common/include/WorkgroupDataAccessors.hlsl b/common/include/nbl/examples/workgroup/DataAccessors.hlsl
similarity index 96%
rename from common/include/WorkgroupDataAccessors.hlsl
rename to common/include/nbl/examples/workgroup/DataAccessors.hlsl
index 7287a4135..f94121ec0 100644
--- a/common/include/WorkgroupDataAccessors.hlsl
+++ b/common/include/nbl/examples/workgroup/DataAccessors.hlsl
@@ -1,12 +1,18 @@
-#ifndef _WORKGROUP_DATA_ACCESSORS_HLSL_
-#define _WORKGROUP_DATA_ACCESSORS_HLSL_
+#ifndef _NBL_EXAMPLES_WORKGROUP_DATA_ACCESSORS_HLSL_
+#define _NBL_EXAMPLES_WORKGROUP_DATA_ACCESSORS_HLSL_
+
 
 #include "nbl/builtin/hlsl/bda/legacy_bda_accessor.hlsl"
 
+
 namespace nbl
 {
 namespace hlsl
 {
+namespace examples
+{
+namespace workgroup
+{
 
 struct ScratchProxy
 {
@@ -120,5 +126,6 @@ struct PreloadedDataProxy
 
 }
 }
-
+}
+}
 #endif

From 846d2fda842a15014d38047f457ed96362927eed Mon Sep 17 00:00:00 2001
From: devsh <devsh@devsh.eu>
Date: Thu, 19 Jun 2025 01:34:31 +0200
Subject: [PATCH 398/529] prep UI example for further work

---
 61_UI/include/common.hpp | 28 +++++++++++-----------------
 61_UI/main.cpp           | 10 +++-------
 2 files changed, 14 insertions(+), 24 deletions(-)

diff --git a/61_UI/include/common.hpp b/61_UI/include/common.hpp
index a5def7551..fe7d086dd 100644
--- a/61_UI/include/common.hpp
+++ b/61_UI/include/common.hpp
@@ -1,25 +1,19 @@
-#ifndef __NBL_THIS_EXAMPLE_COMMON_H_INCLUDED__
-#define __NBL_THIS_EXAMPLE_COMMON_H_INCLUDED__
+#ifndef _NBL_THIS_EXAMPLE_COMMON_H_INCLUDED_
+#define _NBL_THIS_EXAMPLE_COMMON_H_INCLUDED_
 
-#include <nabla.h>
 
-// common api
-#include "CCamera.hpp"
-#include "SimpleWindowedApplication.hpp"
-#include "CEventCallback.hpp"
+#include "nbl/examples/examples.hpp"
 
 // the example's headers
 #include "transform.hpp"
-#include "CGeomtryCreatorScene.hpp"
 
 using namespace nbl;
-using namespace core;
-using namespace hlsl;
-using namespace system;
-using namespace asset;
-using namespace ui;
-using namespace video;
-using namespace scene;
-using namespace geometrycreator;
+using namespace nbl::core;
+using namespace nbl::hlsl;
+using namespace nbl::system;
+using namespace nbl::asset;
+using namespace nbl::ui;
+using namespace nbl::video;
+using namespace nbl::examples;
 
-#endif // __NBL_THIS_EXAMPLE_COMMON_H_INCLUDED__
\ No newline at end of file
+#endif // _NBL_THIS_EXAMPLE_COMMON_H_INCLUDED_
\ No newline at end of file
diff --git a/61_UI/main.cpp b/61_UI/main.cpp
index 470d5e723..17d028f29 100644
--- a/61_UI/main.cpp
+++ b/61_UI/main.cpp
@@ -14,14 +14,11 @@
 	handle scene's object translations.
 */
 
-class UISampleApp final : public examples::SimpleWindowedApplication
+class UISampleApp final : public SimpleWindowedApplication
 {
-	using device_base_t = examples::SimpleWindowedApplication;
-	using clock_t = std::chrono::steady_clock;
+		using device_base_t = SimpleWindowedApplication;
 
-	_NBL_STATIC_INLINE_CONSTEXPR uint32_t WIN_W = 1280, WIN_H = 720;
-
-	constexpr static inline clock_t::duration DisplayImageDuration = std::chrono::milliseconds(900);
+		_NBL_STATIC_INLINE_CONSTEXPR uint32_t WIN_W = 1280, WIN_H = 720;
 
 	public:
 		inline UISampleApp(const path& _localInputCWD, const path& _localOutputCWD, const path& _sharedInputCWD, const path& _sharedOutputCWD) 
@@ -63,7 +60,6 @@ class UISampleApp final : public examples::SimpleWindowedApplication
 				return false;
 
 			m_assetManager = make_smart_refctd_ptr<nbl::asset::IAssetManager>(smart_refctd_ptr(m_system));
-			auto* geometry = m_assetManager->getGeometryCreator();
 
 			m_semaphore = m_device->createSemaphore(m_realFrameIx);
 			if (!m_semaphore)

From a3475622a9bcea1751dd50c8d652136cfc38faa7 Mon Sep 17 00:00:00 2001
From: devsh <devsh@devsh.eu>
Date: Thu, 19 Jun 2025 02:19:47 +0200
Subject: [PATCH 399/529] note down embedding TODOs for arek

---
 09_GeometryCreator/main.cpp                   |  6 +++
 .../geometry/CSimpleDebugRenderer.hpp         | 15 +++---
 .../nbl/examples/geometry/SPushConstants.hlsl |  5 ++
 .../examples/geometry/shaders/unified.hlsl    | 52 +++++++++++++++++++
 4 files changed, 70 insertions(+), 8 deletions(-)
 create mode 100644 common/src/nbl/examples/geometry/shaders/unified.hlsl

diff --git a/09_GeometryCreator/main.cpp b/09_GeometryCreator/main.cpp
index 4c982e8f8..a78c385ee 100644
--- a/09_GeometryCreator/main.cpp
+++ b/09_GeometryCreator/main.cpp
@@ -4,6 +4,8 @@
 
 #include "common.hpp"
 
+// TODO: Arek, we should have a `nbl::examples` class inheriting from `application_templates::MonoAssetManagerAndBuiltinResourceApplication` which
+// during `onAppInitialized` also mounts correct `common/include/nbl/examples` and `common/src/nbl/examples` as folder or builtin
 class GeometryCreatorApp final : public MonoWindowApplication, public application_templates::MonoAssetManagerAndBuiltinResourceApplication
 {
 	using device_base_t = MonoWindowApplication;
@@ -55,6 +57,10 @@ class GeometryCreatorApp final : public MonoWindowApplication, public applicatio
 					.addtionalBufferOwnershipFamilies = addtionalBufferOwnershipFamilies
 				},patch
 			);
+
+			// TODO: this is plain wrong Arek
+			auto commonArchive = make_smart_refctd_ptr<system::CMountDirectoryArchive>(localInputCWD/"app_resources",smart_refctd_ptr(m_logger),m_system.get());
+			m_system->mount(make_smart_refctd_ptr<system::CMountDirectoryArchive>(localInputCWD/"../common/src/nbl/examples",smart_refctd_ptr(m_logger),m_system.get()),"nbl/examples");
 			
 			auto scRes = static_cast<CDefaultSwapchainFramebuffers*>(m_surface->getSwapchainResources());
 			m_renderer = CSimpleDebugRenderer::create(m_assetMgr.get(),scRes->getRenderpass(),0,m_scene.get());
diff --git a/common/include/nbl/examples/geometry/CSimpleDebugRenderer.hpp b/common/include/nbl/examples/geometry/CSimpleDebugRenderer.hpp
index bd190c082..7db627050 100644
--- a/common/include/nbl/examples/geometry/CSimpleDebugRenderer.hpp
+++ b/common/include/nbl/examples/geometry/CSimpleDebugRenderer.hpp
@@ -21,8 +21,6 @@ class CSimpleDebugRenderer final : public core::IReferenceCounted
 			using namespace nbl::asset; \
 			using namespace nbl::video
 	public:
-		//
-		constexpr static inline auto DescriptorCount = 255;
 		//
 		struct SViewParams
 		{
@@ -85,7 +83,7 @@ class CSimpleDebugRenderer final : public core::IReferenceCounted
 		{
 			EXPOSE_NABLA_NAMESPACES;
 
-			if (!!renderpass)
+			if (!renderpass)
 				return nullptr;
 			auto device = const_cast<ILogicalDevice*>(renderpass->getOriginDevice());
 			auto logger = device->getLogger();
@@ -100,9 +98,10 @@ class CSimpleDebugRenderer final : public core::IReferenceCounted
 			smart_refctd_ptr<IShader> shader;
 			{
 				const auto bundle = assMan->getAsset("nbl/examples/geometry/shaders/unified.hlsl",{});
+// TODO: Arek
 				//const auto bundle = assMan->getAsset("nbl/examples/geometry/shaders/unified.spv",{});
 				const auto contents = bundle.getContents();
-				if (bundle.getAssetType()!=IAsset::ET_SHADER || contents.empty())
+				if (contents.empty() || bundle.getAssetType()!=IAsset::ET_SHADER)
 					return nullptr;
 				shader = IAsset::castDown<IShader>(contents[0]);
 				if (!shader)
@@ -124,7 +123,7 @@ class CSimpleDebugRenderer final : public core::IReferenceCounted
 							// some geometries may not have particular attributes
 							.createFlags = IGPUDescriptorSetLayout::SBinding::E_CREATE_FLAGS::ECF_PARTIALLY_BOUND_BIT,
 							.stageFlags = IShader::E_SHADER_STAGE::ESS_VERTEX|IShader::E_SHADER_STAGE::ESS_FRAGMENT,
-							.count = DescriptorCount
+							.count = SInstance::SPushConstants::DescriptorCount
 						}
 					};
 					dsLayout = device->createDescriptorSetLayout(bindings);
@@ -164,9 +163,9 @@ class CSimpleDebugRenderer final : public core::IReferenceCounted
 			smart_refctd_ptr<IGPUGraphicsPipeline> pipelines[PipelineType::Count] = {};
 			{
 				IGPUGraphicsPipeline::SCreationParams params[PipelineType::Count] = {};
-				params[PipelineType::BasicTriangleList].vertexShader = {.shader=shader.get(),.entryPoint="BasicTriangleListVS"};
+				params[PipelineType::BasicTriangleList].vertexShader = {.shader=shader.get(),.entryPoint="BasicVS"};
 				params[PipelineType::BasicTriangleList].fragmentShader = {.shader=shader.get(),.entryPoint="BasicFS"};
-				params[PipelineType::BasicTriangleFan].vertexShader = {.shader=shader.get(),.entryPoint="BasicTriangleFanVS"};
+				params[PipelineType::BasicTriangleFan].vertexShader = {.shader=shader.get(),.entryPoint="BasicVS"};
 				params[PipelineType::BasicTriangleFan].fragmentShader = {.shader=shader.get(),.entryPoint="BasicFS"};
 				params[PipelineType::Cone].vertexShader = {.shader=shader.get(),.entryPoint="ConeVS"};
 				params[PipelineType::Cone].fragmentShader = {.shader=shader.get(),.entryPoint="ConeFS"};
@@ -206,7 +205,7 @@ class CSimpleDebugRenderer final : public core::IReferenceCounted
 				auto allocateUTB = [device,&infos](const IGeometry<const IGPUBuffer>::SDataView& view)->uint8_t
 				{
 					if (!view)
-						return DescriptorCount;
+						return SInstance::SPushConstants::DescriptorCount;
 					const auto retval = infos.size();
 					infos.emplace_back().desc = device->createBufferView(view.src, view.composed.format);
 					return retval;
diff --git a/common/include/nbl/examples/geometry/SPushConstants.hlsl b/common/include/nbl/examples/geometry/SPushConstants.hlsl
index 2048f1f3f..932210d0d 100644
--- a/common/include/nbl/examples/geometry/SPushConstants.hlsl
+++ b/common/include/nbl/examples/geometry/SPushConstants.hlsl
@@ -22,6 +22,11 @@ struct SInstanceMatrices
 
 struct SPushConstants
 {
+	// no idea if DXC still has this bug with Push Constant static variables
+#ifndef __HLSL_VERSiON
+	NBL_CONSTEXPR_STATIC_INLINE uint32_t DescriptorCount = 255;
+#endif
+
 	SInstanceMatrices matrices;
 	uint32_t positionView : 11;
 	uint32_t normalView : 10;
diff --git a/common/src/nbl/examples/geometry/shaders/unified.hlsl b/common/src/nbl/examples/geometry/shaders/unified.hlsl
new file mode 100644
index 000000000..1c24ee870
--- /dev/null
+++ b/common/src/nbl/examples/geometry/shaders/unified.hlsl
@@ -0,0 +1,52 @@
+//
+#include "nbl/examples/geometry/SPushConstants.hlsl"
+using namespace nbl::hlsl;
+using namespace nbl::hlsl::examples::geometry_creator_scene;
+
+// for dat sweet programmable pulling
+[[vk::binding(0)]] Buffer<float32_t4> utbs[/*SPushConstants::DescriptorCount*/255];
+
+//
+[[vk::push_constant]] SPushConstants pc;
+
+//
+struct SInterpolants
+{
+	float32_t4 position : SV_Position;
+	float32_t3 color : COLOR0;
+};
+#include "nbl/builtin/hlsl/math/linalg/fast_affine.hlsl"
+
+//
+SInterpolants BasicVS()
+{
+    const float32_t3 position = utbs[pc.positionView].xyz;
+
+    SInterpolants output;
+    output.position = math::linalg::promoted_mul(pc.matrices.worldViewProj,position);
+    output.color = mul(pc.matrices.normalMat,utbs[pc.normalView].xyz)*0.5+promote<float32_t3>(0.5f);
+    return output;
+}
+float32_t4 BasicFS(SInterpolants input) : SV_Target0
+{
+    return float32_t4(input.color,1.f);
+}
+
+// TODO: do smooth normals on the cone
+SInterpolants ConeVS()
+{
+    const float32_t3 position = utbs[pc.positionView].xyz;
+
+    SInterpolants output;
+    output.position = math::linalg::promoted_mul(pc.matrices.worldViewProj,position);
+    output.color = mul(inverse(transpose(pc.matrices.normalMat)),position);
+    return output;
+}
+float32_t4 ConeFS(SInterpolants input) : SV_Target0
+{
+    const float32_t2x3 dViewPos_dScreen = float32_t2x3(
+        ddx(input.color),
+        ddy(input.color)
+    );
+    return float32_t4(normalize(cross(X,Y))*0.5f+promote<float32_t3>(0.5f),1.f);
+}
\ No newline at end of file

From cf4e27959c5e58c562df3f3602a1b8a77b0d4dc7 Mon Sep 17 00:00:00 2001
From: devsh <devsh@devsh.eu>
Date: Thu, 19 Jun 2025 02:19:58 +0200
Subject: [PATCH 400/529] remove unused shaders

---
 .../geometry/shaders/gc.basic.fragment.hlsl   |  6 -----
 .../geometry/shaders/gc.basic.vertex.hlsl     |  6 -----
 .../geometry/shaders/gc.cone.vertex.hlsl      |  6 -----
 .../geometry/shaders/gc.ico.vertex.hlsl       |  6 -----
 .../geometry/shaders/grid.fragment.hlsl       | 12 ----------
 .../template/gc.basic.vertex.input.hlsl       | 12 ----------
 .../geometry/shaders/template/gc.common.hlsl  | 22 -------------------
 .../template/gc.cone.vertex.input.hlsl        | 12 ----------
 .../shaders/template/gc.ico.vertex.input.hlsl | 11 ----------
 .../geometry/shaders/template/gc.vertex.hlsl  | 15 -------------
 10 files changed, 108 deletions(-)
 delete mode 100644 common/src/nbl/examples/geometry/shaders/gc.basic.fragment.hlsl
 delete mode 100644 common/src/nbl/examples/geometry/shaders/gc.basic.vertex.hlsl
 delete mode 100644 common/src/nbl/examples/geometry/shaders/gc.cone.vertex.hlsl
 delete mode 100644 common/src/nbl/examples/geometry/shaders/gc.ico.vertex.hlsl
 delete mode 100644 common/src/nbl/examples/geometry/shaders/grid.fragment.hlsl
 delete mode 100644 common/src/nbl/examples/geometry/shaders/template/gc.basic.vertex.input.hlsl
 delete mode 100644 common/src/nbl/examples/geometry/shaders/template/gc.common.hlsl
 delete mode 100644 common/src/nbl/examples/geometry/shaders/template/gc.cone.vertex.input.hlsl
 delete mode 100644 common/src/nbl/examples/geometry/shaders/template/gc.ico.vertex.input.hlsl
 delete mode 100644 common/src/nbl/examples/geometry/shaders/template/gc.vertex.hlsl

diff --git a/common/src/nbl/examples/geometry/shaders/gc.basic.fragment.hlsl b/common/src/nbl/examples/geometry/shaders/gc.basic.fragment.hlsl
deleted file mode 100644
index 3dc9b9f1d..000000000
--- a/common/src/nbl/examples/geometry/shaders/gc.basic.fragment.hlsl
+++ /dev/null
@@ -1,6 +0,0 @@
-#include "template/gc.common.hlsl"
-
-float4 PSMain(PSInput input) : SV_Target0
-{
-    return input.color;
-}
\ No newline at end of file
diff --git a/common/src/nbl/examples/geometry/shaders/gc.basic.vertex.hlsl b/common/src/nbl/examples/geometry/shaders/gc.basic.vertex.hlsl
deleted file mode 100644
index 1afd468d9..000000000
--- a/common/src/nbl/examples/geometry/shaders/gc.basic.vertex.hlsl
+++ /dev/null
@@ -1,6 +0,0 @@
-#include "template/gc.basic.vertex.input.hlsl"
-#include "template/gc.vertex.hlsl"
-
-/*
-    do not remove this text, WAVE is so bad that you can get errors if no proper ending xD
-*/
diff --git a/common/src/nbl/examples/geometry/shaders/gc.cone.vertex.hlsl b/common/src/nbl/examples/geometry/shaders/gc.cone.vertex.hlsl
deleted file mode 100644
index ee0c42431..000000000
--- a/common/src/nbl/examples/geometry/shaders/gc.cone.vertex.hlsl
+++ /dev/null
@@ -1,6 +0,0 @@
-#include "template/gc.cone.vertex.input.hlsl"
-#include "template/gc.vertex.hlsl"
-
-/*
-    do not remove this text, WAVE is so bad that you can get errors if no proper ending xD
-*/
diff --git a/common/src/nbl/examples/geometry/shaders/gc.ico.vertex.hlsl b/common/src/nbl/examples/geometry/shaders/gc.ico.vertex.hlsl
deleted file mode 100644
index d63fdc809..000000000
--- a/common/src/nbl/examples/geometry/shaders/gc.ico.vertex.hlsl
+++ /dev/null
@@ -1,6 +0,0 @@
-#include "template/gc.ico.vertex.input.hlsl"
-#include "template/gc.vertex.hlsl"
-
-/*
-    do not remove this text, WAVE is so bad that you can get errors if no proper ending xD
-*/
diff --git a/common/src/nbl/examples/geometry/shaders/grid.fragment.hlsl b/common/src/nbl/examples/geometry/shaders/grid.fragment.hlsl
deleted file mode 100644
index 4b4c1e691..000000000
--- a/common/src/nbl/examples/geometry/shaders/grid.fragment.hlsl
+++ /dev/null
@@ -1,12 +0,0 @@
-#include "template/grid.common.hlsl"
-
-float4 PSMain(PSInput input) : SV_Target0
-{
-    float2 uv = (input.uv - float2(0.5, 0.5)) + 0.5 / 30.0;
-    float grid = gridTextureGradBox(uv, ddx(input.uv), ddy(input.uv));
-    float4 fragColor = float4(1.0 - grid, 1.0 - grid, 1.0 - grid, 1.0);
-    fragColor *= 0.25;
-    fragColor *= 0.3 + 0.6 * smoothstep(0.0, 0.1, 1.0 - length(input.uv) / 5.5);
-    
-    return fragColor;
-}
\ No newline at end of file
diff --git a/common/src/nbl/examples/geometry/shaders/template/gc.basic.vertex.input.hlsl b/common/src/nbl/examples/geometry/shaders/template/gc.basic.vertex.input.hlsl
deleted file mode 100644
index 862d4508e..000000000
--- a/common/src/nbl/examples/geometry/shaders/template/gc.basic.vertex.input.hlsl
+++ /dev/null
@@ -1,12 +0,0 @@
-#ifndef _NBL_EXAMPLES_GC_BASIC_VERTEX_INPUT_HLSL_
-#define _NBL_EXAMPLES_GC_BASIC_VERTEX_INPUT_HLSL_
-
-[[vk::binding(0)]] Buffer<float32_t3> position;
-[[vk::binding(1)]] Buffer<float32_t3> normal;
-[[vk::binding(2)]] Buffer<float32_t2> uv;
-[[vk::binding(3)]] Buffer<float32_t3> color;
-
-#endif // _NBL_EXAMPLES_GC_BASIC_VERTEX_INPUT_HLSL_
-/*
-    do not remove this text, WAVE is so bad that you can get errors if no proper ending xD
-*/
diff --git a/common/src/nbl/examples/geometry/shaders/template/gc.common.hlsl b/common/src/nbl/examples/geometry/shaders/template/gc.common.hlsl
deleted file mode 100644
index ff40fb3c8..000000000
--- a/common/src/nbl/examples/geometry/shaders/template/gc.common.hlsl
+++ /dev/null
@@ -1,22 +0,0 @@
-#ifndef _NBL_EXAMPLES_GC_COMMON_HLSL_
-#define _NBL_EXAMPLES_GC_COMMON_HLSL_
-
-
-#include "common/SBasicViewParameters.hlsl"
-
-#ifdef __HLSL_VERSION
-[[vk::push_constant]] SBasicViewParameters params;
-
-struct PSInput
-{
-	float4 position : SV_Position;
-	float3 color : COLOR0;
-};
-#endif // __HLSL_VERSION
-
-
-#endif // _NBL_EXAMPLES_GC_COMMON_HLSL_
-
-/*
-	do not remove this text, WAVE is so bad that you can get errors if no proper ending xD
-*/
\ No newline at end of file
diff --git a/common/src/nbl/examples/geometry/shaders/template/gc.cone.vertex.input.hlsl b/common/src/nbl/examples/geometry/shaders/template/gc.cone.vertex.input.hlsl
deleted file mode 100644
index 7c40f54ab..000000000
--- a/common/src/nbl/examples/geometry/shaders/template/gc.cone.vertex.input.hlsl
+++ /dev/null
@@ -1,12 +0,0 @@
-#ifndef _NBL_EXAMPLES_GEOMETRY_CONE_VERTEX_INPUT_HLSL_
-#define _NBL_EXAMPLES_GEOMETRY_CONE_VERTEX_INPUT_HLSL_
-
-[[vk::binding(0)]] Buffer<float32_t3> position;
-[[vk::binding(1)]] Buffer<float32_t3> normal;
-[[vk::binding(2)]] Buffer<float32_t3> color;
-
-#endif // _NBL_EXAMPLES_GEOMETRY_CONE_VERTEX_INPUT_HLSL_
-
-/*
-    do not remove this text, WAVE is so bad that you can get errors if no proper ending xD
-*/
diff --git a/common/src/nbl/examples/geometry/shaders/template/gc.ico.vertex.input.hlsl b/common/src/nbl/examples/geometry/shaders/template/gc.ico.vertex.input.hlsl
deleted file mode 100644
index 67092ccf0..000000000
--- a/common/src/nbl/examples/geometry/shaders/template/gc.ico.vertex.input.hlsl
+++ /dev/null
@@ -1,11 +0,0 @@
-#ifndef _NBL_EXAMPLES_GEOMETRY_ICO_VERTEX_INPUT_HLSL_
-#define _NBL_EXAMPLES_GEOMETRY_ICO_VERTEX_INPUT_HLSL_
-
-[[vk::binding(0)]] Buffer<float32_t3> position;
-[[vk::binding(1)]] Buffer<float32_t3> normal;
-[[vk::binding(2)]] Buffer<float32_t2> uv;
-
-#endif // _NBL_EXAMPLES_GEOMETRY_ICO_VERTEX_INPUT_HLSL_
-/*
-    do not remove this text, WAVE is so bad that you can get errors if no proper ending xD
-*/
diff --git a/common/src/nbl/examples/geometry/shaders/template/gc.vertex.hlsl b/common/src/nbl/examples/geometry/shaders/template/gc.vertex.hlsl
deleted file mode 100644
index e878bf7d7..000000000
--- a/common/src/nbl/examples/geometry/shaders/template/gc.vertex.hlsl
+++ /dev/null
@@ -1,15 +0,0 @@
-#include "gc.common.hlsl"
-
-PSInput VSMain()
-{
-    PSInput output;
-
-    output.position = mul(params.MVP, float4(input.position, 1.0));
-    output.color = float4(input.normal * 0.5 + 0.5, 1.0);
-
-    return output;
-}
-
-/*
-    do not remove this text, WAVE is so bad that you can get errors if no proper ending xD
-*/

From 08c28efbe827edf85c564894c86d591841a938ed Mon Sep 17 00:00:00 2001
From: devsh <devsh@devsh.eu>
Date: Thu, 19 Jun 2025 02:37:23 +0200
Subject: [PATCH 401/529] get the example to find shaders and create pipelines,
 but nothing on screen

---
 09_GeometryCreator/main.cpp                   |  7 ++---
 .../geometry/CSimpleDebugRenderer.hpp         |  1 +
 .../examples/geometry/shaders/unified.hlsl    | 27 +++++++++++--------
 3 files changed, 21 insertions(+), 14 deletions(-)

diff --git a/09_GeometryCreator/main.cpp b/09_GeometryCreator/main.cpp
index a78c385ee..422cc7285 100644
--- a/09_GeometryCreator/main.cpp
+++ b/09_GeometryCreator/main.cpp
@@ -13,7 +13,8 @@ class GeometryCreatorApp final : public MonoWindowApplication, public applicatio
 
 	public:
 		GeometryCreatorApp(const path& _localInputCWD, const path& _localOutputCWD, const path& _sharedInputCWD, const path& _sharedOutputCWD)
-			: device_base_t({1280,720}, EF_D16_UNORM, _localInputCWD, _localOutputCWD, _sharedInputCWD, _sharedOutputCWD) {}
+			: IApplicationFramework(_localInputCWD, _localOutputCWD, _sharedInputCWD, _sharedOutputCWD),
+			device_base_t({1280,720}, EF_D16_UNORM, _localInputCWD, _localOutputCWD, _sharedInputCWD, _sharedOutputCWD) {}
 
 		SPhysicalDeviceFeatures getRequiredDeviceFeatures() const override
 		{
@@ -59,7 +60,7 @@ class GeometryCreatorApp final : public MonoWindowApplication, public applicatio
 			);
 
 			// TODO: this is plain wrong Arek
-			auto commonArchive = make_smart_refctd_ptr<system::CMountDirectoryArchive>(localInputCWD/"app_resources",smart_refctd_ptr(m_logger),m_system.get());
+			m_system->mount(make_smart_refctd_ptr<system::CMountDirectoryArchive>(localInputCWD/"../common/include/nbl/examples",smart_refctd_ptr(m_logger),m_system.get()),"nbl/examples");
 			m_system->mount(make_smart_refctd_ptr<system::CMountDirectoryArchive>(localInputCWD/"../common/src/nbl/examples",smart_refctd_ptr(m_logger),m_system.get()),"nbl/examples");
 			
 			auto scRes = static_cast<CDefaultSwapchainFramebuffers*>(m_surface->getSwapchainResources());
@@ -259,7 +260,7 @@ class GeometryCreatorApp final : public MonoWindowApplication, public applicatio
 				if (ev.type==nbl::ui::SMouseEvent::EET_SCROLL && m_renderer)
 				{
 					gcIndex += int16_t(core::sign(ev.scrollEvent.verticalScroll));
-					gcIndex = core::clamp(gcIndex,0ull,m_renderer->getInitParams().geoms.size());
+					gcIndex = core::clamp(gcIndex,0ull,m_renderer->getInitParams().geoms.size()-1);
 				}
 			}
 		}
diff --git a/common/include/nbl/examples/geometry/CSimpleDebugRenderer.hpp b/common/include/nbl/examples/geometry/CSimpleDebugRenderer.hpp
index 7db627050..4308425a2 100644
--- a/common/include/nbl/examples/geometry/CSimpleDebugRenderer.hpp
+++ b/common/include/nbl/examples/geometry/CSimpleDebugRenderer.hpp
@@ -104,6 +104,7 @@ class CSimpleDebugRenderer final : public core::IReferenceCounted
 				if (contents.empty() || bundle.getAssetType()!=IAsset::ET_SHADER)
 					return nullptr;
 				shader = IAsset::castDown<IShader>(contents[0]);
+				shader = device->compileShader({.source=shader.get()});
 				if (!shader)
 					return nullptr;
 			}
diff --git a/common/src/nbl/examples/geometry/shaders/unified.hlsl b/common/src/nbl/examples/geometry/shaders/unified.hlsl
index 1c24ee870..bc6b6e13a 100644
--- a/common/src/nbl/examples/geometry/shaders/unified.hlsl
+++ b/common/src/nbl/examples/geometry/shaders/unified.hlsl
@@ -13,40 +13,45 @@ using namespace nbl::hlsl::examples::geometry_creator_scene;
 struct SInterpolants
 {
 	float32_t4 position : SV_Position;
-	float32_t3 color : COLOR0;
+	float32_t3 meta : COLOR0;
 };
 #include "nbl/builtin/hlsl/math/linalg/fast_affine.hlsl"
 
 //
-SInterpolants BasicVS()
+[shader("vertex")]
+SInterpolants BasicVS(uint32_t VertexIndex : SV_VertexID)
 {
-    const float32_t3 position = utbs[pc.positionView].xyz;
+    const float32_t3 position = utbs[pc.positionView][VertexIndex].xyz;
 
     SInterpolants output;
     output.position = math::linalg::promoted_mul(pc.matrices.worldViewProj,position);
-    output.color = mul(pc.matrices.normalMat,utbs[pc.normalView].xyz)*0.5+promote<float32_t3>(0.5f);
+    output.meta = mul(pc.matrices.normal,utbs[pc.normalView][VertexIndex].xyz);
     return output;
 }
+[shader("pixel")]
 float32_t4 BasicFS(SInterpolants input) : SV_Target0
 {
-    return float32_t4(input.color,1.f);
+    return float32_t4(normalize(input.meta)*0.5f+promote<float32_t3>(0.5f),1.f);
 }
 
 // TODO: do smooth normals on the cone
-SInterpolants ConeVS()
+[shader("vertex")]
+SInterpolants ConeVS(uint32_t VertexIndex : SV_VertexID)
 {
-    const float32_t3 position = utbs[pc.positionView].xyz;
+    const float32_t3 position = utbs[pc.positionView][VertexIndex].xyz;
 
     SInterpolants output;
     output.position = math::linalg::promoted_mul(pc.matrices.worldViewProj,position);
-    output.color = mul(inverse(transpose(pc.matrices.normalMat)),position);
+    output.meta = mul(inverse(transpose(pc.matrices.normal)),position);
     return output;
 }
+[shader("pixel")]
 float32_t4 ConeFS(SInterpolants input) : SV_Target0
 {
     const float32_t2x3 dViewPos_dScreen = float32_t2x3(
-        ddx(input.color),
-        ddy(input.color)
+        ddx(input.meta),
+        ddy(input.meta)
     );
-    return float32_t4(normalize(cross(X,Y))*0.5f+promote<float32_t3>(0.5f),1.f);
+    const float32_t3 normal = cross(dViewPos_dScreen[0],dViewPos_dScreen[1]);
+    return float32_t4(normalize(normal)*0.5f+promote<float32_t3>(0.5f),1.f);
 }
\ No newline at end of file

From 17b0579da795261b5f2c20b4f040e4bbe674ca10 Mon Sep 17 00:00:00 2001
From: devsh <devsh@devsh.eu>
Date: Thu, 19 Jun 2025 02:47:37 +0200
Subject: [PATCH 402/529] scene was empty, no wonder nothing drew

---
 09_GeometryCreator/main.cpp | 9 +++++++++
 1 file changed, 9 insertions(+)

diff --git a/09_GeometryCreator/main.cpp b/09_GeometryCreator/main.cpp
index 422cc7285..c087eba07 100644
--- a/09_GeometryCreator/main.cpp
+++ b/09_GeometryCreator/main.cpp
@@ -67,6 +67,12 @@ class GeometryCreatorApp final : public MonoWindowApplication, public applicatio
 			m_renderer = CSimpleDebugRenderer::create(m_assetMgr.get(),scRes->getRenderpass(),0,m_scene.get());
 			if (!m_renderer)
 				return logFail("Could not create Renderer!");
+			m_renderer->m_instances.resize(1);
+			m_renderer->m_instances[0].world = float32_t3x4(
+				float32_t4(1,0,0,0),
+				float32_t4(0,1,0,0),
+				float32_t4(0,0,1,0)
+			);
 
 			// camera
 			{
@@ -147,6 +153,9 @@ class GeometryCreatorApp final : public MonoWindowApplication, public applicatio
 				memcpy(&viewProjMatrix,camera.getConcatenatedMatrix().pointer(),sizeof(viewMatrix));
 			}
 			const auto viewParams = CSimpleDebugRenderer::SViewParams(viewMatrix,viewProjMatrix);
+
+			// tear down scene every frame
+			m_renderer->m_instances[0].packedGeo = m_renderer->getInitParams().geoms.data()+gcIndex;
 			m_renderer->render(cb,viewParams);
 
 			cb->endRenderPass();

From 62f1a2684f6bc21f5c7f6bd84ed552f248f64657 Mon Sep 17 00:00:00 2001
From: devsh <devsh@devsh.eu>
Date: Thu, 19 Jun 2025 02:58:03 +0200
Subject: [PATCH 403/529] index type was not being set

---
 .../nbl/examples/geometry/CGeometryCreatorScene.hpp  |  2 +-
 .../nbl/examples/geometry/CSimpleDebugRenderer.hpp   | 12 ++++++++++++
 2 files changed, 13 insertions(+), 1 deletion(-)

diff --git a/common/include/nbl/examples/geometry/CGeometryCreatorScene.hpp b/common/include/nbl/examples/geometry/CGeometryCreatorScene.hpp
index 8a73f2e14..63b3d7a8d 100644
--- a/common/include/nbl/examples/geometry/CGeometryCreatorScene.hpp
+++ b/common/include/nbl/examples/geometry/CGeometryCreatorScene.hpp
@@ -167,7 +167,7 @@ class CGeometryCreatorScene : public core::IReferenceCounted
 		//
 		struct SNamedGeometry
 		{
-			std::string_view name = {};
+			std::string name = {};
 			core::smart_refctd_ptr<video::IGPUPolygonGeometry> geom;
 		};
 		std::span<const SNamedGeometry> getGeometries() const {return m_geometries;}
diff --git a/common/include/nbl/examples/geometry/CSimpleDebugRenderer.hpp b/common/include/nbl/examples/geometry/CSimpleDebugRenderer.hpp
index 4308425a2..474f1d350 100644
--- a/common/include/nbl/examples/geometry/CSimpleDebugRenderer.hpp
+++ b/common/include/nbl/examples/geometry/CSimpleDebugRenderer.hpp
@@ -235,6 +235,18 @@ class CSimpleDebugRenderer final : public core::IReferenceCounted
 					{
 						out.indexBuffer.offset = view.src.offset;
 						out.indexBuffer.buffer = view.src.buffer;
+						switch (view.composed.format)
+						{
+							case E_FORMAT::EF_R16_UINT:
+								out.indexType = EIT_16BIT;
+								break;
+							case E_FORMAT::EF_R32_UINT:
+								out.indexType = EIT_32BIT;
+								break;
+							default:
+								assert(false);
+								return nullptr;
+						}
 					}
 					out.elementCount = geom->getVertexReferenceCount();
 					out.positionView = allocateUTB(geom->getPositionView());

From fcae0b438805e33c2884f75ea28aaa16e273e11f Mon Sep 17 00:00:00 2001
From: kevyuu <kevin.kayu@gmail.com>
Date: Thu, 19 Jun 2025 19:14:49 +0700
Subject: [PATCH 404/529] Initial commit for example 71 to use
 ICPUPolygonGeometry

---
 71_RayTracingPipeline/include/common.hpp |  62 ++++++++++
 71_RayTracingPipeline/main.cpp           | 145 ++++++-----------------
 2 files changed, 101 insertions(+), 106 deletions(-)

diff --git a/71_RayTracingPipeline/include/common.hpp b/71_RayTracingPipeline/include/common.hpp
index c60e0c3e5..184d424c7 100644
--- a/71_RayTracingPipeline/include/common.hpp
+++ b/71_RayTracingPipeline/include/common.hpp
@@ -19,4 +19,66 @@ using namespace nbl::examples;
 
 #include "app_resources/common.hlsl"
 
+namespace nbl::scene
+{
+
+enum ObjectType : uint8_t
+{
+	OT_CUBE,
+	OT_SPHERE,
+	OT_CYLINDER,
+	OT_RECTANGLE,
+	OT_DISK,
+	OT_ARROW,
+	OT_CONE,
+	OT_ICOSPHERE,
+
+	OT_COUNT,
+	OT_UNKNOWN = std::numeric_limits<uint8_t>::max()
+};
+
+static constexpr uint32_t s_smoothNormals[OT_COUNT] = { 0, 1, 1, 0, 0, 1, 1, 1 };
+
+struct ObjectMeta
+{
+	ObjectType type = OT_UNKNOWN;
+	std::string_view name = "Unknown";
+};
+
+struct ObjectDrawHookCpu
+{
+	nbl::core::matrix3x4SIMD model;
+	ObjectMeta meta;
+};
+
+struct ReferenceObjectCpu
+{
+	ObjectMeta meta;
+	core::smart_refctd_ptr<ICPUPolygonGeometry> data;
+	Material material;
+  core::matrix3x4SIMD transform;
+};
+
+struct ReferenceObjectGpu
+{
+	struct Bindings
+	{
+		nbl::asset::SBufferBinding<IGPUBuffer> vertex, index;
+	};
+
+	ObjectMeta meta;
+	Bindings bindings;
+	uint32_t vertexStride;
+	nbl::asset::E_INDEX_TYPE indexType = nbl::asset::E_INDEX_TYPE::EIT_UNKNOWN;
+	uint32_t indexCount = {};
+	MaterialPacked material;
+  core::matrix3x4SIMD transform;
+
+	const bool useIndex() const
+	{
+		return bindings.index.buffer && (indexType != E_INDEX_TYPE::EIT_UNKNOWN);
+	}
+};
+}
+
 #endif // __NBL_THIS_EXAMPLE_COMMON_H_INCLUDED__
diff --git a/71_RayTracingPipeline/main.cpp b/71_RayTracingPipeline/main.cpp
index 453e9cf69..382e5cccb 100644
--- a/71_RayTracingPipeline/main.cpp
+++ b/71_RayTracingPipeline/main.cpp
@@ -1109,25 +1109,23 @@ class RaytracingPipelineApp final : public SimpleWindowedApplication, public app
 
 		// triangles geometries
 		auto geometryCreator = make_smart_refctd_ptr<CGeometryCreator>();
-#if 1
-		return false;
-#else
+
 		const auto cpuObjects = std::array{
-			ReferenceObjectCpu {
-				.meta = {.type = OT_RECTANGLE, .name = "Plane Mesh"},
-				.data = gc->createRectangleMesh(nbl::core::vector2df_SIMD(10, 10)),
+			scene::ReferenceObjectCpu {
+				.meta = {.type = scene::OT_RECTANGLE, .name = "Plane Mesh"},
+				.data = geometryCreator->createRectangle({10, 10}),
 				.material = defaultMaterial,
 				.transform = planeTransform,
 			},
-			ReferenceObjectCpu {
-				.meta = {.type = OT_CUBE, .name = "Cube Mesh"},
-				.data = gc->createCubeMesh(nbl::core::vector3df(1, 1, 1)),
+			scene::ReferenceObjectCpu {
+				.meta = {.type = scene::OT_CUBE, .name = "Cube Mesh"},
+				.data = geometryCreator->createCube({1, 1, 1}),
 				.material = defaultMaterial,
 				.transform = getTranslationMatrix(0, 0.5f, 0),
 			},
-			ReferenceObjectCpu {
-				.meta = {.type = OT_CUBE, .name = "Cube Mesh 2"},
-				.data = gc->createCubeMesh(nbl::core::vector3df(1.5, 1.5, 1.5)),
+			scene::ReferenceObjectCpu {
+				.meta = {.type = scene::OT_CUBE, .name = "Cube Mesh 2"},
+				.data = geometryCreator->createCube({1.5, 1.5, 1.5}),
 				.material = Material{
 					.ambient = {0.1, 0.1, 0.2},
 					.diffuse = {0.2, 0.2, 0.8},
@@ -1137,9 +1135,9 @@ class RaytracingPipelineApp final : public SimpleWindowedApplication, public app
 				},
 				.transform = getTranslationMatrix(-5.0f, 1.0f, 0),
 			},
-			ReferenceObjectCpu {
-				.meta = {.type = OT_CUBE, .name = "Transparent Cube Mesh"},
-				.data = gc->createCubeMesh(nbl::core::vector3df(1.5, 1.5, 1.5)),
+			scene::ReferenceObjectCpu {
+				.meta = {.type = scene::OT_CUBE, .name = "Transparent Cube Mesh"},
+				.data = geometryCreator->createCube({1.5, 1.5, 1.5}),
 				.material = Material{
 					.ambient = {0.1, 0.2, 0.1},
 					.diffuse = {0.2, 0.8, 0.2},
@@ -1151,40 +1149,6 @@ class RaytracingPipelineApp final : public SimpleWindowedApplication, public app
 			},
 		};
 
-		struct CPUTriBufferBindings
-		{
-			nbl::asset::SBufferBinding<ICPUBuffer> vertex, index;
-		};
-		std::array<CPUTriBufferBindings, std::size(cpuObjects)> cpuTriBuffers;
-
-		for (uint32_t i = 0; i < cpuObjects.size(); i++)
-		{
-			const auto& cpuObject = cpuObjects[i];
-
-			auto vBuffer = smart_refctd_ptr(cpuObject.data.bindings[0].buffer); // no offset
-			auto vUsage = bitflag(IGPUBuffer::EUF_STORAGE_BUFFER_BIT) | IGPUBuffer::EUF_TRANSFER_DST_BIT | IGPUBuffer::EUF_INLINE_UPDATE_VIA_CMDBUF |
-				IGPUBuffer::EUF_ACCELERATION_STRUCTURE_BUILD_INPUT_READ_ONLY_BIT | IGPUBuffer::EUF_SHADER_DEVICE_ADDRESS_BIT;
-			vBuffer->addUsageFlags(vUsage);
-			vBuffer->setContentHash(vBuffer->computeContentHash());
-
-			auto iBuffer = smart_refctd_ptr(cpuObject.data.indexBuffer.buffer); // no offset
-			auto iUsage = bitflag(IGPUBuffer::EUF_STORAGE_BUFFER_BIT) | IGPUBuffer::EUF_TRANSFER_DST_BIT | IGPUBuffer::EUF_INLINE_UPDATE_VIA_CMDBUF |
-				IGPUBuffer::EUF_ACCELERATION_STRUCTURE_BUILD_INPUT_READ_ONLY_BIT | IGPUBuffer::EUF_SHADER_DEVICE_ADDRESS_BIT;
-
-			if (cpuObject.data.indexType != EIT_UNKNOWN)
-				if (iBuffer)
-				{
-					iBuffer->addUsageFlags(iUsage);
-					iBuffer->setContentHash(iBuffer->computeContentHash());
-				}
-
-			cpuTriBuffers[i] = {
-			  .vertex = {.offset = 0, .buffer = vBuffer},
-			  .index = {.offset = 0, .buffer = iBuffer},
-			};
-
-		}
-
 		// procedural geometries
 		using Aabb = IGPUBottomLevelAccelerationStructure::AABB_t;
 
@@ -1233,10 +1197,10 @@ class RaytracingPipelineApp final : public SimpleWindowedApplication, public app
 		const auto blasCount = std::size(cpuObjects) + 1;
 		const auto proceduralBlasIdx = std::size(cpuObjects);
 
-		std::array<smart_refctd_ptr<ICPUBottomLevelAccelerationStructure>, std::size(cpuObjects)+1u> cpuBlas;
+		std::array<smart_refctd_ptr<ICPUBottomLevelAccelerationStructure>, std::size(cpuObjects)+1u> cpuBlasList;
 		for (uint32_t i = 0; i < blasCount; i++)
 		{
-			auto& blas = cpuBlas[i];
+			auto& blas = cpuBlasList[i];
 			blas = make_smart_refctd_ptr<ICPUBottomLevelAccelerationStructure>();
 
 			if (i == proceduralBlasIdx)
@@ -1256,30 +1220,15 @@ class RaytracingPipelineApp final : public SimpleWindowedApplication, public app
 			}
 			else
 			{
-				auto triangles = make_refctd_dynamic_array<smart_refctd_dynamic_array<ICPUBottomLevelAccelerationStructure::Triangles<ICPUBuffer>>>(1u);
+				auto triangles = make_refctd_dynamic_array<smart_refctd_dynamic_array<ICPUBottomLevelAccelerationStructure::Triangles<ICPUBuffer>>>(cpuObjects[i].data->exportForBLAS());
 				auto primitiveCounts = make_refctd_dynamic_array<smart_refctd_dynamic_array<uint32_t>>(1u);
 
 				auto& tri = triangles->front();
-				auto& primCount = primitiveCounts->front();
-				const auto& geom = cpuObjects[i];
-				const auto& cpuBuf = cpuTriBuffers[i];
 
-				const bool useIndex = geom.data.indexType != EIT_UNKNOWN;
-				const uint32_t vertexStride = geom.data.inputParams.bindings[0].stride;
-				const uint32_t numVertices = cpuBuf.vertex.buffer->getSize() / vertexStride;
+				auto& primCount = primitiveCounts->front();
+				primCount = cpuObjects[i].data->getPrimitiveCount();
 
-				if (useIndex)
-					primCount = geom.data.indexCount / 3;
-				else
-					primCount = numVertices / 3;
-
-				tri.vertexData[0] = cpuBuf.vertex;
-				tri.indexData = useIndex ? cpuBuf.index : cpuBuf.vertex;
-				tri.maxVertex = numVertices - 1;
-				tri.vertexStride = vertexStride;
-				tri.vertexFormat = EF_R32G32B32_SFLOAT;
-				tri.indexType = geom.data.indexType;
-				tri.geometryFlags = geom.material.isTransparent() ?
+				tri.geometryFlags = cpuObjects[i].material.isTransparent() ?
 					IGPUBottomLevelAccelerationStructure::GEOMETRY_FLAGS::NO_DUPLICATE_ANY_HIT_INVOCATION_BIT :
 					IGPUBottomLevelAccelerationStructure::GEOMETRY_FLAGS::OPAQUE_BIT;
 
@@ -1305,7 +1254,7 @@ class RaytracingPipelineApp final : public SimpleWindowedApplication, public app
 			{
 				const auto isProceduralInstance = i == proceduralBlasIdx;
 				ICPUTopLevelAccelerationStructure::StaticInstance inst;
-				inst.base.blas = cpuBlas[i];
+				inst.base.blas = cpuBlasList[i];
 				inst.base.flags = static_cast<uint32_t>(IGPUTopLevelAccelerationStructure::INSTANCE_FLAGS::TRIANGLE_FACING_CULL_DISABLE_BIT);
 				inst.base.instanceCustomIndex = i;
 				inst.base.instanceShaderBindingTableRecordOffset = isProceduralInstance ? 2 : 0;;
@@ -1356,18 +1305,19 @@ class RaytracingPipelineApp final : public SimpleWindowedApplication, public app
 		inputs.allocator = &myalloc;
 
 		std::array<ICPUTopLevelAccelerationStructure*, 1u> tmpTlas;
-		std::array<ICPUBuffer*, 2 * std::size(cpuObjects) + 1u> tmpBuffers;
+		std::array<ICPUPolygonGeometry*, std::size(cpuObjects)> tmpGeometries;
+		std::array<ICPUBuffer*, 1> tmpBuffers;
 		{
 			tmpTlas[0] = cpuTlas.get();
+			tmpBuffers[0] = cpuProcBuffer.get();
 			for (uint32_t i = 0; i < cpuObjects.size(); i++)
 			{
-				tmpBuffers[2 * i + 0] = cpuTriBuffers[i].vertex.buffer.get();
-				tmpBuffers[2 * i + 1] = cpuTriBuffers[i].index.buffer.get();
+				tmpGeometries[i] = cpuObjects[i].data.get();
 			}
-			tmpBuffers[2 * proceduralBlasIdx] = cpuProcBuffer.get();
 
 			std::get<CAssetConverter::SInputs::asset_span_t<ICPUTopLevelAccelerationStructure>>(inputs.assets) = tmpTlas;
 			std::get<CAssetConverter::SInputs::asset_span_t<ICPUBuffer>>(inputs.assets) = tmpBuffers;
+			std::get<CAssetConverter::SInputs::asset_span_t<ICPUPolygonGeometry>>(inputs.assets) = tmpGeometries;
 		}
 
 		auto reservation = converter->reserve(inputs);
@@ -1475,37 +1425,24 @@ class RaytracingPipelineApp final : public SimpleWindowedApplication, public app
 			auto&& tlases = reservation.getGPUObjects<ICPUTopLevelAccelerationStructure>();
 			m_gpuTlas = tlases[0].value;
 			auto&& buffers = reservation.getGPUObjects<ICPUBuffer>();
-			for (uint32_t i = 0; i < cpuObjects.size(); i++)
-			{
-				auto& cpuObject = cpuObjects[i];
-
-				m_gpuTriangleGeometries.push_back(ReferenceObjectGpu{
-				  .meta = cpuObject.meta,
-				  .bindings = {
-					.vertex = {.offset = 0, .buffer = buffers[2 * i + 0].value },
-					.index = {.offset = 0, .buffer = buffers[2 * i + 1].value },
-				  },
-				  .vertexStride = cpuObject.data.inputParams.bindings[0].stride,
-				  .indexType = cpuObject.data.indexType,
-				  .indexCount = cpuObject.data.indexCount,
-				  .material = hlsl::_static_cast<MaterialPacked>(cpuObject.material),
-				  .transform = cpuObject.transform,
-					});
-			}
+
 			m_proceduralAabbBuffer = buffers[2 * proceduralBlasIdx].value;
 
-			for (uint32_t i = 0; i < m_gpuTriangleGeometries.size(); i++)
+			for (uint32_t i = 0; i < cpuObjects.size(); i++)
 			{
-				const auto& gpuObject = m_gpuTriangleGeometries[i];
-				const uint64_t vertexBufferAddress = gpuObject.bindings.vertex.buffer->getDeviceAddress();
+				const auto& cpuObject = cpuObjects[i];
+				const auto& cpuBlas = cpuBlasList[i];
+				const auto& geometry = cpuBlas->getTriangleGeometries()[0];
+				const uint64_t vertexBufferAddress = buffers[2 * i].value->getDeviceAddress();
+				const uint64_t indexBufferAddress = buffers[(2 * i) + 1].value->getDeviceAddress();
 				geomInfos[i] = {
-				  .material = gpuObject.material,
+				  .material = hlsl::_static_cast<MaterialPacked>(cpuObject.material),
 				  .vertexBufferAddress = vertexBufferAddress,
-				  .indexBufferAddress = gpuObject.useIndex() ? gpuObject.bindings.index.buffer->getDeviceAddress() : vertexBufferAddress,
-				  .vertexStride = gpuObject.vertexStride,
-				  .objType = gpuObject.meta.type,
-				  .indexType = gpuObject.indexType,
-				  .smoothNormals = s_smoothNormals[gpuObject.meta.type],
+				  .indexBufferAddress = geometry.indexData.buffer ? indexBufferAddress : vertexBufferAddress,
+				  .vertexStride = geometry.vertexStride,
+				  .objType = cpuObject.meta.type,
+				  .indexType = geometry.indexType,
+				  .smoothNormals = scene::s_smoothNormals[cpuObject.meta.type],
 				};
 			}
 		}
@@ -1516,12 +1453,10 @@ class RaytracingPipelineApp final : public SimpleWindowedApplication, public app
 			params.size = geomInfoBuffer->getSize();
 			m_utils->createFilledDeviceLocalBufferOnDedMem(SIntendedSubmitInfo{ .queue = queue }, std::move(params), geomInfos).move_into(m_triangleGeomInfoBuffer);
 		}
-#endif
+
 		return true;
 	}
 
-
-
 	smart_refctd_ptr<IWindow> m_window;
 	smart_refctd_ptr<CSimpleResizeSurface<ISimpleManagedSurface::ISwapchainResources>> m_surface;
 	smart_refctd_ptr<ISemaphore> m_semaphore;
@@ -1570,8 +1505,6 @@ class RaytracingPipelineApp final : public SimpleWindowedApplication, public app
 	} m_ui;
 	core::smart_refctd_ptr<IDescriptorPool> m_guiDescriptorSetPool;
 
-	// TODO: how much of this do we actually have to keep ?
-//	core::vector<ReferenceObjectGpu> m_gpuTriangleGeometries;
 	core::vector<SProceduralGeomInfo> m_gpuIntersectionSpheres;
 	uint32_t m_intersectionHitGroupIdx;
 

From e419580318a44baaf7f2050bd988200f1ab00f08 Mon Sep 17 00:00:00 2001
From: Arkadiusz Lachowicz <areklachowicz@gmail.com>
Date: Thu, 19 Jun 2025 23:13:28 +0200
Subject: [PATCH 405/529] docs docs docs, adjust to comments and test on 01
 after updates

---
 01_HelloCoreSystemAsset/main.cpp              |  2 +-
 CMakeLists.txt                                | 32 +++++++++-----
 common/CMakeLists.txt                         | 43 ++++++++++++++++++-
 common/include/nbl/examples/PCH.hpp           | 18 ++++++--
 common/include/nbl/examples/api.hpp           | 24 -----------
 common/include/nbl/examples/examples.hpp      | 10 ++++-
 common/src/nbl/examples/CMakeLists.txt        | 14 +-----
 .../src/nbl/examples/cameras/CMakeLists.txt   |  7 ---
 8 files changed, 90 insertions(+), 60 deletions(-)
 delete mode 100644 common/include/nbl/examples/api.hpp
 delete mode 100644 common/src/nbl/examples/cameras/CMakeLists.txt

diff --git a/01_HelloCoreSystemAsset/main.cpp b/01_HelloCoreSystemAsset/main.cpp
index 96e4a0d4e..7ca4badb4 100644
--- a/01_HelloCoreSystemAsset/main.cpp
+++ b/01_HelloCoreSystemAsset/main.cpp
@@ -3,7 +3,7 @@
 // For conditions of distribution and use, see copyright notice in nabla.h
 
 // <nabla.h> public interface and common examples API, always include first before std:: headers
-#include "nbl/examples/api.hpp"
+#include "nbl/examples/examples.hpp"
 
 #include "nbl/system/IApplicationFramework.h"
 
diff --git a/CMakeLists.txt b/CMakeLists.txt
index 3a168b061..41ed86b52 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -9,6 +9,17 @@ if(NBL_BUILD_EXAMPLES)
 		nbl_android_create_media_storage_apk()
 	endif()
 
+	#! Common api library & precompiled headers for Nabla framework examples
+	add_subdirectory(common EXCLUDE_FROM_ALL)
+
+	#! use "EXCLUDE_FROM_ALL" to exclude an example from the NablaExamples project
+	#[[
+		useful if we don't want the example to be tested by CI but still want
+		the example's project to be generated
+
+		https://cmake.org/cmake/help/latest/prop_tgt/EXCLUDE_FROM_ALL.html
+	]]
+
 	# showcase the use of `nbl::core`,`nbl::system` and `nbl::asset`
 	add_subdirectory(01_HelloCoreSystemAsset)
 	# showcase the use of `system::IApplicationFramework` and `nbl::video`
@@ -31,7 +42,6 @@ if(NBL_BUILD_EXAMPLES)
 	# showcase use of FFT for post-FX Bloom  effect
 	add_subdirectory(11_FFT)
 
-
 	# Waiting for a refactor
 	#add_subdirectory(27_PLYSTLDemo)
 	#add_subdirectory(33_Draw3DLine)
@@ -42,7 +52,7 @@ if(NBL_BUILD_EXAMPLES)
 	add_subdirectory(22_CppCompat)
 	add_subdirectory(23_Arithmetic2UnitTest)
 	add_subdirectory(24_ColorSpaceTest)
-	add_subdirectory(25_FilterTest)
+	add_subdirectory(25_FilterTest EXCLUDE_FROM_ALL)
 	add_subdirectory(26_Blur)
 	add_subdirectory(27_MPMCScheduler)	
 	add_subdirectory(28_FFTBloom)
@@ -58,27 +68,29 @@ if(NBL_BUILD_EXAMPLES)
 	# endif()
 
 	#add_subdirectory(43_SumAndCDFFilters)
-	add_subdirectory(47_DerivMapTest)
-	add_subdirectory(54_Transformations)
-	add_subdirectory(55_RGB18E7S3)
+	add_subdirectory(47_DerivMapTest EXCLUDE_FROM_ALL)
+	add_subdirectory(54_Transformations EXCLUDE_FROM_ALL)
+	add_subdirectory(55_RGB18E7S3 EXCLUDE_FROM_ALL)
 	add_subdirectory(61_UI)
 	add_subdirectory(62_CAD)
-	add_subdirectory(62_SchusslerTest)
+	add_subdirectory(62_SchusslerTest EXCLUDE_FROM_ALL)
 	add_subdirectory(64_EmulatedFloatTest)
-	add_subdirectory(0_ImportanceSamplingEnvMaps) #TODO: integrate back into 42
+	add_subdirectory(0_ImportanceSamplingEnvMaps EXCLUDE_FROM_ALL) #TODO: integrate back into 42
 
-	add_subdirectory(66_HLSLBxDFTests)
+	add_subdirectory(66_HLSLBxDFTests EXCLUDE_FROM_ALL)
 	add_subdirectory(67_RayQueryGeometry)
 	add_subdirectory(68_JpegLoading)
 
   	add_subdirectory(70_FLIPFluids)
 	add_subdirectory(71_RayTracingPipeline)
 
+	# add new examples *before* NBL_GET_ALL_TARGETS invocation, it gathers recursively all targets created so far in this subdirectory
 	NBL_GET_ALL_TARGETS(TARGETS)
 
-	# PCH & CommonAPI library for Nabla framework examples
-	add_subdirectory(common EXCLUDE_FROM_ALL)
+	# we want to loop only over the examples so we exclude examples' interface libraries created in common subdirectory
+	list(REMOVE_ITEM TARGETS ${NBL_EXAMPLES_API_TARGET} ${NBL_EXAMPLES_API_LIBRARIES})
 
+	# we link common example api library and force examples to reuse its PCH
 	foreach(T IN LISTS TARGETS)
         target_link_libraries(${T} PUBLIC ${NBL_EXAMPLES_API_TARGET})
 		target_include_directories(${T} PUBLIC $<TARGET_PROPERTY:${NBL_EXAMPLES_API_TARGET},INCLUDE_DIRECTORIES>)
diff --git a/common/CMakeLists.txt b/common/CMakeLists.txt
index 3cdcce82d..3a55e7a26 100644
--- a/common/CMakeLists.txt
+++ b/common/CMakeLists.txt
@@ -1,7 +1,48 @@
+#! Examples API proxy library
+#[[
+    We create the Nabla Examples API as a static library extension, this
+    allows all examples to reuse a single precompiled header (PCH)
+    instead of generating their own
+
+    The PCH includes Nabla.h + example common interface headers and takes 
+    around 1 GB per configuration, so sharing it avoids significant disk space waste
+]]
+
 nbl_create_ext_library_project(ExamplesAPI "" "${CMAKE_CURRENT_SOURCE_DIR}/src/nbl/examples/pch.cpp" "${CMAKE_CURRENT_SOURCE_DIR}/include" "" "")
 
 set_target_properties(${LIB_NAME} PROPERTIES DISABLE_PRECOMPILE_HEADERS OFF)
 target_precompile_headers(${LIB_NAME} PUBLIC "${CMAKE_CURRENT_SOURCE_DIR}/include/nbl/examples/PCH.hpp")
 
+#! Examples API common libraries
+#[[
+    The rule is to avoid creating additional libraries as part of the examples' common
+    interface in order to prevent generating another precompiled header (PCH) and wasting disk space
+
+    If you have new utilities that could be shared across examples then try to implement them as header only
+    and include in the PCH or in `examples.h` *if you cannot* (open the header to see details)
+
+    but If you have a good reason to create library because you cannot make it header only 
+    AND you *can REUSE* the examples' PCH then go ahead anyway and put it under `src/nbl/examples`, 
+    otherwise keep it header only - a good example would be to use our embedded-whatever-you-want tool
+    which does create library but can reuse example's PCH
+]]
+
+#! NOTE: as I write it we don't have any targets there yet
 add_subdirectory("src/nbl/examples" EXCLUDE_FROM_ALL)
-set(NBL_EXAMPLES_API_TARGET ${LIB_NAME} PARENT_SCOPE)
\ No newline at end of file
+
+NBL_GET_ALL_TARGETS(TARGETS)
+list(REMOVE_ITEM TARGETS ${LIB_NAME})
+
+# the Examples API proxy library CMake target name
+#[[
+    this one gets linked to each executable automatically
+]]
+set(NBL_EXAMPLES_API_TARGET ${LIB_NAME} PARENT_SCOPE)
+
+#! names of CMake targets created in src/nbl/examples
+#[[
+    if your example wants to use anything from src/nbl/examples
+    then you must target_link_libraries() the lib you want as we 
+    don't link all those libraries to each executable automatically
+]]
+set(NBL_EXAMPLES_API_LIBRARIES ${TARGETS} PARENT_SCOPE)
\ No newline at end of file
diff --git a/common/include/nbl/examples/PCH.hpp b/common/include/nbl/examples/PCH.hpp
index 671f8b331..5316ce2e8 100644
--- a/common/include/nbl/examples/PCH.hpp
+++ b/common/include/nbl/examples/PCH.hpp
@@ -4,17 +4,29 @@
 #ifndef _NBL_EXAMPLES_PCH_HPP_
 #define _NBL_EXAMPLES_PCH_HPP_
 
-//! public Nabla declarations
+//! Precompiled header (PCH) for Nabla Examples
 /*
     NOTE: currently our whole public and private interface is broken
     and private headers leak to public includes
 */
+
+//! Nabla declarations
 #include "nabla.h"
 
-//! common example headers
+//! Common example interface headers
 
 // why isnt this in `nabla.h` ?
-#include "nbl/application_templates/MonoAssetManagerAndBuiltinResourceApplication.hpp"
+/*
+    because it does stuff like
+
+    #ifdef NBL_EMBED_BUILTIN_RESOURCES
+    #include "nbl/this_example/builtin/CArchive.h"
+    #endif
+
+    hence also cannot be there in PCH but rather in examples.h -> compile errors
+    but only *if* we decide each example handles builtins on NBL_EMBED_BUILTIN_RESOURCES
+*/
+// #include "nbl/application_templates/MonoAssetManagerAndBuiltinResourceApplication.hpp"
 
 #include "nbl/examples/common/SimpleWindowedApplication.hpp"
 #include "nbl/examples/common/MonoWindowApplication.hpp"
diff --git a/common/include/nbl/examples/api.hpp b/common/include/nbl/examples/api.hpp
deleted file mode 100644
index 9b809b8ea..000000000
--- a/common/include/nbl/examples/api.hpp
+++ /dev/null
@@ -1,24 +0,0 @@
-// Copyright (C) 2018-2025 - DevSH Graphics Programming Sp. z O.O.
-// This file is part of the "Nabla Engine".
-// For conditions of distribution and use, see copyright notice in nabla.h
-#ifndef _NBL_EXAMPLES_API_HPP_
-#define _NBL_EXAMPLES_API_HPP_
-
-//! PCH for examples
-/*
-    PCH is compiled only once *if* an example can be promoted to use it, it is
-    when its compile options & definitions set is the same as nblExamplesAPI's
-    each example links to, otherwise it compiles its own PCH
-*/
-#include "nbl/examples/PCH.hpp"
-
-//! common headers used across examples which cannot be part of PCH
-/*
-    NOTE: put here if a header requires defines which may be differ
-*/
-
-// broken? probably to refactor or even remove?
-// #include "nbl/examples/geometry/CGeometryCreatorScene.hpp"
-
-
-#endif // _NBL_EXAMPLES_API_HPP_
\ No newline at end of file
diff --git a/common/include/nbl/examples/examples.hpp b/common/include/nbl/examples/examples.hpp
index a7d8f92e4..985a3960a 100644
--- a/common/include/nbl/examples/examples.hpp
+++ b/common/include/nbl/examples/examples.hpp
@@ -4,8 +4,16 @@
 #ifndef _NBL_EXAMPLES_HPP_
 #define _NBL_EXAMPLES_HPP_
 
-
+//! Precompiled header shared across all examples
 #include "nbl/examples/PCH.hpp"
 
+//! Example specific headers that must not be included in the PCH
+/*
+    NOTE: Add here if they depend on preprocessor definitions
+    or macros that are specific to individual example targets
+    (eg. defined in CMake)
+*/
+
+// #include "..."
 
 #endif // _NBL_EXAMPLES_HPP_
\ No newline at end of file
diff --git a/common/src/nbl/examples/CMakeLists.txt b/common/src/nbl/examples/CMakeLists.txt
index 65c312582..a95372eea 100644
--- a/common/src/nbl/examples/CMakeLists.txt
+++ b/common/src/nbl/examples/CMakeLists.txt
@@ -1,16 +1,4 @@
 # TODO builtin SPIR-V shaders
 # add_subdirectory(geometry EXCLUDE_FROM_ALL)
 
-# TODO: slightly redo and make docs once I get n4ce embed SPIRV tool to build system
-
-# we get all available targets inclusive & below this directory
-# NBL_GET_ALL_TARGETS(NBL_SUBDIRECTORY_TARGETS)
-
-# then we expose common include search directories to all common libraries + create link interface
-# foreach(NBL_TARGET IN LISTS NBL_SUBDIRECTORY_TARGETS)
-#    target_include_directories(${NBL_TARGET} PUBLIC $<TARGET_PROPERTY:nblExamplesAPI,INTERFACE_INCLUDE_DIRECTORIES>)
-#    target_link_libraries(nblExamplesAPI INTERFACE ${NBL_TARGET})
-#endforeach()
-
-#
-# set(NBL_COMMON_API_TARGETS ${NBL_SUBDIRECTORY_TARGETS} PARENT_SCOPE)
\ No newline at end of file
+# TODO: make docs once I get n4ce embed SPIRV tool to build system and then use the tool with Matts new shader
\ No newline at end of file
diff --git a/common/src/nbl/examples/cameras/CMakeLists.txt b/common/src/nbl/examples/cameras/CMakeLists.txt
deleted file mode 100644
index 0b0e59cdc..000000000
--- a/common/src/nbl/examples/cameras/CMakeLists.txt
+++ /dev/null
@@ -1,7 +0,0 @@
-# header only currently
-
-#set(NBL_EXAMPLES_CAMERA_LIB_SOURCES
-#    "${CMAKE_CURRENT_SOURCE_DIR}/*.cpp"
-#)
-
-#nbl_create_ext_library_project(ExampleCameras "" "${NBL_EXAMPLES_CAMERA_LIB_SOURCES}" "" "" "")
\ No newline at end of file

From cfd609c3f01819abd504416d2b52ae72191c15b1 Mon Sep 17 00:00:00 2001
From: devsh <devsh@devsh.eu>
Date: Fri, 20 Jun 2025 16:54:44 +0200
Subject: [PATCH 406/529] matrix math in linalg was fine, I made a typo

---
 09_GeometryCreator/main.cpp | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/09_GeometryCreator/main.cpp b/09_GeometryCreator/main.cpp
index c087eba07..000bec369 100644
--- a/09_GeometryCreator/main.cpp
+++ b/09_GeometryCreator/main.cpp
@@ -150,13 +150,13 @@ class GeometryCreatorApp final : public MonoWindowApplication, public applicatio
 			// TODO: get rid of legacy matrices
 			{
 				memcpy(&viewMatrix,camera.getViewMatrix().pointer(),sizeof(viewMatrix));
-				memcpy(&viewProjMatrix,camera.getConcatenatedMatrix().pointer(),sizeof(viewMatrix));
+				memcpy(&viewProjMatrix,camera.getConcatenatedMatrix().pointer(),sizeof(viewProjMatrix));
 			}
 			const auto viewParams = CSimpleDebugRenderer::SViewParams(viewMatrix,viewProjMatrix);
 
 			// tear down scene every frame
 			m_renderer->m_instances[0].packedGeo = m_renderer->getInitParams().geoms.data()+gcIndex;
-			m_renderer->render(cb,viewParams);
+ 			m_renderer->render(cb,viewParams);
 
 			cb->endRenderPass();
 			cb->end();

From 6242357bb8b98793da13885bdae4586c009e6984 Mon Sep 17 00:00:00 2001
From: devsh <devsh@devsh.eu>
Date: Sat, 21 Jun 2025 00:00:47 +0200
Subject: [PATCH 407/529] fix a caption bug

---
 09_GeometryCreator/main.cpp | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/09_GeometryCreator/main.cpp b/09_GeometryCreator/main.cpp
index 000bec369..6fddd8282 100644
--- a/09_GeometryCreator/main.cpp
+++ b/09_GeometryCreator/main.cpp
@@ -195,7 +195,7 @@ class GeometryCreatorApp final : public MonoWindowApplication, public applicatio
 
 			std::string caption = "[Nabla Engine] Geometry Creator";
 			{
-				caption += ", displaying [" + 
+				caption += ", displaying [";
 				caption += m_scene->getGeometries()[gcIndex].name;
 				caption += "]";
 				m_window->setCaption(caption);

From ef865e79e8d85c2bf0d507f8ee26231bcd3d1d58 Mon Sep 17 00:00:00 2001
From: Erfan Ahmadi <ahmadierfan99@gmail.com>
Date: Sat, 21 Jun 2025 13:54:35 +0400
Subject: [PATCH 408/529] Small change

---
 62_CAD/shaders/globals.hlsl | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/62_CAD/shaders/globals.hlsl b/62_CAD/shaders/globals.hlsl
index 255c46d8a..41c149205 100644
--- a/62_CAD/shaders/globals.hlsl
+++ b/62_CAD/shaders/globals.hlsl
@@ -577,7 +577,7 @@ NBL_CONSTEXPR MajorAxis SelectedMinorAxis = MajorAxis::MAJOR_X; //(MajorAxis) (1
 // Text or MSDF Hatches
 NBL_CONSTEXPR float MSDFPixelRange = 4.0f;
 NBL_CONSTEXPR float MSDFPixelRangeHalf = MSDFPixelRange / 2.0f;
-NBL_CONSTEXPR float MSDFSize = 32.0f; 
+NBL_CONSTEXPR float MSDFSize = 64.0f; 
 NBL_CONSTEXPR uint32_t MSDFMips = 4; 
 NBL_CONSTEXPR float HatchFillMSDFSceenSpaceSize = 8.0; 
 

From 19167c57e6fb80b762263513b10873c85990d09e Mon Sep 17 00:00:00 2001
From: devsh <devsh@devsh.eu>
Date: Sat, 21 Jun 2025 23:36:08 +0200
Subject: [PATCH 409/529] leave pointers on how to port example 62_CAD, also
 `EXCLUDE_FROM_ALL` the examples which are WIP on this branch

---
 62_CAD/main.cpp | 24 ++++++++++++------------
 CMakeLists.txt  |  8 ++++----
 2 files changed, 16 insertions(+), 16 deletions(-)

diff --git a/62_CAD/main.cpp b/62_CAD/main.cpp
index 637c88eda..f873914e2 100644
--- a/62_CAD/main.cpp
+++ b/62_CAD/main.cpp
@@ -1,17 +1,17 @@
-﻿
-using namespace nbl::hlsl;
-using namespace nbl;
-using namespace core;
-using namespace system;
-using namespace asset;
-using namespace ui;
-using namespace video;
+﻿// TODO: Copyright notice
+
 
+#include "nbl/examples/examples.hpp"
+
+using namespace nbl;
+using namespace nbl::core;
+using namespace nbl::hlsl;
+using namespace nbl::system;
+using namespace nbl::asset;
+using namespace nbl::ui;
+using namespace nbl::video;
+// TODO: probably need to be `using namespace nbl::examples` as well, see other examples
 
-#include "nbl/application_templates/MonoAssetManagerAndBuiltinResourceApplication.hpp"
-#include "SimpleWindowedApplication.hpp"
-#include "InputSystem.hpp"
-#include "nbl/video/utilities/CSimpleResizeSurface.h"
 
 #include "nbl/ext/FullScreenTriangle/FullScreenTriangle.h"
 #include "nbl/ext/TextRendering/TextRendering.h"
diff --git a/CMakeLists.txt b/CMakeLists.txt
index 41ed86b52..aa3880762 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -71,18 +71,18 @@ if(NBL_BUILD_EXAMPLES)
 	add_subdirectory(47_DerivMapTest EXCLUDE_FROM_ALL)
 	add_subdirectory(54_Transformations EXCLUDE_FROM_ALL)
 	add_subdirectory(55_RGB18E7S3 EXCLUDE_FROM_ALL)
-	add_subdirectory(61_UI)
-	add_subdirectory(62_CAD)
+	add_subdirectory(61_UI EXCLUDE_FROM_ALL) # TODO: resurrect before `mesh_loaders` merge
+	add_subdirectory(62_CAD EXCLUDE_FROM_ALL) # TODO: Erfan, Przemek, Francisco and co. need to resurrect this
 	add_subdirectory(62_SchusslerTest EXCLUDE_FROM_ALL)
 	add_subdirectory(64_EmulatedFloatTest)
 	add_subdirectory(0_ImportanceSamplingEnvMaps EXCLUDE_FROM_ALL) #TODO: integrate back into 42
 
 	add_subdirectory(66_HLSLBxDFTests EXCLUDE_FROM_ALL)
-	add_subdirectory(67_RayQueryGeometry)
+	add_subdirectory(67_RayQueryGeometry EXCLUDE_FROM_ALL) # TODO: resurrect before `mesh_loaders` merge
 	add_subdirectory(68_JpegLoading)
 
   	add_subdirectory(70_FLIPFluids)
-	add_subdirectory(71_RayTracingPipeline)
+	add_subdirectory(71_RayTracingPipeline EXCLUDE_FROM_ALL) # TODO: resurrect before `mesh_loaders` merge
 
 	# add new examples *before* NBL_GET_ALL_TARGETS invocation, it gathers recursively all targets created so far in this subdirectory
 	NBL_GET_ALL_TARGETS(TARGETS)

From a0622f3c0a71fc8d2e9c4f0f426f3f5695dc89dd Mon Sep 17 00:00:00 2001
From: devsh <devsh@devsh.eu>
Date: Sat, 21 Jun 2025 23:49:19 +0200
Subject: [PATCH 410/529] kill old meshloaders example that didn't work since
 forever

---
 06_MeshLoaders/CMakeLists.txt       |   6 -
 06_MeshLoaders/config.json.template |  28 --
 06_MeshLoaders/main.cpp             | 563 ----------------------------
 06_MeshLoaders/pipeline.groovy      |  50 ---
 4 files changed, 647 deletions(-)
 delete mode 100644 06_MeshLoaders/CMakeLists.txt
 delete mode 100644 06_MeshLoaders/config.json.template
 delete mode 100644 06_MeshLoaders/main.cpp
 delete mode 100644 06_MeshLoaders/pipeline.groovy

diff --git a/06_MeshLoaders/CMakeLists.txt b/06_MeshLoaders/CMakeLists.txt
deleted file mode 100644
index 2f9218f93..000000000
--- a/06_MeshLoaders/CMakeLists.txt
+++ /dev/null
@@ -1,6 +0,0 @@
-include(common RESULT_VARIABLE RES)
-if(NOT RES)
-	message(FATAL_ERROR "common.cmake not found. Should be in {repo_root}/cmake directory")
-endif()
-
-nbl_create_executable_project("" "" "" "" "${NBL_EXECUTABLE_PROJECT_CREATION_PCH_TARGET}")
\ No newline at end of file
diff --git a/06_MeshLoaders/config.json.template b/06_MeshLoaders/config.json.template
deleted file mode 100644
index f961745c1..000000000
--- a/06_MeshLoaders/config.json.template
+++ /dev/null
@@ -1,28 +0,0 @@
-{
-  "enableParallelBuild": true,
-  "threadsPerBuildProcess" : 2,
-  "isExecuted": false,
-  "scriptPath": "",
-  "cmake": {
-    "configurations": [ "Release", "Debug", "RelWithDebInfo" ],
-    "buildModes": [],
-    "requiredOptions": []
-  }, 
-  "profiles": [
-    {
-      "backend": "vulkan",
-      "platform": "windows",
-      "buildModes": [],
-      "runConfiguration": "Release",
-      "gpuArchitectures": []
-    }
-  ],
-  "dependencies": [],
-  "data": [
-    {
-      "dependencies": [],
-      "command": [""],
-      "outputs": []
-    }
-  ]
-}
\ No newline at end of file
diff --git a/06_MeshLoaders/main.cpp b/06_MeshLoaders/main.cpp
deleted file mode 100644
index 75135c033..000000000
--- a/06_MeshLoaders/main.cpp
+++ /dev/null
@@ -1,563 +0,0 @@
-// Copyright (C) 2018-2020 - DevSH Graphics Programming Sp. z O.O.
-// This file is part of the "Nabla Engine".
-// For conditions of distribution and use, see copyright notice in nabla.h
-
-#define _NBL_STATIC_LIB_
-#include <iostream>
-#include <cstdio>
-#include <nabla.h>
-
-#include "CCamera.hpp"
-#include "../common/CommonAPI.h"
-#include "nbl/ext/ScreenShot/ScreenShot.h"
-
-using namespace nbl;
-using namespace core;
-using namespace ui;
-/*
-    Uncomment for more detailed logging
-*/
-
-// #define NBL_MORE_LOGS
-
-class MeshLoadersApp : public ApplicationBase
-{
-    constexpr static uint32_t WIN_W = 1280;
-    constexpr static uint32_t WIN_H = 720;
-    constexpr static uint32_t SC_IMG_COUNT = 3u;
-    constexpr static uint32_t FRAMES_IN_FLIGHT = 5u;
-    constexpr static uint64_t MAX_TIMEOUT = 99999999999999ull;
-    constexpr static size_t NBL_FRAMES_TO_AVERAGE = 100ull;
-
-    static_assert(FRAMES_IN_FLIGHT > SC_IMG_COUNT);
-public:
-    nbl::core::smart_refctd_ptr<nbl::ui::IWindowManager> windowManager;
-    nbl::core::smart_refctd_ptr<nbl::ui::IWindow> window;
-    nbl::core::smart_refctd_ptr<CommonAPI::CommonAPIEventCallback> windowCb;
-    nbl::core::smart_refctd_ptr<nbl::video::IAPIConnection> apiConnection;
-    nbl::core::smart_refctd_ptr<nbl::video::ISurface> surface;
-    nbl::core::smart_refctd_ptr<nbl::video::IUtilities> utilities;
-    nbl::core::smart_refctd_ptr<nbl::video::ILogicalDevice> logicalDevice;
-    nbl::video::IPhysicalDevice* physicalDevice;
-    std::array<video::IGPUQueue*, CommonAPI::InitOutput::MaxQueuesCount> queues;
-    nbl::core::smart_refctd_ptr<nbl::video::ISwapchain> swapchain;
-    nbl::core::smart_refctd_ptr<nbl::video::IGPURenderpass> renderpass;
-    nbl::core::smart_refctd_dynamic_array<nbl::core::smart_refctd_ptr<nbl::video::IGPUFramebuffer>> fbo;
-    std::array<std::array<nbl::core::smart_refctd_ptr<nbl::video::IGPUCommandPool>, CommonAPI::InitOutput::MaxFramesInFlight>, CommonAPI::InitOutput::MaxQueuesCount> commandPools;
-    nbl::core::smart_refctd_ptr<nbl::system::ISystem> system;
-    nbl::core::smart_refctd_ptr<nbl::asset::IAssetManager> assetManager;
-    nbl::video::IGPUObjectFromAssetConverter::SParams cpu2gpuParams;
-    nbl::core::smart_refctd_ptr<nbl::system::ILogger> logger;
-    nbl::core::smart_refctd_ptr<CommonAPI::InputSystem> inputSystem;
-
-    nbl::video::IGPUObjectFromAssetConverter cpu2gpu;
-    
-    video::IDeviceMemoryBacked::SDeviceMemoryRequirements ubomemreq;
-    core::smart_refctd_ptr<video::IGPUBuffer> gpuubo;
-    core::smart_refctd_ptr<video::IGPUDescriptorSet> gpuds1;
-
-    core::smart_refctd_ptr<video::IQueryPool> occlusionQueryPool;
-    core::smart_refctd_ptr<video::IQueryPool> timestampQueryPool;
-
-    asset::ICPUMesh* meshRaw = nullptr;
-    const asset::COBJMetadata* metaOBJ = nullptr;
-
-    core::smart_refctd_ptr<video::IGPUFence> frameComplete[FRAMES_IN_FLIGHT] = { nullptr };
-    core::smart_refctd_ptr<video::IGPUSemaphore> imageAcquire[FRAMES_IN_FLIGHT] = { nullptr };
-    core::smart_refctd_ptr<video::IGPUSemaphore> renderFinished[FRAMES_IN_FLIGHT] = { nullptr };
-    core::smart_refctd_ptr<video::IGPUCommandBuffer> commandBuffers[FRAMES_IN_FLIGHT];
-
-    CommonAPI::InputSystem::ChannelReader<IMouseEventChannel> mouse;
-    CommonAPI::InputSystem::ChannelReader<IKeyboardEventChannel> keyboard;
-    Camera camera = Camera(vectorSIMDf(0, 0, 0), vectorSIMDf(0, 0, 0), matrix4SIMD());
-
-    using RENDERPASS_INDEPENDENT_PIPELINE_ADRESS = size_t;
-    std::map<RENDERPASS_INDEPENDENT_PIPELINE_ADRESS, core::smart_refctd_ptr<video::IGPUGraphicsPipeline>> gpuPipelines;
-    core::smart_refctd_ptr<video::IGPUMesh> gpumesh;
-    const asset::ICPUMeshBuffer* firstMeshBuffer;
-    const nbl::asset::COBJMetadata::CRenderpassIndependentPipeline* pipelineMetadata;
-    nbl::video::ISwapchain::SCreationParams m_swapchainCreationParams;
-
-    uint32_t ds1UboBinding = 0;
-    int resourceIx;
-    uint32_t acquiredNextFBO = {};
-    std::chrono::steady_clock::time_point lastTime;
-    bool frameDataFilled = false;
-    size_t frame_count = 0ull;
-    double time_sum = 0;
-    double dtList[NBL_FRAMES_TO_AVERAGE] = {};
-
-    video::CDumbPresentationOracle oracle;
-    
-    core::smart_refctd_ptr<video::IGPUBuffer> queryResultsBuffer;
-
-    void setWindow(core::smart_refctd_ptr<nbl::ui::IWindow>&& wnd) override
-    {
-        window = std::move(wnd);
-    }
-    void setSystem(core::smart_refctd_ptr<nbl::system::ISystem>&& s) override
-    {
-        system = std::move(s);
-    }
-    nbl::ui::IWindow* getWindow() override
-    {
-        return window.get();
-    }
-    video::IAPIConnection* getAPIConnection() override
-    {
-        return apiConnection.get();
-    }
-    video::ILogicalDevice* getLogicalDevice()  override
-    {
-        return logicalDevice.get();
-    }
-    video::IGPURenderpass* getRenderpass() override
-    {
-        return renderpass.get();
-    }
-    void setSurface(core::smart_refctd_ptr<video::ISurface>&& s) override
-    {
-        surface = std::move(s);
-    }
-    void setFBOs(std::vector<core::smart_refctd_ptr<video::IGPUFramebuffer>>& f) override
-    {
-        for (int i = 0; i < f.size(); i++)
-        {
-            fbo->begin()[i] = core::smart_refctd_ptr(f[i]);
-        }
-    }
-    void setSwapchain(core::smart_refctd_ptr<video::ISwapchain>&& s) override
-    {
-        swapchain = std::move(s);
-    }
-    uint32_t getSwapchainImageCount() override
-    {
-        return swapchain->getImageCount();
-    }
-    virtual nbl::asset::E_FORMAT getDepthFormat() override
-    {
-        return nbl::asset::EF_D32_SFLOAT;
-    }
-
-    void getAndLogQueryPoolResults()
-    {
-#ifdef QUERY_POOL_LOGS
-        {
-            uint64_t samples_passed[4] = {};
-            auto queryResultFlags = core::bitflag<video::IQueryPool::E_QUERY_RESULTS_FLAGS>(video::IQueryPool::EQRF_WITH_AVAILABILITY_BIT) | video::IQueryPool::EQRF_64_BIT;
-            logicalDevice->getQueryPoolResults(occlusionQueryPool.get(), 0u, 2u, sizeof(samples_passed), samples_passed, sizeof(uint64_t) * 2, queryResultFlags);
-            logger->log("[AVAIL+64] SamplesPassed[0] = %d, SamplesPassed[1] = %d, Result Available = %d, %d", system::ILogger::ELL_INFO, samples_passed[0], samples_passed[2], samples_passed[1], samples_passed[3]);
-        }
-        {
-            uint64_t samples_passed[4] = {};
-            auto queryResultFlags = core::bitflag<video::IQueryPool::E_QUERY_RESULTS_FLAGS>(video::IQueryPool::EQRF_WITH_AVAILABILITY_BIT) | video::IQueryPool::EQRF_64_BIT | video::IQueryPool::EQRF_WAIT_BIT;
-            logicalDevice->getQueryPoolResults(occlusionQueryPool.get(), 0u, 2u, sizeof(samples_passed), samples_passed, sizeof(uint64_t) * 2, queryResultFlags);
-            logger->log("[WAIT+AVAIL+64] SamplesPassed[0] = %d, SamplesPassed[1] = %d, Result Available = %d, %d", system::ILogger::ELL_INFO, samples_passed[0], samples_passed[2], samples_passed[1], samples_passed[3]);
-        }
-        {
-            uint32_t samples_passed[2] = {};
-            auto queryResultFlags = core::bitflag<video::IQueryPool::E_QUERY_RESULTS_FLAGS>(video::IQueryPool::EQRF_WAIT_BIT);
-            logicalDevice->getQueryPoolResults(occlusionQueryPool.get(), 0u, 2u, sizeof(samples_passed), samples_passed, sizeof(uint32_t), queryResultFlags);
-            logger->log("[WAIT] SamplesPassed[0] = %d, SamplesPassed[1] = %d", system::ILogger::ELL_INFO, samples_passed[0], samples_passed[1]);
-        }
-        {
-            uint64_t timestamps[4] = {};
-            auto queryResultFlags = core::bitflag<video::IQueryPool::E_QUERY_RESULTS_FLAGS>(video::IQueryPool::EQRF_WAIT_BIT) | video::IQueryPool::EQRF_WITH_AVAILABILITY_BIT | video::IQueryPool::EQRF_64_BIT;
-            logicalDevice->getQueryPoolResults(timestampQueryPool.get(), 0u, 2u, sizeof(timestamps), timestamps, sizeof(uint64_t) * 2ull, queryResultFlags);
-            float timePassed = (timestamps[2] - timestamps[0]) * physicalDevice->getLimits().timestampPeriodInNanoSeconds;
-            logger->log("Time Passed (Seconds) = %f", system::ILogger::ELL_INFO, (timePassed * 1e-9));
-            logger->log("Timestamps availablity: %d, %d", system::ILogger::ELL_INFO, timestamps[1], timestamps[3]);
-        }
-#endif
-    }
-
-    APP_CONSTRUCTOR(MeshLoadersApp)
-    void onAppInitialized_impl() override
-    {
-        const auto swapchainImageUsage = static_cast<asset::IImage::E_USAGE_FLAGS>(asset::IImage::EUF_COLOR_ATTACHMENT_BIT | asset::IImage::EUF_TRANSFER_SRC_BIT);
-        CommonAPI::InitParams initParams;
-        initParams.window = core::smart_refctd_ptr(window);
-        initParams.apiType = video::EAT_VULKAN;
-        initParams.appName = { _NBL_APP_NAME_ };
-        initParams.framesInFlight = FRAMES_IN_FLIGHT;
-        initParams.windowWidth = WIN_W;
-        initParams.windowHeight = WIN_H;
-        initParams.swapchainImageCount = SC_IMG_COUNT;
-        initParams.swapchainImageUsage = swapchainImageUsage;
-        initParams.depthFormat = nbl::asset::EF_D32_SFLOAT;
-        auto initOutput = CommonAPI::InitWithDefaultExt(std::move(initParams));
-
-        window = std::move(initParams.window);
-        windowCb = std::move(initParams.windowCb);
-        apiConnection = std::move(initOutput.apiConnection);
-        surface = std::move(initOutput.surface);
-        utilities = std::move(initOutput.utilities);
-        logicalDevice = std::move(initOutput.logicalDevice);
-        physicalDevice = initOutput.physicalDevice;
-        queues = std::move(initOutput.queues);
-        renderpass = std::move(initOutput.renderToSwapchainRenderpass);
-        commandPools = std::move(initOutput.commandPools);
-        system = std::move(initOutput.system);
-        assetManager = std::move(initOutput.assetManager);
-        cpu2gpuParams = std::move(initOutput.cpu2gpuParams);
-        logger = std::move(initOutput.logger);
-        inputSystem = std::move(initOutput.inputSystem);
-        m_swapchainCreationParams = std::move(initOutput.swapchainCreationParams);
-
-        CommonAPI::createSwapchain(std::move(logicalDevice), m_swapchainCreationParams, WIN_W, WIN_H, swapchain);
-        assert(swapchain);
-        fbo = CommonAPI::createFBOWithSwapchainImages(
-            swapchain->getImageCount(), WIN_W, WIN_H,
-            logicalDevice, swapchain, renderpass,
-            nbl::asset::EF_D32_SFLOAT
-        );
-        
-        // Occlusion Query
-        {
-            video::IQueryPool::SCreationParams queryPoolCreationParams = {};
-            queryPoolCreationParams.queryType = video::IQueryPool::EQT_OCCLUSION;
-            queryPoolCreationParams.queryCount = 2u;
-            occlusionQueryPool = logicalDevice->createQueryPool(std::move(queryPoolCreationParams));
-        }
-
-        // Timestamp Query
-        video::IQueryPool::SCreationParams queryPoolCreationParams = {};
-        {
-            video::IQueryPool::SCreationParams queryPoolCreationParams = {};
-            queryPoolCreationParams.queryType = video::IQueryPool::EQT_TIMESTAMP;
-            queryPoolCreationParams.queryCount = 2u;
-            timestampQueryPool = logicalDevice->createQueryPool(std::move(queryPoolCreationParams));
-        }
-
-        {
-            // SAMPLES_PASSED_0 + AVAILABILIY_0 + SAMPLES_PASSED_1 + AVAILABILIY_1 (uint32_t)
-            const size_t queriesSize = sizeof(uint32_t) * 4;
-            video::IGPUBuffer::SCreationParams gpuuboCreationParams;
-            gpuuboCreationParams.size = queriesSize;
-            gpuuboCreationParams.usage = core::bitflag<asset::IBuffer::E_USAGE_FLAGS>(asset::IBuffer::EUF_UNIFORM_BUFFER_BIT)|asset::IBuffer::EUF_TRANSFER_DST_BIT|asset::IBuffer::EUF_INLINE_UPDATE_VIA_CMDBUF;
-            gpuuboCreationParams.queueFamilyIndexCount = 0u;
-            gpuuboCreationParams.queueFamilyIndices = nullptr;
-
-            queryResultsBuffer = logicalDevice->createBuffer(std::move(gpuuboCreationParams));
-            auto memReqs = queryResultsBuffer->getMemoryReqs();
-            memReqs.memoryTypeBits &= physicalDevice->getDeviceLocalMemoryTypeBits();
-            auto queriesMem = logicalDevice->allocate(memReqs, queryResultsBuffer.get());
-
-            queryResultsBuffer->setObjectDebugName("QueryResults");
-        }
-
-        nbl::video::IGPUObjectFromAssetConverter cpu2gpu;
-        {
-            auto* quantNormalCache = assetManager->getMeshManipulator()->getQuantNormalCache();
-            quantNormalCache->loadCacheFromFile<asset::EF_A2B10G10R10_SNORM_PACK32>(system.get(), sharedOutputCWD / "normalCache101010.sse");
-
-            system::path archPath = sharedInputCWD / "sponza.zip";
-            auto arch = system->openFileArchive(archPath);
-            // test no alias loading (TODO: fix loading from absolute paths)
-            system->mount(std::move(arch));
-            asset::IAssetLoader::SAssetLoadParams loadParams;
-            loadParams.workingDirectory = sharedInputCWD;
-            loadParams.logger = logger.get();
-            auto meshes_bundle = assetManager->getAsset((sharedInputCWD / "sponza.zip/sponza.obj").string(), loadParams);
-            assert(!meshes_bundle.getContents().empty());
-
-            metaOBJ = meshes_bundle.getMetadata()->selfCast<const asset::COBJMetadata>();
-
-            auto cpuMesh = meshes_bundle.getContents().begin()[0];
-            meshRaw = static_cast<asset::ICPUMesh*>(cpuMesh.get());
-
-            quantNormalCache->saveCacheToFile<asset::EF_A2B10G10R10_SNORM_PACK32>(system.get(), sharedOutputCWD / "normalCache101010.sse");
-        }
-
-        // Fix FrontFace and BlendParams for meshBuffers
-        for (size_t i = 0ull; i < meshRaw->getMeshBuffers().size(); ++i)
-        {
-            auto& meshBuffer = meshRaw->getMeshBuffers().begin()[i];
-            meshBuffer->getPipeline()->getRasterizationParams().frontFaceIsCCW = false;
-        }
-
-        // we can safely assume that all meshbuffers within mesh loaded from OBJ has same DS1 layout (used for camera-specific data)
-        firstMeshBuffer = *meshRaw->getMeshBuffers().begin();
-        pipelineMetadata = metaOBJ->getAssetSpecificMetadata(firstMeshBuffer->getPipeline());
-
-        // so we can create just one DS
-        const asset::ICPUDescriptorSetLayout* ds1layout = firstMeshBuffer->getPipeline()->getLayout()->getDescriptorSetLayout(1u);
-        ds1UboBinding = ds1layout->getDescriptorRedirect(asset::IDescriptor::E_TYPE::ET_UNIFORM_BUFFER).getBinding(asset::ICPUDescriptorSetLayout::CBindingRedirect::storage_range_index_t{ 0 }).data;
-
-        size_t neededDS1UBOsz = 0ull;
-        {
-            for (const auto& shdrIn : pipelineMetadata->m_inputSemantics)
-                if (shdrIn.descriptorSection.type == asset::IRenderpassIndependentPipelineMetadata::ShaderInput::E_TYPE::ET_UNIFORM_BUFFER && shdrIn.descriptorSection.uniformBufferObject.set == 1u && shdrIn.descriptorSection.uniformBufferObject.binding == ds1UboBinding)
-                    neededDS1UBOsz = std::max<size_t>(neededDS1UBOsz, shdrIn.descriptorSection.uniformBufferObject.relByteoffset + shdrIn.descriptorSection.uniformBufferObject.bytesize);
-        }
-
-        core::smart_refctd_ptr<video::IGPUDescriptorSetLayout> gpuds1layout;
-        {
-            auto gpu_array = cpu2gpu.getGPUObjectsFromAssets(&ds1layout, &ds1layout + 1, cpu2gpuParams);
-            if (!gpu_array || gpu_array->size() < 1u || !(*gpu_array)[0])
-                assert(false);
-
-            gpuds1layout = (*gpu_array)[0];
-        }
-
-        core::smart_refctd_ptr<video::IDescriptorPool> descriptorPool = nullptr;
-        {
-            video::IDescriptorPool::SCreateInfo createInfo = {};
-            createInfo.maxSets = 1u;
-            createInfo.maxDescriptorCount[static_cast<uint32_t>(asset::IDescriptor::E_TYPE::ET_UNIFORM_BUFFER)] = 1u;
-            descriptorPool = logicalDevice->createDescriptorPool(std::move(createInfo));
-        }
-
-        video::IGPUBuffer::SCreationParams gpuuboCreationParams;
-        gpuuboCreationParams.size = neededDS1UBOsz;
-        gpuuboCreationParams.usage = core::bitflag<asset::IBuffer::E_USAGE_FLAGS>(asset::IBuffer::EUF_UNIFORM_BUFFER_BIT) | asset::IBuffer::EUF_TRANSFER_DST_BIT | asset::IBuffer::EUF_INLINE_UPDATE_VIA_CMDBUF;
-        gpuuboCreationParams.queueFamilyIndexCount = 0u;
-        gpuuboCreationParams.queueFamilyIndices = nullptr;
-
-        gpuubo = logicalDevice->createBuffer(std::move(gpuuboCreationParams));
-        auto gpuuboMemReqs = gpuubo->getMemoryReqs();
-        gpuuboMemReqs.memoryTypeBits &= physicalDevice->getDeviceLocalMemoryTypeBits();
-        auto uboMemoryOffset = logicalDevice->allocate(gpuuboMemReqs, gpuubo.get(), video::IDeviceMemoryAllocation::E_MEMORY_ALLOCATE_FLAGS::EMAF_NONE);
-
-        gpuds1 = descriptorPool->createDescriptorSet(std::move(gpuds1layout));
-
-        {
-            video::IGPUDescriptorSet::SWriteDescriptorSet write;
-            write.dstSet = gpuds1.get();
-            write.binding = ds1UboBinding;
-            write.count = 1u;
-            write.arrayElement = 0u;
-            write.descriptorType = asset::IDescriptor::E_TYPE::ET_UNIFORM_BUFFER;
-            video::IGPUDescriptorSet::SDescriptorInfo info;
-            {
-                info.desc = gpuubo;
-                info.info.buffer.offset = 0ull;
-                info.info.buffer.size = neededDS1UBOsz;
-            }
-            write.info = &info;
-            logicalDevice->updateDescriptorSets(1u, &write, 0u, nullptr);
-        }
-        {
-            cpu2gpuParams.beginCommandBuffers();
-
-            auto gpu_array = cpu2gpu.getGPUObjectsFromAssets(&meshRaw, &meshRaw + 1, cpu2gpuParams);
-            if (!gpu_array || gpu_array->size() < 1u || !(*gpu_array)[0])
-                assert(false);
-            
-            cpu2gpuParams.waitForCreationToComplete(false);
-
-            gpumesh = (*gpu_array)[0];
-        }
-       
-        {
-            for (size_t i = 0; i < gpumesh->getMeshBuffers().size(); ++i)
-            {
-                auto gpuIndependentPipeline = gpumesh->getMeshBuffers().begin()[i]->getPipeline();
-
-                nbl::video::IGPUGraphicsPipeline::SCreationParams graphicsPipelineParams;
-                graphicsPipelineParams.renderpassIndependent = core::smart_refctd_ptr<nbl::video::IGPURenderpassIndependentPipeline>(const_cast<video::IGPURenderpassIndependentPipeline*>(gpuIndependentPipeline));
-                graphicsPipelineParams.renderpass = core::smart_refctd_ptr(renderpass);
-
-                const RENDERPASS_INDEPENDENT_PIPELINE_ADRESS adress = reinterpret_cast<RENDERPASS_INDEPENDENT_PIPELINE_ADRESS>(graphicsPipelineParams.renderpassIndependent.get());
-                gpuPipelines[adress] = logicalDevice->createGraphicsPipeline(nullptr, std::move(graphicsPipelineParams));
-            }
-        }
-
-        core::vectorSIMDf cameraPosition(-250.0f,177.0f,1.69f);
-        core::vectorSIMDf cameraTarget(50.0f,125.0f,-3.0f);
-        matrix4SIMD projectionMatrix = matrix4SIMD::buildProjectionMatrixPerspectiveFovLH(core::radians(60.0f), video::ISurface::getTransformedAspectRatio(swapchain->getPreTransform(), WIN_W, WIN_H), 0.1, 10000);
-        camera = Camera(cameraPosition, cameraTarget, projectionMatrix, 10.f, 1.f);
-        lastTime = std::chrono::steady_clock::now();
-
-        for (size_t i = 0ull; i < NBL_FRAMES_TO_AVERAGE; ++i)
-            dtList[i] = 0.0;
-
-        oracle.reportBeginFrameRecord();
-        
-
-       const auto& graphicsCommandPools = commandPools[CommonAPI::InitOutput::EQT_GRAPHICS];
-		for (uint32_t i = 0u; i < FRAMES_IN_FLIGHT; i++)
-		{
-			logicalDevice->createCommandBuffers(graphicsCommandPools[i].get(), video::IGPUCommandBuffer::EL_PRIMARY, 1, commandBuffers+i);
-            imageAcquire[i] = logicalDevice->createSemaphore();
-            renderFinished[i] = logicalDevice->createSemaphore();
-        }
-
-        constexpr uint64_t MAX_TIMEOUT = 99999999999999ull;
-        uint32_t acquiredNextFBO = {};
-        resourceIx = -1;
-    }
-    void onAppTerminated_impl() override
-    {
-        const auto& fboCreationParams = fbo->begin()[acquiredNextFBO]->getCreationParameters();
-        auto gpuSourceImageView = fboCreationParams.attachments[0];
-
-        bool status = ext::ScreenShot::createScreenShot(
-            logicalDevice.get(),
-            queues[CommonAPI::InitOutput::EQT_TRANSFER_DOWN],
-            renderFinished[resourceIx].get(),
-            gpuSourceImageView.get(),
-            assetManager.get(),
-            "ScreenShot.png",
-            asset::IImage::EL_PRESENT_SRC,
-            asset::EAF_NONE);
-
-        assert(status);
-        logicalDevice->waitIdle();
-    }
-    void workLoopBody() override
-    {
-        ++resourceIx;
-        if (resourceIx >= FRAMES_IN_FLIGHT)
-            resourceIx = 0;
-
-        auto& commandBuffer = commandBuffers[resourceIx];
-        auto& fence = frameComplete[resourceIx];
-        if (fence)
-            logicalDevice->blockForFences(1u, &fence.get());
-        else
-            fence = logicalDevice->createFence(static_cast<video::IGPUFence::E_CREATE_FLAGS>(0));
-
-        commandBuffer->reset(nbl::video::IGPUCommandBuffer::ERF_RELEASE_RESOURCES_BIT);
-        commandBuffer->begin(nbl::video::IGPUCommandBuffer::EU_NONE);
-
-        const auto nextPresentationTimestamp = oracle.acquireNextImage(swapchain.get(), imageAcquire[resourceIx].get(), nullptr, &acquiredNextFBO);
-        {
-            inputSystem->getDefaultMouse(&mouse);
-            inputSystem->getDefaultKeyboard(&keyboard);
-
-            camera.beginInputProcessing(nextPresentationTimestamp);
-            mouse.consumeEvents([&](const IMouseEventChannel::range_t& events) -> void { camera.mouseProcess(events); }, logger.get());
-            keyboard.consumeEvents([&](const IKeyboardEventChannel::range_t& events) -> void { camera.keyboardProcess(events); }, logger.get());
-            camera.endInputProcessing(nextPresentationTimestamp);
-        }
-
-        const auto& viewMatrix = camera.getViewMatrix();
-        const auto& viewProjectionMatrix = matrix4SIMD::concatenateBFollowedByAPrecisely(
-            video::ISurface::getSurfaceTransformationMatrix(swapchain->getPreTransform()),
-            camera.getConcatenatedMatrix()
-        );
-
-        asset::SViewport viewport;
-        viewport.minDepth = 1.f;
-        viewport.maxDepth = 0.f;
-        viewport.x = 0u;
-        viewport.y = 0u;
-        viewport.width = WIN_W;
-        viewport.height = WIN_H;
-        commandBuffer->setViewport(0u, 1u, &viewport);
-        
-        VkRect2D scissor = {};
-        scissor.offset = { 0, 0 };
-        scissor.extent = { WIN_W, WIN_H };
-        commandBuffer->setScissor(0u, 1u, &scissor);
-
-        core::matrix3x4SIMD modelMatrix;
-        modelMatrix.setTranslation(nbl::core::vectorSIMDf(0, 0, 0, 0));
-        core::matrix4SIMD mvp = core::concatenateBFollowedByA(viewProjectionMatrix, modelMatrix);
-
-        const size_t uboSize = gpuubo->getSize();
-        core::vector<uint8_t> uboData(uboSize);
-        for (const auto& shdrIn : pipelineMetadata->m_inputSemantics)
-        {
-            if (shdrIn.descriptorSection.type == asset::IRenderpassIndependentPipelineMetadata::ShaderInput::E_TYPE::ET_UNIFORM_BUFFER && shdrIn.descriptorSection.uniformBufferObject.set == 1u && shdrIn.descriptorSection.uniformBufferObject.binding == ds1UboBinding)
-            {
-                switch (shdrIn.type)
-                {
-                case asset::IRenderpassIndependentPipelineMetadata::ECSI_WORLD_VIEW_PROJ:
-                {
-                    memcpy(uboData.data() + shdrIn.descriptorSection.uniformBufferObject.relByteoffset, mvp.pointer(), shdrIn.descriptorSection.uniformBufferObject.bytesize);
-                } break;
-
-                case asset::IRenderpassIndependentPipelineMetadata::ECSI_WORLD_VIEW:
-                {
-                    memcpy(uboData.data() + shdrIn.descriptorSection.uniformBufferObject.relByteoffset, viewMatrix.pointer(), shdrIn.descriptorSection.uniformBufferObject.bytesize);
-                } break;
-
-                case asset::IRenderpassIndependentPipelineMetadata::ECSI_WORLD_VIEW_INVERSE_TRANSPOSE:
-                {
-                    memcpy(uboData.data() + shdrIn.descriptorSection.uniformBufferObject.relByteoffset, viewMatrix.pointer(), shdrIn.descriptorSection.uniformBufferObject.bytesize);
-                } break;
-                }
-            }
-        }
-        commandBuffer->updateBuffer(gpuubo.get(), 0ull, uboSize, uboData.data());
-        
-        nbl::video::IGPUCommandBuffer::SRenderpassBeginInfo beginInfo;
-        {
-            VkRect2D area;
-            area.offset = { 0,0 };
-            area.extent = { WIN_W, WIN_H };
-            asset::SClearValue clear[2] = {};
-            clear[0].color.float32[0] = 1.f;
-            clear[0].color.float32[1] = 1.f;
-            clear[0].color.float32[2] = 1.f;
-            clear[0].color.float32[3] = 1.f;
-            clear[1].depthStencil.depth = 0.f;
-
-            beginInfo.clearValueCount = 2u;
-            beginInfo.framebuffer = fbo->begin()[acquiredNextFBO];
-            beginInfo.renderpass = renderpass;
-            beginInfo.renderArea = area;
-            beginInfo.clearValues = clear;
-        }
-
-        commandBuffer->resetQueryPool(occlusionQueryPool.get(), 0u, 2u);
-        commandBuffer->resetQueryPool(timestampQueryPool.get(), 0u, 2u);
-        commandBuffer->beginRenderPass(&beginInfo, nbl::asset::ESC_INLINE);
-        
-        commandBuffer->writeTimestamp(asset::E_PIPELINE_STAGE_FLAGS::EPSF_TOP_OF_PIPE_BIT, timestampQueryPool.get(), 0u);
-        for (size_t i = 0; i < gpumesh->getMeshBuffers().size(); ++i)
-        {
-            if(i < 2)
-                commandBuffer->beginQuery(occlusionQueryPool.get(), i);
-            auto gpuMeshBuffer = gpumesh->getMeshBuffers().begin()[i];
-            auto gpuGraphicsPipeline = gpuPipelines[reinterpret_cast<RENDERPASS_INDEPENDENT_PIPELINE_ADRESS>(gpuMeshBuffer->getPipeline())];
-
-            const video::IGPURenderpassIndependentPipeline* gpuRenderpassIndependentPipeline = gpuMeshBuffer->getPipeline();
-            const video::IGPUDescriptorSet* ds3 = gpuMeshBuffer->getAttachedDescriptorSet();
-
-            commandBuffer->bindGraphicsPipeline(gpuGraphicsPipeline.get());
-
-            const video::IGPUDescriptorSet* gpuds1_ptr = gpuds1.get();
-            commandBuffer->bindDescriptorSets(asset::EPBP_GRAPHICS, gpuRenderpassIndependentPipeline->getLayout(), 1u, 1u, &gpuds1_ptr);
-            const video::IGPUDescriptorSet* gpuds3_ptr = gpuMeshBuffer->getAttachedDescriptorSet();
-            if (gpuds3_ptr)
-                commandBuffer->bindDescriptorSets(asset::EPBP_GRAPHICS, gpuRenderpassIndependentPipeline->getLayout(), 3u, 1u, &gpuds3_ptr);
-            commandBuffer->pushConstants(gpuRenderpassIndependentPipeline->getLayout(), asset::IShader::ESS_FRAGMENT, 0u, gpuMeshBuffer->MAX_PUSH_CONSTANT_BYTESIZE, gpuMeshBuffer->getPushConstantsDataPtr());
-
-            commandBuffer->drawMeshBuffer(gpuMeshBuffer);
-
-            if(i < 2)
-                commandBuffer->endQuery(occlusionQueryPool.get(), i);
-        }
-        commandBuffer->writeTimestamp(asset::E_PIPELINE_STAGE_FLAGS::EPSF_BOTTOM_OF_PIPE_BIT, timestampQueryPool.get(), 1u);
-
-        commandBuffer->endRenderPass();
-
-        auto queryResultFlags = core::bitflag<video::IQueryPool::E_QUERY_RESULTS_FLAGS>(video::IQueryPool::EQRF_WAIT_BIT) | video::IQueryPool::EQRF_WITH_AVAILABILITY_BIT;
-        commandBuffer->copyQueryPoolResults(occlusionQueryPool.get(), 0, 2, queryResultsBuffer.get(), 0u, sizeof(uint32_t) * 2, queryResultFlags);
-
-        commandBuffer->end();
-        
-        logicalDevice->resetFences(1, &fence.get());
-        CommonAPI::Submit(
-            logicalDevice.get(),
-            commandBuffer.get(),
-            queues[CommonAPI::InitOutput::EQT_COMPUTE],
-            imageAcquire[resourceIx].get(),
-            renderFinished[resourceIx].get(),
-            fence.get());
-        CommonAPI::Present(logicalDevice.get(), 
-            swapchain.get(),
-            queues[CommonAPI::InitOutput::EQT_GRAPHICS], renderFinished[resourceIx].get(), acquiredNextFBO);
-
-        getAndLogQueryPoolResults();
-    }
-    bool keepRunning() override
-    {
-        return windowCb->isWindowOpen();
-    }
-};
-
-NBL_COMMON_API_MAIN(MeshLoadersApp)
diff --git a/06_MeshLoaders/pipeline.groovy b/06_MeshLoaders/pipeline.groovy
deleted file mode 100644
index 0923d296f..000000000
--- a/06_MeshLoaders/pipeline.groovy
+++ /dev/null
@@ -1,50 +0,0 @@
-import org.DevshGraphicsProgramming.Agent
-import org.DevshGraphicsProgramming.BuilderInfo
-import org.DevshGraphicsProgramming.IBuilder
-
-class CMeshLoadersBuilder extends IBuilder
-{
-	public CMeshLoadersBuilder(Agent _agent, _info)
-	{
-		super(_agent, _info)
-	}
-	
-	@Override
-	public boolean prepare(Map axisMapping)
-	{
-		return true
-	}
-	
-	@Override
-  	public boolean build(Map axisMapping)
-	{
-		IBuilder.CONFIGURATION config = axisMapping.get("CONFIGURATION")
-		IBuilder.BUILD_TYPE buildType = axisMapping.get("BUILD_TYPE")
-		
-		def nameOfBuildDirectory = getNameOfBuildDirectory(buildType)
-		def nameOfConfig = getNameOfConfig(config)
-		
-		agent.execute("cmake --build ${info.rootProjectPath}/${nameOfBuildDirectory}/${info.targetProjectPathRelativeToRoot} --target ${info.targetBaseName} --config ${nameOfConfig} -j12 -v")
-		
-		return true
-	}
-	
-	@Override
-  	public boolean test(Map axisMapping)
-	{
-		return true
-	}
-	
-	@Override
-	public boolean install(Map axisMapping)
-	{
-		return true
-	}
-}
-
-def create(Agent _agent, _info)
-{
-	return new CMeshLoadersBuilder(_agent, _info)
-}
-
-return this
\ No newline at end of file

From f494d859787936e0308e6661dffce9043361935e Mon Sep 17 00:00:00 2001
From: devsh <devsh@devsh.eu>
Date: Sat, 21 Jun 2025 23:49:36 +0200
Subject: [PATCH 411/529] yes `NBL_EMBED_BUILTIN_RESOURCES` should control both
 Nable and examples_tests w.r.t. the question of
 `MonoAssetManagerAndBuiltinResourceApplication.hpp`

There really isn't a usecase for:
- embedding Nabla resources but not embedding example (there's no harm in Nabla resources not being embedded)
- even more strangely, not embedding Nabla resources, but embedding example.
---
 common/include/nbl/examples/PCH.hpp | 14 ++------------
 1 file changed, 2 insertions(+), 12 deletions(-)

diff --git a/common/include/nbl/examples/PCH.hpp b/common/include/nbl/examples/PCH.hpp
index 5316ce2e8..3b1e6beaa 100644
--- a/common/include/nbl/examples/PCH.hpp
+++ b/common/include/nbl/examples/PCH.hpp
@@ -15,18 +15,8 @@
 
 //! Common example interface headers
 
-// why isnt this in `nabla.h` ?
-/*
-    because it does stuff like
-
-    #ifdef NBL_EMBED_BUILTIN_RESOURCES
-    #include "nbl/this_example/builtin/CArchive.h"
-    #endif
-
-    hence also cannot be there in PCH but rather in examples.h -> compile errors
-    but only *if* we decide each example handles builtins on NBL_EMBED_BUILTIN_RESOURCES
-*/
-// #include "nbl/application_templates/MonoAssetManagerAndBuiltinResourceApplication.hpp"
+// TODO: examine moving this header to `nbl/examples/common`
+#include "nbl/application_templates/MonoAssetManagerAndBuiltinResourceApplication.hpp"
 
 #include "nbl/examples/common/SimpleWindowedApplication.hpp"
 #include "nbl/examples/common/MonoWindowApplication.hpp"

From ce268fb50f481d94e3db8d5e3eee69ffa3e30af2 Mon Sep 17 00:00:00 2001
From: devsh <devsh@devsh.eu>
Date: Sun, 22 Jun 2025 01:14:05 +0200
Subject: [PATCH 412/529] Adjust all examples after splitting
 `MonoAssetManagerAndBuiltinResourceApplication` in two

---
 03_DeviceSelectionAndSharedSources/Testers.h  |  3 +-
 03_DeviceSelectionAndSharedSources/main.cpp   | 17 +++--
 .../main.cpp                                  |  6 +-
 06_HelloGraphicsQueue/main.cpp                | 20 +++---
 07_StagingAndMultipleQueues/main.cpp          | 15 +++--
 09_GeometryCreator/main.cpp                   | 12 ++--
 10_CountingSort/main.cpp                      | 15 +++--
 11_FFT/main.cpp                               | 21 +++----
 22_CppCompat/CIntrinsicsTester.h              |  7 ++-
 22_CppCompat/CTgmathTester.h                  |  7 ++-
 22_CppCompat/ITester.h                        |  7 ++-
 22_CppCompat/main.cpp                         | 22 +++----
 23_Arithmetic2UnitTest/main.cpp               | 12 ++--
 24_ColorSpaceTest/main.cpp                    |  4 +-
 26_Blur/main.cpp                              |  8 ++-
 27_MPMCScheduler/main.cpp                     |  7 ++-
 28_FFTBloom/main.cpp                          | 19 +++---
 29_Arithmetic2Bench/main.cpp                  |  4 +-
 30_ComputeShaderPathTracer/main.cpp           | 23 ++++---
 64_EmulatedFloatTest/main.cpp                 | 13 ++--
 68_JpegLoading/main.cpp                       | 22 ++++---
 70_FLIPFluids/main.cpp                        |  7 ++-
 common/include/nbl/examples/PCH.hpp           |  4 +-
 .../common/BuiltinResourcesApplication.hpp    | 63 +++++++++++++++++++
 24 files changed, 219 insertions(+), 119 deletions(-)
 create mode 100644 common/include/nbl/examples/common/BuiltinResourcesApplication.hpp

diff --git a/03_DeviceSelectionAndSharedSources/Testers.h b/03_DeviceSelectionAndSharedSources/Testers.h
index 9a4016d20..f957e50a0 100644
--- a/03_DeviceSelectionAndSharedSources/Testers.h
+++ b/03_DeviceSelectionAndSharedSources/Testers.h
@@ -4,8 +4,7 @@
 #ifndef _NBL_TESTERS_H_INCLUDED_
 #define _NBL_TESTERS_H_INCLUDED_
 
-#include "nbl/application_templates/MonoDeviceApplication.hpp"
-#include "nbl/application_templates/MonoAssetManagerAndBuiltinResourceApplication.hpp"
+#include "nbl/examples/examples.hpp"
 
 using namespace nbl;
 
diff --git a/03_DeviceSelectionAndSharedSources/main.cpp b/03_DeviceSelectionAndSharedSources/main.cpp
index 6c99aff7f..c09228ce5 100644
--- a/03_DeviceSelectionAndSharedSources/main.cpp
+++ b/03_DeviceSelectionAndSharedSources/main.cpp
@@ -2,15 +2,20 @@
 // This file is part of the "Nabla Engine".
 // For conditions of distribution and use, see copyright notice in nabla.h
 
+
 #include "nbl/examples/examples.hpp"
 // TODO: why isn't this in `nabla.h` ?
 #include "nbl/asset/metadata/CHLSLMetadata.h"
 
+
 using namespace nbl;
-using namespace core;
-using namespace system;
-using namespace asset;
-using namespace video;
+using namespace nbl::core;
+using namespace nbl::hlsl;
+using namespace nbl::system;
+using namespace nbl::asset;
+using namespace nbl::ui;
+using namespace nbl::video;
+using namespace nbl::examples;
 
 // TODO[Przemek]: update comments
 
@@ -21,10 +26,10 @@ using namespace video;
 constexpr bool ENABLE_TESTS = false;
 
 // This time we create the device in the base class and also use a base class to give us an Asset Manager and an already mounted built-in resource archive
-class DeviceSelectionAndSharedSourcesApp final : public application_templates::MonoDeviceApplication, public application_templates::MonoAssetManagerAndBuiltinResourceApplication
+class DeviceSelectionAndSharedSourcesApp final : public application_templates::MonoDeviceApplication, public BuiltinResourcesApplication
 {
 	using device_base_t = application_templates::MonoDeviceApplication;
-	using asset_base_t = application_templates::MonoAssetManagerAndBuiltinResourceApplication;
+	using asset_base_t = BuiltinResourcesApplication;
 public:
 	// Yay thanks to multiple inheritance we cannot forward ctors anymore
 	DeviceSelectionAndSharedSourcesApp(const path& _localInputCWD, const path& _localOutputCWD, const path& _sharedInputCWD, const path& _sharedOutputCWD) :
diff --git a/05_StreamingAndBufferDeviceAddressApp/main.cpp b/05_StreamingAndBufferDeviceAddressApp/main.cpp
index f98e38f66..a648acefb 100644
--- a/05_StreamingAndBufferDeviceAddressApp/main.cpp
+++ b/05_StreamingAndBufferDeviceAddressApp/main.cpp
@@ -5,7 +5,7 @@
 
 // I've moved out a tiny part of this example into a shared header for reuse, please open and read it.
 #include "nbl/application_templates/MonoDeviceApplication.hpp"
-#include "nbl/application_templates/MonoAssetManagerAndBuiltinResourceApplication.hpp"
+#include "nbl/examples/common/BuiltinResourcesApplication.hpp"
 
 
 using namespace nbl;
@@ -20,10 +20,10 @@ using namespace video;
 
 
 // In this application we'll cover buffer streaming, Buffer Device Address (BDA) and push constants 
-class StreamingAndBufferDeviceAddressApp final : public application_templates::MonoDeviceApplication, public application_templates::MonoAssetManagerAndBuiltinResourceApplication
+class StreamingAndBufferDeviceAddressApp final : public application_templates::MonoDeviceApplication, public examples::BuiltinResourcesApplication
 {
 		using device_base_t = application_templates::MonoDeviceApplication;
-		using asset_base_t = application_templates::MonoAssetManagerAndBuiltinResourceApplication;
+		using asset_base_t = examples::BuiltinResourcesApplication;
 
 		// This is the first example that submits multiple workloads in-flight. 
 		// What the shader does is it computes the minimum distance of each point against K other random input points.
diff --git a/06_HelloGraphicsQueue/main.cpp b/06_HelloGraphicsQueue/main.cpp
index dc2f3ebb4..07d6affd3 100644
--- a/06_HelloGraphicsQueue/main.cpp
+++ b/06_HelloGraphicsQueue/main.cpp
@@ -3,18 +3,20 @@
 // For conditions of distribution and use, see copyright notice in nabla.h
 
 
-// I've moved out a tiny part of this example into a shared header for reuse, please open and read it.
-#include "nbl/application_templates/MonoDeviceApplication.hpp"
-#include "nbl/application_templates/MonoAssetManagerAndBuiltinResourceApplication.hpp"
+#include "nbl/examples/examples.hpp"
 
 #include "nbl/ext/ScreenShot/ScreenShot.h"
 
 
 using namespace nbl;
-using namespace core;
-using namespace system;
-using namespace asset;
-using namespace video;
+using namespace nbl::core;
+using namespace nbl::hlsl;
+using namespace nbl::system;
+using namespace nbl::asset;
+using namespace nbl::ui;
+using namespace nbl::video;
+using namespace nbl::examples;
+
 
 // Here we showcase the use of Graphics Queue only 
 // Steps we take in this example:
@@ -26,10 +28,10 @@ using namespace video;
 // - save the smallImg to disk
 // 
 // all without using IUtilities.
-class HelloGraphicsQueueApp final : public application_templates::MonoDeviceApplication, public application_templates::MonoAssetManagerAndBuiltinResourceApplication
+class HelloGraphicsQueueApp final : public application_templates::MonoDeviceApplication, public BuiltinResourcesApplication
 {
 		using device_base_t = application_templates::MonoDeviceApplication;
-		using asset_base_t = application_templates::MonoAssetManagerAndBuiltinResourceApplication;
+		using asset_base_t = BuiltinResourcesApplication;
 
 	public:
 		// Yay thanks to multiple inheritance we cannot forward ctors anymore.
diff --git a/07_StagingAndMultipleQueues/main.cpp b/07_StagingAndMultipleQueues/main.cpp
index a1a06f4f4..fc6bf4551 100644
--- a/07_StagingAndMultipleQueues/main.cpp
+++ b/07_StagingAndMultipleQueues/main.cpp
@@ -6,18 +6,21 @@
 #include "nbl/examples/examples.hpp"
 
 using namespace nbl;
-using namespace core;
-using namespace system;
-using namespace asset;
-using namespace video;
+using namespace nbl::core;
+using namespace nbl::hlsl;
+using namespace nbl::system;
+using namespace nbl::asset;
+using namespace nbl::ui;
+using namespace nbl::video;
+using namespace nbl::examples;
 
 #include "app_resources/common.hlsl"
 
 // This time we let the new base class score and pick queue families, as well as initialize `nbl::video::IUtilities` for us
-class StagingAndMultipleQueuesApp final : public application_templates::BasicMultiQueueApplication, public application_templates::MonoAssetManagerAndBuiltinResourceApplication
+class StagingAndMultipleQueuesApp final : public application_templates::BasicMultiQueueApplication, public BuiltinResourcesApplication
 {
 	using device_base_t = application_templates::BasicMultiQueueApplication;
-	using asset_base_t = application_templates::MonoAssetManagerAndBuiltinResourceApplication;
+	using asset_base_t = BuiltinResourcesApplication;
 
 	// TODO: would be cool if we used `system::ISystem::listItemsInDirectory(sharedInputCWD/"GLI")` as our dataset
 	static constexpr std::array imagesToLoad = {
diff --git a/09_GeometryCreator/main.cpp b/09_GeometryCreator/main.cpp
index 6fddd8282..38daebaa5 100644
--- a/09_GeometryCreator/main.cpp
+++ b/09_GeometryCreator/main.cpp
@@ -2,14 +2,14 @@
 // This file is part of the "Nabla Engine".
 // For conditions of distribution and use, see copyright notice in nabla.h
 
+
 #include "common.hpp"
 
-// TODO: Arek, we should have a `nbl::examples` class inheriting from `application_templates::MonoAssetManagerAndBuiltinResourceApplication` which
-// during `onAppInitialized` also mounts correct `common/include/nbl/examples` and `common/src/nbl/examples` as folder or builtin
-class GeometryCreatorApp final : public MonoWindowApplication, public application_templates::MonoAssetManagerAndBuiltinResourceApplication
+
+class GeometryCreatorApp final : public MonoWindowApplication, public BuiltinResourcesApplication
 {
 	using device_base_t = MonoWindowApplication;
-	using asset_base_t = application_templates::MonoAssetManagerAndBuiltinResourceApplication;
+	using asset_base_t = BuiltinResourcesApplication;
 
 	public:
 		GeometryCreatorApp(const path& _localInputCWD, const path& _localOutputCWD, const path& _sharedInputCWD, const path& _sharedOutputCWD)
@@ -58,10 +58,6 @@ class GeometryCreatorApp final : public MonoWindowApplication, public applicatio
 					.addtionalBufferOwnershipFamilies = addtionalBufferOwnershipFamilies
 				},patch
 			);
-
-			// TODO: this is plain wrong Arek
-			m_system->mount(make_smart_refctd_ptr<system::CMountDirectoryArchive>(localInputCWD/"../common/include/nbl/examples",smart_refctd_ptr(m_logger),m_system.get()),"nbl/examples");
-			m_system->mount(make_smart_refctd_ptr<system::CMountDirectoryArchive>(localInputCWD/"../common/src/nbl/examples",smart_refctd_ptr(m_logger),m_system.get()),"nbl/examples");
 			
 			auto scRes = static_cast<CDefaultSwapchainFramebuffers*>(m_surface->getSwapchainResources());
 			m_renderer = CSimpleDebugRenderer::create(m_assetMgr.get(),scRes->getRenderpass(),0,m_scene.get());
diff --git a/10_CountingSort/main.cpp b/10_CountingSort/main.cpp
index 0efc0518e..d51650919 100644
--- a/10_CountingSort/main.cpp
+++ b/10_CountingSort/main.cpp
@@ -1,18 +1,21 @@
 #include "nbl/examples/examples.hpp"
 
 using namespace nbl;
-using namespace core;
-using namespace system;
-using namespace asset;
-using namespace video;
+using namespace nbl::core;
+using namespace nbl::hlsl;
+using namespace nbl::system;
+using namespace nbl::asset;
+using namespace nbl::ui;
+using namespace nbl::video;
+using namespace nbl::examples;
 
 #include "app_resources/common.hlsl"
 #include "nbl/builtin/hlsl/bit.hlsl"
 
-class CountingSortApp final : public application_templates::MonoDeviceApplication, public application_templates::MonoAssetManagerAndBuiltinResourceApplication
+class CountingSortApp final : public application_templates::MonoDeviceApplication, public BuiltinResourcesApplication
 {
 		using device_base_t = application_templates::MonoDeviceApplication;
-		using asset_base_t = application_templates::MonoAssetManagerAndBuiltinResourceApplication;
+		using asset_base_t = BuiltinResourcesApplication;
 
 	public:
 		// Yay thanks to multiple inheritance we cannot forward ctors anymore
diff --git a/11_FFT/main.cpp b/11_FFT/main.cpp
index ad9bbfd47..3829e8481 100644
--- a/11_FFT/main.cpp
+++ b/11_FFT/main.cpp
@@ -3,17 +3,16 @@
 // For conditions of distribution and use, see copyright notice in nabla.h
 
 
-// I've moved out a tiny part of this example into a shared header for reuse, please open and read it.
-#include "nbl/application_templates/MonoDeviceApplication.hpp"
-#include "nbl/application_templates/MonoAssetManagerAndBuiltinResourceApplication.hpp"
-
+#include "nbl/examples/examples.hpp"
 
 using namespace nbl;
-using namespace core;
-using namespace system;
-using namespace asset;
-using namespace video;
-
+using namespace nbl::core;
+using namespace nbl::hlsl;
+using namespace nbl::system;
+using namespace nbl::asset;
+using namespace nbl::ui;
+using namespace nbl::video;
+using namespace nbl::examples;
 
 #include "app_resources/common.hlsl"
 #include "nbl/builtin/hlsl/bit.hlsl"
@@ -21,10 +20,10 @@ using namespace video;
 
 
 // Simple showcase of how to run FFT on a 1D array
-class FFT_Test final : public application_templates::MonoDeviceApplication, public application_templates::MonoAssetManagerAndBuiltinResourceApplication
+class FFT_Test final : public application_templates::MonoDeviceApplication, public BuiltinResourcesApplication
 {
 	using device_base_t = application_templates::MonoDeviceApplication;
-	using asset_base_t = application_templates::MonoAssetManagerAndBuiltinResourceApplication;
+	using asset_base_t = BuiltinResourcesApplication;
 
 	smart_refctd_ptr<IGPUComputePipeline> m_pipeline;
 
diff --git a/22_CppCompat/CIntrinsicsTester.h b/22_CppCompat/CIntrinsicsTester.h
index 77aa2c1ca..d053977c0 100644
--- a/22_CppCompat/CIntrinsicsTester.h
+++ b/22_CppCompat/CIntrinsicsTester.h
@@ -1,12 +1,13 @@
 #ifndef _NBL_EXAMPLES_TESTS_22_CPP_COMPAT_C_INTRINSICS_TESTER_INCLUDED_
 #define _NBL_EXAMPLES_TESTS_22_CPP_COMPAT_C_INTRINSICS_TESTER_INCLUDED_
 
-#include <nabla.h>
+
+#include "nbl/examples/examples.hpp"
+
 #include "app_resources/common.hlsl"
-#include "nbl/application_templates/MonoDeviceApplication.hpp"
-#include "nbl/application_templates/MonoAssetManagerAndBuiltinResourceApplication.hpp"
 #include "ITester.h"
 
+
 using namespace nbl;
 
 class CIntrinsicsTester final : public ITester
diff --git a/22_CppCompat/CTgmathTester.h b/22_CppCompat/CTgmathTester.h
index 6d2b23c73..63b0e483e 100644
--- a/22_CppCompat/CTgmathTester.h
+++ b/22_CppCompat/CTgmathTester.h
@@ -1,12 +1,13 @@
 #ifndef _NBL_EXAMPLES_TESTS_22_CPP_COMPAT_C_TGMATH_TESTER_INCLUDED_
 #define _NBL_EXAMPLES_TESTS_22_CPP_COMPAT_C_TGMATH_TESTER_INCLUDED_
 
-#include <nabla.h>
+
+#include "nbl/examples/examples.hpp"
+
 #include "app_resources/common.hlsl"
-#include "nbl/application_templates/MonoDeviceApplication.hpp"
-#include "nbl/application_templates/MonoAssetManagerAndBuiltinResourceApplication.hpp"
 #include "ITester.h"
 
+
 using namespace nbl;
 
 class CTgmathTester final : public ITester
diff --git a/22_CppCompat/ITester.h b/22_CppCompat/ITester.h
index 32138f198..9f2353c95 100644
--- a/22_CppCompat/ITester.h
+++ b/22_CppCompat/ITester.h
@@ -1,12 +1,13 @@
 #ifndef _NBL_EXAMPLES_TESTS_22_CPP_COMPAT_I_TESTER_INCLUDED_
 #define _NBL_EXAMPLES_TESTS_22_CPP_COMPAT_I_TESTER_INCLUDED_
 
-#include <nabla.h>
+
+#include "nbl/examples/examples.hpp"
+
 #include "app_resources/common.hlsl"
-#include "nbl/application_templates/MonoDeviceApplication.hpp"
-#include "nbl/application_templates/MonoAssetManagerAndBuiltinResourceApplication.hpp"
 #include "nbl/asset/metadata/CHLSLMetadata.h"
 
+
 using namespace nbl;
 
 class ITester 
diff --git a/22_CppCompat/main.cpp b/22_CppCompat/main.cpp
index a5a819d49..70c8d7b3a 100644
--- a/22_CppCompat/main.cpp
+++ b/22_CppCompat/main.cpp
@@ -1,26 +1,26 @@
 // Copyright (C) 2018-2024 - DevSH Graphics Programming Sp. z O.O.
 // This file is part of the "Nabla Engine".
 // For conditions of distribution and use, see copyright notice in nabla.h
-#include <nabla.h>
-#include <iostream>
-#include <cstdio>
-#include <assert.h>
 
-#include "nbl/application_templates/MonoDeviceApplication.hpp"
-#include "nbl/application_templates/MonoAssetManagerAndBuiltinResourceApplication.hpp"
 
 #include "app_resources/common.hlsl"
 
 #include "CTgmathTester.h"
 #include "CIntrinsicsTester.h"
 
+#include <iostream>
+#include <cstdio>
+#include <assert.h>
+
+
+using namespace nbl;
 using namespace nbl::core;
 using namespace nbl::hlsl;
 using namespace nbl::system;
 using namespace nbl::asset;
+using namespace nbl::ui;
 using namespace nbl::video;
-using namespace nbl::application_templates;
-
+using namespace nbl::examples;
 
 //using namespace glm;
 
@@ -43,10 +43,10 @@ struct T
     float32_t4      h;
 };
 
-class CompatibilityTest final : public MonoDeviceApplication, public MonoAssetManagerAndBuiltinResourceApplication
+class CompatibilityTest final : public application_templates::MonoDeviceApplication, public BuiltinResourcesApplication
 {
-    using device_base_t = MonoDeviceApplication;
-    using asset_base_t = MonoAssetManagerAndBuiltinResourceApplication;
+    using device_base_t = application_templates::MonoDeviceApplication;
+    using asset_base_t = BuiltinResourcesApplication;
 public:
     CompatibilityTest(const path& _localInputCWD, const path& _localOutputCWD, const path& _sharedInputCWD, const path& _sharedOutputCWD) :
         IApplicationFramework(_localInputCWD, _localOutputCWD, _sharedInputCWD, _sharedOutputCWD) {}
diff --git a/23_Arithmetic2UnitTest/main.cpp b/23_Arithmetic2UnitTest/main.cpp
index da0d3de7d..3939fd443 100644
--- a/23_Arithmetic2UnitTest/main.cpp
+++ b/23_Arithmetic2UnitTest/main.cpp
@@ -1,9 +1,13 @@
-#include "nbl/application_templates/BasicMultiQueueApplication.hpp"
-#include "nbl/application_templates/MonoAssetManagerAndBuiltinResourceApplication.hpp"
+// TODO: copyright notice
+
+
+#include "nbl/examples/examples.hpp"
+
 #include "app_resources/common.hlsl"
 #include "nbl/builtin/hlsl/workgroup2/arithmetic_config.hlsl"
 #include "nbl/builtin/hlsl/subgroup2/arithmetic_params.hlsl"
 
+
 using namespace nbl;
 using namespace core;
 using namespace asset;
@@ -47,10 +51,10 @@ struct emulatedScanExclusive
 	static inline constexpr const char* name = "exclusive_scan";
 };
 
-class Workgroup2ScanTestApp final : public application_templates::BasicMultiQueueApplication, public application_templates::MonoAssetManagerAndBuiltinResourceApplication
+class Workgroup2ScanTestApp final : public application_templates::BasicMultiQueueApplication, public examples::BuiltinResourcesApplication
 {
 	using device_base_t = application_templates::BasicMultiQueueApplication;
-	using asset_base_t = application_templates::MonoAssetManagerAndBuiltinResourceApplication;
+	using asset_base_t = examples::BuiltinResourcesApplication;
 
 public:
 	Workgroup2ScanTestApp(const path& _localInputCWD, const path& _localOutputCWD, const path& _sharedInputCWD, const path& _sharedOutputCWD) :
diff --git a/24_ColorSpaceTest/main.cpp b/24_ColorSpaceTest/main.cpp
index 56af4fc79..84c55ef3a 100644
--- a/24_ColorSpaceTest/main.cpp
+++ b/24_ColorSpaceTest/main.cpp
@@ -22,10 +22,10 @@ using namespace nbl::examples;
 // defines for sampler tests can be found in the file below
 #include "app_resources/push_constants.hlsl"
 
-class ColorSpaceTestSampleApp final : public SimpleWindowedApplication, public application_templates::MonoAssetManagerAndBuiltinResourceApplication
+class ColorSpaceTestSampleApp final : public SimpleWindowedApplication, public BuiltinResourcesApplication
 {
 		using device_base_t = SimpleWindowedApplication;
-		using asset_base_t = application_templates::MonoAssetManagerAndBuiltinResourceApplication;
+		using asset_base_t = BuiltinResourcesApplication;
 		using clock_t = std::chrono::steady_clock;
 		using perf_clock_resolution_t = std::chrono::milliseconds;
 
diff --git a/26_Blur/main.cpp b/26_Blur/main.cpp
index e5105c778..83cf140d6 100644
--- a/26_Blur/main.cpp
+++ b/26_Blur/main.cpp
@@ -1,6 +1,8 @@
 // Copyright (C) 2024-2025 - DevSH Graphics Programming Sp. z O.O.
 // This file is part of the "Nabla Engine".
 // For conditions of distribution and use, see copyright notice in nabla.h
+
+
 #include "nbl/examples/examples.hpp"
 
 #include <bit>
@@ -16,10 +18,12 @@ using namespace nbl::examples;
 
 #include "app_resources/common.hlsl"
 
-class BlurApp final : public SimpleWindowedApplication, public application_templates::MonoAssetManagerAndBuiltinResourceApplication
+
+
+class BlurApp final : public SimpleWindowedApplication, public BuiltinResourcesApplication
 {
 		using device_base_t = SimpleWindowedApplication;
-		using asset_base_t = application_templates::MonoAssetManagerAndBuiltinResourceApplication;
+		using asset_base_t = BuiltinResourcesApplication;
 		using clock_t = std::chrono::steady_clock;
 
 	public:
diff --git a/27_MPMCScheduler/main.cpp b/27_MPMCScheduler/main.cpp
index 18d396135..580335a35 100644
--- a/27_MPMCScheduler/main.cpp
+++ b/27_MPMCScheduler/main.cpp
@@ -1,6 +1,8 @@
 // Copyright (C) 2024-2025 - DevSH Graphics Programming Sp. z O.O.
 // This file is part of the "Nabla Engine".
 // For conditions of distribution and use, see copyright notice in nabla.h
+
+
 #include "nbl/examples/examples.hpp"
 
 using namespace nbl;
@@ -13,10 +15,11 @@ using namespace nbl::examples;
 
 #include "app_resources/common.hlsl"
 
-class MPMCSchedulerApp final : public SimpleWindowedApplication, public application_templates::MonoAssetManagerAndBuiltinResourceApplication
+
+class MPMCSchedulerApp final : public SimpleWindowedApplication, public BuiltinResourcesApplication
 {
 		using device_base_t = SimpleWindowedApplication;
-		using asset_base_t = application_templates::MonoAssetManagerAndBuiltinResourceApplication;
+		using asset_base_t = BuiltinResourcesApplication;
 		using clock_t = std::chrono::steady_clock;
 
 		constexpr static inline uint32_t WIN_W = 1280, WIN_H = 720;
diff --git a/28_FFTBloom/main.cpp b/28_FFTBloom/main.cpp
index 16835ecf6..049bbd581 100644
--- a/28_FFTBloom/main.cpp
+++ b/28_FFTBloom/main.cpp
@@ -1,27 +1,32 @@
 // Copyright (C) 2018-2024 - DevSH Graphics Programming Sp. z O.O.
 // This file is part of the "Nabla Engine".
 // For conditions of distribution and use, see copyright notice in nabla.h
+
+
 #include "nbl/examples/examples.hpp"
 
 using namespace nbl;
-using namespace core;
-using namespace system;
-using namespace asset;
-using namespace video;
-using namespace ui;
+using namespace nbl::core;
+using namespace nbl::hlsl;
+using namespace nbl::system;
+using namespace nbl::asset;
+using namespace nbl::ui;
+using namespace nbl::video;
 using namespace nbl::examples;
 
 #include "app_resources/common.hlsl"
 #include "nbl/builtin/hlsl/bit.hlsl"
 
+
+
 // Defaults that match this example's image
 constexpr uint32_t WIN_W = 1280;
 constexpr uint32_t WIN_H = 720;
 
-class FFTBloomApp final : public SimpleWindowedApplication, public application_templates::MonoAssetManagerAndBuiltinResourceApplication
+class FFTBloomApp final : public SimpleWindowedApplication, public BuiltinResourcesApplication
 {
 	using device_base_t = SimpleWindowedApplication;
-	using asset_base_t = application_templates::MonoAssetManagerAndBuiltinResourceApplication;
+	using asset_base_t = BuiltinResourcesApplication;
 	using clock_t = std::chrono::steady_clock;
 
 	// Windowed App members
diff --git a/29_Arithmetic2Bench/main.cpp b/29_Arithmetic2Bench/main.cpp
index 0a0e3b35f..75f483db0 100644
--- a/29_Arithmetic2Bench/main.cpp
+++ b/29_Arithmetic2Bench/main.cpp
@@ -167,10 +167,10 @@ class CExplicitSurfaceFormatResizeSurface final : public ISimpleManagedSurface
 };
 
 // NOTE added swapchain + drawing frames to be able to profile with Nsight, which still doesn't support profiling headless compute shaders
-class ArithmeticBenchApp final : public examples::SimpleWindowedApplication, public application_templates::MonoAssetManagerAndBuiltinResourceApplication
+class ArithmeticBenchApp final : public examples::SimpleWindowedApplication, public examples::BuiltinResourcesApplication
 {
 	using device_base_t = examples::SimpleWindowedApplication;
-	using asset_base_t = application_templates::MonoAssetManagerAndBuiltinResourceApplication;
+	using asset_base_t = examples::BuiltinResourcesApplication;
 
 	constexpr static inline uint32_t WIN_W = 1280;
 	constexpr static inline uint32_t WIN_H = 720;
diff --git a/30_ComputeShaderPathTracer/main.cpp b/30_ComputeShaderPathTracer/main.cpp
index 487388ea0..54bc64495 100644
--- a/30_ComputeShaderPathTracer/main.cpp
+++ b/30_ComputeShaderPathTracer/main.cpp
@@ -1,20 +1,23 @@
 // Copyright (C) 2018-2020 - DevSH Graphics Programming Sp. z O.O.
 // This file is part of the "Nabla Engine".
 // For conditions of distribution and use, see copyright notice in nabla.h
-#include "nbl/ext/FullScreenTriangle/FullScreenTriangle.h"
 
-#include "nbl/this_example/common.hpp"
 
+#include "nbl/examples/examples.hpp"
+
+#include "nbl/ext/FullScreenTriangle/FullScreenTriangle.h"
 #include "nbl/builtin/hlsl/surface_transform.h"
 
+#include "nbl/this_example/common.hpp"
+
 
 using namespace nbl;
-using namespace core;
-using namespace hlsl;
-using namespace system;
-using namespace asset;
-using namespace ui;
-using namespace video;
+using namespace nbl::core;
+using namespace nbl::hlsl;
+using namespace nbl::system;
+using namespace nbl::asset;
+using namespace nbl::ui;
+using namespace nbl::video;
 using namespace nbl::examples;
 
 // TODO: share push constants
@@ -26,10 +29,10 @@ struct PTPushConstant {
 
 // TODO: Add a QueryPool for timestamping once its ready (actually add IMGUI mspf plotter)
 // TODO: Do buffer creation using assConv
-class ComputeShaderPathtracer final : public SimpleWindowedApplication, public application_templates::MonoAssetManagerAndBuiltinResourceApplication
+class ComputeShaderPathtracer final : public SimpleWindowedApplication, public BuiltinResourcesApplication
 {
 		using device_base_t = SimpleWindowedApplication;
-		using asset_base_t = application_templates::MonoAssetManagerAndBuiltinResourceApplication;
+		using asset_base_t = BuiltinResourcesApplication;
 		using clock_t = std::chrono::steady_clock;
 
 		enum E_LIGHT_GEOMETRY : uint8_t
diff --git a/64_EmulatedFloatTest/main.cpp b/64_EmulatedFloatTest/main.cpp
index b44cb2b4e..fd3e465e7 100644
--- a/64_EmulatedFloatTest/main.cpp
+++ b/64_EmulatedFloatTest/main.cpp
@@ -1,35 +1,38 @@
 // Copyright (C) 2018-2024 - DevSH Graphics Programming Sp. z O.O.
 // This file is part of the "Nabla Engine".
 // For conditions of distribution and use, see copyright notice in nabla.h
+
+
+#include "nbl/examples/examples.hpp"
+
 #include <nabla.h>
 #include <iostream>
 #include <cstdio>
 #include <assert.h>
 #include <cfenv>
 
-#include "nbl/application_templates/MonoDeviceApplication.hpp"
-#include "nbl/application_templates/MonoAssetManagerAndBuiltinResourceApplication.hpp"
-
 #include "app_resources/common.hlsl"
 #include "app_resources/benchmark/common.hlsl"
 #include "nbl/builtin/hlsl/ieee754.hlsl"
 
 #include <nbl\builtin\hlsl\math\quadrature\gauss_legendre\gauss_legendre.hlsl>
 
+
 using namespace nbl::core;
 using namespace nbl::hlsl;
 using namespace nbl::system;
 using namespace nbl::asset;
 using namespace nbl::video;
 using namespace nbl::application_templates;
+using namespace nbl::examples;
 
 constexpr bool DoTests = true;
 constexpr bool DoBenchmark = true;
 
-class CompatibilityTest final : public MonoDeviceApplication, public MonoAssetManagerAndBuiltinResourceApplication
+class CompatibilityTest final : public MonoDeviceApplication, public BuiltinResourcesApplication
 {
     using device_base_t = MonoDeviceApplication;
-    using asset_base_t = MonoAssetManagerAndBuiltinResourceApplication;
+    using asset_base_t = BuiltinResourcesApplication;
 public:
     CompatibilityTest(const path& _localInputCWD, const path& _localOutputCWD, const path& _sharedInputCWD, const path& _sharedOutputCWD) :
         IApplicationFramework(_localInputCWD, _localOutputCWD, _sharedInputCWD, _sharedOutputCWD) {}
diff --git a/68_JpegLoading/main.cpp b/68_JpegLoading/main.cpp
index 5ef9b637d..663b40759 100644
--- a/68_JpegLoading/main.cpp
+++ b/68_JpegLoading/main.cpp
@@ -1,22 +1,26 @@
 // Copyright (C) 2018-2024 - DevSH GrapMonoAssetManagerAndBuiltinResourceApplicationhics Programming Sp. z O.O.
 // This file is part of the "Nabla Engine".
 // For conditions of distribution and use, see copyright notice in nabla.h
-#include "nbl/application_templates/MonoAssetManagerAndBuiltinResourceApplication.hpp"
+
+
+#include "nbl/examples/examples.hpp"
 
 #include <future>
 
 #include "nlohmann/json.hpp"
 #include "argparse/argparse.hpp"
 
+
 using json = nlohmann::json;
 
 using namespace nbl;
-using namespace core;
-using namespace hlsl;
-using namespace system;
-using namespace asset;
-using namespace ui;
-using namespace video;
+using namespace nbl::core;
+using namespace nbl::hlsl;
+using namespace nbl::system;
+using namespace nbl::asset;
+using namespace nbl::ui;
+using namespace nbl::video;
+using namespace nbl::examples;
 
 class ThreadPool
 {
@@ -76,11 +80,11 @@ using task_t = std::function<void()>;
    std::atomic<bool> m_shouldStop = false;
 };
 
-class JpegLoaderApp final : public application_templates::MonoAssetManagerAndBuiltinResourceApplication
+class JpegLoaderApp final : public BuiltinResourcesApplication
 {
    using clock_t = std::chrono::steady_clock;
    using clock_resolution_t = std::chrono::milliseconds;
-   using base_t = application_templates::MonoAssetManagerAndBuiltinResourceApplication;
+   using base_t = BuiltinResourcesApplication;
    public:
    using base_t::base_t;
 
diff --git a/70_FLIPFluids/main.cpp b/70_FLIPFluids/main.cpp
index d66b56811..66596c526 100644
--- a/70_FLIPFluids/main.cpp
+++ b/70_FLIPFluids/main.cpp
@@ -1,10 +1,13 @@
 // Copyright (C) 2024-2025 - DevSH Graphics Programming Sp. z O.O.
 // This file is part of the "Nabla Engine".
 // For conditions of distribution and use, see copyright notice in nabla.h
+
+
 #include "nbl/examples/examples.hpp"
 // TODO: why is it not in nabla.h ?
 #include "nbl/asset/metadata/CHLSLMetadata.h"
 
+using namespace nbl;
 using namespace nbl::core;
 using namespace nbl::hlsl;
 using namespace nbl::system;
@@ -161,10 +164,10 @@ class CEventCallback : public ISimpleManagedSurface::ICallback
     nbl::system::logger_opt_smart_ptr m_logger = nullptr;
 };
 
-class FLIPFluidsApp final : public SimpleWindowedApplication, public application_templates::MonoAssetManagerAndBuiltinResourceApplication
+class FLIPFluidsApp final : public SimpleWindowedApplication, public BuiltinResourcesApplication
 {
     using device_base_t = SimpleWindowedApplication;
-    using asset_base_t = application_templates::MonoAssetManagerAndBuiltinResourceApplication;
+    using asset_base_t = BuiltinResourcesApplication;
     using clock_t = std::chrono::steady_clock;
 
     constexpr static inline uint32_t WIN_WIDTH = 1280, WIN_HEIGHT = 720;
diff --git a/common/include/nbl/examples/PCH.hpp b/common/include/nbl/examples/PCH.hpp
index 3b1e6beaa..4d2025f5f 100644
--- a/common/include/nbl/examples/PCH.hpp
+++ b/common/include/nbl/examples/PCH.hpp
@@ -15,9 +15,7 @@
 
 //! Common example interface headers
 
-// TODO: examine moving this header to `nbl/examples/common`
-#include "nbl/application_templates/MonoAssetManagerAndBuiltinResourceApplication.hpp"
-
+#include "nbl/examples/common/BuiltinResourcesApplication.hpp"
 #include "nbl/examples/common/SimpleWindowedApplication.hpp"
 #include "nbl/examples/common/MonoWindowApplication.hpp"
 #include "nbl/examples/common/InputSystem.hpp"
diff --git a/common/include/nbl/examples/common/BuiltinResourcesApplication.hpp b/common/include/nbl/examples/common/BuiltinResourcesApplication.hpp
new file mode 100644
index 000000000..2f1884470
--- /dev/null
+++ b/common/include/nbl/examples/common/BuiltinResourcesApplication.hpp
@@ -0,0 +1,63 @@
+// Copyright (C) 2023-2025 - DevSH Graphics Programming Sp. z O.O.
+// This file is part of the "Nabla Engine".
+// For conditions of distribution and use, see copyright notice in nabla.h
+#ifndef _NBL_EXAMPLES_BUILTIN_RESOURCE_APPLICATION_HPP_INCLUDED_
+#define _NBL_EXAMPLES_BUILTIN_RESOURCE_APPLICATION_HPP_INCLUDED_
+
+
+// we need a system, logger and an asset manager
+#include "nbl/application_templates/MonoAssetManagerApplication.hpp"
+
+#ifdef NBL_EMBED_BUILTIN_RESOURCES
+// TODO: the include/header `nbl/examples` archive
+// TODO: the source `nbl/examples` archive
+// TODO: the build `nbl/examples` archive
+#include "nbl/this_example/builtin/CArchive.h"
+#endif
+
+
+namespace nbl::examples
+{
+
+// Virtual Inheritance because apps might end up doing diamond inheritance
+class BuiltinResourcesApplication : public virtual application_templates::MonoAssetManagerApplication
+{
+		using base_t = MonoAssetManagerApplication;
+
+	public:
+		using base_t::base_t;
+
+	protected:
+		// need this one for skipping passing all args into ApplicationFramework
+		BuiltinResourcesApplication() = default;
+
+		virtual bool onAppInitialized(core::smart_refctd_ptr<system::ISystem>&& system) override
+		{
+			if (!base_t::onAppInitialized(std::move(system)))
+				return false;
+
+			using namespace core;
+
+			smart_refctd_ptr<system::IFileArchive> examplesHeaderArch,examplesSourceArch,examplesBuildArch,thisExampleArch;
+		#ifdef NBL_EMBED_BUILTIN_RESOURCES
+// TODO: the 3 examples archives
+			thisExampleArch = make_smart_refctd_ptr<nbl::this_example::builtin::CArchive>(smart_refctd_ptr(m_logger));
+		#else
+			examplesHeaderArch = make_smart_refctd_ptr<system::CMountDirectoryArchive>(localInputCWD/"../common/include/nbl/examples",smart_refctd_ptr(m_logger),m_system.get());
+			examplesSourceArch = make_smart_refctd_ptr<system::CMountDirectoryArchive>(localInputCWD/"../common/src/nbl/examples",smart_refctd_ptr(m_logger),m_system.get());
+// TODO: examplesBuildArch =
+			thisExampleArch = make_smart_refctd_ptr<system::CMountDirectoryArchive>(localInputCWD/"app_resources",smart_refctd_ptr(m_logger),m_system.get());
+		#endif
+			// yes all 3 aliases are meant to be the same
+			m_system->mount(std::move(examplesHeaderArch),"nbl/examples");
+			m_system->mount(std::move(examplesSourceArch),"nbl/examples");
+//			m_system->mount(std::move(examplesBuildArch),"nbl/examples");
+			m_system->mount(std::move(thisExampleArch),"app_resources");
+
+			return true;
+		}
+};
+
+}
+
+#endif // _CAMERA_IMPL_
\ No newline at end of file

From 20a5438ef09fddbecb96e40c4180227bf443ffcd Mon Sep 17 00:00:00 2001
From: devsh <devsh@devsh.eu>
Date: Sun, 22 Jun 2025 01:19:57 +0200
Subject: [PATCH 413/529] correct namespace ambiguities affecting example 23
 and 29

---
 common/include/nbl/examples/workgroup/DataAccessors.hlsl | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/common/include/nbl/examples/workgroup/DataAccessors.hlsl b/common/include/nbl/examples/workgroup/DataAccessors.hlsl
index f94121ec0..ca5915f2c 100644
--- a/common/include/nbl/examples/workgroup/DataAccessors.hlsl
+++ b/common/include/nbl/examples/workgroup/DataAccessors.hlsl
@@ -101,14 +101,14 @@ struct PreloadedDataProxy
 
     void preload()
     {
-        const uint16_t invocationIndex = workgroup::SubgroupContiguousIndex();
+        const uint16_t invocationIndex = hlsl::workgroup::SubgroupContiguousIndex();
         [unroll]
         for (uint16_t idx = 0; idx < PreloadedDataCount; idx++)
             data.template get<dtype_t, uint16_t>(idx * WorkgroupSize + invocationIndex, preloaded[idx]);
     }
     void unload()
     {
-        const uint16_t invocationIndex = workgroup::SubgroupContiguousIndex();
+        const uint16_t invocationIndex = hlsl::workgroup::SubgroupContiguousIndex();
         [unroll]
         for (uint16_t idx = 0; idx < PreloadedDataCount; idx++)
             data.template set<dtype_t, uint16_t>(idx * WorkgroupSize + invocationIndex, preloaded[idx]);

From 21a88ff56a08673716648cba490fa9282ce8b065 Mon Sep 17 00:00:00 2001
From: devsh <devsh@devsh.eu>
Date: Sun, 22 Jun 2025 02:00:58 +0200
Subject: [PATCH 414/529] aaah the `BuiltinResourcesApplication.hpp` header
 needs some special treatment to NOT include `this_example/CArchive`

---
 common/include/nbl/examples/PCH.hpp                          | 2 --
 .../nbl/examples/common/BuiltinResourcesApplication.hpp      | 5 ++++-
 common/include/nbl/examples/examples.hpp                     | 5 +++++
 3 files changed, 9 insertions(+), 3 deletions(-)

diff --git a/common/include/nbl/examples/PCH.hpp b/common/include/nbl/examples/PCH.hpp
index 4d2025f5f..0905465c2 100644
--- a/common/include/nbl/examples/PCH.hpp
+++ b/common/include/nbl/examples/PCH.hpp
@@ -14,8 +14,6 @@
 #include "nabla.h"
 
 //! Common example interface headers
-
-#include "nbl/examples/common/BuiltinResourcesApplication.hpp"
 #include "nbl/examples/common/SimpleWindowedApplication.hpp"
 #include "nbl/examples/common/MonoWindowApplication.hpp"
 #include "nbl/examples/common/InputSystem.hpp"
diff --git a/common/include/nbl/examples/common/BuiltinResourcesApplication.hpp b/common/include/nbl/examples/common/BuiltinResourcesApplication.hpp
index 2f1884470..d183a9f4b 100644
--- a/common/include/nbl/examples/common/BuiltinResourcesApplication.hpp
+++ b/common/include/nbl/examples/common/BuiltinResourcesApplication.hpp
@@ -12,6 +12,7 @@
 // TODO: the include/header `nbl/examples` archive
 // TODO: the source `nbl/examples` archive
 // TODO: the build `nbl/examples` archive
+// TODO: make the `this_example` optional, only if the example has builtins
 #include "nbl/this_example/builtin/CArchive.h"
 #endif
 
@@ -41,7 +42,9 @@ class BuiltinResourcesApplication : public virtual application_templates::MonoAs
 			smart_refctd_ptr<system::IFileArchive> examplesHeaderArch,examplesSourceArch,examplesBuildArch,thisExampleArch;
 		#ifdef NBL_EMBED_BUILTIN_RESOURCES
 // TODO: the 3 examples archives
-			thisExampleArch = make_smart_refctd_ptr<nbl::this_example::builtin::CArchive>(smart_refctd_ptr(m_logger));
+			#ifdef _NBL_THIS_EXAMPLE_BUILTIN_C_ARCHIVE_H_
+				thisExampleArch = make_smart_refctd_ptr<nbl::this_example::builtin::CArchive>(smart_refctd_ptr(m_logger));
+			#endif
 		#else
 			examplesHeaderArch = make_smart_refctd_ptr<system::CMountDirectoryArchive>(localInputCWD/"../common/include/nbl/examples",smart_refctd_ptr(m_logger),m_system.get());
 			examplesSourceArch = make_smart_refctd_ptr<system::CMountDirectoryArchive>(localInputCWD/"../common/src/nbl/examples",smart_refctd_ptr(m_logger),m_system.get());
diff --git a/common/include/nbl/examples/examples.hpp b/common/include/nbl/examples/examples.hpp
index 985a3960a..d82303514 100644
--- a/common/include/nbl/examples/examples.hpp
+++ b/common/include/nbl/examples/examples.hpp
@@ -4,6 +4,7 @@
 #ifndef _NBL_EXAMPLES_HPP_
 #define _NBL_EXAMPLES_HPP_
 
+
 //! Precompiled header shared across all examples
 #include "nbl/examples/PCH.hpp"
 
@@ -16,4 +17,8 @@
 
 // #include "..."
 
+// Cannot be in PCH because depens on definition of `this_example` for Example's builtins
+#include "nbl/examples/common/BuiltinResourcesApplication.hpp"
+
+
 #endif // _NBL_EXAMPLES_HPP_
\ No newline at end of file

From 1b3c19cb84d618f20c77469c86a665544889aff7 Mon Sep 17 00:00:00 2001
From: devsh <devsh@devsh.eu>
Date: Mon, 23 Jun 2025 09:46:26 +0200
Subject: [PATCH 415/529] make the `BuiltinResourcesApplication` work for
 examples without builtins too

---
 09_GeometryCreator/include/common.hpp                     | 5 +++--
 .../nbl/examples/common/BuiltinResourcesApplication.hpp   | 8 +++++---
 2 files changed, 8 insertions(+), 5 deletions(-)

diff --git a/09_GeometryCreator/include/common.hpp b/09_GeometryCreator/include/common.hpp
index d172e1959..84cd8118a 100644
--- a/09_GeometryCreator/include/common.hpp
+++ b/09_GeometryCreator/include/common.hpp
@@ -1,8 +1,8 @@
 #ifndef _NBL_THIS_EXAMPLE_COMMON_H_INCLUDED_
 #define _NBL_THIS_EXAMPLE_COMMON_H_INCLUDED_
 
-// TODO: @AnastaZIuk do we even make that explicit?
-#include "nbl/examples/PCH.hpp"
+
+#include "nbl/examples/examples.hpp"
 
 using namespace nbl;
 using namespace core;
@@ -14,4 +14,5 @@ using namespace video;
 using namespace scene;
 using namespace nbl::examples;
 
+
 #endif // __NBL_THIS_EXAMPLE_COMMON_H_INCLUDED__
\ No newline at end of file
diff --git a/common/include/nbl/examples/common/BuiltinResourcesApplication.hpp b/common/include/nbl/examples/common/BuiltinResourcesApplication.hpp
index d183a9f4b..aa1949ecd 100644
--- a/common/include/nbl/examples/common/BuiltinResourcesApplication.hpp
+++ b/common/include/nbl/examples/common/BuiltinResourcesApplication.hpp
@@ -12,8 +12,9 @@
 // TODO: the include/header `nbl/examples` archive
 // TODO: the source `nbl/examples` archive
 // TODO: the build `nbl/examples` archive
-// TODO: make the `this_example` optional, only if the example has builtins
-#include "nbl/this_example/builtin/CArchive.h"
+#if __has_include("nbl/this_example/builtin/CArchive.h")
+	#include "nbl/this_example/builtin/CArchive.h"
+#endif
 #endif
 
 
@@ -55,7 +56,8 @@ class BuiltinResourcesApplication : public virtual application_templates::MonoAs
 			m_system->mount(std::move(examplesHeaderArch),"nbl/examples");
 			m_system->mount(std::move(examplesSourceArch),"nbl/examples");
 //			m_system->mount(std::move(examplesBuildArch),"nbl/examples");
-			m_system->mount(std::move(thisExampleArch),"app_resources");
+			if (thisExampleArch)
+				m_system->mount(std::move(thisExampleArch),"app_resources");
 
 			return true;
 		}

From 8b89fa202b8cb6e84bb1465837ca318accdac471 Mon Sep 17 00:00:00 2001
From: Erfan Ahmadi <ahmadierfan99@gmail.com>
Date: Mon, 23 Jun 2025 16:05:56 +0400
Subject: [PATCH 416/529] WIP Grid DTM Work

---
 62_CAD/DrawResourcesFiller.cpp                | 19 ++---
 62_CAD/DrawResourcesFiller.h                  |  9 +-
 62_CAD/main.cpp                               |  4 +-
 62_CAD/shaders/main_pipeline/dtm.hlsl         |  1 +
 .../main_pipeline/fragment_shader.hlsl        | 83 ++++++++++++++-----
 5 files changed, 73 insertions(+), 43 deletions(-)

diff --git a/62_CAD/DrawResourcesFiller.cpp b/62_CAD/DrawResourcesFiller.cpp
index b40f6585c..296221fb5 100644
--- a/62_CAD/DrawResourcesFiller.cpp
+++ b/62_CAD/DrawResourcesFiller.cpp
@@ -776,21 +776,19 @@ void DrawResourcesFiller::drawGridDTM(
 	float gridCellWidth,
 	uint64_t textureID,
 	const DTMSettingsInfo& dtmSettingsInfo,
-	SIntendedSubmitInfo& intendedNextSubmit,
-	bool drawGridOnly/* = false*/)
+	SIntendedSubmitInfo& intendedNextSubmit)
 {
 	if (dtmSettingsInfo.mode == 0u)
 		return;
 
-	if (dtmSettingsInfo.mode == E_DTM_MODE::OUTLINE)
-		drawGridOnly = true;
-
 	GridDTMInfo gridDTMInfo;
 	gridDTMInfo.topLeft = topLeft;
 	gridDTMInfo.worldSpaceExtents = worldSpaceExtents;
 	gridDTMInfo.gridCellWidth = gridCellWidth;
-	if(!drawGridOnly)
+	if (textureID != InvalidTextureIndex)
 		gridDTMInfo.textureID = getImageIndexFromID(textureID, intendedNextSubmit); // for this to be valid and safe, this function needs to be called immediately after `addStaticImage` function to make sure image is in memory
+	else
+		gridDTMInfo.textureID = InvalidTextureIndex;
 
 	// determine the thickes line
 	float thickestLineThickness = 0.0f;
@@ -798,7 +796,7 @@ void DrawResourcesFiller::drawGridDTM(
 	{
 		thickestLineThickness = dtmSettingsInfo.outlineStyleInfo.worldSpaceLineWidth + dtmSettingsInfo.outlineStyleInfo.screenSpaceLineWidth;
 	}
-	else if (dtmSettingsInfo.mode & E_DTM_MODE::CONTOUR && !drawGridOnly)
+	else if (dtmSettingsInfo.mode & E_DTM_MODE::CONTOUR)
 	{
 		for (int i = 0; i < dtmSettingsInfo.contourSettingsCount; ++i)
 		{
@@ -809,7 +807,7 @@ void DrawResourcesFiller::drawGridDTM(
 	}
 	gridDTMInfo.thicknessOfTheThickestLine = thickestLineThickness;
 
-	setActiveDTMSettings(dtmSettingsInfo, drawGridOnly);
+	setActiveDTMSettings(dtmSettingsInfo);
 	beginMainObject(MainObjectType::GRID_DTM);
 
 	uint32_t mainObjectIdx = acquireActiveMainObjectIndex_SubmitIfNeeded(intendedNextSubmit);
@@ -1038,13 +1036,10 @@ void DrawResourcesFiller::setActiveLineStyle(const LineStyleInfo& lineStyle)
 	activeLineStyleIndex = InvalidStyleIdx;
 }
 
-void DrawResourcesFiller::setActiveDTMSettings(const DTMSettingsInfo& dtmSettingsInfo, const bool disableHeightRelatedDTMModes/* = false*/)
+void DrawResourcesFiller::setActiveDTMSettings(const DTMSettingsInfo& dtmSettingsInfo)
 {
 	activeDTMSettings = dtmSettingsInfo;
 	activeDTMSettingsIndex = InvalidDTMSettingsIdx;
-
-	if (disableHeightRelatedDTMModes)
-		activeDTMSettings.mode &= E_DTM_MODE::OUTLINE;
 }
 
 void DrawResourcesFiller::beginMainObject(MainObjectType type, TransformationType transformationType)
diff --git a/62_CAD/DrawResourcesFiller.h b/62_CAD/DrawResourcesFiller.h
index 1a74338e7..547926767 100644
--- a/62_CAD/DrawResourcesFiller.h
+++ b/62_CAD/DrawResourcesFiller.h
@@ -270,8 +270,7 @@ struct DrawResourcesFiller
 		float gridCellWidth,
 		uint64_t textureID,
 		const DTMSettingsInfo& dtmSettingsInfo,
-		SIntendedSubmitInfo& intendedNextSubmit,
-		const bool drawGridOnly = false);
+		SIntendedSubmitInfo& intendedNextSubmit);
 
 	/**
 	 * @brief Adds a static 2D image to the draw resource set for rendering.
@@ -383,10 +382,8 @@ struct DrawResourcesFiller
 
 	// Setting Active Resources:
 	void setActiveLineStyle(const LineStyleInfo& lineStyle);
-	/**
-	* @param disableHeightRelatedDTMModes disables E_DTM_MODE::CONTOUR and E_DTOM_MODE::HEIGHT_SHADING, necessary when we want to draw a grid DTM without using a height map texture
-	*/
-	void setActiveDTMSettings(const DTMSettingsInfo& dtmSettingsInfo, const bool disableHeightRelatedDTMModes = false);
+	
+	void setActiveDTMSettings(const DTMSettingsInfo& dtmSettingsInfo);
 
 	void beginMainObject(MainObjectType type, TransformationType transformationType = TransformationType::TT_NORMAL);
 	void endMainObject();
diff --git a/62_CAD/main.cpp b/62_CAD/main.cpp
index 5cb4082bd..905bdc98d 100644
--- a/62_CAD/main.cpp
+++ b/62_CAD/main.cpp
@@ -3654,12 +3654,12 @@ class ComputerAidedDesign final : public examples::SimpleWindowedApplication, pu
 			worldSpaceExtents.y = (heightMapExtent.height - 1) * HeightMapCellWidth;
 			const uint64_t heightMapTextureID = 0ull;
 
-			constexpr bool DrawGridOnly = false;
+			constexpr bool DrawGridOnly = true;
 			
 			if(DrawGridOnly)
 			{
 				dtmInfo.mode = E_DTM_MODE::OUTLINE;
-				drawResourcesFiller.drawGridDTM(topLeft, worldSpaceExtents, HeightMapCellWidth, heightMapTextureID, dtmInfo, intendedNextSubmit, DrawGridOnly);
+				drawResourcesFiller.drawGridDTM(topLeft, worldSpaceExtents, HeightMapCellWidth, InvalidTextureIndex, dtmInfo, intendedNextSubmit);
 			}
 			else
 			{
diff --git a/62_CAD/shaders/main_pipeline/dtm.hlsl b/62_CAD/shaders/main_pipeline/dtm.hlsl
index e90f685ba..8b1d80174 100644
--- a/62_CAD/shaders/main_pipeline/dtm.hlsl
+++ b/62_CAD/shaders/main_pipeline/dtm.hlsl
@@ -371,6 +371,7 @@ float4 calculateDTMOutlineColor(in uint outlineLineStyleIdx, in float3 v[3], in
     return outputColor;
 }
 
+// It's literally sdf with 2 line shapes
 float4 calculateGridDTMOutlineColor(in uint outlineLineStyleIdx, in nbl::hlsl::shapes::Line<float> outlineLineSegments[2], in float2 fragPos, in float phaseShift)
 {
     LineStyle outlineStyle = loadLineStyle(outlineLineStyleIdx);
diff --git a/62_CAD/shaders/main_pipeline/fragment_shader.hlsl b/62_CAD/shaders/main_pipeline/fragment_shader.hlsl
index 25564a964..8e5fff907 100644
--- a/62_CAD/shaders/main_pipeline/fragment_shader.hlsl
+++ b/62_CAD/shaders/main_pipeline/fragment_shader.hlsl
@@ -446,6 +446,65 @@ float4 fragMain(PSInput input) : SV_TARGET
             // v0-------v2b   v2a-------v1
             // 
 
+            const bool gridOnly = textureId == InvalidTextureIndex && dtmSettings.drawOutlineEnabled();
+            if (gridOnly)
+            {
+                nbl::hlsl::shapes::Line<float> outlineLineSegments[2];
+                
+                const float halfCellWidth = cellWidth * 0.5f;
+                const float2 horizontalBounds = float2(topLeft.y, topLeft.y + gridExtents.y);
+                const float2 verticalBounds = float2(topLeft.x, topLeft.x + gridExtents.x);
+                float2 nearestLineRemainingCoords = int2((gridSpacePos + halfCellWidth) / cellWidth) * cellWidth + topLeft;
+                // shift lines outside of the grid to a bound
+                nearestLineRemainingCoords.x = clamp(nearestLineRemainingCoords.x, verticalBounds.x, verticalBounds.y);
+                nearestLineRemainingCoords.y = clamp(nearestLineRemainingCoords.y, horizontalBounds.x, horizontalBounds.y);
+
+                // find the nearest horizontal line
+                outlineLineSegments[0].P0 = float32_t2(verticalBounds.x, nearestLineRemainingCoords.y);
+                outlineLineSegments[0].P1 = float32_t2(verticalBounds.y, nearestLineRemainingCoords.y);
+                // find the nearest vertical line
+                outlineLineSegments[1].P0 = float32_t2(nearestLineRemainingCoords.x, horizontalBounds.x);
+                outlineLineSegments[1].P1 = float32_t2(nearestLineRemainingCoords.x, horizontalBounds.y);
+                
+                float4 dtmColor = dtm::calculateGridDTMOutlineColor(dtmSettings.outlineLineStyleIdx, outlineLineSegments, input.position.xy, 0.0f);
+                textureColor = dtmColor.rgb;
+                localAlpha = dtmColor.a;
+            }
+            else
+            {
+            
+                // calculate insideCellCoord and figure out the 4 cells we're gonna do sdf with
+                float2 insideCellCoord = gridSpacePos - float2(cellWidth, cellWidth) * cellCoords; // TODO: use fmod instead?
+                // 0.2, 0.1 --> 0, 0 ---> [0, 0], [-1, -1], [-1, 0], [0, -1]
+                float offsetX = round(insideCellCoord.x) - 1.0f;
+                float offsetY = round(insideCellCoord.y) - 1.0f;
+                
+                // for each of those cells (some might be out of bounds, so we skip)
+                    // gather 
+                    // then figure out their triangles (A and B) and fill array of max 8 triangles (dtm::GridDTMTriangle)
+
+                // Contours:
+                // Is Contours Enabled?
+                    // for each contour settings (in reverse)
+                        // float sdf = max;
+                        // for each triangle
+                            // sdf = min(sdf, sdfOfContourSettings[i]);
+                        // based on sdf, the contour line style + smoothstep: we compute color and alpha
+                        // blendUnder
+
+                // Outlines:
+                // Is Outlines Enabled?
+                    // float sdf = max;
+                    // for each triangle
+                        // sdf = min(sdf, sdfOfOutlineSetting);
+                    // based on sdf, the outline line style + smoothstep: we compute color and alpha
+                    // blendUnder
+                
+                // Height Shading:
+                    // We just do sdf with current triangle (if valid)
+            }
+            
+#if 0
             // calculate screen space coordinates of vertices of the current tiranlge within the grid
             dtm::GridDTMTriangle currentTriangle;
             dtm::GridDTMCell neighbouringCells[8];
@@ -523,29 +582,6 @@ float4 fragMain(PSInput input) : SV_TARGET
                 }
             }
 
-            // find the nearest horizontal and vertical line to the fragment
-            nbl::hlsl::shapes::Line<float> outlineLineSegments[2];
-            {
-                const float halfCellWidth = cellWidth * 0.5f;
-                const float2 horizontalBounds = float2(topLeft.y, topLeft.y + gridExtents.y);
-                const float2 verticalBounds = float2(topLeft.x, topLeft.x + gridExtents.x);
-                float2 nearestLineRemainingCoords = int2((gridSpacePos + halfCellWidth) / cellWidth) * cellWidth + topLeft;
-                // shift lines outside of the grid to a bound
-                nearestLineRemainingCoords.x = clamp(nearestLineRemainingCoords.x, verticalBounds.x, verticalBounds.y);
-                nearestLineRemainingCoords.y = clamp(nearestLineRemainingCoords.y, horizontalBounds.x, horizontalBounds.y);
-
-                // find the nearest horizontal line
-                outlineLineSegments[0].P0 = float32_t2(verticalBounds.x, nearestLineRemainingCoords.y);
-                outlineLineSegments[0].P1 = float32_t2(verticalBounds.y, nearestLineRemainingCoords.y);
-                // find the nearest vertical line
-                outlineLineSegments[1].P0 = float32_t2(nearestLineRemainingCoords.x, horizontalBounds.x);
-                outlineLineSegments[1].P1 = float32_t2(nearestLineRemainingCoords.x, horizontalBounds.y);
-
-                // test diagonal draw (to draw diagonals height or contour shading must be enabled)
-                //outlineLineSegments[0] = nbl::hlsl::shapes::Line<float>::construct(currentTriangleVertices[0].xy, currentTriangleVertices[1].xy);
-                //outlineLineSegments[1] = nbl::hlsl::shapes::Line<float>::construct(currentTriangleVertices[0].xy, currentTriangleVertices[1].xy);
-            }
-
             const float3 baryCoord = dtm::calculateDTMTriangleBarycentrics(currentTriangle.vertices[0].xy, currentTriangle.vertices[1].xy, currentTriangle.vertices[2].xy, input.position.xy);
             float height = baryCoord.x * currentTriangle.vertices[0].z + baryCoord.y * currentTriangle.vertices[1].z + baryCoord.z * currentTriangle.vertices[2].z;
             float heightDeriv = fwidth(height);
@@ -585,6 +621,7 @@ float4 fragMain(PSInput input) : SV_TARGET
                 textureColor = float3(0.0f, 0.0f, 1.0f);
 
             localAlpha = 0.5f;*/
+#endif
         }
         else if (objType == ObjectType::STREAMED_IMAGE) 
         {

From de1ccfa5fa0bb802e4ecad21beef5d18e21d7c37 Mon Sep 17 00:00:00 2001
From: Erfan Ahmadi <ahmadierfan99@gmail.com>
Date: Mon, 23 Jun 2025 17:01:31 +0400
Subject: [PATCH 417/529] WIP contours in grid dtms

---
 62_CAD/shaders/main_pipeline/dtm.hlsl         | 15 +--
 .../main_pipeline/fragment_shader.hlsl        | 99 ++++++++++++++-----
 2 files changed, 77 insertions(+), 37 deletions(-)

diff --git a/62_CAD/shaders/main_pipeline/dtm.hlsl b/62_CAD/shaders/main_pipeline/dtm.hlsl
index 8b1d80174..6c638be58 100644
--- a/62_CAD/shaders/main_pipeline/dtm.hlsl
+++ b/62_CAD/shaders/main_pipeline/dtm.hlsl
@@ -230,11 +230,9 @@ float4 calculateDTMHeightColor(in DTMHeightShadingSettings settings, in float3 v
     return outputColor;
 }
 
-float4 calculateDTMContourColor(in DTMContourSettings contourSettings, in float3 v[3], in float2 fragPos, in float height)
+float calculateDTMContourSDF(in LineStyle contourStyle, in float3 v[3], in float2 fragPos, in float height)
 {
-    float4 outputColor = float4(0.0f, 0.0f, 0.0f, 0.0f);
-
-    LineStyle contourStyle = loadLineStyle(contourSettings.contourLineStyleIdx);
+    float distance = nbl::hlsl::numeric_limits<float>::max;
     const float contourThickness = (contourStyle.screenSpaceLineWidth + contourStyle.worldSpaceLineWidth * globals.screenToWorldRatio) * 0.5f;
     float stretch = 1.0f;
     float phaseShift = 0.0f;
@@ -283,7 +281,6 @@ float4 calculateDTMContourColor(in DTMContourSettings contourSettings, in float3
     {
         nbl::hlsl::shapes::Line<float> lineSegment = nbl::hlsl::shapes::Line<float>::construct(contourLinePoints[0], contourLinePoints[1]);
 
-        float distance = nbl::hlsl::numeric_limits<float>::max;
         if (!contourStyle.hasStipples() || stretch == InvalidStyleStretchValue)
         {
             distance = ClippedSignedDistance< nbl::hlsl::shapes::Line<float> >::sdf(lineSegment, fragPos, contourThickness, contourStyle.isRoadStyleFlag);
@@ -297,15 +294,9 @@ float4 calculateDTMContourColor(in DTMContourSettings contourSettings, in float3
             LineStyleClipper clipper = LineStyleClipper::construct(contourStyle, lineSegment, arcLenCalc, phaseShift, stretch, globals.worldToScreenRatio);
             distance = ClippedSignedDistance<nbl::hlsl::shapes::Line<float>, LineStyleClipper>::sdf(lineSegment, fragPos, contourThickness, contourStyle.isRoadStyleFlag, clipper);
         }
-
-        outputColor.a = 1.0f - smoothstep(-globals.antiAliasingFactor, globals.antiAliasingFactor, distance);
-        outputColor.a *= contourStyle.color.a;
-        outputColor.rgb = contourStyle.color.rgb;
-
-        return outputColor;
     }
 
-    return float4(0.0f, 0.0f, 0.0f, 0.0f);
+    return distance;
 }
 
 float4 calculateDTMOutlineColor(in uint outlineLineStyleIdx, in float3 v[3], in float2 fragPos)
diff --git a/62_CAD/shaders/main_pipeline/fragment_shader.hlsl b/62_CAD/shaders/main_pipeline/fragment_shader.hlsl
index 8e5fff907..2627be5b3 100644
--- a/62_CAD/shaders/main_pipeline/fragment_shader.hlsl
+++ b/62_CAD/shaders/main_pipeline/fragment_shader.hlsl
@@ -156,7 +156,13 @@ float4 fragMain(PSInput input) : SV_TARGET
         if (dtmSettings.drawContourEnabled())
         {
             for(uint32_t i = 0; i < dtmSettings.contourSettingsCount; ++i) // TODO: should reverse the order with blendUnder
-                dtmColor = dtm::blendUnder(dtmColor, dtm::calculateDTMContourColor(dtmSettings.contourSettings[i], v, input.position.xy, height));
+            {
+                LineStyle contourStyle = loadLineStyle(dtmSettings.contourSettings[i].contourLineStyleIdx);
+                float sdf = dtm::calculateDTMContourColor(contourStyle, v, input.position.xy, height);
+                float4 contourColor = contourStyle.color;
+                contourColor.a *= 1.0f - smoothstep(-globals.antiAliasingFactor, globals.antiAliasingFactor, sdf);
+                dtmColor = dtm::blendUnder(dtmColor, contourColor);
+            }
         }
         if (dtmSettings.drawHeightShadingEnabled())
             dtmColor = dtm::blendUnder(dtmColor, dtm::calculateDTMHeightColor(dtmSettings.heightShadingSettings, v, heightDeriv, input.position.xy, height));
@@ -425,13 +431,14 @@ float4 fragMain(PSInput input) : SV_TARGET
             float2 topLeft = input.getGridDTMScreenSpaceTopLeft();
             float2 gridExtents = input.getGridDTMScreenSpaceGridExtents();
             const float cellWidth = input.getGridDTMScreenSpaceCellWidth();
+            float2 gridDimensions = gridExtents / cellWidth; // TODO: Figure out if it's better to precomp in vtx
 
             float2 gridSpacePos = uv * gridExtents;
-            float2 cellCoords;
+            float2 gridSpacePosDivGridCellWidth = gridSpacePos / cellWidth;
+            float2 cellCoords; // rename to currentCellCoords
             {
-                float2 gridSpacePosDivGridCellWidth = gridSpacePos / cellWidth;
-                cellCoords.x = int32_t(gridSpacePosDivGridCellWidth.x);
-                cellCoords.y = int32_t(gridSpacePosDivGridCellWidth.y);
+                cellCoords.x = floor(gridSpacePosDivGridCellWidth.x);
+                cellCoords.y = floor(gridSpacePosDivGridCellWidth.y);
             }
 
             float2 gridSpaceCellTopLeftCoords = cellCoords * cellWidth;
@@ -472,30 +479,72 @@ float4 fragMain(PSInput input) : SV_TARGET
             }
             else
             {
-            
-                // calculate insideCellCoord and figure out the 4 cells we're gonna do sdf with
-                float2 insideCellCoord = gridSpacePos - float2(cellWidth, cellWidth) * cellCoords; // TODO: use fmod instead?
-                // 0.2, 0.1 --> 0, 0 ---> [0, 0], [-1, -1], [-1, 0], [0, -1]
-                float offsetX = round(insideCellCoord.x) - 1.0f;
-                float offsetY = round(insideCellCoord.y) - 1.0f;
+                // calculate localUV and figure out the 4 cells we're gonna do sdf with
+                float2 localUV = gridSpacePosDivGridCellWidth - cellCoords; // TODO: use fmod instead?
+                float2 offset = round(localUV) * 2.0f - 1.0f;
+                
+                const uint32_t MaxTrianglesToDoSDFWith = 8u;
+                dtm::GridDTMTriangle triangles[MaxTrianglesToDoSDFWith];
+                float interpolatedHeights[MaxTrianglesToDoSDFWith]; // these are height based on barycentric interpolation of current pixel with all the triangles above
+                uint32_t triangleCount = 0u;
+                
+                const uint32_t MaxLinesToDoSDFWith = 4u;
+                // TODO: Lines to do SDF with
+                // But only do if outlines are enabled
+                
+                // TODO: UNROLL
+                for (int i = 0; i < 2; ++i)
+                {
+                    for (int j = 0; j < 2; ++j)
+                    {
+                        float2 cellCoord = cellCoords + float2(i, j) * offset;
+                        const bool isCellWithinRange = 
+                            cellCoord.x >= 0.0f && cellCoord.y >= 0.0f && 
+                            cellCoord.x <= gridDimensions.x && cellCoord.y <= gridDimensions.y;
+                        if (isCellWithinRange)
+                        {
+                            // Triangle thing
+                            // topLeft, in float2 gridExtents, in float2 cellCoords, const float cellWidth, in Texture2D<uint32_t> heightMap;
+                            dtm::GridDTMCell gridCellFormed = calculateCellTriangles(topLeft, gridExtents, cellCoord, cellWidth, texturesU32[NonUniformResourceIndex(textureId)]);
+                            // Check the validity of the triangles and only add if valid
+                            triangles[triangleCount++] = gridCellFormed.triangleA;
+                            triangles[triangleCount++] = gridCellFormed.triangleB;
+                        }
+                    }
+                }
                 
-                // for each of those cells (some might be out of bounds, so we skip)
-                    // gather 
-                    // then figure out their triangles (A and B) and fill array of max 8 triangles (dtm::GridDTMTriangle)
-
-                // Contours:
-                // Is Contours Enabled?
-                    // for each contour settings (in reverse)
-                        // float sdf = max;
-                        // for each triangle
-                            // sdf = min(sdf, sdfOfContourSettings[i]);
-                        // based on sdf, the contour line style + smoothstep: we compute color and alpha
-                        // blendUnder
+                // float heightDeriv = fwidth(height);
+                // For height shading, merge this loop with the previous one, because baryCoord all positive means point inside triangle and we can use that to figure out the triangle we want to do height shading for.
+                for (int t = 0; t < trianglesCount; ++t)
+                {
+                    dtm::GridDTMTriangle tri = triangles[t];
+                    const float3 baryCoord = dtm::calculateDTMTriangleBarycentrics(tri.vertices[0].xy, tri.vertices[1].xy, tri.vertices[2].xy, input.position.xy);
+                    interpolatedHeights[t] = baryCoord.x * tri.vertices[0].z + baryCoord.y * tri.vertices[1].z + baryCoord.z * tri.vertices[2].z;
+                }
+
+                float4 dtmColor = float4(0.0f, 0.0f, 0.0f, 0.0f);
+                if (dtmSettings.drawContourEnabled())
+                {
+                    for (int i = dtmSettings.contourSettingsCount-1u; i >= 0; --i) 
+                    {
+                        LineStyle contourStyle = loadLineStyle(dtmSettings.contourSettings[i].contourLineStyleIdx);
+                        float sdf = nbl::hlsl::numeric_limits<float>::max;
+                        for (int t = 0; t < trianglesCount; ++t)
+                        {
+                            dtm::GridDTMTriangle tri = triangles[t];
+                            sdf = min(sdf, dtm::calculateDTMContourSDF(contourStyle, tri.vertices, input.position.xy, interpolatedHeights[t]));
+                        }
+                        
+                        float4 contourColor = contourStyle.color;
+                        contourColor.a *= 1.0f - smoothstep(-globals.antiAliasingFactor, globals.antiAliasingFactor, sdf);
+                        dtmColor = dtm::blendUnder(dtmColor, contourColor);
+                    }
+                }
 
                 // Outlines:
                 // Is Outlines Enabled?
                     // float sdf = max;
-                    // for each triangle
+                    // for each line
                         // sdf = min(sdf, sdfOfOutlineSetting);
                     // based on sdf, the outline line style + smoothstep: we compute color and alpha
                     // blendUnder
@@ -606,7 +655,7 @@ float4 fragMain(PSInput input) : SV_TARGET
             if (dtmSettings.drawOutlineEnabled())
                 dtmColor = dtm::blendUnder(dtmColor, dtm::calculateGridDTMOutlineColor(dtmSettings.outlineLineStyleIdx, outlineLineSegments, input.position.xy, 0.0f));
             if (dtmSettings.drawHeightShadingEnabled() && !outOfBoundsUV)
-                dtmColor = dtm::blendUnder(dtmColor, dtm::calculateDTMHeightColor(dtmSettings.heightShadingSettings, currentTriangle.vertices, heightDeriv, input.position.xy, height));
+                dtmColor = dtm::blendUnder(dtmColor, dtm::calculateDTMHeightColor(dtmSettings.heightShadingSettings, currentTriangle.vertices, heightDeriv, input.position.xy, interpolatedHeights[0]));
 
             textureColor = dtmColor.rgb / dtmColor.a;
             localAlpha = dtmColor.a;

From bb68b7bc660b3631c2ca0d124fcf197b321cc2d0 Mon Sep 17 00:00:00 2001
From: Arkadiusz Lachowicz <areklachowicz@gmail.com>
Date: Mon, 23 Jun 2025 16:26:41 +0200
Subject: [PATCH 418/529] create REGISTER_COMMON_BUILTINS, register example API
 source & include archives, update
 common/include/nbl/examples/common/BuiltinResourcesApplication.hpp and test
 on 09 example

---
 CMakeLists.txt                                |  2 +
 common/CMakeLists.txt                         | 56 ++++++++++++++++++-
 .../common/BuiltinResourcesApplication.hpp    | 11 ++--
 3 files changed, 62 insertions(+), 7 deletions(-)

diff --git a/CMakeLists.txt b/CMakeLists.txt
index aa3880762..56c0ee60c 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -95,6 +95,8 @@ if(NBL_BUILD_EXAMPLES)
         target_link_libraries(${T} PUBLIC ${NBL_EXAMPLES_API_TARGET})
 		target_include_directories(${T} PUBLIC $<TARGET_PROPERTY:${NBL_EXAMPLES_API_TARGET},INCLUDE_DIRECTORIES>)
 		target_precompile_headers(${T} REUSE_FROM "${NBL_EXAMPLES_API_TARGET}")
+		LINK_BUILTIN_RESOURCES_TO_TARGET(${T} NblExtExamplesAPIBuiltinsSource)
+		LINK_BUILTIN_RESOURCES_TO_TARGET(${T} NblExtExamplesAPIBuiltinsInclude)
     endforeach()
 
 	NBL_ADJUST_FOLDERS(examples)
diff --git a/common/CMakeLists.txt b/common/CMakeLists.txt
index 3a55e7a26..b32e1a394 100644
--- a/common/CMakeLists.txt
+++ b/common/CMakeLists.txt
@@ -13,6 +13,53 @@ nbl_create_ext_library_project(ExamplesAPI "" "${CMAKE_CURRENT_SOURCE_DIR}/src/n
 set_target_properties(${LIB_NAME} PROPERTIES DISABLE_PRECOMPILE_HEADERS OFF)
 target_precompile_headers(${LIB_NAME} PUBLIC "${CMAKE_CURRENT_SOURCE_DIR}/include/nbl/examples/PCH.hpp")
 
+function(REGISTER_COMMON_BUILTINS)
+	cmake_parse_arguments(EX "" "TARGET;ARCHIVE_ABS_ENTRY;ARCHIVE_NAMESPACE" "GLOB_RGX" ${ARGN})
+
+    get_filename_component(INPUT_DIRECTORY "${CMAKE_CURRENT_SOURCE_DIR}" ABSOLUTE)
+    get_filename_component(OUTPUT_SRC "${CMAKE_CURRENT_BINARY_DIR}/builtin/${EX_TARGET}/src" ABSOLUTE)
+    get_filename_component(OUTPUT_INCLUDE "${CMAKE_CURRENT_BINARY_DIR}/builtin/${EX_TARGET}/include" ABSOLUTE)
+
+    set(KEYS_ENTRY "${INPUT_DIRECTORY}/${EX_ARCHIVE_ABS_ENTRY}")
+    list(TRANSFORM EX_GLOB_RGX PREPEND "${KEYS_ENTRY}/")
+	file(GLOB_RECURSE KEYS RELATIVE "${KEYS_ENTRY}" CONFIGURE_DEPENDS ${EX_GLOB_RGX})
+
+    #[[
+        note we do force you to specify full globbing expressions relative to keys entry which we do not filter
+        because if runtime outputs .spv compilation artifacts/shader cache preprocessed.hlsl(s) to source you will hit CMake
+        reconfiguration each time the file content or timestampts change and it could lead to embeding intermediate trash
+    ]]
+	
+    unset(EXAMPLES_RESOURCES_TO_EMBED)
+	foreach(KEY IN LISTS KEYS)
+		LIST_BUILTIN_RESOURCE(EXAMPLES_RESOURCES_TO_EMBED "${KEY}")
+	endforeach()
+
+    ADD_CUSTOM_BUILTIN_RESOURCES(${EX_TARGET} EXAMPLES_RESOURCES_TO_EMBED "${INPUT_DIRECTORY}" "${EX_ARCHIVE_ABS_ENTRY}" "${EX_ARCHIVE_NAMESPACE}" "${OUTPUT_INCLUDE}" "${OUTPUT_SRC}")
+
+    # even though builtin target is static library its still valid to reuse common PCH to boost its build speed to not preprocess entire Nabla again
+    set_target_properties(${EX_TARGET} PROPERTIES DISABLE_PRECOMPILE_HEADERS OFF)
+    target_precompile_headers(${EX_TARGET} REUSE_FROM "${LIB_NAME}")
+
+    target_include_directories(${EX_TARGET} PUBLIC "${INPUT_DIRECTORY}/include")
+    target_link_libraries(${EX_TARGET} INTERFACE ${LIB_NAME})
+endfunction()
+
+#! common example API builtins as static library targets linked to each example
+if(NBL_EMBED_BUILTIN_RESOURCES)
+    REGISTER_COMMON_BUILTINS(TARGET NblExtExamplesAPIBuiltinsSource 
+        ARCHIVE_ABS_ENTRY src/nbl/examples 
+        ARCHIVE_NAMESPACE nbl::builtin::examples::src 
+        GLOB_RGX *.hlsl *.txt
+    )
+
+    REGISTER_COMMON_BUILTINS(TARGET NblExtExamplesAPIBuiltinsInclude 
+        ARCHIVE_ABS_ENTRY include/nbl/examples 
+        ARCHIVE_NAMESPACE nbl::builtin::examples::include
+        GLOB_RGX *.hpp *.h *.hlsl *.txt
+    )
+endif()
+
 #! Examples API common libraries
 #[[
     The rule is to avoid creating additional libraries as part of the examples' common
@@ -24,7 +71,8 @@ target_precompile_headers(${LIB_NAME} PUBLIC "${CMAKE_CURRENT_SOURCE_DIR}/includ
     but If you have a good reason to create library because you cannot make it header only 
     AND you *can REUSE* the examples' PCH then go ahead anyway and put it under `src/nbl/examples`, 
     otherwise keep it header only - a good example would be to use our embedded-whatever-you-want tool
-    which does create library but can reuse example's PCH
+    which does create library but can reuse example's PCH (see NblExtExamplesAPIBuiltinsSource 
+    and NblExtExamplesAPIBuiltinsInclude targets)
 ]]
 
 #! NOTE: as I write it we don't have any targets there yet
@@ -35,7 +83,7 @@ list(REMOVE_ITEM TARGETS ${LIB_NAME})
 
 # the Examples API proxy library CMake target name
 #[[
-    this one gets linked to each executable automatically
+    this one gets linked to each executable automatically with its interface libraries
 ]]
 set(NBL_EXAMPLES_API_TARGET ${LIB_NAME} PARENT_SCOPE)
 
@@ -45,4 +93,6 @@ set(NBL_EXAMPLES_API_TARGET ${LIB_NAME} PARENT_SCOPE)
     then you must target_link_libraries() the lib you want as we 
     don't link all those libraries to each executable automatically
 ]]
-set(NBL_EXAMPLES_API_LIBRARIES ${TARGETS} PARENT_SCOPE)
\ No newline at end of file
+set(NBL_EXAMPLES_API_LIBRARIES ${TARGETS} PARENT_SCOPE)
+
+NBL_ADJUST_FOLDERS(common)
\ No newline at end of file
diff --git a/common/include/nbl/examples/common/BuiltinResourcesApplication.hpp b/common/include/nbl/examples/common/BuiltinResourcesApplication.hpp
index aa1949ecd..c32bbc3ea 100644
--- a/common/include/nbl/examples/common/BuiltinResourcesApplication.hpp
+++ b/common/include/nbl/examples/common/BuiltinResourcesApplication.hpp
@@ -9,9 +9,9 @@
 #include "nbl/application_templates/MonoAssetManagerApplication.hpp"
 
 #ifdef NBL_EMBED_BUILTIN_RESOURCES
-// TODO: the include/header `nbl/examples` archive
-// TODO: the source `nbl/examples` archive
-// TODO: the build `nbl/examples` archive
+	#include "nbl/builtin/examples/include/CArchive.h"
+	#include "nbl/builtin/examples/src/CArchive.h"
+	// TODO: the build `nbl/examples` archive
 #if __has_include("nbl/this_example/builtin/CArchive.h")
 	#include "nbl/this_example/builtin/CArchive.h"
 #endif
@@ -42,7 +42,10 @@ class BuiltinResourcesApplication : public virtual application_templates::MonoAs
 
 			smart_refctd_ptr<system::IFileArchive> examplesHeaderArch,examplesSourceArch,examplesBuildArch,thisExampleArch;
 		#ifdef NBL_EMBED_BUILTIN_RESOURCES
-// TODO: the 3 examples archives
+			examplesHeaderArch = core::make_smart_refctd_ptr<nbl::builtin::examples::include::CArchive>(smart_refctd_ptr(m_logger));
+			examplesSourceArch = core::make_smart_refctd_ptr<nbl::builtin::examples::src::CArchive>(smart_refctd_ptr(m_logger));
+			// TODO: the build archive
+
 			#ifdef _NBL_THIS_EXAMPLE_BUILTIN_C_ARCHIVE_H_
 				thisExampleArch = make_smart_refctd_ptr<nbl::this_example::builtin::CArchive>(smart_refctd_ptr(m_logger));
 			#endif

From 7ec6846d21be7893cf169703b0f7406e90bb8680 Mon Sep 17 00:00:00 2001
From: devsh <devsh@devsh.eu>
Date: Mon, 23 Jun 2025 17:13:04 +0200
Subject: [PATCH 419/529] make CSwapchainFramebuffersAndDepth support runtime
 depth buffer resignation

---
 .../common/CSwapchainFramebuffersAndDepth.hpp | 57 +++++++++++--------
 1 file changed, 32 insertions(+), 25 deletions(-)

diff --git a/common/include/nbl/examples/common/CSwapchainFramebuffersAndDepth.hpp b/common/include/nbl/examples/common/CSwapchainFramebuffersAndDepth.hpp
index ef88fb325..c7d780fdf 100644
--- a/common/include/nbl/examples/common/CSwapchainFramebuffersAndDepth.hpp
+++ b/common/include/nbl/examples/common/CSwapchainFramebuffersAndDepth.hpp
@@ -18,6 +18,10 @@ class CSwapchainFramebuffersAndDepth final : public video::CDefaultSwapchainFram
 		template<typename... Args>
 		inline CSwapchainFramebuffersAndDepth(video::ILogicalDevice* device, const asset::E_FORMAT _desiredDepthFormat, Args&&... args) : base_t(device,std::forward<Args>(args)...)
 		{
+			// user didn't want any depth
+			if (_desiredDepthFormat==asset::EF_UNKNOWN)
+				return;
+
 			using namespace nbl::asset;
 			using namespace nbl::video;
 			const IPhysicalDevice::SImageFormatPromotionRequest req = {
@@ -55,32 +59,34 @@ class CSwapchainFramebuffersAndDepth final : public video::CDefaultSwapchainFram
 		{
 			using namespace nbl::asset;
 			using namespace nbl::video;
-			// DOCS: why are we not using `m_device` here? any particular reason?
-			auto device = const_cast<ILogicalDevice*>(m_renderpass->getOriginDevice());
-
-			const auto depthFormat = m_renderpass->getCreationParameters().depthStencilAttachments[0].format;
-			const auto& sharedParams = getSwapchain()->getCreationParameters().sharedParams;
-			auto image = device->createImage({ IImage::SCreationParams{
-				.type = IGPUImage::ET_2D,
-				.samples = IGPUImage::ESCF_1_BIT,
-				.format = depthFormat,
-				.extent = {sharedParams.width,sharedParams.height,1},
-				.mipLevels = 1,
-				.arrayLayers = 1,
-				.depthUsage = IGPUImage::EUF_RENDER_ATTACHMENT_BIT
-			} });
+			if (m_depthFormat!=asset::EF_UNKNOWN)
+			{
+				// DOCS: why are we not using `m_device` here? any particular reason?
+				auto device = const_cast<ILogicalDevice*>(m_renderpass->getOriginDevice());
 
-			device->allocate(image->getMemoryReqs(), image.get());
+				const auto depthFormat = m_renderpass->getCreationParameters().depthStencilAttachments[0].format;
+				const auto& sharedParams = getSwapchain()->getCreationParameters().sharedParams;
+				auto image = device->createImage({ IImage::SCreationParams{
+					.type = IGPUImage::ET_2D,
+					.samples = IGPUImage::ESCF_1_BIT,
+					.format = depthFormat,
+					.extent = {sharedParams.width,sharedParams.height,1},
+					.mipLevels = 1,
+					.arrayLayers = 1,
+					.depthUsage = IGPUImage::EUF_RENDER_ATTACHMENT_BIT
+				} });
 
-			m_depthBuffer = device->createImageView({
-				.flags = IGPUImageView::ECF_NONE,
-				.subUsages = IGPUImage::EUF_RENDER_ATTACHMENT_BIT,
-				.image = std::move(image),
-				.viewType = IGPUImageView::ET_2D,
-				.format = depthFormat,
-				.subresourceRange = {IGPUImage::EAF_DEPTH_BIT,0,1,0,1}
-				});
+				device->allocate(image->getMemoryReqs(), image.get());
 
+				m_depthBuffer = device->createImageView({
+					.flags = IGPUImageView::ECF_NONE,
+					.subUsages = IGPUImage::EUF_RENDER_ATTACHMENT_BIT,
+					.image = std::move(image),
+					.viewType = IGPUImageView::ET_2D,
+					.format = depthFormat,
+					.subresourceRange = {IGPUImage::EAF_DEPTH_BIT,0,1,0,1}
+					});
+			}
 			const auto retval = base_t::onCreateSwapchain_impl(qFam);
 			m_depthBuffer = nullptr;
 			return retval;
@@ -88,11 +94,12 @@ class CSwapchainFramebuffersAndDepth final : public video::CDefaultSwapchainFram
 
 		inline core::smart_refctd_ptr<video::IGPUFramebuffer> createFramebuffer(video::IGPUFramebuffer::SCreationParams&& params) override
 		{
-			params.depthStencilAttachments = &m_depthBuffer.get();
+			if (m_depthBuffer)
+				params.depthStencilAttachments = &m_depthBuffer.get();
 			return m_device->createFramebuffer(std::move(params));
 		}
 
-		asset::E_FORMAT m_depthFormat;
+		asset::E_FORMAT m_depthFormat = asset::EF_UNKNOWN;
 		// only used to pass a parameter from `onCreateSwapchain_impl` to `createFramebuffer`
 		core::smart_refctd_ptr<video::IGPUImageView> m_depthBuffer;
 };

From 64e7b26f196e7636d2b2aa9c7f09275042d1a82b Mon Sep 17 00:00:00 2001
From: devsh <devsh@devsh.eu>
Date: Mon, 23 Jun 2025 17:34:41 +0200
Subject: [PATCH 420/529] remake example 61, not thoroughly tested, some TODOs
 remain for @AnastaZIuk

---
 09_GeometryCreator/main.cpp |   11 +-
 61_UI/include/transform.hpp |   22 +-
 61_UI/main.cpp              | 1350 ++++++++++++++++++-----------------
 3 files changed, 721 insertions(+), 662 deletions(-)

diff --git a/09_GeometryCreator/main.cpp b/09_GeometryCreator/main.cpp
index 38daebaa5..1a959f7a0 100644
--- a/09_GeometryCreator/main.cpp
+++ b/09_GeometryCreator/main.cpp
@@ -16,13 +16,6 @@ class GeometryCreatorApp final : public MonoWindowApplication, public BuiltinRes
 			: IApplicationFramework(_localInputCWD, _localOutputCWD, _sharedInputCWD, _sharedOutputCWD),
 			device_base_t({1280,720}, EF_D16_UNORM, _localInputCWD, _localOutputCWD, _sharedInputCWD, _sharedOutputCWD) {}
 
-		SPhysicalDeviceFeatures getRequiredDeviceFeatures() const override
-		{
-			auto retval = device_base_t::getRequiredDeviceFeatures();
-			retval.geometryShader = true;
-			return retval;
-		}
-
 		inline bool onAppInitialized(smart_refctd_ptr<ISystem>&& system) override
 		{
 			if (!asset_base_t::onAppInitialized(smart_refctd_ptr(system)))
@@ -100,7 +93,6 @@ class GeometryCreatorApp final : public MonoWindowApplication, public BuiltinRes
 				camera.endInputProcessing(nextPresentationTimestamp);
 			}
 
-			auto* queue = getGraphicsQueue();
 
 			asset::SViewport viewport;
 			{
@@ -155,6 +147,7 @@ class GeometryCreatorApp final : public MonoWindowApplication, public BuiltinRes
  			m_renderer->render(cb,viewParams);
 
 			cb->endRenderPass();
+			cb->endDebugMarker();
 			cb->end();
 
 			IQueue::SSubmitInfo::SSemaphoreInfo retval =
@@ -183,7 +176,7 @@ class GeometryCreatorApp final : public MonoWindowApplication, public BuiltinRes
 				}
 			};
 
-			if (queue->submit(infos) != IQueue::RESULT::SUCCESS)
+			if (getGraphicsQueue()->submit(infos) != IQueue::RESULT::SUCCESS)
 			{
 				retval.semaphore = nullptr; // so that we don't wait on semaphore that will never signal
 				m_realFrameIx--;
diff --git a/61_UI/include/transform.hpp b/61_UI/include/transform.hpp
index 88a78f751..fb1672c2f 100644
--- a/61_UI/include/transform.hpp
+++ b/61_UI/include/transform.hpp
@@ -1,20 +1,23 @@
-#ifndef __NBL_THIS_EXAMPLE_TRANSFORM_H_INCLUDED__
-#define __NBL_THIS_EXAMPLE_TRANSFORM_H_INCLUDED__
+#ifndef _NBL_THIS_EXAMPLE_TRANSFORM_H_INCLUDED_
+#define _NBL_THIS_EXAMPLE_TRANSFORM_H_INCLUDED_
+
 
 #include "nbl/ui/ICursorControl.h"
+
 #include "nbl/ext/ImGui/ImGui.h"
+
 #include "imgui/imgui_internal.h"
 #include "imguizmo/ImGuizmo.h"
 
-static constexpr inline auto OfflineSceneTextureIx = 1u;
 
 struct TransformRequestParams
 {
-	bool useWindow = true, editTransformDecomposition = false, enableViewManipulate = false;
 	float camDistance = 8.f;
+	uint8_t sceneTexDescIx = ~0;
+	bool useWindow = true, editTransformDecomposition = false, enableViewManipulate = false;
 };
 
-void EditTransform(float* cameraView, const float* cameraProjection, float* matrix, const TransformRequestParams& params)
+nbl::hlsl::uint16_t2 EditTransform(float* cameraView, const float* cameraProjection, float* matrix, const TransformRequestParams& params)
 {
 	static ImGuizmo::OPERATION mCurrentGizmoOperation(ImGuizmo::TRANSLATE);
 	static ImGuizmo::MODE mCurrentGizmoMode(ImGuizmo::LOCAL);
@@ -99,11 +102,12 @@ void EditTransform(float* cameraView, const float* cameraProjection, float* matr
 		rendered is aligned to our texture scene using 
         imgui  "cursor" screen positions
 	*/
-
+// TODO: this shouldn't be handled here I think
 	SImResourceInfo info;
-	info.textureID = OfflineSceneTextureIx;
+	info.textureID = params.sceneTexDescIx;
 	info.samplerIx = (uint16_t)nbl::ext::imgui::UI::DefaultSamplerIx::USER;
 
+	nbl::hlsl::uint16_t2 retval;
 	if (params.useWindow)
 	{
 		ImGui::SetNextWindowSize(ImVec2(800, 400), ImGuiCond_Appearing);
@@ -118,6 +122,7 @@ void EditTransform(float* cameraView, const float* cameraProjection, float* matr
 
 		ImGui::Image(info, contentRegionSize);
 		ImGuizmo::SetRect(cursorPos.x, cursorPos.y, contentRegionSize.x, contentRegionSize.y);
+		retval = {contentRegionSize.x,contentRegionSize.y};
 
 		viewManipulateRight = cursorPos.x + contentRegionSize.x;
 		viewManipulateTop = cursorPos.y;
@@ -137,6 +142,7 @@ void EditTransform(float* cameraView, const float* cameraProjection, float* matr
 
 		ImGui::Image(info, contentRegionSize);
 		ImGuizmo::SetRect(cursorPos.x, cursorPos.y, contentRegionSize.x, contentRegionSize.y);
+		retval = {contentRegionSize.x,contentRegionSize.y};
 
 		viewManipulateRight = cursorPos.x + contentRegionSize.x;
 		viewManipulateTop = cursorPos.y;
@@ -149,6 +155,8 @@ void EditTransform(float* cameraView, const float* cameraProjection, float* matr
 
 	ImGui::End();
 	ImGui::PopStyleColor();
+
+	return retval;
 }
 
 #endif // __NBL_THIS_EXAMPLE_TRANSFORM_H_INCLUDED__
\ No newline at end of file
diff --git a/61_UI/main.cpp b/61_UI/main.cpp
index 17d028f29..d4f21f2e8 100644
--- a/61_UI/main.cpp
+++ b/61_UI/main.cpp
@@ -5,790 +5,848 @@
 #include "common.hpp"
 
 /*
-	Renders scene texture to an offline
-	framebuffer which color attachment
-	is then sampled into a imgui window.
+Renders scene texture to an offscreen framebuffer whose color attachment is then sampled into a imgui window.
 
-	Written with Nabla, it's UI extension
-	and got integrated with ImGuizmo to 
-	handle scene's object translations.
+Written with Nabla's UI extension and got integrated with ImGuizmo to handle scene's object translations.
 */
-
-class UISampleApp final : public SimpleWindowedApplication
+class UISampleApp final : public MonoWindowApplication, public BuiltinResourcesApplication
 {
-		using device_base_t = SimpleWindowedApplication;
-
-		_NBL_STATIC_INLINE_CONSTEXPR uint32_t WIN_W = 1280, WIN_H = 720;
+		using device_base_t = MonoWindowApplication;
+		using asset_base_t = BuiltinResourcesApplication;
 
 	public:
 		inline UISampleApp(const path& _localInputCWD, const path& _localOutputCWD, const path& _sharedInputCWD, const path& _sharedOutputCWD) 
-			: IApplicationFramework(_localInputCWD, _localOutputCWD, _sharedInputCWD, _sharedOutputCWD) {}
-
-		inline core::vector<video::SPhysicalDeviceFilter::SurfaceCompatibility> getSurfaces() const override
-		{
-			if (!m_surface)
-			{
-				{
-					auto windowCallback = core::make_smart_refctd_ptr<CEventCallback>(smart_refctd_ptr(m_inputSystem), smart_refctd_ptr(m_logger));
-					IWindow::SCreationParams params = {};
-					params.callback = core::make_smart_refctd_ptr<nbl::video::ISimpleManagedSurface::ICallback>();
-					params.width = WIN_W;
-					params.height = WIN_H;
-					params.x = 32;
-					params.y = 32;
-					params.flags = ui::IWindow::ECF_HIDDEN | IWindow::ECF_BORDERLESS | IWindow::ECF_RESIZABLE;
-					params.windowCaption = "UISampleApp";
-					params.callback = windowCallback;
-					const_cast<std::remove_const_t<decltype(m_window)>&>(m_window) = m_winMgr->createWindow(std::move(params));
-				}
-
-				auto surface = CSurfaceVulkanWin32::create(smart_refctd_ptr(m_api), smart_refctd_ptr_static_cast<IWindowWin32>(m_window));
-				const_cast<std::remove_const_t<decltype(m_surface)>&>(m_surface) = nbl::video::CSimpleResizeSurface<nbl::video::CDefaultSwapchainFramebuffers>::create(std::move(surface));
-			}
-
-			if (m_surface)
-				return { {m_surface->getSurface()/*,EQF_NONE*/} };
-
-			return {};
-		}
+			: IApplicationFramework(_localInputCWD, _localOutputCWD, _sharedInputCWD, _sharedOutputCWD),
+			device_base_t({1280,720}, EF_UNKNOWN, _localInputCWD, _localOutputCWD, _sharedInputCWD, _sharedOutputCWD) {}
 
 		inline bool onAppInitialized(smart_refctd_ptr<ISystem>&& system) override
 		{
-			m_inputSystem = make_smart_refctd_ptr<InputSystem>(logger_opt_smart_ptr(smart_refctd_ptr(m_logger)));
-
+			if (!asset_base_t::onAppInitialized(smart_refctd_ptr(system)))
+				return false;
 			if (!device_base_t::onAppInitialized(smart_refctd_ptr(system)))
 				return false;
 
-			m_assetManager = make_smart_refctd_ptr<nbl::asset::IAssetManager>(smart_refctd_ptr(m_system));
-
 			m_semaphore = m_device->createSemaphore(m_realFrameIx);
 			if (!m_semaphore)
 				return logFail("Failed to Create a Semaphore!");
 
-			ISwapchain::SCreationParams swapchainParams = { .surface = m_surface->getSurface() };
-			if (!swapchainParams.deduceFormat(m_physicalDevice))
-				return logFail("Could not choose a Surface Format for the Swapchain!");
-
-			const static IGPURenderpass::SCreationParams::SSubpassDependency dependencies[] = 
+			auto pool = m_device->createCommandPool(getGraphicsQueue()->getFamilyIndex(),IGPUCommandPool::CREATE_FLAGS::RESET_COMMAND_BUFFER_BIT);
+			for (auto i = 0u; i<MaxFramesInFlight; i++)
 			{
-				{
-					.srcSubpass = IGPURenderpass::SCreationParams::SSubpassDependency::External,
-					.dstSubpass = 0,
-					.memoryBarrier = 
-					{
-						.srcStageMask = asset::PIPELINE_STAGE_FLAGS::COPY_BIT,
-						.srcAccessMask = asset::ACCESS_FLAGS::TRANSFER_WRITE_BIT,
-						.dstStageMask = asset::PIPELINE_STAGE_FLAGS::COLOR_ATTACHMENT_OUTPUT_BIT,
-						.dstAccessMask = asset::ACCESS_FLAGS::COLOR_ATTACHMENT_WRITE_BIT
-					}
-				},
-				{
-					.srcSubpass = 0,
-					.dstSubpass = IGPURenderpass::SCreationParams::SSubpassDependency::External,
-					.memoryBarrier = 
-					{
-						.srcStageMask = asset::PIPELINE_STAGE_FLAGS::COLOR_ATTACHMENT_OUTPUT_BIT,
-						.srcAccessMask = asset::ACCESS_FLAGS::COLOR_ATTACHMENT_WRITE_BIT
-					}
-				},
-				IGPURenderpass::SCreationParams::DependenciesEnd
-			};
-
-			auto scResources = std::make_unique<CDefaultSwapchainFramebuffers>(m_device.get(), swapchainParams.surfaceFormat.format, dependencies);
-			auto* renderpass = scResources->getRenderpass();
-			
-			if (!renderpass)
-				return logFail("Failed to create Renderpass!");
-
-			auto gQueue = getGraphicsQueue();
-			if (!m_surface || !m_surface->init(gQueue, std::move(scResources), swapchainParams.sharedParams))
-				return logFail("Could not create Window & Surface or initialize the Surface!");
-
-			m_cmdPool = m_device->createCommandPool(gQueue->getFamilyIndex(), IGPUCommandPool::CREATE_FLAGS::RESET_COMMAND_BUFFER_BIT);
-	
-			for (auto i = 0u; i < MaxFramesInFlight; i++)
-			{
-				if (!m_cmdPool)
+				if (!pool)
 					return logFail("Couldn't create Command Pool!");
-				if (!m_cmdPool->createCommandBuffers(IGPUCommandPool::BUFFER_LEVEL::PRIMARY, { m_cmdBufs.data() + i, 1 }))
+				if (!pool->createCommandBuffers(IGPUCommandPool::BUFFER_LEVEL::PRIMARY,{m_cmdBufs.data()+i,1}))
 					return logFail("Couldn't create Command Buffer!");
 			}
 			
-			//pass.scene = CScene::create<CScene::CreateResourcesDirectlyWithDevice>(smart_refctd_ptr(m_utils), smart_refctd_ptr(m_logger), gQueue, geometry);
-			pass.scene = CScene::create<CScene::CreateResourcesWithAssetConverter>(smart_refctd_ptr(m_utils), smart_refctd_ptr(m_logger), gQueue, geometry);
-
-			nbl::ext::imgui::UI::SCreationParameters params;
-
-			params.resources.texturesInfo = { .setIx = 0u, .bindingIx = 0u };
-			params.resources.samplersInfo = { .setIx = 0u, .bindingIx = 1u };
-			params.assetManager = m_assetManager;
-			params.pipelineCache = nullptr;
-			params.pipelineLayout = nbl::ext::imgui::UI::createDefaultPipelineLayout(m_utils->getLogicalDevice(), params.resources.texturesInfo, params.resources.samplersInfo, TexturesAmount);
-			params.renderpass = smart_refctd_ptr<IGPURenderpass>(renderpass);
-			params.streamingBuffer = nullptr;
-			params.subpassIx = 0u;
-			params.transfer = getTransferUpQueue();
-			params.utilities = m_utils;
-			{
-				pass.ui.manager = nbl::ext::imgui::UI::create(std::move(params));
-
-				if (!pass.ui.manager)
-					return false;
-
-				// note that we use default layout provided by our extension, but you are free to create your own by filling nbl::ext::imgui::UI::S_CREATION_PARAMETERS::resources
-				const auto* descriptorSetLayout = pass.ui.manager->getPipeline()->getLayout()->getDescriptorSetLayout(0u);
-				const auto& params = pass.ui.manager->getCreationParameters();
-
-				IDescriptorPool::SCreateInfo descriptorPoolInfo = {};
-				descriptorPoolInfo.maxDescriptorCount[static_cast<uint32_t>(asset::IDescriptor::E_TYPE::ET_SAMPLER)] = (uint32_t)nbl::ext::imgui::UI::DefaultSamplerIx::COUNT;
-				descriptorPoolInfo.maxDescriptorCount[static_cast<uint32_t>(asset::IDescriptor::E_TYPE::ET_SAMPLED_IMAGE)] = TexturesAmount;
-				descriptorPoolInfo.maxSets = 1u;
-				descriptorPoolInfo.flags = IDescriptorPool::E_CREATE_FLAGS::ECF_UPDATE_AFTER_BIND_BIT;
-
-				m_descriptorSetPool = m_device->createDescriptorPool(std::move(descriptorPoolInfo));
-				assert(m_descriptorSetPool);
-
-				m_descriptorSetPool->createDescriptorSets(1u, &descriptorSetLayout, &pass.ui.descriptorSet);
-				assert(pass.ui.descriptorSet);
-			}
-			pass.ui.manager->registerListener([this]() -> void
+			const uint32_t addtionalBufferOwnershipFamilies[] = {getGraphicsQueue()->getFamilyIndex()};
+			// we want to use the vertex data through UTBs
+			using usage_f = IGPUBuffer::E_USAGE_FLAGS;
+			CAssetConverter::patch_t<asset::ICPUPolygonGeometry> patch = {};
+			patch.positionBufferUsages = usage_f::EUF_UNIFORM_TEXEL_BUFFER_BIT;
+			patch.indexBufferUsages = usage_f::EUF_INDEX_BUFFER_BIT;
+			patch.otherBufferUsages = usage_f::EUF_UNIFORM_TEXEL_BUFFER_BIT;
+			m_scene = CGeometryCreatorScene::create(
 				{
-					ImGuiIO& io = ImGui::GetIO();
-
-					camera.setProjectionMatrix([&]() 
-					{
-						static matrix4SIMD projection;
-
-						if (isPerspective)
-							if(isLH)
-								projection = matrix4SIMD::buildProjectionMatrixPerspectiveFovLH(core::radians(fov), io.DisplaySize.x / io.DisplaySize.y, zNear, zFar);
-							else
-								projection = matrix4SIMD::buildProjectionMatrixPerspectiveFovRH(core::radians(fov), io.DisplaySize.x / io.DisplaySize.y, zNear, zFar);
-						else
+					.transferQueue = getTransferUpQueue(),
+					.utilities = m_utils.get(),
+					.logger = m_logger.get(),
+					.addtionalBufferOwnershipFamilies = addtionalBufferOwnershipFamilies
+				},patch
+			);
+			
+			// for the scene drawing pass
+			{
+				IGPURenderpass::SCreationParams params = {};
+				const IGPURenderpass::SCreationParams::SDepthStencilAttachmentDescription depthAttachments[] = {
+					{{
 						{
-							float viewHeight = viewWidth * io.DisplaySize.y / io.DisplaySize.x;
-
-							if(isLH)
-								projection = matrix4SIMD::buildProjectionMatrixOrthoLH(viewWidth, viewHeight, zNear, zFar);
-							else
-								projection = matrix4SIMD::buildProjectionMatrixOrthoRH(viewWidth, viewHeight, zNear, zFar);
+							.format = sceneRenderDepthFormat,
+							.samples = IGPUImage::ESCF_1_BIT,
+							.mayAlias = false
+						},
+						/*.loadOp = */{IGPURenderpass::LOAD_OP::CLEAR},
+						/*.storeOp = */{IGPURenderpass::STORE_OP::STORE},
+						/*.initialLayout = */{IGPUImage::LAYOUT::UNDEFINED},
+						/*.finalLayout = */{IGPUImage::LAYOUT::ATTACHMENT_OPTIMAL}
+					}},
+					IGPURenderpass::SCreationParams::DepthStencilAttachmentsEnd
+				};
+				params.depthStencilAttachments = depthAttachments;
+				const IGPURenderpass::SCreationParams::SColorAttachmentDescription colorAttachments[] = {
+					{{
+						{
+							.format = finalSceneRenderFormat,
+							.samples = IGPUImage::E_SAMPLE_COUNT_FLAGS::ESCF_1_BIT,
+							.mayAlias = false
+						},
+						/*.loadOp = */IGPURenderpass::LOAD_OP::CLEAR,
+						/*.storeOp = */IGPURenderpass::STORE_OP::STORE,
+						/*.initialLayout = */IGPUImage::LAYOUT::UNDEFINED,
+						/*.finalLayout = */ IGPUImage::LAYOUT::READ_ONLY_OPTIMAL // ImGUI shall read
+					}},
+					IGPURenderpass::SCreationParams::ColorAttachmentsEnd
+				};
+				params.colorAttachments = colorAttachments;
+				IGPURenderpass::SCreationParams::SSubpassDescription subpasses[] = {
+					{},
+					IGPURenderpass::SCreationParams::SubpassesEnd
+				};
+				subpasses[0].colorAttachments[0] = {.render={.attachmentIndex=0,.layout=IGPUImage::LAYOUT::ATTACHMENT_OPTIMAL}};
+				params.subpasses = subpasses;
+				
+				const static IGPURenderpass::SCreationParams::SSubpassDependency dependencies[] = {
+					// wipe-transition of Color to ATTACHMENT_OPTIMAL and depth
+					{
+						.srcSubpass = IGPURenderpass::SCreationParams::SSubpassDependency::External,
+						.dstSubpass = 0,
+						.memoryBarrier = {
+							// last place where the depth can get modified in previous frame, `COLOR_ATTACHMENT_OUTPUT_BIT` is implicitly later
+							// while color is sampled by ImGUI
+							.srcStageMask = PIPELINE_STAGE_FLAGS::LATE_FRAGMENT_TESTS_BIT|PIPELINE_STAGE_FLAGS::FRAGMENT_SHADER_BIT,
+							// don't want any writes to be available, as we are clearing both attachments
+							.srcAccessMask = ACCESS_FLAGS::NONE,
+							// destination needs to wait as early as possible
+							// TODO: `COLOR_ATTACHMENT_OUTPUT_BIT` shouldn't be needed, because its a logically later stage, see TODO in `ECommonEnums.h`
+							.dstStageMask = PIPELINE_STAGE_FLAGS::EARLY_FRAGMENT_TESTS_BIT|PIPELINE_STAGE_FLAGS::COLOR_ATTACHMENT_OUTPUT_BIT,
+							// because depth and color get cleared first no read mask
+							.dstAccessMask = ACCESS_FLAGS::DEPTH_STENCIL_ATTACHMENT_WRITE_BIT|ACCESS_FLAGS::COLOR_ATTACHMENT_WRITE_BIT
 						}
+						// leave view offsets and flags default
+					},
+					{
+						.srcSubpass = 0,
+						.dstSubpass = IGPURenderpass::SCreationParams::SSubpassDependency::External,
+						.memoryBarrier = {
+							// last place where the color can get modified, depth is implicitly earlier
+							.srcStageMask = PIPELINE_STAGE_FLAGS::COLOR_ATTACHMENT_OUTPUT_BIT,
+							// only write ops, reads can't be made available, also won't be using depth so don't care about it being visible to anyone else
+							.srcAccessMask = ACCESS_FLAGS::COLOR_ATTACHMENT_WRITE_BIT,
+							// the ImGUI will sample the color, then next frame we overwrite both attachments
+							.dstStageMask = PIPELINE_STAGE_FLAGS::FRAGMENT_SHADER_BIT|PIPELINE_STAGE_FLAGS::EARLY_FRAGMENT_TESTS_BIT,
+							// but we only care about the availability-visibility chain between renderpass and imgui 
+							.dstAccessMask = ACCESS_FLAGS::SAMPLED_READ_BIT
+						}
+						// leave view offsets and flags default
+					},
+					IGPURenderpass::SCreationParams::DependenciesEnd
+				};
+				params.dependencies = {};
+				m_renderpass = m_device->createRenderpass(std::move(params));
+				if (!m_renderpass)
+					return logFail("Failed to create Scene Renderpass!");
+			}
+			m_renderer = CSimpleDebugRenderer::create(m_assetMgr.get(),m_renderpass.get(),0,m_scene.get());
 
-						return projection;
-					}());
-
-					ImGuizmo::SetOrthographic(false);
-					ImGuizmo::BeginFrame();
-
-					ImGui::SetNextWindowPos(ImVec2(1024, 100), ImGuiCond_Appearing);
-					ImGui::SetNextWindowSize(ImVec2(256, 256), ImGuiCond_Appearing);
-
-					// create a window and insert the inspector
-					ImGui::SetNextWindowPos(ImVec2(10, 10), ImGuiCond_Appearing);
-					ImGui::SetNextWindowSize(ImVec2(320, 340), ImGuiCond_Appearing);
-					ImGui::Begin("Editor");
-
-					if (ImGui::RadioButton("Full view", !transformParams.useWindow))
-						transformParams.useWindow = false;
-
-					ImGui::SameLine();
-
-					if (ImGui::RadioButton("Window", transformParams.useWindow))
-						transformParams.useWindow = true;
-
-					ImGui::Text("Camera");
-					bool viewDirty = false;
-
-					if (ImGui::RadioButton("LH", isLH))
-						isLH = true;
-
-					ImGui::SameLine();
-
-					if (ImGui::RadioButton("RH", !isLH))
-						isLH = false;
-
-					if (ImGui::RadioButton("Perspective", isPerspective))
-						isPerspective = true;
+			// Create ImGUI
+			{
+				ext::imgui::UI::SCreationParameters params = {};
+				params.resources.texturesInfo = {.setIx=0u,.bindingIx=TexturesImGUIBindingIndex};
+				params.resources.samplersInfo = {.setIx=0u,.bindingIx=1u};
+				params.utilities = m_utils;
+				params.transfer = getTransferUpQueue();
+				params.pipelineLayout = ext::imgui::UI::createDefaultPipelineLayout(m_utils->getLogicalDevice(),params.resources.texturesInfo,params.resources.samplersInfo,MaxImGUITextures);
+				params.assetManager = make_smart_refctd_ptr<IAssetManager>(smart_refctd_ptr(m_system));
+				params.renderpass = m_renderpass;
+				params.subpassIx = 0u;
+				params.pipelineCache = nullptr;
+				interface.imGUI = ext::imgui::UI::create(std::move(params));
+				if (!interface.imGUI)
+					return logFail("Failed to create `nbl::ext::imgui::UI` class");
+			}
 
-					ImGui::SameLine();
+			// create rest of User Interface
+			{
+				auto* imgui = interface.imGUI.get();
+				// create the suballocated descriptor set
+				{
+					// note that we use default layout provided by our extension, but you are free to create your own by filling ext::imgui::UI::S_CREATION_PARAMETERS::resources
+					const auto* layout = imgui->getPipeline()->getLayout()->getDescriptorSetLayout(0u);
+					auto pool = m_device->createDescriptorPoolForDSLayouts(IDescriptorPool::E_CREATE_FLAGS::ECF_UPDATE_AFTER_BIND_BIT,{&layout,1});
+					auto ds = pool->createDescriptorSet(smart_refctd_ptr<const IGPUDescriptorSetLayout>(layout));
+					interface.subAllocDS = make_smart_refctd_ptr<SubAllocatedDescriptorSet>(std::move(ds));
+					if (!interface.subAllocDS)
+						return logFail("Failed to create the descriptor set");
+					// make sure Texture Atlas slot is taken for eternity
+					{
+						auto dummy = SubAllocatedDescriptorSet::invalid_value;
+						interface.subAllocDS->multi_allocate(0,1,&dummy);
+						assert(dummy==ext::imgui::UI::FontAtlasTexId);
+					}
+					// write constant descriptors, note we don't create info & write pair for the samplers because UI extension's are immutable and baked into DS layout
+					IGPUDescriptorSet::SDescriptorInfo info = {};
+					info.desc = smart_refctd_ptr<nbl::video::IGPUImageView>(interface.imGUI->getFontAtlasView());
+					info.info.image.imageLayout = IImage::LAYOUT::READ_ONLY_OPTIMAL;
+					const IGPUDescriptorSet::SWriteDescriptorSet write = {
+						.dstSet = interface.subAllocDS->getDescriptorSet(),
+						.binding = TexturesImGUIBindingIndex,
+						.arrayElement = ext::imgui::UI::FontAtlasTexId,
+						.count = 1,
+						.info = &info
+					};
+					if (!m_device->updateDescriptorSets({&write,1},{}))
+						return logFail("Failed to write the descriptor set");
+				}
+				imgui->registerListener([this](){interface();});
+			}
 
-					if (ImGui::RadioButton("Orthographic", !isPerspective))
-						isPerspective = false;
+			interface.camera.mapKeysToArrows();
 
-					ImGui::Checkbox("Enable \"view manipulate\"", &transformParams.enableViewManipulate);
-					ImGui::Checkbox("Enable camera movement", &move);
-					ImGui::SliderFloat("Move speed", &moveSpeed, 0.1f, 10.f);
-					ImGui::SliderFloat("Rotate speed", &rotateSpeed, 0.1f, 10.f);
+			onAppInitializedFinish();
+			return true;
+		}
 
-					// ImGui::Checkbox("Flip Gizmo's Y axis", &flipGizmoY); // let's not expose it to be changed in UI but keep the logic in case
+		inline void beginRenderpass(IGPUCommandBuffer* cb, const IGPUCommandBuffer::SRenderpassBeginInfo& info)
+		{
+			cb->beginRenderPass(info,IGPUCommandBuffer::SUBPASS_CONTENTS::INLINE);
+			cb->setScissor(0,1,&info.renderArea);
+			const SViewport viewport = {
+				.x = 0,
+				.y = 0,
+				.width = static_cast<float>(info.renderArea.extent.width),
+				.height = static_cast<float>(info.renderArea.extent.height)
+			};
+			cb->setViewport(0u,1u,&viewport);
+		}
 
-					if (isPerspective)
-						ImGui::SliderFloat("Fov", &fov, 20.f, 150.f);
-					else
-						ImGui::SliderFloat("Ortho width", &viewWidth, 1, 20);
+		inline IQueue::SSubmitInfo::SSemaphoreInfo renderFrame(const std::chrono::microseconds nextPresentationTimestamp) override
+		{
+			// CPU events
+			update(nextPresentationTimestamp);
 
-					ImGui::SliderFloat("zNear", &zNear, 0.1f, 100.f);
-					ImGui::SliderFloat("zFar", &zFar, 110.f, 10000.f);
+			const auto& virtualWindowRes = interface.sceneResolution;
+			if (!m_framebuffer || m_framebuffer->getCreationParameters().width!=virtualWindowRes[0] || m_framebuffer->getCreationParameters().height!=virtualWindowRes[1])
+				recreateFramebuffer(virtualWindowRes);
 
-					viewDirty |= ImGui::SliderFloat("Distance", &transformParams.camDistance, 1.f, 69.f);
+			//
+			const auto resourceIx = m_realFrameIx % MaxFramesInFlight;
 
-					if (viewDirty || firstFrame)
+			auto* const cb = m_cmdBufs.data()[resourceIx].get();
+			cb->reset(IGPUCommandBuffer::RESET_FLAGS::RELEASE_RESOURCES_BIT);
+			cb->begin(IGPUCommandBuffer::USAGE::ONE_TIME_SUBMIT_BIT);
+			// clear to black for both things
+			const IGPUCommandBuffer::SClearColorValue clearValue = { .float32 = {0.f,0.f,0.f,1.f} };
+			{
+				cb->beginDebugMarker("UISampleApp Scene Frame");
+				{
+					const IGPUCommandBuffer::SRenderpassBeginInfo renderpassInfo =
 					{
-						core::vectorSIMDf cameraPosition(cosf(camYAngle)* cosf(camXAngle)* transformParams.camDistance, sinf(camXAngle)* transformParams.camDistance, sinf(camYAngle)* cosf(camXAngle)* transformParams.camDistance);
-						core::vectorSIMDf cameraTarget(0.f, 0.f, 0.f);
-						const static core::vectorSIMDf up(0.f, 1.f, 0.f);
-
-						camera.setPosition(cameraPosition);
-						camera.setTarget(cameraTarget);
-						camera.setBackupUpVector(up);
-
-						camera.recomputeViewMatrix();
-
-						firstFrame = false;
+						.framebuffer = m_framebuffer.get(),
+						.colorClearValues = &clearValue,
+						.depthStencilClearValues = nullptr,
+						.renderArea = {
+							.offset = {0,0},
+							.extent = {virtualWindowRes[0],virtualWindowRes[1]}
+						}
+					};
+					beginRenderpass(cb,renderpassInfo);
+				}
+				// draw scene
+				{
+					float32_t3x4 viewMatrix;
+					float32_t4x4 viewProjMatrix;
+					// TODO: get rid of legacy matrices
+					{
+						const auto& camera = interface.camera;
+						memcpy(&viewMatrix,camera.getViewMatrix().pointer(),sizeof(viewMatrix));
+						memcpy(&viewProjMatrix,camera.getConcatenatedMatrix().pointer(),sizeof(viewProjMatrix));
 					}
+					const auto viewParams = CSimpleDebugRenderer::SViewParams(viewMatrix,viewProjMatrix);
 
-					ImGui::Text("X: %f Y: %f", io.MousePos.x, io.MousePos.y);
-					if (ImGuizmo::IsUsing())
+					// tear down scene every frame
+					m_renderer->m_instances[0].packedGeo = m_renderer->getInitParams().geoms.data()+interface.gcIndex;
+ 					m_renderer->render(cb,viewParams);
+				}
+				cb->endRenderPass();
+				cb->endDebugMarker();
+			}
+			{
+				cb->beginDebugMarker("UISampleApp IMGUI Frame");
+				{
+					auto scRes = static_cast<CDefaultSwapchainFramebuffers*>(m_surface->getSwapchainResources());
+					const IGPUCommandBuffer::SRenderpassBeginInfo renderpassInfo =
 					{
-						ImGui::Text("Using gizmo");
-					}
-					else
+						.framebuffer = scRes->getFramebuffer(device_base_t::getCurrentAcquire().imageIndex),
+						.colorClearValues = &clearValue,
+						.depthStencilClearValues = nullptr,
+						.renderArea = {
+							.offset = {0,0},
+							.extent = {m_window->getWidth(),m_window->getHeight()}
+						}
+					};
+					beginRenderpass(cb,renderpassInfo);
+				}
+				// draw ImGUI
+				{
+					auto* imgui = interface.imGUI.get();
+					auto* pipeline = imgui->getPipeline();
+					cb->bindGraphicsPipeline(pipeline);
+					// note that we use default UI pipeline layout where uiParams.resources.textures.setIx == uiParams.resources.samplers.setIx
+					const auto* ds = interface.subAllocDS->getDescriptorSet();
+					cb->bindDescriptorSets(EPBP_GRAPHICS,pipeline->getLayout(),imgui->getCreationParameters().resources.texturesInfo.setIx,1u,&ds);
+					// a timepoint in the future to release streaming resources for geometry
+					const ISemaphore::SWaitInfo drawFinished = {.semaphore=m_semaphore.get(),.value=m_realFrameIx+1u};
+					if (!imgui->render(cb,drawFinished))
 					{
-						ImGui::Text(ImGuizmo::IsOver() ? "Over gizmo" : "");
-						ImGui::SameLine();
-						ImGui::Text(ImGuizmo::IsOver(ImGuizmo::TRANSLATE) ? "Over translate gizmo" : "");
-						ImGui::SameLine();
-						ImGui::Text(ImGuizmo::IsOver(ImGuizmo::ROTATE) ? "Over rotate gizmo" : "");
-						ImGui::SameLine();
-						ImGui::Text(ImGuizmo::IsOver(ImGuizmo::SCALE) ? "Over scale gizmo" : "");
+						m_logger->log("TODO: need to present acquired image before bailing because its already acquired.",ILogger::ELL_ERROR);
+						return {};
 					}
-					ImGui::Separator();
-
-					/*
-					* ImGuizmo expects view & perspective matrix to be column major both with 4x4 layout
-					* and Nabla uses row major matricies - 3x4 matrix for view & 4x4 for projection
-
-					- VIEW:
-
-						ImGuizmo
-
-						|     X[0]          Y[0]          Z[0]         0.0f |
-						|     X[1]          Y[1]          Z[1]         0.0f |
-						|     X[2]          Y[2]          Z[2]         0.0f |
-						| -Dot(X, eye)  -Dot(Y, eye)  -Dot(Z, eye)     1.0f |
-
-						Nabla
-
-						|     X[0]         X[1]           X[2]     -Dot(X, eye)  |
-						|     Y[0]         Y[1]           Y[2]     -Dot(Y, eye)  |
-						|     Z[0]         Z[1]           Z[2]     -Dot(Z, eye)  |
+				}
+				cb->endRenderPass();
+				cb->endDebugMarker();
+			}
+			cb->end();
 
-						<ImGuizmo View Matrix> = transpose(nbl::core::matrix4SIMD(<Nabla View Matrix>))
+			//updateGUIDescriptorSet();
 
-					- PERSPECTIVE [PROJECTION CASE]:
+			IQueue::SSubmitInfo::SSemaphoreInfo retval =
+			{
+				.semaphore = m_semaphore.get(),
+				.value = ++m_realFrameIx,
+				.stageMask = PIPELINE_STAGE_FLAGS::ALL_GRAPHICS_BITS
+			};
+			const IQueue::SSubmitInfo::SCommandBufferInfo commandBuffers[] =
+			{
+				{.cmdbuf = cb }
+			};
+			const IQueue::SSubmitInfo::SSemaphoreInfo acquired[] = {
+				{
+					.semaphore = device_base_t::getCurrentAcquire().semaphore,
+					.value = device_base_t::getCurrentAcquire().acquireCount,
+					.stageMask = PIPELINE_STAGE_FLAGS::NONE
+				}
+			};
+			const IQueue::SSubmitInfo infos[] =
+			{
+				{
+					.waitSemaphores = acquired,
+					.commandBuffers = commandBuffers,
+					.signalSemaphores = {&retval,1}
+				}
+			};
+			
+			if (getGraphicsQueue()->submit(infos) != IQueue::RESULT::SUCCESS)
+			{
+				retval.semaphore = nullptr; // so that we don't wait on semaphore that will never signal
+				m_realFrameIx--;
+			}
 
-						ImGuizmo
 
-						|      (temp / temp2)                 (0.0)                       (0.0)                   (0.0)  |
-						|          (0.0)                  (temp / temp3)                  (0.0)                   (0.0)  |
-						| ((right + left) / temp2)   ((top + bottom) / temp3)    ((-zfar - znear) / temp4)       (-1.0f) |
-						|          (0.0)                      (0.0)               ((-temp * zfar) / temp4)        (0.0)  |
+			m_window->setCaption("[Nabla Engine] UI App Test Demo");
+			return retval;
+		}
 
-						Nabla
+	protected:
+		const video::IGPURenderpass::SCreationParams::SSubpassDependency* getDefaultSubpassDependencies() const override
+		{
+			// Subsequent submits don't wait for each other, but they wait for acquire and get waited on by present
+			const static IGPURenderpass::SCreationParams::SSubpassDependency dependencies[] = {
+				// don't want any writes to be available, we'll clear, only thing to worry about is the layout transition
+				{
+					.srcSubpass = IGPURenderpass::SCreationParams::SSubpassDependency::External,
+					.dstSubpass = 0,
+					.memoryBarrier = {
+						.srcStageMask = PIPELINE_STAGE_FLAGS::NONE, // should sync against the semaphore wait anyway 
+						.srcAccessMask = ACCESS_FLAGS::NONE,
+						// layout transition needs to finish before the color write
+						.dstStageMask = PIPELINE_STAGE_FLAGS::COLOR_ATTACHMENT_OUTPUT_BIT,
+						.dstAccessMask = ACCESS_FLAGS::COLOR_ATTACHMENT_WRITE_BIT
+					}
+					// leave view offsets and flags default
+				},
+				// want layout transition to begin after all color output is done
+				{
+					.srcSubpass = 0,
+					.dstSubpass = IGPURenderpass::SCreationParams::SSubpassDependency::External,
+					.memoryBarrier = {
+						// last place where the color can get modified, depth is implicitly earlier
+						.srcStageMask = PIPELINE_STAGE_FLAGS::COLOR_ATTACHMENT_OUTPUT_BIT,
+						// only write ops, reads can't be made available
+						.srcAccessMask = ACCESS_FLAGS::COLOR_ATTACHMENT_WRITE_BIT
+						// spec says nothing is needed when presentation is the destination
+					}
+					// leave view offsets and flags default
+				},
+				IGPURenderpass::SCreationParams::DependenciesEnd
+			};
+			return dependencies;
+		}
 
-						|            w                        (0.0)                       (0.0)                   (0.0)               |
-						|          (0.0)                       -h                         (0.0)                   (0.0)               |
-						|          (0.0)                      (0.0)               (-zFar/(zFar-zNear))     (-zNear*zFar/(zFar-zNear)) |
-						|          (0.0)                      (0.0)                      (-1.0)                   (0.0)               |
+	private:
+		inline void update(const std::chrono::microseconds nextPresentationTimestamp)
+		{
+			auto& camera = interface.camera;
+			camera.setMoveSpeed(interface.moveSpeed);
+			camera.setRotateSpeed(interface.rotateSpeed);
 
-						<ImGuizmo Projection Matrix> = transpose(<Nabla Projection Matrix>)
 
-					*
-					* the ViewManipulate final call (inside EditTransform) returns world space column major matrix for an object,
-					* note it also modifies input view matrix but projection matrix is immutable
-					*/
+			m_inputSystem->getDefaultMouse(&mouse);
+			m_inputSystem->getDefaultKeyboard(&keyboard);
 
-					static struct
-					{
-						core::matrix4SIMD view, projection, model;
-					} imguizmoM16InOut;
+			struct
+			{
+				std::vector<SMouseEvent> mouse{};
+				std::vector<SKeyboardEvent> keyboard{};
+			} uiEvents;
 
-					ImGuizmo::SetID(0u);
+			// TODO: should be a member really
+			static std::chrono::microseconds previousEventTimestamp{};
 
-					imguizmoM16InOut.view = core::transpose(matrix4SIMD(camera.getViewMatrix()));
-					imguizmoM16InOut.projection = core::transpose(camera.getProjectionMatrix());
-					imguizmoM16InOut.model = core::transpose(core::matrix4SIMD(pass.scene->object.model));
+			// I think begin/end should always be called on camera, just events shouldn't be fed, why?
+			// If you stop begin/end, whatever keys were up/down get their up/down values frozen leading to
+			// `perActionDt` becoming obnoxiously large the first time the even processing resumes due to
+			// `timeDiff` being computed since `lastVirtualUpTimeStamp` 
+			camera.beginInputProcessing(nextPresentationTimestamp);
+			{
+				mouse.consumeEvents([&](const IMouseEventChannel::range_t& events) -> void
 					{
-						if (flipGizmoY) // note we allow to flip gizmo just to match our coordinates
-							imguizmoM16InOut.projection[1][1] *= -1.f; // https://johannesugb.github.io/gpu-programming/why-do-opengl-proj-matrices-fail-in-vulkan/	
+						if (interface.move)
+							camera.mouseProcess(events); // don't capture the events, only let camera handle them with its impl
 
-						transformParams.editTransformDecomposition = true;
-						EditTransform(imguizmoM16InOut.view.pointer(), imguizmoM16InOut.projection.pointer(), imguizmoM16InOut.model.pointer(), transformParams);
-					}
+						for (const auto& e : events) // here capture
+						{
+							if (e.timeStamp < previousEventTimestamp)
+								continue;
 
-					// to Nabla + update camera & model matrices
-					const auto& view = camera.getViewMatrix();
-					const auto& projection = camera.getProjectionMatrix();
+							previousEventTimestamp = e.timeStamp;
+							uiEvents.mouse.emplace_back(e);
 
-					// TODO: make it more nicely
-					const_cast<core::matrix3x4SIMD&>(view) = core::transpose(imguizmoM16InOut.view).extractSub3x4(); // a hack, correct way would be to use inverse matrix and get position + target because now it will bring you back to last position & target when switching from gizmo move to manual move (but from manual to gizmo is ok)
-					camera.setProjectionMatrix(projection); // update concatanated matrix
+							if (e.type==nbl::ui::SMouseEvent::EET_SCROLL && m_renderer)
+							{
+								interface.gcIndex += int16_t(core::sign(e.scrollEvent.verticalScroll));
+								interface.gcIndex = core::clamp(interface.gcIndex,0ull,m_renderer->getInitParams().geoms.size()-1);
+							}
+						}
+					},
+					m_logger.get()
+				);
+				keyboard.consumeEvents([&](const IKeyboardEventChannel::range_t& events) -> void
 					{
-						static nbl::core::matrix3x4SIMD modelView, normal;
-						static nbl::core::matrix4SIMD modelViewProjection;
+						if (interface.move)
+							camera.keyboardProcess(events); // don't capture the events, only let camera handle them with its impl
 
-						auto& hook = pass.scene->object;
-						hook.model = core::transpose(imguizmoM16InOut.model).extractSub3x4();
+						for (const auto& e : events) // here capture
 						{
-							const auto& references = pass.scene->getResources().objects;
-							const auto type = static_cast<ObjectType>(gcIndex);
+							if (e.timeStamp < previousEventTimestamp)
+								continue;
 
-							const auto& [gpu, meta] = references[type];
-							hook.meta.type = type;
-							hook.meta.name = meta.name;
+							previousEventTimestamp = e.timeStamp;
+							uiEvents.keyboard.emplace_back(e);
 						}
+					},
+					m_logger.get()
+				);
+			}
+			camera.endInputProcessing(nextPresentationTimestamp);
 
-						auto& ubo = hook.viewParameters;
+			const auto cursorPosition = m_window->getCursorControl()->getPosition();
 
-						modelView = nbl::core::concatenateBFollowedByA(view, hook.model);
-						modelView.getSub3x3InverseTranspose(normal);
-						modelViewProjection = nbl::core::concatenateBFollowedByA(camera.getConcatenatedMatrix(), hook.model);
+			ext::imgui::UI::SUpdateParameters params = 
+			{
+				.mousePosition = float32_t2(cursorPosition.x,cursorPosition.y) - float32_t2(m_window->getX(),m_window->getY()),
+				.displaySize = {m_window->getWidth(),m_window->getHeight()},
+				.mouseEvents = uiEvents.mouse,
+				.keyboardEvents = uiEvents.keyboard
+			};
 
-						memcpy(ubo.MVP, modelViewProjection.pointer(), sizeof(ubo.MVP));
-						memcpy(ubo.MV, modelView.pointer(), sizeof(ubo.MV));
-						memcpy(ubo.NormalMat, normal.pointer(), sizeof(ubo.NormalMat));
+			interface.objectName = m_scene->getGeometries()[interface.gcIndex].name;
+			interface.imGUI->update(params);
+		}
 
-						// object meta display
-						{
-							ImGui::Begin("Object");
-							ImGui::Text("type: \"%s\"", hook.meta.name.data());
-							ImGui::End();
-						}
+		void recreateFramebuffer(const uint16_t2 resolution)
+		{
+			auto createImageAndView = [&](E_FORMAT format)->smart_refctd_ptr<IGPUImageView>
+			{
+				auto image = m_device->createImage({{
+					.type = IGPUImage::ET_2D,
+					.samples = IGPUImage::ESCF_1_BIT,
+					.format = format,
+					.extent = {resolution.x,resolution.y,1},
+					.mipLevels = 1,
+					.arrayLayers = 1,
+					.usage = IGPUImage::EUF_RENDER_ATTACHMENT_BIT|IGPUImage::EUF_SAMPLED_BIT
+				}});
+				if (!m_device->allocate(image->getMemoryReqs(),image.get()).isValid())
+					return nullptr;
+				return m_device->createImageView({
+					.image = std::move(image),
+					.viewType = IGPUImageView::ET_2D,
+					.format = format,
+					.subresourceRange = {
+						.aspectMask = isDepthOrStencilFormat(format) ? IGPUImage::EAF_DEPTH_BIT:IGPUImage::EAF_COLOR_BIT,
 					}
-					
-					// view matrices editor
-					{
-						ImGui::Begin("Matrices");
-
-						auto addMatrixTable = [&](const char* topText, const char* tableName, const int rows, const int columns, const float* pointer, const bool withSeparator = true)
-						{
-							ImGui::Text(topText);
-							if (ImGui::BeginTable(tableName, columns))
-							{
-								for (int y = 0; y < rows; ++y)
-								{
-									ImGui::TableNextRow();
-									for (int x = 0; x < columns; ++x)
-									{
-										ImGui::TableSetColumnIndex(x);
-										ImGui::Text("%.3f", *(pointer + (y * columns) + x));
-									}
-								}
-								ImGui::EndTable();
-							}
+				});
+			};
 
-							if (withSeparator)
-								ImGui::Separator();
-						};
+			m_renderColorView = createImageAndView(finalSceneRenderFormat);
+			auto depthView = createImageAndView(sceneRenderDepthFormat);
+			m_framebuffer = m_device->createFramebuffer({ {
+				.renderpass = m_renderpass,
+				.depthStencilAttachments = &depthView.get(),
+				.colorAttachments = &m_renderColorView.get(),
+				.width = resolution.x,
+				.height = resolution.y
+			}});
+
+			// release previous slot and its image
+			interface.subAllocDS->multi_deallocate(0,1,&interface.renderColorViewDescIndex,{.semaphore=m_semaphore.get(),.value=m_realFrameIx});
+			//
+			interface.subAllocDS->multi_allocate(0,1,&interface.renderColorViewDescIndex);
+			// update descriptor set
+			IGPUDescriptorSet::SDescriptorInfo info = {};
+			info.desc = m_renderColorView;
+			info.info.image.imageLayout = IGPUImage::LAYOUT::READ_ONLY_OPTIMAL;
+			const IGPUDescriptorSet::SWriteDescriptorSet write = {
+				.dstSet = interface.subAllocDS->getDescriptorSet(),
+				.binding = TexturesImGUIBindingIndex,
+				.arrayElement = interface.renderColorViewDescIndex,
+				.count = 1,
+				.info = &info
+			};
+			m_device->updateDescriptorSets({&write,1},{});
+		}
 
-						addMatrixTable("Model Matrix", "ModelMatrixTable", 3, 4, pass.scene->object.model.pointer());
-						addMatrixTable("Camera View Matrix", "ViewMatrixTable", 3, 4, view.pointer());
-						addMatrixTable("Camera View Projection Matrix", "ViewProjectionMatrixTable", 4, 4, projection.pointer(), false);
+		// Maximum frames which can be simultaneously submitted, used to cycle through our per-frame resources like command buffers
+		constexpr static inline uint32_t MaxFramesInFlight = 3u;
+		constexpr static inline auto sceneRenderDepthFormat = EF_D32_SFLOAT;
+		constexpr static inline auto finalSceneRenderFormat = EF_R8G8B8A8_SRGB;
+		constexpr static inline auto TexturesImGUIBindingIndex = 0u;
+		// we create the Descriptor Set with a few slots extra to spare, so we don't have to `waitIdle` the device whenever ImGUI virtual window resizes
+		constexpr static inline auto MaxImGUITextures = 2u+MaxFramesInFlight;
+
+		//
+		smart_refctd_ptr<CGeometryCreatorScene> m_scene;
+		smart_refctd_ptr<IGPURenderpass> m_renderpass;
+		smart_refctd_ptr<CSimpleDebugRenderer> m_renderer;
+		smart_refctd_ptr<IGPUImageView> m_renderColorView;
+		smart_refctd_ptr<IGPUFramebuffer> m_framebuffer;
+		//
+		smart_refctd_ptr<ISemaphore> m_semaphore;
+		uint64_t m_realFrameIx = 0;
+		std::array<smart_refctd_ptr<IGPUCommandBuffer>,MaxFramesInFlight> m_cmdBufs;
+		//
+		InputSystem::ChannelReader<IMouseEventChannel> mouse;
+		InputSystem::ChannelReader<IKeyboardEventChannel> keyboard;
+		// UI stuff
+		struct CInterface
+		{
+			void operator()()
+			{
+				ImGuiIO& io = ImGui::GetIO();
 
-						ImGui::End();
-					}
+				// TODO: why is this a lambda and not just an assignment in a scope ?
+				camera.setProjectionMatrix([&]() 
+				{
+					matrix4SIMD projection;
 
-					// Nabla Imgui backend MDI buffer info
-					// To be 100% accurate and not overly conservative we'd have to explicitly `cull_frees` and defragment each time,
-					// so unless you do that, don't use this basic info to optimize the size of your IMGUI buffer.
+					if (isPerspective)
+						if(isLH)
+							projection = matrix4SIMD::buildProjectionMatrixPerspectiveFovLH(core::radians(fov), io.DisplaySize.x / io.DisplaySize.y, zNear, zFar);
+						else
+							projection = matrix4SIMD::buildProjectionMatrixPerspectiveFovRH(core::radians(fov), io.DisplaySize.x / io.DisplaySize.y, zNear, zFar);
+					else
 					{
-						auto* streaminingBuffer = pass.ui.manager->getStreamingBuffer();
-
-						const size_t total = streaminingBuffer->get_total_size();			// total memory range size for which allocation can be requested
-						const size_t freeSize = streaminingBuffer->getAddressAllocator().get_free_size();		// max total free bloock memory size we can still allocate from total memory available
-						const size_t consumedMemory = total - freeSize;			// memory currently consumed by streaming buffer
-
-						float freePercentage = 100.0f * (float)(freeSize) / (float)total;
-						float allocatedPercentage = (float)(consumedMemory) / (float)total;
+						float viewHeight = viewWidth * io.DisplaySize.y / io.DisplaySize.x;
 
-						ImVec2 barSize = ImVec2(400, 30);
-						float windowPadding = 10.0f;
-						float verticalPadding = ImGui::GetStyle().FramePadding.y;
+						if(isLH)
+							projection = matrix4SIMD::buildProjectionMatrixOrthoLH(viewWidth, viewHeight, zNear, zFar);
+						else
+							projection = matrix4SIMD::buildProjectionMatrixOrthoRH(viewWidth, viewHeight, zNear, zFar);
+					}
 
-						ImGui::SetNextWindowSize(ImVec2(barSize.x + 2 * windowPadding, 110 + verticalPadding), ImGuiCond_Always);
-						ImGui::Begin("Nabla Imgui MDI Buffer Info", nullptr, ImGuiWindowFlags_NoResize | ImGuiWindowFlags_NoScrollbar);
+					return projection;
+				}());
 
-						ImGui::Text("Total Allocated Size: %zu bytes", total);
-						ImGui::Text("In use: %zu bytes", consumedMemory);
-						ImGui::Text("Buffer Usage:");
+				ImGuizmo::SetOrthographic(false);
+				ImGuizmo::BeginFrame();
 
-						ImGui::SetCursorPosX(windowPadding);
+				ImGui::SetNextWindowPos(ImVec2(1024, 100), ImGuiCond_Appearing);
+				ImGui::SetNextWindowSize(ImVec2(256, 256), ImGuiCond_Appearing);
 
-						if (freePercentage > 70.0f)
-							ImGui::PushStyleColor(ImGuiCol_PlotHistogram, ImVec4(0.0f, 1.0f, 0.0f, 0.4f));  // Green
-						else if (freePercentage > 30.0f)
-							ImGui::PushStyleColor(ImGuiCol_PlotHistogram, ImVec4(1.0f, 1.0f, 0.0f, 0.4f));  // Yellow
-						else
-							ImGui::PushStyleColor(ImGuiCol_PlotHistogram, ImVec4(1.0f, 0.0f, 0.0f, 0.4f));  // Red
+				// create a window and insert the inspector
+				ImGui::SetNextWindowPos(ImVec2(10, 10), ImGuiCond_Appearing);
+				ImGui::SetNextWindowSize(ImVec2(320, 340), ImGuiCond_Appearing);
+				ImGui::Begin("Editor");
 
-						ImGui::ProgressBar(allocatedPercentage, barSize, "");
+				if (ImGui::RadioButton("Full view", !transformParams.useWindow))
+					transformParams.useWindow = false;
 
-						ImGui::PopStyleColor();
+				ImGui::SameLine();
 
-						ImDrawList* drawList = ImGui::GetWindowDrawList();
+				if (ImGui::RadioButton("Window", transformParams.useWindow))
+					transformParams.useWindow = true;
 
-						ImVec2 progressBarPos = ImGui::GetItemRectMin();
-						ImVec2 progressBarSize = ImGui::GetItemRectSize();
+				ImGui::Text("Camera");
+				bool viewDirty = false;
 
-						const char* text = "%.2f%% free";
-						char textBuffer[64];
-						snprintf(textBuffer, sizeof(textBuffer), text, freePercentage);
+				if (ImGui::RadioButton("LH", isLH))
+					isLH = true;
 
-						ImVec2 textSize = ImGui::CalcTextSize(textBuffer);
-						ImVec2 textPos = ImVec2
-						(
-							progressBarPos.x + (progressBarSize.x - textSize.x) * 0.5f,
-							progressBarPos.y + (progressBarSize.y - textSize.y) * 0.5f
-						);
+				ImGui::SameLine();
 
-						ImVec4 bgColor = ImGui::GetStyleColorVec4(ImGuiCol_WindowBg);
-						drawList->AddRectFilled
-						(
-							ImVec2(textPos.x - 5, textPos.y - 2),
-							ImVec2(textPos.x + textSize.x + 5, textPos.y + textSize.y + 2),
-							ImGui::GetColorU32(bgColor)
-						);
+				if (ImGui::RadioButton("RH", !isLH))
+					isLH = false;
 
-						ImGui::SetCursorScreenPos(textPos);
-						ImGui::Text("%s", textBuffer);
+				if (ImGui::RadioButton("Perspective", isPerspective))
+					isPerspective = true;
 
-						ImGui::Dummy(ImVec2(0.0f, verticalPadding));
+				ImGui::SameLine();
 
-						ImGui::End();
-					}
+				if (ImGui::RadioButton("Orthographic", !isPerspective))
+					isPerspective = false;
 
-					ImGui::End();
-				}
-			);
+				ImGui::Checkbox("Enable \"view manipulate\"", &transformParams.enableViewManipulate);
+				ImGui::Checkbox("Enable camera movement", &move);
+				ImGui::SliderFloat("Move speed", &moveSpeed, 0.1f, 10.f);
+				ImGui::SliderFloat("Rotate speed", &rotateSpeed, 0.1f, 10.f);
 
-			m_winMgr->setWindowSize(m_window.get(), WIN_W, WIN_H);
-			m_surface->recreateSwapchain();
-			m_winMgr->show(m_window.get());
-			oracle.reportBeginFrameRecord();
-			camera.mapKeysToArrows();
+				// ImGui::Checkbox("Flip Gizmo's Y axis", &flipGizmoY); // let's not expose it to be changed in UI but keep the logic in case
 
-			return true;
-		}
+				if (isPerspective)
+					ImGui::SliderFloat("Fov", &fov, 20.f, 150.f);
+				else
+					ImGui::SliderFloat("Ortho width", &viewWidth, 1, 20);
 
-		bool updateGUIDescriptorSet()
-		{
-			// texture atlas + our scene texture, note we don't create info & write pair for the font sampler because UI extension's is immutable and baked into DS layout
-			static std::array<IGPUDescriptorSet::SDescriptorInfo, TexturesAmount> descriptorInfo;
-			static IGPUDescriptorSet::SWriteDescriptorSet writes[TexturesAmount];
+				ImGui::SliderFloat("zNear", &zNear, 0.1f, 100.f);
+				ImGui::SliderFloat("zFar", &zFar, 110.f, 10000.f);
 
-			descriptorInfo[nbl::ext::imgui::UI::FontAtlasTexId].info.image.imageLayout = IImage::LAYOUT::READ_ONLY_OPTIMAL;
-			descriptorInfo[nbl::ext::imgui::UI::FontAtlasTexId].desc = core::smart_refctd_ptr<nbl::video::IGPUImageView>(pass.ui.manager->getFontAtlasView());
+				viewDirty |= ImGui::SliderFloat("Distance", &transformParams.camDistance, 1.f, 69.f);
 
-			descriptorInfo[OfflineSceneTextureIx].info.image.imageLayout = IImage::LAYOUT::READ_ONLY_OPTIMAL;
-			descriptorInfo[OfflineSceneTextureIx].desc = pass.scene->getResources().attachments.color;
+				if (viewDirty || firstFrame)
+				{
+					core::vectorSIMDf cameraPosition(cosf(camYAngle)* cosf(camXAngle)* transformParams.camDistance, sinf(camXAngle)* transformParams.camDistance, sinf(camYAngle)* cosf(camXAngle)* transformParams.camDistance);
+					core::vectorSIMDf cameraTarget(0.f, 0.f, 0.f);
+					const static core::vectorSIMDf up(0.f, 1.f, 0.f);
 
-			for (uint32_t i = 0; i < descriptorInfo.size(); ++i)
-			{
-				writes[i].dstSet = pass.ui.descriptorSet.get();
-				writes[i].binding = 0u;
-				writes[i].arrayElement = i;
-				writes[i].count = 1u;
-			}
-			writes[nbl::ext::imgui::UI::FontAtlasTexId].info = descriptorInfo.data() + nbl::ext::imgui::UI::FontAtlasTexId;
-			writes[OfflineSceneTextureIx].info = descriptorInfo.data() + OfflineSceneTextureIx;
+					camera.setPosition(cameraPosition);
+					camera.setTarget(cameraTarget);
+					camera.setBackupUpVector(up);
 
-			return m_device->updateDescriptorSets(writes, {});
-		}
+					camera.recomputeViewMatrix();
+				}
+				firstFrame = false;
 
-		inline void workLoopBody() override
-		{
-			// framesInFlight: ensuring safe execution of command buffers and acquires, `framesInFlight` only affect semaphore waits, don't use this to index your resources because it can change with swapchain recreation.
-			const uint32_t framesInFlight = core::min(MaxFramesInFlight, m_surface->getMaxAcquiresInFlight());
-			// We block for semaphores for 2 reasons here:
-				// A) Resource: Can't use resource like a command buffer BEFORE previous use is finished! [MaxFramesInFlight]
-				// B) Acquire: Can't have more acquires in flight than a certain threshold returned by swapchain or your surface helper class. [MaxAcquiresInFlight]
-			if (m_realFrameIx >= framesInFlight)
-			{
-				const ISemaphore::SWaitInfo cbDonePending[] =
+				ImGui::Text("X: %f Y: %f", io.MousePos.x, io.MousePos.y);
+				if (ImGuizmo::IsUsing())
 				{
-					{
-						.semaphore = m_semaphore.get(),
-						.value = m_realFrameIx + 1 - framesInFlight
-					}
-				};
-				if (m_device->blockForSemaphores(cbDonePending) != ISemaphore::WAIT_RESULT::SUCCESS)
-					return;
-			}
+					ImGui::Text("Using gizmo");
+				}
+				else
+				{
+					ImGui::Text(ImGuizmo::IsOver() ? "Over gizmo" : "");
+					ImGui::SameLine();
+					ImGui::Text(ImGuizmo::IsOver(ImGuizmo::TRANSLATE) ? "Over translate gizmo" : "");
+					ImGui::SameLine();
+					ImGui::Text(ImGuizmo::IsOver(ImGuizmo::ROTATE) ? "Over rotate gizmo" : "");
+					ImGui::SameLine();
+					ImGui::Text(ImGuizmo::IsOver(ImGuizmo::SCALE) ? "Over scale gizmo" : "");
+				}
+				ImGui::Separator();
 
-			const auto resourceIx = m_realFrameIx % MaxFramesInFlight;
+				/*
+				* ImGuizmo expects view & perspective matrix to be column major both with 4x4 layout
+				* and Nabla uses row major matricies - 3x4 matrix for view & 4x4 for projection
 
-			// CPU events
-			update();
+				- VIEW:
 
-			// render whole scene to offline frame buffer & submit
-			pass.scene->begin();
-			{
-				pass.scene->update();
-				pass.scene->record();
-				pass.scene->end();
-			}
-			pass.scene->submit();
+					ImGuizmo
 
-			auto* const cb = m_cmdBufs.data()[resourceIx].get();
-			cb->reset(IGPUCommandBuffer::RESET_FLAGS::RELEASE_RESOURCES_BIT);
-			cb->begin(IGPUCommandBuffer::USAGE::ONE_TIME_SUBMIT_BIT);
-			cb->beginDebugMarker("UISampleApp IMGUI Frame");
+					|     X[0]          Y[0]          Z[0]         0.0f |
+					|     X[1]          Y[1]          Z[1]         0.0f |
+					|     X[2]          Y[2]          Z[2]         0.0f |
+					| -Dot(X, eye)  -Dot(Y, eye)  -Dot(Z, eye)     1.0f |
 
-			auto* queue = getGraphicsQueue();
+					Nabla
 
-			asset::SViewport viewport;
-			{
-				viewport.minDepth = 1.f;
-				viewport.maxDepth = 0.f;
-				viewport.x = 0u;
-				viewport.y = 0u;
-				viewport.width = WIN_W;
-				viewport.height = WIN_H;
-			}
-			cb->setViewport(0u, 1u, &viewport);
+					|     X[0]         X[1]           X[2]     -Dot(X, eye)  |
+					|     Y[0]         Y[1]           Y[2]     -Dot(Y, eye)  |
+					|     Z[0]         Z[1]           Z[2]     -Dot(Z, eye)  |
 
-			const VkRect2D currentRenderArea =
-			{
-				.offset = {0,0},
-				.extent = {m_window->getWidth(),m_window->getHeight()}
-			};
+					<ImGuizmo View Matrix> = transpose(nbl::core::matrix4SIMD(<Nabla View Matrix>))
 
-			IQueue::SSubmitInfo::SCommandBufferInfo commandBuffersInfo[] = {{.cmdbuf = cb }};
+				- PERSPECTIVE [PROJECTION CASE]:
 
-			// UI render pass
-			{
-				auto scRes = static_cast<CDefaultSwapchainFramebuffers*>(m_surface->getSwapchainResources());
-				const IGPUCommandBuffer::SRenderpassBeginInfo renderpassInfo = 
-				{
-					.framebuffer = scRes->getFramebuffer(m_currentImageAcquire.imageIndex),
-					.colorClearValues = &clear.color,
-					.depthStencilClearValues = nullptr,
-					.renderArea = currentRenderArea
-				};
-				nbl::video::ISemaphore::SWaitInfo waitInfo = { .semaphore = m_semaphore.get(), .value = m_realFrameIx + 1u };
+					ImGuizmo
 
-				cb->beginRenderPass(renderpassInfo, IGPUCommandBuffer::SUBPASS_CONTENTS::INLINE);
-				const auto uiParams = pass.ui.manager->getCreationParameters();
-				auto* pipeline = pass.ui.manager->getPipeline();
-				cb->bindGraphicsPipeline(pipeline);
-				cb->bindDescriptorSets(EPBP_GRAPHICS, pipeline->getLayout(), uiParams.resources.texturesInfo.setIx, 1u, &pass.ui.descriptorSet.get()); // note that we use default UI pipeline layout where uiParams.resources.textures.setIx == uiParams.resources.samplers.setIx
-				
-				if (!keepRunning())
-					return;
-				
-				if (!pass.ui.manager->render(cb,waitInfo))
-				{
-					// TODO: need to present acquired image before bailing because its already acquired
-					return;
-				}
-				cb->endRenderPass();
-			}
-			cb->end();
-			{
-				const IQueue::SSubmitInfo::SSemaphoreInfo rendered[] = 
-				{ 
-					{
-						.semaphore = m_semaphore.get(),
-						.value = ++m_realFrameIx,
-						.stageMask = PIPELINE_STAGE_FLAGS::COLOR_ATTACHMENT_OUTPUT_BIT
-					} 
-				};
+					|      (temp / temp2)                 (0.0)                       (0.0)                   (0.0)  |
+					|          (0.0)                  (temp / temp3)                  (0.0)                   (0.0)  |
+					| ((right + left) / temp2)   ((top + bottom) / temp3)    ((-zfar - znear) / temp4)       (-1.0f) |
+					|          (0.0)                      (0.0)               ((-temp * zfar) / temp4)        (0.0)  |
 
-				{
-					{
-						const IQueue::SSubmitInfo::SSemaphoreInfo acquired[] = 
-						{ 
-							{
-								.semaphore = m_currentImageAcquire.semaphore,
-								.value = m_currentImageAcquire.acquireCount,
-								.stageMask = PIPELINE_STAGE_FLAGS::NONE
-							} 
-						};
-
-						const IQueue::SSubmitInfo infos[] = 
-						{ 
-							{
-								.waitSemaphores = acquired,
-								.commandBuffers = commandBuffersInfo,
-								.signalSemaphores = rendered
-							} 
-						};
-
-						const nbl::video::ISemaphore::SWaitInfo waitInfos[] = 
-						{ {
-							.semaphore = pass.scene->semaphore.progress.get(),
-							.value = pass.scene->semaphore.finishedValue
-						} };
-						
-						m_device->blockForSemaphores(waitInfos);
-
-						updateGUIDescriptorSet();
-
-						if (queue->submit(infos) != IQueue::RESULT::SUCCESS)
-							m_realFrameIx--;
-					}
-				}
+					Nabla
 
-				m_window->setCaption("[Nabla Engine] UI App Test Demo");
-				m_surface->present(m_currentImageAcquire.imageIndex, rendered);
-			}
-		}
+					|            w                        (0.0)                       (0.0)                   (0.0)               |
+					|          (0.0)                       -h                         (0.0)                   (0.0)               |
+					|          (0.0)                      (0.0)               (-zFar/(zFar-zNear))     (-zNear*zFar/(zFar-zNear)) |
+					|          (0.0)                      (0.0)                      (-1.0)                   (0.0)               |
 
-		inline bool keepRunning() override
-		{
-			if (m_surface->irrecoverable())
-				return false;
+					<ImGuizmo Projection Matrix> = transpose(<Nabla Projection Matrix>)
 
-			return true;
-		}
-
-		inline bool onAppTerminated() override
-		{
-			return device_base_t::onAppTerminated();
-		}
-
-		inline void update()
-		{
-			camera.setMoveSpeed(moveSpeed);
-			camera.setRotateSpeed(rotateSpeed);
-
-			static std::chrono::microseconds previousEventTimestamp{};
-
-			m_inputSystem->getDefaultMouse(&mouse);
-			m_inputSystem->getDefaultKeyboard(&keyboard);
-
-			auto updatePresentationTimestamp = [&]()
-			{
-				m_currentImageAcquire = m_surface->acquireNextImage();
+				*
+				* the ViewManipulate final call (inside EditTransform) returns world space column major matrix for an object,
+				* note it also modifies input view matrix but projection matrix is immutable
+				*/
 
-				oracle.reportEndFrameRecord();
-				const auto timestamp = oracle.getNextPresentationTimeStamp();
-				oracle.reportBeginFrameRecord();
+// TODO: do all computation using `hlsl::matrix` and its `hlsl::float32_tNxM` aliases
+				static struct
+				{
+					core::matrix4SIMD view, projection, model;
+				} imguizmoM16InOut;
 
-				return timestamp;
-			};
+				ImGuizmo::SetID(0u);
 
-			const auto nextPresentationTimestamp = updatePresentationTimestamp();
+				imguizmoM16InOut.view = core::transpose(matrix4SIMD(camera.getViewMatrix()));
+				imguizmoM16InOut.projection = core::transpose(camera.getProjectionMatrix());
+				imguizmoM16InOut.model = core::transpose(matrix4SIMD(model));
+				{
+					if (flipGizmoY) // note we allow to flip gizmo just to match our coordinates
+						imguizmoM16InOut.projection[1][1] *= -1.f; // https://johannesugb.github.io/gpu-programming/why-do-opengl-proj-matrices-fail-in-vulkan/	
 
-			struct
-			{
-				std::vector<SMouseEvent> mouse{};
-				std::vector<SKeyboardEvent> keyboard{};
-			} capturedEvents;
+					transformParams.editTransformDecomposition = true;
+					sceneResolution = EditTransform(imguizmoM16InOut.view.pointer(), imguizmoM16InOut.projection.pointer(), imguizmoM16InOut.model.pointer(), transformParams);
+				}
 
-			if (move) camera.beginInputProcessing(nextPresentationTimestamp);
-			{
-				mouse.consumeEvents([&](const IMouseEventChannel::range_t& events) -> void
+				// to Nabla + update camera & model matrices
+// TODO: make it more nicely, extract:
+// - Position by computing inverse of the view matrix and grabbing its translation
+// - Target from 3rd row without W component of view matrix multiplied by some arbitrary distance value (can be the length of position from origin) and adding the position
+// But then set the view matrix this way anyway, because up-vector may not be compatible
+				const auto& view = camera.getViewMatrix();
+				const_cast<core::matrix3x4SIMD&>(view) = core::transpose(imguizmoM16InOut.view).extractSub3x4(); // a hack, correct way would be to use inverse matrix and get position + target because now it will bring you back to last position & target when switching from gizmo move to manual move (but from manual to gizmo is ok)
+				// update concatanated matrix
+				const auto& projection = camera.getProjectionMatrix();
+				camera.setProjectionMatrix(projection);
+
+				// object meta display
+				{
+					ImGui::Begin("Object");
+					ImGui::Text("type: \"%s\"", objectName.data());
+					ImGui::End();
+				}
+					
+				// view matrices editor
 				{
-					if (move)
-						camera.mouseProcess(events); // don't capture the events, only let camera handle them with its impl
+					ImGui::Begin("Matrices");
 
-					for (const auto& e : events) // here capture
+					auto addMatrixTable = [&](const char* topText, const char* tableName, const int rows, const int columns, const float* pointer, const bool withSeparator = true)
 					{
-						if (e.timeStamp < previousEventTimestamp)
-							continue;
+						ImGui::Text(topText);
+						if (ImGui::BeginTable(tableName, columns))
+						{
+							for (int y = 0; y < rows; ++y)
+							{
+								ImGui::TableNextRow();
+								for (int x = 0; x < columns; ++x)
+								{
+									ImGui::TableSetColumnIndex(x);
+									ImGui::Text("%.3f", *(pointer + (y * columns) + x));
+								}
+							}
+							ImGui::EndTable();
+						}
 
-						previousEventTimestamp = e.timeStamp;
-						capturedEvents.mouse.emplace_back(e);
+						if (withSeparator)
+							ImGui::Separator();
+					};
 
-						if (e.type == nbl::ui::SMouseEvent::EET_SCROLL)
-							gcIndex = std::clamp<uint16_t>(int16_t(gcIndex) + int16_t(core::sign(e.scrollEvent.verticalScroll)), int64_t(0), int64_t(OT_COUNT - (uint8_t)1u));
-					}
-				}, m_logger.get());
+					addMatrixTable("Model Matrix", "ModelMatrixTable", 3, 4, model.pointer());
+					addMatrixTable("Camera View Matrix", "ViewMatrixTable", 3, 4, view.pointer());
+					addMatrixTable("Camera View Projection Matrix", "ViewProjectionMatrixTable", 4, 4, projection.pointer(), false);
+
+					ImGui::End();
+				}
 
-			keyboard.consumeEvents([&](const IKeyboardEventChannel::range_t& events) -> void
+				// Nabla Imgui backend MDI buffer info
+				// To be 100% accurate and not overly conservative we'd have to explicitly `cull_frees` and defragment each time,
+				// so unless you do that, don't use this basic info to optimize the size of your IMGUI buffer.
 				{
-					if (move)
-						camera.keyboardProcess(events); // don't capture the events, only let camera handle them with its impl
+					auto* streaminingBuffer = imGUI->getStreamingBuffer();
 
-					for (const auto& e : events) // here capture
-					{
-						if (e.timeStamp < previousEventTimestamp)
-							continue;
+					const size_t total = streaminingBuffer->get_total_size();			// total memory range size for which allocation can be requested
+					const size_t freeSize = streaminingBuffer->getAddressAllocator().get_free_size();		// max total free bloock memory size we can still allocate from total memory available
+					const size_t consumedMemory = total - freeSize;			// memory currently consumed by streaming buffer
 
-						previousEventTimestamp = e.timeStamp;
-						capturedEvents.keyboard.emplace_back(e);
-					}
-				}, m_logger.get());
-			}
-			if (move) camera.endInputProcessing(nextPresentationTimestamp);
+					float freePercentage = 100.0f * (float)(freeSize) / (float)total;
+					float allocatedPercentage = (float)(consumedMemory) / (float)total;
 
-			const auto cursorPosition = m_window->getCursorControl()->getPosition();
+					ImVec2 barSize = ImVec2(400, 30);
+					float windowPadding = 10.0f;
+					float verticalPadding = ImGui::GetStyle().FramePadding.y;
 
-			nbl::ext::imgui::UI::SUpdateParameters params = 
-			{
-				.mousePosition = nbl::hlsl::float32_t2(cursorPosition.x, cursorPosition.y) - nbl::hlsl::float32_t2(m_window->getX(), m_window->getY()),
-				.displaySize = { m_window->getWidth(), m_window->getHeight() },
-				.mouseEvents = { capturedEvents.mouse.data(), capturedEvents.mouse.size() },
-				.keyboardEvents = { capturedEvents.keyboard.data(), capturedEvents.keyboard.size() }
-			};
+					ImGui::SetNextWindowSize(ImVec2(barSize.x + 2 * windowPadding, 110 + verticalPadding), ImGuiCond_Always);
+					ImGui::Begin("Nabla Imgui MDI Buffer Info", nullptr, ImGuiWindowFlags_NoResize | ImGuiWindowFlags_NoScrollbar);
 
-			pass.ui.manager->update(params);
-		}
+					ImGui::Text("Total Allocated Size: %zu bytes", total);
+					ImGui::Text("In use: %zu bytes", consumedMemory);
+					ImGui::Text("Buffer Usage:");
 
-	private:
-		// Maximum frames which can be simultaneously submitted, used to cycle through our per-frame resources like command buffers
-		constexpr static inline uint32_t MaxFramesInFlight = 3u;
+					ImGui::SetCursorPosX(windowPadding);
 
-		smart_refctd_ptr<IWindow> m_window;
-		smart_refctd_ptr<CSimpleResizeSurface<CDefaultSwapchainFramebuffers>> m_surface;
-		smart_refctd_ptr<IGPUGraphicsPipeline> m_pipeline;
-		smart_refctd_ptr<ISemaphore> m_semaphore;
-		smart_refctd_ptr<IGPUCommandPool> m_cmdPool;
-		uint64_t m_realFrameIx = 0;
-		std::array<smart_refctd_ptr<IGPUCommandBuffer>, MaxFramesInFlight> m_cmdBufs;
-		ISimpleManagedSurface::SAcquireResult m_currentImageAcquire = {};
+					if (freePercentage > 70.0f)
+						ImGui::PushStyleColor(ImGuiCol_PlotHistogram, ImVec4(0.0f, 1.0f, 0.0f, 0.4f));  // Green
+					else if (freePercentage > 30.0f)
+						ImGui::PushStyleColor(ImGuiCol_PlotHistogram, ImVec4(1.0f, 1.0f, 0.0f, 0.4f));  // Yellow
+					else
+						ImGui::PushStyleColor(ImGuiCol_PlotHistogram, ImVec4(1.0f, 0.0f, 0.0f, 0.4f));  // Red
 
-		smart_refctd_ptr<nbl::asset::IAssetManager> m_assetManager;
-		core::smart_refctd_ptr<InputSystem> m_inputSystem;
-		InputSystem::ChannelReader<IMouseEventChannel> mouse;
-		InputSystem::ChannelReader<IKeyboardEventChannel> keyboard;
+					ImGui::ProgressBar(allocatedPercentage, barSize, "");
 
-		constexpr static inline auto TexturesAmount = 2u;
+					ImGui::PopStyleColor();
 
-		core::smart_refctd_ptr<IDescriptorPool> m_descriptorSetPool;
+					ImDrawList* drawList = ImGui::GetWindowDrawList();
 
-		struct C_UI
-		{
-			nbl::core::smart_refctd_ptr<nbl::ext::imgui::UI> manager;
+					ImVec2 progressBarPos = ImGui::GetItemRectMin();
+					ImVec2 progressBarSize = ImGui::GetItemRectSize();
 
-			struct
-			{
-				core::smart_refctd_ptr<video::IGPUSampler> gui, scene;
-			} samplers;
+					const char* text = "%.2f%% free";
+					char textBuffer[64];
+					snprintf(textBuffer, sizeof(textBuffer), text, freePercentage);
 
-			core::smart_refctd_ptr<IGPUDescriptorSet> descriptorSet;
-		};
+					ImVec2 textSize = ImGui::CalcTextSize(textBuffer);
+					ImVec2 textPos = ImVec2
+					(
+						progressBarPos.x + (progressBarSize.x - textSize.x) * 0.5f,
+						progressBarPos.y + (progressBarSize.y - textSize.y) * 0.5f
+					);
 
-		struct E_APP_PASS
-		{
-			nbl::core::smart_refctd_ptr<CScene> scene;
-			C_UI ui;
-		} pass;
+					ImVec4 bgColor = ImGui::GetStyleColorVec4(ImGuiCol_WindowBg);
+					drawList->AddRectFilled
+					(
+						ImVec2(textPos.x - 5, textPos.y - 2),
+						ImVec2(textPos.x + textSize.x + 5, textPos.y + textSize.y + 2),
+						ImGui::GetColorU32(bgColor)
+					);
 
-		Camera camera = Camera(core::vectorSIMDf(0, 0, 0), core::vectorSIMDf(0, 0, 0), core::matrix4SIMD());
-		video::CDumbPresentationOracle oracle;
+					ImGui::SetCursorScreenPos(textPos);
+					ImGui::Text("%s", textBuffer);
 
-		uint16_t gcIndex = {}; // note: this is dirty however since I assume only single object in scene I can leave it now, when this example is upgraded to support multiple objects this needs to be changed
+					ImGui::Dummy(ImVec2(0.0f, verticalPadding));
 
-		TransformRequestParams transformParams;
-		bool isPerspective = true, isLH = true, flipGizmoY = true, move = false;
-		float fov = 60.f, zNear = 0.1f, zFar = 10000.f, moveSpeed = 1.f, rotateSpeed = 1.f;
-		float viewWidth = 10.f;
-		float camYAngle = 165.f / 180.f * 3.14159f;
-		float camXAngle = 32.f / 180.f * 3.14159f;
+					ImGui::End();
+				}
+
+				ImGui::End();
+			}
 
-		bool firstFrame = true;
+			smart_refctd_ptr<ext::imgui::UI> imGUI;
+			// descriptor set
+			smart_refctd_ptr<SubAllocatedDescriptorSet> subAllocDS;
+			SubAllocatedDescriptorSet::value_type renderColorViewDescIndex = SubAllocatedDescriptorSet::invalid_value;
+			//
+			Camera camera = Camera(core::vectorSIMDf(0, 0, 0), core::vectorSIMDf(0, 0, 0), core::matrix4SIMD());
+			// mutables
+			std::string_view objectName;
+			core::matrix3x4SIMD model;
+			TransformRequestParams transformParams;
+			uint16_t2 sceneResolution = {1280,720};
+			float fov = 60.f, zNear = 0.1f, zFar = 10000.f, moveSpeed = 1.f, rotateSpeed = 1.f;
+			float viewWidth = 10.f;
+			float camYAngle = 165.f / 180.f * 3.14159f;
+			float camXAngle = 32.f / 180.f * 3.14159f;
+			uint16_t gcIndex = {}; // note: this is dirty however since I assume only single object in scene I can leave it now, when this example is upgraded to support multiple objects this needs to be changed
+			bool isPerspective = true, isLH = true, flipGizmoY = true, move = false;
+			bool firstFrame = true;
+		} interface;
 };
 
 NBL_MAIN_FUNC(UISampleApp)
\ No newline at end of file

From 28726045367f9bbab5668e324af6a69bcfbb264c Mon Sep 17 00:00:00 2001
From: devsh <devsh@devsh.eu>
Date: Tue, 24 Jun 2025 01:30:21 +0200
Subject: [PATCH 421/529] fix bugs in ex 61: - correct aspect masks on image
 views - wrong renderpass given to imgui - handle virtual window getting
 minimized - imguizmo not updating - imgui not drawing offscreen image

---
 61_UI/main.cpp | 113 ++++++++++++++++++++++++++++++-------------------
 1 file changed, 70 insertions(+), 43 deletions(-)

diff --git a/61_UI/main.cpp b/61_UI/main.cpp
index d4f21f2e8..830318e4e 100644
--- a/61_UI/main.cpp
+++ b/61_UI/main.cpp
@@ -92,6 +92,7 @@ class UISampleApp final : public MonoWindowApplication, public BuiltinResourcesA
 					{},
 					IGPURenderpass::SCreationParams::SubpassesEnd
 				};
+				subpasses[0].depthStencilAttachment = {{.render={.attachmentIndex=0,.layout=IGPUImage::LAYOUT::ATTACHMENT_OPTIMAL}}};
 				subpasses[0].colorAttachments[0] = {.render={.attachmentIndex=0,.layout=IGPUImage::LAYOUT::ATTACHMENT_OPTIMAL}};
 				params.subpasses = subpasses;
 				
@@ -137,9 +138,12 @@ class UISampleApp final : public MonoWindowApplication, public BuiltinResourcesA
 					return logFail("Failed to create Scene Renderpass!");
 			}
 			m_renderer = CSimpleDebugRenderer::create(m_assetMgr.get(),m_renderpass.get(),0,m_scene.get());
+			// we'll only display one thing at a time
+			m_renderer->m_instances.resize(1);
 
 			// Create ImGUI
 			{
+				auto scRes = static_cast<CDefaultSwapchainFramebuffers*>(m_surface->getSwapchainResources());
 				ext::imgui::UI::SCreationParameters params = {};
 				params.resources.texturesInfo = {.setIx=0u,.bindingIx=TexturesImGUIBindingIndex};
 				params.resources.samplersInfo = {.setIx=0u,.bindingIx=1u};
@@ -147,7 +151,7 @@ class UISampleApp final : public MonoWindowApplication, public BuiltinResourcesA
 				params.transfer = getTransferUpQueue();
 				params.pipelineLayout = ext::imgui::UI::createDefaultPipelineLayout(m_utils->getLogicalDevice(),params.resources.texturesInfo,params.resources.samplersInfo,MaxImGUITextures);
 				params.assetManager = make_smart_refctd_ptr<IAssetManager>(smart_refctd_ptr(m_system));
-				params.renderpass = m_renderpass;
+				params.renderpass = smart_refctd_ptr<IGPURenderpass>(scRes->getRenderpass());
 				params.subpassIx = 0u;
 				params.pipelineCache = nullptr;
 				interface.imGUI = ext::imgui::UI::create(std::move(params));
@@ -196,17 +200,13 @@ class UISampleApp final : public MonoWindowApplication, public BuiltinResourcesA
 			return true;
 		}
 
-		inline void beginRenderpass(IGPUCommandBuffer* cb, const IGPUCommandBuffer::SRenderpassBeginInfo& info)
+		//
+		virtual inline bool onAppTerminated()
 		{
-			cb->beginRenderPass(info,IGPUCommandBuffer::SUBPASS_CONTENTS::INLINE);
-			cb->setScissor(0,1,&info.renderArea);
-			const SViewport viewport = {
-				.x = 0,
-				.y = 0,
-				.width = static_cast<float>(info.renderArea.extent.width),
-				.height = static_cast<float>(info.renderArea.extent.height)
-			};
-			cb->setViewport(0u,1u,&viewport);
+			SubAllocatedDescriptorSet::value_type fontAtlasDescIx = ext::imgui::UI::FontAtlasTexId;
+			IGPUDescriptorSet::SDropDescriptorSet dummy[1];
+			interface.subAllocDS->multi_deallocate(dummy,TexturesImGUIBindingIndex,1,&fontAtlasDescIx);
+			return device_base_t::onAppTerminated();
 		}
 
 		inline IQueue::SSubmitInfo::SSemaphoreInfo renderFrame(const std::chrono::microseconds nextPresentationTimestamp) override
@@ -226,14 +226,16 @@ class UISampleApp final : public MonoWindowApplication, public BuiltinResourcesA
 			cb->begin(IGPUCommandBuffer::USAGE::ONE_TIME_SUBMIT_BIT);
 			// clear to black for both things
 			const IGPUCommandBuffer::SClearColorValue clearValue = { .float32 = {0.f,0.f,0.f,1.f} };
+			if (m_framebuffer)
 			{
 				cb->beginDebugMarker("UISampleApp Scene Frame");
 				{
+					const IGPUCommandBuffer::SClearDepthStencilValue farValue = { .depth=0.f };
 					const IGPUCommandBuffer::SRenderpassBeginInfo renderpassInfo =
 					{
 						.framebuffer = m_framebuffer.get(),
 						.colorClearValues = &clearValue,
-						.depthStencilClearValues = nullptr,
+						.depthStencilClearValues = &farValue,
 						.renderArea = {
 							.offset = {0,0},
 							.extent = {virtualWindowRes[0],virtualWindowRes[1]}
@@ -254,7 +256,9 @@ class UISampleApp final : public MonoWindowApplication, public BuiltinResourcesA
 					const auto viewParams = CSimpleDebugRenderer::SViewParams(viewMatrix,viewProjMatrix);
 
 					// tear down scene every frame
-					m_renderer->m_instances[0].packedGeo = m_renderer->getInitParams().geoms.data()+interface.gcIndex;
+					auto& instance = m_renderer->m_instances[0];
+					memcpy(&instance.world,&interface.model,sizeof(instance.world));
+					instance.packedGeo = m_renderer->getInitParams().geoms.data()+interface.gcIndex;
  					m_renderer->render(cb,viewParams);
 				}
 				cb->endRenderPass();
@@ -468,42 +472,65 @@ class UISampleApp final : public MonoWindowApplication, public BuiltinResourcesA
 				}});
 				if (!m_device->allocate(image->getMemoryReqs(),image.get()).isValid())
 					return nullptr;
-				return m_device->createImageView({
+				IGPUImageView::SCreationParams params = {
 					.image = std::move(image),
 					.viewType = IGPUImageView::ET_2D,
-					.format = format,
-					.subresourceRange = {
-						.aspectMask = isDepthOrStencilFormat(format) ? IGPUImage::EAF_DEPTH_BIT:IGPUImage::EAF_COLOR_BIT,
-					}
-				});
+					.format = format
+				};
+				params.subresourceRange.aspectMask = isDepthOrStencilFormat(format) ? IGPUImage::EAF_DEPTH_BIT:IGPUImage::EAF_COLOR_BIT;
+				return m_device->createImageView(std::move(params));
 			};
-
-			m_renderColorView = createImageAndView(finalSceneRenderFormat);
-			auto depthView = createImageAndView(sceneRenderDepthFormat);
-			m_framebuffer = m_device->createFramebuffer({ {
-				.renderpass = m_renderpass,
-				.depthStencilAttachments = &depthView.get(),
-				.colorAttachments = &m_renderColorView.get(),
-				.width = resolution.x,
-				.height = resolution.y
-			}});
+			
+			smart_refctd_ptr<IGPUImageView> colorView;
+			// detect window minimization
+			if (resolution.x<0x4000 && resolution.y<0x4000)
+			{
+				colorView = createImageAndView(finalSceneRenderFormat);
+				auto depthView = createImageAndView(sceneRenderDepthFormat);
+				m_framebuffer = m_device->createFramebuffer({ {
+					.renderpass = m_renderpass,
+					.depthStencilAttachments = &depthView.get(),
+					.colorAttachments = &colorView.get(),
+					.width = resolution.x,
+					.height = resolution.y
+				}});
+			}
+			else
+				m_framebuffer = nullptr;
 
 			// release previous slot and its image
 			interface.subAllocDS->multi_deallocate(0,1,&interface.renderColorViewDescIndex,{.semaphore=m_semaphore.get(),.value=m_realFrameIx});
 			//
-			interface.subAllocDS->multi_allocate(0,1,&interface.renderColorViewDescIndex);
-			// update descriptor set
-			IGPUDescriptorSet::SDescriptorInfo info = {};
-			info.desc = m_renderColorView;
-			info.info.image.imageLayout = IGPUImage::LAYOUT::READ_ONLY_OPTIMAL;
-			const IGPUDescriptorSet::SWriteDescriptorSet write = {
-				.dstSet = interface.subAllocDS->getDescriptorSet(),
-				.binding = TexturesImGUIBindingIndex,
-				.arrayElement = interface.renderColorViewDescIndex,
-				.count = 1,
-				.info = &info
+			if (colorView)
+			{
+				interface.subAllocDS->multi_allocate(0,1,&interface.renderColorViewDescIndex);
+				// update descriptor set
+				IGPUDescriptorSet::SDescriptorInfo info = {};
+				info.desc = colorView;
+				info.info.image.imageLayout = IGPUImage::LAYOUT::READ_ONLY_OPTIMAL;
+				const IGPUDescriptorSet::SWriteDescriptorSet write = {
+					.dstSet = interface.subAllocDS->getDescriptorSet(),
+					.binding = TexturesImGUIBindingIndex,
+					.arrayElement = interface.renderColorViewDescIndex,
+					.count = 1,
+					.info = &info
+				};
+				m_device->updateDescriptorSets({&write,1},{});
+			}
+			interface.transformParams.sceneTexDescIx = interface.renderColorViewDescIndex;
+		}
+
+		inline void beginRenderpass(IGPUCommandBuffer* cb, const IGPUCommandBuffer::SRenderpassBeginInfo& info)
+		{
+			cb->beginRenderPass(info,IGPUCommandBuffer::SUBPASS_CONTENTS::INLINE);
+			cb->setScissor(0,1,&info.renderArea);
+			const SViewport viewport = {
+				.x = 0,
+				.y = 0,
+				.width = static_cast<float>(info.renderArea.extent.width),
+				.height = static_cast<float>(info.renderArea.extent.height)
 			};
-			m_device->updateDescriptorSets({&write,1},{});
+			cb->setViewport(0u,1u,&viewport);
 		}
 
 		// Maximum frames which can be simultaneously submitted, used to cycle through our per-frame resources like command buffers
@@ -518,7 +545,6 @@ class UISampleApp final : public MonoWindowApplication, public BuiltinResourcesA
 		smart_refctd_ptr<CGeometryCreatorScene> m_scene;
 		smart_refctd_ptr<IGPURenderpass> m_renderpass;
 		smart_refctd_ptr<CSimpleDebugRenderer> m_renderer;
-		smart_refctd_ptr<IGPUImageView> m_renderColorView;
 		smart_refctd_ptr<IGPUFramebuffer> m_framebuffer;
 		//
 		smart_refctd_ptr<ISemaphore> m_semaphore;
@@ -706,6 +732,7 @@ class UISampleApp final : public MonoWindowApplication, public BuiltinResourcesA
 					sceneResolution = EditTransform(imguizmoM16InOut.view.pointer(), imguizmoM16InOut.projection.pointer(), imguizmoM16InOut.model.pointer(), transformParams);
 				}
 
+				model = core::transpose(imguizmoM16InOut.model).extractSub3x4();
 				// to Nabla + update camera & model matrices
 // TODO: make it more nicely, extract:
 // - Position by computing inverse of the view matrix and grabbing its translation
@@ -835,8 +862,8 @@ class UISampleApp final : public MonoWindowApplication, public BuiltinResourcesA
 			//
 			Camera camera = Camera(core::vectorSIMDf(0, 0, 0), core::vectorSIMDf(0, 0, 0), core::matrix4SIMD());
 			// mutables
-			std::string_view objectName;
 			core::matrix3x4SIMD model;
+			std::string_view objectName;
 			TransformRequestParams transformParams;
 			uint16_t2 sceneResolution = {1280,720};
 			float fov = 60.f, zNear = 0.1f, zFar = 10000.f, moveSpeed = 1.f, rotateSpeed = 1.f;

From 20cc57eaea399d68da28f709d6b63878eba67a61 Mon Sep 17 00:00:00 2001
From: devsh <devsh@devsh.eu>
Date: Tue, 24 Jun 2025 02:26:32 +0200
Subject: [PATCH 422/529] Push Mesh Loader example

---
 12_MeshLoaders/CMakeLists.txt       |   8 +
 12_MeshLoaders/README.md            |   2 +
 12_MeshLoaders/config.json.template |  28 +++
 12_MeshLoaders/include/common.hpp   |  18 ++
 12_MeshLoaders/main.cpp             | 272 ++++++++++++++++++++++++++++
 12_MeshLoaders/pipeline.groovy      |  50 +++++
 CMakeLists.txt                      |   4 +-
 7 files changed, 381 insertions(+), 1 deletion(-)
 create mode 100644 12_MeshLoaders/CMakeLists.txt
 create mode 100644 12_MeshLoaders/README.md
 create mode 100644 12_MeshLoaders/config.json.template
 create mode 100644 12_MeshLoaders/include/common.hpp
 create mode 100644 12_MeshLoaders/main.cpp
 create mode 100644 12_MeshLoaders/pipeline.groovy

diff --git a/12_MeshLoaders/CMakeLists.txt b/12_MeshLoaders/CMakeLists.txt
new file mode 100644
index 000000000..2dd253226
--- /dev/null
+++ b/12_MeshLoaders/CMakeLists.txt
@@ -0,0 +1,8 @@
+set(NBL_INCLUDE_SERACH_DIRECTORIES
+	"${CMAKE_CURRENT_SOURCE_DIR}/include"
+)
+
+	# TODO; Arek I removed `NBL_EXECUTABLE_PROJECT_CREATION_PCH_TARGET` from the last parameter here, doesn't this macro have 4 arguments anyway !?
+nbl_create_executable_project("" "" "${NBL_INCLUDE_SERACH_DIRECTORIES}" "" "")
+# TODO: Arek temporarily disabled cause I haven't figured out how to make this target yet
+# LINK_BUILTIN_RESOURCES_TO_TARGET(${EXECUTABLE_NAME} nblExamplesGeometrySpirvBRD)
\ No newline at end of file
diff --git a/12_MeshLoaders/README.md b/12_MeshLoaders/README.md
new file mode 100644
index 000000000..6330f4673
--- /dev/null
+++ b/12_MeshLoaders/README.md
@@ -0,0 +1,2 @@
+https://github.com/user-attachments/assets/6f779700-e6d4-4e11-95fb-7a7fddc47255
+
diff --git a/12_MeshLoaders/config.json.template b/12_MeshLoaders/config.json.template
new file mode 100644
index 000000000..f961745c1
--- /dev/null
+++ b/12_MeshLoaders/config.json.template
@@ -0,0 +1,28 @@
+{
+  "enableParallelBuild": true,
+  "threadsPerBuildProcess" : 2,
+  "isExecuted": false,
+  "scriptPath": "",
+  "cmake": {
+    "configurations": [ "Release", "Debug", "RelWithDebInfo" ],
+    "buildModes": [],
+    "requiredOptions": []
+  }, 
+  "profiles": [
+    {
+      "backend": "vulkan",
+      "platform": "windows",
+      "buildModes": [],
+      "runConfiguration": "Release",
+      "gpuArchitectures": []
+    }
+  ],
+  "dependencies": [],
+  "data": [
+    {
+      "dependencies": [],
+      "command": [""],
+      "outputs": []
+    }
+  ]
+}
\ No newline at end of file
diff --git a/12_MeshLoaders/include/common.hpp b/12_MeshLoaders/include/common.hpp
new file mode 100644
index 000000000..84cd8118a
--- /dev/null
+++ b/12_MeshLoaders/include/common.hpp
@@ -0,0 +1,18 @@
+#ifndef _NBL_THIS_EXAMPLE_COMMON_H_INCLUDED_
+#define _NBL_THIS_EXAMPLE_COMMON_H_INCLUDED_
+
+
+#include "nbl/examples/examples.hpp"
+
+using namespace nbl;
+using namespace core;
+using namespace hlsl;
+using namespace system;
+using namespace asset;
+using namespace ui;
+using namespace video;
+using namespace scene;
+using namespace nbl::examples;
+
+
+#endif // __NBL_THIS_EXAMPLE_COMMON_H_INCLUDED__
\ No newline at end of file
diff --git a/12_MeshLoaders/main.cpp b/12_MeshLoaders/main.cpp
new file mode 100644
index 000000000..13868fa8c
--- /dev/null
+++ b/12_MeshLoaders/main.cpp
@@ -0,0 +1,272 @@
+// Copyright (C) 2018-2020 - DevSH Graphics Programming Sp. z O.O.
+// This file is part of the "Nabla Engine".
+// For conditions of distribution and use, see copyright notice in nabla.h
+#include "common.hpp"
+
+#include "../3rdparty/portable-file-dialogs/portable-file-dialogs.h"
+
+
+class MeshLoadersApp final : public MonoWindowApplication, public BuiltinResourcesApplication
+{
+		using device_base_t = MonoWindowApplication;
+		using asset_base_t = BuiltinResourcesApplication;
+
+	public:
+		inline MeshLoadersApp(const path& _localInputCWD, const path& _localOutputCWD, const path& _sharedInputCWD, const path& _sharedOutputCWD)
+			: IApplicationFramework(_localInputCWD, _localOutputCWD, _sharedInputCWD, _sharedOutputCWD),
+			device_base_t({1280,720}, EF_UNKNOWN, _localInputCWD, _localOutputCWD, _sharedInputCWD, _sharedOutputCWD) {}
+
+		inline bool onAppInitialized(smart_refctd_ptr<ISystem>&& system) override
+		{
+			if (!asset_base_t::onAppInitialized(smart_refctd_ptr(system)))
+				return false;
+			if (!device_base_t::onAppInitialized(smart_refctd_ptr(system)))
+				return false;
+
+			m_semaphore = m_device->createSemaphore(m_realFrameIx);
+			if (!m_semaphore)
+				return logFail("Failed to Create a Semaphore!");
+
+			auto pool = m_device->createCommandPool(getGraphicsQueue()->getFamilyIndex(),IGPUCommandPool::CREATE_FLAGS::RESET_COMMAND_BUFFER_BIT);
+			for (auto i=0u; i<MaxFramesInFlight; i++)
+			{
+				if (!pool)
+					return logFail("Couldn't create Command Pool!");
+				if (!pool->createCommandBuffers(IGPUCommandPool::BUFFER_LEVEL::PRIMARY,{m_cmdBufs.data()+i,1}))
+					return logFail("Couldn't create Command Buffer!");
+			}
+			
+			//! cache results -- speeds up mesh generation on second run
+			m_qnc->loadCacheFromFile<EF_R8G8B8_SNORM>(m_system.get(),sharedOutputCWD/"../../tmp/normalCache888.sse");
+
+			//
+			if (!reloadModel())
+				return false;
+#if 0			
+			const uint32_t addtionalBufferOwnershipFamilies[] = {getGraphicsQueue()->getFamilyIndex()};
+			// we want to use the vertex data through UTBs
+			using usage_f = IGPUBuffer::E_USAGE_FLAGS;
+			CAssetConverter::patch_t<asset::ICPUPolygonGeometry> patch = {};
+			patch.positionBufferUsages = usage_f::EUF_UNIFORM_TEXEL_BUFFER_BIT;
+			patch.indexBufferUsages = usage_f::EUF_INDEX_BUFFER_BIT;
+			patch.otherBufferUsages = usage_f::EUF_UNIFORM_TEXEL_BUFFER_BIT;
+			m_scene = CGeometryCreatorScene::create(
+				{
+					.transferQueue = getTransferUpQueue(),
+					.utilities = m_utils.get(),
+					.logger = m_logger.get(),
+					.addtionalBufferOwnershipFamilies = addtionalBufferOwnershipFamilies
+				},patch
+			);
+#endif
+
+			auto scRes = static_cast<CDefaultSwapchainFramebuffers*>(m_surface->getSwapchainResources());
+			m_renderer = CSimpleDebugRenderer::create(m_assetMgr.get(),scRes->getRenderpass(),0,nullptr);
+
+			camera.mapKeysToArrows();
+
+			onAppInitializedFinish();
+			return true;
+		}
+
+		inline IQueue::SSubmitInfo::SSemaphoreInfo renderFrame(const std::chrono::microseconds nextPresentationTimestamp) override
+		{
+			m_inputSystem->getDefaultMouse(&mouse);
+			m_inputSystem->getDefaultKeyboard(&keyboard);
+
+			//
+			const auto resourceIx = m_realFrameIx % MaxFramesInFlight;
+
+			auto* const cb = m_cmdBufs.data()[resourceIx].get();
+			cb->reset(IGPUCommandBuffer::RESET_FLAGS::RELEASE_RESOURCES_BIT);
+			cb->begin(IGPUCommandBuffer::USAGE::ONE_TIME_SUBMIT_BIT);
+			// clear to black for both things
+			{
+				// begin renderpass
+				{
+					auto scRes = static_cast<CDefaultSwapchainFramebuffers*>(m_surface->getSwapchainResources());
+					auto* framebuffer = scRes->getFramebuffer(device_base_t::getCurrentAcquire().imageIndex);
+					const IGPUCommandBuffer::SClearColorValue clearValue = { .float32 = {1.f,0.f,1.f,1.f} };
+					const IGPUCommandBuffer::SClearDepthStencilValue depthValue = { .depth = 0.f };
+					const VkRect2D currentRenderArea =
+					{
+						.offset = {0,0},
+						.extent = {framebuffer->getCreationParameters().width,framebuffer->getCreationParameters().height}
+					};
+					const IGPUCommandBuffer::SRenderpassBeginInfo info =
+					{
+						.framebuffer = framebuffer,
+						.colorClearValues = &clearValue,
+						.depthStencilClearValues = &depthValue,
+						.renderArea = currentRenderArea
+					};
+					cb->beginRenderPass(info,IGPUCommandBuffer::SUBPASS_CONTENTS::INLINE);
+
+					const SViewport viewport = {
+						.x = static_cast<float>(currentRenderArea.offset.x),
+						.y = static_cast<float>(currentRenderArea.offset.y),
+						.width = static_cast<float>(currentRenderArea.extent.width),
+						.height = static_cast<float>(currentRenderArea.extent.height)
+					};
+					cb->setViewport(0u,1u,&viewport);
+		
+					cb->setScissor(0u,1u,&currentRenderArea);
+				}
+				// late latch input
+				{
+					camera.beginInputProcessing(nextPresentationTimestamp);
+					mouse.consumeEvents([&](const IMouseEventChannel::range_t& events) -> void { camera.mouseProcess(events); }, m_logger.get());
+					keyboard.consumeEvents([&](const IKeyboardEventChannel::range_t& events) -> void
+						{
+							camera.keyboardProcess(events);
+						},
+						m_logger.get()
+					);
+					camera.endInputProcessing(nextPresentationTimestamp);
+				}
+				// draw scene
+				{
+					float32_t3x4 viewMatrix;
+					float32_t4x4 viewProjMatrix;
+					// TODO: get rid of legacy matrices
+					{
+						memcpy(&viewMatrix,camera.getViewMatrix().pointer(),sizeof(viewMatrix));
+						memcpy(&viewProjMatrix,camera.getConcatenatedMatrix().pointer(),sizeof(viewProjMatrix));
+					}
+ 					m_renderer->render(cb,CSimpleDebugRenderer::SViewParams(viewMatrix,viewProjMatrix));
+				}
+				cb->endRenderPass();
+			}
+			cb->end();
+
+			//updateGUIDescriptorSet();
+
+			IQueue::SSubmitInfo::SSemaphoreInfo retval =
+			{
+				.semaphore = m_semaphore.get(),
+				.value = ++m_realFrameIx,
+				.stageMask = PIPELINE_STAGE_FLAGS::ALL_GRAPHICS_BITS
+			};
+			const IQueue::SSubmitInfo::SCommandBufferInfo commandBuffers[] =
+			{
+				{.cmdbuf = cb }
+			};
+			const IQueue::SSubmitInfo::SSemaphoreInfo acquired[] = {
+				{
+					.semaphore = device_base_t::getCurrentAcquire().semaphore,
+					.value = device_base_t::getCurrentAcquire().acquireCount,
+					.stageMask = PIPELINE_STAGE_FLAGS::NONE
+				}
+			};
+			const IQueue::SSubmitInfo infos[] =
+			{
+				{
+					.waitSemaphores = acquired,
+					.commandBuffers = commandBuffers,
+					.signalSemaphores = {&retval,1}
+				}
+			};
+			
+			if (getGraphicsQueue()->submit(infos) != IQueue::RESULT::SUCCESS)
+			{
+				retval.semaphore = nullptr; // so that we don't wait on semaphore that will never signal
+				m_realFrameIx--;
+			}
+
+			std::string caption = "[Nabla Engine] Mesh Loaders";
+			{
+				caption += ", displaying [";
+				caption += m_modelPath;
+				caption += "]";
+				m_window->setCaption(caption);
+			}
+			return retval;
+		}
+
+	protected:
+		const video::IGPURenderpass::SCreationParams::SSubpassDependency* getDefaultSubpassDependencies() const override
+		{
+			// Subsequent submits don't wait for each other, hence its important to have External Dependencies which prevent users of the depth attachment overlapping.
+			const static IGPURenderpass::SCreationParams::SSubpassDependency dependencies[] = {
+				// wipe-transition of Color to ATTACHMENT_OPTIMAL and depth
+				{
+					.srcSubpass = IGPURenderpass::SCreationParams::SSubpassDependency::External,
+					.dstSubpass = 0,
+					.memoryBarrier = {
+						// last place where the depth can get modified in previous frame, `COLOR_ATTACHMENT_OUTPUT_BIT` is implicitly later
+						.srcStageMask = PIPELINE_STAGE_FLAGS::LATE_FRAGMENT_TESTS_BIT,
+						// don't want any writes to be available, we'll clear 
+						.srcAccessMask = ACCESS_FLAGS::NONE,
+						// destination needs to wait as early as possible
+						// TODO: `COLOR_ATTACHMENT_OUTPUT_BIT` shouldn't be needed, because its a logically later stage, see TODO in `ECommonEnums.h`
+						.dstStageMask = PIPELINE_STAGE_FLAGS::EARLY_FRAGMENT_TESTS_BIT | PIPELINE_STAGE_FLAGS::COLOR_ATTACHMENT_OUTPUT_BIT,
+						// because depth and color get cleared first no read mask
+						.dstAccessMask = ACCESS_FLAGS::DEPTH_STENCIL_ATTACHMENT_WRITE_BIT | ACCESS_FLAGS::COLOR_ATTACHMENT_WRITE_BIT
+					}
+					// leave view offsets and flags default
+				},
+				// color from ATTACHMENT_OPTIMAL to PRESENT_SRC
+				{
+					.srcSubpass = 0,
+					.dstSubpass = IGPURenderpass::SCreationParams::SSubpassDependency::External,
+					.memoryBarrier = {
+						// last place where the color can get modified, depth is implicitly earlier
+						.srcStageMask = PIPELINE_STAGE_FLAGS::COLOR_ATTACHMENT_OUTPUT_BIT,
+						// only write ops, reads can't be made available
+						.srcAccessMask = ACCESS_FLAGS::COLOR_ATTACHMENT_WRITE_BIT
+						// spec says nothing is needed when presentation is the destination
+					}
+					// leave view offsets and flags default
+				},
+				IGPURenderpass::SCreationParams::DependenciesEnd
+			};
+			return dependencies;
+		}
+
+	private:
+		inline bool reloadModel()
+		{
+			pfd::open_file file("Choose a supported Model File", "../../media", { "All Supported Formats", "*.ply *.stl *.serialized *.obj",
+				"TODO (.ply)", "*.ply",
+				"TODO (.stl)", "*.stl",
+				"Mitsuba 0.6 Serialized (.serialized)", "*.serialized",
+				"Wavefront Object (.obj)", "*.obj" 
+			});
+			if (file.result().empty())
+				return false;
+			m_modelPath = file.result()[0];
+
+			// free up
+			m_assetMgr->clearAllAssetCache();
+
+			//! load the geometry
+			IAssetLoader::SAssetLoadParams params = {};
+			params.meshManipulatorOverride = nullptr; // TODO
+			auto bundle = m_assetMgr->getAsset(m_modelPath,params);
+			if (bundle.getContents().empty())
+				return false;
+			//! cache results -- speeds up mesh generation on second run
+			m_qnc->saveCacheToFile<EF_R8G8B8_SNORM>(m_system.get(),sharedOutputCWD/"../../tmp/normalCache888.sse");
+
+			return true;
+		}
+
+		// Maximum frames which can be simultaneously submitted, used to cycle through our per-frame resources like command buffers
+		constexpr static inline uint32_t MaxFramesInFlight = 3u;
+		//
+		smart_refctd_ptr<CQuantNormalCache> m_qnc;
+		smart_refctd_ptr<CSimpleDebugRenderer> m_renderer;
+		//
+		smart_refctd_ptr<ISemaphore> m_semaphore;
+		uint64_t m_realFrameIx = 0;
+		std::array<smart_refctd_ptr<IGPUCommandBuffer>,MaxFramesInFlight> m_cmdBufs;
+		//
+		InputSystem::ChannelReader<IMouseEventChannel> mouse;
+		InputSystem::ChannelReader<IKeyboardEventChannel> keyboard;
+		//
+		Camera camera = Camera(core::vectorSIMDf(0, 0, 0), core::vectorSIMDf(0, 0, 0), core::matrix4SIMD());
+		// mutables
+		std::string m_modelPath;
+};
+
+NBL_MAIN_FUNC(MeshLoadersApp)
\ No newline at end of file
diff --git a/12_MeshLoaders/pipeline.groovy b/12_MeshLoaders/pipeline.groovy
new file mode 100644
index 000000000..7b7c9702a
--- /dev/null
+++ b/12_MeshLoaders/pipeline.groovy
@@ -0,0 +1,50 @@
+import org.DevshGraphicsProgramming.Agent
+import org.DevshGraphicsProgramming.BuilderInfo
+import org.DevshGraphicsProgramming.IBuilder
+
+class CUIBuilder extends IBuilder
+{
+	public CUIBuilder(Agent _agent, _info)
+	{
+		super(_agent, _info)
+	}
+	
+	@Override
+	public boolean prepare(Map axisMapping)
+	{
+		return true
+	}
+	
+	@Override
+  	public boolean build(Map axisMapping)
+	{
+		IBuilder.CONFIGURATION config = axisMapping.get("CONFIGURATION")
+		IBuilder.BUILD_TYPE buildType = axisMapping.get("BUILD_TYPE")
+		
+		def nameOfBuildDirectory = getNameOfBuildDirectory(buildType)
+		def nameOfConfig = getNameOfConfig(config)
+		
+		agent.execute("cmake --build ${info.rootProjectPath}/${nameOfBuildDirectory}/${info.targetProjectPathRelativeToRoot} --target ${info.targetBaseName} --config ${nameOfConfig} -j12 -v")
+		
+		return true
+	}
+	
+	@Override
+  	public boolean test(Map axisMapping)
+	{
+		return true
+	}
+	
+	@Override
+	public boolean install(Map axisMapping)
+	{
+		return true
+	}
+}
+
+def create(Agent _agent, _info)
+{
+	return new CUIBuilder(_agent, _info)
+}
+
+return this
\ No newline at end of file
diff --git a/CMakeLists.txt b/CMakeLists.txt
index aa3880762..66d6f682d 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -41,6 +41,8 @@ if(NBL_BUILD_EXAMPLES)
 	add_subdirectory(10_CountingSort)
 	# showcase use of FFT for post-FX Bloom  effect
 	add_subdirectory(11_FFT)
+	#
+	add_subdirectory(12_MeshLoaders EXCLUDE_FROM_ALL)
 
 	# Waiting for a refactor
 	#add_subdirectory(27_PLYSTLDemo)
@@ -71,7 +73,7 @@ if(NBL_BUILD_EXAMPLES)
 	add_subdirectory(47_DerivMapTest EXCLUDE_FROM_ALL)
 	add_subdirectory(54_Transformations EXCLUDE_FROM_ALL)
 	add_subdirectory(55_RGB18E7S3 EXCLUDE_FROM_ALL)
-	add_subdirectory(61_UI EXCLUDE_FROM_ALL) # TODO: resurrect before `mesh_loaders` merge
+	add_subdirectory(61_UI)
 	add_subdirectory(62_CAD EXCLUDE_FROM_ALL) # TODO: Erfan, Przemek, Francisco and co. need to resurrect this
 	add_subdirectory(62_SchusslerTest EXCLUDE_FROM_ALL)
 	add_subdirectory(64_EmulatedFloatTest)

From 1b8135bee69dab63bc45ce9a11f6567dea5b4181 Mon Sep 17 00:00:00 2001
From: Erfan Ahmadi <ahmadierfan99@gmail.com>
Date: Tue, 24 Jun 2025 14:39:27 +0400
Subject: [PATCH 423/529] Grid Outlines SDF with Correct Dilation Logic

---
 62_CAD/shaders/globals.hlsl                   |  15 +-
 62_CAD/shaders/main_pipeline/dtm.hlsl         |  12 +-
 .../main_pipeline/fragment_shader.hlsl        | 182 ++++++++++++++----
 3 files changed, 151 insertions(+), 58 deletions(-)

diff --git a/62_CAD/shaders/globals.hlsl b/62_CAD/shaders/globals.hlsl
index 41c149205..eb32e103e 100644
--- a/62_CAD/shaders/globals.hlsl
+++ b/62_CAD/shaders/globals.hlsl
@@ -43,16 +43,6 @@ struct PushConstants
     uint32_t isDTMRendering;
 };
 
-#ifdef __HLSL_VERSION
-NBL_CONSTEXPR float InvalidGridDTMHeightValue = asfloat(0x7FC00000);
-
-bool isInvalidGridDtmHeightValue(float value)
-{
-    return isnan(value);
-}
-
-#endif
-
 struct WorldClipRect
 {
     pfloat64_t2 minClip; // min clip of a rect in worldspace coordinates of the original space (globals.defaultProjectionToNDC)
@@ -581,6 +571,11 @@ NBL_CONSTEXPR float MSDFSize = 64.0f;
 NBL_CONSTEXPR uint32_t MSDFMips = 4; 
 NBL_CONSTEXPR float HatchFillMSDFSceenSpaceSize = 8.0; 
 
+inline bool isInvalidGridDtmHeightValue(float value)
+{
+    return nbl::hlsl::isnan(value);
+}
+
 // Used in CPU-side only for now
 struct OrientedBoundingBox2D
 {
diff --git a/62_CAD/shaders/main_pipeline/dtm.hlsl b/62_CAD/shaders/main_pipeline/dtm.hlsl
index 6c638be58..a7697cf7f 100644
--- a/62_CAD/shaders/main_pipeline/dtm.hlsl
+++ b/62_CAD/shaders/main_pipeline/dtm.hlsl
@@ -230,7 +230,7 @@ float4 calculateDTMHeightColor(in DTMHeightShadingSettings settings, in float3 v
     return outputColor;
 }
 
-float calculateDTMContourSDF(in LineStyle contourStyle, in float3 v[3], in float2 fragPos, in float height)
+float calculateDTMContourSDF(in DTMContourSettings contourSettings, in LineStyle contourStyle, in float3 v[3], in float2 fragPos, in float height)
 {
     float distance = nbl::hlsl::numeric_limits<float>::max;
     const float contourThickness = (contourStyle.screenSpaceLineWidth + contourStyle.worldSpaceLineWidth * globals.screenToWorldRatio) * 0.5f;
@@ -473,22 +473,21 @@ struct GridDTMHeightMapData
     E_CELL_DIAGONAL cellDiagonal;
 };
 
-GridDTMHeightMapData retrieveGridDTMCellDataFromHeightMap(in float2 gridExtents, in float2 cellCoords, const float cellWidth, in Texture2D<uint32_t> heightMap)
+GridDTMHeightMapData retrieveGridDTMCellDataFromHeightMap(in float2 gridDimensions, in float2 cellCoords, in Texture2D<uint32_t> heightMap)
 {
     GridDTMHeightMapData output;
 
-    const float2 maxCellCoords = float2(round(gridExtents.x / cellWidth), round(gridExtents.y / cellWidth));
-    const float2 location = (cellCoords + float2(0.5f, 0.5f)) / maxCellCoords;
+    const float2 location = (cellCoords + float2(0.5f, 0.5f)) / gridDimensions;
     uint32_t4 cellData = heightMap.Gather(textureSampler, float2(location.x, location.y), 0);
 
-    printf("%u %u %u %u", cellData.x, cellData.y, cellData.z, cellData.w);
+    // printf("%u %u %u %u", cellData.x, cellData.y, cellData.z, cellData.w);
 
     output.heights = asfloat(cellData);
     output.cellDiagonal = dtm::resolveGridDTMCellDiagonal(cellData);
     return output;
 }
 
-GridDTMCell calculateCellTriangles(in float2 topLeft, in float2 gridExtents, in float2 cellCoords, const float cellWidth, in Texture2D<uint32_t> heightMap)
+GridDTMCell calculateCellTriangles(in dtm::GridDTMHeightMapData heightData, in float2 topLeft, in float2 cellCoords, const float cellWidth)
 {
     GridDTMCell output;
 
@@ -496,7 +495,6 @@ GridDTMCell calculateCellTriangles(in float2 topLeft, in float2 gridExtents, in
     // heightData.heihts.y - bottom right texel
     // heightData.heihts.z - top right texel
     // heightData.heihts.w - top left texel
-    dtm::GridDTMHeightMapData heightData = dtm::retrieveGridDTMCellDataFromHeightMap(gridExtents, cellCoords, cellWidth, heightMap);
     const bool diagonalFromTopLeftToBottomRight = heightData.cellDiagonal == E_CELL_DIAGONAL::TOP_LEFT_TO_BOTTOM_RIGHT;
     float2 gridSpaceCellTopLeftCoords = cellCoords * cellWidth;
 
diff --git a/62_CAD/shaders/main_pipeline/fragment_shader.hlsl b/62_CAD/shaders/main_pipeline/fragment_shader.hlsl
index 2627be5b3..2f6162815 100644
--- a/62_CAD/shaders/main_pipeline/fragment_shader.hlsl
+++ b/62_CAD/shaders/main_pipeline/fragment_shader.hlsl
@@ -158,7 +158,7 @@ float4 fragMain(PSInput input) : SV_TARGET
             for(uint32_t i = 0; i < dtmSettings.contourSettingsCount; ++i) // TODO: should reverse the order with blendUnder
             {
                 LineStyle contourStyle = loadLineStyle(dtmSettings.contourSettings[i].contourLineStyleIdx);
-                float sdf = dtm::calculateDTMContourColor(contourStyle, v, input.position.xy, height);
+                float sdf = dtm::calculateDTMContourSDF(dtmSettings.contourSettings[i], contourStyle, v, input.position.xy, height);
                 float4 contourColor = contourStyle.color;
                 contourColor.a *= 1.0f - smoothstep(-globals.antiAliasingFactor, globals.antiAliasingFactor, sdf);
                 dtmColor = dtm::blendUnder(dtmColor, contourColor);
@@ -428,21 +428,20 @@ float4 fragMain(PSInput input) : SV_TARGET
             float2 uv = input.getImageUV();
             const uint32_t textureId = input.getGridDTMHeightTextureID();
 
-            float2 topLeft = input.getGridDTMScreenSpaceTopLeft();
+            float2 gridTopLeftCorner = input.getGridDTMScreenSpaceTopLeft();
             float2 gridExtents = input.getGridDTMScreenSpaceGridExtents();
             const float cellWidth = input.getGridDTMScreenSpaceCellWidth();
-            float2 gridDimensions = gridExtents / cellWidth; // TODO: Figure out if it's better to precomp in vtx
+            // TODO: I think we can get it from the height map size if texture is valid?!, better if it comes directly from CPU side, vertex shader or something, division + round to integer is error-prone for large integer values
+            float2 gridDimensions = round(gridExtents / cellWidth); // texturesU32[NonUniformResourceIndex(textureId)].GetDimensions()? 
 
             float2 gridSpacePos = uv * gridExtents;
             float2 gridSpacePosDivGridCellWidth = gridSpacePos / cellWidth;
-            float2 cellCoords; // rename to currentCellCoords
+            float2 currentCellCoord;
             {
-                cellCoords.x = floor(gridSpacePosDivGridCellWidth.x);
-                cellCoords.y = floor(gridSpacePosDivGridCellWidth.y);
+                currentCellCoord.x = floor(gridSpacePosDivGridCellWidth.x);
+                currentCellCoord.y = floor(gridSpacePosDivGridCellWidth.y);
             }
 
-            float2 gridSpaceCellTopLeftCoords = cellCoords * cellWidth;
-
             // grid consists of square cells and cells are divided into two triangles:
             // depending on mode it is
             // either:        or:
@@ -459,9 +458,9 @@ float4 fragMain(PSInput input) : SV_TARGET
                 nbl::hlsl::shapes::Line<float> outlineLineSegments[2];
                 
                 const float halfCellWidth = cellWidth * 0.5f;
-                const float2 horizontalBounds = float2(topLeft.y, topLeft.y + gridExtents.y);
-                const float2 verticalBounds = float2(topLeft.x, topLeft.x + gridExtents.x);
-                float2 nearestLineRemainingCoords = int2((gridSpacePos + halfCellWidth) / cellWidth) * cellWidth + topLeft;
+                const float2 horizontalBounds = float2(gridTopLeftCorner.y, gridTopLeftCorner.y + gridExtents.y);
+                const float2 verticalBounds = float2(gridTopLeftCorner.x, gridTopLeftCorner.x + gridExtents.x);
+                float2 nearestLineRemainingCoords = int2((gridSpacePos + halfCellWidth) / cellWidth) * cellWidth + gridTopLeftCorner;
                 // shift lines outside of the grid to a bound
                 nearestLineRemainingCoords.x = clamp(nearestLineRemainingCoords.x, verticalBounds.x, verticalBounds.y);
                 nearestLineRemainingCoords.y = clamp(nearestLineRemainingCoords.y, horizontalBounds.x, horizontalBounds.y);
@@ -474,48 +473,87 @@ float4 fragMain(PSInput input) : SV_TARGET
                 outlineLineSegments[1].P1 = float32_t2(nearestLineRemainingCoords.x, horizontalBounds.y);
                 
                 float4 dtmColor = dtm::calculateGridDTMOutlineColor(dtmSettings.outlineLineStyleIdx, outlineLineSegments, input.position.xy, 0.0f);
+                
                 textureColor = dtmColor.rgb;
                 localAlpha = dtmColor.a;
             }
             else
             {
                 // calculate localUV and figure out the 4 cells we're gonna do sdf with
-                float2 localUV = gridSpacePosDivGridCellWidth - cellCoords; // TODO: use fmod instead?
-                float2 offset = round(localUV) * 2.0f - 1.0f;
-                
+                float2 localUV = gridSpacePosDivGridCellWidth - currentCellCoord; // TODO: use fmod instead?
+                int2 roundedLocalUV = round(localUV);
+                float2 offset = roundedLocalUV * 2.0f - 1.0f;
+
+                // Triangles
                 const uint32_t MaxTrianglesToDoSDFWith = 8u;
                 dtm::GridDTMTriangle triangles[MaxTrianglesToDoSDFWith];
                 float interpolatedHeights[MaxTrianglesToDoSDFWith]; // these are height based on barycentric interpolation of current pixel with all the triangles above
                 uint32_t triangleCount = 0u;
                 
-                const uint32_t MaxLinesToDoSDFWith = 4u;
-                // TODO: Lines to do SDF with
-                // But only do if outlines are enabled
+                // We can do sdf for up to 4 maximum lines for the outlines, 2 belong to the current cell and the other 2 belong to the opposite neighbouring cell
+                /* Example:
+                          |                  
+                          |     opposite cell
+                          |                  
+                    ------+------            
+                          |                  
+        current cell      |                  
+                          |                  
+                          
+                   `+` is the current corner and we draw the 4 lines leading up to it.
+                */
+                
+                // curr cell horizontal, curr cell vertical, opposite cell horizontal, opposite cell vertical 
+                bool4 linesValidity = bool4(false, false, false, false);
                 
-                // TODO: UNROLL
+                [unroll]
                 for (int i = 0; i < 2; ++i)
                 {
                     for (int j = 0; j < 2; ++j)
                     {
-                        float2 cellCoord = cellCoords + float2(i, j) * offset;
+                        float2 cellCoord = currentCellCoord + float2(i, j) * offset;
                         const bool isCellWithinRange = 
                             cellCoord.x >= 0.0f && cellCoord.y >= 0.0f && 
-                            cellCoord.x <= gridDimensions.x && cellCoord.y <= gridDimensions.y;
+                            cellCoord.x < gridDimensions.x && cellCoord.y < gridDimensions.y;
                         if (isCellWithinRange)
                         {
-                            // Triangle thing
-                            // topLeft, in float2 gridExtents, in float2 cellCoords, const float cellWidth, in Texture2D<uint32_t> heightMap;
-                            dtm::GridDTMCell gridCellFormed = calculateCellTriangles(topLeft, gridExtents, cellCoord, cellWidth, texturesU32[NonUniformResourceIndex(textureId)]);
+                            dtm::GridDTMHeightMapData heightData = dtm::retrieveGridDTMCellDataFromHeightMap(gridDimensions, cellCoord, texturesU32[NonUniformResourceIndex(textureId)]);
+                            dtm::GridDTMCell gridCellFormed = dtm::calculateCellTriangles(heightData, gridTopLeftCorner, cellCoord, cellWidth);
                             // Check the validity of the triangles and only add if valid
                             triangles[triangleCount++] = gridCellFormed.triangleA;
                             triangles[triangleCount++] = gridCellFormed.triangleB;
+
+                            // we just need to check and set lines validity
+                            // Formulas to get current cell's horizontal and vertical lines validity
+                            // All this to avoid extra texel fetch to check validity and use the Gather result instead :D
+                            if (i == 0 && j == 0)
+                            {
+                                // current cell's line validity
+                                linesValidity[0] = !isInvalidGridDtmHeightValue(heightData.heights[2 - (roundedLocalUV.y * 2)]) && !isInvalidGridDtmHeightValue(heightData.heights[3 - (roundedLocalUV.y * 2)]);
+                                linesValidity[1] = !isInvalidGridDtmHeightValue(heightData.heights[roundedLocalUV.x ^ 0]) && !isInvalidGridDtmHeightValue(heightData.heights[roundedLocalUV.x ^ 3]);
+                            }
+                            if (i == 1 && j == 0)
+                            {
+                                linesValidity[1] = !isInvalidGridDtmHeightValue(heightData.heights[roundedLocalUV.x ^ 1]) && !isInvalidGridDtmHeightValue(heightData.heights[roundedLocalUV.x ^ 2]);
+                                linesValidity[2] = !isInvalidGridDtmHeightValue(heightData.heights[2 - (roundedLocalUV.y * 2)]) && !isInvalidGridDtmHeightValue(heightData.heights[3 - (roundedLocalUV.y * 2)]);;
+                            }
+                            if (i == 0 && j == 1)
+                            {
+                                linesValidity[0] = !isInvalidGridDtmHeightValue(heightData.heights[roundedLocalUV.y * 2]) && !isInvalidGridDtmHeightValue(heightData.heights[roundedLocalUV.y * 2 + 1]);
+                                linesValidity[3] = !isInvalidGridDtmHeightValue(heightData.heights[roundedLocalUV.x ^ 1]) && !isInvalidGridDtmHeightValue(heightData.heights[roundedLocalUV.x ^ 2]);
+                            }
+                            if (i == 1 && j == 1)
+                            {
+                                linesValidity[2] = !isInvalidGridDtmHeightValue(heightData.heights[roundedLocalUV.y * 2]) && !isInvalidGridDtmHeightValue(heightData.heights[roundedLocalUV.y * 2 + 1]);
+                                linesValidity[3] = !isInvalidGridDtmHeightValue(heightData.heights[roundedLocalUV.x ^ 1]) && !isInvalidGridDtmHeightValue(heightData.heights[roundedLocalUV.x ^ 2]);
+                            }
                         }
                     }
                 }
                 
                 // float heightDeriv = fwidth(height);
                 // For height shading, merge this loop with the previous one, because baryCoord all positive means point inside triangle and we can use that to figure out the triangle we want to do height shading for.
-                for (int t = 0; t < trianglesCount; ++t)
+                for (int t = 0; t < triangleCount; ++t)
                 {
                     dtm::GridDTMTriangle tri = triangles[t];
                     const float3 baryCoord = dtm::calculateDTMTriangleBarycentrics(tri.vertices[0].xy, tri.vertices[1].xy, tri.vertices[2].xy, input.position.xy);
@@ -529,28 +567,90 @@ float4 fragMain(PSInput input) : SV_TARGET
                     {
                         LineStyle contourStyle = loadLineStyle(dtmSettings.contourSettings[i].contourLineStyleIdx);
                         float sdf = nbl::hlsl::numeric_limits<float>::max;
-                        for (int t = 0; t < trianglesCount; ++t)
+                        for (int t = 0; t < triangleCount; ++t)
                         {
                             dtm::GridDTMTriangle tri = triangles[t];
-                            sdf = min(sdf, dtm::calculateDTMContourSDF(contourStyle, tri.vertices, input.position.xy, interpolatedHeights[t]));
+                            sdf = min(sdf, dtm::calculateDTMContourSDF(dtmSettings.contourSettings[i], contourStyle, tri.vertices, input.position.xy, interpolatedHeights[t]));
+#if 0 // Debug Triangles
+                            nbl::hlsl::shapes::Line<float> lineSegment;
+                            lineSegment.P0 = tri.vertices[0].xy;
+                            lineSegment.P1 = tri.vertices[1].xy;
+                            float distance = ClippedSignedDistance<nbl::hlsl::shapes::Line<float> >::sdf(lineSegment, input.position.xy, 1.0f, false);
+                            sdf = min(sdf, distance);
+                            lineSegment.P0 = tri.vertices[1].xy;
+                            lineSegment.P1 = tri.vertices[2].xy;
+                            distance = ClippedSignedDistance<nbl::hlsl::shapes::Line<float> >::sdf(lineSegment, input.position.xy, 1.0f, false);
+                            sdf = min(sdf, distance);
+                            lineSegment.P0 = tri.vertices[0].xy;
+                            lineSegment.P1 = tri.vertices[2].xy;
+                            distance = ClippedSignedDistance<nbl::hlsl::shapes::Line<float> >::sdf(lineSegment, input.position.xy, 1.0f, false);
+                            sdf = min(sdf, distance);
+#endif
                         }
                         
-                        float4 contourColor = contourStyle.color;
+                        float4 contourColor = contourStyle.color; contourColor.a = 0.5f;
                         contourColor.a *= 1.0f - smoothstep(-globals.antiAliasingFactor, globals.antiAliasingFactor, sdf);
                         dtmColor = dtm::blendUnder(dtmColor, contourColor);
                     }
                 }
 
-                // Outlines:
-                // Is Outlines Enabled?
-                    // float sdf = max;
-                    // for each line
-                        // sdf = min(sdf, sdfOfOutlineSetting);
-                    // based on sdf, the outline line style + smoothstep: we compute color and alpha
-                    // blendUnder
+                if (dtmSettings.drawOutlineEnabled())
+                {
+                    float sdf = nbl::hlsl::numeric_limits<float>::max;
+                    LineStyle outlineStyle = loadLineStyle(dtmSettings.outlineLineStyleIdx);
+                    const float outlineThickness = (outlineStyle.screenSpaceLineWidth + outlineStyle.worldSpaceLineWidth * globals.screenToWorldRatio) * 0.5f;
+                    nbl::hlsl::shapes::Line<float> lineSegment;
+                    
+                    // Doing SDF of outlines as if cooridnate system is centered around the nearest corner of the cell
+                    float2 currentCellScreenspaceCoord = gridTopLeftCorner + (currentCellCoord + float2(roundedLocalUV)) * cellWidth;
+                    float2 localFragPos = input.position.xy - currentCellScreenspaceCoord;
+                    
+                    // TODO: Also make this a unrolled loop to reduce LOC
+                    if (linesValidity[0])
+                    {
+                        // this cells horizontal line
+                        lineSegment.P0 = float2(-offset.x, 0.0f) * cellWidth;
+                        lineSegment.P1 = float2(0.0f, 0.0f);
+                        float distance = ClippedSignedDistance<nbl::hlsl::shapes::Line<float> >::sdf(lineSegment, localFragPos, outlineThickness, outlineStyle.isRoadStyleFlag);
+                        sdf = min(sdf, distance);
+                    }
+                    if (linesValidity[1])
+                    {
+                        // this cells vertical line
+                        lineSegment.P0 = float2(0.0f, -offset.y) * cellWidth;
+                        lineSegment.P1 = float2(0.0f, 0.0f);
+                        float distance = ClippedSignedDistance<nbl::hlsl::shapes::Line<float> >::sdf(lineSegment, localFragPos, outlineThickness, outlineStyle.isRoadStyleFlag);
+                        sdf = min(sdf, distance);
+                    }
+                    if (linesValidity[2])
+                    {
+                        // opposite cell horizontal line
+                        lineSegment.P0 = float2(offset.x, 0.0f) * cellWidth;
+                        lineSegment.P1 = float2(0.0f, 0.0f);
+                        float distance = ClippedSignedDistance<nbl::hlsl::shapes::Line<float> >::sdf(lineSegment, localFragPos, outlineThickness, outlineStyle.isRoadStyleFlag);
+                        sdf = min(sdf, distance);
+                    }
+                    if (linesValidity[3])
+                    {
+                        // opposite cell vertical line
+                        lineSegment.P0 = float2(0.0f, offset.y) * cellWidth;
+                        lineSegment.P1 = float2(0.0f, 0.0f);
+                        float distance = ClippedSignedDistance<nbl::hlsl::shapes::Line<float> >::sdf(lineSegment, localFragPos, outlineThickness, outlineStyle.isRoadStyleFlag);
+                        sdf = min(sdf, distance);
+                    }
+
+                    float4 outlineColor = outlineStyle.color;
+                    outlineColor.a *= 1.0f - smoothstep(-globals.antiAliasingFactor, globals.antiAliasingFactor, sdf);
+                    dtmColor = dtm::blendUnder(dtmColor, outlineColor);
+                }
                 
-                // Height Shading:
-                    // We just do sdf with current triangle (if valid)
+                //textureColor = float3(linesValidity[0], linesValidity[1], 0.0f);
+                //localAlpha = 0.4f;
+
+                // TODO: Handle height shading, using only current triangle (if valid)
+                
+                textureColor = dtmColor.rgb / dtmColor.a;
+                localAlpha = dtmColor.a;
             }
             
 #if 0
@@ -566,13 +666,13 @@ float4 fragMain(PSInput input) : SV_TARGET
                 // heightData.heihts.y - bottom right texel
                 // heightData.heihts.z - top right texel
                 // heightData.heihts.w - top left texel
-                dtm::GridDTMHeightMapData heightData = dtm::retrieveGridDTMCellDataFromHeightMap(gridExtents, cellCoords, cellWidth, texturesU32[NonUniformResourceIndex(textureId)]);
+                dtm::GridDTMHeightMapData heightData = dtm::retrieveGridDTMCellDataFromHeightMap(gridExtents, currentCellCoord, cellWidth, texturesU32[NonUniformResourceIndex(textureId)]);
                 if (heightData.cellDiagonal == E_CELL_DIAGONAL::INVALID)
                     discard;
 
                 const bool diagonalFromTopLeftToBottomRight = heightData.cellDiagonal == E_CELL_DIAGONAL::TOP_LEFT_TO_BOTTOM_RIGHT;
 
-                float2 insideCellCoord = gridSpacePos - float2(cellWidth, cellWidth) * cellCoords; // TODO: use fmod instead?
+                float2 insideCellCoord = gridSpacePos - float2(cellWidth, cellWidth) * currentCellCoord; // TODO: use fmod instead?
                 // my ASCII art above explains which triangle is A and which is B
                 const bool triangleA = diagonalFromTopLeftToBottomRight ?
                     insideCellCoord.x < insideCellCoord.y :
@@ -610,7 +710,7 @@ float4 fragMain(PSInput input) : SV_TARGET
                 // move from grid space to screen space
                 [unroll]
                 for (int i = 0; i < 3; ++i)
-                    currentTriangle.vertices[i].xy += topLeft;
+                    currentTriangle.vertices[i].xy += gridTopLeftCorner;
 
                 const float2 neighbouringCellsCellOffsets[8] = {
                     float2(-1.0f, -1.0f),
@@ -626,8 +726,8 @@ float4 fragMain(PSInput input) : SV_TARGET
                 // construct triangles of neighbouring cells
                 for (int i = 0; i < 8; ++i)
                 {
-                    float2 neighbouringCellCoords = cellCoords + neighbouringCellsCellOffsets[i];
-                    neighbouringCells[i] = dtm::calculateCellTriangles(topLeft, gridExtents, neighbouringCellCoords, cellWidth, texturesU32[NonUniformResourceIndex(textureId)]);
+                    float2 neighbouringcurrentCellCoord = currentCellCoord + neighbouringCellsCellOffsets[i];
+                    neighbouringCells[i] = dtm::calculateCellTriangles(gridTopLeftCorner, gridExtents, neighbouringcurrentCellCoord, cellWidth, texturesU32[NonUniformResourceIndex(textureId)]);
                 }
             }
 

From b397df5e4fd2ad87af04993ec7ae3e5260d609f4 Mon Sep 17 00:00:00 2001
From: Erfan Ahmadi <ahmadierfan99@gmail.com>
Date: Tue, 24 Jun 2025 14:51:22 +0400
Subject: [PATCH 424/529] small fix

---
 62_CAD/shaders/main_pipeline/fragment_shader.hlsl | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/62_CAD/shaders/main_pipeline/fragment_shader.hlsl b/62_CAD/shaders/main_pipeline/fragment_shader.hlsl
index 2f6162815..5edc7b252 100644
--- a/62_CAD/shaders/main_pipeline/fragment_shader.hlsl
+++ b/62_CAD/shaders/main_pipeline/fragment_shader.hlsl
@@ -526,6 +526,7 @@ float4 fragMain(PSInput input) : SV_TARGET
                             // we just need to check and set lines validity
                             // Formulas to get current cell's horizontal and vertical lines validity
                             // All this to avoid extra texel fetch to check validity and use the Gather result instead :D
+                            // TODO: Only 0,0 and 1,1 is enough to check if cells are valid, but other checks required in case current cell is invalid (out of bounds) but it's line is valid
                             if (i == 0 && j == 0)
                             {
                                 // current cell's line validity
@@ -540,7 +541,7 @@ float4 fragMain(PSInput input) : SV_TARGET
                             if (i == 0 && j == 1)
                             {
                                 linesValidity[0] = !isInvalidGridDtmHeightValue(heightData.heights[roundedLocalUV.y * 2]) && !isInvalidGridDtmHeightValue(heightData.heights[roundedLocalUV.y * 2 + 1]);
-                                linesValidity[3] = !isInvalidGridDtmHeightValue(heightData.heights[roundedLocalUV.x ^ 1]) && !isInvalidGridDtmHeightValue(heightData.heights[roundedLocalUV.x ^ 2]);
+                                linesValidity[3] = !isInvalidGridDtmHeightValue(heightData.heights[roundedLocalUV.x ^ 0]) && !isInvalidGridDtmHeightValue(heightData.heights[roundedLocalUV.x ^ 3]);
                             }
                             if (i == 1 && j == 1)
                             {

From 73c5652ee0c8585ebe02e424d92888c1f382d217 Mon Sep 17 00:00:00 2001
From: Erfan Ahmadi <ahmadierfan99@gmail.com>
Date: Tue, 24 Jun 2025 14:52:25 +0400
Subject: [PATCH 425/529] comment

---
 62_CAD/shaders/main_pipeline/fragment_shader.hlsl | 1 +
 1 file changed, 1 insertion(+)

diff --git a/62_CAD/shaders/main_pipeline/fragment_shader.hlsl b/62_CAD/shaders/main_pipeline/fragment_shader.hlsl
index 5edc7b252..1cdc7fe63 100644
--- a/62_CAD/shaders/main_pipeline/fragment_shader.hlsl
+++ b/62_CAD/shaders/main_pipeline/fragment_shader.hlsl
@@ -606,6 +606,7 @@ float4 fragMain(PSInput input) : SV_TARGET
                     float2 currentCellScreenspaceCoord = gridTopLeftCorner + (currentCellCoord + float2(roundedLocalUV)) * cellWidth;
                     float2 localFragPos = input.position.xy - currentCellScreenspaceCoord;
                     
+                    // Drawing the lines that form a plus sign around the current corner:
                     // TODO: Also make this a unrolled loop to reduce LOC
                     if (linesValidity[0])
                     {

From 792f539526e02fb959f40c39a578d96a87414dca Mon Sep 17 00:00:00 2001
From: Erfan Ahmadi <ahmadierfan99@gmail.com>
Date: Tue, 24 Jun 2025 18:10:37 +0400
Subject: [PATCH 426/529] Small fixes with DTMS

---
 62_CAD/DrawResourcesFiller.cpp        |  2 +-
 62_CAD/shaders/globals.hlsl           | 11 +++++++++--
 62_CAD/shaders/main_pipeline/dtm.hlsl |  4 ++--
 3 files changed, 12 insertions(+), 5 deletions(-)

diff --git a/62_CAD/DrawResourcesFiller.cpp b/62_CAD/DrawResourcesFiller.cpp
index 296221fb5..8a3c0dff0 100644
--- a/62_CAD/DrawResourcesFiller.cpp
+++ b/62_CAD/DrawResourcesFiller.cpp
@@ -1668,7 +1668,7 @@ uint32_t DrawResourcesFiller::addDTMSettings_Internal(const DTMSettingsInfo& dtm
 		}
 		dtmSettings.heightShadingSettings.intervalIndexToHeightMultiplier = dtmSettingsInfo.heightShadingInfo.intervalIndexToHeightMultiplier;
 		dtmSettings.heightShadingSettings.isCenteredShading = static_cast<int>(dtmSettingsInfo.heightShadingInfo.isCenteredShading);
-		_NBL_DEBUG_BREAK_IF(!dtmSettingsInfo.heightShadingInfo.fillShaderDTMSettingsHeightColorMap(dtmSettings));
+		dtmSettingsInfo.heightShadingInfo.fillShaderDTMSettingsHeightColorMap(dtmSettings);
 	}
 	if (dtmSettings.mode & E_DTM_MODE::CONTOUR)
 	{
diff --git a/62_CAD/shaders/globals.hlsl b/62_CAD/shaders/globals.hlsl
index eb32e103e..2c645baf3 100644
--- a/62_CAD/shaders/globals.hlsl
+++ b/62_CAD/shaders/globals.hlsl
@@ -589,11 +589,11 @@ struct OrientedBoundingBox2D
 
 LineStyle loadLineStyle(const uint32_t index)
 {
-    return vk::RawBufferLoad<LineStyle>(globals.pointers.lineStyles + index * sizeof(LineStyle), 8u);
+    return vk::RawBufferLoad<LineStyle>(globals.pointers.lineStyles + index * sizeof(LineStyle), 4u);
 }
 DTMSettings loadDTMSettings(const uint32_t index)
 {
-    return vk::RawBufferLoad<DTMSettings>(globals.pointers.dtmSettings + index * sizeof(DTMSettings), 8u);
+    return vk::RawBufferLoad<DTMSettings>(globals.pointers.dtmSettings + index * sizeof(DTMSettings), 4u);
 }
 pfloat64_t3x3 loadCustomProjection(const uint32_t index)
 {
@@ -611,6 +611,13 @@ DrawObject loadDrawObject(const uint32_t index)
 {
     return vk::RawBufferLoad<DrawObject>(globals.pointers.drawObjects + index * sizeof(DrawObject), 8u);
 }
+#else
+static_assert(alignof(LineStyle)==4u);
+static_assert(alignof(DTMSettings)==4u);
+static_assert(alignof(pfloat64_t3x3)==8u);
+static_assert(alignof(WorldClipRect)==8u);
+static_assert(alignof(MainObject)==4u);
+static_assert(alignof(DrawObject)==8u);
 #endif
 
 
diff --git a/62_CAD/shaders/main_pipeline/dtm.hlsl b/62_CAD/shaders/main_pipeline/dtm.hlsl
index a7697cf7f..04c1daaca 100644
--- a/62_CAD/shaders/main_pipeline/dtm.hlsl
+++ b/62_CAD/shaders/main_pipeline/dtm.hlsl
@@ -77,8 +77,8 @@ void getIntervalHeightAndColor(in int intervalIndex, in DTMHeightShadingSettings
         outIntervalHeight = minShadingHeight + (float(intervalIndex)) * settings.intervalLength;
 
     DTMSettingsHeightsAccessor dtmHeightsAccessor = { settings };
-    uint32_t upperBoundHeightIndex = min(nbl::hlsl::upper_bound(dtmHeightsAccessor, 0, settings.heightColorEntryCount, heightForColor), settings.heightColorEntryCount - 1u);
-    uint32_t lowerBoundHeightIndex = max(upperBoundHeightIndex - 1, 0);
+    int32_t upperBoundHeightIndex = min(nbl::hlsl::upper_bound(dtmHeightsAccessor, 0, settings.heightColorEntryCount, heightForColor), settings.heightColorEntryCount - 1u);
+    int32_t lowerBoundHeightIndex = max(upperBoundHeightIndex - 1, 0);
 
     float upperBoundHeight = settings.heightColorMapHeights[upperBoundHeightIndex];
     float lowerBoundHeight = settings.heightColorMapHeights[lowerBoundHeightIndex];

From 161733fbc6817ee779f9ddf9b6b63eaa4500af2a Mon Sep 17 00:00:00 2001
From: kevyuu <kevin.kayu@gmail.com>
Date: Wed, 25 Jun 2025 18:33:18 +0700
Subject: [PATCH 427/529] Fix example 71

---
 .../app_resources/common.hlsl                 | 82 +---------------
 .../app_resources/raytrace.rchit.hlsl         | 97 +++++++++++++++++++
 71_RayTracingPipeline/include/common.hpp      | 27 +-----
 71_RayTracingPipeline/main.cpp                | 50 +++++++---
 4 files changed, 134 insertions(+), 122 deletions(-)

diff --git a/71_RayTracingPipeline/app_resources/common.hlsl b/71_RayTracingPipeline/app_resources/common.hlsl
index 18b67085a..8f7a06a33 100644
--- a/71_RayTracingPipeline/app_resources/common.hlsl
+++ b/71_RayTracingPipeline/app_resources/common.hlsl
@@ -92,6 +92,7 @@ struct STriangleGeomInfo
     MaterialPacked material;
     uint64_t vertexBufferAddress;
     uint64_t indexBufferAddress;
+    uint64_t normalBufferAddress;
 
     uint32_t vertexStride : 26;
     uint32_t objType: 3;
@@ -238,8 +239,6 @@ enum ObjectType : uint32_t  // matches c++
     OT_COUNT
 };
 
-static uint32_t s_offsetsToNormalBytes[OT_COUNT] = { 18, 24, 24, 20, 20, 24, 16, 12 };	// based on normals data position
-
 float32_t3 computeDiffuse(Material mat, float32_t3 light_dir, float32_t3 normal)
 {
 	float32_t dotNL = max(dot(normal, light_dir), 0.0);
@@ -271,85 +270,6 @@ float3 unpackNormals3x10(uint32_t v)
     return clamp(float3(pn) / 511.0, -1.0, 1.0);
 }
 
-float32_t3 fetchVertexNormal(int instID, int primID, STriangleGeomInfo geom, float2 bary)
-{
-    uint idxOffset = primID * 3;
-
-    const uint indexType = geom.indexType;
-    const uint vertexStride = geom.vertexStride;
-
-    const uint32_t objType = geom.objType;
-    const uint64_t indexBufferAddress = geom.indexBufferAddress;
-
-    uint i0, i1, i2;
-    switch (indexType)
-    {
-        case 0: // EIT_16BIT
-        {
-                i0 = uint32_t(vk::RawBufferLoad < uint16_t > (indexBufferAddress + (idxOffset + 0) * sizeof(uint16_t), 2u));
-                i1 = uint32_t(vk::RawBufferLoad < uint16_t > (indexBufferAddress + (idxOffset + 1) * sizeof(uint16_t), 2u));
-                i2 = uint32_t(vk::RawBufferLoad < uint16_t > (indexBufferAddress + (idxOffset + 2) * sizeof(uint16_t), 2u));
-            }
-            break;
-        case 1: // EIT_32BIT
-        {
-                i0 = vk::RawBufferLoad < uint32_t > (indexBufferAddress + (idxOffset + 0) * sizeof(uint32_t));
-                i1 = vk::RawBufferLoad < uint32_t > (indexBufferAddress + (idxOffset + 1) * sizeof(uint32_t));
-                i2 = vk::RawBufferLoad < uint32_t > (indexBufferAddress + (idxOffset + 2) * sizeof(uint32_t));
-            }
-            break;
-        default: // EIT_NONE
-        {
-                i0 = idxOffset;
-                i1 = idxOffset + 1;
-                i2 = idxOffset + 2;
-            }
-    }
-
-    const uint64_t normalVertexBufferAddress = geom.vertexBufferAddress + s_offsetsToNormalBytes[objType];
-    float3 n0, n1, n2;
-    switch (objType)
-    {
-        case OT_CUBE:
-        {
-                uint32_t v0 = vk::RawBufferLoad < uint32_t > (normalVertexBufferAddress + i0 * vertexStride, 2u);
-                uint32_t v1 = vk::RawBufferLoad < uint32_t > (normalVertexBufferAddress + i1 * vertexStride, 2u);
-                uint32_t v2 = vk::RawBufferLoad < uint32_t > (normalVertexBufferAddress + i2 * vertexStride, 2u);
-
-                n0 = normalize(nbl::hlsl::spirv::unpackSnorm4x8(v0).xyz);
-                n1 = normalize(nbl::hlsl::spirv::unpackSnorm4x8(v1).xyz);
-                n2 = normalize(nbl::hlsl::spirv::unpackSnorm4x8(v2).xyz);
-            }
-            break;
-        case OT_SPHERE:
-        case OT_CYLINDER:
-        case OT_ARROW:
-        case OT_CONE:
-        {
-                uint32_t v0 = vk::RawBufferLoad < uint32_t > (normalVertexBufferAddress + i0 * vertexStride);
-                uint32_t v1 = vk::RawBufferLoad < uint32_t > (normalVertexBufferAddress + i1 * vertexStride);
-                uint32_t v2 = vk::RawBufferLoad < uint32_t > (normalVertexBufferAddress + i2 * vertexStride);
-
-                n0 = normalize(unpackNormals3x10(v0));
-                n1 = normalize(unpackNormals3x10(v1));
-                n2 = normalize(unpackNormals3x10(v2));
-            }
-            break;
-        case OT_RECTANGLE:
-        case OT_DISK:
-        case OT_ICOSPHERE:
-        default:
-        {
-                n0 = vk::RawBufferLoad < float3 > (normalVertexBufferAddress + i0 * vertexStride);
-                n1 = vk::RawBufferLoad < float3 > (normalVertexBufferAddress + i1 * vertexStride);
-                n2 = vk::RawBufferLoad < float3 > (normalVertexBufferAddress + i2 * vertexStride);
-            }
-    }
-
-    float3 barycentrics = float3(0.0, bary);
-    barycentrics.x = 1.0 - barycentrics.y - barycentrics.z;
-    return normalize(barycentrics.x * n0 + barycentrics.y * n1 + barycentrics.z * n2);
-}
 #endif
 
 namespace nbl
diff --git a/71_RayTracingPipeline/app_resources/raytrace.rchit.hlsl b/71_RayTracingPipeline/app_resources/raytrace.rchit.hlsl
index cf68e52eb..0a2877ccf 100644
--- a/71_RayTracingPipeline/app_resources/raytrace.rchit.hlsl
+++ b/71_RayTracingPipeline/app_resources/raytrace.rchit.hlsl
@@ -2,6 +2,103 @@
 
 [[vk::push_constant]] SPushConstants pc;
 
+float32_t3 fetchVertexNormal(int instID, int primID, STriangleGeomInfo geom, float2 bary)
+{
+    uint idxOffset = primID * 3;
+    
+    const uint indexType = geom.indexType;
+    const uint vertexStride = geom.vertexStride;
+    
+    const uint32_t objType = geom.objType;
+    const uint64_t indexBufferAddress = geom.indexBufferAddress;
+    
+    uint i0, i1, i2;
+    switch (indexType)
+    {
+        case 0: // EIT_16BIT
+        {
+            i0 = uint32_t(vk::RawBufferLoad < uint16_t > (indexBufferAddress + (idxOffset + 0) * sizeof(uint16_t), 2u));
+            i1 = uint32_t(vk::RawBufferLoad < uint16_t > (indexBufferAddress + (idxOffset + 1) * sizeof(uint16_t), 2u));
+            i2 = uint32_t(vk::RawBufferLoad < uint16_t > (indexBufferAddress + (idxOffset + 2) * sizeof(uint16_t), 2u));
+        }
+        break;
+        case 1: // EIT_32BIT
+        {
+            i0 = vk::RawBufferLoad < uint32_t > (indexBufferAddress + (idxOffset + 0) * sizeof(uint32_t));
+            i1 = vk::RawBufferLoad < uint32_t > (indexBufferAddress + (idxOffset + 1) * sizeof(uint32_t));
+            i2 = vk::RawBufferLoad < uint32_t > (indexBufferAddress + (idxOffset + 2) * sizeof(uint32_t));
+        }
+        break;
+        default: // EIT_NONE
+        {
+            i0 = idxOffset;
+            i1 = idxOffset + 1;
+            i2 = idxOffset + 2;
+        }
+    }
+
+    const uint64_t normalVertexBufferAddress = geom.normalBufferAddress;
+    float3 n0, n1, n2;
+
+    // TODO(kevin): Currently this will work correctly both for cubes and rectangle, which are the only triangles geometry that is used in this example. Need to implement other geometry
+    uint32_t v0 = vk::RawBufferLoad < uint32_t > (normalVertexBufferAddress + i0 * 4);
+    uint32_t v1 = vk::RawBufferLoad < uint32_t > (normalVertexBufferAddress + i1 * 4);
+    uint32_t v2 = vk::RawBufferLoad < uint32_t > (normalVertexBufferAddress + i2 * 4);
+    
+
+    n0 = normalize(nbl::hlsl::spirv::unpackSnorm4x8(v0).xyz);
+    n1 = normalize(nbl::hlsl::spirv::unpackSnorm4x8(v1).xyz);
+    n2 = normalize(nbl::hlsl::spirv::unpackSnorm4x8(v2).xyz);
+
+    // switch (objType)
+    // {
+    //     case OT_CUBE:
+    //     {
+    //         // TODO(kevin): Don't hardcode the normal stride in hlsl
+    //         uint32_t v0 = vk::RawBufferLoad < uint32_t > (normalVertexBufferAddress + i0 * 4);
+    //         uint32_t v1 = vk::RawBufferLoad < uint32_t > (normalVertexBufferAddress + i1 * 4);
+    //         uint32_t v2 = vk::RawBufferLoad < uint32_t > (normalVertexBufferAddress + i2 * 4);
+    //
+    //         n0 = normalize(nbl::hlsl::spirv::unpackSnorm4x8(v0).xyz);
+    //         n1 = normalize(nbl::hlsl::spirv::unpackSnorm4x8(v1).xyz);
+    //         n2 = normalize(nbl::hlsl::spirv::unpackSnorm4x8(v2).xyz);
+    //     }
+    //     break;
+    //     case OT_SPHERE:
+    //     case OT_CYLINDER:
+    //     case OT_ARROW:
+    //     case OT_CONE:
+    //     {
+    //         // TODO(kevin): Fix this logic. Don't use vertex stride since nomral is separated from position
+    //         uint32_t v0 = vk::RawBufferLoad < uint32_t > (normalVertexBufferAddress + i0 * vertexStride);
+    //         uint32_t v1 = vk::RawBufferLoad < uint32_t > (normalVertexBufferAddress + i1 * vertexStride);
+    //         uint32_t v2 = vk::RawBufferLoad < uint32_t > (normalVertexBufferAddress + i2 * vertexStride);
+    //
+    //         n0 = normalize(unpackNormals3x10(v0));
+    //         n1 = normalize(unpackNormals3x10(v1));
+    //         n2 = normalize(unpackNormals3x10(v2));
+    //     }
+    //     break;
+    //     case OT_RECTANGLE:
+    //     case OT_DISK:
+    //     case OT_ICOSPHERE:
+    //     default:
+    //     {
+    //         // TODO(kevin): Don't hardcode the normal stride in hlsl
+    //         n0 = vk::RawBufferLoad < float3 > (normalVertexBufferAddress + i0 * 4);
+    //         n1 = vk::RawBufferLoad < float3 > (normalVertexBufferAddress + i1 * 4);
+    //         n2 = vk::RawBufferLoad < float3 > (normalVertexBufferAddress + i2 * 4);
+    //     }
+    // }
+
+    // n0 = float3(0, 1, 0);
+    // n1 = float3(0, 1, 0);
+    // n2 = float3(0, 1, 0);
+
+    float3 barycentrics = float3(0.0, bary);
+    barycentrics.x = 1.0 - barycentrics.y - barycentrics.z;
+    return normalize(barycentrics.x * n0 + barycentrics.y * n1 + barycentrics.z * n2);
+}
 
 [shader("closesthit")]
 void main(inout PrimaryPayload payload, in BuiltInTriangleIntersectionAttributes attribs)
diff --git a/71_RayTracingPipeline/include/common.hpp b/71_RayTracingPipeline/include/common.hpp
index 184d424c7..479b7fff6 100644
--- a/71_RayTracingPipeline/include/common.hpp
+++ b/71_RayTracingPipeline/include/common.hpp
@@ -45,40 +45,15 @@ struct ObjectMeta
 	std::string_view name = "Unknown";
 };
 
-struct ObjectDrawHookCpu
-{
-	nbl::core::matrix3x4SIMD model;
-	ObjectMeta meta;
-};
-
 struct ReferenceObjectCpu
 {
 	ObjectMeta meta;
 	core::smart_refctd_ptr<ICPUPolygonGeometry> data;
 	Material material;
   core::matrix3x4SIMD transform;
-};
 
-struct ReferenceObjectGpu
-{
-	struct Bindings
-	{
-		nbl::asset::SBufferBinding<IGPUBuffer> vertex, index;
-	};
-
-	ObjectMeta meta;
-	Bindings bindings;
-	uint32_t vertexStride;
-	nbl::asset::E_INDEX_TYPE indexType = nbl::asset::E_INDEX_TYPE::EIT_UNKNOWN;
-	uint32_t indexCount = {};
-	MaterialPacked material;
-  core::matrix3x4SIMD transform;
-
-	const bool useIndex() const
-	{
-		return bindings.index.buffer && (indexType != E_INDEX_TYPE::EIT_UNKNOWN);
-	}
 };
+
 }
 
 #endif // __NBL_THIS_EXAMPLE_COMMON_H_INCLUDED__
diff --git a/71_RayTracingPipeline/main.cpp b/71_RayTracingPipeline/main.cpp
index 382e5cccb..c47eea1c4 100644
--- a/71_RayTracingPipeline/main.cpp
+++ b/71_RayTracingPipeline/main.cpp
@@ -6,11 +6,13 @@
 #include "nbl/ext/FullScreenTriangle/FullScreenTriangle.h"
 #include "nbl/builtin/hlsl/indirect_commands.hlsl"
 
+#include "nbl/examples/common/BuiltinResourcesApplication.hpp"
 
-class RaytracingPipelineApp final : public SimpleWindowedApplication, public application_templates::MonoAssetManagerAndBuiltinResourceApplication
+
+class RaytracingPipelineApp final : public SimpleWindowedApplication, public BuiltinResourcesApplication
 {
 	using device_base_t = SimpleWindowedApplication;
-	using asset_base_t = application_templates::MonoAssetManagerAndBuiltinResourceApplication;
+	using asset_base_t = BuiltinResourcesApplication;
 	using clock_t = std::chrono::steady_clock;
 
 	constexpr static inline uint32_t WIN_W = 1280, WIN_H = 720;
@@ -1220,7 +1222,7 @@ class RaytracingPipelineApp final : public SimpleWindowedApplication, public app
 			}
 			else
 			{
-				auto triangles = make_refctd_dynamic_array<smart_refctd_dynamic_array<ICPUBottomLevelAccelerationStructure::Triangles<ICPUBuffer>>>(cpuObjects[i].data->exportForBLAS());
+				auto triangles = make_refctd_dynamic_array<smart_refctd_dynamic_array<ICPUBottomLevelAccelerationStructure::Triangles<ICPUBuffer>>>(1u);
 				auto primitiveCounts = make_refctd_dynamic_array<smart_refctd_dynamic_array<uint32_t>>(1u);
 
 				auto& tri = triangles->front();
@@ -1228,6 +1230,7 @@ class RaytracingPipelineApp final : public SimpleWindowedApplication, public app
 				auto& primCount = primitiveCounts->front();
 				primCount = cpuObjects[i].data->getPrimitiveCount();
 
+				tri = cpuObjects[i].data->exportForBLAS();
 				tri.geometryFlags = cpuObjects[i].material.isTransparent() ?
 					IGPUBottomLevelAccelerationStructure::GEOMETRY_FLAGS::NO_DUPLICATE_ANY_HIT_INVOCATION_BIT :
 					IGPUBottomLevelAccelerationStructure::GEOMETRY_FLAGS::OPAQUE_BIT;
@@ -1257,7 +1260,7 @@ class RaytracingPipelineApp final : public SimpleWindowedApplication, public app
 				inst.base.blas = cpuBlasList[i];
 				inst.base.flags = static_cast<uint32_t>(IGPUTopLevelAccelerationStructure::INSTANCE_FLAGS::TRIANGLE_FACING_CULL_DISABLE_BIT);
 				inst.base.instanceCustomIndex = i;
-				inst.base.instanceShaderBindingTableRecordOffset = isProceduralInstance ? 2 : 0;;
+				inst.base.instanceShaderBindingTableRecordOffset = isProceduralInstance ? 2 : 0;
 				inst.base.mask = 0xFF;
 				inst.transform = isProceduralInstance ? matrix3x4SIMD() : cpuObjects[i].transform;
 
@@ -1305,19 +1308,22 @@ class RaytracingPipelineApp final : public SimpleWindowedApplication, public app
 		inputs.allocator = &myalloc;
 
 		std::array<ICPUTopLevelAccelerationStructure*, 1u> tmpTlas;
-		std::array<ICPUPolygonGeometry*, std::size(cpuObjects)> tmpGeometries;
 		std::array<ICPUBuffer*, 1> tmpBuffers;
+		std::array<ICPUPolygonGeometry*, std::size(cpuObjects)> tmpGeometries;
+		std::array<CAssetConverter::patch_t<asset::ICPUPolygonGeometry>, std::size(cpuObjects)> tmpGeometryPatches;
 		{
 			tmpTlas[0] = cpuTlas.get();
 			tmpBuffers[0] = cpuProcBuffer.get();
 			for (uint32_t i = 0; i < cpuObjects.size(); i++)
 			{
 				tmpGeometries[i] = cpuObjects[i].data.get();
+				tmpGeometryPatches[i].indexBufferUsages= IGPUBuffer::E_USAGE_FLAGS::EUF_SHADER_DEVICE_ADDRESS_BIT;
 			}
 
 			std::get<CAssetConverter::SInputs::asset_span_t<ICPUTopLevelAccelerationStructure>>(inputs.assets) = tmpTlas;
 			std::get<CAssetConverter::SInputs::asset_span_t<ICPUBuffer>>(inputs.assets) = tmpBuffers;
 			std::get<CAssetConverter::SInputs::asset_span_t<ICPUPolygonGeometry>>(inputs.assets) = tmpGeometries;
+			std::get<CAssetConverter::SInputs::patch_span_t<ICPUPolygonGeometry>>(inputs.patches) = tmpGeometryPatches;
 		}
 
 		auto reservation = converter->reserve(inputs);
@@ -1346,6 +1352,7 @@ class RaytracingPipelineApp final : public SimpleWindowedApplication, public app
 
 			prepass.template operator() < ICPUTopLevelAccelerationStructure > (tmpTlas);
 			prepass.template operator() < ICPUBuffer > (tmpBuffers);
+			prepass.template operator() < ICPUPolygonGeometry > (tmpGeometries);
 		}
 
 		constexpr auto CompBufferCount = 2;
@@ -1425,25 +1432,37 @@ class RaytracingPipelineApp final : public SimpleWindowedApplication, public app
 			auto&& tlases = reservation.getGPUObjects<ICPUTopLevelAccelerationStructure>();
 			m_gpuTlas = tlases[0].value;
 			auto&& buffers = reservation.getGPUObjects<ICPUBuffer>();
+			m_proceduralAabbBuffer = buffers[0].value;
 
-			m_proceduralAabbBuffer = buffers[2 * proceduralBlasIdx].value;
+			auto&& gpuPolygonGeometries = reservation.getGPUObjects<ICPUPolygonGeometry>();
+			m_gpuPolygons.resize(gpuPolygonGeometries.size());
 
-			for (uint32_t i = 0; i < cpuObjects.size(); i++)
+			for (uint32_t i = 0; i < gpuPolygonGeometries.size(); i++)
 			{
 				const auto& cpuObject = cpuObjects[i];
-				const auto& cpuBlas = cpuBlasList[i];
-				const auto& geometry = cpuBlas->getTriangleGeometries()[0];
-				const uint64_t vertexBufferAddress = buffers[2 * i].value->getDeviceAddress();
-				const uint64_t indexBufferAddress = buffers[(2 * i) + 1].value->getDeviceAddress();
-				geomInfos[i] = {
+				const auto& gpuPolygon = gpuPolygonGeometries[i].value;
+				const auto gpuTriangles = gpuPolygon->exportForBLAS();
+
+				const auto& vertexBufferBinding = gpuTriangles.vertexData[0];
+				const uint64_t vertexBufferAddress = vertexBufferBinding.buffer->getDeviceAddress() + vertexBufferBinding.offset;
+
+				const auto& normalView = gpuPolygon->getNormalView();
+				const uint64_t normalBufferAddress = normalView ? normalView.src.buffer->getDeviceAddress() + normalView.src.offset : 0;
+
+				const auto& indexBufferBinding = gpuTriangles.indexData;
+				auto& geomInfo = geomInfos[i];
+				geomInfo = {
 				  .material = hlsl::_static_cast<MaterialPacked>(cpuObject.material),
 				  .vertexBufferAddress = vertexBufferAddress,
-				  .indexBufferAddress = geometry.indexData.buffer ? indexBufferAddress : vertexBufferAddress,
-				  .vertexStride = geometry.vertexStride,
+				  .indexBufferAddress = indexBufferBinding.buffer ? indexBufferBinding.buffer->getDeviceAddress() + indexBufferBinding.offset : vertexBufferAddress,
+					.normalBufferAddress = normalBufferAddress,
+				  .vertexStride = gpuTriangles.vertexStride,
 				  .objType = cpuObject.meta.type,
-				  .indexType = geometry.indexType,
+				  .indexType = gpuTriangles.indexType,
 				  .smoothNormals = scene::s_smoothNormals[cpuObject.meta.type],
 				};
+
+				m_gpuPolygons[i] = gpuPolygon;
 			}
 		}
 
@@ -1508,6 +1527,7 @@ class RaytracingPipelineApp final : public SimpleWindowedApplication, public app
 	core::vector<SProceduralGeomInfo> m_gpuIntersectionSpheres;
 	uint32_t m_intersectionHitGroupIdx;
 
+	core::vector<smart_refctd_ptr<IGPUPolygonGeometry>> m_gpuPolygons;
 	smart_refctd_ptr<IGPUTopLevelAccelerationStructure> m_gpuTlas;
 	smart_refctd_ptr<IGPUBuffer> m_instanceBuffer;
 

From 86cc7dda5858eb5dd83ee2fdaf9ae4ade485d7c7 Mon Sep 17 00:00:00 2001
From: devsh <devsh@devsh.eu>
Date: Wed, 25 Jun 2025 14:10:16 +0200
Subject: [PATCH 428/529] decouple `CSimpleDebugRenderer` from
 `CGeometryCreatorScene`

---
 12_MeshLoaders/main.cpp                       | 31 +++++++---
 .../geometry/CGeometryCreatorScene.hpp        | 32 +++++-----
 .../geometry/CSimpleDebugRenderer.hpp         | 61 +++++++++----------
 3 files changed, 70 insertions(+), 54 deletions(-)

diff --git a/12_MeshLoaders/main.cpp b/12_MeshLoaders/main.cpp
index 13868fa8c..0a4e20141 100644
--- a/12_MeshLoaders/main.cpp
+++ b/12_MeshLoaders/main.cpp
@@ -37,6 +37,7 @@ class MeshLoadersApp final : public MonoWindowApplication, public BuiltinResourc
 			}
 			
 			//! cache results -- speeds up mesh generation on second run
+			m_qnc = make_smart_refctd_ptr<CQuantNormalCache>();
 			m_qnc->loadCacheFromFile<EF_R8G8B8_SNORM>(m_system.get(),sharedOutputCWD/"../../tmp/normalCache888.sse");
 
 			//
@@ -224,17 +225,29 @@ class MeshLoadersApp final : public MonoWindowApplication, public BuiltinResourc
 		}
 
 	private:
+		// TODO: standardise this across examples, and take from `argv`
+		bool m_nonInteractiveTest = true;
+
 		inline bool reloadModel()
 		{
-			pfd::open_file file("Choose a supported Model File", "../../media", { "All Supported Formats", "*.ply *.stl *.serialized *.obj",
-				"TODO (.ply)", "*.ply",
-				"TODO (.stl)", "*.stl",
-				"Mitsuba 0.6 Serialized (.serialized)", "*.serialized",
-				"Wavefront Object (.obj)", "*.obj" 
-			});
-			if (file.result().empty())
-				return false;
-			m_modelPath = file.result()[0];
+			if (m_nonInteractiveTest) // TODO: maybe also take from argv and argc
+				m_modelPath = (sharedInputCWD/"ply/Spanner-ply.ply").string();
+			else
+			{
+				pfd::open_file file("Choose a supported Model File", sharedInputCWD.string(),
+					{
+						"All Supported Formats", "*.ply *.stl *.serialized *.obj",
+						"TODO (.ply)", "*.ply",
+						"TODO (.stl)", "*.stl",
+						"Mitsuba 0.6 Serialized (.serialized)", "*.serialized",
+						"Wavefront Object (.obj)", "*.obj"
+					},
+					false
+				);
+				if (file.result().empty())
+					return false;
+				m_modelPath = file.result()[0];
+			}
 
 			// free up
 			m_assetMgr->clearAllAssetCache();
diff --git a/common/include/nbl/examples/geometry/CGeometryCreatorScene.hpp b/common/include/nbl/examples/geometry/CGeometryCreatorScene.hpp
index 63b3d7a8d..2798cfed7 100644
--- a/common/include/nbl/examples/geometry/CGeometryCreatorScene.hpp
+++ b/common/include/nbl/examples/geometry/CGeometryCreatorScene.hpp
@@ -42,13 +42,13 @@ class CGeometryCreatorScene : public core::IReferenceCounted
 			}
 
 
-			core::vector<SNamedGeometry> namedGeometries;
+			SInitParams init = {};
 			core::vector<smart_refctd_ptr<const ICPUPolygonGeometry>> geometries;
 			// create out geometries
 			{
-				auto addGeometry = [&namedGeometries,&geometries](const std::string_view name, smart_refctd_ptr<const ICPUPolygonGeometry>&& geom)->void
+				auto addGeometry = [&init,&geometries](const std::string_view name, smart_refctd_ptr<const ICPUPolygonGeometry>&& geom)->void
 				{
-					namedGeometries.emplace_back().name = name;
+					init.geometryNames.emplace_back(name);
 					geometries.push_back(std::move(geom));
 				};
 
@@ -67,6 +67,7 @@ class CGeometryCreatorScene : public core::IReferenceCounted
 				addGeometry("Rectangle",creator->createRectangle({1.5f,3.f}));
 				addGeometry("Disk",creator->createDisk(2.f,30));
 			}
+			init.geometries.reserve(init.geometryNames.size());
 
 			// convert the geometries
 			{
@@ -148,34 +149,37 @@ class CGeometryCreatorScene : public core::IReferenceCounted
 				// assign outputs
 				{
 					auto inIt = reservation.getGPUObjects<ICPUPolygonGeometry>().data();
-					for (auto outIt=namedGeometries.begin(); outIt!=namedGeometries.end(); inIt++)
+					for (auto outIt=init.geometryNames.begin(); outIt!=init.geometryNames.end(); inIt++)
 					{
 						if (inIt->value)
-							(outIt++)->geom = inIt->value;
+						{
+							init.geometries.push_back(inIt->value);
+							outIt++;
+						}
 						else
 						{
-							logger->log("Failed to convert ICPUPolygonGeometry %s to GPU!",ILogger::ELL_ERROR,outIt->name.data());
-							outIt = namedGeometries.erase(outIt);
+							logger->log("Failed to convert ICPUPolygonGeometry %s to GPU!",ILogger::ELL_ERROR,outIt->c_str());
+							outIt = init.geometryNames.erase(outIt);
 						}
 					}
 				}
 			}
 
-			return smart_refctd_ptr<CGeometryCreatorScene>(new CGeometryCreatorScene(std::move(namedGeometries)),dont_grab);
+			return smart_refctd_ptr<CGeometryCreatorScene>(new CGeometryCreatorScene(std::move(init)),dont_grab);
 		}
 
 		//
-		struct SNamedGeometry
+		struct SInitParams
 		{
-			std::string name = {};
-			core::smart_refctd_ptr<video::IGPUPolygonGeometry> geom;
+			core::vector<core::smart_refctd_ptr<const video::IGPUPolygonGeometry>> geometries;
+			core::vector<std::string> geometryNames;
 		};
-		std::span<const SNamedGeometry> getGeometries() const {return m_geometries;}
+		const SInitParams& getInitParams() const {return m_init;}
 
 	protected:
-		inline CGeometryCreatorScene(core::vector<SNamedGeometry>&& _geometries) : m_geometries(std::move(_geometries)) {}
+		inline CGeometryCreatorScene(SInitParams&& _init) : m_init(std::move(_init)) {}
 
-		core::vector<SNamedGeometry> m_geometries;
+		SInitParams m_init;
 #undef EXPOSE_NABLA_NAMESPACES
 };
 
diff --git a/common/include/nbl/examples/geometry/CSimpleDebugRenderer.hpp b/common/include/nbl/examples/geometry/CSimpleDebugRenderer.hpp
index 474f1d350..325ae8eb7 100644
--- a/common/include/nbl/examples/geometry/CSimpleDebugRenderer.hpp
+++ b/common/include/nbl/examples/geometry/CSimpleDebugRenderer.hpp
@@ -79,7 +79,7 @@ class CSimpleDebugRenderer final : public core::IReferenceCounted
 		};
 
 		//
-		static inline core::smart_refctd_ptr<CSimpleDebugRenderer> create(asset::IAssetManager* assMan, video::IGPURenderpass* renderpass, const uint32_t subpassIX, const CGeometryCreatorScene* scene)
+		static inline core::smart_refctd_ptr<CSimpleDebugRenderer> create(asset::IAssetManager* assMan, video::IGPURenderpass* renderpass, const uint32_t subpassIX, const std::span<const video::IGPUPolygonGeometry* const> geometries)
 		{
 			EXPOSE_NABLA_NAMESPACES;
 
@@ -88,10 +88,7 @@ class CSimpleDebugRenderer final : public core::IReferenceCounted
 			auto device = const_cast<ILogicalDevice*>(renderpass->getOriginDevice());
 			auto logger = device->getLogger();
 
-			if (!assMan || !scene)
-				return nullptr;
-			const auto namedGeoms = scene->getGeometries();
-			if (namedGeoms.empty())
+			if (!assMan || geometries.empty())
 				return nullptr;
 
 			// load shader
@@ -154,33 +151,26 @@ class CSimpleDebugRenderer final : public core::IReferenceCounted
 			init.layout = device->createPipelineLayout(ranges,smart_refctd_ptr<const IGPUDescriptorSetLayout>(init.ds->getLayout()));
 
 			// create pipelines
-			enum PipelineType : uint8_t
-			{
-				BasicTriangleList,
-				BasicTriangleFan,
-				Cone,
-				Count
-			};
-			smart_refctd_ptr<IGPUGraphicsPipeline> pipelines[PipelineType::Count] = {};
+			using pipeline_e = SInitParams::PipelineType;
 			{
-				IGPUGraphicsPipeline::SCreationParams params[PipelineType::Count] = {};
-				params[PipelineType::BasicTriangleList].vertexShader = {.shader=shader.get(),.entryPoint="BasicVS"};
-				params[PipelineType::BasicTriangleList].fragmentShader = {.shader=shader.get(),.entryPoint="BasicFS"};
-				params[PipelineType::BasicTriangleFan].vertexShader = {.shader=shader.get(),.entryPoint="BasicVS"};
-				params[PipelineType::BasicTriangleFan].fragmentShader = {.shader=shader.get(),.entryPoint="BasicFS"};
-				params[PipelineType::Cone].vertexShader = {.shader=shader.get(),.entryPoint="ConeVS"};
-				params[PipelineType::Cone].fragmentShader = {.shader=shader.get(),.entryPoint="ConeFS"};
-				for (auto i=0; i< PipelineType::Count; i++)
+				IGPUGraphicsPipeline::SCreationParams params[pipeline_e::Count] = {};
+				params[pipeline_e::BasicTriangleList].vertexShader = {.shader=shader.get(),.entryPoint="BasicVS"};
+				params[pipeline_e::BasicTriangleList].fragmentShader = {.shader=shader.get(),.entryPoint="BasicFS"};
+				params[pipeline_e::BasicTriangleFan].vertexShader = {.shader=shader.get(),.entryPoint="BasicVS"};
+				params[pipeline_e::BasicTriangleFan].fragmentShader = {.shader=shader.get(),.entryPoint="BasicFS"};
+				params[pipeline_e::Cone].vertexShader = {.shader=shader.get(),.entryPoint="ConeVS"};
+				params[pipeline_e::Cone].fragmentShader = {.shader=shader.get(),.entryPoint="ConeFS"};
+				for (auto i=0; i<pipeline_e::Count; i++)
 				{
 					params[i].layout = init.layout.get();
 					// no vertex input
 					auto& primitiveAssembly = params[i].cached.primitiveAssembly;
 					auto& rasterization = params[i].cached.rasterization;
 					auto& blend = params[i].cached.blend;
-					const auto type = static_cast<PipelineType>(i);
+					const auto type = static_cast<pipeline_e>(i);
 					switch (type)
 					{
-						case PipelineType::BasicTriangleFan:
+						case pipeline_e::BasicTriangleFan:
 							primitiveAssembly.primitiveType = E_PRIMITIVE_TOPOLOGY::EPT_TRIANGLE_FAN;
 							break;
 						default:
@@ -193,7 +183,7 @@ class CSimpleDebugRenderer final : public core::IReferenceCounted
 					params[i].cached.subpassIx = subpassIX;
 					params[i].renderpass = renderpass;
 				}
-				if (!device->createGraphicsPipelines(nullptr,params,pipelines))
+				if (!device->createGraphicsPipelines(nullptr,params,init.pipelines))
 				{
 					logger->log("Could not create Graphics Pipelines!",ILogger::ELL_ERROR);
 					return nullptr;
@@ -212,9 +202,8 @@ class CSimpleDebugRenderer final : public core::IReferenceCounted
 					return retval;
 				};
 
-				for (const auto& entry : namedGeoms)
+				for (const auto geom : geometries)
 				{
-					const auto* geom = entry.geom.get();
 					// could also check device origin on all buffers
 					if (!geom->valid())
 						continue;
@@ -222,15 +211,12 @@ class CSimpleDebugRenderer final : public core::IReferenceCounted
 					switch (geom->getIndexingCallback()->knownTopology())
 					{
 						case E_PRIMITIVE_TOPOLOGY::EPT_TRIANGLE_FAN:
-							out.pipeline = pipelines[PipelineType::BasicTriangleFan];
+							out.pipeline = init.pipelines[pipeline_e::BasicTriangleFan];
 							break;
 						default:
-							out.pipeline = pipelines[PipelineType::BasicTriangleList];
+							out.pipeline = init.pipelines[pipeline_e::BasicTriangleList];
 							break;
 					}
-					// special case
-					if (entry.name=="Cone")
-						out.pipeline = pipelines[PipelineType::Cone];
 					if (const auto& view=geom->getIndexView(); view)
 					{
 						out.indexBuffer.offset = view.src.offset;
@@ -275,12 +261,25 @@ class CSimpleDebugRenderer final : public core::IReferenceCounted
 		//
 		struct SInitParams
 		{
+			enum PipelineType : uint8_t
+			{
+				BasicTriangleList,
+				BasicTriangleFan,
+				Cone, // special case
+				Count
+			};
+
 			core::smart_refctd_ptr<video::IGPUDescriptorSet> ds;
 			core::smart_refctd_ptr<video::IGPUPipelineLayout> layout;
+			core::smart_refctd_ptr<video::IGPUGraphicsPipeline> pipelines[PipelineType::Count];
 			core::vector<SPackedGeometry> geoms;
 		};
 		inline const SInitParams& getInitParams() const {return m_params;}
 
+		//
+		inline auto& getGeometry(const uint32_t ix) {return m_params.geoms[ix];}
+		inline const auto& getGeometry(const uint32_t ix) const {return m_params.geoms[ix];}
+
 		//
 		inline void render(video::IGPUCommandBuffer* cmdbuf, const SViewParams& viewParams) const
 		{

From 3cf8194aaab80ee715f9e312a7020e2259869a24 Mon Sep 17 00:00:00 2001
From: Erfan Ahmadi <ahmadierfan99@gmail.com>
Date: Wed, 25 Jun 2025 16:12:10 +0400
Subject: [PATCH 429/529] Auto-Submit for Triangle DTMS

---
 62_CAD/DrawResourcesFiller.cpp | 54 +++++++++++++++++++++-------------
 1 file changed, 33 insertions(+), 21 deletions(-)

diff --git a/62_CAD/DrawResourcesFiller.cpp b/62_CAD/DrawResourcesFiller.cpp
index 8a3c0dff0..012cd6da6 100644
--- a/62_CAD/DrawResourcesFiller.cpp
+++ b/62_CAD/DrawResourcesFiller.cpp
@@ -33,7 +33,7 @@ bool DrawResourcesFiller::allocateDrawResources(ILogicalDevice* logicalDevice, s
 	const size_t totalResourcesSize = adjustedImagesMemorySize + adjustedBuffersMemorySize;
 
 	IGPUBuffer::SCreationParams resourcesBufferCreationParams = {};
-	resourcesBufferCreationParams.size = adjustedBuffersMemorySize;
+	resourcesBufferCreationParams.size = 1300;
 	resourcesBufferCreationParams.usage = bitflag(IGPUBuffer::EUF_SHADER_DEVICE_ADDRESS_BIT) | IGPUBuffer::EUF_TRANSFER_DST_BIT | IGPUBuffer::EUF_INDEX_BUFFER_BIT;
 	resourcesGPUBuffer = logicalDevice->createBuffer(std::move(resourcesBufferCreationParams));
 	resourcesGPUBuffer->setObjectDebugName("drawResourcesBuffer");
@@ -289,9 +289,6 @@ void DrawResourcesFiller::drawTriangleMesh(
 	setActiveDTMSettings(dtmSettingsInfo);
 	beginMainObject(MainObjectType::DTM);
 
-	DrawCallData drawCallData = {}; 
-	drawCallData.isDTMRendering = true;
-
 	uint32_t mainObjectIdx = acquireActiveMainObjectIndex_SubmitIfNeeded(intendedNextSubmit);
 	if (mainObjectIdx == InvalidMainObjectIdx)
 	{
@@ -299,41 +296,56 @@ void DrawResourcesFiller::drawTriangleMesh(
 		assert(false);
 		return;
 	}
-	drawCallData.dtm.triangleMeshMainObjectIndex = mainObjectIdx;
 
-	ICPUBuffer::SCreationParams geometryBuffParams;
-	
-	// concatenate the index and vertex buffer into the geometry buffer
-	const size_t indexBuffByteSize = mesh.getIndexBuffByteSize();
-	const size_t vtxBuffByteSize = mesh.getVertexBuffByteSize();
-	const size_t dataToAddByteSize = vtxBuffByteSize + indexBuffByteSize;
+	DrawCallData drawCallData = {}; 
+	drawCallData.isDTMRendering = true;
 
-	const size_t remainingResourcesSize = calculateRemainingResourcesSize();
+	ICPUBuffer::SCreationParams geometryBuffParams;
 
-	// TODO: assert of geometry buffer size, do i need to check if size of objects to be added <= remainingResourcesSize?
-	// TODO: auto submit instead of assert
-	assert(dataToAddByteSize <= remainingResourcesSize);
+	// concatenate the index and vertex buffer into the geometry buffer
+	const auto& indexBuffer = mesh.getIndices();
+	const auto& vertexBuffer = mesh.getVertices();
+	assert(indexBuffer.size() == vertexBuffer.size()); // We don't have any vertex re-use due to other limitations at the moemnt.
+	
 
+	const uint32_t numTriangles = indexBuffer.size() / 3u;
+	uint32_t trianglesUploaded = 0;
+	while (trianglesUploaded < numTriangles)
 	{
-		// NOTE[ERFAN]: these push contants will be removed, everything will be accessed by dtmSettings, including where the vertex buffer data resides
+		const size_t remainingResourcesSize = calculateRemainingResourcesSize();
+		const uint32_t maxUploadableVertices = remainingResourcesSize / (sizeof(CTriangleMesh::vertex_t) + sizeof(CTriangleMesh::index_t));
+		const uint32_t maxUploadableTriangles = maxUploadableVertices / 3u;
+		const uint32_t remainingTrianglesToUpload = numTriangles - trianglesUploaded;
+		const uint32_t trianglesToUpload = core::min(remainingTrianglesToUpload, maxUploadableTriangles);
+		const size_t vtxBuffByteSize = trianglesToUpload * 3u * sizeof(CTriangleMesh::vertex_t);
+		const size_t indexBuffByteSize = trianglesToUpload * 3u * sizeof(CTriangleMesh::index_t);
+		const size_t trianglesToUploadByteSize = vtxBuffByteSize + indexBuffByteSize;
 
 		// Copy VertexBuffer
-		size_t geometryBufferOffset = resourcesCollection.geometryInfo.increaseSizeAndGetOffset(dataToAddByteSize, alignof(CTriangleMesh::vertex_t));
+		size_t geometryBufferOffset = resourcesCollection.geometryInfo.increaseSizeAndGetOffset(trianglesToUploadByteSize, alignof(CTriangleMesh::vertex_t));
 		void* dst = resourcesCollection.geometryInfo.data() + geometryBufferOffset;
 		// the actual bda address will be determined only after all copies are finalized, later we will do += `baseBDAAddress + geometryInfo.bufferOffset`
 		drawCallData.dtm.triangleMeshVerticesBaseAddress = geometryBufferOffset;
-		memcpy(dst, mesh.getVertices().data(), vtxBuffByteSize);
+		memcpy(dst, &vertexBuffer[trianglesUploaded * 3u], vtxBuffByteSize);
 		geometryBufferOffset += vtxBuffByteSize; 
 
 		// Copy IndexBuffer
 		dst = resourcesCollection.geometryInfo.data() + geometryBufferOffset;
 		drawCallData.dtm.indexBufferOffset = geometryBufferOffset;
-		memcpy(dst, mesh.getIndices().data(), indexBuffByteSize);
+		memcpy(dst, &indexBuffer[trianglesUploaded * 3u], indexBuffByteSize);
 		geometryBufferOffset += indexBuffByteSize;
+		
+		trianglesUploaded += trianglesToUpload;
+		
+		drawCallData.dtm.triangleMeshMainObjectIndex = mainObjectIdx;
+		drawCallData.dtm.indexCount = trianglesToUpload * 3u;
+		drawCalls.push_back(drawCallData);
+
+		// Requires Auto-Submit If All Triangles of the Mesh couldn't fit into Memory
+		if (trianglesUploaded < numTriangles)
+			submitCurrentDrawObjectsAndReset(intendedNextSubmit, mainObjectIdx);
 	}
 
-	drawCallData.dtm.indexCount = mesh.getIndexCount();
-	drawCalls.push_back(drawCallData);
 	endMainObject();
 }
 

From 8a0043b33ee1b0ffc35773e05de304b71d8a35b0 Mon Sep 17 00:00:00 2001
From: Erfan Ahmadi <ahmadierfan99@gmail.com>
Date: Wed, 25 Jun 2025 16:34:07 +0400
Subject: [PATCH 430/529] small fix

---
 62_CAD/DrawResourcesFiller.cpp | 9 ++++++++-
 1 file changed, 8 insertions(+), 1 deletion(-)

diff --git a/62_CAD/DrawResourcesFiller.cpp b/62_CAD/DrawResourcesFiller.cpp
index 012cd6da6..4762e5915 100644
--- a/62_CAD/DrawResourcesFiller.cpp
+++ b/62_CAD/DrawResourcesFiller.cpp
@@ -33,7 +33,7 @@ bool DrawResourcesFiller::allocateDrawResources(ILogicalDevice* logicalDevice, s
 	const size_t totalResourcesSize = adjustedImagesMemorySize + adjustedBuffersMemorySize;
 
 	IGPUBuffer::SCreationParams resourcesBufferCreationParams = {};
-	resourcesBufferCreationParams.size = 1300;
+	resourcesBufferCreationParams.size = adjustedBuffersMemorySize;
 	resourcesBufferCreationParams.usage = bitflag(IGPUBuffer::EUF_SHADER_DEVICE_ADDRESS_BIT) | IGPUBuffer::EUF_TRANSFER_DST_BIT | IGPUBuffer::EUF_INDEX_BUFFER_BIT;
 	resourcesGPUBuffer = logicalDevice->createBuffer(std::move(resourcesBufferCreationParams));
 	resourcesGPUBuffer->setObjectDebugName("drawResourcesBuffer");
@@ -341,6 +341,13 @@ void DrawResourcesFiller::drawTriangleMesh(
 		drawCallData.dtm.indexCount = trianglesToUpload * 3u;
 		drawCalls.push_back(drawCallData);
 
+		//if (trianglesUploaded == 0u)
+		//{
+		//	m_logger.log("drawTriangleMesh: not enough vram allocation for a single triangle!", nbl::system::ILogger::ELL_ERROR);
+		//	assert(false);
+		//	break;
+		//}
+
 		// Requires Auto-Submit If All Triangles of the Mesh couldn't fit into Memory
 		if (trianglesUploaded < numTriangles)
 			submitCurrentDrawObjectsAndReset(intendedNextSubmit, mainObjectIdx);

From e4c3f8b5c9167b5b16431113a61a1dcc0d4129d0 Mon Sep 17 00:00:00 2001
From: Erfan Ahmadi <ahmadierfan99@gmail.com>
Date: Wed, 25 Jun 2025 17:01:30 +0400
Subject: [PATCH 431/529] Fix reindexing in auto submit triangle dtms

---
 62_CAD/DrawResourcesFiller.cpp | 5 +++--
 1 file changed, 3 insertions(+), 2 deletions(-)

diff --git a/62_CAD/DrawResourcesFiller.cpp b/62_CAD/DrawResourcesFiller.cpp
index 4762e5915..425684a5f 100644
--- a/62_CAD/DrawResourcesFiller.cpp
+++ b/62_CAD/DrawResourcesFiller.cpp
@@ -33,7 +33,7 @@ bool DrawResourcesFiller::allocateDrawResources(ILogicalDevice* logicalDevice, s
 	const size_t totalResourcesSize = adjustedImagesMemorySize + adjustedBuffersMemorySize;
 
 	IGPUBuffer::SCreationParams resourcesBufferCreationParams = {};
-	resourcesBufferCreationParams.size = adjustedBuffersMemorySize;
+	resourcesBufferCreationParams.size = 870;
 	resourcesBufferCreationParams.usage = bitflag(IGPUBuffer::EUF_SHADER_DEVICE_ADDRESS_BIT) | IGPUBuffer::EUF_TRANSFER_DST_BIT | IGPUBuffer::EUF_INDEX_BUFFER_BIT;
 	resourcesGPUBuffer = logicalDevice->createBuffer(std::move(resourcesBufferCreationParams));
 	resourcesGPUBuffer->setObjectDebugName("drawResourcesBuffer");
@@ -325,7 +325,8 @@ void DrawResourcesFiller::drawTriangleMesh(
 		size_t geometryBufferOffset = resourcesCollection.geometryInfo.increaseSizeAndGetOffset(trianglesToUploadByteSize, alignof(CTriangleMesh::vertex_t));
 		void* dst = resourcesCollection.geometryInfo.data() + geometryBufferOffset;
 		// the actual bda address will be determined only after all copies are finalized, later we will do += `baseBDAAddress + geometryInfo.bufferOffset`
-		drawCallData.dtm.triangleMeshVerticesBaseAddress = geometryBufferOffset;
+		// the - is a small hack because index buffer grows but vertex buffer needs to start from 0, remove that once we either get rid of the index buffer or implement an algorithm that can have vertex reuse
+		drawCallData.dtm.triangleMeshVerticesBaseAddress = geometryBufferOffset - (sizeof(CTriangleMesh::vertex_t) * trianglesUploaded * 3); 
 		memcpy(dst, &vertexBuffer[trianglesUploaded * 3u], vtxBuffByteSize);
 		geometryBufferOffset += vtxBuffByteSize; 
 

From cbcc1c90a399820939ea0dc080bea52734f39a8a Mon Sep 17 00:00:00 2001
From: Arkadiusz Lachowicz <areklachowicz@gmail.com>
Date: Wed, 25 Jun 2025 15:26:34 +0200
Subject: [PATCH 432/529] create INTERFACE_TO_BUILTINS, add
 NblExtExamplesAPIBuiltinsSPIRV target created with NBL_REGISTER_SPIRV_SHADERS
 utility, update CSimpleDebugRenderer.hpp, change location of unified.hlsl

---
 CMakeLists.txt                                |  1 +
 common/CMakeLists.txt                         | 22 +++++++---
 .../common/BuiltinResourcesApplication.hpp    | 36 ++++++++++------
 .../geometry/CSimpleDebugRenderer.hpp         | 20 +++++++--
 common/src/nbl/examples/CMakeLists.txt        | 23 ++++++++--
 .../geometry/shaders/grid.vertex.hlsl         | 11 -----
 .../shaders/template/grid.common.hlsl         | 43 -------------------
 .../shaders => shaders/geometry}/unified.hlsl |  0
 8 files changed, 74 insertions(+), 82 deletions(-)
 delete mode 100644 common/src/nbl/examples/geometry/shaders/grid.vertex.hlsl
 delete mode 100644 common/src/nbl/examples/geometry/shaders/template/grid.common.hlsl
 rename common/src/nbl/examples/{geometry/shaders => shaders/geometry}/unified.hlsl (100%)

diff --git a/CMakeLists.txt b/CMakeLists.txt
index 56c0ee60c..bf18c445d 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -97,6 +97,7 @@ if(NBL_BUILD_EXAMPLES)
 		target_precompile_headers(${T} REUSE_FROM "${NBL_EXAMPLES_API_TARGET}")
 		LINK_BUILTIN_RESOURCES_TO_TARGET(${T} NblExtExamplesAPIBuiltinsSource)
 		LINK_BUILTIN_RESOURCES_TO_TARGET(${T} NblExtExamplesAPIBuiltinsInclude)
+		LINK_BUILTIN_RESOURCES_TO_TARGET(${T} NblExtExamplesAPIBuiltinsSPIRV)
     endforeach()
 
 	NBL_ADJUST_FOLDERS(examples)
diff --git a/common/CMakeLists.txt b/common/CMakeLists.txt
index b32e1a394..66f7e6ea7 100644
--- a/common/CMakeLists.txt
+++ b/common/CMakeLists.txt
@@ -13,6 +13,20 @@ nbl_create_ext_library_project(ExamplesAPI "" "${CMAKE_CURRENT_SOURCE_DIR}/src/n
 set_target_properties(${LIB_NAME} PROPERTIES DISABLE_PRECOMPILE_HEADERS OFF)
 target_precompile_headers(${LIB_NAME} PUBLIC "${CMAKE_CURRENT_SOURCE_DIR}/include/nbl/examples/PCH.hpp")
 
+set(COMMON_INCLUDE_DIRECTORY "${CMAKE_CURRENT_SOURCE_DIR}/include")
+
+function(INTERFACE_TO_BUILTINS TARGET)
+    #[[
+        even though builtin target is static library its still valid to reuse 
+        common PCH to boost its build speed to not preprocess entire Nabla again
+    ]]
+    set_target_properties(${TARGET} PROPERTIES DISABLE_PRECOMPILE_HEADERS OFF)
+    target_precompile_headers(${TARGET} REUSE_FROM "${LIB_NAME}")
+
+    target_include_directories(${TARGET} PUBLIC "${CMAKE_CURRENT_FUNCTION_LIST_DIR}/include")
+    target_link_libraries(${TARGET} INTERFACE ${LIB_NAME})
+endfunction()
+
 function(REGISTER_COMMON_BUILTINS)
 	cmake_parse_arguments(EX "" "TARGET;ARCHIVE_ABS_ENTRY;ARCHIVE_NAMESPACE" "GLOB_RGX" ${ARGN})
 
@@ -36,13 +50,7 @@ function(REGISTER_COMMON_BUILTINS)
 	endforeach()
 
     ADD_CUSTOM_BUILTIN_RESOURCES(${EX_TARGET} EXAMPLES_RESOURCES_TO_EMBED "${INPUT_DIRECTORY}" "${EX_ARCHIVE_ABS_ENTRY}" "${EX_ARCHIVE_NAMESPACE}" "${OUTPUT_INCLUDE}" "${OUTPUT_SRC}")
-
-    # even though builtin target is static library its still valid to reuse common PCH to boost its build speed to not preprocess entire Nabla again
-    set_target_properties(${EX_TARGET} PROPERTIES DISABLE_PRECOMPILE_HEADERS OFF)
-    target_precompile_headers(${EX_TARGET} REUSE_FROM "${LIB_NAME}")
-
-    target_include_directories(${EX_TARGET} PUBLIC "${INPUT_DIRECTORY}/include")
-    target_link_libraries(${EX_TARGET} INTERFACE ${LIB_NAME})
+    INTERFACE_TO_BUILTINS(${EX_TARGET})
 endfunction()
 
 #! common example API builtins as static library targets linked to each example
diff --git a/common/include/nbl/examples/common/BuiltinResourcesApplication.hpp b/common/include/nbl/examples/common/BuiltinResourcesApplication.hpp
index c32bbc3ea..02509ca6a 100644
--- a/common/include/nbl/examples/common/BuiltinResourcesApplication.hpp
+++ b/common/include/nbl/examples/common/BuiltinResourcesApplication.hpp
@@ -4,20 +4,25 @@
 #ifndef _NBL_EXAMPLES_BUILTIN_RESOURCE_APPLICATION_HPP_INCLUDED_
 #define _NBL_EXAMPLES_BUILTIN_RESOURCE_APPLICATION_HPP_INCLUDED_
 
-
 // we need a system, logger and an asset manager
 #include "nbl/application_templates/MonoAssetManagerApplication.hpp"
 
 #ifdef NBL_EMBED_BUILTIN_RESOURCES
 	#include "nbl/builtin/examples/include/CArchive.h"
 	#include "nbl/builtin/examples/src/CArchive.h"
-	// TODO: the build `nbl/examples` archive
-#if __has_include("nbl/this_example/builtin/CArchive.h")
-	#include "nbl/this_example/builtin/CArchive.h"
-#endif
+	#include "nbl/builtin/examples/build/spirv/CArchive.h"
+	#if __has_include("nbl/this_example/builtin/CArchive.h")
+		#include "nbl/this_example/builtin/CArchive.h"
+	#endif
+	// TODO: (**) there should be also 5th arch "nbl/this_example/builtin/build/spirv/CArchive.h"
+	/*
+		#if __has_include("nbl/this_example/builtin/build/spirv/CArchive.h")
+		#include "nbl/this_example/builtin/build/spirv/CArchive.h"
+		#endif
+	*/
+	//! this ain't meant to be the same as this_example ordinary archive
 #endif
 
-
 namespace nbl::examples
 {
 
@@ -40,27 +45,30 @@ class BuiltinResourcesApplication : public virtual application_templates::MonoAs
 
 			using namespace core;
 
-			smart_refctd_ptr<system::IFileArchive> examplesHeaderArch,examplesSourceArch,examplesBuildArch,thisExampleArch;
-		#ifdef NBL_EMBED_BUILTIN_RESOURCES
+			smart_refctd_ptr<system::IFileArchive> examplesHeaderArch,examplesSourceArch,examplesBuildSpirvArch,thisExampleArch;
+			#ifdef NBL_EMBED_BUILTIN_RESOURCES
 			examplesHeaderArch = core::make_smart_refctd_ptr<nbl::builtin::examples::include::CArchive>(smart_refctd_ptr(m_logger));
 			examplesSourceArch = core::make_smart_refctd_ptr<nbl::builtin::examples::src::CArchive>(smart_refctd_ptr(m_logger));
-			// TODO: the build archive
+			examplesBuildSpirvArch = core::make_smart_refctd_ptr<nbl::builtin::examples::build::spirv::CArchive>(smart_refctd_ptr(m_logger));
 
 			#ifdef _NBL_THIS_EXAMPLE_BUILTIN_C_ARCHIVE_H_
 				thisExampleArch = make_smart_refctd_ptr<nbl::this_example::builtin::CArchive>(smart_refctd_ptr(m_logger));
 			#endif
-		#else
+			// TODO: (**)
+			#else
 			examplesHeaderArch = make_smart_refctd_ptr<system::CMountDirectoryArchive>(localInputCWD/"../common/include/nbl/examples",smart_refctd_ptr(m_logger),m_system.get());
 			examplesSourceArch = make_smart_refctd_ptr<system::CMountDirectoryArchive>(localInputCWD/"../common/src/nbl/examples",smart_refctd_ptr(m_logger),m_system.get());
-// TODO: examplesBuildArch =
+			examplesBuildSpirvArch = make_smart_refctd_ptr<system::CMountDirectoryArchive>(NBL_EXAMPLES_BUILD_SPIRV_MOUNT_POINT, smart_refctd_ptr(m_logger), m_system.get());
 			thisExampleArch = make_smart_refctd_ptr<system::CMountDirectoryArchive>(localInputCWD/"app_resources",smart_refctd_ptr(m_logger),m_system.get());
-		#endif
+			// TODO: (**)
+			#endif
 			// yes all 3 aliases are meant to be the same
 			m_system->mount(std::move(examplesHeaderArch),"nbl/examples");
 			m_system->mount(std::move(examplesSourceArch),"nbl/examples");
-//			m_system->mount(std::move(examplesBuildArch),"nbl/examples");
+			m_system->mount(std::move(examplesBuildSpirvArch),"nbl/examples");
 			if (thisExampleArch)
 				m_system->mount(std::move(thisExampleArch),"app_resources");
+			// TODO: (**)
 
 			return true;
 		}
@@ -68,4 +76,4 @@ class BuiltinResourcesApplication : public virtual application_templates::MonoAs
 
 }
 
-#endif // _CAMERA_IMPL_
\ No newline at end of file
+#endif // _NBL_EXAMPLES_BUILTIN_RESOURCE_APPLICATION_HPP_INCLUDED_
\ No newline at end of file
diff --git a/common/include/nbl/examples/geometry/CSimpleDebugRenderer.hpp b/common/include/nbl/examples/geometry/CSimpleDebugRenderer.hpp
index 474f1d350..7b849e3b6 100644
--- a/common/include/nbl/examples/geometry/CSimpleDebugRenderer.hpp
+++ b/common/include/nbl/examples/geometry/CSimpleDebugRenderer.hpp
@@ -97,14 +97,26 @@ class CSimpleDebugRenderer final : public core::IReferenceCounted
 			// load shader
 			smart_refctd_ptr<IShader> shader;
 			{
-				const auto bundle = assMan->getAsset("nbl/examples/geometry/shaders/unified.hlsl",{});
-// TODO: Arek
-				//const auto bundle = assMan->getAsset("nbl/examples/geometry/shaders/unified.spv",{});
+				// TODO & NOTE: tmp, maybe I will turn it into CMake option
+				#define NBL_USE_PRECOMPILED_SPIRV
+
+				#ifdef NBL_USE_PRECOMPILED_SPIRV
+				constexpr std::string_view key = "nbl/examples/shaders/geometry/unified.hlsl.spv";
+				#else
+				constexpr std::string_view key = "nbl/examples/shaders/geometry/unified.hlsl";
+				#endif // NBL_USE_PRECOMPILED_SPIRV
+
+				const auto bundle = assMan->getAsset(key.data(), {});
+
+				//const auto bundle = assMan->getAsset("nbl/examples/shaders/geometry/unified.hlsl.spv",{});
 				const auto contents = bundle.getContents();
 				if (contents.empty() || bundle.getAssetType()!=IAsset::ET_SHADER)
 					return nullptr;
 				shader = IAsset::castDown<IShader>(contents[0]);
-				shader = device->compileShader({.source=shader.get()});
+				
+				#ifndef NBL_USE_PRECOMPILED_SPIRV
+				shader = device->compileShader({ .source = shader.get() });
+				#endif // NBL_USE_PRECOMPILED_SPIRV
 				if (!shader)
 					return nullptr;
 			}
diff --git a/common/src/nbl/examples/CMakeLists.txt b/common/src/nbl/examples/CMakeLists.txt
index a95372eea..3f4008541 100644
--- a/common/src/nbl/examples/CMakeLists.txt
+++ b/common/src/nbl/examples/CMakeLists.txt
@@ -1,4 +1,21 @@
-# TODO builtin SPIR-V shaders
-# add_subdirectory(geometry EXCLUDE_FROM_ALL)
+set(SPIRV_TARGET_V 6_8)
 
-# TODO: make docs once I get n4ce embed SPIRV tool to build system and then use the tool with Matts new shader
\ No newline at end of file
+set(COMMON_OPTIONS
+	-I "${COMMON_INCLUDE_DIRECTORY}"
+)
+
+NBL_REGISTER_SPIRV_SHADERS(
+	MOUNT_POINT_DEFINE
+		NBL_EXAMPLES_BUILD_SPIRV_MOUNT_POINT
+
+    ARCHIVE
+        TARGET NblExtExamplesAPIBuiltinsSPIRV
+        INPUT_DIRECTORY .
+        NAMESPACE nbl::builtin::examples::build::spirv
+
+	INPUTS
+		KEY shaders/geometry/unified.hlsl COMPILE_OPTIONS ${COMMON_OPTIONS} -T lib_${SPIRV_TARGET_V}
+		# KEY <xyz> COMPILE_OPTIONS ${COMMON_OPTIONS} -T <target>_${SPIRV_TARGET_V}
+)
+
+INTERFACE_TO_BUILTINS(NblExtExamplesAPIBuiltinsSPIRV)
\ No newline at end of file
diff --git a/common/src/nbl/examples/geometry/shaders/grid.vertex.hlsl b/common/src/nbl/examples/geometry/shaders/grid.vertex.hlsl
deleted file mode 100644
index 389c37bf2..000000000
--- a/common/src/nbl/examples/geometry/shaders/grid.vertex.hlsl
+++ /dev/null
@@ -1,11 +0,0 @@
-#include "template/grid.common.hlsl"
-
-
-PSInput VSMain(VSInput input)
-{
-    PSInput output;
-    output.position = mul(params.MVP, float4(input.position, 1.0));
-    output.uv = (input.uv - float2(0.5, 0.5)) * abs(input.position.xy);
-    
-    return output;
-}
\ No newline at end of file
diff --git a/common/src/nbl/examples/geometry/shaders/template/grid.common.hlsl b/common/src/nbl/examples/geometry/shaders/template/grid.common.hlsl
deleted file mode 100644
index 7ec9017e9..000000000
--- a/common/src/nbl/examples/geometry/shaders/template/grid.common.hlsl
+++ /dev/null
@@ -1,43 +0,0 @@
-#ifndef _NBL_EXAMPLES_GRID_COMMON_HLSL_
-#define _NBL_EXAMPLES_GRID_COMMON_HLSL_
-
-#include "common/SBasicViewParameters.hlsl"
-
-#ifdef __HLSL_VERSION
-// TODO: why is there even a mesh with HW vertices for this?
-struct VSInput
-{
-	[[vk::location(0)]] float3 position : POSITION;
-	[[vk::location(1)]] float4 color : COLOR;
-	[[vk::location(2)]] float2 uv : TEXCOORD;
-	[[vk::location(3)]] float3 normal : NORMAL;
-};
-
-struct PSInput
-{
-    float4 position : SV_Position;
-    float2 uv : TEXCOORD0;
-};
-
-[[vk::push_constant]] SBasicViewParameters params;
-#endif // __HLSL_VERSION
-
-
-float gridTextureGradBox(float2 p, float2 ddx, float2 ddy)
-{
-    float N = 30.0; // grid ratio
-    float2 w = max(abs(ddx), abs(ddy)) + 0.01; // filter kernel
-
-    // analytic (box) filtering
-    float2 a = p + 0.5 * w;
-    float2 b = p - 0.5 * w;
-    float2 i = (floor(a) + min(frac(a) * N, 1.0) - floor(b) - min(frac(b) * N, 1.0)) / (N * w);
-
-    // pattern
-    return (1.0 - i.x) * (1.0 - i.y);
-}
-
-#endif // _NBL_EXAMPLES_GRID_COMMON_HLSL_
-/*
-    do not remove this text, WAVE is so bad that you can get errors if no proper ending xD
-*/
\ No newline at end of file
diff --git a/common/src/nbl/examples/geometry/shaders/unified.hlsl b/common/src/nbl/examples/shaders/geometry/unified.hlsl
similarity index 100%
rename from common/src/nbl/examples/geometry/shaders/unified.hlsl
rename to common/src/nbl/examples/shaders/geometry/unified.hlsl

From e790c841466747645638060ded7972de2ca0348b Mon Sep 17 00:00:00 2001
From: devsh <devsh@devsh.eu>
Date: Wed, 25 Jun 2025 15:38:43 +0200
Subject: [PATCH 433/529] more improvements to make the Simple Debug Renderer
 more runtime friendly

---
 09_GeometryCreator/main.cpp                   |  31 ++-
 61_UI/main.cpp                                |  29 +-
 .../geometry/CSimpleDebugRenderer.hpp         | 260 ++++++++++++------
 3 files changed, 219 insertions(+), 101 deletions(-)

diff --git a/09_GeometryCreator/main.cpp b/09_GeometryCreator/main.cpp
index 1a959f7a0..900d827b7 100644
--- a/09_GeometryCreator/main.cpp
+++ b/09_GeometryCreator/main.cpp
@@ -37,25 +37,32 @@ class GeometryCreatorApp final : public MonoWindowApplication, public BuiltinRes
 			}
 
 			const uint32_t addtionalBufferOwnershipFamilies[] = {getGraphicsQueue()->getFamilyIndex()};
-			// we want to use the vertex data through UTBs
-			using usage_f = IGPUBuffer::E_USAGE_FLAGS;
-			CAssetConverter::patch_t<asset::ICPUPolygonGeometry> patch = {};
-			patch.positionBufferUsages = usage_f::EUF_UNIFORM_TEXEL_BUFFER_BIT;
-			patch.indexBufferUsages = usage_f::EUF_INDEX_BUFFER_BIT;
-			patch.otherBufferUsages = usage_f::EUF_UNIFORM_TEXEL_BUFFER_BIT;
 			m_scene = CGeometryCreatorScene::create(
 				{
 					.transferQueue = getTransferUpQueue(),
 					.utilities = m_utils.get(),
 					.logger = m_logger.get(),
 					.addtionalBufferOwnershipFamilies = addtionalBufferOwnershipFamilies
-				},patch
+				},
+				CSimpleDebugRenderer::DefaultPolygonGeometryPatch // we want to use the vertex data through UTBs
 			);
 			
 			auto scRes = static_cast<CDefaultSwapchainFramebuffers*>(m_surface->getSwapchainResources());
-			m_renderer = CSimpleDebugRenderer::create(m_assetMgr.get(),scRes->getRenderpass(),0,m_scene.get());
-			if (!m_renderer)
+			const auto& geometries = m_scene->getInitParams().geometries;
+			m_renderer = CSimpleDebugRenderer::create(m_assetMgr.get(),scRes->getRenderpass(),0,{&geometries.front().get(),geometries.size()});
+			if (!m_renderer || m_renderer->getGeometries().size() != geometries.size())
 				return logFail("Could not create Renderer!");
+			// special case
+			{
+				const auto& pipelines = m_renderer->getInitParams().pipelines;
+				auto ix = 0u;
+				for (const auto& name : m_scene->getInitParams().geometryNames)
+				{
+					if (name=="Cone")
+						m_renderer->getGeometry(ix).pipeline = pipelines[CSimpleDebugRenderer::SInitParams::PipelineType::Cone];
+					ix++;
+				}
+			}
 			m_renderer->m_instances.resize(1);
 			m_renderer->m_instances[0].world = float32_t3x4(
 				float32_t4(1,0,0,0),
@@ -143,7 +150,7 @@ class GeometryCreatorApp final : public MonoWindowApplication, public BuiltinRes
 			const auto viewParams = CSimpleDebugRenderer::SViewParams(viewMatrix,viewProjMatrix);
 
 			// tear down scene every frame
-			m_renderer->m_instances[0].packedGeo = m_renderer->getInitParams().geoms.data()+gcIndex;
+			m_renderer->m_instances[0].packedGeo = m_renderer->getGeometries().data()+gcIndex;
  			m_renderer->render(cb,viewParams);
 
 			cb->endRenderPass();
@@ -185,7 +192,7 @@ class GeometryCreatorApp final : public MonoWindowApplication, public BuiltinRes
 			std::string caption = "[Nabla Engine] Geometry Creator";
 			{
 				caption += ", displaying [";
-				caption += m_scene->getGeometries()[gcIndex].name;
+				caption += m_scene->getInitParams().geometryNames[gcIndex];
 				caption += "]";
 				m_window->setCaption(caption);
 			}
@@ -258,7 +265,7 @@ class GeometryCreatorApp final : public MonoWindowApplication, public BuiltinRes
 				if (ev.type==nbl::ui::SMouseEvent::EET_SCROLL && m_renderer)
 				{
 					gcIndex += int16_t(core::sign(ev.scrollEvent.verticalScroll));
-					gcIndex = core::clamp(gcIndex,0ull,m_renderer->getInitParams().geoms.size()-1);
+					gcIndex = core::clamp(gcIndex,0ull,m_renderer->getGeometries().size()-1);
 				}
 			}
 		}
diff --git a/61_UI/main.cpp b/61_UI/main.cpp
index 830318e4e..643cab079 100644
--- a/61_UI/main.cpp
+++ b/61_UI/main.cpp
@@ -40,19 +40,14 @@ class UISampleApp final : public MonoWindowApplication, public BuiltinResourcesA
 			}
 			
 			const uint32_t addtionalBufferOwnershipFamilies[] = {getGraphicsQueue()->getFamilyIndex()};
-			// we want to use the vertex data through UTBs
-			using usage_f = IGPUBuffer::E_USAGE_FLAGS;
-			CAssetConverter::patch_t<asset::ICPUPolygonGeometry> patch = {};
-			patch.positionBufferUsages = usage_f::EUF_UNIFORM_TEXEL_BUFFER_BIT;
-			patch.indexBufferUsages = usage_f::EUF_INDEX_BUFFER_BIT;
-			patch.otherBufferUsages = usage_f::EUF_UNIFORM_TEXEL_BUFFER_BIT;
 			m_scene = CGeometryCreatorScene::create(
 				{
 					.transferQueue = getTransferUpQueue(),
 					.utilities = m_utils.get(),
 					.logger = m_logger.get(),
 					.addtionalBufferOwnershipFamilies = addtionalBufferOwnershipFamilies
-				},patch
+				},
+				CSimpleDebugRenderer::DefaultPolygonGeometryPatch
 			);
 			
 			// for the scene drawing pass
@@ -137,7 +132,19 @@ class UISampleApp final : public MonoWindowApplication, public BuiltinResourcesA
 				if (!m_renderpass)
 					return logFail("Failed to create Scene Renderpass!");
 			}
-			m_renderer = CSimpleDebugRenderer::create(m_assetMgr.get(),m_renderpass.get(),0,m_scene.get());
+			const auto& geometries = m_scene->getInitParams().geometries;
+			m_renderer = CSimpleDebugRenderer::create(m_assetMgr.get(),m_renderpass.get(),0,{&geometries.front().get(),geometries.size()});
+			// special case
+			{
+				const auto& pipelines = m_renderer->getInitParams().pipelines;
+				auto ix = 0u;
+				for (const auto& name : m_scene->getInitParams().geometryNames)
+				{
+					if (name=="Cone")
+						m_renderer->getGeometry(ix).pipeline = pipelines[CSimpleDebugRenderer::SInitParams::PipelineType::Cone];
+					ix++;
+				}
+			}
 			// we'll only display one thing at a time
 			m_renderer->m_instances.resize(1);
 
@@ -258,7 +265,7 @@ class UISampleApp final : public MonoWindowApplication, public BuiltinResourcesA
 					// tear down scene every frame
 					auto& instance = m_renderer->m_instances[0];
 					memcpy(&instance.world,&interface.model,sizeof(instance.world));
-					instance.packedGeo = m_renderer->getInitParams().geoms.data()+interface.gcIndex;
+					instance.packedGeo = m_renderer->getGeometries().data() + interface.gcIndex;
  					m_renderer->render(cb,viewParams);
 				}
 				cb->endRenderPass();
@@ -418,7 +425,7 @@ class UISampleApp final : public MonoWindowApplication, public BuiltinResourcesA
 							if (e.type==nbl::ui::SMouseEvent::EET_SCROLL && m_renderer)
 							{
 								interface.gcIndex += int16_t(core::sign(e.scrollEvent.verticalScroll));
-								interface.gcIndex = core::clamp(interface.gcIndex,0ull,m_renderer->getInitParams().geoms.size()-1);
+								interface.gcIndex = core::clamp(interface.gcIndex,0ull,m_renderer->getGeometries().size()-1);
 							}
 						}
 					},
@@ -453,7 +460,7 @@ class UISampleApp final : public MonoWindowApplication, public BuiltinResourcesA
 				.keyboardEvents = uiEvents.keyboard
 			};
 
-			interface.objectName = m_scene->getGeometries()[interface.gcIndex].name;
+			interface.objectName = m_scene->getInitParams().geometryNames[interface.gcIndex];
 			interface.imGUI->update(params);
 		}
 
diff --git a/common/include/nbl/examples/geometry/CSimpleDebugRenderer.hpp b/common/include/nbl/examples/geometry/CSimpleDebugRenderer.hpp
index 325ae8eb7..969b3afd8 100644
--- a/common/include/nbl/examples/geometry/CSimpleDebugRenderer.hpp
+++ b/common/include/nbl/examples/geometry/CSimpleDebugRenderer.hpp
@@ -20,7 +20,10 @@ class CSimpleDebugRenderer final : public core::IReferenceCounted
 			using namespace nbl::system; \
 			using namespace nbl::asset; \
 			using namespace nbl::video
+
 	public:
+		//
+		constexpr static inline uint16_t VertexAttrubUTBDescBinding = 0;
 		//
 		struct SViewParams
 		{
@@ -79,7 +82,19 @@ class CSimpleDebugRenderer final : public core::IReferenceCounted
 		};
 
 		//
-		static inline core::smart_refctd_ptr<CSimpleDebugRenderer> create(asset::IAssetManager* assMan, video::IGPURenderpass* renderpass, const uint32_t subpassIX, const std::span<const video::IGPUPolygonGeometry* const> geometries)
+		constexpr static inline auto DefaultPolygonGeometryPatch = []()->video::CAssetConverter::patch_t<asset::ICPUPolygonGeometry>
+		{
+			// we want to use the vertex data through UTBs
+			using usage_f = video::IGPUBuffer::E_USAGE_FLAGS;
+			video::CAssetConverter::patch_t<asset::ICPUPolygonGeometry> patch = {};
+			patch.positionBufferUsages = usage_f::EUF_UNIFORM_TEXEL_BUFFER_BIT;
+			patch.indexBufferUsages = usage_f::EUF_INDEX_BUFFER_BIT;
+			patch.otherBufferUsages = usage_f::EUF_UNIFORM_TEXEL_BUFFER_BIT;
+			return patch;
+		}();
+
+		//
+		static inline core::smart_refctd_ptr<CSimpleDebugRenderer> create(asset::IAssetManager* assMan, video::IGPURenderpass* renderpass, const uint32_t subpassIX)
 		{
 			EXPOSE_NABLA_NAMESPACES;
 
@@ -88,7 +103,7 @@ class CSimpleDebugRenderer final : public core::IReferenceCounted
 			auto device = const_cast<ILogicalDevice*>(renderpass->getOriginDevice());
 			auto logger = device->getLogger();
 
-			if (!assMan || geometries.empty())
+			if (!assMan)
 				return nullptr;
 
 			// load shader
@@ -113,13 +128,14 @@ class CSimpleDebugRenderer final : public core::IReferenceCounted
 				// create Descriptor Set Layout
 				smart_refctd_ptr<IGPUDescriptorSetLayout> dsLayout;
 				{
+					using binding_flags_t = IGPUDescriptorSetLayout::SBinding::E_CREATE_FLAGS;
 					const IGPUDescriptorSetLayout::SBinding bindings[] =
 					{
 						{
-							.binding = 0,
+							.binding = VertexAttrubUTBDescBinding,
 							.type = IDescriptor::E_TYPE::ET_UNIFORM_TEXEL_BUFFER,
-							// some geometries may not have particular attributes
-							.createFlags = IGPUDescriptorSetLayout::SBinding::E_CREATE_FLAGS::ECF_PARTIALLY_BOUND_BIT,
+							// need this trifecta of flags for `SubAllocatedDescriptorSet` to accept the binding as suballocatable
+							.createFlags = binding_flags_t::ECF_UPDATE_AFTER_BIND_BIT|binding_flags_t::ECF_UPDATE_UNUSED_WHILE_PENDING_BIT |binding_flags_t::ECF_PARTIALLY_BOUND_BIT,
 							.stageFlags = IShader::E_SHADER_STAGE::ESS_VERTEX|IShader::E_SHADER_STAGE::ESS_FRAGMENT,
 							.count = SInstance::SPushConstants::DescriptorCount
 						}
@@ -134,12 +150,13 @@ class CSimpleDebugRenderer final : public core::IReferenceCounted
 
 				// create Descriptor Set
 				auto pool = device->createDescriptorPoolForDSLayouts(IDescriptorPool::ECF_UPDATE_AFTER_BIND_BIT,{&dsLayout.get(),1});
-				init.ds = pool->createDescriptorSet(std::move(dsLayout));
-				if (!init.ds)
+				auto ds = pool->createDescriptorSet(std::move(dsLayout));
+				if (!ds)
 				{
 					logger->log("Could not descriptor set!",ILogger::ELL_ERROR);
 					return nullptr;
 				}
+				init.subAllocDS = make_smart_refctd_ptr<SubAllocatedDescriptorSet>(std::move(ds));
 			}
 
 			// create pipeline layout
@@ -148,7 +165,7 @@ class CSimpleDebugRenderer final : public core::IReferenceCounted
 				.offset = 0,
 				.size = sizeof(SInstance::SPushConstants),
 			}};
-			init.layout = device->createPipelineLayout(ranges,smart_refctd_ptr<const IGPUDescriptorSetLayout>(init.ds->getLayout()));
+			init.layout = device->createPipelineLayout(ranges,smart_refctd_ptr<const IGPUDescriptorSetLayout>(init.subAllocDS->getDescriptorSet()->getLayout()));
 
 			// create pipelines
 			using pipeline_e = SInitParams::PipelineType;
@@ -190,74 +207,18 @@ class CSimpleDebugRenderer final : public core::IReferenceCounted
 				}
 			}
 
-			// write geometries' attributes to descriptor set
-			{
-				core::vector<IGPUDescriptorSet::SDescriptorInfo> infos;
-				auto allocateUTB = [device,&infos](const IGeometry<const IGPUBuffer>::SDataView& view)->uint8_t
-				{
-					if (!view)
-						return SInstance::SPushConstants::DescriptorCount;
-					const auto retval = infos.size();
-					infos.emplace_back().desc = device->createBufferView(view.src, view.composed.format);
-					return retval;
-				};
-
-				for (const auto geom : geometries)
-				{
-					// could also check device origin on all buffers
-					if (!geom->valid())
-						continue;
-					auto& out = init.geoms.emplace_back();
-					switch (geom->getIndexingCallback()->knownTopology())
-					{
-						case E_PRIMITIVE_TOPOLOGY::EPT_TRIANGLE_FAN:
-							out.pipeline = init.pipelines[pipeline_e::BasicTriangleFan];
-							break;
-						default:
-							out.pipeline = init.pipelines[pipeline_e::BasicTriangleList];
-							break;
-					}
-					if (const auto& view=geom->getIndexView(); view)
-					{
-						out.indexBuffer.offset = view.src.offset;
-						out.indexBuffer.buffer = view.src.buffer;
-						switch (view.composed.format)
-						{
-							case E_FORMAT::EF_R16_UINT:
-								out.indexType = EIT_16BIT;
-								break;
-							case E_FORMAT::EF_R32_UINT:
-								out.indexType = EIT_32BIT;
-								break;
-							default:
-								assert(false);
-								return nullptr;
-						}
-					}
-					out.elementCount = geom->getVertexReferenceCount();
-					out.positionView = allocateUTB(geom->getPositionView());
-					out.normalView = allocateUTB(geom->getNormalView());
-					// the first view is usually the UV
-					if (const auto& auxViews = geom->getAuxAttributeViews(); !auxViews.empty())
-						out.uvView = allocateUTB(auxViews.front());
-				}
-
-				if (infos.empty())
-					return nullptr;
-				const IGPUDescriptorSet::SWriteDescriptorSet write = {
-					.dstSet = init.ds.get(),
-					.binding = 0,
-					.arrayElement = 0,
-					.count = static_cast<uint32_t>(infos.size()),
-					.info = infos.data()
-				};
-				if (!device->updateDescriptorSets({&write,1},{}))
-					return nullptr;
-			}
-
 			return smart_refctd_ptr<CSimpleDebugRenderer>(new CSimpleDebugRenderer(std::move(init)),dont_grab);
 		}
 
+		//
+		static inline core::smart_refctd_ptr<CSimpleDebugRenderer> create(asset::IAssetManager* assMan, video::IGPURenderpass* renderpass, const uint32_t subpassIX, const std::span<const video::IGPUPolygonGeometry* const> geometries)
+		{
+			auto retval = create(assMan,renderpass,subpassIX);
+			if (retval)
+				retval->addGeometries(geometries);
+			return retval;
+		}
+
 		//
 		struct SInitParams
 		{
@@ -269,16 +230,145 @@ class CSimpleDebugRenderer final : public core::IReferenceCounted
 				Count
 			};
 
-			core::smart_refctd_ptr<video::IGPUDescriptorSet> ds;
+			core::smart_refctd_ptr<video::SubAllocatedDescriptorSet> subAllocDS;
 			core::smart_refctd_ptr<video::IGPUPipelineLayout> layout;
 			core::smart_refctd_ptr<video::IGPUGraphicsPipeline> pipelines[PipelineType::Count];
-			core::vector<SPackedGeometry> geoms;
 		};
 		inline const SInitParams& getInitParams() const {return m_params;}
 
 		//
-		inline auto& getGeometry(const uint32_t ix) {return m_params.geoms[ix];}
-		inline const auto& getGeometry(const uint32_t ix) const {return m_params.geoms[ix];}
+		inline bool addGeometries(const std::span<const video::IGPUPolygonGeometry* const> geometries)
+		{
+			EXPOSE_NABLA_NAMESPACES;
+			if (geometries.empty())
+				return false;
+			auto device = const_cast<ILogicalDevice*>(m_params.layout->getOriginDevice());
+
+			core::vector<IGPUDescriptorSet::SWriteDescriptorSet> writes;
+			core::vector<IGPUDescriptorSet::SDescriptorInfo> infos;
+			auto allocateUTB = [&](const IGeometry<const IGPUBuffer>::SDataView& view)->uint8_t
+			{
+				if (!view)
+					return SInstance::SPushConstants::DescriptorCount;
+				auto index = SubAllocatedDescriptorSet::invalid_value;
+				if (m_params.subAllocDS->multi_allocate(VertexAttrubUTBDescBinding,1,&index)!=0)
+					return SInstance::SPushConstants::DescriptorCount;
+				const auto retval = infos.size();
+				infos.emplace_back().desc = device->createBufferView(view.src,view.composed.format);
+				writes.emplace_back() = {
+					.dstSet = m_params.subAllocDS->getDescriptorSet(),
+					.binding = VertexAttrubUTBDescBinding,
+					.arrayElement = index,
+					.count = 1,
+					.info = reinterpret_cast<const IGPUDescriptorSet::SDescriptorInfo*>(retval)
+				};
+				return retval;
+			};
+
+			auto sizeToSet = m_geoms.size();
+			auto resetGeoms = core::makeRAIIExiter([&]()->void
+				{
+					for (auto& write : writes)
+						immediateDealloc(write.arrayElement);
+					m_geoms.resize(sizeToSet);
+				}
+			);
+			for (const auto geom : geometries)
+			{
+				// could also check device origin on all buffers
+				if (!geom->valid())
+					return false;
+				auto& out = m_geoms.emplace_back();
+				using pipeline_e = SInitParams::PipelineType;
+				switch (geom->getIndexingCallback()->knownTopology())
+				{
+					case E_PRIMITIVE_TOPOLOGY::EPT_TRIANGLE_FAN:
+						out.pipeline = m_params.pipelines[pipeline_e::BasicTriangleFan];
+						break;
+					default:
+						out.pipeline = m_params.pipelines[pipeline_e::BasicTriangleList];
+						break;
+				}
+				if (const auto& view=geom->getIndexView(); view)
+				{
+					out.indexBuffer.offset = view.src.offset;
+					out.indexBuffer.buffer = view.src.buffer;
+					switch (view.composed.format)
+					{
+						case E_FORMAT::EF_R16_UINT:
+							out.indexType = EIT_16BIT;
+							break;
+						case E_FORMAT::EF_R32_UINT:
+							out.indexType = EIT_32BIT;
+							break;
+						default:
+							return false;
+					}
+				}
+				out.elementCount = geom->getVertexReferenceCount();
+				out.positionView = allocateUTB(geom->getPositionView());
+				out.normalView = allocateUTB(geom->getNormalView());
+				// the first view is usually the UV
+				if (const auto& auxViews = geom->getAuxAttributeViews(); !auxViews.empty())
+					out.uvView = allocateUTB(auxViews.front());
+			}
+
+			// no geometry
+			if (infos.empty())
+				return false;
+
+			// unbase our pointers
+			for (auto& write : writes)
+				write.info = infos.data()+reinterpret_cast<const size_t&>(write.info);
+			if (!device->updateDescriptorSets(writes,{}))
+				return false;
+
+			// retain
+			writes.clear();
+			sizeToSet = m_geoms.size();
+			return true;
+		}
+
+		//
+		inline void removeGeometry(const uint32_t ix, const video::ISemaphore::SWaitInfo& info)
+		{
+			EXPOSE_NABLA_NAMESPACES;
+			if (ix>=m_geoms.size())
+				return;
+
+			core::vector<SubAllocatedDescriptorSet::value_type> deferredFree;
+			deferredFree.reserve(3);
+			auto deallocate = [&](SubAllocatedDescriptorSet::value_type index)->void
+			{
+				if (info.semaphore)
+					deferredFree.push_back(index);
+				else
+					immediateDealloc(index);
+			};
+			auto geo = m_geoms.begin() + ix;
+			deallocate(geo->positionView);
+			deallocate(geo->normalView);
+			deallocate(geo->uvView);
+			m_geoms.erase(geo);
+
+			if (deferredFree.empty())
+				return;
+
+			core::vector<IGPUDescriptorSet::SDropDescriptorSet> nullify(deferredFree.size());
+			const_cast<ILogicalDevice*>(m_params.layout->getOriginDevice())->nullifyDescriptors(nullify);
+		}
+
+		//
+		inline void clearGeometries(const video::ISemaphore::SWaitInfo& info)
+		{
+			// back to front to avoid O(n^2) resize
+			while (!m_geoms.empty())
+				removeGeometry(m_geoms.size()-1,info);
+		}
+
+		//
+		inline const auto& getGeometries() const {return m_geoms;}
+		inline auto& getGeometry(const uint32_t ix) {return m_geoms[ix];}
 
 		//
 		inline void render(video::IGPUCommandBuffer* cmdbuf, const SViewParams& viewParams) const
@@ -288,7 +378,8 @@ class CSimpleDebugRenderer final : public core::IReferenceCounted
 			cmdbuf->beginDebugMarker("CSimpleDebugRenderer::render");
 
 			const auto* layout = m_params.layout.get();
-			cmdbuf->bindDescriptorSets(E_PIPELINE_BIND_POINT::EPBP_GRAPHICS,layout,0,1,&m_params.ds.get());
+			const auto ds = m_params.subAllocDS->getDescriptorSet();
+			cmdbuf->bindDescriptorSets(E_PIPELINE_BIND_POINT::EPBP_GRAPHICS,layout,0,1,&ds);
 
 			for (const auto& instance : m_instances)
 			{
@@ -311,8 +402,21 @@ class CSimpleDebugRenderer final : public core::IReferenceCounted
 
 	protected:
 		inline CSimpleDebugRenderer(SInitParams&& _params) : m_params(std::move(_params)) {}
+		inline ~CSimpleDebugRenderer()
+		{
+			// clean shutdown, can also make SubAllocatedDescriptorSet resillient against that, and issue `device->waitIdle` if not everything is freed
+			const_cast<video::ILogicalDevice*>(m_params.layout->getOriginDevice())->waitIdle();
+			clearGeometries({});
+		}
+
+		inline void immediateDealloc(video::SubAllocatedDescriptorSet::value_type index)
+		{
+			video::IGPUDescriptorSet::SDropDescriptorSet dummy[1];
+			m_params.subAllocDS->multi_deallocate(dummy,VertexAttrubUTBDescBinding,1,&index);
+		}
 
 		SInitParams m_params;
+		core::vector<SPackedGeometry> m_geoms;
 #undef EXPOSE_NABLA_NAMESPACES
 };
 

From eaa132075c5c8564723b07d28ce58bb3040b4dba Mon Sep 17 00:00:00 2001
From: devsh <devsh@devsh.eu>
Date: Wed, 25 Jun 2025 15:50:26 +0200
Subject: [PATCH 434/529] prep the conversion

---
 12_MeshLoaders/main.cpp | 123 +++++++++++++++++++++++++++++++++-------
 1 file changed, 102 insertions(+), 21 deletions(-)

diff --git a/12_MeshLoaders/main.cpp b/12_MeshLoaders/main.cpp
index 0a4e20141..8c97cb44a 100644
--- a/12_MeshLoaders/main.cpp
+++ b/12_MeshLoaders/main.cpp
@@ -40,29 +40,14 @@ class MeshLoadersApp final : public MonoWindowApplication, public BuiltinResourc
 			m_qnc = make_smart_refctd_ptr<CQuantNormalCache>();
 			m_qnc->loadCacheFromFile<EF_R8G8B8_SNORM>(m_system.get(),sharedOutputCWD/"../../tmp/normalCache888.sse");
 
+			auto scRes = static_cast<CDefaultSwapchainFramebuffers*>(m_surface->getSwapchainResources());
+			m_renderer = CSimpleDebugRenderer::create(m_assetMgr.get(),scRes->getRenderpass(),0,{});
+			if (!m_renderer)
+				return logFail("Failed to create renderer!");
+
 			//
 			if (!reloadModel())
 				return false;
-#if 0			
-			const uint32_t addtionalBufferOwnershipFamilies[] = {getGraphicsQueue()->getFamilyIndex()};
-			// we want to use the vertex data through UTBs
-			using usage_f = IGPUBuffer::E_USAGE_FLAGS;
-			CAssetConverter::patch_t<asset::ICPUPolygonGeometry> patch = {};
-			patch.positionBufferUsages = usage_f::EUF_UNIFORM_TEXEL_BUFFER_BIT;
-			patch.indexBufferUsages = usage_f::EUF_INDEX_BUFFER_BIT;
-			patch.otherBufferUsages = usage_f::EUF_UNIFORM_TEXEL_BUFFER_BIT;
-			m_scene = CGeometryCreatorScene::create(
-				{
-					.transferQueue = getTransferUpQueue(),
-					.utilities = m_utils.get(),
-					.logger = m_logger.get(),
-					.addtionalBufferOwnershipFamilies = addtionalBufferOwnershipFamilies
-				},patch
-			);
-#endif
-
-			auto scRes = static_cast<CDefaultSwapchainFramebuffers*>(m_surface->getSwapchainResources());
-			m_renderer = CSimpleDebugRenderer::create(m_assetMgr.get(),scRes->getRenderpass(),0,nullptr);
 
 			camera.mapKeysToArrows();
 
@@ -250,6 +235,8 @@ class MeshLoadersApp final : public MonoWindowApplication, public BuiltinResourc
 			}
 
 			// free up
+			m_renderer->m_instances.clear();
+			m_renderer->clearGeometries({.semaphore=m_semaphore.get(),.value=m_realFrameIx});
 			m_assetMgr->clearAllAssetCache();
 
 			//! load the geometry
@@ -258,10 +245,104 @@ class MeshLoadersApp final : public MonoWindowApplication, public BuiltinResourc
 			auto bundle = m_assetMgr->getAsset(m_modelPath,params);
 			if (bundle.getContents().empty())
 				return false;
+
+			// 
+			core::vector<smart_refctd_ptr<const ICPUPolygonGeometry>> geometries;
+			switch (bundle.getAssetType())
+			{
+				case IAsset::E_TYPE::ET_GEOMETRY:
+					for (const auto& item : bundle.getContents())
+					if (auto polyGeo=IAsset::castDown<ICPUPolygonGeometry>(item); polyGeo)
+						geometries.push_back(polyGeo);
+					break;
+				default:
+					m_logger->log("Asset loaded but not a supported type (ET_GEOMETRY,ET_GEOMETRY_COLLECTION)",ILogger::ELL_ERROR);
+					break;
+			}
+			if (geometries.empty())
+				return false;
+
 			//! cache results -- speeds up mesh generation on second run
 			m_qnc->saveCacheToFile<EF_R8G8B8_SNORM>(m_system.get(),sharedOutputCWD/"../../tmp/normalCache888.sse");
+			
+			// convert the geometries
+			{
+				smart_refctd_ptr<CAssetConverter> converter = CAssetConverter::create({.device=m_device.get()});
 
-			return true;
+				const auto transferFamily = getTransferUpQueue()->getFamilyIndex();
+
+				struct SInputs : CAssetConverter::SInputs
+				{
+					virtual inline std::span<const uint32_t> getSharedOwnershipQueueFamilies(const size_t groupCopyID, const asset::ICPUBuffer* buffer, const CAssetConverter::patch_t<asset::ICPUBuffer>& patch) const
+					{
+						return sharedBufferOwnership;
+					}
+
+					core::vector<uint32_t> sharedBufferOwnership;
+				} inputs = {};
+				core::vector<CAssetConverter::patch_t<ICPUPolygonGeometry>> patches(geometries.size(),CSimpleDebugRenderer::DefaultPolygonGeometryPatch);
+				{
+					inputs.logger = m_logger.get();
+					std::get<CAssetConverter::SInputs::asset_span_t<ICPUPolygonGeometry>>(inputs.assets) = {&geometries.front().get(),geometries.size()};
+					std::get<CAssetConverter::SInputs::patch_span_t<ICPUPolygonGeometry>>(inputs.patches) = patches;
+					// set up shared ownership so we don't have to 
+					core::unordered_set<uint32_t> families;
+					families.insert(transferFamily);
+					families.insert(getGraphicsQueue()->getFamilyIndex());
+					if (families.size()>1)
+					for (const auto fam : families)
+						inputs.sharedBufferOwnership.push_back(fam);
+				}
+				
+				// reserve
+				auto reservation = converter->reserve(inputs);
+				if (!reservation)
+				{
+					m_logger->log("Failed to reserve GPU objects for CPU->GPU conversion!",ILogger::ELL_ERROR);
+					return false;
+				}
+
+				// convert
+				{
+					auto semaphore = m_device->createSemaphore(0u);
+
+					constexpr auto MultiBuffering = 2;
+					std::array<smart_refctd_ptr<IGPUCommandBuffer>,MultiBuffering> commandBuffers = {};
+					{
+						auto pool = m_device->createCommandPool(transferFamily,IGPUCommandPool::CREATE_FLAGS::RESET_COMMAND_BUFFER_BIT|IGPUCommandPool::CREATE_FLAGS::TRANSIENT_BIT);
+						pool->createCommandBuffers(IGPUCommandPool::BUFFER_LEVEL::PRIMARY,commandBuffers,smart_refctd_ptr(m_logger));
+					}
+					commandBuffers.front()->begin(IGPUCommandBuffer::USAGE::ONE_TIME_SUBMIT_BIT);
+
+					std::array<IQueue::SSubmitInfo::SCommandBufferInfo,MultiBuffering> commandBufferSubmits;
+					for (auto i=0; i<MultiBuffering; i++)
+						commandBufferSubmits[i].cmdbuf = commandBuffers[i].get();
+
+					SIntendedSubmitInfo transfer = {};
+					transfer.queue = getTransferUpQueue();
+					transfer.scratchCommandBuffers = commandBufferSubmits;
+					transfer.scratchSemaphore = {
+						.semaphore = semaphore.get(),
+						.value = 0u,
+						.stageMask = PIPELINE_STAGE_FLAGS::ALL_TRANSFER_BITS
+					};
+
+					CAssetConverter::SConvertParams cpar = {};
+					cpar.utilities = m_utils.get();
+					cpar.transfer = &transfer;
+
+					// basically it records all data uploads and submits them right away
+					auto future = reservation.convert(cpar);
+					if (future.copy()!=IQueue::RESULT::SUCCESS)
+					{
+						m_logger->log("Failed to await submission feature!", ILogger::ELL_ERROR);
+						return false;
+					}
+				}
+
+				const auto& converted = reservation.getGPUObjects<ICPUPolygonGeometry>();
+				return m_renderer->addGeometries({&converted.front().get(),converted.size()});
+			}
 		}
 
 		// Maximum frames which can be simultaneously submitted, used to cycle through our per-frame resources like command buffers

From c71e38692e5be4a73012e937043fd9fb3f69ace2 Mon Sep 17 00:00:00 2001
From: Arkadiusz Lachowicz <areklachowicz@gmail.com>
Date: Wed, 25 Jun 2025 16:52:58 +0200
Subject: [PATCH 435/529] update CMake to support (NOT
 NBL_EMBED_BUILTIN_RESOURCES) mode but it seems NSC cannot find boost headers
 (like something is off with mounting the boost directory)

---
 CMakeLists.txt                         | 12 +++++++++---
 common/CMakeLists.txt                  |  2 +-
 common/src/nbl/examples/CMakeLists.txt |  4 +++-
 3 files changed, 13 insertions(+), 5 deletions(-)

diff --git a/CMakeLists.txt b/CMakeLists.txt
index bf18c445d..044cf6049 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -95,9 +95,15 @@ if(NBL_BUILD_EXAMPLES)
         target_link_libraries(${T} PUBLIC ${NBL_EXAMPLES_API_TARGET})
 		target_include_directories(${T} PUBLIC $<TARGET_PROPERTY:${NBL_EXAMPLES_API_TARGET},INCLUDE_DIRECTORIES>)
 		target_precompile_headers(${T} REUSE_FROM "${NBL_EXAMPLES_API_TARGET}")
-		LINK_BUILTIN_RESOURCES_TO_TARGET(${T} NblExtExamplesAPIBuiltinsSource)
-		LINK_BUILTIN_RESOURCES_TO_TARGET(${T} NblExtExamplesAPIBuiltinsInclude)
-		LINK_BUILTIN_RESOURCES_TO_TARGET(${T} NblExtExamplesAPIBuiltinsSPIRV)
+
+		# TODO: make them all INTERFACE if not NBL_EMBED_BUILTIN_RESOURCES and link in loop
+		if(NBL_EMBED_BUILTIN_RESOURCES)
+			LINK_BUILTIN_RESOURCES_TO_TARGET(${T} NblExtExamplesAPIBuiltinsSource)
+			LINK_BUILTIN_RESOURCES_TO_TARGET(${T} NblExtExamplesAPIBuiltinsInclude)
+			LINK_BUILTIN_RESOURCES_TO_TARGET(${T} NblExtExamplesAPIBuiltinsSPIRV)
+		else()
+			target_link_libraries(${T} PUBLIC NblExtExamplesAPIBuiltinsSPIRV)
+		endif()
     endforeach()
 
 	NBL_ADJUST_FOLDERS(examples)
diff --git a/common/CMakeLists.txt b/common/CMakeLists.txt
index 66f7e6ea7..1cbdefea7 100644
--- a/common/CMakeLists.txt
+++ b/common/CMakeLists.txt
@@ -23,7 +23,7 @@ function(INTERFACE_TO_BUILTINS TARGET)
     set_target_properties(${TARGET} PROPERTIES DISABLE_PRECOMPILE_HEADERS OFF)
     target_precompile_headers(${TARGET} REUSE_FROM "${LIB_NAME}")
 
-    target_include_directories(${TARGET} PUBLIC "${CMAKE_CURRENT_FUNCTION_LIST_DIR}/include")
+    target_include_directories(${TARGET} PUBLIC "${COMMON_INCLUDE_DIRECTORY}")
     target_link_libraries(${TARGET} INTERFACE ${LIB_NAME})
 endfunction()
 
diff --git a/common/src/nbl/examples/CMakeLists.txt b/common/src/nbl/examples/CMakeLists.txt
index 3f4008541..cfebab2b4 100644
--- a/common/src/nbl/examples/CMakeLists.txt
+++ b/common/src/nbl/examples/CMakeLists.txt
@@ -18,4 +18,6 @@ NBL_REGISTER_SPIRV_SHADERS(
 		# KEY <xyz> COMPILE_OPTIONS ${COMMON_OPTIONS} -T <target>_${SPIRV_TARGET_V}
 )
 
-INTERFACE_TO_BUILTINS(NblExtExamplesAPIBuiltinsSPIRV)
\ No newline at end of file
+if(NBL_EMBED_BUILTIN_RESOURCES)
+	INTERFACE_TO_BUILTINS(NblExtExamplesAPIBuiltinsSPIRV)
+endif()
\ No newline at end of file

From d54bd9802d1563acf0cb4284cd120ee5a3977a5a Mon Sep 17 00:00:00 2001
From: Przemek <minikers21@gmail.com>
Date: Wed, 25 Jun 2025 21:09:21 +0200
Subject: [PATCH 436/529] Reimplemented height shading

---
 62_CAD/main.cpp                               |  9 +--
 62_CAD/shaders/main_pipeline/dtm.hlsl         | 51 ++++++--------
 .../main_pipeline/fragment_shader.hlsl        | 67 ++++++++++++++-----
 .../shaders/main_pipeline/vertex_shader.hlsl  |  1 +
 4 files changed, 76 insertions(+), 52 deletions(-)

diff --git a/62_CAD/main.cpp b/62_CAD/main.cpp
index 905bdc98d..d6ad87637 100644
--- a/62_CAD/main.cpp
+++ b/62_CAD/main.cpp
@@ -3577,9 +3577,10 @@ class ComputerAidedDesign final : public examples::SimpleWindowedApplication, pu
 			dtmInfo.mode |= E_DTM_MODE::CONTOUR;
 
 			dtmInfo.outlineStyleInfo.screenSpaceLineWidth = 0.0f;
-			dtmInfo.outlineStyleInfo.worldSpaceLineWidth = 2.0f;
+			dtmInfo.outlineStyleInfo.worldSpaceLineWidth = 1.0f;
 			dtmInfo.outlineStyleInfo.color = float32_t4(0.0f, 0.39f, 0.0f, 1.0f);
-			std::array<double, 4> outlineStipplePattern = { 0.0f, -5.0f, 20.0f, -5.0f };
+			//std::array<double, 4> outlineStipplePattern = { 0.0f, -5.0f, 20.0f, -5.0f };
+			std::array<double, 4> outlineStipplePattern = { -10.0f, 10.0f };
 			dtmInfo.outlineStyleInfo.setStipplePatternData(outlineStipplePattern);
 
 			dtmInfo.contourSettingsCount = 2u;
@@ -3587,7 +3588,7 @@ class ComputerAidedDesign final : public examples::SimpleWindowedApplication, pu
 			dtmInfo.contourSettings[0u].endHeight = 90;
 			dtmInfo.contourSettings[0u].heightInterval = 10;
 			dtmInfo.contourSettings[0u].lineStyleInfo.screenSpaceLineWidth = 0.0f;
-			dtmInfo.contourSettings[0u].lineStyleInfo.worldSpaceLineWidth = 1.0f;
+			dtmInfo.contourSettings[0u].lineStyleInfo.worldSpaceLineWidth = 3.0f;
 			dtmInfo.contourSettings[0u].lineStyleInfo.color = float32_t4(0.0f, 0.0f, 1.0f, 0.7f);
 			std::array<double, 4> contourStipplePattern = { 0.0f, -5.0f, 10.0f, -5.0f };
 			dtmInfo.contourSettings[0u].lineStyleInfo.setStipplePatternData(contourStipplePattern);
@@ -3654,7 +3655,7 @@ class ComputerAidedDesign final : public examples::SimpleWindowedApplication, pu
 			worldSpaceExtents.y = (heightMapExtent.height - 1) * HeightMapCellWidth;
 			const uint64_t heightMapTextureID = 0ull;
 
-			constexpr bool DrawGridOnly = true;
+			constexpr bool DrawGridOnly = false;
 			
 			if(DrawGridOnly)
 			{
diff --git a/62_CAD/shaders/main_pipeline/dtm.hlsl b/62_CAD/shaders/main_pipeline/dtm.hlsl
index a7697cf7f..c6b95a5c8 100644
--- a/62_CAD/shaders/main_pipeline/dtm.hlsl
+++ b/62_CAD/shaders/main_pipeline/dtm.hlsl
@@ -234,8 +234,8 @@ float calculateDTMContourSDF(in DTMContourSettings contourSettings, in LineStyle
 {
     float distance = nbl::hlsl::numeric_limits<float>::max;
     const float contourThickness = (contourStyle.screenSpaceLineWidth + contourStyle.worldSpaceLineWidth * globals.screenToWorldRatio) * 0.5f;
-    float stretch = 1.0f;
-    float phaseShift = 0.0f;
+    const float stretch = 1.0f;
+    const float phaseShift = 0.0f;
 
     // TODO: move to ubo or push constants
     const float startHeight = contourSettings.contourLinesStartHeight;
@@ -252,7 +252,6 @@ float calculateDTMContourSDF(in DTMContourSettings contourSettings, in LineStyle
 
     int contourLinePointsIdx = 0;
     float2 contourLinePoints[2];
-    // TODO: case where heights we are looking for are on all three vertices
     for (int i = 0; i < 3; ++i)
     {
         if (contourLinePointsIdx == 2)
@@ -362,44 +361,32 @@ float4 calculateDTMOutlineColor(in uint outlineLineStyleIdx, in float3 v[3], in
     return outputColor;
 }
 
-// It's literally sdf with 2 line shapes
-float4 calculateGridDTMOutlineColor(in uint outlineLineStyleIdx, in nbl::hlsl::shapes::Line<float> outlineLineSegments[2], in float2 fragPos, in float phaseShift)
+// TODO:
+// It's literally sdf with a line shape
+// so it should be moved somewhere else and used for every line maybe
+float calculateLineSDF(in LineStyle lineStyle, in nbl::hlsl::shapes::Line<float> lineSegment, in float2 fragPos, in float phaseShift)
 {
-    LineStyle outlineStyle = loadLineStyle(outlineLineStyleIdx);
-    const float outlineThickness = (outlineStyle.screenSpaceLineWidth + outlineStyle.worldSpaceLineWidth * globals.screenToWorldRatio) * 0.5f;
+    const float outlineThickness = (lineStyle.screenSpaceLineWidth + lineStyle.worldSpaceLineWidth * globals.screenToWorldRatio) * 0.5f;
     const float stretch = 1.0f;
 
-    // find distance to outline
     float minDistance = nbl::hlsl::numeric_limits<float>::max;
-    if (!outlineStyle.hasStipples() || stretch == InvalidStyleStretchValue)
+    if (!lineStyle.hasStipples() || stretch == InvalidStyleStretchValue)
     {
-        for (int i = 0; i < 2; ++i)
-        {
-            float distance = nbl::hlsl::numeric_limits<float>::max;
-            distance = ClippedSignedDistance<nbl::hlsl::shapes::Line<float> >::sdf(outlineLineSegments[i], fragPos, outlineThickness, outlineStyle.isRoadStyleFlag);
-
-            minDistance = min(minDistance, distance);
-        }
+        float distance = nbl::hlsl::numeric_limits<float>::max;
+        distance = ClippedSignedDistance<nbl::hlsl::shapes::Line<float> >::sdf(lineSegment, fragPos, outlineThickness, lineStyle.isRoadStyleFlag);
+        minDistance = min(minDistance, distance);
     }
     else
     {
-        for (int i = 0; i < 2; ++i)
-        {
-            float distance = nbl::hlsl::numeric_limits<float>::max;
-            nbl::hlsl::shapes::Line<float>::ArcLengthCalculator arcLenCalc = nbl::hlsl::shapes::Line<float>::ArcLengthCalculator::construct(outlineLineSegments[i]);
-            LineStyleClipper clipper = LineStyleClipper::construct(outlineStyle, outlineLineSegments[i], arcLenCalc, phaseShift, stretch, globals.worldToScreenRatio);
-            distance = ClippedSignedDistance<nbl::hlsl::shapes::Line<float>, LineStyleClipper>::sdf(outlineLineSegments[i], fragPos, outlineThickness, outlineStyle.isRoadStyleFlag, clipper);
+        float distance = nbl::hlsl::numeric_limits<float>::max;
+        nbl::hlsl::shapes::Line<float>::ArcLengthCalculator arcLenCalc = nbl::hlsl::shapes::Line<float>::ArcLengthCalculator::construct(lineSegment);
+        LineStyleClipper clipper = LineStyleClipper::construct(lineStyle, lineSegment, arcLenCalc, phaseShift, stretch, globals.worldToScreenRatio);
+        distance = ClippedSignedDistance<nbl::hlsl::shapes::Line<float>, LineStyleClipper>::sdf(lineSegment, fragPos, outlineThickness, lineStyle.isRoadStyleFlag, clipper);
 
-            minDistance = min(minDistance, distance);
-        }
+        minDistance = min(minDistance, distance);
     }
 
-    float4 outputColor;
-    outputColor.a = 1.0f - smoothstep(-globals.antiAliasingFactor, globals.antiAliasingFactor, minDistance);
-    outputColor.a *= outlineStyle.color.a;
-    outputColor.rgb = outlineStyle.color.rgb;
-
-    return outputColor;
+    return minDistance;
 }
 
 float4 blendUnder(in float4 dstColor, in float4 srcColor)
@@ -445,6 +432,7 @@ E_CELL_DIAGONAL resolveGridDTMCellDiagonal(in uint32_t4 cellData)
 struct GridDTMTriangle
 {
     float3 vertices[3];
+    bool isValid;
 };
 
 /**
@@ -519,6 +507,9 @@ GridDTMCell calculateCellTriangles(in dtm::GridDTMHeightMapData heightData, in f
         output.triangleB.vertices[2] = float3(gridSpaceCellTopLeftCoords.x + cellWidth, gridSpaceCellTopLeftCoords.y + cellWidth, heightData.heights.y);
     }
 
+    output.triangleA.isValid = !(any(isnan(output.triangleA.vertices[0])) || any(isnan(output.triangleA.vertices[1])) || any(isnan(output.triangleA.vertices[2])));
+    output.triangleB.isValid = !(any(isnan(output.triangleB.vertices[0])) || any(isnan(output.triangleB.vertices[1])) || any(isnan(output.triangleB.vertices[2])));
+
     // move from grid space to screen space
     [unroll]
     for (int i = 0; i < 3; ++i)
diff --git a/62_CAD/shaders/main_pipeline/fragment_shader.hlsl b/62_CAD/shaders/main_pipeline/fragment_shader.hlsl
index 1cdc7fe63..b6a29d86f 100644
--- a/62_CAD/shaders/main_pipeline/fragment_shader.hlsl
+++ b/62_CAD/shaders/main_pipeline/fragment_shader.hlsl
@@ -472,7 +472,12 @@ float4 fragMain(PSInput input) : SV_TARGET
                 outlineLineSegments[1].P0 = float32_t2(nearestLineRemainingCoords.x, horizontalBounds.x);
                 outlineLineSegments[1].P1 = float32_t2(nearestLineRemainingCoords.x, horizontalBounds.y);
                 
-                float4 dtmColor = dtm::calculateGridDTMOutlineColor(dtmSettings.outlineLineStyleIdx, outlineLineSegments, input.position.xy, 0.0f);
+                LineStyle outlineStyle = loadLineStyle(dtmSettings.outlineLineStyleIdx);
+                float sdf = dtm::calculateLineSDF(outlineStyle, outlineLineSegments[0], input.position.xy, 0.0f);
+                sdf = min(sdf, dtm::calculateLineSDF(outlineStyle, outlineLineSegments[1], input.position.xy, 0.0f));
+
+                float4 dtmColor = outlineStyle.color;
+                dtmColor.a *= 1.0f - smoothstep(-globals.antiAliasingFactor, globals.antiAliasingFactor, sdf);
                 
                 textureColor = dtmColor.rgb;
                 localAlpha = dtmColor.a;
@@ -489,6 +494,7 @@ float4 fragMain(PSInput input) : SV_TARGET
                 dtm::GridDTMTriangle triangles[MaxTrianglesToDoSDFWith];
                 float interpolatedHeights[MaxTrianglesToDoSDFWith]; // these are height based on barycentric interpolation of current pixel with all the triangles above
                 uint32_t triangleCount = 0u;
+                uint32_t currentTriangleIndex = nbl::hlsl::numeric_limits<uint32_t>::max;
                 
                 // We can do sdf for up to 4 maximum lines for the outlines, 2 belong to the current cell and the other 2 belong to the opposite neighbouring cell
                 /* Example:
@@ -505,8 +511,8 @@ float4 fragMain(PSInput input) : SV_TARGET
                 
                 // curr cell horizontal, curr cell vertical, opposite cell horizontal, opposite cell vertical 
                 bool4 linesValidity = bool4(false, false, false, false);
-                
-                [unroll]
+
+                //[unroll]
                 for (int i = 0; i < 2; ++i)
                 {
                     for (int j = 0; j < 2; ++j)
@@ -551,7 +557,7 @@ float4 fragMain(PSInput input) : SV_TARGET
                         }
                     }
                 }
-                
+
                 // float heightDeriv = fwidth(height);
                 // For height shading, merge this loop with the previous one, because baryCoord all positive means point inside triangle and we can use that to figure out the triangle we want to do height shading for.
                 for (int t = 0; t < triangleCount; ++t)
@@ -559,6 +565,11 @@ float4 fragMain(PSInput input) : SV_TARGET
                     dtm::GridDTMTriangle tri = triangles[t];
                     const float3 baryCoord = dtm::calculateDTMTriangleBarycentrics(tri.vertices[0].xy, tri.vertices[1].xy, tri.vertices[2].xy, input.position.xy);
                     interpolatedHeights[t] = baryCoord.x * tri.vertices[0].z + baryCoord.y * tri.vertices[1].z + baryCoord.z * tri.vertices[2].z;
+
+                    const float minValue = 0.0f - nbl::hlsl::numeric_limits<float>::epsilon;
+                    const float maxValue = 1.0f + nbl::hlsl::numeric_limits<float>::epsilon;
+                    if (all(baryCoord >= minValue) && all(baryCoord <= maxValue))
+                        currentTriangleIndex = t;
                 }
 
                 float4 dtmColor = float4(0.0f, 0.0f, 0.0f, 0.0f);
@@ -599,9 +610,8 @@ float4 fragMain(PSInput input) : SV_TARGET
                 {
                     float sdf = nbl::hlsl::numeric_limits<float>::max;
                     LineStyle outlineStyle = loadLineStyle(dtmSettings.outlineLineStyleIdx);
-                    const float outlineThickness = (outlineStyle.screenSpaceLineWidth + outlineStyle.worldSpaceLineWidth * globals.screenToWorldRatio) * 0.5f;
                     nbl::hlsl::shapes::Line<float> lineSegment;
-                    
+
                     // Doing SDF of outlines as if cooridnate system is centered around the nearest corner of the cell
                     float2 currentCellScreenspaceCoord = gridTopLeftCorner + (currentCellCoord + float2(roundedLocalUV)) * cellWidth;
                     float2 localFragPos = input.position.xy - currentCellScreenspaceCoord;
@@ -613,32 +623,28 @@ float4 fragMain(PSInput input) : SV_TARGET
                         // this cells horizontal line
                         lineSegment.P0 = float2(-offset.x, 0.0f) * cellWidth;
                         lineSegment.P1 = float2(0.0f, 0.0f);
-                        float distance = ClippedSignedDistance<nbl::hlsl::shapes::Line<float> >::sdf(lineSegment, localFragPos, outlineThickness, outlineStyle.isRoadStyleFlag);
-                        sdf = min(sdf, distance);
+                        sdf = min(sdf, dtm::calculateLineSDF(outlineStyle, lineSegment, localFragPos, 0.0f));
                     }
                     if (linesValidity[1])
                     {
                         // this cells vertical line
                         lineSegment.P0 = float2(0.0f, -offset.y) * cellWidth;
                         lineSegment.P1 = float2(0.0f, 0.0f);
-                        float distance = ClippedSignedDistance<nbl::hlsl::shapes::Line<float> >::sdf(lineSegment, localFragPos, outlineThickness, outlineStyle.isRoadStyleFlag);
-                        sdf = min(sdf, distance);
+                        sdf = min(sdf, dtm::calculateLineSDF(outlineStyle, lineSegment, localFragPos, 0.0f));
                     }
                     if (linesValidity[2])
                     {
                         // opposite cell horizontal line
-                        lineSegment.P0 = float2(offset.x, 0.0f) * cellWidth;
-                        lineSegment.P1 = float2(0.0f, 0.0f);
-                        float distance = ClippedSignedDistance<nbl::hlsl::shapes::Line<float> >::sdf(lineSegment, localFragPos, outlineThickness, outlineStyle.isRoadStyleFlag);
-                        sdf = min(sdf, distance);
+                        lineSegment.P0 = float2(0.0f, 0.0f);
+                        lineSegment.P1 = float2(offset.x, 0.0f) * cellWidth;
+                        sdf = min(sdf, dtm::calculateLineSDF(outlineStyle, lineSegment, localFragPos, 0.0f));
                     }
                     if (linesValidity[3])
                     {
                         // opposite cell vertical line
-                        lineSegment.P0 = float2(0.0f, offset.y) * cellWidth;
-                        lineSegment.P1 = float2(0.0f, 0.0f);
-                        float distance = ClippedSignedDistance<nbl::hlsl::shapes::Line<float> >::sdf(lineSegment, localFragPos, outlineThickness, outlineStyle.isRoadStyleFlag);
-                        sdf = min(sdf, distance);
+                        lineSegment.P0 = float2(0.0f, 0.0f);
+                        lineSegment.P1 = float2(0.0f, offset.y) * cellWidth;
+                        sdf = min(sdf, dtm::calculateLineSDF(outlineStyle, lineSegment, localFragPos, 0.0f));
                     }
 
                     float4 outlineColor = outlineStyle.color;
@@ -650,6 +656,31 @@ float4 fragMain(PSInput input) : SV_TARGET
                 //localAlpha = 0.4f;
 
                 // TODO: Handle height shading, using only current triangle (if valid)
+
+                if (dtmSettings.drawHeightShadingEnabled())
+                {
+                    // Establish which triangle is current pixel inside of.
+                    // If the triangle is valid then do height shading
+                    // if the triangle is invalid, then "fade" color of neighbouring valid triangles, to avoid aliasing
+
+
+                    if (currentTriangleIndex != nbl::hlsl::numeric_limits<uint32_t>::max)
+                    {
+                        dtm::GridDTMTriangle currentTriangle = triangles[currentTriangleIndex];
+
+                        if (currentTriangle.isValid)
+                        {
+                            float heightDeriv = fwidth(interpolatedHeights[currentTriangleIndex]); // TODO: is it a good place for `fwidth` call?
+                            dtmColor = dtm::blendUnder(dtmColor, dtm::calculateDTMHeightColor(dtmSettings.heightShadingSettings, currentTriangle.vertices, heightDeriv, input.position.xy, interpolatedHeights[currentTriangleIndex]));
+                        }
+                        else
+                        {
+
+                        }
+                        
+                    }
+
+                }
                 
                 textureColor = dtmColor.rgb / dtmColor.a;
                 localAlpha = dtmColor.a;
diff --git a/62_CAD/shaders/main_pipeline/vertex_shader.hlsl b/62_CAD/shaders/main_pipeline/vertex_shader.hlsl
index fd327e7fd..30283885e 100644
--- a/62_CAD/shaders/main_pipeline/vertex_shader.hlsl
+++ b/62_CAD/shaders/main_pipeline/vertex_shader.hlsl
@@ -652,6 +652,7 @@ PSInput main(uint vertexID : SV_VertexID)
             float gridCellWidth = vk::RawBufferLoad<float>(globals.pointers.geometryBuffer + drawObj.geometryAddress + 2 * sizeof(pfloat64_t2) + sizeof(uint32_t), 8u);
             float thicknessOfTheThickestLine = vk::RawBufferLoad<float>(globals.pointers.geometryBuffer + drawObj.geometryAddress + 2 * sizeof(pfloat64_t2) + sizeof(uint32_t) + sizeof(float), 8u);
 
+            // TODO: remove
             // test large dilation
             //thicknessOfTheThickestLine += 200.0f;
 

From 91de83775b80345d579cf09e75696604711635ee Mon Sep 17 00:00:00 2001
From: Erfan Ahmadi <ahmadierfan99@gmail.com>
Date: Thu, 26 Jun 2025 13:06:23 +0400
Subject: [PATCH 437/529] outline phaseShift

---
 .../main_pipeline/fragment_shader.hlsl        | 31 +++++++++++--------
 1 file changed, 18 insertions(+), 13 deletions(-)

diff --git a/62_CAD/shaders/main_pipeline/fragment_shader.hlsl b/62_CAD/shaders/main_pipeline/fragment_shader.hlsl
index b6a29d86f..0a28f07b6 100644
--- a/62_CAD/shaders/main_pipeline/fragment_shader.hlsl
+++ b/62_CAD/shaders/main_pipeline/fragment_shader.hlsl
@@ -614,37 +614,42 @@ float4 fragMain(PSInput input) : SV_TARGET
 
                     // Doing SDF of outlines as if cooridnate system is centered around the nearest corner of the cell
                     float2 currentCellScreenspaceCoord = gridTopLeftCorner + (currentCellCoord + float2(roundedLocalUV)) * cellWidth;
+                    // We do sdf in corner's local coordinate, so we subtract currentCellScreenspaceCoord from fragmentPos and topLeftGrid 
                     float2 localFragPos = input.position.xy - currentCellScreenspaceCoord;
+                    float2 localGridTopLeftCorner = gridTopLeftCorner - currentCellScreenspaceCoord;
                     
+                    float phaseShift = 0.0f;
+                    const bool hasStipples = outlineStyle.hasStipples();
+                    const float rcpPattenLenScreenSpace =  outlineStyle.reciprocalStipplePatternLen * globals.worldToScreenRatio;
                     // Drawing the lines that form a plus sign around the current corner:
-                    // TODO: Also make this a unrolled loop to reduce LOC
                     if (linesValidity[0])
                     {
                         // this cells horizontal line
-                        lineSegment.P0 = float2(-offset.x, 0.0f) * cellWidth;
-                        lineSegment.P1 = float2(0.0f, 0.0f);
-                        sdf = min(sdf, dtm::calculateLineSDF(outlineStyle, lineSegment, localFragPos, 0.0f));
+                        lineSegment.P0 = float2((offset.x > 0) ? -offset.x * cellWidth : 0.0f, 0.0f);
+                        lineSegment.P1 = float2((offset.x < 0) ? -offset.x * cellWidth : 0.0f, 0.0f);
+                        phaseShift = fract((lineSegment.P0.x - localGridTopLeftCorner.x) * rcpPattenLenScreenSpace );
+                        sdf = min(sdf, dtm::calculateLineSDF(outlineStyle, lineSegment, localFragPos, phaseShift));
                     }
                     if (linesValidity[1])
                     {
                         // this cells vertical line
-                        lineSegment.P0 = float2(0.0f, -offset.y) * cellWidth;
-                        lineSegment.P1 = float2(0.0f, 0.0f);
-                        sdf = min(sdf, dtm::calculateLineSDF(outlineStyle, lineSegment, localFragPos, 0.0f));
+                        lineSegment.P0 = float2(0.0f, (offset.y > 0) ? -offset.y * cellWidth : 0.0f);
+                        lineSegment.P1 = float2(0.0f, (offset.y < 0) ? -offset.y * cellWidth : 0.0f);
+                        sdf = min(sdf, dtm::calculateLineSDF(outlineStyle, lineSegment, localFragPos, phaseShift));
                     }
                     if (linesValidity[2])
                     {
                         // opposite cell horizontal line
-                        lineSegment.P0 = float2(0.0f, 0.0f);
-                        lineSegment.P1 = float2(offset.x, 0.0f) * cellWidth;
-                        sdf = min(sdf, dtm::calculateLineSDF(outlineStyle, lineSegment, localFragPos, 0.0f));
+                        lineSegment.P0 = float2((offset.x < 0) ? offset.x * cellWidth : 0.0f, 0.0f);
+                        lineSegment.P1 = float2((offset.x > 0) ? offset.x * cellWidth : 0.0f, 0.0f);
+                        sdf = min(sdf, dtm::calculateLineSDF(outlineStyle, lineSegment, localFragPos, phaseShift));
                     }
                     if (linesValidity[3])
                     {
                         // opposite cell vertical line
-                        lineSegment.P0 = float2(0.0f, 0.0f);
-                        lineSegment.P1 = float2(0.0f, offset.y) * cellWidth;
-                        sdf = min(sdf, dtm::calculateLineSDF(outlineStyle, lineSegment, localFragPos, 0.0f));
+                        lineSegment.P0 = float2(0.0f, (offset.y < 0) ? offset.y * cellWidth : 0.0f);
+                        lineSegment.P1 = float2(0.0f, (offset.y > 0) ? offset.y * cellWidth : 0.0f);
+                        sdf = min(sdf, dtm::calculateLineSDF(outlineStyle, lineSegment, localFragPos, phaseShift));
                     }
 
                     float4 outlineColor = outlineStyle.color;

From 1a6123018349cc3f51b2f1fe44bc0b8213123bb7 Mon Sep 17 00:00:00 2001
From: Erfan Ahmadi <ahmadierfan99@gmail.com>
Date: Thu, 26 Jun 2025 13:08:02 +0400
Subject: [PATCH 438/529] Forgot to actually set the phaseshift

---
 62_CAD/shaders/main_pipeline/fragment_shader.hlsl | 7 +++++--
 1 file changed, 5 insertions(+), 2 deletions(-)

diff --git a/62_CAD/shaders/main_pipeline/fragment_shader.hlsl b/62_CAD/shaders/main_pipeline/fragment_shader.hlsl
index 0a28f07b6..5964917f7 100644
--- a/62_CAD/shaders/main_pipeline/fragment_shader.hlsl
+++ b/62_CAD/shaders/main_pipeline/fragment_shader.hlsl
@@ -620,14 +620,14 @@ float4 fragMain(PSInput input) : SV_TARGET
                     
                     float phaseShift = 0.0f;
                     const bool hasStipples = outlineStyle.hasStipples();
-                    const float rcpPattenLenScreenSpace =  outlineStyle.reciprocalStipplePatternLen * globals.worldToScreenRatio;
+                    const float rcpPattenLenScreenSpace = outlineStyle.reciprocalStipplePatternLen * globals.worldToScreenRatio;
                     // Drawing the lines that form a plus sign around the current corner:
                     if (linesValidity[0])
                     {
                         // this cells horizontal line
                         lineSegment.P0 = float2((offset.x > 0) ? -offset.x * cellWidth : 0.0f, 0.0f);
                         lineSegment.P1 = float2((offset.x < 0) ? -offset.x * cellWidth : 0.0f, 0.0f);
-                        phaseShift = fract((lineSegment.P0.x - localGridTopLeftCorner.x) * rcpPattenLenScreenSpace );
+                        phaseShift = fract((lineSegment.P0.x - localGridTopLeftCorner.x) * rcpPattenLenScreenSpace);
                         sdf = min(sdf, dtm::calculateLineSDF(outlineStyle, lineSegment, localFragPos, phaseShift));
                     }
                     if (linesValidity[1])
@@ -635,6 +635,7 @@ float4 fragMain(PSInput input) : SV_TARGET
                         // this cells vertical line
                         lineSegment.P0 = float2(0.0f, (offset.y > 0) ? -offset.y * cellWidth : 0.0f);
                         lineSegment.P1 = float2(0.0f, (offset.y < 0) ? -offset.y * cellWidth : 0.0f);
+                        phaseShift = fract((lineSegment.P0.y - localGridTopLeftCorner.y) * rcpPattenLenScreenSpace);
                         sdf = min(sdf, dtm::calculateLineSDF(outlineStyle, lineSegment, localFragPos, phaseShift));
                     }
                     if (linesValidity[2])
@@ -642,6 +643,7 @@ float4 fragMain(PSInput input) : SV_TARGET
                         // opposite cell horizontal line
                         lineSegment.P0 = float2((offset.x < 0) ? offset.x * cellWidth : 0.0f, 0.0f);
                         lineSegment.P1 = float2((offset.x > 0) ? offset.x * cellWidth : 0.0f, 0.0f);
+                        phaseShift = fract((lineSegment.P0.x - localGridTopLeftCorner.x) * rcpPattenLenScreenSpace);
                         sdf = min(sdf, dtm::calculateLineSDF(outlineStyle, lineSegment, localFragPos, phaseShift));
                     }
                     if (linesValidity[3])
@@ -649,6 +651,7 @@ float4 fragMain(PSInput input) : SV_TARGET
                         // opposite cell vertical line
                         lineSegment.P0 = float2(0.0f, (offset.y < 0) ? offset.y * cellWidth : 0.0f);
                         lineSegment.P1 = float2(0.0f, (offset.y > 0) ? offset.y * cellWidth : 0.0f);
+                        phaseShift = fract((lineSegment.P0.y - localGridTopLeftCorner.y) * rcpPattenLenScreenSpace);
                         sdf = min(sdf, dtm::calculateLineSDF(outlineStyle, lineSegment, localFragPos, phaseShift));
                     }
 

From ed437ebdb9344274bb94ffa56a17dea81abe4eed Mon Sep 17 00:00:00 2001
From: Erfan Ahmadi <ahmadierfan99@gmail.com>
Date: Thu, 26 Jun 2025 16:35:17 +0400
Subject: [PATCH 439/529] DTM Contour Fixes and GRID DTM refactor

---
 62_CAD/shaders/main_pipeline/dtm.hlsl         |  21 +-
 .../main_pipeline/fragment_shader.hlsl        | 190 +++---------------
 2 files changed, 31 insertions(+), 180 deletions(-)

diff --git a/62_CAD/shaders/main_pipeline/dtm.hlsl b/62_CAD/shaders/main_pipeline/dtm.hlsl
index 95ec63f69..40ce36987 100644
--- a/62_CAD/shaders/main_pipeline/dtm.hlsl
+++ b/62_CAD/shaders/main_pipeline/dtm.hlsl
@@ -237,12 +237,9 @@ float calculateDTMContourSDF(in DTMContourSettings contourSettings, in LineStyle
     const float stretch = 1.0f;
     const float phaseShift = 0.0f;
 
-    // TODO: move to ubo or push constants
     const float startHeight = contourSettings.contourLinesStartHeight;
     const float endHeight = contourSettings.contourLinesEndHeight;
     const float interval = contourSettings.contourLinesHeightInterval;
-
-    // TODO: can be precomputed
     const int maxContourLineIdx = (endHeight - startHeight) / interval;
 
     // TODO: it actually can output a negative number, fix
@@ -263,15 +260,10 @@ float calculateDTMContourSDF(in DTMContourSettings contourSettings, in LineStyle
         if (p1.z < p0.z)
             nbl::hlsl::swap(p0, p1);
 
-        float minHeight = p0.z;
-        float maxHeight = p1.z;
-
-        if (height >= minHeight && height <= maxHeight)
+        if (contourLineHeight >= p0.z && contourLineHeight <= p1.z)
         {
-            float2 edge = float2(p1.x, p1.y) - float2(p0.x, p0.y);
-            float scale = (contourLineHeight - minHeight) / (maxHeight - minHeight);
-
-            contourLinePoints[contourLinePointsIdx] = scale * edge + float2(p0.x, p0.y);
+            float interpolationVal = (contourLineHeight - p0.z) / (p1.z - p0.z);
+            contourLinePoints[contourLinePointsIdx] = p0.xy + interpolationVal * (p1.xy - p0.xy);
             ++contourLinePointsIdx;
         }
     }
@@ -432,7 +424,6 @@ E_CELL_DIAGONAL resolveGridDTMCellDiagonal(in uint32_t4 cellData)
 struct GridDTMTriangle
 {
     float3 vertices[3];
-    bool isValid;
 };
 
 /**
@@ -449,6 +440,8 @@ struct GridDTMCell
 {
     GridDTMTriangle triangleA;
     GridDTMTriangle triangleB;
+    bool validA;
+    bool validB;
 };
 
 struct GridDTMHeightMapData
@@ -507,8 +500,8 @@ GridDTMCell calculateCellTriangles(in dtm::GridDTMHeightMapData heightData, in f
         output.triangleB.vertices[2] = float3(gridSpaceCellTopLeftCoords.x + cellWidth, gridSpaceCellTopLeftCoords.y + cellWidth, heightData.heights.y);
     }
 
-    output.triangleA.isValid = !(any(isnan(output.triangleA.vertices[0])) || any(isnan(output.triangleA.vertices[1])) || any(isnan(output.triangleA.vertices[2])));
-    output.triangleB.isValid = !(any(isnan(output.triangleB.vertices[0])) || any(isnan(output.triangleB.vertices[1])) || any(isnan(output.triangleB.vertices[2])));
+    output.validA = !(any(isInvalidGridDtmHeightValue(output.triangleA.vertices[0])) || any(isInvalidGridDtmHeightValue(output.triangleA.vertices[1])) || any(isInvalidGridDtmHeightValue(output.triangleA.vertices[2])));
+    output.validB = !(any(isInvalidGridDtmHeightValue(output.triangleB.vertices[0])) || any(isInvalidGridDtmHeightValue(output.triangleB.vertices[1])) || any(isInvalidGridDtmHeightValue(output.triangleB.vertices[2])));
 
     // move from grid space to screen space
     [unroll]
diff --git a/62_CAD/shaders/main_pipeline/fragment_shader.hlsl b/62_CAD/shaders/main_pipeline/fragment_shader.hlsl
index 5964917f7..0a7199627 100644
--- a/62_CAD/shaders/main_pipeline/fragment_shader.hlsl
+++ b/62_CAD/shaders/main_pipeline/fragment_shader.hlsl
@@ -405,20 +405,6 @@ float4 fragMain(PSInput input) : SV_TARGET
         }
         else if (objType == ObjectType::GRID_DTM)
         {
-            // NOTE: create and read from a texture as a last step, you can generate the height values procedurally from a function while you're working on the sdf stuff.
-            
-            // Query dtm settings
-            // use texture Gather to get 4 corners: https://learn.microsoft.com/en-us/windows/win32/direct3dhlsl/dx-graphics-hlsl-to-gather
-            // DONE (but needs to be fixed): A. the outlines can be stippled, use phaseshift of the line such that they started from the grid's origin worldspace coordinate
-            // DONE: B. the contours are computed for triangles, use the same function as for dtms, choose between the two triangles based on local UV coords in current cell
-                // DONE: Make it so we can choose which diagonal to use to construct the triangle, it's either u=v or u=1-v
-            // DONE: C. Height shading same as contours (split into two triangles)
-
-            // DONE (but needs to be tested after i implement texture height maps) Heights can have invalid values (let's say NaN) if a cell corner has NaN value then no triangle (for contour and shading) and no outline should include that corner. (see DTM image in discord with gaps)
-            
-            // TODO: we need to emulate dilation and do sdf of neighbouring cells as well. because contours, outlines and shading can bleed into other cells for AA.
-            // [NOTE] Do dilation as last step, when everything else works fine
-
             DTMSettings dtmSettings = loadDTMSettings(mainObj.dtmSettingsIdx);
 
             if (!dtmSettings.drawContourEnabled() && !dtmSettings.drawOutlineEnabled() && !dtmSettings.drawHeightShadingEnabled())
@@ -494,7 +480,6 @@ float4 fragMain(PSInput input) : SV_TARGET
                 dtm::GridDTMTriangle triangles[MaxTrianglesToDoSDFWith];
                 float interpolatedHeights[MaxTrianglesToDoSDFWith]; // these are height based on barycentric interpolation of current pixel with all the triangles above
                 uint32_t triangleCount = 0u;
-                uint32_t currentTriangleIndex = nbl::hlsl::numeric_limits<uint32_t>::max;
                 
                 // We can do sdf for up to 4 maximum lines for the outlines, 2 belong to the current cell and the other 2 belong to the opposite neighbouring cell
                 /* Example:
@@ -525,9 +510,10 @@ float4 fragMain(PSInput input) : SV_TARGET
                         {
                             dtm::GridDTMHeightMapData heightData = dtm::retrieveGridDTMCellDataFromHeightMap(gridDimensions, cellCoord, texturesU32[NonUniformResourceIndex(textureId)]);
                             dtm::GridDTMCell gridCellFormed = dtm::calculateCellTriangles(heightData, gridTopLeftCorner, cellCoord, cellWidth);
-                            // Check the validity of the triangles and only add if valid
-                            triangles[triangleCount++] = gridCellFormed.triangleA;
-                            triangles[triangleCount++] = gridCellFormed.triangleB;
+                            if (gridCellFormed.validA)
+                                triangles[triangleCount++] = gridCellFormed.triangleA;
+                            if (gridCellFormed.validB)
+                                triangles[triangleCount++] = gridCellFormed.triangleB;
 
                             // we just need to check and set lines validity
                             // Formulas to get current cell's horizontal and vertical lines validity
@@ -557,8 +543,9 @@ float4 fragMain(PSInput input) : SV_TARGET
                         }
                     }
                 }
-
-                // float heightDeriv = fwidth(height);
+                
+                const uint32_t InvalidTriangleIndex = nbl::hlsl::numeric_limits<uint32_t>::max;
+                uint32_t currentTriangleIndex = InvalidTriangleIndex;
                 // For height shading, merge this loop with the previous one, because baryCoord all positive means point inside triangle and we can use that to figure out the triangle we want to do height shading for.
                 for (int t = 0; t < triangleCount; ++t)
                 {
@@ -566,10 +553,13 @@ float4 fragMain(PSInput input) : SV_TARGET
                     const float3 baryCoord = dtm::calculateDTMTriangleBarycentrics(tri.vertices[0].xy, tri.vertices[1].xy, tri.vertices[2].xy, input.position.xy);
                     interpolatedHeights[t] = baryCoord.x * tri.vertices[0].z + baryCoord.y * tri.vertices[1].z + baryCoord.z * tri.vertices[2].z;
 
-                    const float minValue = 0.0f - nbl::hlsl::numeric_limits<float>::epsilon;
-                    const float maxValue = 1.0f + nbl::hlsl::numeric_limits<float>::epsilon;
-                    if (all(baryCoord >= minValue) && all(baryCoord <= maxValue))
-                        currentTriangleIndex = t;
+                    if (currentTriangleIndex == InvalidTriangleIndex)
+                    {
+                        const float minValue = 0.0f - nbl::hlsl::numeric_limits<float>::epsilon;
+                        const float maxValue = 1.0f + nbl::hlsl::numeric_limits<float>::epsilon;
+                        if (all(baryCoord >= minValue) && all(baryCoord <= maxValue))
+                            currentTriangleIndex = t;
+                    }
                 }
 
                 float4 dtmColor = float4(0.0f, 0.0f, 0.0f, 0.0f);
@@ -581,8 +571,9 @@ float4 fragMain(PSInput input) : SV_TARGET
                         float sdf = nbl::hlsl::numeric_limits<float>::max;
                         for (int t = 0; t < triangleCount; ++t)
                         {
-                            dtm::GridDTMTriangle tri = triangles[t];
-                            sdf = min(sdf, dtm::calculateDTMContourSDF(dtmSettings.contourSettings[i], contourStyle, tri.vertices, input.position.xy, interpolatedHeights[t]));
+                            const dtm::GridDTMTriangle tri = triangles[t];
+                            const float currentInterpolatedHeight = interpolatedHeights[t];
+                            sdf = min(sdf, dtm::calculateDTMContourSDF(dtmSettings.contourSettings[i], contourStyle, tri.vertices, input.position.xy, currentInterpolatedHeight));
 #if 0 // Debug Triangles
                             nbl::hlsl::shapes::Line<float> lineSegment;
                             lineSegment.P0 = tri.vertices[0].xy;
@@ -660,32 +651,17 @@ float4 fragMain(PSInput input) : SV_TARGET
                     dtmColor = dtm::blendUnder(dtmColor, outlineColor);
                 }
                 
-                //textureColor = float3(linesValidity[0], linesValidity[1], 0.0f);
-                //localAlpha = 0.4f;
-
-                // TODO: Handle height shading, using only current triangle (if valid)
-
                 if (dtmSettings.drawHeightShadingEnabled())
                 {
-                    // Establish which triangle is current pixel inside of.
-                    // If the triangle is valid then do height shading
-                    // if the triangle is invalid, then "fade" color of neighbouring valid triangles, to avoid aliasing
-
-
-                    if (currentTriangleIndex != nbl::hlsl::numeric_limits<uint32_t>::max)
+                    if (currentTriangleIndex != InvalidTriangleIndex)
                     {
                         dtm::GridDTMTriangle currentTriangle = triangles[currentTriangleIndex];
-
-                        if (currentTriangle.isValid)
-                        {
-                            float heightDeriv = fwidth(interpolatedHeights[currentTriangleIndex]); // TODO: is it a good place for `fwidth` call?
-                            dtmColor = dtm::blendUnder(dtmColor, dtm::calculateDTMHeightColor(dtmSettings.heightShadingSettings, currentTriangle.vertices, heightDeriv, input.position.xy, interpolatedHeights[currentTriangleIndex]));
-                        }
-                        else
-                        {
-
-                        }
-                        
+                        float heightDeriv = fwidth(interpolatedHeights[currentTriangleIndex]);
+                        dtmColor = dtm::blendUnder(dtmColor, dtm::calculateDTMHeightColor(dtmSettings.heightShadingSettings, currentTriangle.vertices, heightDeriv, input.position.xy, interpolatedHeights[currentTriangleIndex]));
+                    }
+                    else
+                    {
+                        // TODO[Future]: Average color of nearby valid triangles (dtm height function should return color + polygon sdf) 
                     }
 
                 }
@@ -693,125 +669,7 @@ float4 fragMain(PSInput input) : SV_TARGET
                 textureColor = dtmColor.rgb / dtmColor.a;
                 localAlpha = dtmColor.a;
             }
-            
-#if 0
-            // calculate screen space coordinates of vertices of the current tiranlge within the grid
-            dtm::GridDTMTriangle currentTriangle;
-            dtm::GridDTMCell neighbouringCells[8];
-            if (dtmSettings.drawContourEnabled() || dtmSettings.drawHeightShadingEnabled())
-            {
-                if (textureId == InvalidTextureIndex)
-                    discard;
-
-                // heightData.heihts.x - bottom left texel
-                // heightData.heihts.y - bottom right texel
-                // heightData.heihts.z - top right texel
-                // heightData.heihts.w - top left texel
-                dtm::GridDTMHeightMapData heightData = dtm::retrieveGridDTMCellDataFromHeightMap(gridExtents, currentCellCoord, cellWidth, texturesU32[NonUniformResourceIndex(textureId)]);
-                if (heightData.cellDiagonal == E_CELL_DIAGONAL::INVALID)
-                    discard;
-
-                const bool diagonalFromTopLeftToBottomRight = heightData.cellDiagonal == E_CELL_DIAGONAL::TOP_LEFT_TO_BOTTOM_RIGHT;
-
-                float2 insideCellCoord = gridSpacePos - float2(cellWidth, cellWidth) * currentCellCoord; // TODO: use fmod instead?
-                // my ASCII art above explains which triangle is A and which is B
-                const bool triangleA = diagonalFromTopLeftToBottomRight ?
-                    insideCellCoord.x < insideCellCoord.y :
-                    insideCellCoord.x < cellWidth - insideCellCoord.y;
-
-                if (diagonalFromTopLeftToBottomRight)
-                {
-                    currentTriangle.vertices[0] = float3(gridSpaceCellTopLeftCoords.x, gridSpaceCellTopLeftCoords.y, heightData.heights.w);
-                    currentTriangle.vertices[1] = float3(gridSpaceCellTopLeftCoords.x + cellWidth, gridSpaceCellTopLeftCoords.y + cellWidth, heightData.heights.y);
-                    currentTriangle.vertices[2] = triangleA ? float3(gridSpaceCellTopLeftCoords.x, gridSpaceCellTopLeftCoords.y + cellWidth, heightData.heights.x) : float3(gridSpaceCellTopLeftCoords.x + cellWidth, gridSpaceCellTopLeftCoords.y, heightData.heights.z);
-
-                    // TODO: use cell space instead https://github.com/Devsh-Graphics-Programming/Nabla-Examples-and-Tests/pull/186#discussion_r2133699055
-                    //currentTriangle.vertices[0] = float3(0.0f, 0.0f, heightData.heights.w);
-                    //currentTriangle.vertices[1] = float3(cellWidth, cellWidth, heightData.heights.y);
-                    //currentTriangle.vertices[2] = triangleA ? float3(0.0f, cellWidth, heightData.heights.x) : float3(cellWidth, 0.0f, heightData.heights.z);
-                }
-                else
-                {
-                    currentTriangle.vertices[0] = float3(gridSpaceCellTopLeftCoords.x, gridSpaceCellTopLeftCoords.y + cellWidth, heightData.heights.x);
-                    currentTriangle.vertices[1] = float3(gridSpaceCellTopLeftCoords.x + cellWidth, gridSpaceCellTopLeftCoords.y, heightData.heights.z);
-                    currentTriangle.vertices[2] = triangleA ? float3(gridSpaceCellTopLeftCoords.x, gridSpaceCellTopLeftCoords.y, heightData.heights.w) : float3(gridSpaceCellTopLeftCoords.x + cellWidth, gridSpaceCellTopLeftCoords.y + cellWidth, heightData.heights.y);
-
-                    // TODO: use cell space instead https://github.com/Devsh-Graphics-Programming/Nabla-Examples-and-Tests/pull/186#discussion_r2133699055
-                    //currentTriangle.vertices[0] = float3(0.0f, 0.0f + cellWidth, heightData.heights.x);
-                    //currentTriangle.vertices[1] = float3(0.0f + cellWidth, 0.0f, heightData.heights.z);
-                    //currentTriangle.vertices[2] = triangleA ? float3(0.0f, 0.0f, heightData.heights.w) : float3(cellWidth, cellWidth, heightData.heights.y);
-                }
-
-                bool isTriangleInvalid = isnan(currentTriangle.vertices[0].z) || isnan(currentTriangle.vertices[1].z) || isnan(currentTriangle.vertices[2].z);
-                bool isCellPartiallyInvalid = isnan(heightData.heights.x) || isnan(heightData.heights.y) || isnan(heightData.heights.z) || isnan(heightData.heights.w);
-
-                if (isTriangleInvalid)
-                    discard;
-
-                // move from grid space to screen space
-                [unroll]
-                for (int i = 0; i < 3; ++i)
-                    currentTriangle.vertices[i].xy += gridTopLeftCorner;
-
-                const float2 neighbouringCellsCellOffsets[8] = {
-                    float2(-1.0f, -1.0f),
-                    float2(0.0f, -1.0f),
-                    float2(1.0f, -1.0f),
-                    float2(-1.0f, 0.0f),
-                    float2(-1.0f, 0.0f),
-                    float2(-1.0f, 1.0f),
-                    float2(0.0f, 1.0f),
-                    float2(1.0f, 1.0f)
-                };
-
-                // construct triangles of neighbouring cells
-                for (int i = 0; i < 8; ++i)
-                {
-                    float2 neighbouringcurrentCellCoord = currentCellCoord + neighbouringCellsCellOffsets[i];
-                    neighbouringCells[i] = dtm::calculateCellTriangles(gridTopLeftCorner, gridExtents, neighbouringcurrentCellCoord, cellWidth, texturesU32[NonUniformResourceIndex(textureId)]);
-                }
-            }
-
-            const float3 baryCoord = dtm::calculateDTMTriangleBarycentrics(currentTriangle.vertices[0].xy, currentTriangle.vertices[1].xy, currentTriangle.vertices[2].xy, input.position.xy);
-            float height = baryCoord.x * currentTriangle.vertices[0].z + baryCoord.y * currentTriangle.vertices[1].z + baryCoord.z * currentTriangle.vertices[2].z;
-            float heightDeriv = fwidth(height);
-
-            const bool outOfBoundsUV = uv.x < 0.0f || uv.y < 0.0f || uv.x > 1.0f || uv.y > 1.0f;
-            float4 dtmColor = float4(0.0f, 0.0f, 0.0f, 0.0f);
-            if (dtmSettings.drawContourEnabled() && !outOfBoundsUV)
-            {
-                for (int i = dtmSettings.contourSettingsCount-1u; i >= 0; --i) 
-                    dtmColor = dtm::blendUnder(dtmColor, dtm::calculateDTMContourColor(dtmSettings.contourSettings[i], currentTriangle.vertices, input.position.xy, height));
 
-                // draw shit form neighbouring cells
-                for (int i = 0; i < 8; ++i)
-                {
-                    for (int j = dtmSettings.contourSettingsCount - 1u; j >= 0; --j)
-                    {
-                        dtmColor = dtm::blendUnder(dtmColor, dtm::calculateDTMContourColor(dtmSettings.contourSettings[i], neighbouringCells[i].triangleA.vertices, input.position.xy, height));
-                        dtmColor = dtm::blendUnder(dtmColor, dtm::calculateDTMContourColor(dtmSettings.contourSettings[i], neighbouringCells[i].triangleB.vertices, input.position.xy, height));
-                    }
-                }
-            }
-            if (dtmSettings.drawOutlineEnabled())
-                dtmColor = dtm::blendUnder(dtmColor, dtm::calculateGridDTMOutlineColor(dtmSettings.outlineLineStyleIdx, outlineLineSegments, input.position.xy, 0.0f));
-            if (dtmSettings.drawHeightShadingEnabled() && !outOfBoundsUV)
-                dtmColor = dtm::blendUnder(dtmColor, dtm::calculateDTMHeightColor(dtmSettings.heightShadingSettings, currentTriangle.vertices, heightDeriv, input.position.xy, interpolatedHeights[0]));
-
-            textureColor = dtmColor.rgb / dtmColor.a;
-            localAlpha = dtmColor.a;
-
-            // because final color is premultiplied by alpha
-            textureColor = dtmColor.rgb / dtmColor.a;
-
-            // test out of bounds draw
-            /*if (outOfBoundsUV)
-                textureColor = float3(0.0f, 1.0f, 0.0f);
-            else
-                textureColor = float3(0.0f, 0.0f, 1.0f);
-
-            localAlpha = 0.5f;*/
-#endif
         }
         else if (objType == ObjectType::STREAMED_IMAGE) 
         {

From 71a41f374619957d91036622753d6206b9e3bdd0 Mon Sep 17 00:00:00 2001
From: Erfan Ahmadi <ahmadierfan99@gmail.com>
Date: Thu, 26 Jun 2025 16:44:43 +0400
Subject: [PATCH 440/529] small revert

---
 62_CAD/DrawResourcesFiller.cpp | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/62_CAD/DrawResourcesFiller.cpp b/62_CAD/DrawResourcesFiller.cpp
index 425684a5f..152401ded 100644
--- a/62_CAD/DrawResourcesFiller.cpp
+++ b/62_CAD/DrawResourcesFiller.cpp
@@ -33,7 +33,7 @@ bool DrawResourcesFiller::allocateDrawResources(ILogicalDevice* logicalDevice, s
 	const size_t totalResourcesSize = adjustedImagesMemorySize + adjustedBuffersMemorySize;
 
 	IGPUBuffer::SCreationParams resourcesBufferCreationParams = {};
-	resourcesBufferCreationParams.size = 870;
+	resourcesBufferCreationParams.size = adjustedBuffersMemorySize;
 	resourcesBufferCreationParams.usage = bitflag(IGPUBuffer::EUF_SHADER_DEVICE_ADDRESS_BIT) | IGPUBuffer::EUF_TRANSFER_DST_BIT | IGPUBuffer::EUF_INDEX_BUFFER_BIT;
 	resourcesGPUBuffer = logicalDevice->createBuffer(std::move(resourcesBufferCreationParams));
 	resourcesGPUBuffer->setObjectDebugName("drawResourcesBuffer");

From 3a7b90d92c98528880b81b338211779e8bd276db Mon Sep 17 00:00:00 2001
From: Erfan Ahmadi <ahmadierfan99@gmail.com>
Date: Thu, 26 Jun 2025 18:02:05 +0400
Subject: [PATCH 441/529] Small Fix

---
 62_CAD/shaders/main_pipeline/dtm.hlsl | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/62_CAD/shaders/main_pipeline/dtm.hlsl b/62_CAD/shaders/main_pipeline/dtm.hlsl
index 40ce36987..dc45ba66f 100644
--- a/62_CAD/shaders/main_pipeline/dtm.hlsl
+++ b/62_CAD/shaders/main_pipeline/dtm.hlsl
@@ -500,9 +500,9 @@ GridDTMCell calculateCellTriangles(in dtm::GridDTMHeightMapData heightData, in f
         output.triangleB.vertices[2] = float3(gridSpaceCellTopLeftCoords.x + cellWidth, gridSpaceCellTopLeftCoords.y + cellWidth, heightData.heights.y);
     }
 
-    output.validA = !(any(isInvalidGridDtmHeightValue(output.triangleA.vertices[0])) || any(isInvalidGridDtmHeightValue(output.triangleA.vertices[1])) || any(isInvalidGridDtmHeightValue(output.triangleA.vertices[2])));
-    output.validB = !(any(isInvalidGridDtmHeightValue(output.triangleB.vertices[0])) || any(isInvalidGridDtmHeightValue(output.triangleB.vertices[1])) || any(isInvalidGridDtmHeightValue(output.triangleB.vertices[2])));
-
+    output.validA = !isInvalidGridDtmHeightValue(output.triangleA.vertices[0].z) && !isInvalidGridDtmHeightValue(output.triangleA.vertices[1].z) && !isInvalidGridDtmHeightValue(output.triangleA.vertices[2].z);
+    output.validB = !isInvalidGridDtmHeightValue(output.triangleB.vertices[0].z) && !isInvalidGridDtmHeightValue(output.triangleB.vertices[1].z) && !isInvalidGridDtmHeightValue(output.triangleB.vertices[2].z);
+    
     // move from grid space to screen space
     [unroll]
     for (int i = 0; i < 3; ++i)

From 5929be13ea1bfbb1d04bbe6a39321d519a3cbf92 Mon Sep 17 00:00:00 2001
From: devsh <devsh@devsh.eu>
Date: Thu, 26 Jun 2025 17:52:52 +0200
Subject: [PATCH 442/529] just some todo markup

---
 12_MeshLoaders/main.cpp | 11 ++++++++---
 1 file changed, 8 insertions(+), 3 deletions(-)

diff --git a/12_MeshLoaders/main.cpp b/12_MeshLoaders/main.cpp
index 8c97cb44a..14da9f0d6 100644
--- a/12_MeshLoaders/main.cpp
+++ b/12_MeshLoaders/main.cpp
@@ -125,8 +125,6 @@ class MeshLoadersApp final : public MonoWindowApplication, public BuiltinResourc
 			}
 			cb->end();
 
-			//updateGUIDescriptorSet();
-
 			IQueue::SSubmitInfo::SSemaphoreInfo retval =
 			{
 				.semaphore = m_semaphore.get(),
@@ -341,8 +339,15 @@ class MeshLoadersApp final : public MonoWindowApplication, public BuiltinResourc
 				}
 
 				const auto& converted = reservation.getGPUObjects<ICPUPolygonGeometry>();
-				return m_renderer->addGeometries({&converted.front().get(),converted.size()});
+				if (!m_renderer->addGeometries({ &converted.front().get(),converted.size() }))
+					return false;
 			}
+
+// TODO: get scene bounds and reset camera
+
+			// TODO: write out the geometry
+
+			return true;
 		}
 
 		// Maximum frames which can be simultaneously submitted, used to cycle through our per-frame resources like command buffers

From 52c1aa54cf859b63a8ff6df648f743003e5e13fe Mon Sep 17 00:00:00 2001
From: keptsecret <sorchon@gmail.com>
Date: Fri, 27 Jun 2025 10:18:40 +0700
Subject: [PATCH 443/529] ready changes for mesh_loaders merge, requires
 examples.hpp from mesh_loaders

---
 .../include/nbl/this_example/common.hpp       |  2 +-
 31_HLSLPathTracer/main.cpp                    | 34 +++++++++----------
 2 files changed, 18 insertions(+), 18 deletions(-)

diff --git a/31_HLSLPathTracer/include/nbl/this_example/common.hpp b/31_HLSLPathTracer/include/nbl/this_example/common.hpp
index ff3dd8095..b08656eee 100644
--- a/31_HLSLPathTracer/include/nbl/this_example/common.hpp
+++ b/31_HLSLPathTracer/include/nbl/this_example/common.hpp
@@ -6,7 +6,7 @@
 // common api
 #include "CCamera.hpp"
 #include "SimpleWindowedApplication.hpp"
-#include "nbl/application_templates/MonoAssetManagerAndBuiltinResourceApplication.hpp"
+#include "nbl/examples/examples.hpp"
 #include "CEventCallback.hpp"
 
 // example's own headers
diff --git a/31_HLSLPathTracer/main.cpp b/31_HLSLPathTracer/main.cpp
index 0dc5fc053..6b4cad224 100644
--- a/31_HLSLPathTracer/main.cpp
+++ b/31_HLSLPathTracer/main.cpp
@@ -23,10 +23,10 @@ struct PTPushConstant {
 
 // TODO: Add a QueryPool for timestamping once its ready
 // TODO: Do buffer creation using assConv
-class HLSLComputePathtracer final : public examples::SimpleWindowedApplication, public application_templates::MonoAssetManagerAndBuiltinResourceApplication
+class HLSLComputePathtracer final : public examples::SimpleWindowedApplication, public examples::BuiltinResourcesApplication
 {
 		using device_base_t = examples::SimpleWindowedApplication;
-		using asset_base_t = application_templates::MonoAssetManagerAndBuiltinResourceApplication;
+		using asset_base_t = examples::BuiltinResourcesApplication;
 		using clock_t = std::chrono::steady_clock;
 
 		enum E_LIGHT_GEOMETRY : uint8_t
@@ -323,7 +323,7 @@ class HLSLComputePathtracer final : public examples::SimpleWindowedApplication,
 				m_presentDescriptorSet = presentDSPool->createDescriptorSet(gpuPresentDescriptorSetLayout);
 
 				// Create Shaders
-				auto loadAndCompileGLSLShader = [&](const std::string& pathToShader, bool persistentWorkGroups = false) -> smart_refctd_ptr<IGPUShader>
+				auto loadAndCompileGLSLShader = [&](const std::string& pathToShader, bool persistentWorkGroups = false) -> smart_refctd_ptr<IShader>
 				{
 					IAssetLoader::SAssetLoadParams lp = {};
 					lp.workingDirectory = localInputCWD;
@@ -335,7 +335,7 @@ class HLSLComputePathtracer final : public examples::SimpleWindowedApplication,
 						std::exit(-1);
 					}
 
-					auto source = IAsset::castDown<ICPUShader>(assets[0]);
+					auto source = smart_refctd_ptr_static_cast<IShader>(assets[0]);
 					// The down-cast should not fail!
 					assert(source);
 
@@ -361,7 +361,7 @@ class HLSLComputePathtracer final : public examples::SimpleWindowedApplication,
 					source = compiler->compileToSPIRV((const char*)source->getContent()->getPointer(), options);
 
 					// this time we skip the use of the asset converter since the ICPUShader->IGPUShader path is quick and simple
-					auto shader = m_device->createShader(source.get());
+					auto shader = m_device->compileShader({ source.get(), nullptr, nullptr, nullptr });
 					if (!shader)
 					{
 						m_logger->log("GLSL shader creationed failed: %s!", ILogger::ELL_ERROR, pathToShader);
@@ -371,7 +371,7 @@ class HLSLComputePathtracer final : public examples::SimpleWindowedApplication,
 					return shader;
 				};
 
-				auto loadAndCompileHLSLShader = [&](const std::string& pathToShader, const std::string& defineMacro = "", bool persistentWorkGroups = false) -> smart_refctd_ptr<IGPUShader>
+				auto loadAndCompileHLSLShader = [&](const std::string& pathToShader, const std::string& defineMacro = "", bool persistentWorkGroups = false) -> smart_refctd_ptr<IShader>
 				{
 					IAssetLoader::SAssetLoadParams lp = {};
 					lp.workingDirectory = localInputCWD;
@@ -383,7 +383,7 @@ class HLSLComputePathtracer final : public examples::SimpleWindowedApplication,
 						std::exit(-1);
 					}
 
-					auto source = IAsset::castDown<ICPUShader>(assets[0]);
+					auto source = smart_refctd_ptr_static_cast<IShader>(assets[0]);
 					// The down-cast should not fail!
 					assert(source);
 
@@ -410,7 +410,7 @@ class HLSLComputePathtracer final : public examples::SimpleWindowedApplication,
 
 					source = compiler->compileToSPIRV((const char*)source->getContent()->getPointer(), options);
 					
-					auto shader = m_device->createShader(source.get());
+					auto shader = m_device->compileShader({ source.get(), nullptr, nullptr, nullptr });
 					if (!shader)
 					{
 						m_logger->log("HLSL shader creationed failed: %s!", ILogger::ELL_ERROR, pathToShader);
@@ -447,8 +447,8 @@ class HLSLComputePathtracer final : public examples::SimpleWindowedApplication,
 							params.shader.shader = ptShader.get();
 							params.shader.entryPoint = "main";
 							params.shader.entries = nullptr;
-							 params.shader.requireFullSubgroups = true;
-							 params.shader.requiredSubgroupSize = static_cast<IGPUShader::SSpecInfo::SUBGROUP_SIZE>(5);
+						    params.cached.requireFullSubgroups = true;
+						    params.shader.requiredSubgroupSize = static_cast<IPipelineBase::SUBGROUP_SIZE>(5);
 							if (!m_device->createComputePipelines(nullptr, { &params, 1 }, m_PTGLSLPipelines.data() + index))
 								return logFail("Failed to create GLSL compute pipeline!\n");
 						}
@@ -460,8 +460,8 @@ class HLSLComputePathtracer final : public examples::SimpleWindowedApplication,
 							params.shader.shader = ptShader.get();
 							params.shader.entryPoint = "main";
 							params.shader.entries = nullptr;
-							params.shader.requireFullSubgroups = true;
-							params.shader.requiredSubgroupSize = static_cast<IGPUShader::SSpecInfo::SUBGROUP_SIZE>(5);
+							params.cached.requireFullSubgroups = true;
+							params.shader.requiredSubgroupSize = static_cast<IPipelineBase::SUBGROUP_SIZE>(5);
 							if (!m_device->createComputePipelines(nullptr, { &params, 1 }, m_PTHLSLPipelines.data() + index))
 								return logFail("Failed to create HLSL compute pipeline!\n");
 						}
@@ -475,8 +475,8 @@ class HLSLComputePathtracer final : public examples::SimpleWindowedApplication,
 							params.shader.shader = ptShader.get();
 							params.shader.entryPoint = "main";
 							params.shader.entries = nullptr;
-							params.shader.requireFullSubgroups = true;
-							params.shader.requiredSubgroupSize = static_cast<IGPUShader::SSpecInfo::SUBGROUP_SIZE>(5);
+							params.cached.requireFullSubgroups = true;
+							params.shader.requiredSubgroupSize = static_cast<IPipelineBase::SUBGROUP_SIZE>(5);
 							if (!m_device->createComputePipelines(nullptr, { &params, 1 }, m_PTGLSLPersistentWGPipelines.data() + index))
 								return logFail("Failed to create GLSL PersistentWG compute pipeline!\n");
 						}
@@ -488,8 +488,8 @@ class HLSLComputePathtracer final : public examples::SimpleWindowedApplication,
 							params.shader.shader = ptShader.get();
 							params.shader.entryPoint = "main";
 							params.shader.entries = nullptr;
-							params.shader.requireFullSubgroups = true;
-							params.shader.requiredSubgroupSize = static_cast<IGPUShader::SSpecInfo::SUBGROUP_SIZE>(5);
+							params.cached.requireFullSubgroups = true;
+							params.shader.requiredSubgroupSize = static_cast<IPipelineBase::SUBGROUP_SIZE>(5);
 							if (!m_device->createComputePipelines(nullptr, { &params, 1 }, m_PTHLSLPersistentWGPipelines.data() + index))
 								return logFail("Failed to create HLSL PersistentWG compute pipeline!\n");
 						}
@@ -508,7 +508,7 @@ class HLSLComputePathtracer final : public examples::SimpleWindowedApplication,
 					if (!fragmentShader)
 						return logFail("Failed to Load and Compile Fragment Shader: lumaMeterShader!");
 
-					const IGPUShader::SSpecInfo fragSpec = {
+					const IGPUPipelineBase::SShaderSpecInfo fragSpec = {
 						.entryPoint = "main",
 						.shader = fragmentShader.get()
 					};

From c43c93b75a11870cacef0ad16bc2a8bdf40ae0e3 Mon Sep 17 00:00:00 2001
From: keptsecret <sorchon@gmail.com>
Date: Fri, 27 Jun 2025 11:26:29 +0700
Subject: [PATCH 444/529] cpp fixes so it compiles at least

---
 .../include/nbl/this_example/common.hpp         |  6 +++---
 31_HLSLPathTracer/main.cpp                      | 17 +++++++++--------
 2 files changed, 12 insertions(+), 11 deletions(-)

diff --git a/31_HLSLPathTracer/include/nbl/this_example/common.hpp b/31_HLSLPathTracer/include/nbl/this_example/common.hpp
index b08656eee..db051bb3e 100644
--- a/31_HLSLPathTracer/include/nbl/this_example/common.hpp
+++ b/31_HLSLPathTracer/include/nbl/this_example/common.hpp
@@ -4,10 +4,10 @@
 #include <nabla.h>
 
 // common api
-#include "CCamera.hpp"
-#include "SimpleWindowedApplication.hpp"
+#include "nbl/examples/common/SimpleWindowedApplication.hpp"
 #include "nbl/examples/examples.hpp"
-#include "CEventCallback.hpp"
+#include "nbl/examples/cameras/CCamera.hpp"
+#include "nbl/examples/common/CEventCallback.hpp"
 
 // example's own headers
 #include "nbl/ui/ICursorControl.h"
diff --git a/31_HLSLPathTracer/main.cpp b/31_HLSLPathTracer/main.cpp
index 6b4cad224..576a4c7b0 100644
--- a/31_HLSLPathTracer/main.cpp
+++ b/31_HLSLPathTracer/main.cpp
@@ -14,6 +14,7 @@ using namespace system;
 using namespace asset;
 using namespace ui;
 using namespace video;
+using namespace nbl::examples;
 
 struct PTPushConstant {
 	matrix4SIMD invMVP;
@@ -23,10 +24,10 @@ struct PTPushConstant {
 
 // TODO: Add a QueryPool for timestamping once its ready
 // TODO: Do buffer creation using assConv
-class HLSLComputePathtracer final : public examples::SimpleWindowedApplication, public examples::BuiltinResourcesApplication
+class HLSLComputePathtracer final : public SimpleWindowedApplication, public BuiltinResourcesApplication
 {
-		using device_base_t = examples::SimpleWindowedApplication;
-		using asset_base_t = examples::BuiltinResourcesApplication;
+		using device_base_t = SimpleWindowedApplication;
+		using asset_base_t = BuiltinResourcesApplication;
 		using clock_t = std::chrono::steady_clock;
 
 		enum E_LIGHT_GEOMETRY : uint8_t
@@ -91,7 +92,7 @@ class HLSLComputePathtracer final : public examples::SimpleWindowedApplication,
 			if (!m_surface)
 			{
 				{
-					auto windowCallback = core::make_smart_refctd_ptr<CEventCallback>(smart_refctd_ptr(m_inputSystem), smart_refctd_ptr(m_logger));
+					auto windowCallback = core::make_smart_refctd_ptr<examples::CEventCallback>(smart_refctd_ptr(m_inputSystem), smart_refctd_ptr(m_logger));
 					IWindow::SCreationParams params = {};
 					params.callback = core::make_smart_refctd_ptr<nbl::video::ISimpleManagedSurface::ICallback>();
 					params.width = WindowDimensions.x;
@@ -118,7 +119,7 @@ class HLSLComputePathtracer final : public examples::SimpleWindowedApplication,
 		{
 			// Init systems
 			{
-				m_inputSystem = make_smart_refctd_ptr<InputSystem>(logger_opt_smart_ptr(smart_refctd_ptr(m_logger)));
+				m_inputSystem = make_smart_refctd_ptr<examples::InputSystem>(logger_opt_smart_ptr(smart_refctd_ptr(m_logger)));
 
 				// Remember to call the base class initialization!
 				if (!device_base_t::onAppInitialized(smart_refctd_ptr(system)))
@@ -509,8 +510,8 @@ class HLSLComputePathtracer final : public examples::SimpleWindowedApplication,
 						return logFail("Failed to Load and Compile Fragment Shader: lumaMeterShader!");
 
 					const IGPUPipelineBase::SShaderSpecInfo fragSpec = {
-						.entryPoint = "main",
-						.shader = fragmentShader.get()
+						.shader = fragmentShader.get(),
+					    .entryPoint = "main"
 					};
 
 					auto presentLayout = m_device->createPipelineLayout(
@@ -1381,7 +1382,7 @@ class HLSLComputePathtracer final : public examples::SimpleWindowedApplication,
 
 		// system resources
 		core::smart_refctd_ptr<InputSystem> m_inputSystem;
-		InputSystem::ChannelReader<IMouseEventChannel> mouse;
+        InputSystem::ChannelReader<IMouseEventChannel> mouse;
 		InputSystem::ChannelReader<IKeyboardEventChannel> keyboard;
 
 		// pathtracer resources

From 8b31859520069831b246d13270b43b97aea83141 Mon Sep 17 00:00:00 2001
From: keptsecret <sorchon@gmail.com>
Date: Fri, 27 Jun 2025 14:53:17 +0700
Subject: [PATCH 445/529] a bazillion fixes since last time bxdf usages changed

---
 .../app_resources/hlsl/material_system.hlsl   | 84 +++++++++++++------
 .../hlsl/next_event_estimator.hlsl            |  6 +-
 .../app_resources/hlsl/pathtracer.hlsl        | 41 ++++-----
 .../app_resources/hlsl/render.comp.hlsl       | 11 ++-
 31_HLSLPathTracer/main.cpp                    |  7 --
 5 files changed, 87 insertions(+), 62 deletions(-)

diff --git a/31_HLSLPathTracer/app_resources/hlsl/material_system.hlsl b/31_HLSLPathTracer/app_resources/hlsl/material_system.hlsl
index feffee9ef..4e2fdc5a0 100644
--- a/31_HLSLPathTracer/app_resources/hlsl/material_system.hlsl
+++ b/31_HLSLPathTracer/app_resources/hlsl/material_system.hlsl
@@ -14,22 +14,6 @@ namespace ext
 namespace MaterialSystem
 {
 
-// struct Material
-// {
-//     enum Type : uint32_t    // enum class?
-//     {
-//         DIFFUSE,
-//         CONDUCTOR,
-//         DIELECTRIC
-//     };
-
-//     NBL_CONSTEXPR_STATIC_INLINE uint32_t DataSize = 1;
-
-//     uint32_t type : 2;
-//     uint32_t unused : 30;   // possible space for flags
-//     uint32_t data[DataSize];
-// };
-
 enum MaterialType : uint32_t    // enum class?
 {
     DIFFUSE,
@@ -37,6 +21,52 @@ enum MaterialType : uint32_t    // enum class?
     DIELECTRIC
 };
 
+template<class DiffuseBxDF, class ConductorBxDF, class DielectricBxDF>
+struct MaterialParams
+{
+    using this_t = MaterialParams<DiffuseBxDF, ConductorBxDF, DielectricBxDF>;
+    using sample_type = typename DiffuseBxDF::sample_type;
+    using anisotropic_interaction_type = typename DiffuseBxDF::anisotropic_interaction_type;
+    using isotropic_interaction_type = typename anisotropic_interaction_type::isotropic_interaction_type;
+    using anisocache_type = typename ConductorBxDF::anisocache_type;
+    using isocache_type = typename anisocache_type::isocache_type;
+
+    using diffuse_params_type = typename DiffuseBxDF::params_isotropic_t;
+    using conductor_params_type = typename ConductorBxDF::params_isotropic_t;
+    using dielectric_params_type = typename DielectricBxDF::params_isotropic_t;
+
+    // we're only doing isotropic for this example
+    static this_t create(sample_type _sample, isotropic_interaction_type _interaction, isocache_type _cache, bxdf::BxDFClampMode _clamp)
+    {
+        this_t retval;
+        retval._Sample = _sample;
+        retval.interaction = _interaction;
+        retval.cache = _cache;
+        retval.clampMode = _clamp;
+        return retval;
+    }
+
+    diffuse_params_type getDiffuseParams()
+    {
+        return diffuse_params_type::create(_Sample, interaction, clampMode);
+    }
+
+    conductor_params_type getConductorParams()
+    {
+        return conductor_params_type::create(_Sample, interaction, cache, clampMode);
+    }
+
+    dielectric_params_type getDielectricParams()
+    {
+        return dielectric_params_type::create(_Sample, interaction, cache, clampMode);
+    }
+
+    sample_type _Sample;
+    isotropic_interaction_type interaction;
+    isocache_type cache;
+    bxdf::BxDFClampMode clampMode;
+};
+
 template<class DiffuseBxDF, class ConductorBxDF, class DielectricBxDF>  // NOTE: these bxdfs should match the ones in Scene BxDFNode
 struct System
 {
@@ -48,9 +78,11 @@ struct System
     using sample_type = typename DiffuseBxDF::sample_type;
     using ray_dir_info_type = typename sample_type::ray_dir_info_type;
     using quotient_pdf_type = typename DiffuseBxDF::quotient_pdf_type;
-    using anisotropic_type = typename DiffuseBxDF::anisotropic_type;
+    using anisotropic_interaction_type = typename DiffuseBxDF::anisotropic_interaction_type;
+    using isotropic_interaction_type = typename anisotropic_interaction_type::isotropic_interaction_type;
     using anisocache_type = typename ConductorBxDF::anisocache_type;
-    using params_t = bxdf::SBxDFParams<scalar_type>;
+    using isocache_type = typename anisocache_type::isocache_type;
+    using params_t = MaterialParams<DiffuseBxDF, ConductorBxDF, DielectricBxDF>;
     using create_params_t = bxdf::SBxDFCreationParams<scalar_type, measure_type>;
 
     using diffuse_op_type = DiffuseBxDF;
@@ -73,19 +105,19 @@ struct System
             case MaterialType::DIFFUSE:
             {
                 diffuseBxDF.init(cparams);
-                return (measure_type)diffuseBxDF.eval(params);
+                return (measure_type)diffuseBxDF.eval(params.getDiffuseParams());
             }
             break;
             case MaterialType::CONDUCTOR:
             {
                 conductorBxDF.init(cparams);
-                return conductorBxDF.eval(params);
+                return conductorBxDF.eval(params.getConductorParams());
             }
             break;
             case MaterialType::DIELECTRIC:
             {
                 dielectricBxDF.init(cparams);
-                return dielectricBxDF.eval(params);
+                return dielectricBxDF.eval(params.getDielectricParams());
             }
             break;
             default:
@@ -93,7 +125,7 @@ struct System
         }
     }
 
-    sample_type generate(uint32_t material, NBL_CONST_REF_ARG(create_params_t) cparams, NBL_CONST_REF_ARG(anisotropic_type) interaction, NBL_CONST_REF_ARG(vector3_type) u, NBL_REF_ARG(anisocache_type) _cache)
+    sample_type generate(uint32_t material, NBL_CONST_REF_ARG(create_params_t) cparams, NBL_CONST_REF_ARG(anisotropic_interaction_type) interaction, NBL_CONST_REF_ARG(vector3_type) u, NBL_REF_ARG(anisocache_type) _cache)
     {
         switch(material)
         {
@@ -131,26 +163,26 @@ struct System
     quotient_pdf_type quotient_and_pdf(uint32_t material, NBL_CONST_REF_ARG(create_params_t) cparams, NBL_CONST_REF_ARG(params_t) params)
     {
         const float minimumProjVectorLen = 0.00000001;
-        if (params.NdotV > minimumProjVectorLen && params.NdotL > minimumProjVectorLen)
+        if (params.interaction.getNdotV() > minimumProjVectorLen && params._Sample.getNdotL() > minimumProjVectorLen)
         {
             switch(material)
             {
                 case MaterialType::DIFFUSE:
                 {
                     diffuseBxDF.init(cparams);
-                    return diffuseBxDF.quotient_and_pdf(params);
+                    return diffuseBxDF.quotient_and_pdf(params.getDiffuseParams());
                 }
                 break;
                 case MaterialType::CONDUCTOR:
                 {
                     conductorBxDF.init(cparams);
-                    return conductorBxDF.quotient_and_pdf(params);
+                    return conductorBxDF.quotient_and_pdf(params.getConductorParams());
                 }
                 break;
                 case MaterialType::DIELECTRIC:
                 {
                     dielectricBxDF.init(cparams);
-                    return dielectricBxDF.quotient_and_pdf(params);
+                    return dielectricBxDF.quotient_and_pdf(params.getDielectricParams());
                 }
                 break;
                 default:
diff --git a/31_HLSLPathTracer/app_resources/hlsl/next_event_estimator.hlsl b/31_HLSLPathTracer/app_resources/hlsl/next_event_estimator.hlsl
index 51c018ac5..ac74b1abf 100644
--- a/31_HLSLPathTracer/app_resources/hlsl/next_event_estimator.hlsl
+++ b/31_HLSLPathTracer/app_resources/hlsl/next_event_estimator.hlsl
@@ -294,7 +294,7 @@ struct Estimator<Scene, Ray, LightSample, Aniso, IM_PROCEDURAL, PST_SPHERE, PPM>
     using light_type = typename Scene::light_type;
     using spectral_type = typename light_type::spectral_type;
     using interaction_type = Aniso;
-    using quotient_pdf_type = bxdf::quotient_and_pdf<spectral_type, scalar_type>;
+    using quotient_pdf_type = sampling::quotient_and_pdf<spectral_type, scalar_type>;
     using sample_type = LightSample;
     using ray_dir_info_type = typename sample_type::ray_dir_info_type;
 
@@ -346,7 +346,7 @@ struct Estimator<Scene, Ray, LightSample, Aniso, IM_PROCEDURAL, PST_TRIANGLE, PP
     using light_type = typename Scene::light_type;
     using spectral_type = typename light_type::spectral_type;
     using interaction_type = Aniso;
-    using quotient_pdf_type = bxdf::quotient_and_pdf<spectral_type, scalar_type>;
+    using quotient_pdf_type = sampling::quotient_and_pdf<spectral_type, scalar_type>;
     using sample_type = LightSample;
     using ray_dir_info_type = typename sample_type::ray_dir_info_type;
 
@@ -397,7 +397,7 @@ struct Estimator<Scene, Ray, LightSample, Aniso, IM_PROCEDURAL, PST_RECTANGLE, P
     using light_type = typename Scene::light_type;
     using spectral_type = typename light_type::spectral_type;
     using interaction_type = Aniso;
-    using quotient_pdf_type = bxdf::quotient_and_pdf<spectral_type, scalar_type>;
+    using quotient_pdf_type = sampling::quotient_and_pdf<spectral_type, scalar_type>;
     using sample_type = LightSample;
     using ray_dir_info_type = typename sample_type::ray_dir_info_type;
 
diff --git a/31_HLSLPathTracer/app_resources/hlsl/pathtracer.hlsl b/31_HLSLPathTracer/app_resources/hlsl/pathtracer.hlsl
index f5d5206dc..add1eb8a9 100644
--- a/31_HLSLPathTracer/app_resources/hlsl/pathtracer.hlsl
+++ b/31_HLSLPathTracer/app_resources/hlsl/pathtracer.hlsl
@@ -58,8 +58,8 @@ struct Unidirectional
     using ray_type = typename RayGen::ray_type;
     using light_type = Light<measure_type>;
     using bxdfnode_type = BxDFNode<measure_type>;
-    using anisotropic_type = typename MaterialSystem::anisotropic_type;
-    using isotropic_type = typename anisotropic_type::isotropic_type;
+    using anisotropic_interaction_type = typename MaterialSystem::anisotropic_interaction_type;
+    using isotropic_interaction_type = typename anisotropic_interaction_type::isotropic_interaction_type;
     using anisocache_type = typename MaterialSystem::anisocache_type;
     using isocache_type = typename anisocache_type::isocache_type;
     using quotient_pdf_type = typename NextEventEstimator::quotient_pdf_type;
@@ -100,8 +100,8 @@ struct Unidirectional
         const vector3_type intersection = ray.origin + ray.direction * ray.intersectionT;
 
         uint32_t bsdfLightIDs;
-        anisotropic_type interaction;
-        isotropic_type iso_interaction;
+        anisotropic_interaction_type interaction;
+        isotropic_interaction_type iso_interaction;
         uint32_t mode = objectID.mode;
         switch (mode)
         {
@@ -116,8 +116,8 @@ struct Unidirectional
                 N = nbl::hlsl::normalize(N);
                 ray_dir_info_type V;
                 V.direction = -ray.direction;
-                isotropic_type iso_interaction = isotropic_type::create(V, N);
-                interaction = anisotropic_type::create(iso_interaction);
+                isotropic_interaction_type iso_interaction = isotropic_interaction_type::create(V, N);
+                interaction = anisotropic_interaction_type::create(iso_interaction);
             }
             break;
             default:
@@ -142,9 +142,9 @@ struct Unidirectional
 
         // TODO: ifdef kill diffuse specular paths
 
-        const bool isBSDF = (bxdf.materialType == ext::MaterialSystem::MaterialType::DIFFUSE) ? bxdf_traits<diffuse_op_type>::type == BT_BSDF :
-                            (bxdf.materialType == ext::MaterialSystem::MaterialType::CONDUCTOR) ? bxdf_traits<conductor_op_type>::type == BT_BSDF :
-                            bxdf_traits<dielectric_op_type>::type == BT_BSDF;
+        const bool isBSDF = (bxdf.materialType == ext::MaterialSystem::MaterialType::DIFFUSE) ? bxdf::traits<diffuse_op_type>::type == bxdf::BT_BSDF :
+                            (bxdf.materialType == ext::MaterialSystem::MaterialType::CONDUCTOR) ? bxdf::traits<conductor_op_type>::type == bxdf::BT_BSDF :
+                            bxdf::traits<dielectric_op_type>::type == bxdf::BT_BSDF;
 
         vector3_type eps0 = rand3d(depth, _sample, 0u);
         vector3_type eps1 = rand3d(depth, _sample, 1u);
@@ -171,24 +171,25 @@ struct Unidirectional
             );
 
             // We don't allow non watertight transmitters in this renderer
-            bool validPath = nee_sample.NdotL > numeric_limits<scalar_type>::min;
+            bool validPath = nee_sample.getNdotL() > numeric_limits<scalar_type>::min;
             // but if we allowed non-watertight transmitters (single water surface), it would make sense just to apply this line by itself
-            anisocache_type _cache;
-            validPath = validPath && anisocache_type::template compute<ray_dir_info_type, ray_dir_info_type>(_cache, interaction, nee_sample, monochromeEta);
+            bxdf::fresnel::OrientedEtas<scalar_type> orientedEta = bxdf::fresnel::OrientedEtas<scalar_type>::create(interaction.getNdotV(), monochromeEta);
+            anisocache_type _cache = anisocache_type::template create<anisotropic_interaction_type, sample_type>(interaction, nee_sample, orientedEta);
+            validPath = validPath && _cache.getNdotH() >= 0.0;
             bxdf.params.eta = monochromeEta;
 
             if (neeContrib_pdf.pdf < numeric_limits<scalar_type>::max)
             {
-                if (nbl::hlsl::any(isnan(nee_sample.L.direction)))
+                if (nbl::hlsl::any(isnan(nee_sample.getL().getDirection())))
                     ray.payload.accumulation += vector3_type(1000.f, 0.f, 0.f);
-                else if (nbl::hlsl::all((vector3_type)69.f == nee_sample.L.direction))
+                else if (nbl::hlsl::all((vector3_type)69.f == nee_sample.getL().getDirection()))
                     ray.payload.accumulation += vector3_type(0.f, 1000.f, 0.f);
                 else if (validPath)
                 {
                     bxdf::BxDFClampMode _clamp;
                     _clamp = (bxdf.materialType == ext::MaterialSystem::MaterialType::DIELECTRIC) ? bxdf::BxDFClampMode::BCM_ABS : bxdf::BxDFClampMode::BCM_MAX;
                     // example only uses isotropic bxdfs
-                    params_type params = params_type::template create<sample_type, isotropic_type, isocache_type>(nee_sample, interaction.isotropic, _cache.iso_cache, _clamp);
+                    params_type params = params_type::create(nee_sample, interaction.isotropic, _cache.iso_cache, _clamp);
 
                     quotient_pdf_type bsdf_quotient_pdf = materialSystem.quotient_and_pdf(bxdf.materialType, bxdf.params, params);
                     neeContrib_pdf.quotient *= bxdf.albedo * throughput * bsdf_quotient_pdf.quotient;
@@ -200,8 +201,8 @@ struct Unidirectional
                     // neeContrib_pdf.quotient *= otherGenOverChoice;
 
                     ray_type nee_ray;
-                    nee_ray.origin = intersection + nee_sample.L.direction * t * Tolerance<scalar_type>::getStart(depth);
-                    nee_ray.direction = nee_sample.L.direction;
+                    nee_ray.origin = intersection + nee_sample.getL().getDirection() * t * Tolerance<scalar_type>::getStart(depth);
+                    nee_ray.direction = nee_sample.getL().getDirection();
                     nee_ray.intersectionT = t;
                     if (bsdf_quotient_pdf.pdf < numeric_limits<scalar_type>::max && getLuma(neeContrib_pdf.quotient) > lumaContributionThreshold && intersector_type::traceRay(nee_ray, scene).id == -1)
                         ray.payload.accumulation += neeContrib_pdf.quotient;
@@ -221,13 +222,13 @@ struct Unidirectional
             bxdf::BxDFClampMode _clamp;
             _clamp = (bxdf.materialType == ext::MaterialSystem::MaterialType::DIELECTRIC) ? bxdf::BxDFClampMode::BCM_ABS : bxdf::BxDFClampMode::BCM_MAX;
             // example only uses isotropic bxdfs
-            params_type params = params_type::template create<sample_type, isotropic_type, isocache_type>(bsdf_sample, interaction.isotropic, _cache.iso_cache, _clamp);
+            params_type params = params_type::create(bsdf_sample, interaction.isotropic, _cache.iso_cache, _clamp);
 
             // the value of the bsdf divided by the probability of the sample being generated
             quotient_pdf_type bsdf_quotient_pdf = materialSystem.quotient_and_pdf(bxdf.materialType, bxdf.params, params);
             throughput *= bxdf.albedo * bsdf_quotient_pdf.quotient;
             bxdfPdf = bsdf_quotient_pdf.pdf;
-            bxdfSample = bsdf_sample.L.direction;
+            bxdfSample = bsdf_sample.getL().getDirection();
         }
 
         // additional threshold
@@ -243,7 +244,7 @@ struct Unidirectional
             ray.direction = bxdfSample;
             if ((PTPolygonMethod)nee_type::PolygonMethod == PPM_APPROX_PROJECTED_SOLID_ANGLE)
             {
-                ray.normalAtOrigin = interaction.isotropic.N;
+                ray.normalAtOrigin = interaction.getN();
                 ray.wasBSDFAtOrigin = isBSDF;
             }
             return true;
diff --git a/31_HLSLPathTracer/app_resources/hlsl/render.comp.hlsl b/31_HLSLPathTracer/app_resources/hlsl/render.comp.hlsl
index 81736f508..a40eb3dd0 100644
--- a/31_HLSLPathTracer/app_resources/hlsl/render.comp.hlsl
+++ b/31_HLSLPathTracer/app_resources/hlsl/render.comp.hlsl
@@ -74,18 +74,17 @@ float32_t2 getTexCoords()
 
 using ray_dir_info_t = bxdf::ray_dir_info::SBasic<float>;
 using iso_interaction = bxdf::surface_interactions::SIsotropic<ray_dir_info_t>;
-using aniso_interaction = bxdf::surface_interactions::SAnisotropic<ray_dir_info_t>;
+using aniso_interaction = bxdf::surface_interactions::SAnisotropic<iso_interaction>;
 using sample_t = bxdf::SLightSample<ray_dir_info_t>;
 using iso_cache = bxdf::SIsotropicMicrofacetCache<float>;
-using aniso_cache = bxdf::SAnisotropicMicrofacetCache<float>;
-using quotient_pdf_t = bxdf::quotient_and_pdf<float32_t3, float>;
+using aniso_cache = bxdf::SAnisotropicMicrofacetCache<iso_cache>;
+using quotient_pdf_t = sampling::quotient_and_pdf<float32_t3, float>;
 using spectral_t = vector<float, 3>;
-using params_t = bxdf::SBxDFParams<float>;
 using create_params_t = bxdf::SBxDFCreationParams<float, spectral_t>;
 
 using diffuse_bxdf_type = bxdf::reflection::SOrenNayarBxDF<sample_t, iso_interaction, aniso_interaction, spectral_t>;
-using conductor_bxdf_type = bxdf::reflection::SGGXBxDF<sample_t, iso_cache, aniso_cache, spectral_t>;
-using dielectric_bxdf_type = bxdf::transmission::SGGXDielectricBxDF<sample_t, iso_cache, aniso_cache, spectral_t>;
+using conductor_bxdf_type = bxdf::reflection::SGGXBxDF<sample_t, iso_interaction, aniso_interaction, iso_cache, aniso_cache, spectral_t>;
+using dielectric_bxdf_type = bxdf::transmission::SGGXDielectricBxDF<sample_t, iso_interaction, aniso_interaction, iso_cache, aniso_cache, spectral_t>;
 
 using ray_type = ext::Ray<float>;
 using light_type = ext::Light<spectral_t>;
diff --git a/31_HLSLPathTracer/main.cpp b/31_HLSLPathTracer/main.cpp
index 576a4c7b0..2e139af8d 100644
--- a/31_HLSLPathTracer/main.cpp
+++ b/31_HLSLPathTracer/main.cpp
@@ -80,13 +80,6 @@ class HLSLComputePathtracer final : public SimpleWindowedApplication, public Bui
 
 		inline bool isComputeOnly() const override { return false; }
 
-		//inline video::IAPIConnection::SFeatures getAPIFeaturesToEnable() override
-		//{
-		//	auto retval = device_base_t::getAPIFeaturesToEnable();
-		//	retval.synchronizationValidation = true;
-		//	return retval;
-		//}
-
 		inline core::vector<video::SPhysicalDeviceFilter::SurfaceCompatibility> getSurfaces() const override
 		{
 			if (!m_surface)

From 229b5211effb676218829073cd6e1d535094efc8 Mon Sep 17 00:00:00 2001
From: Arkadiusz Lachowicz <areklachowicz@gmail.com>
Date: Fri, 27 Jun 2025 11:22:34 +0200
Subject: [PATCH 446/529] remove my tests and assume pre-compiled SPIRV for
 CSimpleDebugRenderer.hpp

---
 common/CMakeLists.txt                                |  1 -
 .../nbl/examples/geometry/CSimpleDebugRenderer.hpp   | 12 ------------
 2 files changed, 13 deletions(-)

diff --git a/common/CMakeLists.txt b/common/CMakeLists.txt
index 1cbdefea7..2c4037e2d 100644
--- a/common/CMakeLists.txt
+++ b/common/CMakeLists.txt
@@ -83,7 +83,6 @@ endif()
     and NblExtExamplesAPIBuiltinsInclude targets)
 ]]
 
-#! NOTE: as I write it we don't have any targets there yet
 add_subdirectory("src/nbl/examples" EXCLUDE_FROM_ALL)
 
 NBL_GET_ALL_TARGETS(TARGETS)
diff --git a/common/include/nbl/examples/geometry/CSimpleDebugRenderer.hpp b/common/include/nbl/examples/geometry/CSimpleDebugRenderer.hpp
index 7b849e3b6..f5fd2bac6 100644
--- a/common/include/nbl/examples/geometry/CSimpleDebugRenderer.hpp
+++ b/common/include/nbl/examples/geometry/CSimpleDebugRenderer.hpp
@@ -97,26 +97,14 @@ class CSimpleDebugRenderer final : public core::IReferenceCounted
 			// load shader
 			smart_refctd_ptr<IShader> shader;
 			{
-				// TODO & NOTE: tmp, maybe I will turn it into CMake option
-				#define NBL_USE_PRECOMPILED_SPIRV
-
-				#ifdef NBL_USE_PRECOMPILED_SPIRV
 				constexpr std::string_view key = "nbl/examples/shaders/geometry/unified.hlsl.spv";
-				#else
-				constexpr std::string_view key = "nbl/examples/shaders/geometry/unified.hlsl";
-				#endif // NBL_USE_PRECOMPILED_SPIRV
-
 				const auto bundle = assMan->getAsset(key.data(), {});
 
-				//const auto bundle = assMan->getAsset("nbl/examples/shaders/geometry/unified.hlsl.spv",{});
 				const auto contents = bundle.getContents();
 				if (contents.empty() || bundle.getAssetType()!=IAsset::ET_SHADER)
 					return nullptr;
 				shader = IAsset::castDown<IShader>(contents[0]);
 				
-				#ifndef NBL_USE_PRECOMPILED_SPIRV
-				shader = device->compileShader({ .source = shader.get() });
-				#endif // NBL_USE_PRECOMPILED_SPIRV
 				if (!shader)
 					return nullptr;
 			}

From 7d4e1f403215642fd78e6ace93969d2be4274288 Mon Sep 17 00:00:00 2001
From: Erfan Ahmadi <ahmadierfan99@gmail.com>
Date: Mon, 30 Jun 2025 13:32:51 +0400
Subject: [PATCH 447/529] Shader Fixes and Improvements

---
 62_CAD/shaders/main_pipeline/dtm.hlsl | 68 +++++++++++++--------------
 1 file changed, 34 insertions(+), 34 deletions(-)

diff --git a/62_CAD/shaders/main_pipeline/dtm.hlsl b/62_CAD/shaders/main_pipeline/dtm.hlsl
index dc45ba66f..0fb35fab3 100644
--- a/62_CAD/shaders/main_pipeline/dtm.hlsl
+++ b/62_CAD/shaders/main_pipeline/dtm.hlsl
@@ -106,7 +106,7 @@ float3 calculateDTMTriangleBarycentrics(in float2 v1, in float2 v2, in float2 v3
     return float3(u, v, w);
 }
 
-float4 calculateDTMHeightColor(in DTMHeightShadingSettings settings, in float3 v[3], in float heightDeriv, in float2 fragPos, in float height)
+float4 calculateDTMHeightColor(in DTMHeightShadingSettings settings, in float3 triangleVertices[3], in float heightDeriv, in float2 fragPos, in float height)
 {
     float4 outputColor = float4(0.0f, 0.0f, 0.0f, 0.0f);
 
@@ -117,33 +117,37 @@ float4 calculateDTMHeightColor(in DTMHeightShadingSettings settings, in float3 v
 
     if (heightMapSize > 0)
     {
-        // partially based on https://www.shadertoy.com/view/XsXSz4 by Inigo Quilez
-        float2 e0 = (v[1] - v[0]).xy;
-        float2 e1 = (v[2] - v[1]).xy;
-        float2 e2 = (v[0] - v[2]).xy;
-
-        float triangleAreaSign = -sign(e0.x * e2.y - e0.y * e2.x);
-        float2 v0 = fragPos - v[0].xy;
-        float2 v1 = fragPos - v[1].xy;
-        float2 v2 = fragPos - v[2].xy;
-
-        float distanceToLine0 = sqrt(dot2(v0 - e0 * dot(v0, e0) / dot(e0, e0)));
-        float distanceToLine1 = sqrt(dot2(v1 - e1 * dot(v1, e1) / dot(e1, e1)));
-        float distanceToLine2 = sqrt(dot2(v2 - e2 * dot(v2, e2) / dot(e2, e2)));
-
-        float line0Sdf = distanceToLine0 * triangleAreaSign * sign(v0.x * e0.y - v0.y * e0.x);
-        float line1Sdf = distanceToLine1 * triangleAreaSign * sign(v1.x * e1.y - v1.y * e1.x);
-        float line2Sdf = distanceToLine2 * triangleAreaSign * sign(v2.x * e2.y - v2.y * e2.x);
-        float line3Sdf = (minShadingHeight - height) / heightDeriv;
-        float line4Sdf = (height - maxShadingHeight) / heightDeriv;
-
-        float convexPolygonSdf = max(line0Sdf, line1Sdf);
-        convexPolygonSdf = max(convexPolygonSdf, line2Sdf);
-        convexPolygonSdf = max(convexPolygonSdf, line3Sdf);
-        convexPolygonSdf = max(convexPolygonSdf, line4Sdf);
-
+        // Do the triangle SDF:
+        float2 e0 = (triangleVertices[1] - triangleVertices[0]).xy;
+        float2 e1 = (triangleVertices[2] - triangleVertices[1]).xy;
+        float2 e2 = (triangleVertices[0] - triangleVertices[2]).xy;
+        
+        float2 v0 = fragPos - triangleVertices[0].xy;
+        float2 v1 = fragPos - triangleVertices[1].xy;
+        float2 v2 = fragPos - triangleVertices[2].xy;
+
+        float distanceToLine0 = dot2(v0 - e0 * clamp(dot(v0, e0) / dot(e0, e0), 0.0, 1.0));
+        float distanceToLine1 = dot2(v1 - e1 * clamp(dot(v1, e1) / dot(e1, e1), 0.0, 1.0));
+        float distanceToLine2 = dot2(v2 - e2 * clamp(dot(v2, e2) / dot(e2, e2), 0.0, 1.0));
+
+        // TODO[Optization]: We can get the sign (whether inside or outside the triangle) from the barycentric coords we already compute outside this func
+        // So we can skip this part which tries to figure out which side of each triangle edge line the fragPos relies on
+        float o = e0.x * e2.y - e0.y * e2.x;
+        float2 d = min(min(float2(distanceToLine0, o * (v0.x * e0.y - v0.y * e0.x)),
+                        float2(distanceToLine1, o * (v1.x * e1.y - v1.y * e1.x))),
+                        float2(distanceToLine2, o * (v2.x * e2.y - v2.y * e2.x)));
+                         
+        float triangleSDF = -sqrt(d.x) * sign(d.y);
+        
+        // Intersect with the region between min and max height shading.
+        float minHeightShadingLine = (minShadingHeight - height) / heightDeriv;
+        float maxHeightShadingLine = (height - maxShadingHeight) / heightDeriv;
+
+        float convexPolygonSdf = triangleSDF;
+        convexPolygonSdf = max(convexPolygonSdf, minHeightShadingLine);
+        convexPolygonSdf = max(convexPolygonSdf, maxHeightShadingLine);
         outputColor.a = 1.0f - smoothstep(0.0f, globals.antiAliasingFactor + globals.antiAliasingFactor, convexPolygonSdf);
-
+     
         // calculate height color
         E_HEIGHT_SHADING_MODE mode = settings.determineHeightShadingMode();
         if (mode == E_HEIGHT_SHADING_MODE::DISCRETE_VARIABLE_LENGTH_INTERVALS)
@@ -263,7 +267,7 @@ float calculateDTMContourSDF(in DTMContourSettings contourSettings, in LineStyle
         if (contourLineHeight >= p0.z && contourLineHeight <= p1.z)
         {
             float interpolationVal = (contourLineHeight - p0.z) / (p1.z - p0.z);
-            contourLinePoints[contourLinePointsIdx] = p0.xy + interpolationVal * (p1.xy - p0.xy);
+            contourLinePoints[contourLinePointsIdx] = lerp(p0.xy, p1.xy, clamp(interpolationVal, 0.0f, 1.0f));
             ++contourLinePointsIdx;
         }
     }
@@ -405,10 +409,7 @@ E_CELL_DIAGONAL resolveGridDTMCellDiagonal(in uint32_t4 cellData)
         invalidHeightsCount += int(invalidHeights[i]);
 
     if (invalidHeightsCount == 0)
-    {
-        E_CELL_DIAGONAL a = getDiagonalModeFromCellCornerData(cellData.w);
         return getDiagonalModeFromCellCornerData(cellData.w);
-    }
 
     if (invalidHeightsCount > 1)
         return INVALID;
@@ -476,10 +477,9 @@ GridDTMCell calculateCellTriangles(in dtm::GridDTMHeightMapData heightData, in f
     // heightData.heihts.y - bottom right texel
     // heightData.heihts.z - top right texel
     // heightData.heihts.w - top left texel
-    const bool diagonalFromTopLeftToBottomRight = heightData.cellDiagonal == E_CELL_DIAGONAL::TOP_LEFT_TO_BOTTOM_RIGHT;
     float2 gridSpaceCellTopLeftCoords = cellCoords * cellWidth;
 
-    if (diagonalFromTopLeftToBottomRight)
+    if (heightData.cellDiagonal == E_CELL_DIAGONAL::TOP_LEFT_TO_BOTTOM_RIGHT)
     {
         output.triangleA.vertices[0] = float3(gridSpaceCellTopLeftCoords.x, gridSpaceCellTopLeftCoords.y, heightData.heights.w);
         output.triangleA.vertices[1] = float3(gridSpaceCellTopLeftCoords.x + cellWidth, gridSpaceCellTopLeftCoords.y + cellWidth, heightData.heights.y);
@@ -502,7 +502,7 @@ GridDTMCell calculateCellTriangles(in dtm::GridDTMHeightMapData heightData, in f
 
     output.validA = !isInvalidGridDtmHeightValue(output.triangleA.vertices[0].z) && !isInvalidGridDtmHeightValue(output.triangleA.vertices[1].z) && !isInvalidGridDtmHeightValue(output.triangleA.vertices[2].z);
     output.validB = !isInvalidGridDtmHeightValue(output.triangleB.vertices[0].z) && !isInvalidGridDtmHeightValue(output.triangleB.vertices[1].z) && !isInvalidGridDtmHeightValue(output.triangleB.vertices[2].z);
-    
+
     // move from grid space to screen space
     [unroll]
     for (int i = 0; i < 3; ++i)

From d3590dd0f48bfd21e228109fe29c804aa1b128a5 Mon Sep 17 00:00:00 2001
From: Erfan Ahmadi <ahmadierfan99@gmail.com>
Date: Mon, 30 Jun 2025 13:33:07 +0400
Subject: [PATCH 448/529] shader fixes and improvements on dtm

---
 .../main_pipeline/fragment_shader.hlsl        | 19 ++++++++++---------
 .../shaders/main_pipeline/vertex_shader.hlsl  | 18 +++++++++++-------
 2 files changed, 21 insertions(+), 16 deletions(-)

diff --git a/62_CAD/shaders/main_pipeline/fragment_shader.hlsl b/62_CAD/shaders/main_pipeline/fragment_shader.hlsl
index 0a7199627..c1c6715af 100644
--- a/62_CAD/shaders/main_pipeline/fragment_shader.hlsl
+++ b/62_CAD/shaders/main_pipeline/fragment_shader.hlsl
@@ -140,32 +140,33 @@ float4 fragMain(PSInput input) : SV_TARGET
     {
         DTMSettings dtmSettings = loadDTMSettings(mainObj.dtmSettingsIdx);
 
-        float3 v[3];
-        v[0] = input.getScreenSpaceVertexAttribs(0);
-        v[1] = input.getScreenSpaceVertexAttribs(1);
-        v[2] = input.getScreenSpaceVertexAttribs(2);
+        float3 triangleVertices[3];
+        triangleVertices[0] = input.getScreenSpaceVertexAttribs(0);
+        triangleVertices[1] = input.getScreenSpaceVertexAttribs(1);
+        triangleVertices[2] = input.getScreenSpaceVertexAttribs(2);
 
-        const float3 baryCoord = dtm::calculateDTMTriangleBarycentrics(v[0].xy, v[1].xy, v[2].xy, input.position.xy);
-        float height = baryCoord.x * v[0].z + baryCoord.y * v[1].z + baryCoord.z * v[2].z;
+        const float3 baryCoord = dtm::calculateDTMTriangleBarycentrics(triangleVertices[0].xy, triangleVertices[1].xy, triangleVertices[2].xy, input.position.xy);
+
+        float height = baryCoord.x * triangleVertices[0].z + baryCoord.y * triangleVertices[1].z + baryCoord.z * triangleVertices[2].z;
         float heightDeriv = fwidth(height);
 
         float4 dtmColor = float4(0.0f, 0.0f, 0.0f, 0.0f);
         
         if (dtmSettings.drawOutlineEnabled())                                                                                                    // TODO: do i need 'height' paramter here?
-            dtmColor = dtm::blendUnder(dtmColor, dtm::calculateDTMOutlineColor(dtmSettings.outlineLineStyleIdx, v, input.position.xy));
+            dtmColor = dtm::blendUnder(dtmColor, dtm::calculateDTMOutlineColor(dtmSettings.outlineLineStyleIdx, triangleVertices, input.position.xy));
         if (dtmSettings.drawContourEnabled())
         {
             for(uint32_t i = 0; i < dtmSettings.contourSettingsCount; ++i) // TODO: should reverse the order with blendUnder
             {
                 LineStyle contourStyle = loadLineStyle(dtmSettings.contourSettings[i].contourLineStyleIdx);
-                float sdf = dtm::calculateDTMContourSDF(dtmSettings.contourSettings[i], contourStyle, v, input.position.xy, height);
+                float sdf = dtm::calculateDTMContourSDF(dtmSettings.contourSettings[i], contourStyle, triangleVertices, input.position.xy, height);
                 float4 contourColor = contourStyle.color;
                 contourColor.a *= 1.0f - smoothstep(-globals.antiAliasingFactor, globals.antiAliasingFactor, sdf);
                 dtmColor = dtm::blendUnder(dtmColor, contourColor);
             }
         }
         if (dtmSettings.drawHeightShadingEnabled())
-            dtmColor = dtm::blendUnder(dtmColor, dtm::calculateDTMHeightColor(dtmSettings.heightShadingSettings, v, heightDeriv, input.position.xy, height));
+            dtmColor = dtm::blendUnder(dtmColor, dtm::calculateDTMHeightColor(dtmSettings.heightShadingSettings, triangleVertices, heightDeriv, input.position.xy, height));
 
         textureColor = dtmColor.rgb / dtmColor.a;
         localAlpha = dtmColor.a;
diff --git a/62_CAD/shaders/main_pipeline/vertex_shader.hlsl b/62_CAD/shaders/main_pipeline/vertex_shader.hlsl
index 30283885e..432a18511 100644
--- a/62_CAD/shaders/main_pipeline/vertex_shader.hlsl
+++ b/62_CAD/shaders/main_pipeline/vertex_shader.hlsl
@@ -134,8 +134,8 @@ PSInput main(uint vertexID : SV_VertexID)
         float2 transformedOriginalPos;
         float2 transformedDilatedPos;
         {
-            uint32_t firstVertexOfCurrentTriangleIndex = vertexID - vertexID % 3;
-            uint32_t currentVertexWithinTriangleIndex = vertexID - firstVertexOfCurrentTriangleIndex;
+            uint32_t currentVertexWithinTriangleIndex = vertexID % 3;
+            uint32_t firstVertexOfCurrentTriangleIndex = vertexID - currentVertexWithinTriangleIndex;
 
             TriangleMeshVertex triangleVertices[3];
             triangleVertices[0] = vk::RawBufferLoad<TriangleMeshVertex>(pc.triangleMeshVerticesBaseAddress + sizeof(TriangleMeshVertex) * firstVertexOfCurrentTriangleIndex, 8u);
@@ -581,8 +581,9 @@ PSInput main(uint vertexID : SV_VertexID)
 
             float32_t2 minUV = glyphInfo.getMinUV();
             uint16_t textureID = glyphInfo.getTextureID();
-
-            const float32_t2 dirV = float32_t2(glyphInfo.dirU.y, -glyphInfo.dirU.x) * glyphInfo.aspectRatio;
+            
+            const int ndcYDirectionSign = sign(clipProjectionData.projectionToNDC[1][1]);
+            const float32_t2 dirV = float32_t2(glyphInfo.dirU.y, ndcYDirectionSign * glyphInfo.dirU.x) * glyphInfo.aspectRatio;
             const float2 screenTopLeft = _static_cast<float2>(transformPointNdc(clipProjectionData.projectionToNDC, glyphInfo.topLeft));
             const float2 screenDirU = _static_cast<float2>(transformVectorNdc(clipProjectionData.projectionToNDC, _static_cast<pfloat64_t2>(glyphInfo.dirU)));
             const float2 screenDirV = _static_cast<float2>(transformVectorNdc(clipProjectionData.projectionToNDC, _static_cast<pfloat64_t2>(dirV)));
@@ -630,7 +631,9 @@ PSInput main(uint vertexID : SV_VertexID)
             float32_t aspectRatio = vk::RawBufferLoad<float32_t>(globals.pointers.geometryBuffer + drawObj.geometryAddress + sizeof(pfloat64_t2) + sizeof(float2), 4u);
             uint32_t textureID = vk::RawBufferLoad<uint32_t>(globals.pointers.geometryBuffer + drawObj.geometryAddress + sizeof(pfloat64_t2) + sizeof(float2) + sizeof(float), 4u);
 
-            const float32_t2 dirV = float32_t2(dirU.y, -dirU.x) * aspectRatio;
+            // If y increases as we go down in ndc this sign is positive (screenspace-like transformations), if y decreases as we go down this sign is negative (worldspace-like transformations)
+            const int ndcYDirectionSign = sign(clipProjectionData.projectionToNDC[1][1]);
+            const float32_t2 dirV = float32_t2(dirU.y, ndcYDirectionSign * dirU.x) * aspectRatio;
             const float2 ndcTopLeft = _static_cast<float2>(transformPointNdc(clipProjectionData.projectionToNDC, topLeft));
             const float2 ndcDirU = _static_cast<float2>(transformVectorNdc(clipProjectionData.projectionToNDC, _static_cast<pfloat64_t2>(dirU)));
             const float2 ndcDirV = _static_cast<float2>(transformVectorNdc(clipProjectionData.projectionToNDC, _static_cast<pfloat64_t2>(dirV)));
@@ -725,8 +728,9 @@ PSInput main(uint vertexID : SV_VertexID)
             float32_t2 dirU = vk::RawBufferLoad<float32_t2>(globals.pointers.geometryBuffer + drawObj.geometryAddress + sizeof(pfloat64_t2), 4u);
             float32_t aspectRatio = vk::RawBufferLoad<float32_t>(globals.pointers.geometryBuffer + drawObj.geometryAddress + sizeof(pfloat64_t2) + sizeof(float2), 4u);
             uint32_t textureID = vk::RawBufferLoad<uint32_t>(globals.pointers.geometryBuffer + drawObj.geometryAddress + sizeof(pfloat64_t2) + sizeof(float2) + sizeof(float), 4u);
-
-            const float32_t2 dirV = float32_t2(dirU.y, -dirU.x) * aspectRatio;
+            
+            const int ndcYDirectionSign = sign(clipProjectionData.projectionToNDC[1][1]);
+            const float32_t2 dirV = float32_t2(dirU.y, ndcYDirectionSign * dirU.x) * aspectRatio;
             const float2 ndcTopLeft = _static_cast<float2>(transformPointNdc(clipProjectionData.projectionToNDC, topLeft));
             const float2 ndcDirU = _static_cast<float2>(transformVectorNdc(clipProjectionData.projectionToNDC, _static_cast<pfloat64_t2>(dirU)));
             const float2 ndcDirV = _static_cast<float2>(transformVectorNdc(clipProjectionData.projectionToNDC, _static_cast<pfloat64_t2>(dirV)));

From f23411ebbc810645b76585b65d8a5877cc5c60b0 Mon Sep 17 00:00:00 2001
From: Erfan Ahmadi <ahmadierfan99@gmail.com>
Date: Mon, 30 Jun 2025 16:19:50 +0400
Subject: [PATCH 449/529] adding rotation to viewport

---
 62_CAD/main.cpp | 11 ++++++++++-
 1 file changed, 10 insertions(+), 1 deletion(-)

diff --git a/62_CAD/main.cpp b/62_CAD/main.cpp
index d6ad87637..d9eefa8b9 100644
--- a/62_CAD/main.cpp
+++ b/62_CAD/main.cpp
@@ -1495,7 +1495,16 @@ class ComputerAidedDesign final : public examples::SimpleWindowedApplication, pu
 
 		float64_t3x3 projectionToNDC;
 		projectionToNDC = m_Camera.constructViewProjection();
-		
+#if 0
+		double rotation = 0.25 * PI<double>();
+		float64_t2 rotationVec = float64_t2(cos(rotation), sin(rotation));
+		float64_t3x3 rotationParameter = float64_t3x3 {
+			rotationVec.x, rotationVec.y, 0.0,
+			-rotationVec.y, rotationVec.x, 0.0,
+			0.0, 0.0, 1.0
+		};
+		projectionToNDC = nbl::hlsl::mul(projectionToNDC, rotationParameter);
+#endif
 		Globals globalData = {};
 		uint64_t baseAddress = resourcesGPUBuffer->getDeviceAddress();
 		globalData.pointers = {

From 8f6c7bc66bfed7ad70f13fab5f2c6af50917e9b5 Mon Sep 17 00:00:00 2001
From: devsh <devsh@devsh.eu>
Date: Tue, 1 Jul 2025 10:10:05 +0200
Subject: [PATCH 450/529] get the spanner PLY loading!

---
 12_MeshLoaders/main.cpp | 44 ++++++++++++++++++++++++++++++++++++++++-
 1 file changed, 43 insertions(+), 1 deletion(-)

diff --git a/12_MeshLoaders/main.cpp b/12_MeshLoaders/main.cpp
index 14da9f0d6..b181162d4 100644
--- a/12_MeshLoaders/main.cpp
+++ b/12_MeshLoaders/main.cpp
@@ -100,15 +100,21 @@ class MeshLoadersApp final : public MonoWindowApplication, public BuiltinResourc
 				}
 				// late latch input
 				{
+					bool reload = false;
 					camera.beginInputProcessing(nextPresentationTimestamp);
 					mouse.consumeEvents([&](const IMouseEventChannel::range_t& events) -> void { camera.mouseProcess(events); }, m_logger.get());
 					keyboard.consumeEvents([&](const IKeyboardEventChannel::range_t& events) -> void
 						{
+							for (const auto& event : events)
+							if (event.keyCode==E_KEY_CODE::EKC_R && event.action==SKeyboardEvent::ECA_RELEASED)
+								reload = true;
 							camera.keyboardProcess(events);
 						},
 						m_logger.get()
 					);
 					camera.endInputProcessing(nextPresentationTimestamp);
+					if (reload)
+						reloadModel();
 				}
 				// draw scene
 				{
@@ -239,6 +245,7 @@ class MeshLoadersApp final : public MonoWindowApplication, public BuiltinResourc
 
 			//! load the geometry
 			IAssetLoader::SAssetLoadParams params = {};
+			params.logger = m_logger.get();
 			params.meshManipulatorOverride = nullptr; // TODO
 			auto bundle = m_assetMgr->getAsset(m_modelPath,params);
 			if (bundle.getContents().empty())
@@ -263,6 +270,7 @@ class MeshLoadersApp final : public MonoWindowApplication, public BuiltinResourc
 			//! cache results -- speeds up mesh generation on second run
 			m_qnc->saveCacheToFile<EF_R8G8B8_SNORM>(m_system.get(),sharedOutputCWD/"../../tmp/normalCache888.sse");
 			
+			auto bound = hlsl::shapes::AABB<3,double>::create();
 			// convert the geometries
 			{
 				smart_refctd_ptr<CAssetConverter> converter = CAssetConverter::create({.device=m_device.get()});
@@ -339,11 +347,45 @@ class MeshLoadersApp final : public MonoWindowApplication, public BuiltinResourc
 				}
 
 				const auto& converted = reservation.getGPUObjects<ICPUPolygonGeometry>();
+				for (const auto& geom : converted)
+				{
+					geom.value->visitAABB([&bound](const auto& aabb)->void
+						{
+							hlsl::shapes::AABB<3,double> promoted;
+							promoted.minVx = aabb.minVx;
+							promoted.maxVx = aabb.maxVx;
+							bound = hlsl::shapes::util::union_(promoted,bound);
+						}
+					);
+				}
 				if (!m_renderer->addGeometries({ &converted.front().get(),converted.size() }))
 					return false;
+				for (const auto& geo : m_renderer->getGeometries())
+					m_renderer->m_instances.push_back({
+						.world = hlsl::float32_t3x4(
+							hlsl::float32_t4(1,0,0,0),
+							hlsl::float32_t4(0,1,0,0),
+							hlsl::float32_t4(0,0,1,0)
+						),
+						.packedGeo = &geo
+					});
 			}
 
-// TODO: get scene bounds and reset camera
+			// get scene bounds and reset camera
+			{
+				const double distance = 0.05;
+				const auto diagonal = bound.maxVx-bound.minVx;
+				{
+					const auto measure = hlsl::length(diagonal);
+					const auto aspectRatio = float(m_window->getWidth())/float(m_window->getHeight());
+					camera.setProjectionMatrix(core::matrix4SIMD::buildProjectionMatrixPerspectiveFovRH(1.2f,aspectRatio,distance*measure*0.1,measure*4.0));
+					camera.setMoveSpeed(measure*0.04);
+				}
+				const auto pos = bound.maxVx+diagonal*distance;
+				camera.setPosition(vectorSIMDf(pos.x,pos.y,pos.z));
+				const auto center = (bound.minVx+bound.maxVx)*0.5;
+				camera.setTarget(vectorSIMDf(center.x,center.y,center.z));
+			}
 
 			// TODO: write out the geometry
 

From f66d952415b903646f778bc6c135513be6511f30 Mon Sep 17 00:00:00 2001
From: kevyuu <kevin.kayu@gmail.com>
Date: Tue, 1 Jul 2025 21:16:22 +0700
Subject: [PATCH 451/529] Add more geometry to geometry creator demo

---
 .../include/nbl/examples/geometry/CGeometryCreatorScene.hpp   | 4 ++++
 1 file changed, 4 insertions(+)

diff --git a/common/include/nbl/examples/geometry/CGeometryCreatorScene.hpp b/common/include/nbl/examples/geometry/CGeometryCreatorScene.hpp
index 63b3d7a8d..1da3ea487 100644
--- a/common/include/nbl/examples/geometry/CGeometryCreatorScene.hpp
+++ b/common/include/nbl/examples/geometry/CGeometryCreatorScene.hpp
@@ -66,6 +66,10 @@ class CGeometryCreatorScene : public core::IReferenceCounted
 				addGeometry("Cube",creator->createCube({1.f,1.f,1.f}));
 				addGeometry("Rectangle",creator->createRectangle({1.5f,3.f}));
 				addGeometry("Disk",creator->createDisk(2.f,30));
+				addGeometry("Sphere", creator->createSphere(2, 16, 16));
+				addGeometry("Cylinder", creator->createCylinder(2, 2, 20));
+				addGeometry("Cone", creator->createCone(2, 3, 10));
+				addGeometry("Icosphere", creator->createIcoSphere(1, 4, true));
 			}
 
 			// convert the geometries

From 0bc0557c6ae815a96e59e651fe4bdf4a13d1e38f Mon Sep 17 00:00:00 2001
From: kevyuu <kevin.kayu@gmail.com>
Date: Tue, 1 Jul 2025 21:16:48 +0700
Subject: [PATCH 452/529] Fix example 67

---
 67_RayQueryGeometry/app_resources/common.hlsl |   3 +-
 .../app_resources/render.comp.hlsl            |  37 ++---
 67_RayQueryGeometry/include/common.hpp        |  66 +++++++++
 67_RayQueryGeometry/main.cpp                  | 131 ++++++++----------
 4 files changed, 142 insertions(+), 95 deletions(-)

diff --git a/67_RayQueryGeometry/app_resources/common.hlsl b/67_RayQueryGeometry/app_resources/common.hlsl
index e39e7192b..0827a0e90 100644
--- a/67_RayQueryGeometry/app_resources/common.hlsl
+++ b/67_RayQueryGeometry/app_resources/common.hlsl
@@ -10,6 +10,7 @@ struct SGeomInfo
 {
     uint64_t vertexBufferAddress;
     uint64_t indexBufferAddress;
+    uint64_t normalBufferAddress;
 
     uint32_t vertexStride : 29;
     uint32_t indexType : 2; // 16 bit, 32 bit or none
@@ -35,8 +36,6 @@ enum ObjectType : uint32_t  // matches c++
     OT_SPHERE,
     OT_CYLINDER,
     OT_RECTANGLE,
-    OT_DISK,
-    OT_ARROW,
     OT_CONE,
     OT_ICOSPHERE,
 
diff --git a/67_RayQueryGeometry/app_resources/render.comp.hlsl b/67_RayQueryGeometry/app_resources/render.comp.hlsl
index 657d0bbf0..0d2f4e425 100644
--- a/67_RayQueryGeometry/app_resources/render.comp.hlsl
+++ b/67_RayQueryGeometry/app_resources/render.comp.hlsl
@@ -29,9 +29,11 @@ float3 calculateSmoothNormals(int instID, int primID, SGeomInfo geom, float2 bar
 {
     const uint indexType = geom.indexType;
     const uint vertexStride = geom.vertexStride;
+    const uint objType = instID;
 
     const uint64_t vertexBufferAddress = geom.vertexBufferAddress;
     const uint64_t indexBufferAddress = geom.indexBufferAddress;
+    const uint64_t normalBufferAddress = geom.normalBufferAddress;
 
     uint32_t3 indices;
     switch (indexType)
@@ -51,42 +53,31 @@ float3 calculateSmoothNormals(int instID, int primID, SGeomInfo geom, float2 bar
     }
 
     float3 n0, n1, n2;
-    switch (instID)
+    switch (objType)
     {
         case OT_CUBE:
+        case OT_SPHERE:
+        case OT_RECTANGLE:
+        case OT_CYLINDER:
+        //case OT_ARROW:
+        case OT_CONE:
         {
             // TODO: document why the alignment is 2 here and nowhere else? isnt the `vertexStride` aligned to more than 2 anyway?
-            uint32_t v0 = vk::RawBufferLoad<uint32_t>(vertexBufferAddress + indices[0] * vertexStride, 2u);
-            uint32_t v1 = vk::RawBufferLoad<uint32_t>(vertexBufferAddress + indices[1] * vertexStride, 2u);
-            uint32_t v2 = vk::RawBufferLoad<uint32_t>(vertexBufferAddress + indices[2] * vertexStride, 2u);
+            uint32_t v0 = vk::RawBufferLoad<uint32_t>(normalBufferAddress + indices[0] * 4);
+            uint32_t v1 = vk::RawBufferLoad<uint32_t>(normalBufferAddress + indices[1] * 4);
+            uint32_t v2 = vk::RawBufferLoad<uint32_t>(normalBufferAddress + indices[2] * 4);
 
             n0 = normalize(nbl::hlsl::spirv::unpackSnorm4x8(v0).xyz);
             n1 = normalize(nbl::hlsl::spirv::unpackSnorm4x8(v1).xyz);
             n2 = normalize(nbl::hlsl::spirv::unpackSnorm4x8(v2).xyz);
         }
         break;
-        case OT_SPHERE:
-        case OT_CYLINDER:
-        case OT_ARROW:
-        case OT_CONE:
-        {
-            uint32_t v0 = vk::RawBufferLoad<uint32_t>(vertexBufferAddress + indices[0] * vertexStride);
-            uint32_t v1 = vk::RawBufferLoad<uint32_t>(vertexBufferAddress + indices[1] * vertexStride);
-            uint32_t v2 = vk::RawBufferLoad<uint32_t>(vertexBufferAddress + indices[2] * vertexStride);
-
-            n0 = normalize(unpackNormals3x10(v0));
-            n1 = normalize(unpackNormals3x10(v1));
-            n2 = normalize(unpackNormals3x10(v2));
-        }
-        break;
-        case OT_RECTANGLE:
-        case OT_DISK:
         case OT_ICOSPHERE:
         default:
         {
-            n0 = normalize(vk::RawBufferLoad<float3>(vertexBufferAddress + indices[0] * vertexStride));
-            n1 = normalize(vk::RawBufferLoad<float3>(vertexBufferAddress + indices[1] * vertexStride));
-            n2 = normalize(vk::RawBufferLoad<float3>(vertexBufferAddress + indices[2] * vertexStride));
+            n0 = normalize(vk::RawBufferLoad<float3>(normalBufferAddress + indices[0] * vertexStride));
+            n1 = normalize(vk::RawBufferLoad<float3>(normalBufferAddress + indices[1] * vertexStride));
+            n2 = normalize(vk::RawBufferLoad<float3>(normalBufferAddress + indices[2] * vertexStride));
         }
     }
 
diff --git a/67_RayQueryGeometry/include/common.hpp b/67_RayQueryGeometry/include/common.hpp
index bcf896f55..48b91ba5f 100644
--- a/67_RayQueryGeometry/include/common.hpp
+++ b/67_RayQueryGeometry/include/common.hpp
@@ -15,4 +15,70 @@ using namespace nbl::examples;
 
 #include "app_resources/common.hlsl"
 
+namespace nbl::scene
+{
+enum ObjectType : uint8_t
+{
+	OT_CUBE,
+	OT_SPHERE,
+	OT_CYLINDER,
+	OT_RECTANGLE,
+	OT_CONE,
+	OT_ICOSPHERE,
+
+	OT_COUNT,
+	OT_UNKNOWN = std::numeric_limits<uint8_t>::max()
+};
+
+static constexpr uint32_t s_smoothNormals[OT_COUNT] = { 0, 1, 1, 0, 1, 1 };
+
+struct ObjectMeta
+{
+	ObjectType type = OT_UNKNOWN;
+	std::string_view name = "Unknown";
+};
+
+struct ObjectDrawHookCpu
+{
+	nbl::core::matrix3x4SIMD model;
+	ObjectMeta meta;
+};
+
+enum GeometryShader
+{
+	GP_BASIC = 0,
+	GP_CONE,
+	GP_ICO,
+
+	GP_COUNT
+};
+
+struct ReferenceObjectCpu
+{
+	ObjectMeta meta;
+	GeometryShader shadersType;
+	core::smart_refctd_ptr<ICPUPolygonGeometry> data;
+};
+
+struct ReferenceObjectGpu
+{
+	struct Bindings
+	{
+		nbl::asset::SBufferBinding<IGPUBuffer> vertex, index;
+	};
+
+	ObjectMeta meta;
+	Bindings bindings;
+	uint32_t vertexStride;
+	nbl::asset::E_INDEX_TYPE indexType = nbl::asset::E_INDEX_TYPE::EIT_UNKNOWN;
+	uint32_t indexCount = {};
+
+	const bool useIndex() const
+	{
+		return bindings.index.buffer && (indexType != E_INDEX_TYPE::EIT_UNKNOWN);
+	}
+};
+}
+
+
 #endif // _NBL_THIS_EXAMPLE_COMMON_H_INCLUDED_
\ No newline at end of file
diff --git a/67_RayQueryGeometry/main.cpp b/67_RayQueryGeometry/main.cpp
index 495f3a3e2..3fd6e5642 100644
--- a/67_RayQueryGeometry/main.cpp
+++ b/67_RayQueryGeometry/main.cpp
@@ -3,10 +3,10 @@
 // For conditions of distribution and use, see copyright notice in nabla.h
 #include "common.hpp"
 
-class RayQueryGeometryApp final : public SimpleWindowedApplication, public MonoAssetManagerAndBuiltinResourceApplication
+class RayQueryGeometryApp final : public SimpleWindowedApplication, public BuiltinResourcesApplication
 {
 		using device_base_t = SimpleWindowedApplication;
-		using asset_base_t = MonoAssetManagerAndBuiltinResourceApplication;
+		using asset_base_t = BuiltinResourcesApplication;
 		using clock_t = std::chrono::steady_clock;
 
 		constexpr static inline uint32_t WIN_W = 1280, WIN_H = 720;
@@ -486,25 +486,22 @@ class RayQueryGeometryApp final : public SimpleWindowedApplication, public MonoA
 
 		smart_refctd_ptr<IGPUDescriptorSet> createAccelerationStructureDS(video::CThreadSafeQueueAdapter* queue)
 		{
-			// get geometries in ICPUBuffers
-#if 1
-			return nullptr;
-#else
-			std::array<ReferenceObjectCpu, OT_COUNT> objectsCpu;
-			objectsCpu[OT_CUBE] = ReferenceObjectCpu{ .meta = {.type = OT_CUBE, .name = "Cube Mesh" }, .shadersType = GP_BASIC, .data = gc->createCubeMesh(nbl::core::vector3df(1.f, 1.f, 1.f)) };
-			objectsCpu[OT_SPHERE] = ReferenceObjectCpu{ .meta = {.type = OT_SPHERE, .name = "Sphere Mesh" }, .shadersType = GP_BASIC, .data = gc->createSphereMesh(2, 16, 16) };
-			objectsCpu[OT_CYLINDER] = ReferenceObjectCpu{ .meta = {.type = OT_CYLINDER, .name = "Cylinder Mesh" }, .shadersType = GP_BASIC, .data = gc->createCylinderMesh(2, 2, 20) };
-			objectsCpu[OT_RECTANGLE] = ReferenceObjectCpu{ .meta = {.type = OT_RECTANGLE, .name = "Rectangle Mesh" }, .shadersType = GP_BASIC, .data = gc->createRectangleMesh(nbl::core::vector2df_SIMD(1.5, 3)) };
-			objectsCpu[OT_DISK] = ReferenceObjectCpu{ .meta = {.type = OT_DISK, .name = "Disk Mesh" }, .shadersType = GP_BASIC, .data = gc->createDiskMesh(2, 30) };
-			objectsCpu[OT_ARROW] = ReferenceObjectCpu{ .meta = {.type = OT_ARROW, .name = "Arrow Mesh" }, .shadersType = GP_BASIC, .data = gc->createArrowMesh() };
-			objectsCpu[OT_CONE] = ReferenceObjectCpu{ .meta = {.type = OT_CONE, .name = "Cone Mesh" }, .shadersType = GP_CONE, .data = gc->createConeMesh(2, 3, 10) };
-			objectsCpu[OT_ICOSPHERE] = ReferenceObjectCpu{ .meta = {.type = OT_ICOSPHERE, .name = "Icosphere Mesh" }, .shadersType = GP_ICO, .data = gc->createIcoSphere(1, 3, true) };
+			using namespace nbl::scene;
+
+      // triangles geometries
+      auto gc = make_smart_refctd_ptr<CGeometryCreator>();
+
+			std::array<ReferenceObjectCpu, OT_COUNT> cpuObjects;
+			cpuObjects[OT_CUBE] = ReferenceObjectCpu{ .meta = {.type = OT_CUBE, .name = "Cube Mesh" }, .shadersType = GP_BASIC, .data = gc->createCube({1.f, 1.f, 1.f}) };
+			cpuObjects[OT_SPHERE] = ReferenceObjectCpu{ .meta = {.type = OT_SPHERE, .name = "Sphere Mesh" }, .shadersType = GP_BASIC, .data = gc->createSphere(2, 16, 16) };
+			cpuObjects[OT_CYLINDER] = ReferenceObjectCpu{ .meta = {.type = OT_CYLINDER, .name = "Cylinder Mesh" }, .shadersType = GP_BASIC, .data = gc->createCylinder(2, 2, 20) };
+			cpuObjects[OT_RECTANGLE] = ReferenceObjectCpu{ .meta = {.type = OT_RECTANGLE, .name = "Rectangle Mesh" }, .shadersType = GP_BASIC, .data = gc->createRectangle({1.5, 3}) };
+			cpuObjects[OT_CONE] = ReferenceObjectCpu{ .meta = {.type = OT_CONE, .name = "Cone Mesh" }, .shadersType = GP_CONE, .data = gc->createCone(2, 3, 10) };
+			cpuObjects[OT_ICOSPHERE] = ReferenceObjectCpu{ .meta = {.type = OT_ICOSPHERE, .name = "Icosphere Mesh" }, .shadersType = GP_ICO, .data = gc->createIcoSphere(1, 3, true) };
 
 			auto geomInfoBuffer = ICPUBuffer::create({ OT_COUNT * sizeof(SGeomInfo) });
 
 			SGeomInfo* geomInfos = reinterpret_cast<SGeomInfo*>(geomInfoBuffer->getPointer());
-			const uint32_t byteOffsets[OT_COUNT] = { 18, 24, 24, 20, 20, 24, 16, 12 };	// based on normals data position
-			const uint32_t smoothNormals[OT_COUNT] = { 0, 1, 1, 0, 0, 1, 1, 1 };
 
 			// get ICPUBuffers into ICPUBottomLevelAccelerationStructures
 			std::array<smart_refctd_ptr<ICPUBottomLevelAccelerationStructure>, OT_COUNT> cpuBlas;
@@ -514,37 +511,14 @@ class RayQueryGeometryApp final : public SimpleWindowedApplication, public MonoA
 				auto primitiveCounts = make_refctd_dynamic_array<smart_refctd_dynamic_array<uint32_t>>(1u);
 
 				auto& tri = triangles->front();
-				auto& primCount = primitiveCounts->front();
-				const auto& geom = objectsCpu[i];
-
-				const bool useIndex = geom.data.indexType != EIT_UNKNOWN;
-				const uint32_t vertexStride = geom.data.inputParams.bindings[0].stride;
-				const uint32_t numVertices = (geom.data.bindings[0].buffer->getSize()-geom.data.bindings[0].offset) / vertexStride;
 
-				if (useIndex)
-					primCount = geom.data.indexCount / 3;
-				else
-					primCount = numVertices / 3;
-
-				geomInfos[i].indexType = geom.data.indexType;
-				geomInfos[i].vertexStride = vertexStride;
-				geomInfos[i].smoothNormals = smoothNormals[i];
+				auto& primCount = primitiveCounts->front();
+				primCount = cpuObjects[i].data->getPrimitiveCount();
 
-				geom.data.bindings[0].buffer->setContentHash(geom.data.bindings[0].buffer->computeContentHash());
-				tri.vertexData[0] = geom.data.bindings[0];
-				if (useIndex)
-				{
-					geom.data.indexBuffer.buffer->setContentHash(geom.data.indexBuffer.buffer->computeContentHash());
-					tri.indexData = geom.data.indexBuffer;
-				}
-				tri.maxVertex = numVertices - 1;
-				tri.vertexStride = vertexStride;
-				tri.vertexFormat = static_cast<E_FORMAT>(geom.data.inputParams.attributes[0].format);
-				tri.indexType = geom.data.indexType;
-				tri.geometryFlags = IGPUBottomLevelAccelerationStructure::GEOMETRY_FLAGS::OPAQUE_BIT;
+				tri = cpuObjects[i].data->exportForBLAS();
 
 				auto& blas = cpuBlas[i];
-				blas = make_smart_refctd_ptr<ICPUBottomLevelAccelerationStructure>();
+        blas = make_smart_refctd_ptr<ICPUBottomLevelAccelerationStructure>();
 				blas->setGeometries(std::move(triangles), std::move(primitiveCounts));
 
 				auto blasFlags = bitflag(IGPUBottomLevelAccelerationStructure::BUILD_FLAGS::PREFER_FAST_TRACE_BIT) | IGPUBottomLevelAccelerationStructure::BUILD_FLAGS::ALLOW_COMPACTION_BIT;
@@ -639,28 +613,25 @@ class RayQueryGeometryApp final : public SimpleWindowedApplication, public MonoA
 			CAssetConverter::patch_t<ICPUTopLevelAccelerationStructure> tlasPatch = {};
 			tlasPatch.compactAfterBuild = true;
 			std::array<CAssetConverter::patch_t<ICPUBottomLevelAccelerationStructure>,OT_COUNT> tmpBLASPatches = {};
-			std::array<const ICPUBuffer*, OT_COUNT * 2u> tmpBuffers;
-			std::array<CAssetConverter::patch_t<ICPUBuffer>, OT_COUNT * 2u> tmpBufferPatches;
+      std::array<ICPUPolygonGeometry*, std::size(cpuObjects)> tmpGeometries;
+      std::array<CAssetConverter::patch_t<asset::ICPUPolygonGeometry>, std::size(cpuObjects)> tmpGeometryPatches;
 			{
 				tmpBLASPatches.front().compactAfterBuild = true;
 				std::fill(tmpBLASPatches.begin(),tmpBLASPatches.end(),tmpBLASPatches.front());
 				//
-				for (uint32_t i = 0; i < objectsCpu.size(); i++)
-				{
-					tmpBuffers[2 * i + 0] = cpuBlas[i]->getTriangleGeometries().front().vertexData[0].buffer.get();
-					tmpBuffers[2 * i + 1] = cpuBlas[i]->getTriangleGeometries().front().indexData.buffer.get();
-				}
-				// make sure all buffers are BDA-readable
-				for (auto& patch : tmpBufferPatches)
-					patch.usage |= asset::IBuffer::E_USAGE_FLAGS::EUF_SHADER_DEVICE_ADDRESS_BIT;
+        for (uint32_t i = 0; i < cpuObjects.size(); i++)
+        {
+          tmpGeometries[i] = cpuObjects[i].data.get();
+          tmpGeometryPatches[i].indexBufferUsages= IGPUBuffer::E_USAGE_FLAGS::EUF_SHADER_DEVICE_ADDRESS_BIT;
+        }
 
 				std::get<CAssetConverter::SInputs::asset_span_t<ICPUDescriptorSet>>(inputs.assets) = {&descriptorSet.get(),1};
 				std::get<CAssetConverter::SInputs::asset_span_t<ICPUTopLevelAccelerationStructure>>(inputs.assets) = {&cpuTlas.get(),1};
 				std::get<CAssetConverter::SInputs::patch_span_t<ICPUTopLevelAccelerationStructure>>(inputs.patches) = {&tlasPatch,1};
 				std::get<CAssetConverter::SInputs::asset_span_t<ICPUBottomLevelAccelerationStructure>>(inputs.assets) = {&cpuBlas.data()->get(),cpuBlas.size()};
 				std::get<CAssetConverter::SInputs::patch_span_t<ICPUBottomLevelAccelerationStructure>>(inputs.patches) = tmpBLASPatches;
-				std::get<CAssetConverter::SInputs::asset_span_t<ICPUBuffer>>(inputs.assets) = tmpBuffers;
-				std::get<CAssetConverter::SInputs::patch_span_t<ICPUBuffer>>(inputs.patches) = tmpBufferPatches;
+        std::get<CAssetConverter::SInputs::asset_span_t<ICPUPolygonGeometry>>(inputs.assets) = tmpGeometries;
+        std::get<CAssetConverter::SInputs::patch_span_t<ICPUPolygonGeometry>>(inputs.patches) = tmpGeometryPatches;
 			}
 
 			auto reservation = converter->reserve(inputs);
@@ -783,19 +754,38 @@ class RayQueryGeometryApp final : public SimpleWindowedApplication, public MonoA
 					return {};
 				}
 
-				// assign gpu objects to output
-				for (const auto& buffer : reservation.getGPUObjects<ICPUBuffer>())
-					retainedBuffers.push_back(buffer.value);
-				for (uint32_t i = 0; i < objectsCpu.size(); i++)
-				{
-					auto vBuffer = retainedBuffers[2 * i + 0].get();
-					auto iBuffer = retainedBuffers[2 * i + 1].get();
-					const auto& geom = objectsCpu[i];
-					const bool useIndex = geom.data.indexType != EIT_UNKNOWN;
+        auto&& tlases = reservation.getGPUObjects<ICPUTopLevelAccelerationStructure>();
+        m_gpuTlas = tlases[0].value;
 
-					geomInfos[i].vertexBufferAddress = vBuffer->getDeviceAddress() + byteOffsets[i];
-					geomInfos[i].indexBufferAddress = useIndex ? iBuffer->getDeviceAddress():0x0ull;
-				}
+        auto&& gpuPolygonGeometries = reservation.getGPUObjects<ICPUPolygonGeometry>();
+        m_gpuPolygons.resize(gpuPolygonGeometries.size());
+
+				// assign gpu objects to output
+				for (uint32_t i = 0; i < gpuPolygonGeometries.size(); i++)
+        {
+          const auto& cpuObject = cpuObjects[i];
+          const auto& gpuPolygon = gpuPolygonGeometries[i].value;
+          const auto gpuTriangles = gpuPolygon->exportForBLAS();
+
+          const auto& vertexBufferBinding = gpuTriangles.vertexData[0];
+          const uint64_t vertexBufferAddress = vertexBufferBinding.buffer->getDeviceAddress() + vertexBufferBinding.offset;
+
+          const auto& normalView = gpuPolygon->getNormalView();
+          const uint64_t normalBufferAddress = normalView ? normalView.src.buffer->getDeviceAddress() + normalView.src.offset : 0;
+
+          const auto& indexBufferBinding = gpuTriangles.indexData;
+          auto& geomInfo = geomInfos[i];
+          geomInfo = {
+            .vertexBufferAddress = vertexBufferAddress,
+            .indexBufferAddress = indexBufferBinding.buffer ? indexBufferBinding.buffer->getDeviceAddress() + indexBufferBinding.offset : vertexBufferAddress,
+            .normalBufferAddress = normalBufferAddress,
+            .vertexStride = gpuTriangles.vertexStride,
+            .indexType = gpuTriangles.indexType,
+            .smoothNormals = s_smoothNormals[cpuObject.meta.type],
+          };
+
+          m_gpuPolygons[i] = gpuPolygon;
+        }
 			}
 
 			//
@@ -892,7 +882,6 @@ class RayQueryGeometryApp final : public SimpleWindowedApplication, public MonoA
 			m_api->endCapture();
 
 			return reservation.getGPUObjects<ICPUDescriptorSet>().front().value;
-#endif
 		}
 
 
@@ -911,11 +900,13 @@ class RayQueryGeometryApp final : public SimpleWindowedApplication, public MonoA
 		video::CDumbPresentationOracle oracle;
 
 		smart_refctd_ptr<IGPUBuffer> geometryInfoBuffer;
-		core::vector<smart_refctd_ptr<IGPUBuffer>> retainedBuffers;
 		smart_refctd_ptr<IGPUImage> outHDRImage;
+    core::vector<smart_refctd_ptr<IGPUPolygonGeometry>> m_gpuPolygons;
+    smart_refctd_ptr<IGPUTopLevelAccelerationStructure> m_gpuTlas;
 
 		smart_refctd_ptr<IGPUComputePipeline> renderPipeline;
 		smart_refctd_ptr<IGPUDescriptorSet> renderDs;
+
 };
 
 NBL_MAIN_FUNC(RayQueryGeometryApp)
\ No newline at end of file

From 01c2b69cd51ffb1464f69d40542cb6b7615d942c Mon Sep 17 00:00:00 2001
From: kevyuu <kevin.kayu@gmail.com>
Date: Tue, 1 Jul 2025 21:17:23 +0700
Subject: [PATCH 453/529] Fix example 67 indentation to use tabs

---
 67_RayQueryGeometry/main.cpp | 124 +++++++++++++++++------------------
 1 file changed, 62 insertions(+), 62 deletions(-)

diff --git a/67_RayQueryGeometry/main.cpp b/67_RayQueryGeometry/main.cpp
index 3fd6e5642..97482dc51 100644
--- a/67_RayQueryGeometry/main.cpp
+++ b/67_RayQueryGeometry/main.cpp
@@ -279,11 +279,11 @@ class RayQueryGeometryApp final : public SimpleWindowedApplication, public Built
 			{
 				IGPUCommandBuffer::SPipelineBarrierDependencyInfo::image_barrier_t imageBarriers[1];
 				imageBarriers[0].barrier = {
-				   .dep = {
-					   .srcStageMask = PIPELINE_STAGE_FLAGS::NONE,
-					   .srcAccessMask = ACCESS_FLAGS::NONE,
-					   .dstStageMask = PIPELINE_STAGE_FLAGS::COMPUTE_SHADER_BIT,
-					   .dstAccessMask = ACCESS_FLAGS::SHADER_WRITE_BITS
+					 .dep = {
+						 .srcStageMask = PIPELINE_STAGE_FLAGS::NONE,
+						 .srcAccessMask = ACCESS_FLAGS::NONE,
+						 .dstStageMask = PIPELINE_STAGE_FLAGS::COMPUTE_SHADER_BIT,
+						 .dstAccessMask = ACCESS_FLAGS::SHADER_WRITE_BITS
 					}
 				};
 				imageBarriers[0].image = outHDRImage.get();
@@ -319,11 +319,11 @@ class RayQueryGeometryApp final : public SimpleWindowedApplication, public Built
 			{
 				IGPUCommandBuffer::SPipelineBarrierDependencyInfo::image_barrier_t imageBarriers[2];
 				imageBarriers[0].barrier = {
-				   .dep = {
-					   .srcStageMask = PIPELINE_STAGE_FLAGS::COMPUTE_SHADER_BIT,
-					   .srcAccessMask = ACCESS_FLAGS::SHADER_WRITE_BITS,
-					   .dstStageMask = PIPELINE_STAGE_FLAGS::BLIT_BIT,
-					   .dstAccessMask = ACCESS_FLAGS::TRANSFER_WRITE_BIT
+					 .dep = {
+						 .srcStageMask = PIPELINE_STAGE_FLAGS::COMPUTE_SHADER_BIT,
+						 .srcAccessMask = ACCESS_FLAGS::SHADER_WRITE_BITS,
+						 .dstStageMask = PIPELINE_STAGE_FLAGS::BLIT_BIT,
+						 .dstAccessMask = ACCESS_FLAGS::TRANSFER_WRITE_BIT
 					}
 				};
 				imageBarriers[0].image = outHDRImage.get();
@@ -338,11 +338,11 @@ class RayQueryGeometryApp final : public SimpleWindowedApplication, public Built
 				imageBarriers[0].newLayout = IImage::LAYOUT::TRANSFER_SRC_OPTIMAL;
 
 				imageBarriers[1].barrier = {
-				   .dep = {
-					   .srcStageMask = PIPELINE_STAGE_FLAGS::NONE,
-					   .srcAccessMask = ACCESS_FLAGS::NONE,
-					   .dstStageMask = PIPELINE_STAGE_FLAGS::BLIT_BIT,
-					   .dstAccessMask = ACCESS_FLAGS::TRANSFER_WRITE_BIT
+					 .dep = {
+						 .srcStageMask = PIPELINE_STAGE_FLAGS::NONE,
+						 .srcAccessMask = ACCESS_FLAGS::NONE,
+						 .dstStageMask = PIPELINE_STAGE_FLAGS::BLIT_BIT,
+						 .dstAccessMask = ACCESS_FLAGS::TRANSFER_WRITE_BIT
 					}
 				};
 				imageBarriers[1].image = m_surface->getSwapchainResources()->getImage(m_currentImageAcquire.imageIndex);
@@ -384,11 +384,11 @@ class RayQueryGeometryApp final : public SimpleWindowedApplication, public Built
 			{
 				IGPUCommandBuffer::SPipelineBarrierDependencyInfo::image_barrier_t imageBarriers[1];
 				imageBarriers[0].barrier = {
-				   .dep = {
-					   .srcStageMask = PIPELINE_STAGE_FLAGS::BLIT_BIT,
-					   .srcAccessMask = ACCESS_FLAGS::TRANSFER_WRITE_BIT,
-					   .dstStageMask = PIPELINE_STAGE_FLAGS::NONE,
-					   .dstAccessMask = ACCESS_FLAGS::NONE
+					 .dep = {
+						 .srcStageMask = PIPELINE_STAGE_FLAGS::BLIT_BIT,
+						 .srcAccessMask = ACCESS_FLAGS::TRANSFER_WRITE_BIT,
+						 .dstStageMask = PIPELINE_STAGE_FLAGS::NONE,
+						 .dstAccessMask = ACCESS_FLAGS::NONE
 					}
 				};
 				imageBarriers[0].image = m_surface->getSwapchainResources()->getImage(m_currentImageAcquire.imageIndex);
@@ -488,8 +488,8 @@ class RayQueryGeometryApp final : public SimpleWindowedApplication, public Built
 		{
 			using namespace nbl::scene;
 
-      // triangles geometries
-      auto gc = make_smart_refctd_ptr<CGeometryCreator>();
+			// triangles geometries
+			auto gc = make_smart_refctd_ptr<CGeometryCreator>();
 
 			std::array<ReferenceObjectCpu, OT_COUNT> cpuObjects;
 			cpuObjects[OT_CUBE] = ReferenceObjectCpu{ .meta = {.type = OT_CUBE, .name = "Cube Mesh" }, .shadersType = GP_BASIC, .data = gc->createCube({1.f, 1.f, 1.f}) };
@@ -518,7 +518,7 @@ class RayQueryGeometryApp final : public SimpleWindowedApplication, public Built
 				tri = cpuObjects[i].data->exportForBLAS();
 
 				auto& blas = cpuBlas[i];
-        blas = make_smart_refctd_ptr<ICPUBottomLevelAccelerationStructure>();
+				blas = make_smart_refctd_ptr<ICPUBottomLevelAccelerationStructure>();
 				blas->setGeometries(std::move(triangles), std::move(primitiveCounts));
 
 				auto blasFlags = bitflag(IGPUBottomLevelAccelerationStructure::BUILD_FLAGS::PREFER_FAST_TRACE_BIT) | IGPUBottomLevelAccelerationStructure::BUILD_FLAGS::ALLOW_COMPACTION_BIT;
@@ -613,25 +613,25 @@ class RayQueryGeometryApp final : public SimpleWindowedApplication, public Built
 			CAssetConverter::patch_t<ICPUTopLevelAccelerationStructure> tlasPatch = {};
 			tlasPatch.compactAfterBuild = true;
 			std::array<CAssetConverter::patch_t<ICPUBottomLevelAccelerationStructure>,OT_COUNT> tmpBLASPatches = {};
-      std::array<ICPUPolygonGeometry*, std::size(cpuObjects)> tmpGeometries;
-      std::array<CAssetConverter::patch_t<asset::ICPUPolygonGeometry>, std::size(cpuObjects)> tmpGeometryPatches;
+			std::array<ICPUPolygonGeometry*, std::size(cpuObjects)> tmpGeometries;
+			std::array<CAssetConverter::patch_t<asset::ICPUPolygonGeometry>, std::size(cpuObjects)> tmpGeometryPatches;
 			{
 				tmpBLASPatches.front().compactAfterBuild = true;
 				std::fill(tmpBLASPatches.begin(),tmpBLASPatches.end(),tmpBLASPatches.front());
 				//
-        for (uint32_t i = 0; i < cpuObjects.size(); i++)
-        {
-          tmpGeometries[i] = cpuObjects[i].data.get();
-          tmpGeometryPatches[i].indexBufferUsages= IGPUBuffer::E_USAGE_FLAGS::EUF_SHADER_DEVICE_ADDRESS_BIT;
-        }
+				for (uint32_t i = 0; i < cpuObjects.size(); i++)
+				{
+					tmpGeometries[i] = cpuObjects[i].data.get();
+					tmpGeometryPatches[i].indexBufferUsages= IGPUBuffer::E_USAGE_FLAGS::EUF_SHADER_DEVICE_ADDRESS_BIT;
+				}
 
 				std::get<CAssetConverter::SInputs::asset_span_t<ICPUDescriptorSet>>(inputs.assets) = {&descriptorSet.get(),1};
 				std::get<CAssetConverter::SInputs::asset_span_t<ICPUTopLevelAccelerationStructure>>(inputs.assets) = {&cpuTlas.get(),1};
 				std::get<CAssetConverter::SInputs::patch_span_t<ICPUTopLevelAccelerationStructure>>(inputs.patches) = {&tlasPatch,1};
 				std::get<CAssetConverter::SInputs::asset_span_t<ICPUBottomLevelAccelerationStructure>>(inputs.assets) = {&cpuBlas.data()->get(),cpuBlas.size()};
 				std::get<CAssetConverter::SInputs::patch_span_t<ICPUBottomLevelAccelerationStructure>>(inputs.patches) = tmpBLASPatches;
-        std::get<CAssetConverter::SInputs::asset_span_t<ICPUPolygonGeometry>>(inputs.assets) = tmpGeometries;
-        std::get<CAssetConverter::SInputs::patch_span_t<ICPUPolygonGeometry>>(inputs.patches) = tmpGeometryPatches;
+				std::get<CAssetConverter::SInputs::asset_span_t<ICPUPolygonGeometry>>(inputs.assets) = tmpGeometries;
+				std::get<CAssetConverter::SInputs::patch_span_t<ICPUPolygonGeometry>>(inputs.patches) = tmpGeometryPatches;
 			}
 
 			auto reservation = converter->reserve(inputs);
@@ -754,38 +754,38 @@ class RayQueryGeometryApp final : public SimpleWindowedApplication, public Built
 					return {};
 				}
 
-        auto&& tlases = reservation.getGPUObjects<ICPUTopLevelAccelerationStructure>();
-        m_gpuTlas = tlases[0].value;
+				auto&& tlases = reservation.getGPUObjects<ICPUTopLevelAccelerationStructure>();
+				m_gpuTlas = tlases[0].value;
 
-        auto&& gpuPolygonGeometries = reservation.getGPUObjects<ICPUPolygonGeometry>();
-        m_gpuPolygons.resize(gpuPolygonGeometries.size());
+				auto&& gpuPolygonGeometries = reservation.getGPUObjects<ICPUPolygonGeometry>();
+				m_gpuPolygons.resize(gpuPolygonGeometries.size());
 
 				// assign gpu objects to output
 				for (uint32_t i = 0; i < gpuPolygonGeometries.size(); i++)
-        {
-          const auto& cpuObject = cpuObjects[i];
-          const auto& gpuPolygon = gpuPolygonGeometries[i].value;
-          const auto gpuTriangles = gpuPolygon->exportForBLAS();
-
-          const auto& vertexBufferBinding = gpuTriangles.vertexData[0];
-          const uint64_t vertexBufferAddress = vertexBufferBinding.buffer->getDeviceAddress() + vertexBufferBinding.offset;
-
-          const auto& normalView = gpuPolygon->getNormalView();
-          const uint64_t normalBufferAddress = normalView ? normalView.src.buffer->getDeviceAddress() + normalView.src.offset : 0;
-
-          const auto& indexBufferBinding = gpuTriangles.indexData;
-          auto& geomInfo = geomInfos[i];
-          geomInfo = {
-            .vertexBufferAddress = vertexBufferAddress,
-            .indexBufferAddress = indexBufferBinding.buffer ? indexBufferBinding.buffer->getDeviceAddress() + indexBufferBinding.offset : vertexBufferAddress,
-            .normalBufferAddress = normalBufferAddress,
-            .vertexStride = gpuTriangles.vertexStride,
-            .indexType = gpuTriangles.indexType,
-            .smoothNormals = s_smoothNormals[cpuObject.meta.type],
-          };
-
-          m_gpuPolygons[i] = gpuPolygon;
-        }
+				{
+					const auto& cpuObject = cpuObjects[i];
+					const auto& gpuPolygon = gpuPolygonGeometries[i].value;
+					const auto gpuTriangles = gpuPolygon->exportForBLAS();
+
+					const auto& vertexBufferBinding = gpuTriangles.vertexData[0];
+					const uint64_t vertexBufferAddress = vertexBufferBinding.buffer->getDeviceAddress() + vertexBufferBinding.offset;
+
+					const auto& normalView = gpuPolygon->getNormalView();
+					const uint64_t normalBufferAddress = normalView ? normalView.src.buffer->getDeviceAddress() + normalView.src.offset : 0;
+
+					const auto& indexBufferBinding = gpuTriangles.indexData;
+					auto& geomInfo = geomInfos[i];
+					geomInfo = {
+						.vertexBufferAddress = vertexBufferAddress,
+						.indexBufferAddress = indexBufferBinding.buffer ? indexBufferBinding.buffer->getDeviceAddress() + indexBufferBinding.offset : vertexBufferAddress,
+						.normalBufferAddress = normalBufferAddress,
+						.vertexStride = gpuTriangles.vertexStride,
+						.indexType = gpuTriangles.indexType,
+						.smoothNormals = s_smoothNormals[cpuObject.meta.type],
+					};
+
+					m_gpuPolygons[i] = gpuPolygon;
+				}
 			}
 
 			//
@@ -901,8 +901,8 @@ class RayQueryGeometryApp final : public SimpleWindowedApplication, public Built
 
 		smart_refctd_ptr<IGPUBuffer> geometryInfoBuffer;
 		smart_refctd_ptr<IGPUImage> outHDRImage;
-    core::vector<smart_refctd_ptr<IGPUPolygonGeometry>> m_gpuPolygons;
-    smart_refctd_ptr<IGPUTopLevelAccelerationStructure> m_gpuTlas;
+		core::vector<smart_refctd_ptr<IGPUPolygonGeometry>> m_gpuPolygons;
+		smart_refctd_ptr<IGPUTopLevelAccelerationStructure> m_gpuTlas;
 
 		smart_refctd_ptr<IGPUComputePipeline> renderPipeline;
 		smart_refctd_ptr<IGPUDescriptorSet> renderDs;

From 0b304624f1c512ab4f796280aa5a6f5909c3b003 Mon Sep 17 00:00:00 2001
From: kevyuu <kevin.kayu@gmail.com>
Date: Tue, 1 Jul 2025 21:29:12 +0700
Subject: [PATCH 454/529] Fix example 71 normal computation for more geometry

---
 .../app_resources/raytrace.rchit.hlsl         | 80 +++++++------------
 1 file changed, 27 insertions(+), 53 deletions(-)

diff --git a/71_RayTracingPipeline/app_resources/raytrace.rchit.hlsl b/71_RayTracingPipeline/app_resources/raytrace.rchit.hlsl
index 0a2877ccf..b513d5958 100644
--- a/71_RayTracingPipeline/app_resources/raytrace.rchit.hlsl
+++ b/71_RayTracingPipeline/app_resources/raytrace.rchit.hlsl
@@ -40,60 +40,34 @@ float32_t3 fetchVertexNormal(int instID, int primID, STriangleGeomInfo geom, flo
     const uint64_t normalVertexBufferAddress = geom.normalBufferAddress;
     float3 n0, n1, n2;
 
-    // TODO(kevin): Currently this will work correctly both for cubes and rectangle, which are the only triangles geometry that is used in this example. Need to implement other geometry
-    uint32_t v0 = vk::RawBufferLoad < uint32_t > (normalVertexBufferAddress + i0 * 4);
-    uint32_t v1 = vk::RawBufferLoad < uint32_t > (normalVertexBufferAddress + i1 * 4);
-    uint32_t v2 = vk::RawBufferLoad < uint32_t > (normalVertexBufferAddress + i2 * 4);
-    
-
-    n0 = normalize(nbl::hlsl::spirv::unpackSnorm4x8(v0).xyz);
-    n1 = normalize(nbl::hlsl::spirv::unpackSnorm4x8(v1).xyz);
-    n2 = normalize(nbl::hlsl::spirv::unpackSnorm4x8(v2).xyz);
-
-    // switch (objType)
-    // {
-    //     case OT_CUBE:
-    //     {
-    //         // TODO(kevin): Don't hardcode the normal stride in hlsl
-    //         uint32_t v0 = vk::RawBufferLoad < uint32_t > (normalVertexBufferAddress + i0 * 4);
-    //         uint32_t v1 = vk::RawBufferLoad < uint32_t > (normalVertexBufferAddress + i1 * 4);
-    //         uint32_t v2 = vk::RawBufferLoad < uint32_t > (normalVertexBufferAddress + i2 * 4);
-    //
-    //         n0 = normalize(nbl::hlsl::spirv::unpackSnorm4x8(v0).xyz);
-    //         n1 = normalize(nbl::hlsl::spirv::unpackSnorm4x8(v1).xyz);
-    //         n2 = normalize(nbl::hlsl::spirv::unpackSnorm4x8(v2).xyz);
-    //     }
-    //     break;
-    //     case OT_SPHERE:
-    //     case OT_CYLINDER:
-    //     case OT_ARROW:
-    //     case OT_CONE:
-    //     {
-    //         // TODO(kevin): Fix this logic. Don't use vertex stride since nomral is separated from position
-    //         uint32_t v0 = vk::RawBufferLoad < uint32_t > (normalVertexBufferAddress + i0 * vertexStride);
-    //         uint32_t v1 = vk::RawBufferLoad < uint32_t > (normalVertexBufferAddress + i1 * vertexStride);
-    //         uint32_t v2 = vk::RawBufferLoad < uint32_t > (normalVertexBufferAddress + i2 * vertexStride);
-    //
-    //         n0 = normalize(unpackNormals3x10(v0));
-    //         n1 = normalize(unpackNormals3x10(v1));
-    //         n2 = normalize(unpackNormals3x10(v2));
-    //     }
-    //     break;
-    //     case OT_RECTANGLE:
-    //     case OT_DISK:
-    //     case OT_ICOSPHERE:
-    //     default:
-    //     {
-    //         // TODO(kevin): Don't hardcode the normal stride in hlsl
-    //         n0 = vk::RawBufferLoad < float3 > (normalVertexBufferAddress + i0 * 4);
-    //         n1 = vk::RawBufferLoad < float3 > (normalVertexBufferAddress + i1 * 4);
-    //         n2 = vk::RawBufferLoad < float3 > (normalVertexBufferAddress + i2 * 4);
-    //     }
-    // }
+    float3 n0, n1, n2;
+    switch (objType)
+    {
+        case OT_CUBE:
+        case OT_SPHERE:
+        case OT_RECTANGLE:
+        case OT_CYLINDER:
+        //case OT_ARROW:
+        case OT_CONE:
+        {
+            // TODO: document why the alignment is 2 here and nowhere else? isnt the `vertexStride` aligned to more than 2 anyway?
+            uint32_t v0 = vk::RawBufferLoad<uint32_t>(normalBufferAddress + i0 * 4);
+            uint32_t v1 = vk::RawBufferLoad<uint32_t>(normalBufferAddress + i1 * 4);
+            uint32_t v2 = vk::RawBufferLoad<uint32_t>(normalBufferAddress + i2 * 4);
 
-    // n0 = float3(0, 1, 0);
-    // n1 = float3(0, 1, 0);
-    // n2 = float3(0, 1, 0);
+            n0 = normalize(nbl::hlsl::spirv::unpackSnorm4x8(v0).xyz);
+            n1 = normalize(nbl::hlsl::spirv::unpackSnorm4x8(v1).xyz);
+            n2 = normalize(nbl::hlsl::spirv::unpackSnorm4x8(v2).xyz);
+        }
+        break;
+        case OT_ICOSPHERE:
+        default:
+        {
+            n0 = normalize(vk::RawBufferLoad<float3>(normalBufferAddress + i0 * 12));
+            n1 = normalize(vk::RawBufferLoad<float3>(normalBufferAddress + i1 * 12));
+            n2 = normalize(vk::RawBufferLoad<float3>(normalBufferAddress + i2 * 12));
+        }
+    }
 
     float3 barycentrics = float3(0.0, bary);
     barycentrics.x = 1.0 - barycentrics.y - barycentrics.z;

From 83394e416980f81288db3ada2ad28a3b5a9a7930 Mon Sep 17 00:00:00 2001
From: kevyuu <kevin.kayu@gmail.com>
Date: Tue, 1 Jul 2025 21:29:30 +0700
Subject: [PATCH 455/529] Fix normal computation for icosphere example 67

---
 67_RayQueryGeometry/app_resources/render.comp.hlsl | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/67_RayQueryGeometry/app_resources/render.comp.hlsl b/67_RayQueryGeometry/app_resources/render.comp.hlsl
index 0d2f4e425..bf6431af5 100644
--- a/67_RayQueryGeometry/app_resources/render.comp.hlsl
+++ b/67_RayQueryGeometry/app_resources/render.comp.hlsl
@@ -75,9 +75,9 @@ float3 calculateSmoothNormals(int instID, int primID, SGeomInfo geom, float2 bar
         case OT_ICOSPHERE:
         default:
         {
-            n0 = normalize(vk::RawBufferLoad<float3>(normalBufferAddress + indices[0] * vertexStride));
-            n1 = normalize(vk::RawBufferLoad<float3>(normalBufferAddress + indices[1] * vertexStride));
-            n2 = normalize(vk::RawBufferLoad<float3>(normalBufferAddress + indices[2] * vertexStride));
+            n0 = normalize(vk::RawBufferLoad<float3>(normalBufferAddress + indices[0] * 12));
+            n1 = normalize(vk::RawBufferLoad<float3>(normalBufferAddress + indices[1] * 12));
+            n2 = normalize(vk::RawBufferLoad<float3>(normalBufferAddress + indices[2] * 12));
         }
     }
 

From 82c0d90aa9350430fc969c31a3bf260122e7e944 Mon Sep 17 00:00:00 2001
From: Przemek <minikers21@gmail.com>
Date: Tue, 1 Jul 2025 17:02:28 +0200
Subject: [PATCH 456/529] Moved grid only calculations from screen to grid
 space

---
 62_CAD/main.cpp                                   |  2 +-
 62_CAD/shaders/main_pipeline/common.hlsl          | 12 ++++--------
 62_CAD/shaders/main_pipeline/fragment_shader.hlsl |  7 +++----
 62_CAD/shaders/main_pipeline/vertex_shader.hlsl   | 13 +++----------
 4 files changed, 11 insertions(+), 23 deletions(-)

diff --git a/62_CAD/main.cpp b/62_CAD/main.cpp
index d9eefa8b9..d869ecc56 100644
--- a/62_CAD/main.cpp
+++ b/62_CAD/main.cpp
@@ -3664,7 +3664,7 @@ class ComputerAidedDesign final : public examples::SimpleWindowedApplication, pu
 			worldSpaceExtents.y = (heightMapExtent.height - 1) * HeightMapCellWidth;
 			const uint64_t heightMapTextureID = 0ull;
 
-			constexpr bool DrawGridOnly = false;
+			constexpr bool DrawGridOnly = true;
 			
 			if(DrawGridOnly)
 			{
diff --git a/62_CAD/shaders/main_pipeline/common.hlsl b/62_CAD/shaders/main_pipeline/common.hlsl
index f378c44db..bb2770a31 100644
--- a/62_CAD/shaders/main_pipeline/common.hlsl
+++ b/62_CAD/shaders/main_pipeline/common.hlsl
@@ -233,16 +233,12 @@ struct PSInput
 
     /* GRID DTM */
     uint getGridDTMHeightTextureID() { return data1.z; }
-    float2 getGridDTMScreenSpaceTopLeft() { return data2.xy; }
-    float2 getGridDTMScreenSpaceGridExtents() { return data2.zw; }
-    float getGridDTMScreenSpaceCellWidth() { return data3.x; }
-    float2 getGridDTMScreenSpacePosition() { return interp_data5.zw; }
+    float2 getGridDTMScreenSpaceGridExtents() { return data2.xy; }
+    float getGridDTMScreenSpaceCellWidth() { return data2.z; }
 
     void setGridDTMHeightTextureID(uint textureID) { data1.z = textureID; }
-    void setGridDTMScreenSpaceTopLeft(float2 screenSpaceTopLeft) { data2.xy = screenSpaceTopLeft; }
-    void setGridDTMScreenSpaceGridExtents(float2 screenSpaceGridExtends) { data2.zw = screenSpaceGridExtends; }
-    void setGridDTMScreenSpaceCellWidth(float screenSpaceGridWidth) { data3.x = screenSpaceGridWidth; }
-    void setGridDTMScreenSpacePosition(float2 screenSpacePosition) { interp_data5.zw = screenSpacePosition; }
+    void setGridDTMScreenSpaceGridExtents(float2 screenSpaceGridExtends) { data2.xy = screenSpaceGridExtends; }
+    void setGridDTMScreenSpaceCellWidth(float screenSpaceGridWidth) { data2.z = screenSpaceGridWidth; }
 };
 
 // Set 0 - Scene Data and Globals, buffer bindings don't change the buffers only get updated
diff --git a/62_CAD/shaders/main_pipeline/fragment_shader.hlsl b/62_CAD/shaders/main_pipeline/fragment_shader.hlsl
index c1c6715af..1fd08db79 100644
--- a/62_CAD/shaders/main_pipeline/fragment_shader.hlsl
+++ b/62_CAD/shaders/main_pipeline/fragment_shader.hlsl
@@ -411,11 +411,10 @@ float4 fragMain(PSInput input) : SV_TARGET
             if (!dtmSettings.drawContourEnabled() && !dtmSettings.drawOutlineEnabled() && !dtmSettings.drawHeightShadingEnabled())
                 discard;
 
-            float2 pos = input.getGridDTMScreenSpacePosition();
             float2 uv = input.getImageUV();
             const uint32_t textureId = input.getGridDTMHeightTextureID();
 
-            float2 gridTopLeftCorner = input.getGridDTMScreenSpaceTopLeft();
+            float2 gridTopLeftCorner = 0.0f;
             float2 gridExtents = input.getGridDTMScreenSpaceGridExtents();
             const float cellWidth = input.getGridDTMScreenSpaceCellWidth();
             // TODO: I think we can get it from the height map size if texture is valid?!, better if it comes directly from CPU side, vertex shader or something, division + round to integer is error-prone for large integer values
@@ -460,8 +459,8 @@ float4 fragMain(PSInput input) : SV_TARGET
                 outlineLineSegments[1].P1 = float32_t2(nearestLineRemainingCoords.x, horizontalBounds.y);
                 
                 LineStyle outlineStyle = loadLineStyle(dtmSettings.outlineLineStyleIdx);
-                float sdf = dtm::calculateLineSDF(outlineStyle, outlineLineSegments[0], input.position.xy, 0.0f);
-                sdf = min(sdf, dtm::calculateLineSDF(outlineStyle, outlineLineSegments[1], input.position.xy, 0.0f));
+                float sdf = dtm::calculateLineSDF(outlineStyle, outlineLineSegments[0], gridSpacePos, 0.0f);
+                sdf = min(sdf, dtm::calculateLineSDF(outlineStyle, outlineLineSegments[1], gridSpacePos, 0.0f));
 
                 float4 dtmColor = outlineStyle.color;
                 dtmColor.a *= 1.0f - smoothstep(-globals.antiAliasingFactor, globals.antiAliasingFactor, sdf);
diff --git a/62_CAD/shaders/main_pipeline/vertex_shader.hlsl b/62_CAD/shaders/main_pipeline/vertex_shader.hlsl
index 432a18511..3a5a74c10 100644
--- a/62_CAD/shaders/main_pipeline/vertex_shader.hlsl
+++ b/62_CAD/shaders/main_pipeline/vertex_shader.hlsl
@@ -660,17 +660,9 @@ PSInput main(uint vertexID : SV_VertexID)
             //thicknessOfTheThickestLine += 200.0f;
 
             const float2 corner = float2(bool2(vertexIdx & 0x1u, vertexIdx >> 1));
-            worldSpaceExtents.y = ieee754::flipSign(worldSpaceExtents.y);
-
-            pfloat64_t2 vtxPos = topLeft;
-            vtxPos.x = vtxPos.x + worldSpaceExtents.x * corner.x;
-            vtxPos.y = vtxPos.y + worldSpaceExtents.y * corner.y;
-            worldSpaceExtents.y = ieee754::flipSign(worldSpaceExtents.y);
 
             outV.setGridDTMHeightTextureID(textureID);
             outV.setGridDTMScreenSpaceCellWidth(gridCellWidth * globals.screenToWorldRatio);
-            outV.setGridDTMScreenSpacePosition(transformPointScreenSpace(clipProjectionData.projectionToNDC, globals.resolution, vtxPos));
-            outV.setGridDTMScreenSpaceTopLeft(transformPointScreenSpace(clipProjectionData.projectionToNDC, globals.resolution, topLeft));
             outV.setGridDTMScreenSpaceGridExtents(_static_cast<float2>(worldSpaceExtents) * globals.screenToWorldRatio);
 
             static const float SquareRootOfTwo = 1.4142135f;
@@ -708,12 +700,13 @@ PSInput main(uint vertexID : SV_VertexID)
             outV.setImageUV(uv);
             /*printf("uv = { %f, %f } scale = { %f, %f }", _static_cast<float>(uv.x), _static_cast<float>(uv.y), _static_cast<float>(uvScale.x), _static_cast<float>(uvScale.y));*/
 
+            // TODO: test dilation
             pfloat64_t2 topLeftToGridCenterVector = worldSpaceExtents * 0.5;
             topLeftToGridCenterVector.y = -topLeftToGridCenterVector.y;
             pfloat64_t2 gridCenter = topLeft + topLeftToGridCenterVector;
 
-            pfloat64_t2 dilatedVtxPos = vtxPos + dilationVector;
-
+            const pfloat64_t2 vtxPos = topLeft + float2(worldSpaceExtents.x, -worldSpaceExtents.y) * corner;
+            const pfloat64_t2 dilatedVtxPos = vtxPos + dilationVector;
 
             float2 ndcVtxPos = _static_cast<float2>(transformPointNdc(clipProjectionData.projectionToNDC, dilatedVtxPos));
             outV.position = float4(ndcVtxPos, 0.0f, 1.0f);

From 0c13db5a5fb03ae619c1141cf4a7a79371521ae1 Mon Sep 17 00:00:00 2001
From: devsh <devsh@devsh.eu>
Date: Tue, 1 Jul 2025 17:22:30 +0200
Subject: [PATCH 457/529] we'll do Normal Quantization and cache load/store in
 another example

---
 12_MeshLoaders/main.cpp | 14 ++++++--------
 1 file changed, 6 insertions(+), 8 deletions(-)

diff --git a/12_MeshLoaders/main.cpp b/12_MeshLoaders/main.cpp
index b181162d4..a941f9655 100644
--- a/12_MeshLoaders/main.cpp
+++ b/12_MeshLoaders/main.cpp
@@ -5,6 +5,9 @@
 
 #include "../3rdparty/portable-file-dialogs/portable-file-dialogs.h"
 
+#ifdef _NBL_COMPILE_WITH_MITSUBA_SERIALIZED_LOADER_
+#include "nbl/ext/MitsubaLoader/CSerializedLoader.h"
+#endif
 
 class MeshLoadersApp final : public MonoWindowApplication, public BuiltinResourcesApplication
 {
@@ -20,6 +23,9 @@ class MeshLoadersApp final : public MonoWindowApplication, public BuiltinResourc
 		{
 			if (!asset_base_t::onAppInitialized(smart_refctd_ptr(system)))
 				return false;
+		#ifdef _NBL_COMPILE_WITH_MITSUBA_SERIALIZED_LOADER_
+			m_assetMgr->addAssetLoader(make_smart_refctd_ptr<ext::MitsubaLoader::CSerializedLoader>());
+		#endif
 			if (!device_base_t::onAppInitialized(smart_refctd_ptr(system)))
 				return false;
 
@@ -36,9 +42,6 @@ class MeshLoadersApp final : public MonoWindowApplication, public BuiltinResourc
 					return logFail("Couldn't create Command Buffer!");
 			}
 			
-			//! cache results -- speeds up mesh generation on second run
-			m_qnc = make_smart_refctd_ptr<CQuantNormalCache>();
-			m_qnc->loadCacheFromFile<EF_R8G8B8_SNORM>(m_system.get(),sharedOutputCWD/"../../tmp/normalCache888.sse");
 
 			auto scRes = static_cast<CDefaultSwapchainFramebuffers*>(m_surface->getSwapchainResources());
 			m_renderer = CSimpleDebugRenderer::create(m_assetMgr.get(),scRes->getRenderpass(),0,{});
@@ -246,7 +249,6 @@ class MeshLoadersApp final : public MonoWindowApplication, public BuiltinResourc
 			//! load the geometry
 			IAssetLoader::SAssetLoadParams params = {};
 			params.logger = m_logger.get();
-			params.meshManipulatorOverride = nullptr; // TODO
 			auto bundle = m_assetMgr->getAsset(m_modelPath,params);
 			if (bundle.getContents().empty())
 				return false;
@@ -266,9 +268,6 @@ class MeshLoadersApp final : public MonoWindowApplication, public BuiltinResourc
 			}
 			if (geometries.empty())
 				return false;
-
-			//! cache results -- speeds up mesh generation on second run
-			m_qnc->saveCacheToFile<EF_R8G8B8_SNORM>(m_system.get(),sharedOutputCWD/"../../tmp/normalCache888.sse");
 			
 			auto bound = hlsl::shapes::AABB<3,double>::create();
 			// convert the geometries
@@ -395,7 +394,6 @@ class MeshLoadersApp final : public MonoWindowApplication, public BuiltinResourc
 		// Maximum frames which can be simultaneously submitted, used to cycle through our per-frame resources like command buffers
 		constexpr static inline uint32_t MaxFramesInFlight = 3u;
 		//
-		smart_refctd_ptr<CQuantNormalCache> m_qnc;
 		smart_refctd_ptr<CSimpleDebugRenderer> m_renderer;
 		//
 		smart_refctd_ptr<ISemaphore> m_semaphore;

From d51c739115bf58adf5f2f7f32a08162a9e019dfd Mon Sep 17 00:00:00 2001
From: devsh <devsh@devsh.eu>
Date: Tue, 1 Jul 2025 17:31:23 +0200
Subject: [PATCH 458/529] start extending the mesh loader example for
 conditionally enabled extension loaders

---
 12_MeshLoaders/CMakeLists.txt | 12 +++++++++++-
 1 file changed, 11 insertions(+), 1 deletion(-)

diff --git a/12_MeshLoaders/CMakeLists.txt b/12_MeshLoaders/CMakeLists.txt
index 2dd253226..dee195066 100644
--- a/12_MeshLoaders/CMakeLists.txt
+++ b/12_MeshLoaders/CMakeLists.txt
@@ -1,8 +1,18 @@
 set(NBL_INCLUDE_SERACH_DIRECTORIES
 	"${CMAKE_CURRENT_SOURCE_DIR}/include"
 )
+set(NBL_LIBRARIES)
+
+if (NBL_BUILD_MITSUBA_LOADER)
+	list(APPEND NBL_INCLUDE_SERACH_DIRECTORIES
+		"${NBL_EXT_MITSUBA_LOADER_INCLUDE_DIRS}"
+	)
+	list(APPEND NBL_LIBRARIES
+		"${NBL_EXT_MITSUBA_LOADER_LIB}"
+	)
+endif()
 
 	# TODO; Arek I removed `NBL_EXECUTABLE_PROJECT_CREATION_PCH_TARGET` from the last parameter here, doesn't this macro have 4 arguments anyway !?
-nbl_create_executable_project("" "" "${NBL_INCLUDE_SERACH_DIRECTORIES}" "" "")
+nbl_create_executable_project("" "" "${NBL_INCLUDE_SERACH_DIRECTORIES}" "${NBL_LIBRARIES}")
 # TODO: Arek temporarily disabled cause I haven't figured out how to make this target yet
 # LINK_BUILTIN_RESOURCES_TO_TARGET(${EXECUTABLE_NAME} nblExamplesGeometrySpirvBRD)
\ No newline at end of file

From 050a2010a2aa89171d1f12d40c567c7348f2c9c4 Mon Sep 17 00:00:00 2001
From: Przemek <minikers21@gmail.com>
Date: Tue, 1 Jul 2025 17:37:44 +0200
Subject: [PATCH 459/529] Moved grid DTM color calculation from screen to grid
 space

---
 62_CAD/main.cpp                               |  4 +-
 62_CAD/shaders/main_pipeline/dtm.hlsl         | 10 +----
 .../main_pipeline/fragment_shader.hlsl        | 37 +++++--------------
 .../shaders/main_pipeline/vertex_shader.hlsl  |  6 ---
 4 files changed, 14 insertions(+), 43 deletions(-)

diff --git a/62_CAD/main.cpp b/62_CAD/main.cpp
index d869ecc56..58ccf625d 100644
--- a/62_CAD/main.cpp
+++ b/62_CAD/main.cpp
@@ -1495,6 +1495,8 @@ class ComputerAidedDesign final : public examples::SimpleWindowedApplication, pu
 
 		float64_t3x3 projectionToNDC;
 		projectionToNDC = m_Camera.constructViewProjection();
+
+		// TEST CAMERA ROTATION
 #if 0
 		double rotation = 0.25 * PI<double>();
 		float64_t2 rotationVec = float64_t2(cos(rotation), sin(rotation));
@@ -3664,7 +3666,7 @@ class ComputerAidedDesign final : public examples::SimpleWindowedApplication, pu
 			worldSpaceExtents.y = (heightMapExtent.height - 1) * HeightMapCellWidth;
 			const uint64_t heightMapTextureID = 0ull;
 
-			constexpr bool DrawGridOnly = true;
+			constexpr bool DrawGridOnly = false;
 			
 			if(DrawGridOnly)
 			{
diff --git a/62_CAD/shaders/main_pipeline/dtm.hlsl b/62_CAD/shaders/main_pipeline/dtm.hlsl
index 0fb35fab3..6f50a9384 100644
--- a/62_CAD/shaders/main_pipeline/dtm.hlsl
+++ b/62_CAD/shaders/main_pipeline/dtm.hlsl
@@ -469,7 +469,7 @@ GridDTMHeightMapData retrieveGridDTMCellDataFromHeightMap(in float2 gridDimensio
     return output;
 }
 
-GridDTMCell calculateCellTriangles(in dtm::GridDTMHeightMapData heightData, in float2 topLeft, in float2 cellCoords, const float cellWidth)
+GridDTMCell calculateCellTriangles(in dtm::GridDTMHeightMapData heightData, in float2 cellCoords, const float cellWidth)
 {
     GridDTMCell output;
 
@@ -503,14 +503,6 @@ GridDTMCell calculateCellTriangles(in dtm::GridDTMHeightMapData heightData, in f
     output.validA = !isInvalidGridDtmHeightValue(output.triangleA.vertices[0].z) && !isInvalidGridDtmHeightValue(output.triangleA.vertices[1].z) && !isInvalidGridDtmHeightValue(output.triangleA.vertices[2].z);
     output.validB = !isInvalidGridDtmHeightValue(output.triangleB.vertices[0].z) && !isInvalidGridDtmHeightValue(output.triangleB.vertices[1].z) && !isInvalidGridDtmHeightValue(output.triangleB.vertices[2].z);
 
-    // move from grid space to screen space
-    [unroll]
-    for (int i = 0; i < 3; ++i)
-    {
-        output.triangleA.vertices[i].xy += topLeft;
-        output.triangleB.vertices[i].xy += topLeft;
-    }
-
     return output;
 }
 
diff --git a/62_CAD/shaders/main_pipeline/fragment_shader.hlsl b/62_CAD/shaders/main_pipeline/fragment_shader.hlsl
index 1fd08db79..7d16bd263 100644
--- a/62_CAD/shaders/main_pipeline/fragment_shader.hlsl
+++ b/62_CAD/shaders/main_pipeline/fragment_shader.hlsl
@@ -414,7 +414,6 @@ float4 fragMain(PSInput input) : SV_TARGET
             float2 uv = input.getImageUV();
             const uint32_t textureId = input.getGridDTMHeightTextureID();
 
-            float2 gridTopLeftCorner = 0.0f;
             float2 gridExtents = input.getGridDTMScreenSpaceGridExtents();
             const float cellWidth = input.getGridDTMScreenSpaceCellWidth();
             // TODO: I think we can get it from the height map size if texture is valid?!, better if it comes directly from CPU side, vertex shader or something, division + round to integer is error-prone for large integer values
@@ -444,9 +443,9 @@ float4 fragMain(PSInput input) : SV_TARGET
                 nbl::hlsl::shapes::Line<float> outlineLineSegments[2];
                 
                 const float halfCellWidth = cellWidth * 0.5f;
-                const float2 horizontalBounds = float2(gridTopLeftCorner.y, gridTopLeftCorner.y + gridExtents.y);
-                const float2 verticalBounds = float2(gridTopLeftCorner.x, gridTopLeftCorner.x + gridExtents.x);
-                float2 nearestLineRemainingCoords = int2((gridSpacePos + halfCellWidth) / cellWidth) * cellWidth + gridTopLeftCorner;
+                const float2 horizontalBounds = float2(0.0f, gridExtents.y);
+                const float2 verticalBounds = float2(0.0f, gridExtents.x);
+                float2 nearestLineRemainingCoords = int2((gridSpacePos + halfCellWidth) / cellWidth) * cellWidth;
                 // shift lines outside of the grid to a bound
                 nearestLineRemainingCoords.x = clamp(nearestLineRemainingCoords.x, verticalBounds.x, verticalBounds.y);
                 nearestLineRemainingCoords.y = clamp(nearestLineRemainingCoords.y, horizontalBounds.x, horizontalBounds.y);
@@ -497,7 +496,7 @@ float4 fragMain(PSInput input) : SV_TARGET
                 // curr cell horizontal, curr cell vertical, opposite cell horizontal, opposite cell vertical 
                 bool4 linesValidity = bool4(false, false, false, false);
 
-                //[unroll]
+                [unroll]
                 for (int i = 0; i < 2; ++i)
                 {
                     for (int j = 0; j < 2; ++j)
@@ -509,7 +508,7 @@ float4 fragMain(PSInput input) : SV_TARGET
                         if (isCellWithinRange)
                         {
                             dtm::GridDTMHeightMapData heightData = dtm::retrieveGridDTMCellDataFromHeightMap(gridDimensions, cellCoord, texturesU32[NonUniformResourceIndex(textureId)]);
-                            dtm::GridDTMCell gridCellFormed = dtm::calculateCellTriangles(heightData, gridTopLeftCorner, cellCoord, cellWidth);
+                            dtm::GridDTMCell gridCellFormed = dtm::calculateCellTriangles(heightData, cellCoord, cellWidth);
                             if (gridCellFormed.validA)
                                 triangles[triangleCount++] = gridCellFormed.triangleA;
                             if (gridCellFormed.validB)
@@ -550,7 +549,7 @@ float4 fragMain(PSInput input) : SV_TARGET
                 for (int t = 0; t < triangleCount; ++t)
                 {
                     dtm::GridDTMTriangle tri = triangles[t];
-                    const float3 baryCoord = dtm::calculateDTMTriangleBarycentrics(tri.vertices[0].xy, tri.vertices[1].xy, tri.vertices[2].xy, input.position.xy);
+                    const float3 baryCoord = dtm::calculateDTMTriangleBarycentrics(tri.vertices[0].xy, tri.vertices[1].xy, tri.vertices[2].xy, gridSpacePos);
                     interpolatedHeights[t] = baryCoord.x * tri.vertices[0].z + baryCoord.y * tri.vertices[1].z + baryCoord.z * tri.vertices[2].z;
 
                     if (currentTriangleIndex == InvalidTriangleIndex)
@@ -573,22 +572,7 @@ float4 fragMain(PSInput input) : SV_TARGET
                         {
                             const dtm::GridDTMTriangle tri = triangles[t];
                             const float currentInterpolatedHeight = interpolatedHeights[t];
-                            sdf = min(sdf, dtm::calculateDTMContourSDF(dtmSettings.contourSettings[i], contourStyle, tri.vertices, input.position.xy, currentInterpolatedHeight));
-#if 0 // Debug Triangles
-                            nbl::hlsl::shapes::Line<float> lineSegment;
-                            lineSegment.P0 = tri.vertices[0].xy;
-                            lineSegment.P1 = tri.vertices[1].xy;
-                            float distance = ClippedSignedDistance<nbl::hlsl::shapes::Line<float> >::sdf(lineSegment, input.position.xy, 1.0f, false);
-                            sdf = min(sdf, distance);
-                            lineSegment.P0 = tri.vertices[1].xy;
-                            lineSegment.P1 = tri.vertices[2].xy;
-                            distance = ClippedSignedDistance<nbl::hlsl::shapes::Line<float> >::sdf(lineSegment, input.position.xy, 1.0f, false);
-                            sdf = min(sdf, distance);
-                            lineSegment.P0 = tri.vertices[0].xy;
-                            lineSegment.P1 = tri.vertices[2].xy;
-                            distance = ClippedSignedDistance<nbl::hlsl::shapes::Line<float> >::sdf(lineSegment, input.position.xy, 1.0f, false);
-                            sdf = min(sdf, distance);
-#endif
+                            sdf = min(sdf, dtm::calculateDTMContourSDF(dtmSettings.contourSettings[i], contourStyle, tri.vertices, gridSpacePos, currentInterpolatedHeight));
                         }
                         
                         float4 contourColor = contourStyle.color; contourColor.a = 0.5f;
@@ -604,10 +588,9 @@ float4 fragMain(PSInput input) : SV_TARGET
                     nbl::hlsl::shapes::Line<float> lineSegment;
 
                     // Doing SDF of outlines as if cooridnate system is centered around the nearest corner of the cell
-                    float2 currentCellScreenspaceCoord = gridTopLeftCorner + (currentCellCoord + float2(roundedLocalUV)) * cellWidth;
+                    float2 localGridTopLeftCorner = (currentCellCoord + float2(roundedLocalUV)) * cellWidth;
                     // We do sdf in corner's local coordinate, so we subtract currentCellScreenspaceCoord from fragmentPos and topLeftGrid 
-                    float2 localFragPos = input.position.xy - currentCellScreenspaceCoord;
-                    float2 localGridTopLeftCorner = gridTopLeftCorner - currentCellScreenspaceCoord;
+                    float2 localFragPos = gridSpacePos - localGridTopLeftCorner;
                     
                     float phaseShift = 0.0f;
                     const bool hasStipples = outlineStyle.hasStipples();
@@ -657,7 +640,7 @@ float4 fragMain(PSInput input) : SV_TARGET
                     {
                         dtm::GridDTMTriangle currentTriangle = triangles[currentTriangleIndex];
                         float heightDeriv = fwidth(interpolatedHeights[currentTriangleIndex]);
-                        dtmColor = dtm::blendUnder(dtmColor, dtm::calculateDTMHeightColor(dtmSettings.heightShadingSettings, currentTriangle.vertices, heightDeriv, input.position.xy, interpolatedHeights[currentTriangleIndex]));
+                        dtmColor = dtm::blendUnder(dtmColor, dtm::calculateDTMHeightColor(dtmSettings.heightShadingSettings, currentTriangle.vertices, heightDeriv, gridSpacePos, interpolatedHeights[currentTriangleIndex]));
                     }
                     else
                     {
diff --git a/62_CAD/shaders/main_pipeline/vertex_shader.hlsl b/62_CAD/shaders/main_pipeline/vertex_shader.hlsl
index 3a5a74c10..2b54d79e7 100644
--- a/62_CAD/shaders/main_pipeline/vertex_shader.hlsl
+++ b/62_CAD/shaders/main_pipeline/vertex_shader.hlsl
@@ -698,12 +698,6 @@ PSInput main(uint vertexID : SV_VertexID)
 
             const float2 uv = corner + uvOffset;
             outV.setImageUV(uv);
-            /*printf("uv = { %f, %f } scale = { %f, %f }", _static_cast<float>(uv.x), _static_cast<float>(uv.y), _static_cast<float>(uvScale.x), _static_cast<float>(uvScale.y));*/
-
-            // TODO: test dilation
-            pfloat64_t2 topLeftToGridCenterVector = worldSpaceExtents * 0.5;
-            topLeftToGridCenterVector.y = -topLeftToGridCenterVector.y;
-            pfloat64_t2 gridCenter = topLeft + topLeftToGridCenterVector;
 
             const pfloat64_t2 vtxPos = topLeft + float2(worldSpaceExtents.x, -worldSpaceExtents.y) * corner;
             const pfloat64_t2 dilatedVtxPos = vtxPos + dilationVector;

From adbd5f02050ce7d09bda8681b664fe25a13a9387 Mon Sep 17 00:00:00 2001
From: devsh <devsh@devsh.eu>
Date: Wed, 2 Jul 2025 01:48:16 +0200
Subject: [PATCH 460/529] display reconstructed (face) normals when per-vertex
 aren't available

---
 12_MeshLoaders/main.cpp                       |  2 +-
 .../geometry/CSimpleDebugRenderer.hpp         |  4 +--
 .../nbl/examples/geometry/SPushConstants.hlsl |  3 --
 .../examples/geometry/shaders/unified.hlsl    | 33 ++++++++++++-------
 4 files changed, 24 insertions(+), 18 deletions(-)

diff --git a/12_MeshLoaders/main.cpp b/12_MeshLoaders/main.cpp
index a941f9655..6eef82ebe 100644
--- a/12_MeshLoaders/main.cpp
+++ b/12_MeshLoaders/main.cpp
@@ -17,7 +17,7 @@ class MeshLoadersApp final : public MonoWindowApplication, public BuiltinResourc
 	public:
 		inline MeshLoadersApp(const path& _localInputCWD, const path& _localOutputCWD, const path& _sharedInputCWD, const path& _sharedOutputCWD)
 			: IApplicationFramework(_localInputCWD, _localOutputCWD, _sharedInputCWD, _sharedOutputCWD),
-			device_base_t({1280,720}, EF_UNKNOWN, _localInputCWD, _localOutputCWD, _sharedInputCWD, _sharedOutputCWD) {}
+			device_base_t({1280,720}, EF_D32_SFLOAT, _localInputCWD, _localOutputCWD, _sharedInputCWD, _sharedOutputCWD) {}
 
 		inline bool onAppInitialized(smart_refctd_ptr<ISystem>&& system) override
 		{
diff --git a/common/include/nbl/examples/geometry/CSimpleDebugRenderer.hpp b/common/include/nbl/examples/geometry/CSimpleDebugRenderer.hpp
index 969b3afd8..f3565d24d 100644
--- a/common/include/nbl/examples/geometry/CSimpleDebugRenderer.hpp
+++ b/common/include/nbl/examples/geometry/CSimpleDebugRenderer.hpp
@@ -161,7 +161,7 @@ class CSimpleDebugRenderer final : public core::IReferenceCounted
 
 			// create pipeline layout
 			const SPushConstantRange ranges[] = {{
-				.stageFlags = hlsl::ShaderStage::ESS_VERTEX,
+				.stageFlags = hlsl::ShaderStage::ESS_VERTEX|hlsl::ShaderStage::ESS_FRAGMENT,
 				.offset = 0,
 				.size = sizeof(SInstance::SPushConstants),
 			}};
@@ -386,7 +386,7 @@ class CSimpleDebugRenderer final : public core::IReferenceCounted
 				const auto* geo = instance.packedGeo;
 				cmdbuf->bindGraphicsPipeline(geo->pipeline.get());
 				const auto pc = instance.computePushConstants(viewParams);
-				cmdbuf->pushConstants(layout,hlsl::ShaderStage::ESS_VERTEX,0,sizeof(pc),&pc);
+				cmdbuf->pushConstants(layout,hlsl::ShaderStage::ESS_VERTEX|hlsl::ShaderStage::ESS_FRAGMENT,0,sizeof(pc),&pc);
 				if (geo->indexBuffer)
 				{
 					cmdbuf->bindIndexBuffer(geo->indexBuffer,geo->indexType);
diff --git a/common/include/nbl/examples/geometry/SPushConstants.hlsl b/common/include/nbl/examples/geometry/SPushConstants.hlsl
index 932210d0d..91cca803b 100644
--- a/common/include/nbl/examples/geometry/SPushConstants.hlsl
+++ b/common/include/nbl/examples/geometry/SPushConstants.hlsl
@@ -22,10 +22,7 @@ struct SInstanceMatrices
 
 struct SPushConstants
 {
-	// no idea if DXC still has this bug with Push Constant static variables
-#ifndef __HLSL_VERSiON
 	NBL_CONSTEXPR_STATIC_INLINE uint32_t DescriptorCount = 255;
-#endif
 
 	SInstanceMatrices matrices;
 	uint32_t positionView : 11;
diff --git a/common/src/nbl/examples/geometry/shaders/unified.hlsl b/common/src/nbl/examples/geometry/shaders/unified.hlsl
index bc6b6e13a..07bdbbd5e 100644
--- a/common/src/nbl/examples/geometry/shaders/unified.hlsl
+++ b/common/src/nbl/examples/geometry/shaders/unified.hlsl
@@ -4,7 +4,7 @@ using namespace nbl::hlsl;
 using namespace nbl::hlsl::examples::geometry_creator_scene;
 
 // for dat sweet programmable pulling
-[[vk::binding(0)]] Buffer<float32_t4> utbs[/*SPushConstants::DescriptorCount*/255];
+[[vk::binding(0)]] Buffer<float32_t4> utbs[SPushConstants::DescriptorCount];
 
 //
 [[vk::push_constant]] SPushConstants pc;
@@ -12,11 +12,20 @@ using namespace nbl::hlsl::examples::geometry_creator_scene;
 //
 struct SInterpolants
 {
-	float32_t4 position : SV_Position;
-	float32_t3 meta : COLOR0;
+	float32_t4 ndc : SV_Position;
+	float32_t3 meta : COLOR1;
 };
 #include "nbl/builtin/hlsl/math/linalg/fast_affine.hlsl"
 
+float32_t3 reconstructGeometricNormal(float32_t3 pos)
+{
+    const float32_t2x3 dPos_dScreen = float32_t2x3(
+        ddx(pos),
+        ddy(pos)
+    );
+    return cross(dPos_dScreen[0],dPos_dScreen[1]);
+}
+
 //
 [shader("vertex")]
 SInterpolants BasicVS(uint32_t VertexIndex : SV_VertexID)
@@ -24,14 +33,18 @@ SInterpolants BasicVS(uint32_t VertexIndex : SV_VertexID)
     const float32_t3 position = utbs[pc.positionView][VertexIndex].xyz;
 
     SInterpolants output;
-    output.position = math::linalg::promoted_mul(pc.matrices.worldViewProj,position);
-    output.meta = mul(pc.matrices.normal,utbs[pc.normalView][VertexIndex].xyz);
+    output.ndc = math::linalg::promoted_mul(pc.matrices.worldViewProj,position);
+    if (pc.normalView<SPushConstants::DescriptorCount)
+        output.meta = mul(pc.matrices.normal,utbs[pc.normalView][VertexIndex].xyz);
+    else
+        output.meta = mul(inverse(transpose(pc.matrices.normal)),position);
     return output;
 }
 [shader("pixel")]
 float32_t4 BasicFS(SInterpolants input) : SV_Target0
 {
-    return float32_t4(normalize(input.meta)*0.5f+promote<float32_t3>(0.5f),1.f);
+    const float32_t3 normal = pc.normalView<SPushConstants::DescriptorCount ? input.meta:reconstructGeometricNormal(input.meta);
+    return float32_t4(normalize(normal)*0.5f+promote<float32_t3>(0.5f),1.f);
 }
 
 // TODO: do smooth normals on the cone
@@ -41,17 +54,13 @@ SInterpolants ConeVS(uint32_t VertexIndex : SV_VertexID)
     const float32_t3 position = utbs[pc.positionView][VertexIndex].xyz;
 
     SInterpolants output;
-    output.position = math::linalg::promoted_mul(pc.matrices.worldViewProj,position);
+    output.ndc = math::linalg::promoted_mul(pc.matrices.worldViewProj,position);
     output.meta = mul(inverse(transpose(pc.matrices.normal)),position);
     return output;
 }
 [shader("pixel")]
 float32_t4 ConeFS(SInterpolants input) : SV_Target0
 {
-    const float32_t2x3 dViewPos_dScreen = float32_t2x3(
-        ddx(input.meta),
-        ddy(input.meta)
-    );
-    const float32_t3 normal = cross(dViewPos_dScreen[0],dViewPos_dScreen[1]);
+    const float32_t3 normal = reconstructGeometricNormal(input.meta);
     return float32_t4(normalize(normal)*0.5f+promote<float32_t3>(0.5f),1.f);
 }
\ No newline at end of file

From 13a835bc851fe7e96bbe1da940c3784cb11fedd9 Mon Sep 17 00:00:00 2001
From: devsh <devsh@devsh.eu>
Date: Wed, 2 Jul 2025 02:34:12 +0200
Subject: [PATCH 461/529] fix `removeGeometry` to actually deallocate slots in
 the Suballocated Descriptor sets and no null-free

---
 .../geometry/CSimpleDebugRenderer.hpp         | 19 ++++++++++---------
 1 file changed, 10 insertions(+), 9 deletions(-)

diff --git a/common/include/nbl/examples/geometry/CSimpleDebugRenderer.hpp b/common/include/nbl/examples/geometry/CSimpleDebugRenderer.hpp
index f3565d24d..9b27ab190 100644
--- a/common/include/nbl/examples/geometry/CSimpleDebugRenderer.hpp
+++ b/common/include/nbl/examples/geometry/CSimpleDebugRenderer.hpp
@@ -57,9 +57,10 @@ class CSimpleDebugRenderer final : public core::IReferenceCounted
 			asset::SBufferBinding<const video::IGPUBuffer> indexBuffer = {};
 			uint32_t elementCount = 0;
 			// indices into the descriptor set
-			uint8_t positionView = 0;
-			uint8_t normalView = 0;
-			uint8_t uvView = 0;
+			constexpr static inline auto MissingView = hlsl::examples::geometry_creator_scene::SPushConstants::DescriptorCount;
+			uint8_t positionView = MissingView;
+			uint8_t normalView = MissingView;
+			uint8_t uvView = MissingView;
 			asset::E_INDEX_TYPE indexType = asset::EIT_UNKNOWN;
 		};
 		//
@@ -137,7 +138,7 @@ class CSimpleDebugRenderer final : public core::IReferenceCounted
 							// need this trifecta of flags for `SubAllocatedDescriptorSet` to accept the binding as suballocatable
 							.createFlags = binding_flags_t::ECF_UPDATE_AFTER_BIND_BIT|binding_flags_t::ECF_UPDATE_UNUSED_WHILE_PENDING_BIT |binding_flags_t::ECF_PARTIALLY_BOUND_BIT,
 							.stageFlags = IShader::E_SHADER_STAGE::ESS_VERTEX|IShader::E_SHADER_STAGE::ESS_FRAGMENT,
-							.count = SInstance::SPushConstants::DescriptorCount
+							.count = SPackedGeometry::MissingView
 						}
 					};
 					dsLayout = device->createDescriptorSetLayout(bindings);
@@ -249,10 +250,10 @@ class CSimpleDebugRenderer final : public core::IReferenceCounted
 			auto allocateUTB = [&](const IGeometry<const IGPUBuffer>::SDataView& view)->uint8_t
 			{
 				if (!view)
-					return SInstance::SPushConstants::DescriptorCount;
+					return SPackedGeometry::MissingView;
 				auto index = SubAllocatedDescriptorSet::invalid_value;
 				if (m_params.subAllocDS->multi_allocate(VertexAttrubUTBDescBinding,1,&index)!=0)
-					return SInstance::SPushConstants::DescriptorCount;
+					return SPackedGeometry::MissingView;
 				const auto retval = infos.size();
 				infos.emplace_back().desc = device->createBufferView(view.src,view.composed.format);
 				writes.emplace_back() = {
@@ -340,6 +341,8 @@ class CSimpleDebugRenderer final : public core::IReferenceCounted
 			deferredFree.reserve(3);
 			auto deallocate = [&](SubAllocatedDescriptorSet::value_type index)->void
 			{
+				if (index>=SPackedGeometry::MissingView)
+					return;
 				if (info.semaphore)
 					deferredFree.push_back(index);
 				else
@@ -353,9 +356,7 @@ class CSimpleDebugRenderer final : public core::IReferenceCounted
 
 			if (deferredFree.empty())
 				return;
-
-			core::vector<IGPUDescriptorSet::SDropDescriptorSet> nullify(deferredFree.size());
-			const_cast<ILogicalDevice*>(m_params.layout->getOriginDevice())->nullifyDescriptors(nullify);
+			m_params.subAllocDS->multi_deallocate(VertexAttrubUTBDescBinding,deferredFree.size(),deferredFree.data(),info);
 		}
 
 		//

From 1b3a341ac538340e32f9e6ad4111f41ace89fee1 Mon Sep 17 00:00:00 2001
From: devsh <devsh@devsh.eu>
Date: Wed, 2 Jul 2025 13:01:49 +0200
Subject: [PATCH 462/529] Find the bug causing us to crash on mesh reload.

---
 12_MeshLoaders/main.cpp                                  | 8 ++++----
 common/include/nbl/examples/geometry/SPushConstants.hlsl | 7 +++----
 2 files changed, 7 insertions(+), 8 deletions(-)

diff --git a/12_MeshLoaders/main.cpp b/12_MeshLoaders/main.cpp
index 6eef82ebe..cd19fddc0 100644
--- a/12_MeshLoaders/main.cpp
+++ b/12_MeshLoaders/main.cpp
@@ -5,7 +5,7 @@
 
 #include "../3rdparty/portable-file-dialogs/portable-file-dialogs.h"
 
-#ifdef _NBL_COMPILE_WITH_MITSUBA_SERIALIZED_LOADER_
+#ifdef NBL_BUILD_MITSUBA_LOADER
 #include "nbl/ext/MitsubaLoader/CSerializedLoader.h"
 #endif
 
@@ -23,7 +23,7 @@ class MeshLoadersApp final : public MonoWindowApplication, public BuiltinResourc
 		{
 			if (!asset_base_t::onAppInitialized(smart_refctd_ptr(system)))
 				return false;
-		#ifdef _NBL_COMPILE_WITH_MITSUBA_SERIALIZED_LOADER_
+		#ifdef NBL_BUILD_MITSUBA_LOADER
 			m_assetMgr->addAssetLoader(make_smart_refctd_ptr<ext::MitsubaLoader::CSerializedLoader>());
 		#endif
 			if (!device_base_t::onAppInitialized(smart_refctd_ptr(system)))
@@ -218,9 +218,9 @@ class MeshLoadersApp final : public MonoWindowApplication, public BuiltinResourc
 
 	private:
 		// TODO: standardise this across examples, and take from `argv`
-		bool m_nonInteractiveTest = true;
+		bool m_nonInteractiveTest = false;
 
-		inline bool reloadModel()
+		bool reloadModel()
 		{
 			if (m_nonInteractiveTest) // TODO: maybe also take from argv and argc
 				m_modelPath = (sharedInputCWD/"ply/Spanner-ply.ply").string();
diff --git a/common/include/nbl/examples/geometry/SPushConstants.hlsl b/common/include/nbl/examples/geometry/SPushConstants.hlsl
index 91cca803b..74cbfd565 100644
--- a/common/include/nbl/examples/geometry/SPushConstants.hlsl
+++ b/common/include/nbl/examples/geometry/SPushConstants.hlsl
@@ -22,12 +22,11 @@ struct SInstanceMatrices
 
 struct SPushConstants
 {
-	NBL_CONSTEXPR_STATIC_INLINE uint32_t DescriptorCount = 255;
+	NBL_CONSTEXPR_STATIC_INLINE uint32_t DescriptorCount = (0x1<<16)-1;
 
 	SInstanceMatrices matrices;
-	uint32_t positionView : 11;
-	uint32_t normalView : 10;
-	uint32_t uvView : 11;
+	uint32_t positionView : 16;
+	uint32_t normalView : 16;
 };
 
 }

From 0144b2d3a8a41a2cab0f3d6693188d92c84ce727 Mon Sep 17 00:00:00 2001
From: devsh <devsh@devsh.eu>
Date: Wed, 2 Jul 2025 13:02:37 +0200
Subject: [PATCH 463/529] add missing file from last commit

---
 .../geometry/CSimpleDebugRenderer.hpp         | 24 +++++++++----------
 1 file changed, 12 insertions(+), 12 deletions(-)

diff --git a/common/include/nbl/examples/geometry/CSimpleDebugRenderer.hpp b/common/include/nbl/examples/geometry/CSimpleDebugRenderer.hpp
index 9b27ab190..f75ac9009 100644
--- a/common/include/nbl/examples/geometry/CSimpleDebugRenderer.hpp
+++ b/common/include/nbl/examples/geometry/CSimpleDebugRenderer.hpp
@@ -58,9 +58,8 @@ class CSimpleDebugRenderer final : public core::IReferenceCounted
 			uint32_t elementCount = 0;
 			// indices into the descriptor set
 			constexpr static inline auto MissingView = hlsl::examples::geometry_creator_scene::SPushConstants::DescriptorCount;
-			uint8_t positionView = MissingView;
-			uint8_t normalView = MissingView;
-			uint8_t uvView = MissingView;
+			uint16_t positionView = MissingView;
+			uint16_t normalView = MissingView;
 			asset::E_INDEX_TYPE indexType = asset::EIT_UNKNOWN;
 		};
 		//
@@ -73,8 +72,7 @@ class CSimpleDebugRenderer final : public core::IReferenceCounted
 				return {
 					.matrices = viewParams.computeForInstance(world),
 					.positionView = packedGeo->positionView,
-					.normalView = packedGeo->normalView,
-					.uvView = packedGeo->uvView
+					.normalView = packedGeo->normalView
 				};
 			}
 
@@ -247,24 +245,30 @@ class CSimpleDebugRenderer final : public core::IReferenceCounted
 
 			core::vector<IGPUDescriptorSet::SWriteDescriptorSet> writes;
 			core::vector<IGPUDescriptorSet::SDescriptorInfo> infos;
+			bool anyFailed = false;
 			auto allocateUTB = [&](const IGeometry<const IGPUBuffer>::SDataView& view)->uint8_t
 			{
 				if (!view)
 					return SPackedGeometry::MissingView;
 				auto index = SubAllocatedDescriptorSet::invalid_value;
 				if (m_params.subAllocDS->multi_allocate(VertexAttrubUTBDescBinding,1,&index)!=0)
+				{
+					anyFailed = true;
 					return SPackedGeometry::MissingView;
-				const auto retval = infos.size();
+				}
+				const auto infosOffset = infos.size();
 				infos.emplace_back().desc = device->createBufferView(view.src,view.composed.format);
 				writes.emplace_back() = {
 					.dstSet = m_params.subAllocDS->getDescriptorSet(),
 					.binding = VertexAttrubUTBDescBinding,
 					.arrayElement = index,
 					.count = 1,
-					.info = reinterpret_cast<const IGPUDescriptorSet::SDescriptorInfo*>(retval)
+					.info = reinterpret_cast<const IGPUDescriptorSet::SDescriptorInfo*>(infosOffset)
 				};
-				return retval;
+				return index;
 			};
+			if (anyFailed)
+				device->getLogger()->log("Failed to allocate a UTB for some geometries, probably ran out of space in Descriptor Set!",system::ILogger::ELL_ERROR);
 
 			auto sizeToSet = m_geoms.size();
 			auto resetGeoms = core::makeRAIIExiter([&]()->void
@@ -309,9 +313,6 @@ class CSimpleDebugRenderer final : public core::IReferenceCounted
 				out.elementCount = geom->getVertexReferenceCount();
 				out.positionView = allocateUTB(geom->getPositionView());
 				out.normalView = allocateUTB(geom->getNormalView());
-				// the first view is usually the UV
-				if (const auto& auxViews = geom->getAuxAttributeViews(); !auxViews.empty())
-					out.uvView = allocateUTB(auxViews.front());
 			}
 
 			// no geometry
@@ -351,7 +352,6 @@ class CSimpleDebugRenderer final : public core::IReferenceCounted
 			auto geo = m_geoms.begin() + ix;
 			deallocate(geo->positionView);
 			deallocate(geo->normalView);
-			deallocate(geo->uvView);
 			m_geoms.erase(geo);
 
 			if (deferredFree.empty())

From 6e055333acaa43051b83860eeba6728b4699c82b Mon Sep 17 00:00:00 2001
From: devsh <devsh@devsh.eu>
Date: Wed, 2 Jul 2025 15:49:19 +0200
Subject: [PATCH 464/529] lay out separate geomtries along the X-axis

---
 12_MeshLoaders/main.cpp | 38 +++++++++++++++++++++++++++-----------
 1 file changed, 27 insertions(+), 11 deletions(-)

diff --git a/12_MeshLoaders/main.cpp b/12_MeshLoaders/main.cpp
index cd19fddc0..5d563584b 100644
--- a/12_MeshLoaders/main.cpp
+++ b/12_MeshLoaders/main.cpp
@@ -268,8 +268,13 @@ class MeshLoadersApp final : public MonoWindowApplication, public BuiltinResourc
 			}
 			if (geometries.empty())
 				return false;
-			
-			auto bound = hlsl::shapes::AABB<3,double>::create();
+
+			using aabb_t = hlsl::shapes::AABB<3,double>;
+			auto printAABB = [&](const aabb_t& aabb, const char* extraMsg="")->void
+			{
+				m_logger->log("%s AABB is (%f,%f,%f) -> (%f,%f,%f)",ILogger::ELL_INFO,extraMsg,aabb.minVx.x,aabb.minVx.y,aabb.minVx.z,aabb.maxVx.x,aabb.maxVx.y,aabb.maxVx.z);
+			};
+			auto bound = aabb_t::create();
 			// convert the geometries
 			{
 				smart_refctd_ptr<CAssetConverter> converter = CAssetConverter::create({.device=m_device.get()});
@@ -344,28 +349,39 @@ class MeshLoadersApp final : public MonoWindowApplication, public BuiltinResourc
 						return false;
 					}
 				}
-
+				
+				auto tmp = hlsl::float32_t4x3(
+					hlsl::float32_t3(1,0,0),
+					hlsl::float32_t3(0,1,0),
+					hlsl::float32_t3(0,0,1),
+					hlsl::float32_t3(0,0,0)
+				);
+				core::vector<hlsl::float32_t3x4> worldTforms;
 				const auto& converted = reservation.getGPUObjects<ICPUPolygonGeometry>();
 				for (const auto& geom : converted)
 				{
-					geom.value->visitAABB([&bound](const auto& aabb)->void
+					geom.value->visitAABB([&bound,&worldTforms,&tmp,&printAABB](const auto& aabb)->void
 						{
 							hlsl::shapes::AABB<3,double> promoted;
 							promoted.minVx = aabb.minVx;
 							promoted.maxVx = aabb.maxVx;
-							bound = hlsl::shapes::util::union_(promoted,bound);
+							printAABB(promoted,"Geometry");
+							tmp[3].x += promoted.getExtent().x;
+							const auto promotedWorld = hlsl::float64_t3x4(worldTforms.emplace_back(hlsl::transpose(tmp)));
+							const auto transformed = hlsl::shapes::util::transform(promotedWorld,promoted);
+							printAABB(transformed,"Transformed");
+							bound = hlsl::shapes::util::union_(transformed,bound);
 						}
 					);
 				}
+				printAABB(bound,"Total");
 				if (!m_renderer->addGeometries({ &converted.front().get(),converted.size() }))
 					return false;
+
+				auto worlTformsIt = worldTforms.begin();
 				for (const auto& geo : m_renderer->getGeometries())
 					m_renderer->m_instances.push_back({
-						.world = hlsl::float32_t3x4(
-							hlsl::float32_t4(1,0,0,0),
-							hlsl::float32_t4(0,1,0,0),
-							hlsl::float32_t4(0,0,1,0)
-						),
+						.world = *(worlTformsIt++),
 						.packedGeo = &geo
 					});
 			}
@@ -373,7 +389,7 @@ class MeshLoadersApp final : public MonoWindowApplication, public BuiltinResourc
 			// get scene bounds and reset camera
 			{
 				const double distance = 0.05;
-				const auto diagonal = bound.maxVx-bound.minVx;
+				const auto diagonal = bound.getExtent();
 				{
 					const auto measure = hlsl::length(diagonal);
 					const auto aspectRatio = float(m_window->getWidth())/float(m_window->getHeight());

From 4ed18fa6d52a47bca3c2755a4756895961002e25 Mon Sep 17 00:00:00 2001
From: devsh <devsh@devsh.eu>
Date: Wed, 2 Jul 2025 15:58:56 +0200
Subject: [PATCH 465/529] fix integer overflow issue

---
 common/include/nbl/examples/geometry/CSimpleDebugRenderer.hpp | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/common/include/nbl/examples/geometry/CSimpleDebugRenderer.hpp b/common/include/nbl/examples/geometry/CSimpleDebugRenderer.hpp
index f75ac9009..c1cf6567f 100644
--- a/common/include/nbl/examples/geometry/CSimpleDebugRenderer.hpp
+++ b/common/include/nbl/examples/geometry/CSimpleDebugRenderer.hpp
@@ -246,7 +246,7 @@ class CSimpleDebugRenderer final : public core::IReferenceCounted
 			core::vector<IGPUDescriptorSet::SWriteDescriptorSet> writes;
 			core::vector<IGPUDescriptorSet::SDescriptorInfo> infos;
 			bool anyFailed = false;
-			auto allocateUTB = [&](const IGeometry<const IGPUBuffer>::SDataView& view)->uint8_t
+			auto allocateUTB = [&](const IGeometry<const IGPUBuffer>::SDataView& view)->decltype(SubAllocatedDescriptorSet::invalid_value)
 			{
 				if (!view)
 					return SPackedGeometry::MissingView;

From 26302a37d1bc30ed868f8f0154bfacb01d236f63 Mon Sep 17 00:00:00 2001
From: devsh <devsh@devsh.eu>
Date: Wed, 2 Jul 2025 17:46:01 +0200
Subject: [PATCH 466/529] adjust to AABB changes

---
 12_MeshLoaders/main.cpp | 20 +++++++-------------
 1 file changed, 7 insertions(+), 13 deletions(-)

diff --git a/12_MeshLoaders/main.cpp b/12_MeshLoaders/main.cpp
index 5d563584b..3a4d8b13b 100644
--- a/12_MeshLoaders/main.cpp
+++ b/12_MeshLoaders/main.cpp
@@ -360,19 +360,13 @@ class MeshLoadersApp final : public MonoWindowApplication, public BuiltinResourc
 				const auto& converted = reservation.getGPUObjects<ICPUPolygonGeometry>();
 				for (const auto& geom : converted)
 				{
-					geom.value->visitAABB([&bound,&worldTforms,&tmp,&printAABB](const auto& aabb)->void
-						{
-							hlsl::shapes::AABB<3,double> promoted;
-							promoted.minVx = aabb.minVx;
-							promoted.maxVx = aabb.maxVx;
-							printAABB(promoted,"Geometry");
-							tmp[3].x += promoted.getExtent().x;
-							const auto promotedWorld = hlsl::float64_t3x4(worldTforms.emplace_back(hlsl::transpose(tmp)));
-							const auto transformed = hlsl::shapes::util::transform(promotedWorld,promoted);
-							printAABB(transformed,"Transformed");
-							bound = hlsl::shapes::util::union_(transformed,bound);
-						}
-					);
+					const auto promoted = geom.value->getAABB<aabb_t>();
+					printAABB(promoted,"Geometry");
+					tmp[3].x += promoted.getExtent().x;
+					const auto promotedWorld = hlsl::float64_t3x4(worldTforms.emplace_back(hlsl::transpose(tmp)));
+					const auto transformed = hlsl::shapes::util::transform(promotedWorld,promoted);
+					printAABB(transformed,"Transformed");
+					bound = hlsl::shapes::util::union_(transformed,bound);
 				}
 				printAABB(bound,"Total");
 				if (!m_renderer->addGeometries({ &converted.front().get(),converted.size() }))

From 5287dab0ffde5a483a4d143f37728fc7bb522c80 Mon Sep 17 00:00:00 2001
From: kevyuu <kevin.kayu@gmail.com>
Date: Thu, 3 Jul 2025 21:03:29 +0700
Subject: [PATCH 467/529] Add arrow geometry to example 67

---
 67_RayQueryGeometry/app_resources/common.hlsl |  2 +-
 .../app_resources/render.comp.hlsl            |  4 +-
 67_RayQueryGeometry/include/common.hpp        |  2 +-
 67_RayQueryGeometry/main.cpp                  | 48 +++++++++++--------
 4 files changed, 31 insertions(+), 25 deletions(-)

diff --git a/67_RayQueryGeometry/app_resources/common.hlsl b/67_RayQueryGeometry/app_resources/common.hlsl
index 0827a0e90..9110cd4a1 100644
--- a/67_RayQueryGeometry/app_resources/common.hlsl
+++ b/67_RayQueryGeometry/app_resources/common.hlsl
@@ -12,7 +12,7 @@ struct SGeomInfo
     uint64_t indexBufferAddress;
     uint64_t normalBufferAddress;
 
-    uint32_t vertexStride : 29;
+    uint32_t objType : 29;
     uint32_t indexType : 2; // 16 bit, 32 bit or none
     uint32_t smoothNormals : 1;	// flat for cube, rectangle, disk
     uint32_t padding;
diff --git a/67_RayQueryGeometry/app_resources/render.comp.hlsl b/67_RayQueryGeometry/app_resources/render.comp.hlsl
index bf6431af5..937273767 100644
--- a/67_RayQueryGeometry/app_resources/render.comp.hlsl
+++ b/67_RayQueryGeometry/app_resources/render.comp.hlsl
@@ -28,8 +28,7 @@ float3 unpackNormals3x10(uint32_t v)
 float3 calculateSmoothNormals(int instID, int primID, SGeomInfo geom, float2 bary)
 {
     const uint indexType = geom.indexType;
-    const uint vertexStride = geom.vertexStride;
-    const uint objType = instID;
+    const uint objType = geom.objType;
 
     const uint64_t vertexBufferAddress = geom.vertexBufferAddress;
     const uint64_t indexBufferAddress = geom.indexBufferAddress;
@@ -62,7 +61,6 @@ float3 calculateSmoothNormals(int instID, int primID, SGeomInfo geom, float2 bar
         //case OT_ARROW:
         case OT_CONE:
         {
-            // TODO: document why the alignment is 2 here and nowhere else? isnt the `vertexStride` aligned to more than 2 anyway?
             uint32_t v0 = vk::RawBufferLoad<uint32_t>(normalBufferAddress + indices[0] * 4);
             uint32_t v1 = vk::RawBufferLoad<uint32_t>(normalBufferAddress + indices[1] * 4);
             uint32_t v2 = vk::RawBufferLoad<uint32_t>(normalBufferAddress + indices[2] * 4);
diff --git a/67_RayQueryGeometry/include/common.hpp b/67_RayQueryGeometry/include/common.hpp
index 48b91ba5f..b1759e9e3 100644
--- a/67_RayQueryGeometry/include/common.hpp
+++ b/67_RayQueryGeometry/include/common.hpp
@@ -56,7 +56,7 @@ enum GeometryShader
 struct ReferenceObjectCpu
 {
 	ObjectMeta meta;
-	GeometryShader shadersType;
+  core::matrix3x4SIMD transform;
 	core::smart_refctd_ptr<ICPUPolygonGeometry> data;
 };
 
diff --git a/67_RayQueryGeometry/main.cpp b/67_RayQueryGeometry/main.cpp
index 97482dc51..76a4819e0 100644
--- a/67_RayQueryGeometry/main.cpp
+++ b/67_RayQueryGeometry/main.cpp
@@ -491,20 +491,32 @@ class RayQueryGeometryApp final : public SimpleWindowedApplication, public Built
 			// triangles geometries
 			auto gc = make_smart_refctd_ptr<CGeometryCreator>();
 
-			std::array<ReferenceObjectCpu, OT_COUNT> cpuObjects;
-			cpuObjects[OT_CUBE] = ReferenceObjectCpu{ .meta = {.type = OT_CUBE, .name = "Cube Mesh" }, .shadersType = GP_BASIC, .data = gc->createCube({1.f, 1.f, 1.f}) };
-			cpuObjects[OT_SPHERE] = ReferenceObjectCpu{ .meta = {.type = OT_SPHERE, .name = "Sphere Mesh" }, .shadersType = GP_BASIC, .data = gc->createSphere(2, 16, 16) };
-			cpuObjects[OT_CYLINDER] = ReferenceObjectCpu{ .meta = {.type = OT_CYLINDER, .name = "Cylinder Mesh" }, .shadersType = GP_BASIC, .data = gc->createCylinder(2, 2, 20) };
-			cpuObjects[OT_RECTANGLE] = ReferenceObjectCpu{ .meta = {.type = OT_RECTANGLE, .name = "Rectangle Mesh" }, .shadersType = GP_BASIC, .data = gc->createRectangle({1.5, 3}) };
-			cpuObjects[OT_CONE] = ReferenceObjectCpu{ .meta = {.type = OT_CONE, .name = "Cone Mesh" }, .shadersType = GP_CONE, .data = gc->createCone(2, 3, 10) };
-			cpuObjects[OT_ICOSPHERE] = ReferenceObjectCpu{ .meta = {.type = OT_ICOSPHERE, .name = "Icosphere Mesh" }, .shadersType = GP_ICO, .data = gc->createIcoSphere(1, 3, true) };
+			auto transform_i = 0;
+			auto nextTransform = [&transform_i]()
+			{
+				core::matrix3x4SIMD transform;
+				transform.setTranslation(nbl::core::vectorSIMDf(5.f * transform_i, 0, 0, 0));
+				transform_i++;
+				return transform;
+			};
 
-			auto geomInfoBuffer = ICPUBuffer::create({ OT_COUNT * sizeof(SGeomInfo) });
+			std::vector<ReferenceObjectCpu> cpuObjects;
+			cpuObjects.push_back(ReferenceObjectCpu{ .meta = {.type = OT_CUBE, .name = "Cube Mesh" }, .transform = nextTransform(), .data = gc->createCube({1.f, 1.f, 1.f})});
+			cpuObjects.push_back(ReferenceObjectCpu{ .meta = {.type = OT_SPHERE, .name = "Sphere Mesh" }, .transform = nextTransform(), .data = gc->createSphere(2, 16, 16)});
+			cpuObjects.push_back(ReferenceObjectCpu{ .meta = {.type = OT_CYLINDER, .name = "Cylinder Mesh" }, .transform = nextTransform(), .data = gc->createCylinder(2, 2, 20)});
+			cpuObjects.push_back(ReferenceObjectCpu{ .meta = {.type = OT_RECTANGLE, .name = "Rectangle Mesh" }, .transform = nextTransform(), .data = gc->createRectangle({1.5, 3})});
+			cpuObjects.push_back(ReferenceObjectCpu{ .meta = {.type = OT_CONE, .name = "Cone Mesh" }, .transform = nextTransform(), .data = gc->createCone(2, 3, 10)});
+			cpuObjects.push_back(ReferenceObjectCpu{ .meta = {.type = OT_ICOSPHERE, .name = "Icosphere Mesh" }, .transform = nextTransform(), .data = gc->createIcoSphere(1, 3, true)});
+			const auto arrowPolygons = gc->createArrow();
+			const auto arrowTransform = nextTransform();
+			cpuObjects.push_back(ReferenceObjectCpu{ .meta = {.type = OT_CYLINDER, .name = "Arrow Mesh" }, .transform = arrowTransform, .data = arrowPolygons[0]});
+			cpuObjects.push_back(ReferenceObjectCpu{ .meta = {.type = OT_CONE, .name = "Arrow Mesh" }, .transform = arrowTransform, .data = arrowPolygons[1]});
+			auto geomInfoBuffer = ICPUBuffer::create({ cpuObjects.size() * sizeof(SGeomInfo) });
 
 			SGeomInfo* geomInfos = reinterpret_cast<SGeomInfo*>(geomInfoBuffer->getPointer());
 
 			// get ICPUBuffers into ICPUBottomLevelAccelerationStructures
-			std::array<smart_refctd_ptr<ICPUBottomLevelAccelerationStructure>, OT_COUNT> cpuBlas;
+			std::vector<smart_refctd_ptr<ICPUBottomLevelAccelerationStructure>> cpuBlas(cpuObjects.size());
 			for (uint32_t i = 0; i < cpuBlas.size(); i++)
 			{
 				auto triangles = make_refctd_dynamic_array<smart_refctd_dynamic_array<ICPUBottomLevelAccelerationStructure::Triangles<ICPUBuffer>>>(1u);
@@ -530,7 +542,7 @@ class RayQueryGeometryApp final : public SimpleWindowedApplication, public Built
 			}
 
 			// get ICPUBottomLevelAccelerationStructure into ICPUTopLevelAccelerationStructure
-			auto geomInstances = make_refctd_dynamic_array<smart_refctd_dynamic_array<ICPUTopLevelAccelerationStructure::PolymorphicInstance>>(OT_COUNT);
+			auto geomInstances = make_refctd_dynamic_array<smart_refctd_dynamic_array<ICPUTopLevelAccelerationStructure::PolymorphicInstance>>(cpuObjects.size());
 			{
 				uint32_t i = 0;
 				for (auto instance = geomInstances->begin(); instance != geomInstances->end(); instance++, i++)
@@ -541,11 +553,7 @@ class RayQueryGeometryApp final : public SimpleWindowedApplication, public Built
 					inst.base.instanceCustomIndex = i;
 					inst.base.instanceShaderBindingTableRecordOffset = 0;
 					inst.base.mask = 0xFF;
-
-					core::matrix3x4SIMD transform;
-					transform.setTranslation(nbl::core::vectorSIMDf(5.f * i, 0, 0, 0));
-					inst.transform = transform;
-					
+					inst.transform = cpuObjects[i].transform;
 					instance->instance = inst;
 				}
 			}
@@ -612,9 +620,9 @@ class RayQueryGeometryApp final : public SimpleWindowedApplication, public Built
 			
 			CAssetConverter::patch_t<ICPUTopLevelAccelerationStructure> tlasPatch = {};
 			tlasPatch.compactAfterBuild = true;
-			std::array<CAssetConverter::patch_t<ICPUBottomLevelAccelerationStructure>,OT_COUNT> tmpBLASPatches = {};
-			std::array<ICPUPolygonGeometry*, std::size(cpuObjects)> tmpGeometries;
-			std::array<CAssetConverter::patch_t<asset::ICPUPolygonGeometry>, std::size(cpuObjects)> tmpGeometryPatches;
+			std::vector<CAssetConverter::patch_t<ICPUBottomLevelAccelerationStructure>> tmpBLASPatches(cpuObjects.size());
+			std::vector<ICPUPolygonGeometry*> tmpGeometries(cpuObjects.size());
+			std::vector<CAssetConverter::patch_t<asset::ICPUPolygonGeometry>> tmpGeometryPatches(cpuObjects.size());
 			{
 				tmpBLASPatches.front().compactAfterBuild = true;
 				std::fill(tmpBLASPatches.begin(),tmpBLASPatches.end(),tmpBLASPatches.front());
@@ -779,7 +787,7 @@ class RayQueryGeometryApp final : public SimpleWindowedApplication, public Built
 						.vertexBufferAddress = vertexBufferAddress,
 						.indexBufferAddress = indexBufferBinding.buffer ? indexBufferBinding.buffer->getDeviceAddress() + indexBufferBinding.offset : vertexBufferAddress,
 						.normalBufferAddress = normalBufferAddress,
-						.vertexStride = gpuTriangles.vertexStride,
+						.objType = cpuObject.meta.type,
 						.indexType = gpuTriangles.indexType,
 						.smoothNormals = s_smoothNormals[cpuObject.meta.type],
 					};
@@ -792,7 +800,7 @@ class RayQueryGeometryApp final : public SimpleWindowedApplication, public Built
 			{
 				IGPUBuffer::SCreationParams params;
 				params.usage = IGPUBuffer::EUF_STORAGE_BUFFER_BIT | IGPUBuffer::EUF_TRANSFER_DST_BIT | IGPUBuffer::EUF_SHADER_DEVICE_ADDRESS_BIT;
-				params.size = OT_COUNT * sizeof(SGeomInfo);
+				params.size = cpuObjects.size() * sizeof(SGeomInfo);
 				m_utils->createFilledDeviceLocalBufferOnDedMem(SIntendedSubmitInfo{ .queue = gQueue }, std::move(params), geomInfos).move_into(geometryInfoBuffer);
 			}
 

From 60bdf1bb1e37465fac44c8f865e63189f497cfb1 Mon Sep 17 00:00:00 2001
From: Arkadiusz Lachowicz <areklachowicz@gmail.com>
Date: Thu, 3 Jul 2025 20:37:21 +0200
Subject: [PATCH 468/529] use NBL_REGISTER_BUILD_MOUNT_POINT, update
 BuiltinResourcesApplication and common/src/nbl/examples/CMakeLists.txt

---
 .../common/BuiltinResourcesApplication.hpp       | 12 ++++++------
 common/src/nbl/examples/CMakeLists.txt           | 16 +++++++++++-----
 2 files changed, 17 insertions(+), 11 deletions(-)

diff --git a/common/include/nbl/examples/common/BuiltinResourcesApplication.hpp b/common/include/nbl/examples/common/BuiltinResourcesApplication.hpp
index 02509ca6a..b0a21fb05 100644
--- a/common/include/nbl/examples/common/BuiltinResourcesApplication.hpp
+++ b/common/include/nbl/examples/common/BuiltinResourcesApplication.hpp
@@ -10,14 +10,14 @@
 #ifdef NBL_EMBED_BUILTIN_RESOURCES
 	#include "nbl/builtin/examples/include/CArchive.h"
 	#include "nbl/builtin/examples/src/CArchive.h"
-	#include "nbl/builtin/examples/build/spirv/CArchive.h"
+	#include "nbl/builtin/examples/build/CArchive.h"
 	#if __has_include("nbl/this_example/builtin/CArchive.h")
 		#include "nbl/this_example/builtin/CArchive.h"
 	#endif
-	// TODO: (**) there should be also 5th arch "nbl/this_example/builtin/build/spirv/CArchive.h"
+	// TODO: (**) there should be also 5th arch "nbl/this_example/builtin/build/CArchive.h"
 	/*
-		#if __has_include("nbl/this_example/builtin/build/spirv/CArchive.h")
-		#include "nbl/this_example/builtin/build/spirv/CArchive.h"
+		#if __has_include("nbl/this_example/builtin/build/CArchive.h")
+		#include "nbl/this_example/builtin/build/CArchive.h"
 		#endif
 	*/
 	//! this ain't meant to be the same as this_example ordinary archive
@@ -49,7 +49,7 @@ class BuiltinResourcesApplication : public virtual application_templates::MonoAs
 			#ifdef NBL_EMBED_BUILTIN_RESOURCES
 			examplesHeaderArch = core::make_smart_refctd_ptr<nbl::builtin::examples::include::CArchive>(smart_refctd_ptr(m_logger));
 			examplesSourceArch = core::make_smart_refctd_ptr<nbl::builtin::examples::src::CArchive>(smart_refctd_ptr(m_logger));
-			examplesBuildSpirvArch = core::make_smart_refctd_ptr<nbl::builtin::examples::build::spirv::CArchive>(smart_refctd_ptr(m_logger));
+			examplesBuildSpirvArch = core::make_smart_refctd_ptr<nbl::builtin::examples::build::CArchive>(smart_refctd_ptr(m_logger));
 
 			#ifdef _NBL_THIS_EXAMPLE_BUILTIN_C_ARCHIVE_H_
 				thisExampleArch = make_smart_refctd_ptr<nbl::this_example::builtin::CArchive>(smart_refctd_ptr(m_logger));
@@ -58,7 +58,7 @@ class BuiltinResourcesApplication : public virtual application_templates::MonoAs
 			#else
 			examplesHeaderArch = make_smart_refctd_ptr<system::CMountDirectoryArchive>(localInputCWD/"../common/include/nbl/examples",smart_refctd_ptr(m_logger),m_system.get());
 			examplesSourceArch = make_smart_refctd_ptr<system::CMountDirectoryArchive>(localInputCWD/"../common/src/nbl/examples",smart_refctd_ptr(m_logger),m_system.get());
-			examplesBuildSpirvArch = make_smart_refctd_ptr<system::CMountDirectoryArchive>(NBL_EXAMPLES_BUILD_SPIRV_MOUNT_POINT, smart_refctd_ptr(m_logger), m_system.get());
+			examplesBuildSpirvArch = make_smart_refctd_ptr<system::CMountDirectoryArchive>(NBL_EXAMPLES_BUILD_MOUNT_POINT, smart_refctd_ptr(m_logger), m_system.get());
 			thisExampleArch = make_smart_refctd_ptr<system::CMountDirectoryArchive>(localInputCWD/"app_resources",smart_refctd_ptr(m_logger),m_system.get());
 			// TODO: (**)
 			#endif
diff --git a/common/src/nbl/examples/CMakeLists.txt b/common/src/nbl/examples/CMakeLists.txt
index cfebab2b4..7cc198ebe 100644
--- a/common/src/nbl/examples/CMakeLists.txt
+++ b/common/src/nbl/examples/CMakeLists.txt
@@ -4,16 +4,22 @@ set(COMMON_OPTIONS
 	-I "${COMMON_INCLUDE_DIRECTORY}"
 )
 
-NBL_REGISTER_SPIRV_SHADERS(
-	MOUNT_POINT_DEFINE
-		NBL_EXAMPLES_BUILD_SPIRV_MOUNT_POINT
+set(OUTPUT_DIRECTORY "${CMAKE_CURRENT_BINARY_DIR}/auto-gen" )
+file(WRITE "${OUTPUT_DIRECTORY}/dummy.txt" "dummy, test")
 
+NBL_REGISTER_BUILD_MOUNT_POINT(
     ARCHIVE
         TARGET NblExtExamplesAPIBuiltinsSPIRV
         INPUT_DIRECTORY .
-        NAMESPACE nbl::builtin::examples::build::spirv
+		OUTPUT_DIRECTORY "${OUTPUT_DIRECTORY}"
+        NAMESPACE nbl::builtin::examples::build
+		MOUNT_POINT_DEFINE NBL_EXAMPLES_BUILD_MOUNT_POINT
+	
+	# relative to ARCHIVE.OUTPUT_DIRECTORY
+	BUILTINS
+		dummy.txt
 
-	INPUTS
+	SHADERS
 		KEY shaders/geometry/unified.hlsl COMPILE_OPTIONS ${COMMON_OPTIONS} -T lib_${SPIRV_TARGET_V}
 		# KEY <xyz> COMPILE_OPTIONS ${COMMON_OPTIONS} -T <target>_${SPIRV_TARGET_V}
 )

From 767e6c4e46e1e5c5ab4ec827f94f8886f705eb33 Mon Sep 17 00:00:00 2001
From: Arkadiusz Lachowicz <areklachowicz@gmail.com>
Date: Thu, 3 Jul 2025 21:10:07 +0200
Subject: [PATCH 469/529] update common/src/nbl/examples/CMakeLists.txt

---
 common/src/nbl/examples/CMakeLists.txt | 12 ++++++++----
 1 file changed, 8 insertions(+), 4 deletions(-)

diff --git a/common/src/nbl/examples/CMakeLists.txt b/common/src/nbl/examples/CMakeLists.txt
index 7cc198ebe..fce1f48ac 100644
--- a/common/src/nbl/examples/CMakeLists.txt
+++ b/common/src/nbl/examples/CMakeLists.txt
@@ -15,13 +15,17 @@ NBL_REGISTER_BUILD_MOUNT_POINT(
         NAMESPACE nbl::builtin::examples::build
 		MOUNT_POINT_DEFINE NBL_EXAMPLES_BUILD_MOUNT_POINT
 	
-	# relative to ARCHIVE.OUTPUT_DIRECTORY
 	BUILTINS
-		dummy.txt
+		dummy.txt # relative to ARCHIVE.OUTPUT_DIRECTORY
 
 	SHADERS
-		KEY shaders/geometry/unified.hlsl COMPILE_OPTIONS ${COMMON_OPTIONS} -T lib_${SPIRV_TARGET_V}
-		# KEY <xyz> COMPILE_OPTIONS ${COMMON_OPTIONS} -T <target>_${SPIRV_TARGET_V}
+		KEY shaders/geometry/unified.hlsl 
+		COMPILE_OPTIONS ${COMMON_OPTIONS} -T lib_${SPIRV_TARGET_V}
+		# DEPENDS <>
+
+		# KEY <xyz> 
+		# COMPILE_OPTIONS ${COMMON_OPTIONS} -T <target>_${SPIRV_TARGET_V}
+		# DEPENDS <>
 )
 
 if(NBL_EMBED_BUILTIN_RESOURCES)

From c573951ec38a3aa187b6ac8724cff1c9d293631c Mon Sep 17 00:00:00 2001
From: Arkadiusz Lachowicz <areklachowicz@gmail.com>
Date: Thu, 3 Jul 2025 21:57:17 +0200
Subject: [PATCH 470/529] update some examples after moving spirv target field

---
 03_DeviceSelectionAndSharedSources/Testers.h | 2 +-
 03_DeviceSelectionAndSharedSources/main.cpp  | 2 +-
 23_Arithmetic2UnitTest/main.cpp              | 2 +-
 29_Arithmetic2Bench/main.cpp                 | 2 +-
 64_EmulatedFloatTest/main.cpp                | 4 ++--
 70_FLIPFluids/main.cpp                       | 2 +-
 6 files changed, 7 insertions(+), 7 deletions(-)

diff --git a/03_DeviceSelectionAndSharedSources/Testers.h b/03_DeviceSelectionAndSharedSources/Testers.h
index f957e50a0..fcd5c5ee4 100644
--- a/03_DeviceSelectionAndSharedSources/Testers.h
+++ b/03_DeviceSelectionAndSharedSources/Testers.h
@@ -56,7 +56,7 @@ class IntrospectionTesterBase
 			// if the extension is generic (.glsl or .hlsl) the stage is unknown.
 			// But it can still be overriden from within the source with a `#pragma shader_stage` 
 			options.stage = shaderStage == IShader::E_SHADER_STAGE::ESS_COMPUTE ? shaderStage : IShader::E_SHADER_STAGE::ESS_VERTEX; // TODO: do smth with it
-			options.targetSpirvVersion = device->getPhysicalDevice()->getLimits().spirvVersion;
+			options.preprocessorOptions.targetSpirvVersion = device->getPhysicalDevice()->getLimits().spirvVersion;
 			// we need to perform an unoptimized compilation with source debug info or we'll lose names of variable sin the introspection
 			options.spirvOptimizer = nullptr;
 			options.debugInfoFlags |= IShaderCompiler::E_DEBUG_INFO_FLAGS::EDIF_SOURCE_BIT;
diff --git a/03_DeviceSelectionAndSharedSources/main.cpp b/03_DeviceSelectionAndSharedSources/main.cpp
index c09228ce5..b8fd3d18b 100644
--- a/03_DeviceSelectionAndSharedSources/main.cpp
+++ b/03_DeviceSelectionAndSharedSources/main.cpp
@@ -275,7 +275,7 @@ class DeviceSelectionAndSharedSourcesApp final : public application_templates::M
 			// if the extension is generic (.glsl or .hlsl) the stage is unknown.
 			// But it can still be overriden from within the source with a `#pragma shader_stage` 
 			options.stage = shaderStage == IShader::E_SHADER_STAGE::ESS_COMPUTE ? shaderStage : IShader::E_SHADER_STAGE::ESS_VERTEX; // TODO: do smth with it
-			options.targetSpirvVersion = m_device->getPhysicalDevice()->getLimits().spirvVersion;
+			options.preprocessorOptions.targetSpirvVersion = m_device->getPhysicalDevice()->getLimits().spirvVersion;
 			// we need to perform an unoptimized compilation with source debug info or we'll lose names of variable sin the introspection
 			options.spirvOptimizer = nullptr;
 			options.debugInfoFlags |= IShaderCompiler::E_DEBUG_INFO_FLAGS::EDIF_SOURCE_BIT;
diff --git a/23_Arithmetic2UnitTest/main.cpp b/23_Arithmetic2UnitTest/main.cpp
index 3939fd443..8d70547bc 100644
--- a/23_Arithmetic2UnitTest/main.cpp
+++ b/23_Arithmetic2UnitTest/main.cpp
@@ -294,7 +294,7 @@ class Workgroup2ScanTestApp final : public application_templates::BasicMultiQueu
 		auto compiler = make_smart_refctd_ptr<asset::CHLSLCompiler>(smart_refctd_ptr(m_system));
 		CHLSLCompiler::SOptions options = {};
 		options.stage = IShader::E_SHADER_STAGE::ESS_COMPUTE;
-		options.targetSpirvVersion = m_device->getPhysicalDevice()->getLimits().spirvVersion;
+		options.preprocessorOptions.targetSpirvVersion = m_device->getPhysicalDevice()->getLimits().spirvVersion;
 		options.spirvOptimizer = nullptr;
 #ifndef _NBL_DEBUG
 		ISPIRVOptimizer::E_OPTIMIZER_PASS optPasses = ISPIRVOptimizer::EOP_STRIP_DEBUG_INFO;
diff --git a/29_Arithmetic2Bench/main.cpp b/29_Arithmetic2Bench/main.cpp
index 75f483db0..5809c4a9a 100644
--- a/29_Arithmetic2Bench/main.cpp
+++ b/29_Arithmetic2Bench/main.cpp
@@ -527,7 +527,7 @@ class ArithmeticBenchApp final : public examples::SimpleWindowedApplication, pub
 		auto compiler = make_smart_refctd_ptr<asset::CHLSLCompiler>(smart_refctd_ptr(m_system));
 		CHLSLCompiler::SOptions options = {};
 		options.stage = IShader::E_SHADER_STAGE::ESS_COMPUTE;
-		options.targetSpirvVersion = m_device->getPhysicalDevice()->getLimits().spirvVersion;
+		options.preprocessorOptions.targetSpirvVersion = m_device->getPhysicalDevice()->getLimits().spirvVersion;
 		options.spirvOptimizer = nullptr;
 #ifndef _NBL_DEBUG
 		ISPIRVOptimizer::E_OPTIMIZER_PASS optPasses = ISPIRVOptimizer::EOP_STRIP_DEBUG_INFO;
diff --git a/64_EmulatedFloatTest/main.cpp b/64_EmulatedFloatTest/main.cpp
index fd3e465e7..3fc635e87 100644
--- a/64_EmulatedFloatTest/main.cpp
+++ b/64_EmulatedFloatTest/main.cpp
@@ -280,7 +280,7 @@ class CompatibilityTest final : public MonoDeviceApplication, public BuiltinReso
 
                     nbl::asset::IShaderCompiler::SCompilerOptions options = {};
                     options.stage = ESS_COMPUTE;
-                    options.targetSpirvVersion = base.m_device->getPhysicalDevice()->getLimits().spirvVersion;
+                    options.preprocessorOptions.targetSpirvVersion = base.m_device->getPhysicalDevice()->getLimits().spirvVersion;
                     options.spirvOptimizer = nullptr;
                     options.debugInfoFlags |= IShaderCompiler::E_DEBUG_INFO_FLAGS::EDIF_SOURCE_BIT;
                     options.preprocessorOptions.sourceIdentifier = source->getFilepathHint();
@@ -946,7 +946,7 @@ class CompatibilityTest final : public MonoDeviceApplication, public BuiltinReso
 
                     IShaderCompiler::SCompilerOptions options = {};
                     options.stage = ESS_COMPUTE;
-                    options.targetSpirvVersion = base.m_device->getPhysicalDevice()->getLimits().spirvVersion;
+                    options.preprocessorOptions.targetSpirvVersion = base.m_device->getPhysicalDevice()->getLimits().spirvVersion;
                     options.spirvOptimizer = nullptr;
                     options.debugInfoFlags |= IShaderCompiler::E_DEBUG_INFO_FLAGS::EDIF_SOURCE_BIT;
                     options.preprocessorOptions.sourceIdentifier = source->getFilepathHint();
diff --git a/70_FLIPFluids/main.cpp b/70_FLIPFluids/main.cpp
index 66596c526..899d00ba4 100644
--- a/70_FLIPFluids/main.cpp
+++ b/70_FLIPFluids/main.cpp
@@ -1426,7 +1426,7 @@ class FLIPFluidsApp final : public SimpleWindowedApplication, public BuiltinReso
             options.stage = shaderStage;
             if (!(options.stage == IShader::E_SHADER_STAGE::ESS_COMPUTE || options.stage == IShader::E_SHADER_STAGE::ESS_FRAGMENT))
                 options.stage = IShader::E_SHADER_STAGE::ESS_VERTEX;
-            options.targetSpirvVersion = m_device->getPhysicalDevice()->getLimits().spirvVersion;
+            options.preprocessorOptions.targetSpirvVersion = m_device->getPhysicalDevice()->getLimits().spirvVersion;
             options.spirvOptimizer = nullptr;
         #ifndef _NBL_DEBUG
             ISPIRVOptimizer::E_OPTIMIZER_PASS optPasses = ISPIRVOptimizer::EOP_STRIP_DEBUG_INFO;

From acf32da22e89716f09c34d483dbcac930d7e7e42 Mon Sep 17 00:00:00 2001
From: devsh <devsh@devsh.eu>
Date: Fri, 4 Jul 2025 14:58:05 +0200
Subject: [PATCH 471/529] update media submodule

---
 media | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/media b/media
index 68dbe85b9..c24f4e139 160000
--- a/media
+++ b/media
@@ -1 +1 @@
-Subproject commit 68dbe85b9849c9b094760428a3639f5c8917d85e
+Subproject commit c24f4e13901554abc9fdf87081108cc7dca1db57

From 89ab539f1fe0d98d50b42f1a3f190cc66ec90073 Mon Sep 17 00:00:00 2001
From: Arkadiusz Lachowicz <areklachowicz@gmail.com>
Date: Sun, 6 Jul 2025 20:27:02 +0200
Subject: [PATCH 472/529] update CMakeLists.txt files after changes, remove
 .hlsl ext part from SPIRV key in CSimpleDebugRenderer

---
 CMakeLists.txt                                |  6 +-
 .../geometry/CSimpleDebugRenderer.hpp         |  2 +-
 common/src/nbl/examples/CMakeLists.txt        | 88 +++++++++++++------
 3 files changed, 64 insertions(+), 32 deletions(-)

diff --git a/CMakeLists.txt b/CMakeLists.txt
index 9b238942e..6891691f0 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -98,13 +98,13 @@ if(NBL_BUILD_EXAMPLES)
 		target_include_directories(${T} PUBLIC $<TARGET_PROPERTY:${NBL_EXAMPLES_API_TARGET},INCLUDE_DIRECTORIES>)
 		target_precompile_headers(${T} REUSE_FROM "${NBL_EXAMPLES_API_TARGET}")
 
-		# TODO: make them all INTERFACE if not NBL_EMBED_BUILTIN_RESOURCES and link in loop
+		# TODO: make them all INTERFACE if not NBL_EMBED_BUILTIN_RESOURCES and link in loop without checking the var
 		if(NBL_EMBED_BUILTIN_RESOURCES)
 			LINK_BUILTIN_RESOURCES_TO_TARGET(${T} NblExtExamplesAPIBuiltinsSource)
 			LINK_BUILTIN_RESOURCES_TO_TARGET(${T} NblExtExamplesAPIBuiltinsInclude)
-			LINK_BUILTIN_RESOURCES_TO_TARGET(${T} NblExtExamplesAPIBuiltinsSPIRV)
+			LINK_BUILTIN_RESOURCES_TO_TARGET(${T} NblExtExamplesAPIBuiltinsBuild)
 		else()
-			target_link_libraries(${T} PUBLIC NblExtExamplesAPIBuiltinsSPIRV)
+			target_link_libraries(${T} PUBLIC NblExtExamplesAPIBuiltinsBuild)
 		endif()
     endforeach()
 
diff --git a/common/include/nbl/examples/geometry/CSimpleDebugRenderer.hpp b/common/include/nbl/examples/geometry/CSimpleDebugRenderer.hpp
index 53ca1be5d..d5b48de0e 100644
--- a/common/include/nbl/examples/geometry/CSimpleDebugRenderer.hpp
+++ b/common/include/nbl/examples/geometry/CSimpleDebugRenderer.hpp
@@ -108,7 +108,7 @@ class CSimpleDebugRenderer final : public core::IReferenceCounted
 			// load shader
 			smart_refctd_ptr<IShader> shader;
 			{
-				constexpr std::string_view key = "nbl/examples/shaders/geometry/unified.hlsl.spv";
+				constexpr std::string_view key = "nbl/examples/shaders/geometry/unified.spv";
 				const auto bundle = assMan->getAsset(key.data(), {});
 
 				const auto contents = bundle.getContents();
diff --git a/common/src/nbl/examples/CMakeLists.txt b/common/src/nbl/examples/CMakeLists.txt
index fce1f48ac..f79b23d9b 100644
--- a/common/src/nbl/examples/CMakeLists.txt
+++ b/common/src/nbl/examples/CMakeLists.txt
@@ -1,33 +1,65 @@
-set(SPIRV_TARGET_V 6_8)
-
-set(COMMON_OPTIONS
-	-I "${COMMON_INCLUDE_DIRECTORY}"
+set(OUTPUT_DIRECTORY "${CMAKE_CURRENT_BINARY_DIR}/auto-gen")
+set(ARGS 
+	TARGET NblExtExamplesAPISPIRV
+	BINARY_DIR ${OUTPUT_DIRECTORY}
+	COMMON_OPTIONS -I "${COMMON_INCLUDE_DIRECTORY}"
+	OUTPUT_VAR KEYS
 )
 
-set(OUTPUT_DIRECTORY "${CMAKE_CURRENT_BINARY_DIR}/auto-gen" )
-file(WRITE "${OUTPUT_DIRECTORY}/dummy.txt" "dummy, test")
-
-NBL_REGISTER_BUILD_MOUNT_POINT(
-    ARCHIVE
-        TARGET NblExtExamplesAPIBuiltinsSPIRV
-        INPUT_DIRECTORY .
-		OUTPUT_DIRECTORY "${OUTPUT_DIRECTORY}"
-        NAMESPACE nbl::builtin::examples::build
-		MOUNT_POINT_DEFINE NBL_EXAMPLES_BUILD_MOUNT_POINT
-	
-	BUILTINS
-		dummy.txt # relative to ARCHIVE.OUTPUT_DIRECTORY
-
-	SHADERS
-		KEY shaders/geometry/unified.hlsl 
-		COMPILE_OPTIONS ${COMMON_OPTIONS} -T lib_${SPIRV_TARGET_V}
-		# DEPENDS <>
-
-		# KEY <xyz> 
-		# COMPILE_OPTIONS ${COMMON_OPTIONS} -T <target>_${SPIRV_TARGET_V}
-		# DEPENDS <>
+# note json is array of objects, you can register all rules at once
+set(JSON [=[
+[
+    {
+		"INPUT": "shaders/geometry/unified.hlsl",
+        "COMPILE_OPTIONS": ["-T", "lib_6_6"],
+		"DEPENDS": [],
+		"CAPS": []
+    }
+]
+]=])
+
+NBL_CREATE_NSC_COMPILE_RULES(${ARGS} INPUTS ${JSON})
+
+set(JSON [=[
+[
+    {
+		"INPUT": "shaders/geometry/unified.hlsl",
+        "COMPILE_OPTIONS": ["-T", "lib_6_6"],
+		"DEPENDS": [],
+		"CAPS": [
+            {
+                "name": "shaderFloat64",
+				"type": "bool",
+                "values": ["true", "false"]
+            },
+            {
+                "name": "subgroupSize",
+				"type": "uint16_t",
+                "values": ["32", "64"]
+            }
+        ]
+    }
+]
+]=])
+
+# but it also supports incremental rule updates, uncomment to add rules with permutation caps
+# NBL_CREATE_NSC_COMPILE_RULES(${ARGS} INPUTS ${JSON})
+
+# note we can add more inputs from build dir which keys can be part of the same archive/mount point,
+# ex. one could auto generate bc texture or whatever and add here like
+# file(WRITE "${OUTPUT_DIRECTORY}/dummy.txt" "dummy, test")
+# list(APPEND KEYS dummy.txt)
+
+NBL_CREATE_RESOURCE_ARCHIVE(
+	TARGET NblExtExamplesAPIBuiltinsBuild
+	BIND "${OUTPUT_DIRECTORY}"
+	NAMESPACE nbl::builtin::examples::build
+	MOUNT_POINT_DEFINE NBL_EXAMPLES_BUILD_MOUNT_POINT
+	BUILTINS ${KEYS}
 )
 
 if(NBL_EMBED_BUILTIN_RESOURCES)
-	INTERFACE_TO_BUILTINS(NblExtExamplesAPIBuiltinsSPIRV)
-endif()
\ No newline at end of file
+	INTERFACE_TO_BUILTINS(NblExtExamplesAPIBuiltinsBuild)
+endif()
+
+target_link_libraries(NblExtExamplesAPIBuiltinsBuild PUBLIC NblExtExamplesAPISPIRV)

From e074ad32c23569bf5baa0cf8af4c65de2ef92c17 Mon Sep 17 00:00:00 2001
From: Arkadiusz Lachowicz <areklachowicz@gmail.com>
Date: Mon, 7 Jul 2025 11:00:13 +0200
Subject: [PATCH 473/529] use json canonical KEY field, update
 common/src/nbl/examples/CMakeLists.txt

---
 common/src/nbl/examples/CMakeLists.txt | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/common/src/nbl/examples/CMakeLists.txt b/common/src/nbl/examples/CMakeLists.txt
index f79b23d9b..b55b699dd 100644
--- a/common/src/nbl/examples/CMakeLists.txt
+++ b/common/src/nbl/examples/CMakeLists.txt
@@ -11,6 +11,7 @@ set(JSON [=[
 [
     {
 		"INPUT": "shaders/geometry/unified.hlsl",
+		"KEY": "shaders/geometry/unified.spv",
         "COMPILE_OPTIONS": ["-T", "lib_6_6"],
 		"DEPENDS": [],
 		"CAPS": []
@@ -24,6 +25,7 @@ set(JSON [=[
 [
     {
 		"INPUT": "shaders/geometry/unified.hlsl",
+		"KEY": "shaders/geometry/unified.spv",
         "COMPILE_OPTIONS": ["-T", "lib_6_6"],
 		"DEPENDS": [],
 		"CAPS": [

From 5e4180c3b61d4249601bbdab70a4c988434f1f89 Mon Sep 17 00:00:00 2001
From: Przemek <minikers21@gmail.com>
Date: Tue, 8 Jul 2025 16:28:11 +0200
Subject: [PATCH 474/529] Fixed out of bounds height map array access

---
 62_CAD/main.cpp                       | 33 +--------------------------
 62_CAD/shaders/main_pipeline/dtm.hlsl |  4 ++--
 2 files changed, 3 insertions(+), 34 deletions(-)

diff --git a/62_CAD/main.cpp b/62_CAD/main.cpp
index 58ccf625d..1870f0dba 100644
--- a/62_CAD/main.cpp
+++ b/62_CAD/main.cpp
@@ -3342,37 +3342,6 @@ class ComputerAidedDesign final : public examples::SimpleWindowedApplication, pu
 		}
 		else if (mode == ExampleMode::CASE_9)
 		{
-			// GRID (outdated)
-			/*core::vector<TriangleMeshVertex> vertices = {
-				{ float32_t2(-200.0f, -200.0f), 10.0f },
-				{ float32_t2(-50.0f, -200.0f), 50.0f },
-				{ float32_t2(100.0f, -200.0f), 90.0f },
-				{ float32_t2(-125.0f, -70.1f), 10.0f },
-				{ float32_t2(25.0f, -70.1f), 50.0f },
-				{ float32_t2(175.0f, -70.1f), 90.0f },
-				{ float32_t2(-200.0f, 59.8f), 10.0f },
-				{ float32_t2(-50.0f, 59.8f), 50.0f },
-				{ float32_t2(100.0f, 59.8f), 90.0f },
-				{ float32_t2(-125.0f, 189.7f), 10.0f },
-				{ float32_t2(25.0f, 189.7f), 50.0f },
-				{ float32_t2(175.0f, 189.7f), 90.0f }
-			};
-
-			core::vector<uint32_t> indices = {
-				0, 3, 1,
-				1, 3, 4,
-				1, 2, 4,
-				2, 4, 5,
-				3, 4, 6,
-				4, 6, 7,
-				4, 5, 7,
-				5, 7, 8,
-				6, 7, 9,
-				7, 9, 10,
-				7, 8, 10,
-				8, 10, 11
-			};*/
-
 			// PYRAMID
 			core::vector<TriangleMeshVertex> vertices = {
 				//{ float64_t2(0.0, 0.0), 100.0 }, //0
@@ -3431,7 +3400,7 @@ class ComputerAidedDesign final : public examples::SimpleWindowedApplication, pu
 			dtmInfo.contourSettingsCount = 2u;
 			dtmInfo.contourSettings[0u].startHeight = 20;
 			dtmInfo.contourSettings[0u].endHeight = 90;
-			dtmInfo.contourSettings[0u].heightInterval = 10;
+			dtmInfo.contourSettings[0u].heightInterval = 9.98;
 			dtmInfo.contourSettings[0u].lineStyleInfo.screenSpaceLineWidth = 0.0f;
 			dtmInfo.contourSettings[0u].lineStyleInfo.worldSpaceLineWidth = 1.0f;
 			dtmInfo.contourSettings[0u].lineStyleInfo.color = float32_t4(0.0f, 0.0f, 1.0f, 0.7f);
diff --git a/62_CAD/shaders/main_pipeline/dtm.hlsl b/62_CAD/shaders/main_pipeline/dtm.hlsl
index 6f50a9384..ac16d7723 100644
--- a/62_CAD/shaders/main_pipeline/dtm.hlsl
+++ b/62_CAD/shaders/main_pipeline/dtm.hlsl
@@ -153,7 +153,7 @@ float4 calculateDTMHeightColor(in DTMHeightShadingSettings settings, in float3 t
         if (mode == E_HEIGHT_SHADING_MODE::DISCRETE_VARIABLE_LENGTH_INTERVALS)
         {
             DTMSettingsHeightsAccessor dtmHeightsAccessor = { settings };
-            int upperBoundIndex = nbl::hlsl::upper_bound(dtmHeightsAccessor, 0, heightMapSize, height);
+            int upperBoundIndex = min(nbl::hlsl::upper_bound(dtmHeightsAccessor, 0u, heightMapSize, height), heightMapSize - 1u);
             int mapIndex = max(upperBoundIndex - 1, 0);
             int mapIndexPrev = max(mapIndex - 1, 0);
             int mapIndexNext = min(mapIndex + 1, heightMapSize - 1);
@@ -209,7 +209,7 @@ float4 calculateDTMHeightColor(in DTMHeightShadingSettings settings, in float3 t
         else if (mode == E_HEIGHT_SHADING_MODE::CONTINOUS_INTERVALS)
         {
             DTMSettingsHeightsAccessor dtmHeightsAccessor = { settings };
-            uint32_t upperBoundHeightIndex = nbl::hlsl::upper_bound(dtmHeightsAccessor, 0, heightMapSize, height);
+            uint32_t upperBoundHeightIndex = min(nbl::hlsl::upper_bound(dtmHeightsAccessor, 0u, heightMapSize - 1u, height), heightMapSize - 1u);
             uint32_t lowerBoundHeightIndex = upperBoundHeightIndex == 0 ? upperBoundHeightIndex : upperBoundHeightIndex - 1;
 
             float upperBoundHeight = settings.heightColorMapHeights[upperBoundHeightIndex];

From 710e2f766307d8a62f05f3143c65499ba34cdf67 Mon Sep 17 00:00:00 2001
From: Przemek <minikers21@gmail.com>
Date: Wed, 9 Jul 2025 14:38:39 +0200
Subject: [PATCH 475/529] Fixed emulated float compilation errors

---
 62_CAD/main.cpp                               |  2 +-
 .../shaders/main_pipeline/vertex_shader.hlsl  | 38 ++++++++++---------
 2 files changed, 22 insertions(+), 18 deletions(-)

diff --git a/62_CAD/main.cpp b/62_CAD/main.cpp
index 1870f0dba..10827f65b 100644
--- a/62_CAD/main.cpp
+++ b/62_CAD/main.cpp
@@ -80,7 +80,7 @@ constexpr std::array<float, (uint32_t)ExampleMode::CASE_COUNT> cameraExtents =
 	1000.0	// CASE_11
 };
 
-constexpr ExampleMode mode = ExampleMode::CASE_11;
+constexpr ExampleMode mode = ExampleMode::CASE_8;
 
 class Camera2D
 {
diff --git a/62_CAD/shaders/main_pipeline/vertex_shader.hlsl b/62_CAD/shaders/main_pipeline/vertex_shader.hlsl
index 2b54d79e7..feb3ec4b5 100644
--- a/62_CAD/shaders/main_pipeline/vertex_shader.hlsl
+++ b/62_CAD/shaders/main_pipeline/vertex_shader.hlsl
@@ -582,7 +582,7 @@ PSInput main(uint vertexID : SV_VertexID)
             float32_t2 minUV = glyphInfo.getMinUV();
             uint16_t textureID = glyphInfo.getTextureID();
             
-            const int ndcYDirectionSign = sign(clipProjectionData.projectionToNDC[1][1]);
+            const int ndcYDirectionSign = sign(_static_cast<float>(clipProjectionData.projectionToNDC[1].y));
             const float32_t2 dirV = float32_t2(glyphInfo.dirU.y, ndcYDirectionSign * glyphInfo.dirU.x) * glyphInfo.aspectRatio;
             const float2 screenTopLeft = _static_cast<float2>(transformPointNdc(clipProjectionData.projectionToNDC, glyphInfo.topLeft));
             const float2 screenDirU = _static_cast<float2>(transformVectorNdc(clipProjectionData.projectionToNDC, _static_cast<pfloat64_t2>(glyphInfo.dirU)));
@@ -632,7 +632,7 @@ PSInput main(uint vertexID : SV_VertexID)
             uint32_t textureID = vk::RawBufferLoad<uint32_t>(globals.pointers.geometryBuffer + drawObj.geometryAddress + sizeof(pfloat64_t2) + sizeof(float2) + sizeof(float), 4u);
 
             // If y increases as we go down in ndc this sign is positive (screenspace-like transformations), if y decreases as we go down this sign is negative (worldspace-like transformations)
-            const int ndcYDirectionSign = sign(clipProjectionData.projectionToNDC[1][1]);
+            const int ndcYDirectionSign = sign(_static_cast<float>(clipProjectionData.projectionToNDC[1].y));
             const float32_t2 dirV = float32_t2(dirU.y, ndcYDirectionSign * dirU.x) * aspectRatio;
             const float2 ndcTopLeft = _static_cast<float2>(transformPointNdc(clipProjectionData.projectionToNDC, topLeft));
             const float2 ndcDirU = _static_cast<float2>(transformVectorNdc(clipProjectionData.projectionToNDC, _static_cast<pfloat64_t2>(dirU)));
@@ -650,7 +650,7 @@ PSInput main(uint vertexID : SV_VertexID)
         else if (objType == ObjectType::GRID_DTM)
         {
             pfloat64_t2 topLeft = vk::RawBufferLoad<pfloat64_t2>(globals.pointers.geometryBuffer + drawObj.geometryAddress, 8u);
-            pfloat64_t2 worldSpaceExtents = vk::RawBufferLoad<pfloat64_t2>(globals.pointers.geometryBuffer + drawObj.geometryAddress + sizeof(pfloat64_t2), 8u);
+            const pfloat64_t2 worldSpaceExtents = vk::RawBufferLoad<pfloat64_t2>(globals.pointers.geometryBuffer + drawObj.geometryAddress + sizeof(pfloat64_t2), 8u);
             uint32_t textureID = vk::RawBufferLoad<uint32_t>(globals.pointers.geometryBuffer + drawObj.geometryAddress + 2 * sizeof(pfloat64_t2), 8u);
             float gridCellWidth = vk::RawBufferLoad<float>(globals.pointers.geometryBuffer + drawObj.geometryAddress + 2 * sizeof(pfloat64_t2) + sizeof(uint32_t), 8u);
             float thicknessOfTheThickestLine = vk::RawBufferLoad<float>(globals.pointers.geometryBuffer + drawObj.geometryAddress + 2 * sizeof(pfloat64_t2) + sizeof(uint32_t) + sizeof(float), 8u);
@@ -666,30 +666,35 @@ PSInput main(uint vertexID : SV_VertexID)
             outV.setGridDTMScreenSpaceGridExtents(_static_cast<float2>(worldSpaceExtents) * globals.screenToWorldRatio);
 
             static const float SquareRootOfTwo = 1.4142135f;
-            const pfloat64_t dilationFactor = SquareRootOfTwo * thicknessOfTheThickestLine;
-            pfloat64_t2 dilationVector = pfloat64_t2(dilationFactor, dilationFactor);
+            const pfloat64_t dilationFactor = _static_cast<pfloat64_t>(SquareRootOfTwo * thicknessOfTheThickestLine);
+            pfloat64_t2 dilationVector;
+            dilationVector.x = dilationFactor;
+            dilationVector.y = dilationFactor;
 
             const pfloat64_t dilationFactorTimesTwo = dilationFactor * 2.0f;
-            const pfloat64_t2 dilatedGridExtents = worldSpaceExtents + pfloat64_t2(dilationFactorTimesTwo, dilationFactorTimesTwo);
+            pfloat64_t2 dilationFactorTimesTwoVector;
+            dilationFactorTimesTwoVector.x = dilationFactorTimesTwo;
+            dilationFactorTimesTwoVector.y = dilationFactorTimesTwo;
+            const pfloat64_t2 dilatedGridExtents = worldSpaceExtents + dilationFactorTimesTwoVector;
             const float2 uvScale = _static_cast<float2>(worldSpaceExtents) / _static_cast<float2>(dilatedGridExtents);
-            float2 uvOffset = float2(dilationFactor, dilationFactor) / _static_cast<float2>(dilatedGridExtents);
+            float2 uvOffset = _static_cast<float2>(dilationVector) / _static_cast<float2>(dilatedGridExtents);
             uvOffset /= uvScale;
 
             if (corner.x == 0.0f && corner.y == 0.0f)
             {
-                dilationVector.x = -dilationVector.x;
+                dilationVector.x = ieee754::flipSign(dilationVector.x);
                 uvOffset.x = -uvOffset.x;
                 uvOffset.y = -uvOffset.y;
             }
             else if (corner.x == 0.0f && corner.y == 1.0f)
             {
-                dilationVector.x = -dilationVector.x;
-                dilationVector.y = -dilationVector.y;
+                dilationVector.x = ieee754::flipSign(dilationVector.x);
+                dilationVector.y = ieee754::flipSign(dilationVector.y);
                 uvOffset.x = -uvOffset.x;
             }
             else if (corner.x == 1.0f && corner.y == 1.0f)
             {
-                dilationVector.y = -dilationVector.y;
+                dilationVector.y = ieee754::flipSign(dilationVector.y);
             }
             else if (corner.x == 1.0f && corner.y == 0.0f)
             {
@@ -699,15 +704,14 @@ PSInput main(uint vertexID : SV_VertexID)
             const float2 uv = corner + uvOffset;
             outV.setImageUV(uv);
 
-            const pfloat64_t2 vtxPos = topLeft + float2(worldSpaceExtents.x, -worldSpaceExtents.y) * corner;
+            pfloat64_t2 worldSpaceExtentsYAxisFlipped;
+            worldSpaceExtentsYAxisFlipped.x = worldSpaceExtents.x;
+            worldSpaceExtentsYAxisFlipped.y = ieee754::flipSign(worldSpaceExtents.y);
+            const pfloat64_t2 vtxPos = topLeft + worldSpaceExtentsYAxisFlipped * _static_cast<pfloat64_t2>(corner);
             const pfloat64_t2 dilatedVtxPos = vtxPos + dilationVector;
 
             float2 ndcVtxPos = _static_cast<float2>(transformPointNdc(clipProjectionData.projectionToNDC, dilatedVtxPos));
             outV.position = float4(ndcVtxPos, 0.0f, 1.0f);
-
-            /*outV.setImageUV(corner);
-            float2 ndcVtxPos = _static_cast<float2>(transformPointNdc(clipProjectionData.projectionToNDC, vtxPos));
-            outV.position = float4(ndcVtxPos, 0.0f, 1.0f);*/
         }
         else if (objType == ObjectType::STREAMED_IMAGE)
         {
@@ -716,7 +720,7 @@ PSInput main(uint vertexID : SV_VertexID)
             float32_t aspectRatio = vk::RawBufferLoad<float32_t>(globals.pointers.geometryBuffer + drawObj.geometryAddress + sizeof(pfloat64_t2) + sizeof(float2), 4u);
             uint32_t textureID = vk::RawBufferLoad<uint32_t>(globals.pointers.geometryBuffer + drawObj.geometryAddress + sizeof(pfloat64_t2) + sizeof(float2) + sizeof(float), 4u);
             
-            const int ndcYDirectionSign = sign(clipProjectionData.projectionToNDC[1][1]);
+            const int ndcYDirectionSign = sign(_static_cast<float>(clipProjectionData.projectionToNDC[1].y));
             const float32_t2 dirV = float32_t2(dirU.y, ndcYDirectionSign * dirU.x) * aspectRatio;
             const float2 ndcTopLeft = _static_cast<float2>(transformPointNdc(clipProjectionData.projectionToNDC, topLeft));
             const float2 ndcDirU = _static_cast<float2>(transformVectorNdc(clipProjectionData.projectionToNDC, _static_cast<pfloat64_t2>(dirU)));

From a6271a4ca05eb0fe8bb7316f5bd857e64fdc3734 Mon Sep 17 00:00:00 2001
From: Arkadiusz Lachowicz <areklachowicz@gmail.com>
Date: Thu, 10 Jul 2025 03:12:49 +0200
Subject: [PATCH 476/529] adjust to new API changes

---
 09_GeometryCreator/main.cpp                   |  1 -
 common/CMakeLists.txt                         |  1 +
 common/include/nbl/examples/PCH.hpp           |  1 +
 common/include/nbl/examples/examples.hpp      |  3 +-
 .../geometry/CSimpleDebugRenderer.hpp         |  8 +---
 common/src/nbl/examples/CMakeLists.txt        | 41 +++++++++++++++----
 6 files changed, 37 insertions(+), 18 deletions(-)

diff --git a/09_GeometryCreator/main.cpp b/09_GeometryCreator/main.cpp
index 900d827b7..cb3c21f4d 100644
--- a/09_GeometryCreator/main.cpp
+++ b/09_GeometryCreator/main.cpp
@@ -5,7 +5,6 @@
 
 #include "common.hpp"
 
-
 class GeometryCreatorApp final : public MonoWindowApplication, public BuiltinResourcesApplication
 {
 	using device_base_t = MonoWindowApplication;
diff --git a/common/CMakeLists.txt b/common/CMakeLists.txt
index 2c4037e2d..f388c4cbf 100644
--- a/common/CMakeLists.txt
+++ b/common/CMakeLists.txt
@@ -84,6 +84,7 @@ endif()
 ]]
 
 add_subdirectory("src/nbl/examples" EXCLUDE_FROM_ALL)
+target_link_libraries(${LIB_NAME} PUBLIC NblExtExamplesAPISPIRV)
 
 NBL_GET_ALL_TARGETS(TARGETS)
 list(REMOVE_ITEM TARGETS ${LIB_NAME})
diff --git a/common/include/nbl/examples/PCH.hpp b/common/include/nbl/examples/PCH.hpp
index 0905465c2..a20984464 100644
--- a/common/include/nbl/examples/PCH.hpp
+++ b/common/include/nbl/examples/PCH.hpp
@@ -14,6 +14,7 @@
 #include "nabla.h"
 
 //! Common example interface headers
+#include "nbl/examples/common/build/spirv/keys.hpp"
 #include "nbl/examples/common/SimpleWindowedApplication.hpp"
 #include "nbl/examples/common/MonoWindowApplication.hpp"
 #include "nbl/examples/common/InputSystem.hpp"
diff --git a/common/include/nbl/examples/examples.hpp b/common/include/nbl/examples/examples.hpp
index d82303514..1450abc2a 100644
--- a/common/include/nbl/examples/examples.hpp
+++ b/common/include/nbl/examples/examples.hpp
@@ -17,8 +17,7 @@
 
 // #include "..."
 
-// Cannot be in PCH because depens on definition of `this_example` for Example's builtins
+// cannot be in PCH because depens on definition of `this_example` for Example's builtins
 #include "nbl/examples/common/BuiltinResourcesApplication.hpp"
 
-
 #endif // _NBL_EXAMPLES_HPP_
\ No newline at end of file
diff --git a/common/include/nbl/examples/geometry/CSimpleDebugRenderer.hpp b/common/include/nbl/examples/geometry/CSimpleDebugRenderer.hpp
index d5b48de0e..9a9e5c966 100644
--- a/common/include/nbl/examples/geometry/CSimpleDebugRenderer.hpp
+++ b/common/include/nbl/examples/geometry/CSimpleDebugRenderer.hpp
@@ -1,15 +1,9 @@
 #ifndef _NBL_EXAMPLES_C_SIMPLE_DEBUG_RENDERER_H_INCLUDED_
 #define _NBL_EXAMPLES_C_SIMPLE_DEBUG_RENDERER_H_INCLUDED_
 
-
 #include "nbl/builtin/hlsl/math/linalg/fast_affine.hlsl"
 #include "nbl/examples/geometry/SPushConstants.hlsl"
 
-// TODO: Arek bring back
-//#include "nbl/examples/geometry/spirv/builtin/CArchive.h"
-//#include "nbl/examples/geometry/spirv/builtin/builtinResources.h"
-
-
 namespace nbl::examples
 {
 
@@ -108,7 +102,7 @@ class CSimpleDebugRenderer final : public core::IReferenceCounted
 			// load shader
 			smart_refctd_ptr<IShader> shader;
 			{
-				constexpr std::string_view key = "nbl/examples/shaders/geometry/unified.spv";
+				auto key = "nbl/examples/" + nbl::builtin::examples::build::get_spirv_key<"shaders/geometry/unified">(device);
 				const auto bundle = assMan->getAsset(key.data(), {});
 
 				const auto contents = bundle.getContents();
diff --git a/common/src/nbl/examples/CMakeLists.txt b/common/src/nbl/examples/CMakeLists.txt
index b55b699dd..3fa63c839 100644
--- a/common/src/nbl/examples/CMakeLists.txt
+++ b/common/src/nbl/examples/CMakeLists.txt
@@ -1,9 +1,22 @@
 set(OUTPUT_DIRECTORY "${CMAKE_CURRENT_BINARY_DIR}/auto-gen")
-set(ARGS 
+set(ARGS
+	# meta INTERFACE target with NSC compilation rules
 	TARGET NblExtExamplesAPISPIRV
+
+	# build directory for its SPIRV outputs
 	BINARY_DIR ${OUTPUT_DIRECTORY}
+
+	# extra NSC compile options
 	COMMON_OPTIONS -I "${COMMON_INCLUDE_DIRECTORY}"
+
+	# fallback variable to which SPIRV access keys are appended to (including permutations), relative to BINARY_DIR
 	OUTPUT_VAR KEYS
+
+	# include file with key getters, use with #include directive on downstream targets
+	INCLUDE nbl/examples/common/build/spirv/keys.hpp
+
+	# namespace for key getters in include file
+	NAMESPACE nbl::builtin::examples::build
 )
 
 # note json is array of objects, you can register all rules at once
@@ -11,7 +24,7 @@ set(JSON [=[
 [
     {
 		"INPUT": "shaders/geometry/unified.hlsl",
-		"KEY": "shaders/geometry/unified.spv",
+		"KEY": "shaders/geometry/unified",
         "COMPILE_OPTIONS": ["-T", "lib_6_6"],
 		"DEPENDS": [],
 		"CAPS": []
@@ -25,26 +38,39 @@ set(JSON [=[
 [
     {
 		"INPUT": "shaders/geometry/unified.hlsl",
-		"KEY": "shaders/geometry/unified.spv",
+		"KEY": "shaders/geometry/unified-caps",
         "COMPILE_OPTIONS": ["-T", "lib_6_6"],
 		"DEPENDS": [],
 		"CAPS": [
             {
                 "name": "shaderFloat64",
 				"type": "bool",
-                "values": ["true", "false"]
+                "values": [1, 0]
             },
             {
                 "name": "subgroupSize",
 				"type": "uint16_t",
-                "values": ["32", "64"]
+                "values": [32, 64]
+            }
+        ]
+    },
+	{
+		"INPUT": "shaders/geometry/unified.hlsl",
+		"KEY": "shaders/geometry/unified-caps-2",
+        "COMPILE_OPTIONS": ["-T", "lib_6_6"],
+		"DEPENDS": [],
+		"CAPS": [
+            {
+                "name": "shaderFloat64",
+				"type": "bool",
+                "values": [1, 0]
             }
         ]
     }
 ]
 ]=])
 
-# but it also supports incremental rule updates, uncomment to add rules with permutation caps
+# it also supports incremental rule updates, uncomment to add rules with permutation caps (testing purposes, remove after review)
 # NBL_CREATE_NSC_COMPILE_RULES(${ARGS} INPUTS ${JSON})
 
 # note we can add more inputs from build dir which keys can be part of the same archive/mount point,
@@ -62,6 +88,5 @@ NBL_CREATE_RESOURCE_ARCHIVE(
 
 if(NBL_EMBED_BUILTIN_RESOURCES)
 	INTERFACE_TO_BUILTINS(NblExtExamplesAPIBuiltinsBuild)
+	target_link_libraries(NblExtExamplesAPIBuiltinsBuild PUBLIC NblExtExamplesAPISPIRV)
 endif()
-
-target_link_libraries(NblExtExamplesAPIBuiltinsBuild PUBLIC NblExtExamplesAPISPIRV)

From e6417f4f38ccc4abb4fdaa17161a6ed19471bb9e Mon Sep 17 00:00:00 2001
From: Przemek <minikers21@gmail.com>
Date: Thu, 10 Jul 2025 21:21:07 +0200
Subject: [PATCH 477/529] Renamed CASE_BUG after bug fix

---
 62_CAD/main.cpp | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/62_CAD/main.cpp b/62_CAD/main.cpp
index 10827f65b..05ca64009 100644
--- a/62_CAD/main.cpp
+++ b/62_CAD/main.cpp
@@ -59,7 +59,7 @@ enum class ExampleMode
 	CASE_7, // Images
 	CASE_8, // MSDF and Text
 	CASE_9, // DTM
-	CASE_BUG, // Bug Repro, after fix, rename to CASE_10 and comment should be: testing fixed geometry and emulated fp64 corner cases
+	CASE_10, // testing fixed geometry and emulated fp64 corner cases
 	CASE_11, // grid DTM
 	CASE_COUNT
 };

From ea00599368a8842ae616255f1587c0f02a1600a4 Mon Sep 17 00:00:00 2001
From: Przemek <minikers21@gmail.com>
Date: Thu, 10 Jul 2025 21:37:05 +0200
Subject: [PATCH 478/529] Fixed compilation errors

---
 62_CAD/main.cpp | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/62_CAD/main.cpp b/62_CAD/main.cpp
index 05ca64009..df0eca09c 100644
--- a/62_CAD/main.cpp
+++ b/62_CAD/main.cpp
@@ -76,11 +76,11 @@ constexpr std::array<float, (uint32_t)ExampleMode::CASE_COUNT> cameraExtents =
 	10.0,	// CASE_7
 	600.0,	// CASE_8
 	600.0,	// CASE_9
-	10.0,	// CASE_BUG
+	10.0,	// CASE_10
 	1000.0	// CASE_11
 };
 
-constexpr ExampleMode mode = ExampleMode::CASE_8;
+constexpr ExampleMode mode = ExampleMode::CASE_11;
 
 class Camera2D
 {
@@ -3470,7 +3470,7 @@ class ComputerAidedDesign final : public examples::SimpleWindowedApplication, pu
 
 			drawResourcesFiller.drawTriangleMesh(mesh, dtmInfo, intendedNextSubmit);
 		}
-		else if (mode == ExampleMode::CASE_BUG)
+		else if (mode == ExampleMode::CASE_10)
 		{
 			CPolyline polyline;
 			

From e6c50acb07f904e0f1616a7155dfe71596dee2e8 Mon Sep 17 00:00:00 2001
From: kevyuu <kevin.kayu@gmail.com>
Date: Fri, 11 Jul 2025 05:51:34 +0700
Subject: [PATCH 479/529] Fix raytrace hit shader

---
 71_RayTracingPipeline/app_resources/raytrace.rchit.hlsl | 3 +--
 1 file changed, 1 insertion(+), 2 deletions(-)

diff --git a/71_RayTracingPipeline/app_resources/raytrace.rchit.hlsl b/71_RayTracingPipeline/app_resources/raytrace.rchit.hlsl
index b513d5958..8fd23a235 100644
--- a/71_RayTracingPipeline/app_resources/raytrace.rchit.hlsl
+++ b/71_RayTracingPipeline/app_resources/raytrace.rchit.hlsl
@@ -37,8 +37,7 @@ float32_t3 fetchVertexNormal(int instID, int primID, STriangleGeomInfo geom, flo
         }
     }
 
-    const uint64_t normalVertexBufferAddress = geom.normalBufferAddress;
-    float3 n0, n1, n2;
+    const uint64_t normalBufferAddress = geom.normalBufferAddress;
 
     float3 n0, n1, n2;
     switch (objType)

From 663f2f298bcd77530d42c6443a6ad6a9c6bb6b4f Mon Sep 17 00:00:00 2001
From: kevyuu <kevin.kayu@gmail.com>
Date: Fri, 11 Jul 2025 05:52:01 +0700
Subject: [PATCH 480/529] In progress

---
 71_RayTracingPipeline/main.cpp | 130 ++++++++++++++++++---------------
 1 file changed, 72 insertions(+), 58 deletions(-)

diff --git a/71_RayTracingPipeline/main.cpp b/71_RayTracingPipeline/main.cpp
index c47eea1c4..159421663 100644
--- a/71_RayTracingPipeline/main.cpp
+++ b/71_RayTracingPipeline/main.cpp
@@ -289,85 +289,99 @@ class RaytracingPipelineApp final : public SimpleWindowedApplication, public Bui
 
 		// ray trace pipeline and descriptor set layout setup
 		{
-			const IGPUDescriptorSetLayout::SBinding bindings[] = {
-			  {
-				.binding = 0,
-				.type = asset::IDescriptor::E_TYPE::ET_ACCELERATION_STRUCTURE,
-				.createFlags = IGPUDescriptorSetLayout::SBinding::E_CREATE_FLAGS::ECF_NONE,
-				.stageFlags = asset::IShader::E_SHADER_STAGE::ESS_RAYGEN,
-				.count = 1,
+			const auto bindings = std::array<const ICPUDescriptorSetLayout::SBinding, 2>{
+			  ICPUDescriptorSetLayout::SBinding{
+          .binding = 0,
+          .type = asset::IDescriptor::E_TYPE::ET_ACCELERATION_STRUCTURE,
+          .createFlags = IGPUDescriptorSetLayout::SBinding::E_CREATE_FLAGS::ECF_NONE,
+          .stageFlags = asset::IShader::E_SHADER_STAGE::ESS_RAYGEN,
+          .count = 1,
 			  },
 			  {
-				.binding = 1,
-				.type = asset::IDescriptor::E_TYPE::ET_STORAGE_IMAGE,
-				.createFlags = IGPUDescriptorSetLayout::SBinding::E_CREATE_FLAGS::ECF_NONE,
-				.stageFlags = asset::IShader::E_SHADER_STAGE::ESS_RAYGEN,
-				.count = 1,
+          .binding = 1,
+          .type = asset::IDescriptor::E_TYPE::ET_STORAGE_IMAGE,
+          .createFlags = IGPUDescriptorSetLayout::SBinding::E_CREATE_FLAGS::ECF_NONE,
+          .stageFlags = asset::IShader::E_SHADER_STAGE::ESS_RAYGEN,
+          .count = 1,
 			  }
 			};
-			const auto descriptorSetLayout = m_device->createDescriptorSetLayout(bindings);
-
-			const std::array<IGPUDescriptorSetLayout*, ICPUPipelineLayout::DESCRIPTOR_SET_COUNT> dsLayoutPtrs = { descriptorSetLayout.get() };
-			m_rayTracingDsPool = m_device->createDescriptorPoolForDSLayouts(IDescriptorPool::ECF_UPDATE_AFTER_BIND_BIT, std::span(dsLayoutPtrs.begin(), dsLayoutPtrs.end()));
-			m_rayTracingDs = m_rayTracingDsPool->createDescriptorSet(descriptorSetLayout);
+			auto descriptorSetLayout = core::make_smart_refctd_ptr<ICPUDescriptorSetLayout>(bindings);
 
 			const SPushConstantRange pcRange = {
 			  .stageFlags = IShader::E_SHADER_STAGE::ESS_ALL_RAY_TRACING,
 			  .offset = 0u,
 			  .size = sizeof(SPushConstants),
 			};
-			const auto pipelineLayout = m_device->createPipelineLayout({ &pcRange, 1 }, smart_refctd_ptr(descriptorSetLayout), nullptr, nullptr, nullptr);
-
-			IGPURayTracingPipeline::SCreationParams params = {};
-			params.layout = pipelineLayout.get();
-			using RayTracingFlags = IGPURayTracingPipeline::SCreationParams::FLAGS;
-			params.flags = core::bitflag(RayTracingFlags::NO_NULL_MISS_SHADERS) |
-				RayTracingFlags::NO_NULL_INTERSECTION_SHADERS |
-				RayTracingFlags::NO_NULL_ANY_HIT_SHADERS;
+			const auto pipelineLayout = core::make_smart_refctd_ptr<ICPUPipelineLayout>(std::span<const asset::SPushConstantRange>({ pcRange }), std::move(descriptorSetLayout), nullptr, nullptr, nullptr);
 
-			auto& shaderGroups = params.shaderGroups;
+			const auto pipeline = ICPURayTracingPipeline::create(pipelineLayout.get());
+			pipeline->getCachedCreationParams() = {
+				.maxRecursionDepth = 1,
+				.dynamicStackSize = true,
+			};
 
-			shaderGroups.raygen = { .shader = raygenShader.get(), .entryPoint = "main" };
+			pipeline->getSpecInfos(ESS_RAYGEN)[0] = {
+				.shader = raygenShader,
+				.entryPoint = "main",
+			};
 
-			IGPUPipelineBase::SShaderSpecInfo missGroups[EMT_COUNT];
-			missGroups[EMT_PRIMARY] = { .shader = missShader.get(), .entryPoint = "main" };
-			missGroups[EMT_OCCLUSION] = { .shader = missShadowShader.get(), .entryPoint = "main" };
-			shaderGroups.misses = missGroups;
+			pipeline->getSpecInfoVector(ESS_MISS)->resize(EMT_COUNT);
+			const auto missGroups = pipeline->getSpecInfos(ESS_MISS);
+			missGroups[EMT_PRIMARY] = { .shader = missShader, .entryPoint = "main" };
+			missGroups[EMT_OCCLUSION] = { .shader = missShadowShader, .entryPoint = "main" };
 
 			auto getHitGroupIndex = [](E_GEOM_TYPE geomType, E_RAY_TYPE rayType)
 				{
 					return geomType * ERT_COUNT + rayType;
 				};
-			IGPURayTracingPipeline::SHitGroup hitGroups[E_RAY_TYPE::ERT_COUNT * E_GEOM_TYPE::EGT_COUNT];
-			hitGroups[getHitGroupIndex(EGT_TRIANGLES, ERT_PRIMARY)] = {
-				.closestHit = { .shader = closestHitShader.get(), .entryPoint = "main" },
-			  .anyHit = { .shader = anyHitShaderColorPayload.get(), .entryPoint = "main" },
-			};
-			hitGroups[getHitGroupIndex(EGT_TRIANGLES, ERT_OCCLUSION)] = {
-			  .anyHit = { .shader = anyHitShaderShadowPayload.get(), .entryPoint = "main" },
-			};
-			hitGroups[getHitGroupIndex(EGT_PROCEDURAL, ERT_PRIMARY)] = {
-			  .closestHit = { .shader = proceduralClosestHitShader.get(), .entryPoint = "main" },
-			  .anyHit = { .shader = anyHitShaderColorPayload.get(), .entryPoint = "main" },
-			  .intersection = { .shader = intersectionHitShader.get(), .entryPoint = "main" },
-			};
-			hitGroups[getHitGroupIndex(EGT_PROCEDURAL, ERT_OCCLUSION)] = {
-			  .anyHit = { .shader = anyHitShaderShadowPayload.get(), .entryPoint = "main" },
-			  .intersection = { .shader = intersectionHitShader.get(), .entryPoint = "main" },
-			};
-			shaderGroups.hits = hitGroups;
 
-			IGPUPipelineBase::SShaderSpecInfo callableGroups[ELT_COUNT];
-			callableGroups[ELT_DIRECTIONAL] = { .shader = directionalLightCallShader.get(), .entryPoint = "main" };
-			callableGroups[ELT_POINT] = { .shader = pointLightCallShader.get(), .entryPoint = "main" };
-			callableGroups[ELT_SPOT] = { .shader = spotLightCallShader.get(), .entryPoint = "main" };
-			shaderGroups.callables = callableGroups;
+			const auto hitGroupCount = ERT_COUNT * EGT_COUNT;
+			pipeline->getSpecInfoVector(ESS_CLOSEST_HIT)->resize(hitGroupCount);
+			pipeline->getSpecInfoVector(ESS_ANY_HIT)->resize(hitGroupCount);
+			pipeline->getSpecInfoVector(ESS_INTERSECTION)->resize(hitGroupCount);
+
+			const auto closestHitSpecs = pipeline->getSpecInfos(ESS_CLOSEST_HIT);
+			const auto anyHitSpecs = pipeline->getSpecInfos(ESS_ANY_HIT);
+			const auto intersectionSpecs = pipeline->getSpecInfos(ESS_INTERSECTION);
+
+			closestHitSpecs[getHitGroupIndex(EGT_TRIANGLES, ERT_PRIMARY)] = { .shader = closestHitShader, .entryPoint = "main" };
+			anyHitSpecs[getHitGroupIndex(EGT_TRIANGLES, ERT_PRIMARY)] = {.shader = anyHitShaderColorPayload, .entryPoint = "main"};
+
+			anyHitSpecs[getHitGroupIndex(EGT_TRIANGLES, ERT_OCCLUSION)] = { .shader = anyHitShaderShadowPayload, .entryPoint = "main" };
+			closestHitSpecs[getHitGroupIndex(EGT_PROCEDURAL, ERT_PRIMARY)] = { .shader = proceduralClosestHitShader, .entryPoint = "main" };
+			anyHitSpecs[getHitGroupIndex(EGT_PROCEDURAL, ERT_PRIMARY)] = { .shader = anyHitShaderColorPayload, .entryPoint = "main" };
+			intersectionSpecs[getHitGroupIndex(EGT_PROCEDURAL, ERT_PRIMARY)] = { .shader = intersectionHitShader, .entryPoint = "main" };
 
-			params.cached.maxRecursionDepth = 1;
-			params.cached.dynamicStackSize = true;
+			anyHitSpecs[getHitGroupIndex(EGT_PROCEDURAL, ERT_OCCLUSION)] = {.shader = anyHitShaderShadowPayload, .entryPoint = "main" };
+			intersectionSpecs[getHitGroupIndex(EGT_PROCEDURAL, ERT_OCCLUSION)] = { .shader = intersectionHitShader, .entryPoint = "main" };
 
-			if (!m_device->createRayTracingPipelines(nullptr, { &params, 1 }, &m_rayTracingPipeline))
-				return logFail("Failed to create ray tracing pipeline");
+			pipeline->getSpecInfoVector(ESS_CALLABLE)->resize(ELT_COUNT);
+			const auto callableGroups = pipeline->getSpecInfos(ESS_CALLABLE);
+			callableGroups[ELT_DIRECTIONAL] = { .shader = directionalLightCallShader, .entryPoint = "main" };
+			callableGroups[ELT_POINT] = { .shader = pointLightCallShader, .entryPoint = "main" };
+			callableGroups[ELT_SPOT] = { .shader = spotLightCallShader, .entryPoint = "main" };
+
+      smart_refctd_ptr<CAssetConverter> converter = CAssetConverter::create({ .device = m_device.get(), .optimizer = {} });
+		  CAssetConverter::SInputs inputs = {};
+      inputs.logger = m_logger.get();
+
+			const std::array cpuPipelines = { pipeline.get() };
+			std::get<CAssetConverter::SInputs::asset_span_t<ICPURayTracingPipeline>>(inputs.assets) = cpuPipelines;
+
+			CAssetConverter::SConvertParams params = {};
+			params.utilities = m_utils.get();
+
+      auto reservation = converter->reserve(inputs);
+			auto future = reservation.convert(params);
+			if (future.copy() != IQueue::RESULT::SUCCESS)
+			{
+				m_logger->log("Failed to await submission feature!", ILogger::ELL_ERROR);
+				return false;
+			}
+
+			// assign gpu objects to output
+			auto&& pipelines = reservation.getGPUObjects<ICPURayTracingPipeline>();
+			m_rayTracingPipeline = pipelines[0].value;
 
 			calculateRayTracingStackSize(m_rayTracingPipeline);
 

From 06d839d6c3185fd66ad95602ab50d61f67fa4c7c Mon Sep 17 00:00:00 2001
From: Przemek <minikers21@gmail.com>
Date: Fri, 11 Jul 2025 12:30:57 +0200
Subject: [PATCH 481/529] Enabled grid DTM height map view format validation

---
 62_CAD/DrawResourcesFiller.cpp | 3 +--
 1 file changed, 1 insertion(+), 2 deletions(-)

diff --git a/62_CAD/DrawResourcesFiller.cpp b/62_CAD/DrawResourcesFiller.cpp
index 152401ded..66b54d3ab 100644
--- a/62_CAD/DrawResourcesFiller.cpp
+++ b/62_CAD/DrawResourcesFiller.cpp
@@ -2362,8 +2362,7 @@ DrawResourcesFiller::ImageAllocateResults DrawResourcesFiller::tryCreateAndAlloc
 		
 		if (imageViewFormatOverride != asset::E_FORMAT::EF_COUNT && imageViewFormatOverride != imageParams.format)
 		{
-			// TODO: figure out why this crashes the app
-			//params.viewFormats.set(static_cast<size_t>(imageViewFormatOverride), true);
+			params.viewFormats.set(static_cast<size_t>(imageViewFormatOverride), true);
 			params.flags |= asset::IImage::E_CREATE_FLAGS::ECF_MUTABLE_FORMAT_BIT;
 		}
 		auto gpuImage = device->createImage(std::move(params));

From 3a0564221c709077560d2d135ccad60eac447c06 Mon Sep 17 00:00:00 2001
From: kevyuu <kevin.kayu@gmail.com>
Date: Fri, 11 Jul 2025 05:52:01 +0700
Subject: [PATCH 482/529] Use cpu ray tracing pipeline asset conversion on rt
 pipeline demo

---
 71_RayTracingPipeline/main.cpp | 136 +++++++++++++++++++--------------
 1 file changed, 78 insertions(+), 58 deletions(-)

diff --git a/71_RayTracingPipeline/main.cpp b/71_RayTracingPipeline/main.cpp
index c47eea1c4..0ba5c7df1 100644
--- a/71_RayTracingPipeline/main.cpp
+++ b/71_RayTracingPipeline/main.cpp
@@ -289,85 +289,105 @@ class RaytracingPipelineApp final : public SimpleWindowedApplication, public Bui
 
 		// ray trace pipeline and descriptor set layout setup
 		{
-			const IGPUDescriptorSetLayout::SBinding bindings[] = {
-			  {
-				.binding = 0,
-				.type = asset::IDescriptor::E_TYPE::ET_ACCELERATION_STRUCTURE,
-				.createFlags = IGPUDescriptorSetLayout::SBinding::E_CREATE_FLAGS::ECF_NONE,
-				.stageFlags = asset::IShader::E_SHADER_STAGE::ESS_RAYGEN,
-				.count = 1,
+			const auto bindings = std::array<const ICPUDescriptorSetLayout::SBinding, 2>{
+			  ICPUDescriptorSetLayout::SBinding{
+          .binding = 0,
+          .type = asset::IDescriptor::E_TYPE::ET_ACCELERATION_STRUCTURE,
+          .createFlags = IGPUDescriptorSetLayout::SBinding::E_CREATE_FLAGS::ECF_NONE,
+          .stageFlags = asset::IShader::E_SHADER_STAGE::ESS_RAYGEN,
+          .count = 1,
 			  },
 			  {
-				.binding = 1,
-				.type = asset::IDescriptor::E_TYPE::ET_STORAGE_IMAGE,
-				.createFlags = IGPUDescriptorSetLayout::SBinding::E_CREATE_FLAGS::ECF_NONE,
-				.stageFlags = asset::IShader::E_SHADER_STAGE::ESS_RAYGEN,
-				.count = 1,
+          .binding = 1,
+          .type = asset::IDescriptor::E_TYPE::ET_STORAGE_IMAGE,
+          .createFlags = IGPUDescriptorSetLayout::SBinding::E_CREATE_FLAGS::ECF_NONE,
+          .stageFlags = asset::IShader::E_SHADER_STAGE::ESS_RAYGEN,
+          .count = 1,
 			  }
 			};
-			const auto descriptorSetLayout = m_device->createDescriptorSetLayout(bindings);
-
-			const std::array<IGPUDescriptorSetLayout*, ICPUPipelineLayout::DESCRIPTOR_SET_COUNT> dsLayoutPtrs = { descriptorSetLayout.get() };
-			m_rayTracingDsPool = m_device->createDescriptorPoolForDSLayouts(IDescriptorPool::ECF_UPDATE_AFTER_BIND_BIT, std::span(dsLayoutPtrs.begin(), dsLayoutPtrs.end()));
-			m_rayTracingDs = m_rayTracingDsPool->createDescriptorSet(descriptorSetLayout);
+			auto cpuDescriptorSetLayout = core::make_smart_refctd_ptr<ICPUDescriptorSetLayout>(bindings);
 
 			const SPushConstantRange pcRange = {
 			  .stageFlags = IShader::E_SHADER_STAGE::ESS_ALL_RAY_TRACING,
 			  .offset = 0u,
 			  .size = sizeof(SPushConstants),
 			};
-			const auto pipelineLayout = m_device->createPipelineLayout({ &pcRange, 1 }, smart_refctd_ptr(descriptorSetLayout), nullptr, nullptr, nullptr);
-
-			IGPURayTracingPipeline::SCreationParams params = {};
-			params.layout = pipelineLayout.get();
-			using RayTracingFlags = IGPURayTracingPipeline::SCreationParams::FLAGS;
-			params.flags = core::bitflag(RayTracingFlags::NO_NULL_MISS_SHADERS) |
-				RayTracingFlags::NO_NULL_INTERSECTION_SHADERS |
-				RayTracingFlags::NO_NULL_ANY_HIT_SHADERS;
+			const auto cpuPipelineLayout = core::make_smart_refctd_ptr<ICPUPipelineLayout>(std::span<const asset::SPushConstantRange>({ pcRange }), std::move(cpuDescriptorSetLayout), nullptr, nullptr, nullptr);
 
-			auto& shaderGroups = params.shaderGroups;
+			const auto pipeline = ICPURayTracingPipeline::create(cpuPipelineLayout.get());
+			pipeline->getCachedCreationParams() = {
+				.maxRecursionDepth = 1,
+				.dynamicStackSize = true,
+			};
 
-			shaderGroups.raygen = { .shader = raygenShader.get(), .entryPoint = "main" };
+			pipeline->getSpecInfos(ESS_RAYGEN)[0] = {
+				.shader = raygenShader,
+				.entryPoint = "main",
+			};
 
-			IGPUPipelineBase::SShaderSpecInfo missGroups[EMT_COUNT];
-			missGroups[EMT_PRIMARY] = { .shader = missShader.get(), .entryPoint = "main" };
-			missGroups[EMT_OCCLUSION] = { .shader = missShadowShader.get(), .entryPoint = "main" };
-			shaderGroups.misses = missGroups;
+			pipeline->getSpecInfoVector(ESS_MISS)->resize(EMT_COUNT);
+			const auto missGroups = pipeline->getSpecInfos(ESS_MISS);
+			missGroups[EMT_PRIMARY] = { .shader = missShader, .entryPoint = "main" };
+			missGroups[EMT_OCCLUSION] = { .shader = missShadowShader, .entryPoint = "main" };
 
 			auto getHitGroupIndex = [](E_GEOM_TYPE geomType, E_RAY_TYPE rayType)
 				{
 					return geomType * ERT_COUNT + rayType;
 				};
-			IGPURayTracingPipeline::SHitGroup hitGroups[E_RAY_TYPE::ERT_COUNT * E_GEOM_TYPE::EGT_COUNT];
-			hitGroups[getHitGroupIndex(EGT_TRIANGLES, ERT_PRIMARY)] = {
-				.closestHit = { .shader = closestHitShader.get(), .entryPoint = "main" },
-			  .anyHit = { .shader = anyHitShaderColorPayload.get(), .entryPoint = "main" },
-			};
-			hitGroups[getHitGroupIndex(EGT_TRIANGLES, ERT_OCCLUSION)] = {
-			  .anyHit = { .shader = anyHitShaderShadowPayload.get(), .entryPoint = "main" },
-			};
-			hitGroups[getHitGroupIndex(EGT_PROCEDURAL, ERT_PRIMARY)] = {
-			  .closestHit = { .shader = proceduralClosestHitShader.get(), .entryPoint = "main" },
-			  .anyHit = { .shader = anyHitShaderColorPayload.get(), .entryPoint = "main" },
-			  .intersection = { .shader = intersectionHitShader.get(), .entryPoint = "main" },
-			};
-			hitGroups[getHitGroupIndex(EGT_PROCEDURAL, ERT_OCCLUSION)] = {
-			  .anyHit = { .shader = anyHitShaderShadowPayload.get(), .entryPoint = "main" },
-			  .intersection = { .shader = intersectionHitShader.get(), .entryPoint = "main" },
-			};
-			shaderGroups.hits = hitGroups;
 
-			IGPUPipelineBase::SShaderSpecInfo callableGroups[ELT_COUNT];
-			callableGroups[ELT_DIRECTIONAL] = { .shader = directionalLightCallShader.get(), .entryPoint = "main" };
-			callableGroups[ELT_POINT] = { .shader = pointLightCallShader.get(), .entryPoint = "main" };
-			callableGroups[ELT_SPOT] = { .shader = spotLightCallShader.get(), .entryPoint = "main" };
-			shaderGroups.callables = callableGroups;
+			const auto hitGroupCount = ERT_COUNT * EGT_COUNT;
+			pipeline->getSpecInfoVector(ESS_CLOSEST_HIT)->resize(hitGroupCount);
+			pipeline->getSpecInfoVector(ESS_ANY_HIT)->resize(hitGroupCount);
+			pipeline->getSpecInfoVector(ESS_INTERSECTION)->resize(hitGroupCount);
+
+			const auto closestHitSpecs = pipeline->getSpecInfos(ESS_CLOSEST_HIT);
+			const auto anyHitSpecs = pipeline->getSpecInfos(ESS_ANY_HIT);
+			const auto intersectionSpecs = pipeline->getSpecInfos(ESS_INTERSECTION);
+
+			closestHitSpecs[getHitGroupIndex(EGT_TRIANGLES, ERT_PRIMARY)] = { .shader = closestHitShader, .entryPoint = "main" };
+			anyHitSpecs[getHitGroupIndex(EGT_TRIANGLES, ERT_PRIMARY)] = {.shader = anyHitShaderColorPayload, .entryPoint = "main"};
+
+			anyHitSpecs[getHitGroupIndex(EGT_TRIANGLES, ERT_OCCLUSION)] = { .shader = anyHitShaderShadowPayload, .entryPoint = "main" };
+
+			closestHitSpecs[getHitGroupIndex(EGT_PROCEDURAL, ERT_PRIMARY)] = { .shader = proceduralClosestHitShader, .entryPoint = "main" };
+			anyHitSpecs[getHitGroupIndex(EGT_PROCEDURAL, ERT_PRIMARY)] = { .shader = anyHitShaderColorPayload, .entryPoint = "main" };
+			intersectionSpecs[getHitGroupIndex(EGT_PROCEDURAL, ERT_PRIMARY)] = { .shader = intersectionHitShader, .entryPoint = "main" };
 
-			params.cached.maxRecursionDepth = 1;
-			params.cached.dynamicStackSize = true;
+			anyHitSpecs[getHitGroupIndex(EGT_PROCEDURAL, ERT_OCCLUSION)] = {.shader = anyHitShaderShadowPayload, .entryPoint = "main" };
+			intersectionSpecs[getHitGroupIndex(EGT_PROCEDURAL, ERT_OCCLUSION)] = { .shader = intersectionHitShader, .entryPoint = "main" };
+
+			pipeline->getSpecInfoVector(ESS_CALLABLE)->resize(ELT_COUNT);
+			const auto callableGroups = pipeline->getSpecInfos(ESS_CALLABLE);
+			callableGroups[ELT_DIRECTIONAL] = { .shader = directionalLightCallShader, .entryPoint = "main" };
+			callableGroups[ELT_POINT] = { .shader = pointLightCallShader, .entryPoint = "main" };
+			callableGroups[ELT_SPOT] = { .shader = spotLightCallShader, .entryPoint = "main" };
+
+      smart_refctd_ptr<CAssetConverter> converter = CAssetConverter::create({ .device = m_device.get(), .optimizer = {} });
+		  CAssetConverter::SInputs inputs = {};
+      inputs.logger = m_logger.get();
+
+			const std::array cpuPipelines = { pipeline.get() };
+			std::get<CAssetConverter::SInputs::asset_span_t<ICPURayTracingPipeline>>(inputs.assets) = cpuPipelines;
+
+			CAssetConverter::SConvertParams params = {};
+			params.utilities = m_utils.get();
+
+      auto reservation = converter->reserve(inputs);
+			auto future = reservation.convert(params);
+			if (future.copy() != IQueue::RESULT::SUCCESS)
+			{
+				m_logger->log("Failed to await submission feature!", ILogger::ELL_ERROR);
+				return false;
+			}
+
+			// assign gpu objects to output
+			auto&& pipelines = reservation.getGPUObjects<ICPURayTracingPipeline>();
+			m_rayTracingPipeline = pipelines[0].value;
+			const auto* gpuDsLayout = m_rayTracingPipeline->getLayout()->getDescriptorSetLayouts()[0];
 
-			if (!m_device->createRayTracingPipelines(nullptr, { &params, 1 }, &m_rayTracingPipeline))
-				return logFail("Failed to create ray tracing pipeline");
+			const std::array<const IGPUDescriptorSetLayout*, ICPUPipelineLayout::DESCRIPTOR_SET_COUNT> dsLayoutPtrs = { gpuDsLayout };
+      m_rayTracingDsPool = m_device->createDescriptorPoolForDSLayouts(IDescriptorPool::ECF_UPDATE_AFTER_BIND_BIT, std::span(dsLayoutPtrs.begin(), dsLayoutPtrs.end()));
+			m_rayTracingDs = m_rayTracingDsPool->createDescriptorSet(core::smart_refctd_ptr<const IGPUDescriptorSetLayout>(gpuDsLayout));
 
 			calculateRayTracingStackSize(m_rayTracingPipeline);
 

From 0de0bc395611cd8132d724277d4299ede6b7f0d2 Mon Sep 17 00:00:00 2001
From: Arkadiusz Lachowicz <areklachowicz@gmail.com>
Date: Wed, 16 Jul 2025 16:01:59 +0200
Subject: [PATCH 483/529] stylo and corrections, adjust to changes

---
 CMakeLists.txt                         |  3 --
 common/CMakeLists.txt                  | 59 ++++++++++++++------------
 common/src/nbl/examples/CMakeLists.txt | 17 +++-----
 3 files changed, 38 insertions(+), 41 deletions(-)

diff --git a/CMakeLists.txt b/CMakeLists.txt
index 6891691f0..8cf1364a5 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -98,13 +98,10 @@ if(NBL_BUILD_EXAMPLES)
 		target_include_directories(${T} PUBLIC $<TARGET_PROPERTY:${NBL_EXAMPLES_API_TARGET},INCLUDE_DIRECTORIES>)
 		target_precompile_headers(${T} REUSE_FROM "${NBL_EXAMPLES_API_TARGET}")
 
-		# TODO: make them all INTERFACE if not NBL_EMBED_BUILTIN_RESOURCES and link in loop without checking the var
 		if(NBL_EMBED_BUILTIN_RESOURCES)
 			LINK_BUILTIN_RESOURCES_TO_TARGET(${T} NblExtExamplesAPIBuiltinsSource)
 			LINK_BUILTIN_RESOURCES_TO_TARGET(${T} NblExtExamplesAPIBuiltinsInclude)
 			LINK_BUILTIN_RESOURCES_TO_TARGET(${T} NblExtExamplesAPIBuiltinsBuild)
-		else()
-			target_link_libraries(${T} PUBLIC NblExtExamplesAPIBuiltinsBuild)
 		endif()
     endforeach()
 
diff --git a/common/CMakeLists.txt b/common/CMakeLists.txt
index f388c4cbf..b3e57da6f 100644
--- a/common/CMakeLists.txt
+++ b/common/CMakeLists.txt
@@ -18,7 +18,7 @@ set(COMMON_INCLUDE_DIRECTORY "${CMAKE_CURRENT_SOURCE_DIR}/include")
 function(INTERFACE_TO_BUILTINS TARGET)
     #[[
         even though builtin target is static library its still valid to reuse 
-        common PCH to boost its build speed to not preprocess entire Nabla again
+        common PCH to boost its build speed to not preprocess entire Nabla again (**)
     ]]
     set_target_properties(${TARGET} PROPERTIES DISABLE_PRECOMPILE_HEADERS OFF)
     target_precompile_headers(${TARGET} REUSE_FROM "${LIB_NAME}")
@@ -28,42 +28,33 @@ function(INTERFACE_TO_BUILTINS TARGET)
 endfunction()
 
 function(REGISTER_COMMON_BUILTINS)
-	cmake_parse_arguments(EX "" "TARGET;ARCHIVE_ABS_ENTRY;ARCHIVE_NAMESPACE" "GLOB_RGX" ${ARGN})
-
-    get_filename_component(INPUT_DIRECTORY "${CMAKE_CURRENT_SOURCE_DIR}" ABSOLUTE)
-    get_filename_component(OUTPUT_SRC "${CMAKE_CURRENT_BINARY_DIR}/builtin/${EX_TARGET}/src" ABSOLUTE)
-    get_filename_component(OUTPUT_INCLUDE "${CMAKE_CURRENT_BINARY_DIR}/builtin/${EX_TARGET}/include" ABSOLUTE)
-
-    set(KEYS_ENTRY "${INPUT_DIRECTORY}/${EX_ARCHIVE_ABS_ENTRY}")
-    list(TRANSFORM EX_GLOB_RGX PREPEND "${KEYS_ENTRY}/")
-	file(GLOB_RECURSE KEYS RELATIVE "${KEYS_ENTRY}" CONFIGURE_DEPENDS ${EX_GLOB_RGX})
-
-    #[[
-        note we do force you to specify full globbing expressions relative to keys entry which we do not filter
-        because if runtime outputs .spv compilation artifacts/shader cache preprocessed.hlsl(s) to source you will hit CMake
-        reconfiguration each time the file content or timestampts change and it could lead to embeding intermediate trash
-    ]]
+	cmake_parse_arguments(EX "" "TARGET;BIND;NAMESPACE" "GLOB_RGX" ${ARGN})
+    get_filename_component(MOUNT_POINT "${CMAKE_CURRENT_SOURCE_DIR}/${EX_BIND}" ABSOLUTE)
+    list(TRANSFORM EX_GLOB_RGX PREPEND "${MOUNT_POINT}/")
+	file(GLOB_RECURSE KEYS RELATIVE "${MOUNT_POINT}" CONFIGURE_DEPENDS ${EX_GLOB_RGX})
 	
-    unset(EXAMPLES_RESOURCES_TO_EMBED)
-	foreach(KEY IN LISTS KEYS)
-		LIST_BUILTIN_RESOURCE(EXAMPLES_RESOURCES_TO_EMBED "${KEY}")
-	endforeach()
-
-    ADD_CUSTOM_BUILTIN_RESOURCES(${EX_TARGET} EXAMPLES_RESOURCES_TO_EMBED "${INPUT_DIRECTORY}" "${EX_ARCHIVE_ABS_ENTRY}" "${EX_ARCHIVE_NAMESPACE}" "${OUTPUT_INCLUDE}" "${OUTPUT_SRC}")
+    NBL_CREATE_RESOURCE_ARCHIVE(
+        TARGET ${EX_TARGET}
+        BIND "${MOUNT_POINT}"
+        BUILTINS ${KEYS}
+        NAMESPACE ${EX_NAMESPACE}
+    )
     INTERFACE_TO_BUILTINS(${EX_TARGET})
 endfunction()
 
 #! common example API builtins as static library targets linked to each example
 if(NBL_EMBED_BUILTIN_RESOURCES)
-    REGISTER_COMMON_BUILTINS(TARGET NblExtExamplesAPIBuiltinsSource 
-        ARCHIVE_ABS_ENTRY src/nbl/examples 
-        ARCHIVE_NAMESPACE nbl::builtin::examples::src 
+    REGISTER_COMMON_BUILTINS(
+        TARGET NblExtExamplesAPIBuiltinsSource 
+        BIND src/nbl/examples 
+        NAMESPACE nbl::builtin::examples::src 
         GLOB_RGX *.hlsl *.txt
     )
 
-    REGISTER_COMMON_BUILTINS(TARGET NblExtExamplesAPIBuiltinsInclude 
-        ARCHIVE_ABS_ENTRY include/nbl/examples 
-        ARCHIVE_NAMESPACE nbl::builtin::examples::include
+    REGISTER_COMMON_BUILTINS(
+        TARGET NblExtExamplesAPIBuiltinsInclude 
+        BIND include/nbl/examples 
+        NAMESPACE nbl::builtin::examples::include
         GLOB_RGX *.hpp *.h *.hlsl *.txt
     )
 endif()
@@ -85,6 +76,18 @@ endif()
 
 add_subdirectory("src/nbl/examples" EXCLUDE_FROM_ALL)
 target_link_libraries(${LIB_NAME} PUBLIC NblExtExamplesAPISPIRV)
+if(NBL_EMBED_BUILTIN_RESOURCES)
+	INTERFACE_TO_BUILTINS(NblExtExamplesAPIBuiltinsBuild)
+
+    #[[
+        we have SPIRV keys include file in examples' PCH which then gets REUSE(d) by common archives (**) in built-in mode,
+        to not glitch compiler we need to ensure we inherit interface properties (include directories needed) for all targets 
+        which share PCH, also note it doest really link any library, the target we inherit properties from is INTERFACE
+    ]]
+    target_link_libraries(NblExtExamplesAPIBuiltinsSource PUBLIC NblExtExamplesAPISPIRV)
+    target_link_libraries(NblExtExamplesAPIBuiltinsInclude PUBLIC NblExtExamplesAPISPIRV)
+    target_link_libraries(NblExtExamplesAPIBuiltinsBuild PUBLIC NblExtExamplesAPISPIRV)
+endif()
 
 NBL_GET_ALL_TARGETS(TARGETS)
 list(REMOVE_ITEM TARGETS ${LIB_NAME})
diff --git a/common/src/nbl/examples/CMakeLists.txt b/common/src/nbl/examples/CMakeLists.txt
index 3fa63c839..e486b2b22 100644
--- a/common/src/nbl/examples/CMakeLists.txt
+++ b/common/src/nbl/examples/CMakeLists.txt
@@ -6,16 +6,19 @@ set(ARGS
 	# build directory for its SPIRV outputs
 	BINARY_DIR ${OUTPUT_DIRECTORY}
 
+	# preprocessor #define for BINARY_DIR bind point
+	MOUNT_POINT_DEFINE NBL_EXAMPLES_BUILD_MOUNT_POINT
+
 	# extra NSC compile options
 	COMMON_OPTIONS -I "${COMMON_INCLUDE_DIRECTORY}"
 
-	# fallback variable to which SPIRV access keys are appended to (including permutations), relative to BINARY_DIR
+	# out variable to which SPIRV access keys are appended to (including permutations), relative to BINARY_DIR
 	OUTPUT_VAR KEYS
 
-	# include file with key getters, use with #include directive on downstream targets
+	# include file with inline template key getters, use with #include directive on downstream targets
 	INCLUDE nbl/examples/common/build/spirv/keys.hpp
 
-	# namespace for key getters in include file
+	# namespace for key getters in the include file
 	NAMESPACE nbl::builtin::examples::build
 )
 
@@ -81,12 +84,6 @@ set(JSON [=[
 NBL_CREATE_RESOURCE_ARCHIVE(
 	TARGET NblExtExamplesAPIBuiltinsBuild
 	BIND "${OUTPUT_DIRECTORY}"
-	NAMESPACE nbl::builtin::examples::build
-	MOUNT_POINT_DEFINE NBL_EXAMPLES_BUILD_MOUNT_POINT
 	BUILTINS ${KEYS}
+	NAMESPACE nbl::builtin::examples::build
 )
-
-if(NBL_EMBED_BUILTIN_RESOURCES)
-	INTERFACE_TO_BUILTINS(NblExtExamplesAPIBuiltinsBuild)
-	target_link_libraries(NblExtExamplesAPIBuiltinsBuild PUBLIC NblExtExamplesAPISPIRV)
-endif()

From e82ba9e3ddc1026b5883fcd7b6e80bdc043fe013 Mon Sep 17 00:00:00 2001
From: Erfan Ahmadi <ahmadierfan99@gmail.com>
Date: Thu, 17 Jul 2025 11:27:54 +0400
Subject: [PATCH 484/529] Shader Edits

---
 62_CAD/shaders/main_pipeline/fragment_shader.hlsl |  6 +++---
 62_CAD/shaders/main_pipeline/vertex_shader.hlsl   | 15 ++++++---------
 2 files changed, 9 insertions(+), 12 deletions(-)

diff --git a/62_CAD/shaders/main_pipeline/fragment_shader.hlsl b/62_CAD/shaders/main_pipeline/fragment_shader.hlsl
index 7d16bd263..cf249bf34 100644
--- a/62_CAD/shaders/main_pipeline/fragment_shader.hlsl
+++ b/62_CAD/shaders/main_pipeline/fragment_shader.hlsl
@@ -588,9 +588,9 @@ float4 fragMain(PSInput input) : SV_TARGET
                     nbl::hlsl::shapes::Line<float> lineSegment;
 
                     // Doing SDF of outlines as if cooridnate system is centered around the nearest corner of the cell
-                    float2 localGridTopLeftCorner = (currentCellCoord + float2(roundedLocalUV)) * cellWidth;
-                    // We do sdf in corner's local coordinate, so we subtract currentCellScreenspaceCoord from fragmentPos and topLeftGrid 
-                    float2 localFragPos = gridSpacePos - localGridTopLeftCorner;
+                    float2 localCellSpaceOrigin = (currentCellCoord + float2(roundedLocalUV)) * cellWidth; // in local cell space, origin
+                    float2 localGridTopLeftCorner = -localCellSpaceOrigin; // top left in local cell space: topLeft is (0, 0) implicitly
+                    float2 localFragPos = gridSpacePos - localCellSpaceOrigin; // we compute the current fragment pos, in local cell space
                     
                     float phaseShift = 0.0f;
                     const bool hasStipples = outlineStyle.hasStipples();
diff --git a/62_CAD/shaders/main_pipeline/vertex_shader.hlsl b/62_CAD/shaders/main_pipeline/vertex_shader.hlsl
index feb3ec4b5..6cdfc6b0c 100644
--- a/62_CAD/shaders/main_pipeline/vertex_shader.hlsl
+++ b/62_CAD/shaders/main_pipeline/vertex_shader.hlsl
@@ -581,9 +581,8 @@ PSInput main(uint vertexID : SV_VertexID)
 
             float32_t2 minUV = glyphInfo.getMinUV();
             uint16_t textureID = glyphInfo.getTextureID();
-            
-            const int ndcYDirectionSign = sign(_static_cast<float>(clipProjectionData.projectionToNDC[1].y));
-            const float32_t2 dirV = float32_t2(glyphInfo.dirU.y, ndcYDirectionSign * glyphInfo.dirU.x) * glyphInfo.aspectRatio;
+
+            const float32_t2 dirV = float32_t2(glyphInfo.dirU.y, -glyphInfo.dirU.x) * glyphInfo.aspectRatio;
             const float2 screenTopLeft = _static_cast<float2>(transformPointNdc(clipProjectionData.projectionToNDC, glyphInfo.topLeft));
             const float2 screenDirU = _static_cast<float2>(transformVectorNdc(clipProjectionData.projectionToNDC, _static_cast<pfloat64_t2>(glyphInfo.dirU)));
             const float2 screenDirV = _static_cast<float2>(transformVectorNdc(clipProjectionData.projectionToNDC, _static_cast<pfloat64_t2>(dirV)));
@@ -631,9 +630,8 @@ PSInput main(uint vertexID : SV_VertexID)
             float32_t aspectRatio = vk::RawBufferLoad<float32_t>(globals.pointers.geometryBuffer + drawObj.geometryAddress + sizeof(pfloat64_t2) + sizeof(float2), 4u);
             uint32_t textureID = vk::RawBufferLoad<uint32_t>(globals.pointers.geometryBuffer + drawObj.geometryAddress + sizeof(pfloat64_t2) + sizeof(float2) + sizeof(float), 4u);
 
-            // If y increases as we go down in ndc this sign is positive (screenspace-like transformations), if y decreases as we go down this sign is negative (worldspace-like transformations)
-            const int ndcYDirectionSign = sign(_static_cast<float>(clipProjectionData.projectionToNDC[1].y));
-            const float32_t2 dirV = float32_t2(dirU.y, ndcYDirectionSign * dirU.x) * aspectRatio;
+            // TODO[DEVSH]: make sure it's documented properly that for topLeft+dirV+aspectRatio to work it's computing dirU like below (they need to be careful with transformations when y increases when you go down in screen
+            const float32_t2 dirV = float32_t2(dirU.y, -dirU.x) * aspectRatio;
             const float2 ndcTopLeft = _static_cast<float2>(transformPointNdc(clipProjectionData.projectionToNDC, topLeft));
             const float2 ndcDirU = _static_cast<float2>(transformVectorNdc(clipProjectionData.projectionToNDC, _static_cast<pfloat64_t2>(dirU)));
             const float2 ndcDirV = _static_cast<float2>(transformVectorNdc(clipProjectionData.projectionToNDC, _static_cast<pfloat64_t2>(dirV)));
@@ -719,9 +717,8 @@ PSInput main(uint vertexID : SV_VertexID)
             float32_t2 dirU = vk::RawBufferLoad<float32_t2>(globals.pointers.geometryBuffer + drawObj.geometryAddress + sizeof(pfloat64_t2), 4u);
             float32_t aspectRatio = vk::RawBufferLoad<float32_t>(globals.pointers.geometryBuffer + drawObj.geometryAddress + sizeof(pfloat64_t2) + sizeof(float2), 4u);
             uint32_t textureID = vk::RawBufferLoad<uint32_t>(globals.pointers.geometryBuffer + drawObj.geometryAddress + sizeof(pfloat64_t2) + sizeof(float2) + sizeof(float), 4u);
-            
-            const int ndcYDirectionSign = sign(_static_cast<float>(clipProjectionData.projectionToNDC[1].y));
-            const float32_t2 dirV = float32_t2(dirU.y, ndcYDirectionSign * dirU.x) * aspectRatio;
+
+            const float32_t2 dirV = float32_t2(dirU.y, -dirU.x) * aspectRatio;
             const float2 ndcTopLeft = _static_cast<float2>(transformPointNdc(clipProjectionData.projectionToNDC, topLeft));
             const float2 ndcDirU = _static_cast<float2>(transformVectorNdc(clipProjectionData.projectionToNDC, _static_cast<pfloat64_t2>(dirU)));
             const float2 ndcDirV = _static_cast<float2>(transformVectorNdc(clipProjectionData.projectionToNDC, _static_cast<pfloat64_t2>(dirV)));

From 81f710c381bf2494dca10bccad32345920c57544 Mon Sep 17 00:00:00 2001
From: Erfan Ahmadi <ahmadierfan99@gmail.com>
Date: Thu, 17 Jul 2025 11:40:27 +0400
Subject: [PATCH 485/529] Small Fix

---
 62_CAD/DrawResourcesFiller.cpp | 11 +++++++++++
 1 file changed, 11 insertions(+)

diff --git a/62_CAD/DrawResourcesFiller.cpp b/62_CAD/DrawResourcesFiller.cpp
index 66b54d3ab..ec5058232 100644
--- a/62_CAD/DrawResourcesFiller.cpp
+++ b/62_CAD/DrawResourcesFiller.cpp
@@ -2397,6 +2397,17 @@ DrawResourcesFiller::ImageAllocateResults DrawResourcesFiller::tryCreateAndAlloc
 							.viewType = IGPUImageView::ET_2D,
 							.format = (imageViewFormatOverride == asset::E_FORMAT::EF_COUNT) ? gpuImage->getCreationParameters().format : imageViewFormatOverride
 						};
+
+						const uint32_t channelCount = nbl::asset::getFormatChannelCount(viewParams.format);
+						if (channelCount == 1u)
+						{
+							// for rendering grayscale:
+							viewParams.components.r = nbl::asset::IImageViewBase::SComponentMapping::E_SWIZZLE::ES_R;
+							viewParams.components.g = nbl::asset::IImageViewBase::SComponentMapping::E_SWIZZLE::ES_R;
+							viewParams.components.b = nbl::asset::IImageViewBase::SComponentMapping::E_SWIZZLE::ES_R;
+							viewParams.components.a = nbl::asset::IImageViewBase::SComponentMapping::E_SWIZZLE::ES_ONE;
+						}
+
 						ret.gpuImageView = device->createImageView(std::move(viewParams));
 						if (ret.gpuImageView)
 						{

From 3dd782ed5537a646ce8ecc27beaa75ffd7fb3359 Mon Sep 17 00:00:00 2001
From: Erfan Ahmadi <ahmadierfan99@gmail.com>
Date: Thu, 17 Jul 2025 13:29:56 +0400
Subject: [PATCH 486/529] Fix Compiler Errors with new changes

---
 62_CAD/GeoTexture.cpp |   9 ++--
 62_CAD/GeoTexture.h   |   4 +-
 62_CAD/main.cpp       | 111 +++++++++++++++++++++---------------------
 3 files changed, 62 insertions(+), 62 deletions(-)

diff --git a/62_CAD/GeoTexture.cpp b/62_CAD/GeoTexture.cpp
index 71cbcef34..de8a974d0 100644
--- a/62_CAD/GeoTexture.cpp
+++ b/62_CAD/GeoTexture.cpp
@@ -1,8 +1,8 @@
 #include "GeoTexture.h"
 
 bool GeoTextureRenderer::initialize(
-		IGPUShader* vertexShader,
-		IGPUShader* fragmentShader,
+		IShader* vertexShader,
+		IShader* fragmentShader,
 		IGPURenderpass* compatibleRenderPass,
 		const smart_refctd_ptr<IGPUBuffer>& globalsBuffer)
 {
@@ -87,14 +87,15 @@ bool GeoTextureRenderer::initialize(
 
 	// Create Main Graphics Pipelines 
 	{
-		IGPUShader::SSpecInfo specInfo[2] = {
+		video::IGPUPipelineBase::SShaderSpecInfo specInfo[2] = {
 			{.shader=vertexShader },
 			{.shader=fragmentShader },
 		};
 
 		IGPUGraphicsPipeline::SCreationParams params[1] = {};
 		params[0].layout = m_pipelineLayout.get();
-		params[0].shaders = specInfo;
+		params[0].vertexShader = specInfo[0];
+		params[0].fragmentShader = specInfo[1];
 		params[0].cached = {
 			.vertexInput = {},
 			.primitiveAssembly = {
diff --git a/62_CAD/GeoTexture.h b/62_CAD/GeoTexture.h
index c43208e32..f471009fc 100644
--- a/62_CAD/GeoTexture.h
+++ b/62_CAD/GeoTexture.h
@@ -29,8 +29,8 @@ class GeoTextureRenderer
 	{}
 
 	bool initialize(
-		IGPUShader* vertexShader,
-		IGPUShader* fragmentShader,
+		IShader* vertexShader,
+		IShader* fragmentShader,
 		IGPURenderpass* compatibleRenderPass,
 		const smart_refctd_ptr<IGPUBuffer>& globalsBuffer);
 
diff --git a/62_CAD/main.cpp b/62_CAD/main.cpp
index e4867ddf7..ac04a37a4 100644
--- a/62_CAD/main.cpp
+++ b/62_CAD/main.cpp
@@ -3,15 +3,18 @@
 
 #include "nbl/examples/examples.hpp"
 
-using namespace nbl;
-using namespace nbl::core;
 using namespace nbl::hlsl;
-using namespace nbl::system;
-using namespace nbl::asset;
-using namespace nbl::ui;
-using namespace nbl::video;
-// TODO: probably need to be `using namespace nbl::examples` as well, see other examples
+using namespace nbl;
+using namespace core;
+using namespace system;
+using namespace asset;
+using namespace ui;
+using namespace video;
 
+#include "nbl/examples/common/BuiltinResourcesApplication.hpp"
+#include "nbl/examples/common/SimpleWindowedApplication.hpp"
+#include "nbl/examples/common/InputSystem.hpp"
+#include "nbl/video/utilities/CSimpleResizeSurface.h"
 
 #include "nbl/ext/FullScreenTriangle/FullScreenTriangle.h"
 #include "nbl/ext/TextRendering/TextRendering.h"
@@ -170,14 +173,14 @@ class Camera2D
 class CEventCallback : public ISimpleManagedSurface::ICallback
 {
 public:
-	CEventCallback(nbl::core::smart_refctd_ptr<InputSystem>&& m_inputSystem, nbl::system::logger_opt_smart_ptr&& logger) : m_inputSystem(std::move(m_inputSystem)), m_logger(std::move(logger)){}
+	CEventCallback(nbl::core::smart_refctd_ptr<nbl::examples::InputSystem>&& m_inputSystem, nbl::system::logger_opt_smart_ptr&& logger) : m_inputSystem(std::move(m_inputSystem)), m_logger(std::move(logger)){}
 	CEventCallback() {}
 	
 	void setLogger(nbl::system::logger_opt_smart_ptr& logger)
 	{
 		m_logger = logger;
 	}
-	void setInputSystem(nbl::core::smart_refctd_ptr<InputSystem>&& m_inputSystem)
+	void setInputSystem(nbl::core::smart_refctd_ptr<nbl::examples::InputSystem>&& m_inputSystem)
 	{
 		m_inputSystem = std::move(m_inputSystem);
 	}
@@ -205,7 +208,7 @@ class CEventCallback : public ISimpleManagedSurface::ICallback
 	}
 
 private:
-	nbl::core::smart_refctd_ptr<InputSystem> m_inputSystem = nullptr;
+	nbl::core::smart_refctd_ptr<nbl::examples::InputSystem> m_inputSystem = nullptr;
 	nbl::system::logger_opt_smart_ptr m_logger = nullptr;
 };
 	
@@ -358,10 +361,10 @@ bool performImageFormatPromotionCopy(const core::smart_refctd_ptr<asset::ICPUIma
         return performCopyUsingImageFilter<asset::CSwizzleAndConvertImageFilter<asset::EF_UNKNOWN, asset::EF_UNKNOWN, PromotionComponentSwizzle<4u>>>(inCPUImage, outCPUImage);
 }
 
-class ComputerAidedDesign final : public examples::SimpleWindowedApplication, public application_templates::MonoAssetManagerAndBuiltinResourceApplication
+class ComputerAidedDesign final : public nbl::examples::SimpleWindowedApplication, public nbl::examples::BuiltinResourcesApplication
 {
-	using device_base_t = examples::SimpleWindowedApplication;
-	using asset_base_t = application_templates::MonoAssetManagerAndBuiltinResourceApplication;
+	using device_base_t = nbl::examples::SimpleWindowedApplication;
+	using asset_base_t = nbl::examples::BuiltinResourcesApplication;
 	using clock_t = std::chrono::steady_clock;
 	
 	constexpr static uint32_t WindowWidthRequest = 1600u;
@@ -738,7 +741,7 @@ class ComputerAidedDesign final : public examples::SimpleWindowedApplication, pu
 
 	inline bool onAppInitialized(smart_refctd_ptr<ISystem>&& system) override
 	{
-		m_inputSystem = make_smart_refctd_ptr<InputSystem>(logger_opt_smart_ptr(smart_refctd_ptr(m_logger)));
+		m_inputSystem = make_smart_refctd_ptr<nbl::examples::InputSystem>(logger_opt_smart_ptr(smart_refctd_ptr(m_logger)));
 
 		// Remember to call the base class initialization!
 		if (!device_base_t::onAppInitialized(smart_refctd_ptr(system)))
@@ -922,9 +925,9 @@ class ComputerAidedDesign final : public examples::SimpleWindowedApplication, pu
 
 		drawResourcesFiller.setTexturesDescriptorSetAndBinding(core::smart_refctd_ptr(descriptorSet0), imagesBinding);
 
-		smart_refctd_ptr<IGPUShader> mainPipelineFragmentShaders = {};
-		smart_refctd_ptr<IGPUShader> mainPipelineVertexShader = {};
-		std::array<smart_refctd_ptr<IGPUShader>, 2u> geoTexturePipelineShaders = {};
+		smart_refctd_ptr<IShader> mainPipelineFragmentShaders = {};
+		smart_refctd_ptr<IShader> mainPipelineVertexShader = {};
+		std::array<smart_refctd_ptr<IShader>, 2u> geoTexturePipelineShaders = {};
 		{
 			smart_refctd_ptr<IShaderCompiler::CCache> shaderReadCache = nullptr;
 			smart_refctd_ptr<IShaderCompiler::CCache> shaderWriteCache = core::make_smart_refctd_ptr<IShaderCompiler::CCache>();
@@ -958,36 +961,30 @@ class ComputerAidedDesign final : public examples::SimpleWindowedApplication, pu
 			}
 
 			// Load Custom Shader
-			auto loadCompileShader = [&](const std::string& relPath, IShader::E_SHADER_STAGE stage) -> smart_refctd_ptr<ICPUShader>
-				{
-					IAssetLoader::SAssetLoadParams lp = {};
-					lp.logger = m_logger.get();
-					lp.workingDirectory = ""; // virtual root
-					auto assetBundle = m_assetMgr->getAsset(relPath, lp);
-					const auto assets = assetBundle.getContents();
-					if (assets.empty())
-						return nullptr;
-
-					// lets go straight from ICPUSpecializedShader to IGPUSpecializedShader
-					auto cpuShader = IAsset::castDown<ICPUShader>(assets[0]);
-					if (!cpuShader)
-						return nullptr;
-
-					cpuShader->setShaderStage(stage);
-					return m_device->compileShader({ cpuShader.get(), nullptr, shaderReadCache.get(), shaderWriteCache.get() });
-				};
+auto loadCompileShader = [&](const std::string& relPath, IShader::E_SHADER_STAGE stage) -> smart_refctd_ptr<IShader>
+	{
+		IAssetLoader::SAssetLoadParams lp = {};
+		lp.logger = m_logger.get();
+		lp.workingDirectory = ""; // virtual root
+		auto assetBundle = m_assetMgr->getAsset(relPath, lp);
+		const auto assets = assetBundle.getContents();
+		if (assets.empty())
+			return nullptr;
 
-			auto mainPipelineFragmentCpuShader = loadCompileShader("../shaders/main_pipeline/fragment.hlsl", IShader::E_SHADER_STAGE::ESS_ALL_OR_LIBRARY);
-			auto mainPipelineVertexCpuShader = loadCompileShader("../shaders/main_pipeline/vertex_shader.hlsl", IShader::E_SHADER_STAGE::ESS_VERTEX);
-			// auto geoTexturePipelineVertCpuShader = loadCompileShader(GeoTextureRenderer::VertexShaderRelativePath, IShader::E_SHADER_STAGE::ESS_VERTEX);
-			// auto geoTexturePipelineFragCpuShader = loadCompileShader(GeoTextureRenderer::FragmentShaderRelativePath, IShader::E_SHADER_STAGE::ESS_FRAGMENT);
-			mainPipelineFragmentCpuShader->setShaderStage(IShader::E_SHADER_STAGE::ESS_FRAGMENT);
+		// lets go straight from ICPUSpecializedShader to IGPUSpecializedShader
+		auto source = IAsset::castDown<IShader>(assets[0]);
+		if (!source)
+			return nullptr;
+	
+		return m_device->compileShader({ source.get(), nullptr, shaderReadCache.get(), shaderWriteCache.get() });
+	};
 
-			mainPipelineFragmentShaders = m_device->createShader({ mainPipelineFragmentCpuShader.get(), nullptr, shaderReadCache.get(), shaderWriteCache.get() });
-			mainPipelineVertexShader = m_device->createShader({ mainPipelineVertexCpuShader.get(), nullptr, shaderReadCache.get(), shaderWriteCache.get() });
-			// geoTexturePipelineShaders[0] = m_device->createShader({ geoTexturePipelineVertCpuShader.get(), nullptr, shaderReadCache.get(), shaderWriteCache.get() });
-			// geoTexturePipelineShaders[1] = m_device->createShader({ geoTexturePipelineFragCpuShader.get(), nullptr, shaderReadCache.get(), shaderWriteCache.get() });
+			auto mainPipelineFragmentShader = loadCompileShader("../shaders/main_pipeline/fragment.hlsl", IShader::E_SHADER_STAGE::ESS_ALL_OR_LIBRARY);
+			auto mainPipelineVertexShader = loadCompileShader("../shaders/main_pipeline/vertex_shader.hlsl", IShader::E_SHADER_STAGE::ESS_VERTEX);
+			// auto geoTexturePipelineVertShader = loadCompileShader(GeoTextureRenderer::VertexShaderRelativePath, IShader::E_SHADER_STAGE::ESS_VERTEX);
+			// auto geoTexturePipelineFragShader = loadCompileShader(GeoTextureRenderer::FragmentShaderRelativePath, IShader::E_SHADER_STAGE::ESS_FRAGMENT);
 			
+#if 0
 			core::smart_refctd_ptr<system::IFile> shaderWriteCacheFile;
 			{
 				system::ISystem::future_t<core::smart_refctd_ptr<system::IFile>> future;
@@ -1013,6 +1010,7 @@ class ComputerAidedDesign final : public examples::SimpleWindowedApplication, pu
 				else
 					m_logger->log("Failed Creating Shader Cache File.", ILogger::ELL_ERROR);
 			}
+#endif
 		}
 
 		// Shared Blend Params between pipelines
@@ -1030,7 +1028,7 @@ class ComputerAidedDesign final : public examples::SimpleWindowedApplication, pu
 			// Load FSTri Shader
 			ext::FullScreenTriangle::ProtoPipeline fsTriangleProtoPipe(m_assetMgr.get(),m_device.get(),m_logger.get());
 			
-			const IGPUShader::SSpecInfo fragSpec = { .entryPoint = "resolveAlphaMain", .shader = mainPipelineFragmentShaders.get() };
+			const video::IGPUPipelineBase::SShaderSpecInfo fragSpec = { .shader = mainPipelineFragmentShaders.get(), .entryPoint = "resolveAlphaMain" };
 
 			resolveAlphaGraphicsPipeline = fsTriangleProtoPipe.createPipeline(fragSpec, pipelineLayout.get(), compatibleRenderPass.get(), 0u, blendParams);
 			if (!resolveAlphaGraphicsPipeline)
@@ -1041,20 +1039,21 @@ class ComputerAidedDesign final : public examples::SimpleWindowedApplication, pu
 		// Create Main Graphics Pipelines 
 		{
 			
-			IGPUShader::SSpecInfo specInfo[2] = {
+			video::IGPUPipelineBase::SShaderSpecInfo specInfo[2] = {
 				{
-					.entryPoint = "main",
-					.shader = mainPipelineVertexShader.get()
+					.shader = mainPipelineVertexShader.get(),
+					.entryPoint = "main"
 				},
 				{
-					.entryPoint = "fragMain",
-					.shader = mainPipelineFragmentShaders.get()
+					.shader = mainPipelineFragmentShaders.get(),
+					.entryPoint = "fragMain"
 				},
 			};
 
 			IGPUGraphicsPipeline::SCreationParams params[1] = {};
 			params[0].layout = pipelineLayout.get();
-			params[0].shaders = specInfo;
+			params[0].vertexShader = specInfo[0];
+			params[0].fragmentShader = specInfo[1];
 			params[0].cached = {
 				.vertexInput = {},
 				.primitiveAssembly = {
@@ -1626,7 +1625,7 @@ class ComputerAidedDesign final : public examples::SimpleWindowedApplication, pu
 					.triangleMeshMainObjectIndex = drawCall.dtm.triangleMeshMainObjectIndex,
 					.isDTMRendering = true
 				};
-				cb->pushConstants(graphicsPipeline->getLayout(), IGPUShader::E_SHADER_STAGE::ESS_VERTEX | IShader::E_SHADER_STAGE::ESS_FRAGMENT, 0, sizeof(PushConstants), &pc);
+				cb->pushConstants(graphicsPipeline->getLayout(), IShader::E_SHADER_STAGE::ESS_VERTEX | IShader::E_SHADER_STAGE::ESS_FRAGMENT, 0, sizeof(PushConstants), &pc);
 
 				cb->drawIndexed(drawCall.dtm.indexCount, 1u, 0u, 0u, 0u);
 			}
@@ -1635,7 +1634,7 @@ class ComputerAidedDesign final : public examples::SimpleWindowedApplication, pu
 				PushConstants pc = {
 					.isDTMRendering = false
 				};
-				cb->pushConstants(graphicsPipeline->getLayout(), IGPUShader::E_SHADER_STAGE::ESS_VERTEX | IShader::E_SHADER_STAGE::ESS_FRAGMENT, 0, sizeof(PushConstants), &pc);
+				cb->pushConstants(graphicsPipeline->getLayout(), IShader::E_SHADER_STAGE::ESS_VERTEX | IShader::E_SHADER_STAGE::ESS_FRAGMENT, 0, sizeof(PushConstants), &pc);
 
 				const uint64_t indexOffset = drawCall.drawObj.drawObjectStart * 6u;
 				const uint64_t indexCount = drawCall.drawObj.drawObjectCount * 6u;
@@ -3698,9 +3697,9 @@ class ComputerAidedDesign final : public examples::SimpleWindowedApplication, pu
 
 	bool fragmentShaderInterlockEnabled = false;
 
-	core::smart_refctd_ptr<InputSystem> m_inputSystem;
-	InputSystem::ChannelReader<IMouseEventChannel> mouse;
-	InputSystem::ChannelReader<IKeyboardEventChannel> keyboard;
+	core::smart_refctd_ptr<nbl::examples::InputSystem> m_inputSystem;
+	nbl::examples::InputSystem::ChannelReader<IMouseEventChannel> mouse;
+	nbl::examples::InputSystem::ChannelReader<IKeyboardEventChannel> keyboard;
 	
 	smart_refctd_ptr<IGPURenderpass> renderpassInitial; // this renderpass will clear the attachment and transition it to COLOR_ATTACHMENT_OPTIMAL
 	smart_refctd_ptr<IGPURenderpass> renderpassInBetween; // this renderpass will load the attachment and transition it to COLOR_ATTACHMENT_OPTIMAL

From 4483e3ce1d71a1333fde8995030aa62685e25717 Mon Sep 17 00:00:00 2001
From: Arkadiusz Lachowicz <areklachowicz@gmail.com>
Date: Thu, 17 Jul 2025 15:29:38 +0200
Subject: [PATCH 487/529] save work

---
 23_Arithmetic2UnitTest/CMakeLists.txt | 38 +++++++++------------------
 CMakeLists.txt                        |  1 +
 2 files changed, 14 insertions(+), 25 deletions(-)

diff --git a/23_Arithmetic2UnitTest/CMakeLists.txt b/23_Arithmetic2UnitTest/CMakeLists.txt
index 0724366c9..e510411f2 100644
--- a/23_Arithmetic2UnitTest/CMakeLists.txt
+++ b/23_Arithmetic2UnitTest/CMakeLists.txt
@@ -1,25 +1,13 @@
-
-include(common RESULT_VARIABLE RES)
-if(NOT RES)
-	message(FATAL_ERROR "common.cmake not found. Should be in {repo_root}/cmake directory")
-endif()
-
-nbl_create_executable_project("" "" "" "" "${NBL_EXECUTABLE_PROJECT_CREATION_PCH_TARGET}")
-
-if(NBL_EMBED_BUILTIN_RESOURCES)
-	set(_BR_TARGET_ ${EXECUTABLE_NAME}_builtinResourceData)
-	set(RESOURCE_DIR "app_resources")
-
-	get_filename_component(_SEARCH_DIRECTORIES_ "${CMAKE_CURRENT_SOURCE_DIR}" ABSOLUTE)
-	get_filename_component(_OUTPUT_DIRECTORY_SOURCE_ "${CMAKE_CURRENT_BINARY_DIR}/src" ABSOLUTE)
-	get_filename_component(_OUTPUT_DIRECTORY_HEADER_ "${CMAKE_CURRENT_BINARY_DIR}/include" ABSOLUTE)
-
-    file(GLOB_RECURSE BUILTIN_RESOURCE_FILES RELATIVE "${CMAKE_CURRENT_SOURCE_DIR}/${RESOURCE_DIR}" CONFIGURE_DEPENDS "${CMAKE_CURRENT_SOURCE_DIR}/${RESOURCE_DIR}/*")
-    foreach(RES_FILE ${BUILTIN_RESOURCE_FILES})
-      LIST_BUILTIN_RESOURCE(RESOURCES_TO_EMBED "${RES_FILE}")
-    endforeach()
-
-	ADD_CUSTOM_BUILTIN_RESOURCES(${_BR_TARGET_} RESOURCES_TO_EMBED "${_SEARCH_DIRECTORIES_}" "${RESOURCE_DIR}" "nbl::this_example::builtin" "${_OUTPUT_DIRECTORY_HEADER_}" "${_OUTPUT_DIRECTORY_SOURCE_}")
-
-	LINK_BUILTIN_RESOURCES_TO_TARGET(${EXECUTABLE_NAME} ${_BR_TARGET_})
-endif()
\ No newline at end of file
+include(common)
+
+nbl_create_executable_project("" "" "" "")
+
+get_filename_component(MOUNT_POINT "${CMAKE_CURRENT_SOURCE_DIR}/app_resources" ABSOLUTE)
+file(GLOB_RECURSE KEYS RELATIVE ${MOUNT_POINT} CONFIGURE_DEPENDS app_resources/*.hlsl)
+NBL_CREATE_RESOURCE_ARCHIVE(
+	TARGET ${EXECUTABLE_NAME}_builtins
+	LINK_TO ${EXECUTABLE_NAME}
+	BIND ${MOUNT_POINT}
+	BUILTINS ${KEYS}
+	NAMESPACE nbl::this_example::builtin
+)
\ No newline at end of file
diff --git a/CMakeLists.txt b/CMakeLists.txt
index 8cf1364a5..80b3889e4 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -96,6 +96,7 @@ if(NBL_BUILD_EXAMPLES)
 	foreach(T IN LISTS TARGETS)
         target_link_libraries(${T} PUBLIC ${NBL_EXAMPLES_API_TARGET})
 		target_include_directories(${T} PUBLIC $<TARGET_PROPERTY:${NBL_EXAMPLES_API_TARGET},INCLUDE_DIRECTORIES>)
+		set_target_properties(${T} PROPERTIES DISABLE_PRECOMPILE_HEADERS OFF)
 		target_precompile_headers(${T} REUSE_FROM "${NBL_EXAMPLES_API_TARGET}")
 
 		if(NBL_EMBED_BUILTIN_RESOURCES)

From d3ad6770272fa7cae2f5f14c93d47c44c5ff9fe4 Mon Sep 17 00:00:00 2001
From: Erfan Ahmadi <ahmadierfan99@gmail.com>
Date: Fri, 18 Jul 2025 08:57:32 +0400
Subject: [PATCH 488/529] Shader Compilation Fixes

---
 62_CAD/main.cpp                               | 42 +++++++++----------
 .../shaders/main_pipeline/vertex_shader.hlsl  |  3 +-
 2 files changed, 21 insertions(+), 24 deletions(-)

diff --git a/62_CAD/main.cpp b/62_CAD/main.cpp
index ac04a37a4..2fe936f68 100644
--- a/62_CAD/main.cpp
+++ b/62_CAD/main.cpp
@@ -961,30 +961,27 @@ class ComputerAidedDesign final : public nbl::examples::SimpleWindowedApplicatio
 			}
 
 			// Load Custom Shader
-auto loadCompileShader = [&](const std::string& relPath, IShader::E_SHADER_STAGE stage) -> smart_refctd_ptr<IShader>
-	{
-		IAssetLoader::SAssetLoadParams lp = {};
-		lp.logger = m_logger.get();
-		lp.workingDirectory = ""; // virtual root
-		auto assetBundle = m_assetMgr->getAsset(relPath, lp);
-		const auto assets = assetBundle.getContents();
-		if (assets.empty())
-			return nullptr;
-
-		// lets go straight from ICPUSpecializedShader to IGPUSpecializedShader
-		auto source = IAsset::castDown<IShader>(assets[0]);
-		if (!source)
-			return nullptr;
+			auto loadCompileShader = [&](const std::string& relPath) -> smart_refctd_ptr<IShader>
+				{
+					IAssetLoader::SAssetLoadParams lp = {};
+					lp.logger = m_logger.get();
+					lp.workingDirectory = ""; // virtual root
+					auto assetBundle = m_assetMgr->getAsset(relPath, lp);
+					const auto assets = assetBundle.getContents();
+					if (assets.empty())
+						return nullptr;
+
+					// lets go straight from ICPUSpecializedShader to IGPUSpecializedShader
+					auto source = IAsset::castDown<IShader>(assets[0]);
+					if (!source)
+						return nullptr;
 	
-		return m_device->compileShader({ source.get(), nullptr, shaderReadCache.get(), shaderWriteCache.get() });
-	};
+					return m_device->compileShader( ILogicalDevice::SShaderCreationParameters { .source = source.get(), .readCache = shaderReadCache.get(), .writeCache = shaderWriteCache.get(), .stage = IShader::E_SHADER_STAGE::ESS_ALL_OR_LIBRARY });
+				};
 
-			auto mainPipelineFragmentShader = loadCompileShader("../shaders/main_pipeline/fragment.hlsl", IShader::E_SHADER_STAGE::ESS_ALL_OR_LIBRARY);
-			auto mainPipelineVertexShader = loadCompileShader("../shaders/main_pipeline/vertex_shader.hlsl", IShader::E_SHADER_STAGE::ESS_VERTEX);
-			// auto geoTexturePipelineVertShader = loadCompileShader(GeoTextureRenderer::VertexShaderRelativePath, IShader::E_SHADER_STAGE::ESS_VERTEX);
-			// auto geoTexturePipelineFragShader = loadCompileShader(GeoTextureRenderer::FragmentShaderRelativePath, IShader::E_SHADER_STAGE::ESS_FRAGMENT);
+			mainPipelineFragmentShaders = loadCompileShader("../shaders/main_pipeline/fragment.hlsl");
+			mainPipelineVertexShader = loadCompileShader("../shaders/main_pipeline/vertex_shader.hlsl");
 			
-#if 0
 			core::smart_refctd_ptr<system::IFile> shaderWriteCacheFile;
 			{
 				system::ISystem::future_t<core::smart_refctd_ptr<system::IFile>> future;
@@ -1010,7 +1007,6 @@ auto loadCompileShader = [&](const std::string& relPath, IShader::E_SHADER_STAGE
 				else
 					m_logger->log("Failed Creating Shader Cache File.", ILogger::ELL_ERROR);
 			}
-#endif
 		}
 
 		// Shared Blend Params between pipelines
@@ -1042,7 +1038,7 @@ auto loadCompileShader = [&](const std::string& relPath, IShader::E_SHADER_STAGE
 			video::IGPUPipelineBase::SShaderSpecInfo specInfo[2] = {
 				{
 					.shader = mainPipelineVertexShader.get(),
-					.entryPoint = "main"
+					.entryPoint = "vtxMain"
 				},
 				{
 					.shader = mainPipelineFragmentShaders.get(),
diff --git a/62_CAD/shaders/main_pipeline/vertex_shader.hlsl b/62_CAD/shaders/main_pipeline/vertex_shader.hlsl
index 6cdfc6b0c..407731ffe 100644
--- a/62_CAD/shaders/main_pipeline/vertex_shader.hlsl
+++ b/62_CAD/shaders/main_pipeline/vertex_shader.hlsl
@@ -106,7 +106,8 @@ void dilateHatch<false>(out float2 outOffsetVec, out float2 outUV, const float2
     // Or optionally we could dilate and stuff when we know this hatch is opaque (alpha = 1.0)
 }
 
-PSInput main(uint vertexID : SV_VertexID)
+[shader("vertex")]
+PSInput vtxMain(uint vertexID : SV_VertexID)
 {
     NDCClipProjectionData clipProjectionData;
     

From aafa64132de4ba4126c4fd82d59c035fe46cadf4 Mon Sep 17 00:00:00 2001
From: Erfan Ahmadi <ahmadierfan99@gmail.com>
Date: Fri, 18 Jul 2025 13:19:42 +0400
Subject: [PATCH 489/529] WIP: Transformations and ScreenToWorld Fixes

---
 62_CAD/main.cpp                               | 45 ++++++++++++-------
 62_CAD/shaders/globals.hlsl                   |  8 ++--
 62_CAD/shaders/main_pipeline/common.hlsl      | 27 ++++++-----
 62_CAD/shaders/main_pipeline/dtm.hlsl         | 18 ++++----
 .../main_pipeline/fragment_shader.hlsl        | 28 +++++++-----
 .../shaders/main_pipeline/vertex_shader.hlsl  | 23 ++++++++--
 6 files changed, 91 insertions(+), 58 deletions(-)

diff --git a/62_CAD/main.cpp b/62_CAD/main.cpp
index 2fe936f68..459ffe6ea 100644
--- a/62_CAD/main.cpp
+++ b/62_CAD/main.cpp
@@ -83,7 +83,7 @@ constexpr std::array<float, (uint32_t)ExampleMode::CASE_COUNT> cameraExtents =
 	1000.0	// CASE_11
 };
 
-constexpr ExampleMode mode = ExampleMode::CASE_11;
+constexpr ExampleMode mode = ExampleMode::CASE_5;
 
 class Camera2D
 {
@@ -1493,7 +1493,8 @@ class ComputerAidedDesign final : public nbl::examples::SimpleWindowedApplicatio
 
 		// TEST CAMERA ROTATION
 #if 0
-		double rotation = 0.25 * PI<double>();
+		// double rotation = 0.25 * PI<double>();
+		double rotation = abs(cos(m_timeElapsed * 0.0001)) * PI<double>();
 		float64_t2 rotationVec = float64_t2(cos(rotation), sin(rotation));
 		float64_t3x3 rotationParameter = float64_t3x3 {
 			rotationVec.x, rotationVec.y, 0.0,
@@ -1517,11 +1518,9 @@ class ComputerAidedDesign final : public nbl::examples::SimpleWindowedApplicatio
 		globalData.resolution = uint32_t2{ m_window->getWidth(), m_window->getHeight() };
 		globalData.defaultProjectionToNDC = projectionToNDC;
 		float screenToWorld = getScreenToWorldRatio(globalData.defaultProjectionToNDC, globalData.resolution);
-		globalData.screenToWorldRatio = screenToWorld;
-		globalData.worldToScreenRatio = (1.0f/screenToWorld);
-		globalData.screenToWorldScaleTransform = float64_t3x3(globalData.worldToScreenRatio, 0.0f, 0.0f,
-														 0.0f, globalData.worldToScreenRatio, 0.0f,
-														 0.0f, 0.0f, 1.0f);
+		globalData.screenToWorldScaleTransform = float64_t3x3(	1.0f / screenToWorld, 0.0f, 0.0f,
+																0.0f, 1.0f / screenToWorld, 0.0f,
+																0.0f, 0.0f, 1.0f);
 		globalData.miterLimit = 10.0f;
 		globalData.currentlyActiveMainObjectIndex = drawResourcesFiller.getActiveMainObjectIndex();
 		SBufferRange<IGPUBuffer> globalBufferUpdateRange = { .offset = 0ull, .size = sizeof(Globals), .buffer = m_globalsBuffer.get() };
@@ -1646,14 +1645,27 @@ class ComputerAidedDesign final : public nbl::examples::SimpleWindowedApplicatio
 			cb->bindGraphicsPipeline(resolveAlphaGraphicsPipeline.get());
 			nbl::ext::FullScreenTriangle::recordDrawCall(cb);
 		}
-
+		
 		if constexpr (DebugModeWireframe)
 		{
-			const uint32_t indexCount = resourcesCollection.drawObjects.getCount() * 6u;
 			cb->bindGraphicsPipeline(debugGraphicsPipeline.get());
-			cb->drawIndexed(indexCount, 1u, 0u, 0u, 0u);
-		}
 
+			for (auto& drawCall : drawResourcesFiller.getDrawCalls())
+			{
+				PushConstants pc = {
+					.isDTMRendering = false
+				};
+				cb->pushConstants(debugGraphicsPipeline->getLayout(), IShader::E_SHADER_STAGE::ESS_VERTEX | IShader::E_SHADER_STAGE::ESS_FRAGMENT, 0, sizeof(PushConstants), &pc);
+
+				const uint64_t indexOffset = drawCall.drawObj.drawObjectStart * 6u;
+				const uint64_t indexCount = drawCall.drawObj.drawObjectCount * 6u;
+
+				// assert(currentIndexCount == resourcesCollection.indexBuffer.getCount());
+				cb->bindIndexBuffer({ .offset = resourcesCollection.indexBuffer.bufferOffset + indexOffset * sizeof(uint32_t), .buffer = resourcesGPUBuffer.get()}, asset::EIT_32BIT);
+
+				cb->drawIndexed(indexCount, 1u, 0u, 0u, 0u);
+			}
+		}
 		cb->endRenderPass();
 
 		if (!inBetweenSubmit)
@@ -2445,11 +2457,11 @@ class ComputerAidedDesign final : public nbl::examples::SimpleWindowedApplicatio
 		else if (mode == ExampleMode::CASE_5)
 		{
 //#define CASE_5_POLYLINE_1 // animated stipple pattern
-//#define CASE_5_POLYLINE_2 // miter test static
+#define CASE_5_POLYLINE_2 // miter test static
 //#define CASE_5_POLYLINE_3 // miter test animated
 //#define CASE_5_POLYLINE_4 // miter test animated (every angle)
 //#define CASE_5_POLYLINE_5 // closed polygon
-#define CASE_5_POLYLINE_6 // stretching
+// #define CASE_5_POLYLINE_6 // stretching
 //#define CASE_5_POLYLINE_7 // wide non solid lines
 
 #if defined(CASE_5_POLYLINE_1)
@@ -2565,7 +2577,7 @@ class ComputerAidedDesign final : public nbl::examples::SimpleWindowedApplicatio
 				/*quadratics2[3].P0 = {20.0, 50.0};
 				quadratics2[3].P1 = { -80.0, 100.0 };
 				quadratics2[3].P2 = { -100.0, 90.0 };*/
-				polyline.addQuadBeziers(core::SRange<shapes::QuadraticBezier<double>>(quadratics2.data(), quadratics2.data() + quadratics2.size()));
+				polyline.addQuadBeziers(quadratics2);
 
 				// section 3: lines
 				std::vector<float64_t2> linePoints2;
@@ -3679,9 +3691,8 @@ class ComputerAidedDesign final : public nbl::examples::SimpleWindowedApplicatio
 	double getScreenToWorldRatio(const float64_t3x3& viewProjectionMatrix, uint32_t2 windowSize)
 	{
 		double idx_0_0 = viewProjectionMatrix[0u][0u] * (windowSize.x / 2.0);
-		double idx_1_1 = viewProjectionMatrix[1u][1u] * (windowSize.y / 2.0);
-		double det_2x2_mat = idx_0_0 * idx_1_1;
-		return static_cast<float>(core::sqrt(core::abs(det_2x2_mat)));
+		double idx_1_0 = viewProjectionMatrix[1u][0u] * (windowSize.y / 2.0);
+		return hlsl::length(float64_t2(idx_0_0, idx_1_0));
 	}
 
 protected:
diff --git a/62_CAD/shaders/globals.hlsl b/62_CAD/shaders/globals.hlsl
index 2c645baf3..dbf0cf390 100644
--- a/62_CAD/shaders/globals.hlsl
+++ b/62_CAD/shaders/globals.hlsl
@@ -68,8 +68,6 @@ struct Globals
     Pointers pointers;
     pfloat64_t3x3 defaultProjectionToNDC;
     pfloat64_t3x3 screenToWorldScaleTransform; // Pre-multiply your transform with this to scale in screen space (e.g., scale 100.0 means 100 screen pixels).
-    float screenToWorldRatio;
-    float worldToScreenRatio;
     uint32_t2 resolution;
     float antiAliasingFactor;
     uint32_t miterLimit;
@@ -77,7 +75,7 @@ struct Globals
     float32_t _padding;
 };
 #ifndef __HLSL_VERSION
-static_assert(sizeof(Globals) == 232u);
+static_assert(sizeof(Globals) == 224u);
 #endif
 
 #ifdef __HLSL_VERSION
@@ -330,7 +328,7 @@ static float32_t3 unpackR11G11B10_UNORM(uint32_t packed)
 struct PolylineConnector
 {
     pfloat64_t2 circleCenter;
-    float32_t2 v;
+    float32_t2 v; // the vector from circle center to the intersection of the line ends, it's normalized such that the radius of the circle is equal to 1
     float32_t cosAngleDifferenceHalf;
     float32_t _reserved_pad;
 };
@@ -477,6 +475,8 @@ struct DTMSettings
     uint32_t contourSettingsCount;
     DTMContourSettings contourSettings[MaxContourSettings];
 
+    uint32_t workaroundForSpirvOptimizerBugToMakeNextMembersAlignmentEqualTo16_LOL;
+    
     // height shading
     DTMHeightShadingSettings heightShadingSettings;
     
diff --git a/62_CAD/shaders/main_pipeline/common.hlsl b/62_CAD/shaders/main_pipeline/common.hlsl
index bb2770a31..7ce0e2adf 100644
--- a/62_CAD/shaders/main_pipeline/common.hlsl
+++ b/62_CAD/shaders/main_pipeline/common.hlsl
@@ -75,19 +75,21 @@ struct PrecomputedRootFinder
  //     As always try to reuse parameters and try not to introduce new ones
 struct PSInput
 {
-    float4 position : SV_Position;
-    float4 clip : SV_ClipDistance;
-    [[vk::location(0)]] nointerpolation uint4 data1 : COLOR1;
-    [[vk::location(1)]] nointerpolation float4 data2 : COLOR2;
-    [[vk::location(2)]] nointerpolation float4 data3 : COLOR3;
-    [[vk::location(3)]] nointerpolation float4 data4 : COLOR4;
+    [[vk::location(0)]] float4 position : SV_Position;
+    [[vk::location(1)]] float4 clip : SV_ClipDistance;
+    
+    [[vk::location(2)]] nointerpolation uint4 data1 : COLOR1;
+    [[vk::location(3)]] nointerpolation float4 data2 : COLOR2;
+    [[vk::location(4)]] nointerpolation float4 data3 : COLOR3;
+    [[vk::location(5)]] nointerpolation float4 data4 : COLOR4;
     // Data segments that need interpolation, mostly for hatches
-    [[vk::location(5)]] float4 interp_data5 : COLOR5;
+    [[vk::location(6)]] float4 interp_data5 : COLOR5;
 #ifdef FRAGMENT_SHADER_INPUT
-    [[vk::location(6)]] [[vk::ext_decorate(/*spv::DecoratePerVertexKHR*/5285)]] float3 vertexScreenSpacePos[3] : COLOR6;
+    [[vk::location(7)]] [[vk::ext_decorate(/*spv::DecoratePerVertexKHR*/5285)]] float3 vertexScreenSpacePos[3] : COLOR6;
 #else
-    [[vk::location(6)]] float3 vertexScreenSpacePos : COLOR6;
+    [[vk::location(7)]] float3 vertexScreenSpacePos : COLOR6;
 #endif
+    [[vk::location(14)]] nointerpolation float data6 : COLOR7; // TODO: Why is location 8 consumed by SV_Position
     // ArcLenCalculator<float>
 
     // Set functions used in vshader, get functions used in fshader
@@ -110,9 +112,6 @@ struct PSInput
     void setCurrentPhaseShift(float phaseShift)  { interp_data5.x = phaseShift; }
     float getCurrentPhaseShift() { return interp_data5.x; }
 
-    void setCurrentWorldToScreenRatio(float worldToScreen) { interp_data5.y = worldToScreen; }
-    float getCurrentWorldToScreenRatio() { return interp_data5.y; }
-
     /* LINE */
     float2 getLineStart() { return data2.xy; }
     float2 getLineEnd() { return data2.zw; }
@@ -239,6 +238,10 @@ struct PSInput
     void setGridDTMHeightTextureID(uint textureID) { data1.z = textureID; }
     void setGridDTMScreenSpaceGridExtents(float2 screenSpaceGridExtends) { data2.xy = screenSpaceGridExtends; }
     void setGridDTMScreenSpaceCellWidth(float screenSpaceGridWidth) { data2.z = screenSpaceGridWidth; }
+
+    void setCurrentWorldToScreenRatio(float worldToScreen) { data6.x = worldToScreen; }
+    float getCurrentWorldToScreenRatio() { return data6.x; }
+
 };
 
 // Set 0 - Scene Data and Globals, buffer bindings don't change the buffers only get updated
diff --git a/62_CAD/shaders/main_pipeline/dtm.hlsl b/62_CAD/shaders/main_pipeline/dtm.hlsl
index ac16d7723..be10c2f31 100644
--- a/62_CAD/shaders/main_pipeline/dtm.hlsl
+++ b/62_CAD/shaders/main_pipeline/dtm.hlsl
@@ -234,10 +234,10 @@ float4 calculateDTMHeightColor(in DTMHeightShadingSettings settings, in float3 t
     return outputColor;
 }
 
-float calculateDTMContourSDF(in DTMContourSettings contourSettings, in LineStyle contourStyle, in float3 v[3], in float2 fragPos, in float height)
+float calculateDTMContourSDF(in DTMContourSettings contourSettings, in LineStyle contourStyle, in float worldToScreenRatio, in float3 v[3], in float2 fragPos, in float height)
 {
     float distance = nbl::hlsl::numeric_limits<float>::max;
-    const float contourThickness = (contourStyle.screenSpaceLineWidth + contourStyle.worldSpaceLineWidth * globals.screenToWorldRatio) * 0.5f;
+    const float contourThickness = (contourStyle.screenSpaceLineWidth + contourStyle.worldSpaceLineWidth / worldToScreenRatio) * 0.5f;
     const float stretch = 1.0f;
     const float phaseShift = 0.0f;
 
@@ -286,7 +286,7 @@ float calculateDTMContourSDF(in DTMContourSettings contourSettings, in LineStyle
             // It might be beneficial to calculate distance between pixel and contour line to early out some pixels and save yourself from stipple sdf computations!
             // where you only compute the complex sdf if abs((height - contourVal) / heightDeriv) <= aaFactor
             nbl::hlsl::shapes::Line<float>::ArcLengthCalculator arcLenCalc = nbl::hlsl::shapes::Line<float>::ArcLengthCalculator::construct(lineSegment);
-            LineStyleClipper clipper = LineStyleClipper::construct(contourStyle, lineSegment, arcLenCalc, phaseShift, stretch, globals.worldToScreenRatio);
+            LineStyleClipper clipper = LineStyleClipper::construct(contourStyle, lineSegment, arcLenCalc, phaseShift, stretch, worldToScreenRatio);
             distance = ClippedSignedDistance<nbl::hlsl::shapes::Line<float>, LineStyleClipper>::sdf(lineSegment, fragPos, contourThickness, contourStyle.isRoadStyleFlag, clipper);
         }
     }
@@ -294,12 +294,12 @@ float calculateDTMContourSDF(in DTMContourSettings contourSettings, in LineStyle
     return distance;
 }
 
-float4 calculateDTMOutlineColor(in uint outlineLineStyleIdx, in float3 v[3], in float2 fragPos)
+float4 calculateDTMOutlineColor(in uint outlineLineStyleIdx, in float worldToScreenRatio, in float3 v[3], in float2 fragPos)
 {
     float4 outputColor;
 
     LineStyle outlineStyle = loadLineStyle(outlineLineStyleIdx);
-    const float outlineThickness = (outlineStyle.screenSpaceLineWidth + outlineStyle.worldSpaceLineWidth * globals.screenToWorldRatio) * 0.5f;
+    const float outlineThickness = (outlineStyle.screenSpaceLineWidth + outlineStyle.worldSpaceLineWidth / worldToScreenRatio) * 0.5f;
     const float phaseShift = 0.0f; // input.getCurrentPhaseShift();
     const float stretch = 1.0f;
 
@@ -343,7 +343,7 @@ float4 calculateDTMOutlineColor(in uint outlineLineStyleIdx, in float3 v[3], in
 
             float distance = nbl::hlsl::numeric_limits<float>::max;
             nbl::hlsl::shapes::Line<float>::ArcLengthCalculator arcLenCalc = nbl::hlsl::shapes::Line<float>::ArcLengthCalculator::construct(lineSegment);
-            LineStyleClipper clipper = LineStyleClipper::construct(outlineStyle, lineSegment, arcLenCalc, phaseShift, stretch, globals.worldToScreenRatio);
+            LineStyleClipper clipper = LineStyleClipper::construct(outlineStyle, lineSegment, arcLenCalc, phaseShift, stretch, worldToScreenRatio);
             distance = ClippedSignedDistance<nbl::hlsl::shapes::Line<float>, LineStyleClipper>::sdf(lineSegment, fragPos, outlineThickness, outlineStyle.isRoadStyleFlag, clipper);
 
             minDistance = min(minDistance, distance);
@@ -360,9 +360,9 @@ float4 calculateDTMOutlineColor(in uint outlineLineStyleIdx, in float3 v[3], in
 // TODO:
 // It's literally sdf with a line shape
 // so it should be moved somewhere else and used for every line maybe
-float calculateLineSDF(in LineStyle lineStyle, in nbl::hlsl::shapes::Line<float> lineSegment, in float2 fragPos, in float phaseShift)
+float calculateLineSDF(in LineStyle lineStyle, in float worldToScreenRatio, in nbl::hlsl::shapes::Line<float> lineSegment, in float2 fragPos, in float phaseShift)
 {
-    const float outlineThickness = (lineStyle.screenSpaceLineWidth + lineStyle.worldSpaceLineWidth * globals.screenToWorldRatio) * 0.5f;
+    const float outlineThickness = (lineStyle.screenSpaceLineWidth + lineStyle.worldSpaceLineWidth / worldToScreenRatio) * 0.5f;
     const float stretch = 1.0f;
 
     float minDistance = nbl::hlsl::numeric_limits<float>::max;
@@ -376,7 +376,7 @@ float calculateLineSDF(in LineStyle lineStyle, in nbl::hlsl::shapes::Line<float>
     {
         float distance = nbl::hlsl::numeric_limits<float>::max;
         nbl::hlsl::shapes::Line<float>::ArcLengthCalculator arcLenCalc = nbl::hlsl::shapes::Line<float>::ArcLengthCalculator::construct(lineSegment);
-        LineStyleClipper clipper = LineStyleClipper::construct(lineStyle, lineSegment, arcLenCalc, phaseShift, stretch, globals.worldToScreenRatio);
+        LineStyleClipper clipper = LineStyleClipper::construct(lineStyle, lineSegment, arcLenCalc, phaseShift, stretch, worldToScreenRatio);
         distance = ClippedSignedDistance<nbl::hlsl::shapes::Line<float>, LineStyleClipper>::sdf(lineSegment, fragPos, outlineThickness, lineStyle.isRoadStyleFlag, clipper);
 
         minDistance = min(minDistance, distance);
diff --git a/62_CAD/shaders/main_pipeline/fragment_shader.hlsl b/62_CAD/shaders/main_pipeline/fragment_shader.hlsl
index cf249bf34..222384c55 100644
--- a/62_CAD/shaders/main_pipeline/fragment_shader.hlsl
+++ b/62_CAD/shaders/main_pipeline/fragment_shader.hlsl
@@ -135,6 +135,7 @@ float4 fragMain(PSInput input) : SV_TARGET
     ObjectType objType = input.getObjType();
     const uint32_t currentMainObjectIdx = input.getMainObjectIdx();
     const MainObject mainObj = loadMainObject(currentMainObjectIdx);
+    float worldToScreenRatio = input.getCurrentWorldToScreenRatio();
     
     if (pc.isDTMRendering)
     {
@@ -153,13 +154,13 @@ float4 fragMain(PSInput input) : SV_TARGET
         float4 dtmColor = float4(0.0f, 0.0f, 0.0f, 0.0f);
         
         if (dtmSettings.drawOutlineEnabled())                                                                                                    // TODO: do i need 'height' paramter here?
-            dtmColor = dtm::blendUnder(dtmColor, dtm::calculateDTMOutlineColor(dtmSettings.outlineLineStyleIdx, triangleVertices, input.position.xy));
+            dtmColor = dtm::blendUnder(dtmColor, dtm::calculateDTMOutlineColor(dtmSettings.outlineLineStyleIdx, worldToScreenRatio, triangleVertices, input.position.xy));
         if (dtmSettings.drawContourEnabled())
         {
             for(uint32_t i = 0; i < dtmSettings.contourSettingsCount; ++i) // TODO: should reverse the order with blendUnder
             {
                 LineStyle contourStyle = loadLineStyle(dtmSettings.contourSettings[i].contourLineStyleIdx);
-                float sdf = dtm::calculateDTMContourSDF(dtmSettings.contourSettings[i], contourStyle, triangleVertices, input.position.xy, height);
+                float sdf = dtm::calculateDTMContourSDF(dtmSettings.contourSettings[i], contourStyle, worldToScreenRatio, triangleVertices, input.position.xy, height);
                 float4 contourColor = contourStyle.color;
                 contourColor.a *= 1.0f - smoothstep(-globals.antiAliasingFactor, globals.antiAliasingFactor, sdf);
                 dtmColor = dtm::blendUnder(dtmColor, contourColor);
@@ -203,7 +204,7 @@ float4 fragMain(PSInput input) : SV_TARGET
                 else
                 {
                     nbl::hlsl::shapes::Line<float>::ArcLengthCalculator arcLenCalc = nbl::hlsl::shapes::Line<float>::ArcLengthCalculator::construct(lineSegment);
-                    LineStyleClipper clipper = LineStyleClipper::construct(loadLineStyle(styleIdx), lineSegment, arcLenCalc, phaseShift, stretch, globals.worldToScreenRatio);
+                    LineStyleClipper clipper = LineStyleClipper::construct(loadLineStyle(styleIdx), lineSegment, arcLenCalc, phaseShift, stretch, worldToScreenRatio);
                     distance = ClippedSignedDistance<nbl::hlsl::shapes::Line<float>, LineStyleClipper>::sdf(lineSegment, input.position.xy, thickness, style.isRoadStyleFlag, clipper);
                 }
             }
@@ -224,7 +225,7 @@ float4 fragMain(PSInput input) : SV_TARGET
                 }
                 else
                 {
-                    BezierStyleClipper clipper = BezierStyleClipper::construct(loadLineStyle(styleIdx), quadratic, arcLenCalc, phaseShift, stretch, globals.worldToScreenRatio );
+                    BezierStyleClipper clipper = BezierStyleClipper::construct(loadLineStyle(styleIdx), quadratic, arcLenCalc, phaseShift, stretch, worldToScreenRatio );
                     distance = ClippedSignedDistance<nbl::hlsl::shapes::Quadratic<float>, BezierStyleClipper>::sdf(quadratic, input.position.xy, thickness, style.isRoadStyleFlag, clipper);
                 }
             }
@@ -241,6 +242,9 @@ float4 fragMain(PSInput input) : SV_TARGET
 
             }
             localAlpha = 1.0f - smoothstep(-globals.antiAliasingFactor, globals.antiAliasingFactor, distance);
+            
+            // if (objType != ObjectType::POLYLINE_CONNECTOR)
+            //    localAlpha *= 0.3f;
         }
         else if (objType == ObjectType::CURVE_BOX) 
         {
@@ -458,8 +462,8 @@ float4 fragMain(PSInput input) : SV_TARGET
                 outlineLineSegments[1].P1 = float32_t2(nearestLineRemainingCoords.x, horizontalBounds.y);
                 
                 LineStyle outlineStyle = loadLineStyle(dtmSettings.outlineLineStyleIdx);
-                float sdf = dtm::calculateLineSDF(outlineStyle, outlineLineSegments[0], gridSpacePos, 0.0f);
-                sdf = min(sdf, dtm::calculateLineSDF(outlineStyle, outlineLineSegments[1], gridSpacePos, 0.0f));
+                float sdf = dtm::calculateLineSDF(outlineStyle, worldToScreenRatio, outlineLineSegments[0], gridSpacePos, 0.0f);
+                sdf = min(sdf, dtm::calculateLineSDF(outlineStyle, worldToScreenRatio, outlineLineSegments[1], gridSpacePos, 0.0f));
 
                 float4 dtmColor = outlineStyle.color;
                 dtmColor.a *= 1.0f - smoothstep(-globals.antiAliasingFactor, globals.antiAliasingFactor, sdf);
@@ -572,7 +576,7 @@ float4 fragMain(PSInput input) : SV_TARGET
                         {
                             const dtm::GridDTMTriangle tri = triangles[t];
                             const float currentInterpolatedHeight = interpolatedHeights[t];
-                            sdf = min(sdf, dtm::calculateDTMContourSDF(dtmSettings.contourSettings[i], contourStyle, tri.vertices, gridSpacePos, currentInterpolatedHeight));
+                            sdf = min(sdf, dtm::calculateDTMContourSDF(dtmSettings.contourSettings[i], contourStyle, worldToScreenRatio, tri.vertices, gridSpacePos, currentInterpolatedHeight));
                         }
                         
                         float4 contourColor = contourStyle.color; contourColor.a = 0.5f;
@@ -594,7 +598,7 @@ float4 fragMain(PSInput input) : SV_TARGET
                     
                     float phaseShift = 0.0f;
                     const bool hasStipples = outlineStyle.hasStipples();
-                    const float rcpPattenLenScreenSpace = outlineStyle.reciprocalStipplePatternLen * globals.worldToScreenRatio;
+                    const float rcpPattenLenScreenSpace = outlineStyle.reciprocalStipplePatternLen * worldToScreenRatio;
                     // Drawing the lines that form a plus sign around the current corner:
                     if (linesValidity[0])
                     {
@@ -602,7 +606,7 @@ float4 fragMain(PSInput input) : SV_TARGET
                         lineSegment.P0 = float2((offset.x > 0) ? -offset.x * cellWidth : 0.0f, 0.0f);
                         lineSegment.P1 = float2((offset.x < 0) ? -offset.x * cellWidth : 0.0f, 0.0f);
                         phaseShift = fract((lineSegment.P0.x - localGridTopLeftCorner.x) * rcpPattenLenScreenSpace);
-                        sdf = min(sdf, dtm::calculateLineSDF(outlineStyle, lineSegment, localFragPos, phaseShift));
+                        sdf = min(sdf, dtm::calculateLineSDF(outlineStyle, worldToScreenRatio, lineSegment, localFragPos, phaseShift));
                     }
                     if (linesValidity[1])
                     {
@@ -610,7 +614,7 @@ float4 fragMain(PSInput input) : SV_TARGET
                         lineSegment.P0 = float2(0.0f, (offset.y > 0) ? -offset.y * cellWidth : 0.0f);
                         lineSegment.P1 = float2(0.0f, (offset.y < 0) ? -offset.y * cellWidth : 0.0f);
                         phaseShift = fract((lineSegment.P0.y - localGridTopLeftCorner.y) * rcpPattenLenScreenSpace);
-                        sdf = min(sdf, dtm::calculateLineSDF(outlineStyle, lineSegment, localFragPos, phaseShift));
+                        sdf = min(sdf, dtm::calculateLineSDF(outlineStyle, worldToScreenRatio, lineSegment, localFragPos, phaseShift));
                     }
                     if (linesValidity[2])
                     {
@@ -618,7 +622,7 @@ float4 fragMain(PSInput input) : SV_TARGET
                         lineSegment.P0 = float2((offset.x < 0) ? offset.x * cellWidth : 0.0f, 0.0f);
                         lineSegment.P1 = float2((offset.x > 0) ? offset.x * cellWidth : 0.0f, 0.0f);
                         phaseShift = fract((lineSegment.P0.x - localGridTopLeftCorner.x) * rcpPattenLenScreenSpace);
-                        sdf = min(sdf, dtm::calculateLineSDF(outlineStyle, lineSegment, localFragPos, phaseShift));
+                        sdf = min(sdf, dtm::calculateLineSDF(outlineStyle, worldToScreenRatio, lineSegment, localFragPos, phaseShift));
                     }
                     if (linesValidity[3])
                     {
@@ -626,7 +630,7 @@ float4 fragMain(PSInput input) : SV_TARGET
                         lineSegment.P0 = float2(0.0f, (offset.y < 0) ? offset.y * cellWidth : 0.0f);
                         lineSegment.P1 = float2(0.0f, (offset.y > 0) ? offset.y * cellWidth : 0.0f);
                         phaseShift = fract((lineSegment.P0.y - localGridTopLeftCorner.y) * rcpPattenLenScreenSpace);
-                        sdf = min(sdf, dtm::calculateLineSDF(outlineStyle, lineSegment, localFragPos, phaseShift));
+                        sdf = min(sdf, dtm::calculateLineSDF(outlineStyle, worldToScreenRatio, lineSegment, localFragPos, phaseShift));
                     }
 
                     float4 outlineColor = outlineStyle.color;
diff --git a/62_CAD/shaders/main_pipeline/vertex_shader.hlsl b/62_CAD/shaders/main_pipeline/vertex_shader.hlsl
index 407731ffe..5280e7451 100644
--- a/62_CAD/shaders/main_pipeline/vertex_shader.hlsl
+++ b/62_CAD/shaders/main_pipeline/vertex_shader.hlsl
@@ -72,6 +72,13 @@ float32_t4 transformFromSreenSpaceToNdc(float2 pos, uint32_t2 resolution)
 {
     return float32_t4((pos.xy / (float32_t2)resolution) * 2.0f - 1.0f, 0.0f, 1.0f);
 }
+float32_t getScreenToWorldRatio(pfloat64_t3x3 transformation, uint32_t2 resolution)
+{
+	pfloat64_t idx_0_0 = transformation[0u].x * (resolution.x / 2.0);
+	pfloat64_t idx_1_0 = transformation[1u].x * (resolution.y / 2.0);
+    float32_t2 firstCol; firstCol.x = _static_cast<float32_t>(idx_0_0); firstCol.y = _static_cast<float32_t>(idx_1_0); 
+	return nbl::hlsl::length(firstCol); // TODO: Do length in fp64?
+}
 
 template<bool FragmentShaderPixelInterlock>
 void dilateHatch(out float2 outOffsetVec, out float2 outUV, const float2 undilatedCorner, const float2 dilateRate, const float2 ndcAxisU, const float2 ndcAxisV);
@@ -131,6 +138,10 @@ PSInput vtxMain(uint vertexID : SV_VertexID)
         MainObject mainObj = loadMainObject(pc.triangleMeshMainObjectIndex);
         clipProjectionData = getClipProjectionData(mainObj);
 
+        float screenToWorldRatio = getScreenToWorldRatio(clipProjectionData.projectionToNDC, globals.resolution);
+        float worldToScreenRatio = 1.0f / screenToWorldRatio;
+        outV.setCurrentWorldToScreenRatio(worldToScreenRatio);
+        
         // assuming there are 3 * N vertices, number of vertices is equal to number of indices and indices are sequential starting from 0
         float2 transformedOriginalPos;
         float2 transformedDilatedPos;
@@ -154,7 +165,7 @@ PSInput vtxMain(uint vertexID : SV_VertexID)
             triangleVertices[2].pos = triangleVertices[2].pos - triangleCentroid;
 
             // TODO: calculate dialation factor
-            // const float dilateByPixels = 0.5 * (dtmSettings.maxScreenSpaceLineWidth + dtmSettings.maxWorldSpaceLineWidth * globals.screenToWorldRatio) + aaFactor;
+            // const float dilateByPixels = 0.5 * (dtmSettings.maxScreenSpaceLineWidth + dtmSettings.maxWorldSpaceLineWidth * screenToWorldRatio) + aaFactor;
         
             pfloat64_t dialationFactor = _static_cast<pfloat64_t>(2.0f);
             pfloat64_t2 dialatedVertex = triangleVertices[currentVertexWithinTriangleIndex].pos * dialationFactor;
@@ -193,6 +204,10 @@ PSInput vtxMain(uint vertexID : SV_VertexID)
 
         MainObject mainObj = loadMainObject(drawObj.mainObjIndex);
         clipProjectionData = getClipProjectionData(mainObj);
+        
+        float screenToWorldRatio = getScreenToWorldRatio(clipProjectionData.projectionToNDC, globals.resolution);
+        float worldToScreenRatio = 1.0f / screenToWorldRatio;
+        outV.setCurrentWorldToScreenRatio(worldToScreenRatio);
     
         // We only need these for Outline type objects like lines and bezier curves
         if (objType == ObjectType::LINE || objType == ObjectType::QUAD_BEZIER || objType == ObjectType::POLYLINE_CONNECTOR)
@@ -200,7 +215,7 @@ PSInput vtxMain(uint vertexID : SV_VertexID)
             LineStyle lineStyle = loadLineStyle(mainObj.styleIdx);
 
             // Width is on both sides, thickness is one one side of the curve (div by 2.0f)
-            const float screenSpaceLineWidth = lineStyle.screenSpaceLineWidth + lineStyle.worldSpaceLineWidth * globals.screenToWorldRatio;
+            const float screenSpaceLineWidth = lineStyle.screenSpaceLineWidth + lineStyle.worldSpaceLineWidth * screenToWorldRatio;
             const float antiAliasedLineThickness = screenSpaceLineWidth * 0.5f + globals.antiAliasingFactor;
             const float sdfLineThickness = screenSpaceLineWidth / 2.0f;
             outV.setLineThickness(sdfLineThickness);
@@ -661,8 +676,8 @@ PSInput vtxMain(uint vertexID : SV_VertexID)
             const float2 corner = float2(bool2(vertexIdx & 0x1u, vertexIdx >> 1));
 
             outV.setGridDTMHeightTextureID(textureID);
-            outV.setGridDTMScreenSpaceCellWidth(gridCellWidth * globals.screenToWorldRatio);
-            outV.setGridDTMScreenSpaceGridExtents(_static_cast<float2>(worldSpaceExtents) * globals.screenToWorldRatio);
+            outV.setGridDTMScreenSpaceCellWidth(gridCellWidth * screenToWorldRatio);
+            outV.setGridDTMScreenSpaceGridExtents(_static_cast<float2>(worldSpaceExtents) * screenToWorldRatio);
 
             static const float SquareRootOfTwo = 1.4142135f;
             const pfloat64_t dilationFactor = _static_cast<pfloat64_t>(SquareRootOfTwo * thicknessOfTheThickestLine);

From 0cf6cd98edb18ea729cbd734d4022c1dae03efea Mon Sep 17 00:00:00 2001
From: devsh <devsh@devsh.eu>
Date: Mon, 21 Jul 2025 16:25:01 +0200
Subject: [PATCH 490/529] vertex input attributes no longer recommended

---
 old_to_refactor/03_GPU_Mesh/CMakeLists.txt  |   7 -
 old_to_refactor/03_GPU_Mesh/main.cpp        | 244 --------------------
 old_to_refactor/03_GPU_Mesh/pipeline.groovy |  50 ----
 3 files changed, 301 deletions(-)
 delete mode 100644 old_to_refactor/03_GPU_Mesh/CMakeLists.txt
 delete mode 100644 old_to_refactor/03_GPU_Mesh/main.cpp
 delete mode 100644 old_to_refactor/03_GPU_Mesh/pipeline.groovy

diff --git a/old_to_refactor/03_GPU_Mesh/CMakeLists.txt b/old_to_refactor/03_GPU_Mesh/CMakeLists.txt
deleted file mode 100644
index a476b6203..000000000
--- a/old_to_refactor/03_GPU_Mesh/CMakeLists.txt
+++ /dev/null
@@ -1,7 +0,0 @@
-
-include(common RESULT_VARIABLE RES)
-if(NOT RES)
-	message(FATAL_ERROR "common.cmake not found. Should be in {repo_root}/cmake directory")
-endif()
-
-nbl_create_executable_project("" "" "" "" "${NBL_EXECUTABLE_PROJECT_CREATION_PCH_TARGET}")
\ No newline at end of file
diff --git a/old_to_refactor/03_GPU_Mesh/main.cpp b/old_to_refactor/03_GPU_Mesh/main.cpp
deleted file mode 100644
index cc871bb9f..000000000
--- a/old_to_refactor/03_GPU_Mesh/main.cpp
+++ /dev/null
@@ -1,244 +0,0 @@
-
-#include "CCamera.hpp"
-
-
-#include "nbl/nblpack.h"
-struct VertexStruct
-{
-    /// every member needs to be at location aligned to its type size for GLSL
-    float Pos[3]; /// uses float hence need 4 byte alignment
-    uint8_t Col[2]; /// same logic needs 1 byte alignment
-    uint8_t uselessPadding[2]; /// so if there is a member with 4 byte alignment then whole struct needs 4 byte align, so pad it
-} PACK_STRUCT;
-#include "nbl/nblunpack.h"
-
-const char* vertexSource = R"===(
-#version 430 core
-
-layout(location = 0) in vec4 vPos; //only a 3d position is passed from Nabla, but last (the W) coordinate gets filled with default 1.0
-layout(location = 1) in vec4 vCol;
-
-layout( push_constant, row_major ) uniform Block {
-	mat4 modelViewProj;
-} PushConstants;
-
-layout(location = 0) out vec4 Color; //per vertex output color, will be interpolated across the triangle
-
-void main()
-{
-    gl_Position = PushConstants.modelViewProj*vPos; //only thing preventing the shader from being core-compliant
-    Color = vCol;
-}
-)===";
-
-const char* fragmentSource = R"===(
-#version 430 core
-
-layout(location = 0) in vec4 Color; //per vertex output color, will be interpolated across the triangle
-
-layout(location = 0) out vec4 pixelColor;
-
-void main()
-{
-    pixelColor = Color;
-}
-)===";
-
-class GPUMesh : public ApplicationBase
-{
-
-public:
-
-	nbl::core::smart_refctd_ptr<video::IGPUFence> gpuTransferFence;
-	nbl::core::smart_refctd_ptr<video::IGPUFence> gpuComputeFence;
-	nbl::video::IGPUObjectFromAssetConverter cpu2gpu;
-
-	CommonAPI::InputSystem::ChannelReader<ui::IMouseEventChannel> mouse;
-	CommonAPI::InputSystem::ChannelReader<ui::IKeyboardEventChannel> keyboard;
-	Camera camera = Camera(vectorSIMDf(0, 0, 0), vectorSIMDf(0, 0, 0), matrix4SIMD());
-
-	int resourceIx = -1;
-	uint32_t acquiredNextFBO = {};
-	std::chrono::system_clock::time_point lastTime;
-	bool frameDataFilled = false;
-	size_t frame_count = 0ull;
-	double time_sum = 0;
-	double dtList[NBL_FRAMES_TO_AVERAGE] = {};
-
-	core::smart_refctd_ptr<video::IGPUFence> frameComplete[FRAMES_IN_FLIGHT] = { nullptr };
-	core::smart_refctd_ptr<video::IGPUSemaphore> imageAcquire[FRAMES_IN_FLIGHT] = { nullptr };
-	core::smart_refctd_ptr<video::IGPUSemaphore> renderFinished[FRAMES_IN_FLIGHT] = { nullptr };
-	core::smart_refctd_ptr<video::IGPUCommandBuffer> commandBuffers[FRAMES_IN_FLIGHT];
-
-	nbl::video::ISwapchain::SCreationParams m_swapchainCreationParams;
-
-
-
-
-	void onAppInitialized_impl() override
-	{
-
-		for (size_t i = 0ull; i < NBL_FRAMES_TO_AVERAGE; ++i)
-			dtList[i] = 0.0;
-
-		matrix4SIMD projectionMatrix = matrix4SIMD::buildProjectionMatrixPerspectiveFovLH(core::radians(60.0f), float(WIN_W) / WIN_H, 0.1, 1000);
-		camera = Camera(core::vectorSIMDf(-4, 0, 0), core::vectorSIMDf(0, 0, 0), projectionMatrix);
-	}
-
-	void workLoopBody() override
-	{
-
-		auto renderStart = std::chrono::system_clock::now();
-		const auto renderDt = std::chrono::duration_cast<std::chrono::milliseconds>(renderStart - lastTime).count();
-		lastTime = renderStart;
-		{ // Calculate Simple Moving Average for FrameTime
-			time_sum -= dtList[frame_count];
-			time_sum += renderDt;
-			dtList[frame_count] = renderDt;
-			frame_count++;
-			if (frame_count >= NBL_FRAMES_TO_AVERAGE)
-			{
-				frameDataFilled = true;
-				frame_count = 0;
-			}
-
-		}
-		const double averageFrameTime = frameDataFilled ? (time_sum / (double)NBL_FRAMES_TO_AVERAGE) : (time_sum / frame_count);
-
-#ifdef NBL_MORE_LOGS
-		logger->log("renderDt = %f ------ averageFrameTime = %f", system::ILogger::ELL_INFO, renderDt, averageFrameTime);
-#endif // NBL_MORE_LOGS
-
-		auto averageFrameTimeDuration = std::chrono::duration<double, std::milli>(averageFrameTime);
-		auto nextPresentationTime = renderStart + averageFrameTimeDuration;
-		auto nextPresentationTimeStamp = std::chrono::duration_cast<std::chrono::microseconds>(nextPresentationTime.time_since_epoch());
-
-		inputSystem->getDefaultMouse(&mouse);
-		inputSystem->getDefaultKeyboard(&keyboard);
-
-		camera.beginInputProcessing(nextPresentationTimeStamp);
-		mouse.consumeEvents([&](const ui::IMouseEventChannel::range_t& events) -> void { camera.mouseProcess(events); }, logger.get());
-		keyboard.consumeEvents([&](const ui::IKeyboardEventChannel::range_t& events) -> void { camera.keyboardProcess(events); }, logger.get());
-		camera.endInputProcessing(nextPresentationTimeStamp);
-
-		const auto& mvp = camera.getConcatenatedMatrix();
-
-
-
-
-
-
-
-
-		asset::SViewport viewport;
-		viewport.minDepth = 1.f;
-		viewport.maxDepth = 0.f;
-		viewport.x = 0u;
-		viewport.y = 0u;
-		viewport.width = WIN_W;
-		viewport.height = WIN_H;
-		commandBuffer->setViewport(0u, 1u, &viewport);
-
-
-
-
-
-
-		//! Stress test for memleaks aside from demo how to create meshes that live on the GPU RAM
-		{
-			VertexStruct vertices[8];
-			vertices[0] = VertexStruct{ {-1.f,-1.f,-1.f},{  0,  0} };
-			vertices[1] = VertexStruct{ { 1.f,-1.f,-1.f},{127,  0} };
-			vertices[2] = VertexStruct{ {-1.f, 1.f,-1.f},{255,  0} };
-			vertices[3] = VertexStruct{ { 1.f, 1.f,-1.f},{  0,127} };
-			vertices[4] = VertexStruct{ {-1.f,-1.f, 1.f},{127,127} };
-			vertices[5] = VertexStruct{ { 1.f,-1.f, 1.f},{255,127} };
-			vertices[6] = VertexStruct{ {-1.f, 1.f, 1.f},{  0,255} };
-			vertices[7] = VertexStruct{ { 1.f, 1.f, 1.f},{127,255} };
-
-			uint16_t indices_indexed16[] =
-			{
-				0,1,2,1,2,3,
-				4,5,6,5,6,7,
-				0,1,4,1,4,5,
-				2,3,6,3,6,7,
-				0,2,4,2,4,6,
-				1,3,5,3,5,7
-			};
-
-			//	auto upStreamBuff = driver->getDefaultUpStreamingBuffer();
-			//	core::smart_refctd_ptr<video::IGPUBuffer> upStreamRef(upStreamBuff->getBuffer());
-
-			//	const void* dataToPlace[2] = { vertices,indices_indexed16 };
-			//	uint32_t offsets[2] = { video::StreamingTransientDataBufferMT<>::invalid_address,video::StreamingTransientDataBufferMT<>::invalid_address };
-			//	uint32_t alignments[2] = { sizeof(decltype(vertices[0u])),sizeof(decltype(indices_indexed16[0u])) };
-			//	uint32_t sizes[2] = { sizeof(vertices),sizeof(indices_indexed16) };
-			//	upStreamBuff->multi_place(2u, (const void* const*)dataToPlace, (uint32_t*)offsets, (uint32_t*)sizes, (uint32_t*)alignments);
-			//	if (upStreamBuff->needsManualFlushOrInvalidate())
-			//	{
-			//		auto upStreamMem = upStreamBuff->getBuffer()->getBoundMemory();
-			//		driver->flushMappedMemoryRanges({ video::IDeviceMemoryAllocation::MappedMemoryRange(upStreamMem,offsets[0],sizes[0]),video::IDeviceMemoryAllocation::MappedMemoryRange(upStreamMem,offsets[1],sizes[1]) });
-			//	}
-
-			//	asset::SPushConstantRange range[1] = { asset::ISpecializedShader::ESS_VERTEX,0u,sizeof(core::matrix4SIMD) };
-
-			//	auto createSpecializedShaderFromSource = [=](const char* source, asset::ISpecializedShader::E_SHADER_STAGE stage)
-			//	{
-			//		auto spirv = device->getAssetManager()->getGLSLCompiler()->createSPIRVFromGLSL(source, stage, "main", "runtimeID");
-			//		auto unspec = driver->createShader(std::move(spirv));
-			//		return driver->createSpecializedShader(unspec.get(), { nullptr,nullptr,"main",stage });
-			//	};
-			//	// origFilepath is only relevant when you have filesystem #includes in your shader
-			//	auto createSpecializedShaderFromSourceWithIncludes = [&](const char* source, asset::ISpecializedShader::E_SHADER_STAGE stage, const char* origFilepath)
-			//	{
-			//		auto resolved_includes = device->getAssetManager()->getGLSLCompiler()->resolveIncludeDirectives(source, stage, origFilepath);
-			//		return createSpecializedShaderFromSource(reinterpret_cast<const char*>(resolved_includes->getContent()->getPointer()), stage);
-			//	};
-			//	core::smart_refctd_ptr<video::IGPUSpecializedShader> shaders[2] =
-			//	{
-			//		createSpecializedShaderFromSourceWithIncludes(vertexSource,asset::ISpecializedShader::ESS_VERTEX, "shader.vert"),
-			//		createSpecializedShaderFromSource(fragmentSource,asset::ISpecializedShader::ESS_FRAGMENT)
-			//	};
-			//	auto shadersPtr = reinterpret_cast<video::IGPUSpecializedShader**>(shaders);
-
-			//	asset::SVertexInputParams inputParams;
-			//	inputParams.enabledAttribFlags = 0b11u;
-			//	inputParams.enabledBindingFlags = 0b1u;
-			//	inputParams.attributes[0].binding = 0u;
-			//	inputParams.attributes[0].format = asset::EF_R32G32B32_SFLOAT;
-			//	inputParams.attributes[0].relativeOffset = offsetof(VertexStruct, Pos[0]);
-			//	inputParams.attributes[1].binding = 0u;
-			//	inputParams.attributes[1].format = asset::EF_R8G8_UNORM;
-			//	inputParams.attributes[1].relativeOffset = offsetof(VertexStruct, Col[0]);
-			//	inputParams.bindings[0].stride = sizeof(VertexStruct);
-			//	inputParams.bindings[0].inputRate = asset::EVIR_PER_VERTEX;
-
-			//	asset::SBlendParams blendParams; // defaults are sane
-
-			//	asset::SPrimitiveAssemblyParams assemblyParams = { asset::EPT_TRIANGLE_LIST,false,1u };
-
-			//	asset::SStencilOpParams defaultStencil;
-			//	asset::SRasterizationParams rasterParams;
-			//	rasterParams.faceCullingMode = asset::EFCM_NONE;
-			//	auto pipeline = driver->createRenderpassIndependentPipeline(nullptr, driver->createPipelineLayout(range, range + 1u, nullptr, nullptr, nullptr, nullptr),
-			//		shadersPtr, shadersPtr + sizeof(shaders) / sizeof(core::smart_refctd_ptr<video::IGPUSpecializedShader>),
-			//		inputParams, blendParams, assemblyParams, rasterParams);
-
-			//	asset::SBufferBinding<video::IGPUBuffer> bindings[video::IGPUMeshBuffer::MAX_ATTR_BUF_BINDING_COUNT];
-			//	bindings[0u] = { offsets[0],upStreamRef };
-			//	auto mb = core::make_smart_refctd_ptr<video::IGPUMeshBuffer>(std::move(pipeline), nullptr, bindings, asset::SBufferBinding<video::IGPUBuffer>{offsets[1], upStreamRef});
-			//	{
-			//		mb->setIndexType(asset::EIT_16BIT);
-			//		mb->setIndexCount(2 * 3 * 6);
-			//	}
-
-			//	driver->bindGraphicsPipeline(mb->getPipeline());
-			//	driver->pushConstants(mb->getPipeline()->getLayout(), asset::ISpecializedShader::ESS_VERTEX, 0u, sizeof(core::matrix4SIMD), mvp.pointer());
-			//	driver->drawMeshBuffer(mb.get());
-
-			//	upStreamBuff->multi_free(2u, (uint32_t*)&offsets, (uint32_t*)&sizes, driver->placeFence());
-			//}
-			//driver->endScene();
-		}
-	}
-};
diff --git a/old_to_refactor/03_GPU_Mesh/pipeline.groovy b/old_to_refactor/03_GPU_Mesh/pipeline.groovy
deleted file mode 100644
index b19625fa7..000000000
--- a/old_to_refactor/03_GPU_Mesh/pipeline.groovy
+++ /dev/null
@@ -1,50 +0,0 @@
-import org.DevshGraphicsProgramming.Agent
-import org.DevshGraphicsProgramming.BuilderInfo
-import org.DevshGraphicsProgramming.IBuilder
-
-class CGPUMeshBuilder extends IBuilder
-{
-	public CGPUMeshBuilder(Agent _agent, _info)
-	{
-		super(_agent, _info)
-	}
-	
-	@Override
-	public boolean prepare(Map axisMapping)
-	{
-		return true
-	}
-	
-	@Override
-  	public boolean build(Map axisMapping)
-	{
-		IBuilder.CONFIGURATION config = axisMapping.get("CONFIGURATION")
-		IBuilder.BUILD_TYPE buildType = axisMapping.get("BUILD_TYPE")
-		
-		def nameOfBuildDirectory = getNameOfBuildDirectory(buildType)
-		def nameOfConfig = getNameOfConfig(config)
-		
-		agent.execute("cmake --build ${info.rootProjectPath}/${nameOfBuildDirectory}/${info.targetProjectPathRelativeToRoot} --target ${info.targetBaseName} --config ${nameOfConfig} -j12 -v")
-		
-		return true
-	}
-	
-	@Override
-  	public boolean test(Map axisMapping)
-	{
-		return true
-	}
-	
-	@Override
-	public boolean install(Map axisMapping)
-	{
-		return true
-	}
-}
-
-def create(Agent _agent, _info)
-{
-	return new CGPUMeshBuilder(_agent, _info)
-}
-
-return this
\ No newline at end of file

From 76cedea98997bba010b738607214e17e64b105a5 Mon Sep 17 00:00:00 2001
From: devsh <devsh@devsh.eu>
Date: Mon, 21 Jul 2025 16:26:26 +0200
Subject: [PATCH 491/529] Need a different tutorial

---
 .../05_NablaTutorialExample/CMakeLists.txt    |   7 -
 .../config.json.template                      |  28 -
 .../05_NablaTutorialExample/main.cpp          | 593 ------------------
 .../05_NablaTutorialExample/pipeline.groovy   |  50 --
 4 files changed, 678 deletions(-)
 delete mode 100644 old_to_refactor/05_NablaTutorialExample/CMakeLists.txt
 delete mode 100644 old_to_refactor/05_NablaTutorialExample/config.json.template
 delete mode 100644 old_to_refactor/05_NablaTutorialExample/main.cpp
 delete mode 100644 old_to_refactor/05_NablaTutorialExample/pipeline.groovy

diff --git a/old_to_refactor/05_NablaTutorialExample/CMakeLists.txt b/old_to_refactor/05_NablaTutorialExample/CMakeLists.txt
deleted file mode 100644
index a476b6203..000000000
--- a/old_to_refactor/05_NablaTutorialExample/CMakeLists.txt
+++ /dev/null
@@ -1,7 +0,0 @@
-
-include(common RESULT_VARIABLE RES)
-if(NOT RES)
-	message(FATAL_ERROR "common.cmake not found. Should be in {repo_root}/cmake directory")
-endif()
-
-nbl_create_executable_project("" "" "" "" "${NBL_EXECUTABLE_PROJECT_CREATION_PCH_TARGET}")
\ No newline at end of file
diff --git a/old_to_refactor/05_NablaTutorialExample/config.json.template b/old_to_refactor/05_NablaTutorialExample/config.json.template
deleted file mode 100644
index f961745c1..000000000
--- a/old_to_refactor/05_NablaTutorialExample/config.json.template
+++ /dev/null
@@ -1,28 +0,0 @@
-{
-  "enableParallelBuild": true,
-  "threadsPerBuildProcess" : 2,
-  "isExecuted": false,
-  "scriptPath": "",
-  "cmake": {
-    "configurations": [ "Release", "Debug", "RelWithDebInfo" ],
-    "buildModes": [],
-    "requiredOptions": []
-  }, 
-  "profiles": [
-    {
-      "backend": "vulkan",
-      "platform": "windows",
-      "buildModes": [],
-      "runConfiguration": "Release",
-      "gpuArchitectures": []
-    }
-  ],
-  "dependencies": [],
-  "data": [
-    {
-      "dependencies": [],
-      "command": [""],
-      "outputs": []
-    }
-  ]
-}
\ No newline at end of file
diff --git a/old_to_refactor/05_NablaTutorialExample/main.cpp b/old_to_refactor/05_NablaTutorialExample/main.cpp
deleted file mode 100644
index abebb882c..000000000
--- a/old_to_refactor/05_NablaTutorialExample/main.cpp
+++ /dev/null
@@ -1,593 +0,0 @@
-// Copyright (C) 2018-2020 - DevSH Graphics Programming Sp. z O.O.
-// This file is part of the "Nabla Engine".
-// For conditions of distribution and use, see copyright notice in nabla.h
-
-#define _NBL_STATIC_LIB_
-#include <iostream>
-#include <cstdio>
-#include <nabla.h>
-
-#include "CCamera.hpp"
-#include "../common/CommonAPI.h"
-
-/*
-	General namespaces. Entire engine consists of those bellow.
-*/
-
-using namespace nbl;
-using namespace asset;
-using namespace video;
-using namespace core;
-
-/*
-	Uncomment for more detailed logging
-*/
-
-// #define NBL_MORE_LOGS
-
-class NablaTutorialExampleApp : public ApplicationBase
-{
-	/*
-		 SIrrlichtCreationParameters holds some specific initialization information
-		 about driver being used, size of window, stencil buffer or depth buffer.
-		 Used to create a device.
-	*/
-
-	constexpr static uint32_t WIN_W = 1280;
-	constexpr static uint32_t WIN_H = 720;
-	constexpr static uint32_t SC_IMG_COUNT = 3u;
-	constexpr static uint32_t FRAMES_IN_FLIGHT = 5u;
-	constexpr static uint64_t MAX_TIMEOUT = 99999999999999ull;
-	constexpr static size_t NBL_FRAMES_TO_AVERAGE = 100ull;
-	
-	static_assert(FRAMES_IN_FLIGHT > SC_IMG_COUNT);
-
-public:
-	/*
-		Most important objects to manage literally whole stuff are bellow.
-		By their usage you can create for example GPU objects, load or write
-		assets or manage objects on a scene.
-	*/
-	
-	nbl::core::smart_refctd_ptr<nbl::ui::IWindowManager> windowManager;
-	nbl::core::smart_refctd_ptr<nbl::ui::IWindow> window;
-	nbl::core::smart_refctd_ptr<CommonAPI::CommonAPIEventCallback> windowCb;
-	nbl::core::smart_refctd_ptr<nbl::video::IAPIConnection> apiConnection;
-	nbl::core::smart_refctd_ptr<nbl::video::ISurface> surface;
-	nbl::core::smart_refctd_ptr<nbl::video::IUtilities> utilities;
-	nbl::core::smart_refctd_ptr<nbl::video::ILogicalDevice> logicalDevice;
-	nbl::video::IPhysicalDevice* physicalDevice;
-	std::array<nbl::video::IGPUQueue*, CommonAPI::InitOutput::MaxQueuesCount> queues = { nullptr, nullptr, nullptr, nullptr };
-	nbl::core::smart_refctd_ptr<nbl::video::ISwapchain> swapchain;
-	nbl::core::smart_refctd_ptr<nbl::video::IGPURenderpass> renderpass;
-	nbl::core::smart_refctd_dynamic_array<nbl::core::smart_refctd_ptr<nbl::video::IGPUFramebuffer>> fbo;
-	std::array<std::array<nbl::core::smart_refctd_ptr<nbl::video::IGPUCommandPool>, CommonAPI::InitOutput::MaxFramesInFlight>, CommonAPI::InitOutput::MaxQueuesCount> commandPools; // TODO: Multibuffer and reset the commandpools
-	nbl::core::smart_refctd_ptr<nbl::system::ISystem> system;
-	nbl::core::smart_refctd_ptr<nbl::asset::IAssetManager> assetManager;
-	nbl::video::IGPUObjectFromAssetConverter::SParams cpu2gpuParams;
-	nbl::core::smart_refctd_ptr<nbl::system::ILogger> logger;
-	nbl::core::smart_refctd_ptr<CommonAPI::InputSystem> inputSystem;
-	
-	nbl::core::smart_refctd_ptr<video::IGPUFence> gpuTransferFence;
-	nbl::core::smart_refctd_ptr<video::IGPUFence> gpuComputeFence;
-	nbl::video::IGPUObjectFromAssetConverter cpu2gpu;
-	
-	core::smart_refctd_ptr<video::IGPUMeshBuffer> gpuMeshBuffer;
-	core::smart_refctd_ptr<IGPURenderpassIndependentPipeline> gpuRenderpassIndependentPipeline;
-	core::smart_refctd_ptr<IGPUBuffer> gpuubo;
-	core::smart_refctd_ptr<IGPUDescriptorSet> gpuDescriptorSet1;
-	core::smart_refctd_ptr<IGPUDescriptorSet> gpuDescriptorSet3;
-	core::smart_refctd_ptr<IGPUGraphicsPipeline> gpuGraphicsPipeline;
-	
-	core::smart_refctd_ptr<video::IGPUFence> frameComplete[FRAMES_IN_FLIGHT] = { nullptr };
-	core::smart_refctd_ptr<video::IGPUSemaphore> imageAcquire[FRAMES_IN_FLIGHT] = { nullptr };
-	core::smart_refctd_ptr<video::IGPUSemaphore> renderFinished[FRAMES_IN_FLIGHT] = { nullptr };
-	core::smart_refctd_ptr<video::IGPUCommandBuffer> commandBuffers[FRAMES_IN_FLIGHT];
-
-	nbl::video::ISwapchain::SCreationParams m_swapchainCreationParams;
-
-	CommonAPI::InputSystem::ChannelReader<ui::IMouseEventChannel> mouse;
-	CommonAPI::InputSystem::ChannelReader<ui::IKeyboardEventChannel> keyboard;
-	Camera camera = Camera(vectorSIMDf(0, 0, 0), vectorSIMDf(0, 0, 0), matrix4SIMD());
-	
-	uint32_t ds1UboBinding = 0;
-	int resourceIx;
-	uint32_t acquiredNextFBO = {};
-	std::chrono::system_clock::time_point lastTime;
-	bool frameDataFilled = false;
-	size_t frame_count = 0ull;
-	double time_sum = 0;
-	double dtList[NBL_FRAMES_TO_AVERAGE] = {};
-	
-	void setWindow(core::smart_refctd_ptr<nbl::ui::IWindow>&& wnd) override
-	{
-		window = std::move(wnd);
-	}
-	void setSystem(core::smart_refctd_ptr<nbl::system::ISystem>&& s) override
-	{
-		system = std::move(s);
-	}
-	nbl::ui::IWindow* getWindow() override
-	{
-		return window.get();
-	}
-	video::IAPIConnection* getAPIConnection() override
-	{
-		return apiConnection.get();
-	}
-	video::ILogicalDevice* getLogicalDevice()  override
-	{
-		return logicalDevice.get();
-	}
-	video::IGPURenderpass* getRenderpass() override
-	{
-		return renderpass.get();
-	}
-	void setSurface(core::smart_refctd_ptr<video::ISurface>&& s) override
-	{
-		surface = std::move(s);
-	}
-	void setFBOs(std::vector<core::smart_refctd_ptr<video::IGPUFramebuffer>>& f) override
-	{
-		for (int i = 0; i < f.size(); i++)
-		{
-			fbo->begin()[i] = core::smart_refctd_ptr(f[i]);
-		}
-	}
-	void setSwapchain(core::smart_refctd_ptr<video::ISwapchain>&& s) override
-	{
-		swapchain = std::move(s);
-	}
-	uint32_t getSwapchainImageCount() override
-	{
-		return swapchain->getImageCount();
-	}
-	virtual nbl::asset::E_FORMAT getDepthFormat() override
-	{
-		return nbl::asset::EF_D32_SFLOAT;
-	}
-
-	APP_CONSTRUCTOR(NablaTutorialExampleApp)
-
-	void onAppInitialized_impl() override
-	{
-		const auto swapchainImageUsage = static_cast<asset::IImage::E_USAGE_FLAGS>(asset::IImage::EUF_COLOR_ATTACHMENT_BIT);
-		CommonAPI::InitParams initParams;
-		initParams.window = core::smart_refctd_ptr(window);
-		initParams.apiType = video::EAT_VULKAN;
-		initParams.appName = { _NBL_APP_NAME_ };
-		initParams.framesInFlight = FRAMES_IN_FLIGHT;
-		initParams.windowWidth = WIN_W;
-		initParams.windowHeight = WIN_H;
-		initParams.swapchainImageCount = SC_IMG_COUNT;
-		initParams.swapchainImageUsage = swapchainImageUsage;
-		initParams.depthFormat = nbl::asset::EF_D32_SFLOAT;
-		auto initOutput = CommonAPI::InitWithDefaultExt(std::move(initParams));
-
-		window = std::move(initParams.window);
-		windowCb = std::move(initParams.windowCb);
-		apiConnection = std::move(initOutput.apiConnection);
-		surface = std::move(initOutput.surface);
-		utilities = std::move(initOutput.utilities);
-		logicalDevice = std::move(initOutput.logicalDevice);
-		physicalDevice = initOutput.physicalDevice;
-		queues = std::move(initOutput.queues);
-		renderpass = std::move(initOutput.renderToSwapchainRenderpass);
-		commandPools = std::move(initOutput.commandPools);
-		system = std::move(initOutput.system);
-		assetManager = std::move(initOutput.assetManager);
-		cpu2gpuParams = std::move(initOutput.cpu2gpuParams);
-		logger = std::move(initOutput.logger);
-		inputSystem = std::move(initOutput.inputSystem);
-		m_swapchainCreationParams = std::move(initOutput.swapchainCreationParams);
-
-		CommonAPI::createSwapchain(std::move(logicalDevice), m_swapchainCreationParams, WIN_W, WIN_H, swapchain);
-		assert(swapchain);
-		fbo = CommonAPI::createFBOWithSwapchainImages(
-			swapchain->getImageCount(), WIN_W, WIN_H,
-			logicalDevice, swapchain, renderpass,
-			nbl::asset::EF_D32_SFLOAT
-		);
-
-		gpuTransferFence = logicalDevice->createFence(static_cast<video::IGPUFence::E_CREATE_FLAGS>(0));
-		gpuComputeFence = logicalDevice->createFence(static_cast<video::IGPUFence::E_CREATE_FLAGS>(0));
-
-		/*
-			Helpfull class for managing basic geometry objects.
-			Thanks to it you can get half filled pipeline for your
-			geometries such as cubes, cones or spheres.
-		*/
-
-		auto geometryCreator = assetManager->getGeometryCreator();
-		auto rectangleGeometry = geometryCreator->createRectangleMesh(nbl::core::vector2df_SIMD(1.5, 3));
-
-		/*
-		Loading an asset bundle. You can specify some flags
-		and parameters to have an impact on extraordinary
-		tasks while loading for example.
-		*/
-
-		asset::IAssetLoader::SAssetLoadParams loadingParams;
-		auto images_bundle = assetManager->getAsset("../../media/color_space_test/R8G8B8A8_1.png", loadingParams);
-		assert(!images_bundle.getContents().empty());
-		auto image = images_bundle.getContents().begin()[0];
-
-		/*
-		By default an image that comes out of an image loader will only have the TRANSFER_DST usage flag.
-		We need to add more usages, as only we know what we'll do with the image farther along in the pipeline.
-		*/
-		auto image_raw = static_cast<asset::ICPUImage*>(image.get());
-		image_raw->addImageUsageFlags(asset::IImage::EUF_SAMPLED_BIT);
-
-		/*
-			Specifing gpu image view parameters to create a gpu
-			image view through the driver.
-		*/
-
-		cpu2gpuParams.beginCommandBuffers();
-		auto gpuImage = cpu2gpu.getGPUObjectsFromAssets(&image_raw, &image_raw + 1, cpu2gpuParams)->front();
-		cpu2gpuParams.waitForCreationToComplete();
-		auto& gpuParams = gpuImage->getCreationParameters();
-
-		IImageView<IGPUImage>::SCreationParams gpuImageViewParams = {};
-		// Compute mipmap creation in Asset Converter tends to create some extra raw UINT views with STORAGE of the original image,
-		// so we need to declare that we won't be using STORAGE on this view or we wouldn't be able to use the SRGB format for it
-		gpuImageViewParams.subUsages = IGPUImage::EUF_SAMPLED_BIT;
-		gpuImageViewParams.image = gpuImage;
-		gpuImageViewParams.viewType = IGPUImageView::ET_2D;
-		gpuImageViewParams.format = gpuParams.format;
-		auto gpuImageView = logicalDevice->createImageView(std::move(gpuImageViewParams));
-
-		/*
-			Specifying cache key to default exsisting cached asset bundle
-			and specifying it's size where end is determined by
-			static_cast<IAsset::E_TYPE>(0u)
-		*/
-
-		const IAsset::E_TYPE types[]{ IAsset::E_TYPE::ET_SPECIALIZED_SHADER, IAsset::E_TYPE::ET_SPECIALIZED_SHADER, static_cast<IAsset::E_TYPE>(0u) };
-
-		auto cpuVertexShader = core::smart_refctd_ptr_static_cast<ICPUSpecializedShader>(assetManager->findAssets("nbl/builtin/material/lambertian/singletexture/specialized_shader.vert", types)->front().getContents().begin()[0]);
-		auto cpuFragmentShader = core::smart_refctd_ptr_static_cast<ICPUSpecializedShader>(assetManager->findAssets("nbl/builtin/material/lambertian/singletexture/specialized_shader.frag", types)->front().getContents().begin()[0]);
-
-		cpu2gpuParams.beginCommandBuffers();
-		auto gpuVertexShader = cpu2gpu.getGPUObjectsFromAssets(&cpuVertexShader.get(), &cpuVertexShader.get() + 1, cpu2gpuParams)->front();
-		auto gpuFragmentShader = cpu2gpu.getGPUObjectsFromAssets(&cpuFragmentShader.get(), &cpuFragmentShader.get() + 1, cpu2gpuParams)->front();
-		cpu2gpuParams.waitForCreationToComplete();
-		std::array<IGPUSpecializedShader*, 2> gpuShaders = { gpuVertexShader.get(), gpuFragmentShader.get() };
-
-		size_t ds0SamplerBinding = 0, ds1UboBinding = 0;
-		{
-			/*
-				SBinding for the texture (sampler).
-			*/
-
-			IGPUDescriptorSetLayout::SBinding gpuSamplerBinding;
-			gpuSamplerBinding.binding = ds0SamplerBinding;
-			gpuSamplerBinding.type = asset::IDescriptor::E_TYPE::ET_COMBINED_IMAGE_SAMPLER;
-			gpuSamplerBinding.count = 1u;
-			gpuSamplerBinding.stageFlags = static_cast<IGPUShader::E_SHADER_STAGE>(IGPUShader::ESS_FRAGMENT);
-			gpuSamplerBinding.samplers = nullptr;
-
-			/*
-				SBinding for UBO - basic view parameters.
-			*/
-
-			IGPUDescriptorSetLayout::SBinding gpuUboBinding;
-			gpuUboBinding.count = 1u;
-			gpuUboBinding.binding = ds1UboBinding;
-			gpuUboBinding.stageFlags = static_cast<asset::ICPUShader::E_SHADER_STAGE>(asset::ICPUShader::ESS_VERTEX | asset::ICPUShader::ESS_FRAGMENT);
-			gpuUboBinding.type = asset::IDescriptor::E_TYPE::ET_UNIFORM_BUFFER;
-
-			/*
-				Creating specific descriptor set layouts from specialized bindings.
-				Those layouts needs to attached to pipeline layout if required by user.
-				IrrlichtBaW provides 4 places for descriptor set layout usage.
-			*/
-
-			auto gpuDs1Layout = logicalDevice->createDescriptorSetLayout(&gpuUboBinding, &gpuUboBinding + 1);
-			auto gpuDs3Layout = logicalDevice->createDescriptorSetLayout(&gpuSamplerBinding, &gpuSamplerBinding + 1);
-
-			/*
-				Creating gpu UBO with appropiate size.
-
-				We know ahead of time that `SBasicViewParameters` struct is the expected structure of the only UBO block in the descriptor set nr. 1 of the shader.
-			*/
-			{
-				IGPUBuffer::SCreationParams creationParams = {};
-				creationParams.usage = core::bitflag(asset::IBuffer::EUF_UNIFORM_BUFFER_BIT)|asset::IBuffer::EUF_TRANSFER_DST_BIT|asset::IBuffer::EUF_INLINE_UPDATE_VIA_CMDBUF;
-				creationParams.size = sizeof(SBasicViewParameters);
-				gpuubo = logicalDevice->createBuffer(std::move(creationParams));
-
-				IDeviceMemoryBacked::SDeviceMemoryRequirements memReq = gpuubo->getMemoryReqs();
-				memReq.memoryTypeBits &= physicalDevice->getDeviceLocalMemoryTypeBits();
-				logicalDevice->allocate(memReq, gpuubo.get());
-			}
-
-			/*
-				Creating descriptor sets - texture (sampler) and basic view parameters (UBO).
-				Specifying info and write parameters for updating certain descriptor set to the driver.
-
-				We know ahead of time that `SBasicViewParameters` struct is the expected structure of the only UBO block in the descriptor set nr. 1 of the shader.
-			*/
-
-			nbl::core::smart_refctd_ptr<video::IDescriptorPool> descriptorPool = nullptr;
-			{
-				constexpr uint32_t DescriptorSetCount = 2u;
-
-				video::IDescriptorPool::SCreateInfo createInfo = {};
-				createInfo.maxSets = DescriptorSetCount;
-				createInfo.maxDescriptorCount[static_cast<uint32_t>(asset::IDescriptor::E_TYPE::ET_UNIFORM_BUFFER)] = 1; // DS1 uses one UBO descriptor.
-				createInfo.maxDescriptorCount[static_cast<uint32_t>(asset::IDescriptor::E_TYPE::ET_COMBINED_IMAGE_SAMPLER)] = 1; // DS3 uses one combined image sampler descriptor.
-
-				descriptorPool = logicalDevice->createDescriptorPool(std::move(createInfo));
-			}
-
-			gpuDescriptorSet3 = descriptorPool->createDescriptorSet(gpuDs3Layout);
-			{
-				video::IGPUDescriptorSet::SWriteDescriptorSet write;
-				write.dstSet = gpuDescriptorSet3.get();
-				write.binding = ds0SamplerBinding;
-				write.count = 1u;
-				write.arrayElement = 0u;
-				write.descriptorType = asset::IDescriptor::E_TYPE::ET_COMBINED_IMAGE_SAMPLER;
-				IGPUDescriptorSet::SDescriptorInfo info;
-				{
-					info.desc = std::move(gpuImageView);
-					ISampler::SParams samplerParams = { ISampler::ETC_CLAMP_TO_EDGE,ISampler::ETC_CLAMP_TO_EDGE,ISampler::ETC_CLAMP_TO_EDGE,ISampler::ETBC_FLOAT_OPAQUE_BLACK,ISampler::ETF_LINEAR,ISampler::ETF_LINEAR,ISampler::ESMM_LINEAR,0u,false,ECO_ALWAYS };
-					info.info.image = { logicalDevice->createSampler(samplerParams),IGPUImage::EL_SHADER_READ_ONLY_OPTIMAL };
-				}
-				write.info = &info;
-				logicalDevice->updateDescriptorSets(1u, &write, 0u, nullptr);
-			}
-
-			gpuDescriptorSet1 = descriptorPool->createDescriptorSet(gpuDs1Layout);
-			{
-				video::IGPUDescriptorSet::SWriteDescriptorSet write;
-				write.dstSet = gpuDescriptorSet1.get();
-				write.binding = ds1UboBinding;
-				write.count = 1u;
-				write.arrayElement = 0u;
-				write.descriptorType = asset::IDescriptor::E_TYPE::ET_UNIFORM_BUFFER;
-				video::IGPUDescriptorSet::SDescriptorInfo info;
-				{
-					info.desc = gpuubo;
-					info.info.buffer.offset = 0ull;
-					info.info.buffer.size = sizeof(SBasicViewParameters);
-				}
-				write.info = &info;
-				logicalDevice->updateDescriptorSets(1u, &write, 0u, nullptr);
-			}
-
-			auto gpuPipelineLayout = logicalDevice->createPipelineLayout(nullptr, nullptr, nullptr, std::move(gpuDs1Layout), nullptr, std::move(gpuDs3Layout));
-
-			/*
-				Preparing required pipeline parameters and filling choosen one.
-				Note that some of them are returned from geometry creator according
-				to what I mentioned in returning half pipeline parameters.
-			*/
-
-			asset::SBlendParams blendParams;
-			asset::SRasterizationParams rasterParams;
-			rasterParams.faceCullingMode = asset::EFCM_NONE;
-
-			/*
-				Creating gpu pipeline with it's pipeline layout and specilized parameters.
-				Attaching vertex shader and fragment shaders.
-			*/
-
-			gpuRenderpassIndependentPipeline = logicalDevice->createRenderpassIndependentPipeline(nullptr, std::move(gpuPipelineLayout), gpuShaders.data(), gpuShaders.data() + gpuShaders.size(), rectangleGeometry.inputParams, blendParams, rectangleGeometry.assemblyParams, rasterParams);
-
-			nbl::video::IGPUGraphicsPipeline::SCreationParams graphicsPipelineParams;
-			graphicsPipelineParams.renderpassIndependent = core::smart_refctd_ptr<nbl::video::IGPURenderpassIndependentPipeline>(gpuRenderpassIndependentPipeline.get());
-			graphicsPipelineParams.renderpass = core::smart_refctd_ptr(renderpass);
-			gpuGraphicsPipeline = logicalDevice->createGraphicsPipeline(nullptr, std::move(graphicsPipelineParams));
-
-			core::vectorSIMDf cameraPosition(-5, 0, 0);
-			matrix4SIMD projectionMatrix = matrix4SIMD::buildProjectionMatrixPerspectiveFovLH(core::radians(60.0f), float(WIN_W) / WIN_H, 0.01, 1000);
-			camera = Camera(cameraPosition, core::vectorSIMDf(0, 0, 0), projectionMatrix, 10.f, 1.f);
-
-			/*
-				Creating gpu meshbuffer from parameters fetched from geometry creator return value.
-			*/
-
-			constexpr auto MAX_ATTR_BUF_BINDING_COUNT = video::IGPUMeshBuffer::MAX_ATTR_BUF_BINDING_COUNT;
-			constexpr auto MAX_DATA_BUFFERS = MAX_ATTR_BUF_BINDING_COUNT + 1;
-			core::vector<asset::ICPUBuffer*> cpubuffers;
-			cpubuffers.reserve(MAX_DATA_BUFFERS);
-			for (auto i = 0; i < MAX_ATTR_BUF_BINDING_COUNT; i++)
-			{
-				auto buf = rectangleGeometry.bindings[i].buffer.get();
-				if (buf)
-					cpubuffers.push_back(buf);
-			}
-			auto cpuindexbuffer = rectangleGeometry.indexBuffer.buffer.get();
-			if (cpuindexbuffer)
-				cpubuffers.push_back(cpuindexbuffer);
-
-			cpu2gpuParams.beginCommandBuffers();
-			auto gpubuffers = cpu2gpu.getGPUObjectsFromAssets(cpubuffers.data(), cpubuffers.data() + cpubuffers.size(), cpu2gpuParams);
-			cpu2gpuParams.waitForCreationToComplete();
-
-			asset::SBufferBinding<video::IGPUBuffer> bindings[MAX_DATA_BUFFERS];
-			for (auto i = 0, j = 0; i < MAX_ATTR_BUF_BINDING_COUNT; i++)
-			{
-				if (!rectangleGeometry.bindings[i].buffer)
-					continue;
-				auto buffPair = gpubuffers->operator[](j++);
-				bindings[i].offset = buffPair->getOffset();
-				bindings[i].buffer = core::smart_refctd_ptr<video::IGPUBuffer>(buffPair->getBuffer());
-			}
-			if (cpuindexbuffer)
-			{
-				auto buffPair = gpubuffers->back();
-				bindings[MAX_ATTR_BUF_BINDING_COUNT].offset = buffPair->getOffset();
-				bindings[MAX_ATTR_BUF_BINDING_COUNT].buffer = core::smart_refctd_ptr<video::IGPUBuffer>(buffPair->getBuffer());
-			}
-
-			gpuMeshBuffer = core::make_smart_refctd_ptr<video::IGPUMeshBuffer>(core::smart_refctd_ptr(gpuRenderpassIndependentPipeline), nullptr, bindings, std::move(bindings[MAX_ATTR_BUF_BINDING_COUNT]));
-			{
-				gpuMeshBuffer->setIndexType(rectangleGeometry.indexType);
-				gpuMeshBuffer->setIndexCount(rectangleGeometry.indexCount);
-				gpuMeshBuffer->setBoundingBox(rectangleGeometry.bbox);
-			}
-		}
-
-		const auto& graphicsCommandPools = commandPools[CommonAPI::InitOutput::EQT_GRAPHICS];
-		for (uint32_t i = 0u; i < FRAMES_IN_FLIGHT; i++)
-		{
-			logicalDevice->createCommandBuffers(graphicsCommandPools[i].get(), video::IGPUCommandBuffer::EL_PRIMARY, 1, commandBuffers+i);
-			imageAcquire[i] = logicalDevice->createSemaphore();
-			renderFinished[i] = logicalDevice->createSemaphore();
-		}
-	}
-
-	/*
-		Hot loop for rendering a scene.
-	*/
-
-	void workLoopBody() override
-	{
-		++resourceIx;
-		if (resourceIx >= FRAMES_IN_FLIGHT)
-			resourceIx = 0;
-
-		auto& commandBuffer = commandBuffers[resourceIx];
-		auto& fence = frameComplete[resourceIx];
-
-		if (fence)
-		{
-			logicalDevice->blockForFences(1u,&fence.get());
-			logicalDevice->resetFences(1u,&fence.get());
-		}
-		else
-			fence = logicalDevice->createFence(static_cast<video::IGPUFence::E_CREATE_FLAGS>(0));
-
-		auto renderStart = std::chrono::system_clock::now();
-		const auto renderDt = std::chrono::duration_cast<std::chrono::milliseconds>(renderStart - lastTime).count();
-		lastTime = renderStart;
-		{ // Calculate Simple Moving Average for FrameTime
-			time_sum -= dtList[frame_count];
-			time_sum += renderDt;
-			dtList[frame_count] = renderDt;
-			frame_count++;
-			if (frame_count >= NBL_FRAMES_TO_AVERAGE)
-			{
-				frameDataFilled = true;
-				frame_count = 0;
-			}
-
-		}
-		const double averageFrameTime = frameDataFilled ? (time_sum / (double)NBL_FRAMES_TO_AVERAGE) : (time_sum / frame_count);
-
-#ifdef NBL_MORE_LOGS
-		logger->log("renderDt = %f ------ averageFrameTime = %f", system::ILogger::ELL_INFO, renderDt, averageFrameTime);
-#endif // NBL_MORE_LOGS
-
-		auto averageFrameTimeDuration = std::chrono::duration<double, std::milli>(averageFrameTime);
-		auto nextPresentationTime = renderStart + averageFrameTimeDuration;
-		auto nextPresentationTimeStamp = std::chrono::duration_cast<std::chrono::microseconds>(nextPresentationTime.time_since_epoch());
-
-		inputSystem->getDefaultMouse(&mouse);
-		inputSystem->getDefaultKeyboard(&keyboard);
-
-		camera.beginInputProcessing(nextPresentationTimeStamp);
-		mouse.consumeEvents([&](const ui::IMouseEventChannel::range_t& events) -> void { camera.mouseProcess(events); }, logger.get());
-		keyboard.consumeEvents([&](const ui::IKeyboardEventChannel::range_t& events) -> void { camera.keyboardProcess(events); }, logger.get());
-		camera.endInputProcessing(nextPresentationTimeStamp);
-
-		const auto& viewMatrix = camera.getViewMatrix();
-		const auto& viewProjectionMatrix = camera.getConcatenatedMatrix();
-
-		commandBuffer->reset(nbl::video::IGPUCommandBuffer::ERF_RELEASE_RESOURCES_BIT);
-		commandBuffer->begin(IGPUCommandBuffer::EU_NONE);
-
-		asset::SViewport viewport;
-		viewport.minDepth = 1.f;
-		viewport.maxDepth = 0.f;
-		viewport.x = 0u;
-		viewport.y = 0u;
-		viewport.width = WIN_W;
-		viewport.height = WIN_H;
-		commandBuffer->setViewport(0u, 1u, &viewport);
-		VkRect2D scissor;
-		scissor.offset = {0u,0u};
-		scissor.extent = {WIN_W,WIN_H};
-		commandBuffer->setScissor(0u,1u,&scissor);
-
-		const auto viewProjection = camera.getConcatenatedMatrix();
-		core::matrix3x4SIMD modelMatrix;
-		modelMatrix.setRotation(nbl::core::quaternion(0, 1, 0));
-
-		auto mv = core::concatenateBFollowedByA(camera.getViewMatrix(), modelMatrix);
-		auto mvp = core::concatenateBFollowedByA(viewProjection, modelMatrix);
-		core::matrix3x4SIMD normalMat;
-		mv.getSub3x3InverseTranspose(normalMat);
-
-		/*
-			Updating UBO for basic view parameters and sending
-			updated data to staging buffer that will redirect
-			the data to graphics card - to vertex shader.
-		*/
-
-		SBasicViewParameters uboData;
-		memcpy(uboData.MV, mv.pointer(), sizeof(mv));
-		memcpy(uboData.MVP, mvp.pointer(), sizeof(mvp));
-		memcpy(uboData.NormalMat, normalMat.pointer(), sizeof(normalMat));
-		commandBuffer->updateBuffer(gpuubo.get(), 0ull, gpuubo->getSize(), &uboData);
-
-		swapchain->acquireNextImage(MAX_TIMEOUT, imageAcquire[resourceIx].get(), nullptr, &acquiredNextFBO);
-
-		nbl::video::IGPUCommandBuffer::SRenderpassBeginInfo beginInfo;
-		{
-			VkRect2D area;
-			area.offset = { 0,0 };
-			area.extent = { WIN_W, WIN_H };
-			asset::SClearValue clear[2] = {};
-			clear[0].color.float32[0] = 0.f;
-			clear[0].color.float32[1] = 0.f;
-			clear[0].color.float32[2] = 0.f;
-			clear[0].color.float32[3] = 1.f;
-			clear[1].depthStencil.depth = 0.f;
-
-			beginInfo.clearValueCount = 2u;
-			beginInfo.framebuffer = fbo->begin()[acquiredNextFBO];
-			beginInfo.renderpass = renderpass;
-			beginInfo.renderArea = area;
-			beginInfo.clearValues = clear;
-		}
-		commandBuffer->beginRenderPass(&beginInfo, nbl::asset::ESC_INLINE);
-
-		/*
-			Binding the most important objects needed to
-			render anything on the screen with textures:
-
-			- gpu pipeline
-			- gpu descriptor sets
-		*/
-
-		commandBuffer->bindGraphicsPipeline(gpuGraphicsPipeline.get());
-		commandBuffer->bindDescriptorSets(asset::EPBP_GRAPHICS, gpuRenderpassIndependentPipeline->getLayout(), 1u, 1u, &gpuDescriptorSet1.get(), 0u);
-		commandBuffer->bindDescriptorSets(asset::EPBP_GRAPHICS, gpuRenderpassIndependentPipeline->getLayout(), 3u, 1u, &gpuDescriptorSet3.get(), 0u);
-
-		/*
-			Drawing a mesh (created rectangle) with it's gpu mesh buffer usage.
-		*/
-
-		commandBuffer->drawMeshBuffer(gpuMeshBuffer.get());
-
-		commandBuffer->endRenderPass();
-		commandBuffer->end();
-
-		CommonAPI::Submit(logicalDevice.get(), commandBuffer.get(), queues[CommonAPI::InitOutput::EQT_GRAPHICS], imageAcquire[resourceIx].get(), renderFinished[resourceIx].get(), fence.get());
-		CommonAPI::Present(logicalDevice.get(), swapchain.get(), queues[CommonAPI::InitOutput::EQT_GRAPHICS], renderFinished[resourceIx].get(), acquiredNextFBO);
-	}
-
-	bool keepRunning() override
-	{
-		return windowCb->isWindowOpen();
-	}
-
-	void onAppTerminated_impl() override { logicalDevice->waitIdle(); }
-};
-
-NBL_COMMON_API_MAIN(NablaTutorialExampleApp)
diff --git a/old_to_refactor/05_NablaTutorialExample/pipeline.groovy b/old_to_refactor/05_NablaTutorialExample/pipeline.groovy
deleted file mode 100644
index 31cadf9e9..000000000
--- a/old_to_refactor/05_NablaTutorialExample/pipeline.groovy
+++ /dev/null
@@ -1,50 +0,0 @@
-import org.DevshGraphicsProgramming.Agent
-import org.DevshGraphicsProgramming.BuilderInfo
-import org.DevshGraphicsProgramming.IBuilder
-
-class CNablaTutorialExampleBuilder extends IBuilder
-{
-	public CNablaTutorialExampleBuilder(Agent _agent, _info)
-	{
-		super(_agent, _info)
-	}
-	
-	@Override
-	public boolean prepare(Map axisMapping)
-	{
-		return true
-	}
-	
-	@Override
-  	public boolean build(Map axisMapping)
-	{
-		IBuilder.CONFIGURATION config = axisMapping.get("CONFIGURATION")
-		IBuilder.BUILD_TYPE buildType = axisMapping.get("BUILD_TYPE")
-		
-		def nameOfBuildDirectory = getNameOfBuildDirectory(buildType)
-		def nameOfConfig = getNameOfConfig(config)
-		
-		agent.execute("cmake --build ${info.rootProjectPath}/${nameOfBuildDirectory}/${info.targetProjectPathRelativeToRoot} --target ${info.targetBaseName} --config ${nameOfConfig} -j12 -v")
-		
-		return true
-	}
-	
-	@Override
-  	public boolean test(Map axisMapping)
-	{
-		return true
-	}
-	
-	@Override
-	public boolean install(Map axisMapping)
-	{
-		return true
-	}
-}
-
-def create(Agent _agent, _info)
-{
-	return new CNablaTutorialExampleBuilder(_agent, _info)
-}
-
-return this
\ No newline at end of file

From 251c070d6e22115918de4d94167c0ba269cfbe0f Mon Sep 17 00:00:00 2001
From: devsh <devsh@devsh.eu>
Date: Mon, 21 Jul 2025 16:34:17 +0200
Subject: [PATCH 492/529] make CI always test ex 12, remove ex 29 which is
 duplicate of old 12

---
 29_MeshLoaders/CMakeLists.txt       |   37 -
 29_MeshLoaders/config.json.template |   28 -
 29_MeshLoaders/main.cpp             | 1404 ---------------------------
 29_MeshLoaders/pipeline.groovy      |   50 -
 CMakeLists.txt                      |    4 +-
 5 files changed, 3 insertions(+), 1520 deletions(-)
 delete mode 100644 29_MeshLoaders/CMakeLists.txt
 delete mode 100644 29_MeshLoaders/config.json.template
 delete mode 100644 29_MeshLoaders/main.cpp
 delete mode 100644 29_MeshLoaders/pipeline.groovy

diff --git a/29_MeshLoaders/CMakeLists.txt b/29_MeshLoaders/CMakeLists.txt
deleted file mode 100644
index 07b0fd396..000000000
--- a/29_MeshLoaders/CMakeLists.txt
+++ /dev/null
@@ -1,37 +0,0 @@
-include(common RESULT_VARIABLE RES)
-if(NOT RES)
-        message(FATAL_ERROR "common.cmake not found. Should be in {repo_root}/cmake directory")
-endif()
-
-if(NBL_BUILD_IMGUI)
-	set(NBL_INCLUDE_SERACH_DIRECTORIES
-		"${CMAKE_CURRENT_SOURCE_DIR}/include"
-	)
-
-	list(APPEND NBL_LIBRARIES 
-		imtestengine
-		"${NBL_EXT_IMGUI_UI_LIB}"
-	)
-
-	nbl_create_executable_project("" "" "${NBL_INCLUDE_SERACH_DIRECTORIES}" "${NBL_LIBRARIES}" "${NBL_EXECUTABLE_PROJECT_CREATION_PCH_TARGET}")
-
-	if(NBL_EMBED_BUILTIN_RESOURCES)
-		set(_BR_TARGET_ ${EXECUTABLE_NAME}_builtinResourceData)
-		set(RESOURCE_DIR "app_resources")
-
-		get_filename_component(_SEARCH_DIRECTORIES_ "${CMAKE_CURRENT_SOURCE_DIR}" ABSOLUTE)
-		get_filename_component(_OUTPUT_DIRECTORY_SOURCE_ "${CMAKE_CURRENT_BINARY_DIR}/src" ABSOLUTE)
-		get_filename_component(_OUTPUT_DIRECTORY_HEADER_ "${CMAKE_CURRENT_BINARY_DIR}/include" ABSOLUTE)
-
-		file(GLOB_RECURSE BUILTIN_RESOURCE_FILES RELATIVE "${CMAKE_CURRENT_SOURCE_DIR}/${RESOURCE_DIR}" "${CMAKE_CURRENT_SOURCE_DIR}/${RESOURCE_DIR}/*")
-		foreach(RES_FILE ${BUILTIN_RESOURCE_FILES})
-			LIST_BUILTIN_RESOURCE(RESOURCES_TO_EMBED "${RES_FILE}")
-		endforeach()
-
-		ADD_CUSTOM_BUILTIN_RESOURCES(${_BR_TARGET_} RESOURCES_TO_EMBED "${_SEARCH_DIRECTORIES_}" "${RESOURCE_DIR}" "nbl::this_example::builtin" "${_OUTPUT_DIRECTORY_HEADER_}" "${_OUTPUT_DIRECTORY_SOURCE_}")
-
-		LINK_BUILTIN_RESOURCES_TO_TARGET(${EXECUTABLE_NAME} ${_BR_TARGET_})
-	endif()
-endif()
-
-
diff --git a/29_MeshLoaders/config.json.template b/29_MeshLoaders/config.json.template
deleted file mode 100644
index 2c42b001d..000000000
--- a/29_MeshLoaders/config.json.template
+++ /dev/null
@@ -1,28 +0,0 @@
-{
-  "enableParallelBuild": true,
-  "threadsPerBuildProcess" : 2,
-  "isExecuted": false,
-  "scriptPath": "",
-  "cmake": {
-    "configurations": [ "Release", "Debug", "RelWithDebInfo" ],
-    "buildModes": [],
-    "requiredOptions": [ "NBL_BUILD_MITSUBA_LOADER" ]
-  }, 
-  "profiles": [
-    {
-      "backend": "vulkan",
-      "platform": "windows",
-      "buildModes": [],
-      "runConfiguration": "Release",
-      "gpuArchitectures": []
-    }
-  ],
-  "dependencies": [],
-  "data": [
-    {
-      "dependencies": [],
-      "command": [""],
-      "outputs": []
-    }
-  ]
-}
\ No newline at end of file
diff --git a/29_MeshLoaders/main.cpp b/29_MeshLoaders/main.cpp
deleted file mode 100644
index 6afb74a5c..000000000
--- a/29_MeshLoaders/main.cpp
+++ /dev/null
@@ -1,1404 +0,0 @@
-// Copyright (C) 2018-2024 - DevSH Graphics Programming Sp. z O.O.
-// This file is part of the "Nabla Engine".
-// For conditions of distribution and use, see copyright notice in nabla.h
-
-#include <nabla.h>
-#include "nbl/asset/utils/CGeometryCreator.h"
-#include "nbl/application_templates/MonoAssetManagerAndBuiltinResourceApplication.hpp"
-
-#include <nbl/builtin/hlsl/cpp_compat.hlsl>
-#include <nbl/builtin/hlsl/cpp_compat/matrix.hlsl>
-
-using namespace nbl;
-using namespace core;
-using namespace hlsl;
-using namespace system;
-using namespace asset;
-using namespace ui;
-using namespace video;
-
-
-class MeshLoadersApp final : public examples::SimpleWindowedApplication, public application_templates::MonoAssetManagerAndBuiltinResourceApplication
-{
-		using device_base_t = examples::SimpleWindowedApplication;
-		using asset_base_t = application_templates::MonoAssetManagerAndBuiltinResourceApplication;
-
-		constexpr static inline uint32_t WIN_W = 1280, WIN_H = 720;
-		constexpr static inline uint32_t MaxFramesInFlight = 3u;
-		constexpr static inline uint8_t MaxUITextureCount = 1u;
-
-
-	public:
-		inline MeshLoadersApp(const path& _localInputCWD, const path& _localOutputCWD, const path& _sharedInputCWD, const path& _sharedOutputCWD)
-			: IApplicationFramework(_localInputCWD, _localOutputCWD, _sharedInputCWD, _sharedOutputCWD)
-		{
-		}
-
-		inline SPhysicalDeviceFeatures getPreferredDeviceFeatures() const override
-		{
-			auto retval = device_base_t::getPreferredDeviceFeatures();
-			retval.accelerationStructure = true;
-			retval.rayQuery = true;
-			return retval;
-		}
-
-		inline core::vector<video::SPhysicalDeviceFilter::SurfaceCompatibility> getSurfaces() const override
-		{
-			if (!m_surface)
-			{
-				{
-					auto windowCallback = core::make_smart_refctd_ptr<CEventCallback>(smart_refctd_ptr(m_inputSystem), smart_refctd_ptr(m_logger));
-					IWindow::SCreationParams params = {};
-					params.callback = core::make_smart_refctd_ptr<ISimpleManagedSurface::ICallback>();
-					params.width = WIN_W;
-					params.height = WIN_H;
-					params.x = 32;
-					params.y = 32;
-					params.flags = ui::IWindow::ECF_HIDDEN | IWindow::ECF_BORDERLESS | IWindow::ECF_RESIZABLE;
-					params.windowCaption = "MeshLoadersApp";
-					params.callback = windowCallback;
-					const_cast<std::remove_const_t<decltype(m_window)>&>(m_window) = m_winMgr->createWindow(std::move(params));
-				}
-
-				auto surface = CSurfaceVulkanWin32::create(smart_refctd_ptr(m_api), smart_refctd_ptr_static_cast<IWindowWin32>(m_window));
-				const_cast<std::remove_const_t<decltype(m_surface)>&>(m_surface) = CSimpleResizeSurface<ISimpleManagedSurface::ISwapchainResources>::create(std::move(surface));
-			}
-
-			if (m_surface)
-				return { {m_surface->getSurface()/*,EQF_NONE*/} };
-
-			return {};
-		}
-
-		// so that we can use the same queue for asset converter and rendering
-		inline core::vector<queue_req_t> getQueueRequirements() const override
-		{
-			auto reqs = device_base_t::getQueueRequirements();
-			reqs.front().requiredFlags |= IQueue::FAMILY_FLAGS::TRANSFER_BIT;
-			reqs.front().requiredFlags |= IQueue::FAMILY_FLAGS::COMPUTE_BIT;
-			return reqs;
-		}
-
-		inline bool onAppInitialized(smart_refctd_ptr<ISystem>&& system) override
-		{
-			m_inputSystem = make_smart_refctd_ptr<InputSystem>(logger_opt_smart_ptr(smart_refctd_ptr(m_logger)));
-
-			if (!device_base_t::onAppInitialized(smart_refctd_ptr(system)))
-				return false;
-
-			if (!asset_base_t::onAppInitialized(smart_refctd_ptr(system)))
-				return false;
-
-#if 0
-		// Load Custom Shader
-		auto loadCompileAndCreateShader = [&](const std::string& relPath) -> smart_refctd_ptr<IGPUShader>
-			{
-				IAssetLoader::SAssetLoadParams lp = {};
-				lp.logger = m_logger.get();
-				lp.workingDirectory = ""; // virtual root
-				auto assetBundle = m_assetMgr->getAsset(relPath, lp);
-				const auto assets = assetBundle.getContents();
-				if (assets.empty())
-					return nullptr;
-
-				// lets go straight from ICPUSpecializedShader to IGPUSpecializedShader
-				auto sourceRaw = IAsset::castDown<ICPUShader>(assets[0]);
-				if (!sourceRaw)
-					return nullptr;
-
-				return m_device->createShader({ sourceRaw.get(), nullptr, shaderReadCache.get(), shaderWriteCache.get() });
-			};
-
-		// load shaders
-		const auto raygenShader = loadCompileAndCreateShader("app_resources/raytrace.rgen.hlsl");
-		const auto closestHitShader = loadCompileAndCreateShader("app_resources/raytrace.rchit.hlsl");
-		const auto proceduralClosestHitShader = loadCompileAndCreateShader("app_resources/raytrace_procedural.rchit.hlsl");
-		const auto intersectionHitShader = loadCompileAndCreateShader("app_resources/raytrace.rint.hlsl");
-		const auto anyHitShaderColorPayload = loadCompileAndCreateShader("app_resources/raytrace.rahit.hlsl");
-		const auto anyHitShaderShadowPayload = loadCompileAndCreateShader("app_resources/raytrace_shadow.rahit.hlsl");
-		const auto missShader = loadCompileAndCreateShader("app_resources/raytrace.rmiss.hlsl");
-		const auto missShadowShader = loadCompileAndCreateShader("app_resources/raytrace_shadow.rmiss.hlsl");
-		const auto directionalLightCallShader = loadCompileAndCreateShader("app_resources/light_directional.rcall.hlsl");
-		const auto pointLightCallShader = loadCompileAndCreateShader("app_resources/light_point.rcall.hlsl");
-		const auto spotLightCallShader = loadCompileAndCreateShader("app_resources/light_spot.rcall.hlsl");
-		const auto fragmentShader = loadCompileAndCreateShader("app_resources/present.frag.hlsl");
-#endif
-
-			m_semaphore = m_device->createSemaphore(m_realFrameIx);
-			if (!m_semaphore)
-				return logFail("Failed to Create a Semaphore!");
-
-			auto gQueue = getGraphicsQueue();
-
-			// Create renderpass and init surface
-			nbl::video::IGPURenderpass* renderpass;
-			{
-				ISwapchain::SCreationParams swapchainParams = { .surface = smart_refctd_ptr<ISurface>(m_surface->getSurface()) };
-				if (!swapchainParams.deduceFormat(m_physicalDevice))
-					return logFail("Could not choose a Surface Format for the Swapchain!");
-
-				const static IGPURenderpass::SCreationParams::SSubpassDependency dependencies[] =
-				{
-				  {
-					.srcSubpass = IGPURenderpass::SCreationParams::SSubpassDependency::External,
-					.dstSubpass = 0,
-					.memoryBarrier =
-					{
-					  .srcStageMask = asset::PIPELINE_STAGE_FLAGS::COPY_BIT,
-					  .srcAccessMask = asset::ACCESS_FLAGS::TRANSFER_WRITE_BIT,
-					  .dstStageMask = asset::PIPELINE_STAGE_FLAGS::COLOR_ATTACHMENT_OUTPUT_BIT,
-					  .dstAccessMask = asset::ACCESS_FLAGS::COLOR_ATTACHMENT_WRITE_BIT
-					}
-				  },
-				  {
-					.srcSubpass = 0,
-					.dstSubpass = IGPURenderpass::SCreationParams::SSubpassDependency::External,
-					.memoryBarrier =
-					{
-					  .srcStageMask = asset::PIPELINE_STAGE_FLAGS::COLOR_ATTACHMENT_OUTPUT_BIT,
-					  .srcAccessMask = asset::ACCESS_FLAGS::COLOR_ATTACHMENT_WRITE_BIT
-					}
-				  },
-				  IGPURenderpass::SCreationParams::DependenciesEnd
-				};
-
-				auto scResources = std::make_unique<CDefaultSwapchainFramebuffers>(m_device.get(), swapchainParams.surfaceFormat.format, dependencies);
-				renderpass = scResources->getRenderpass();
-
-				if (!renderpass)
-					return logFail("Failed to create Renderpass!");
-
-				if (!m_surface || !m_surface->init(gQueue, std::move(scResources), swapchainParams.sharedParams))
-					return logFail("Could not create Window & Surface or initialize the Surface!");
-			}
-#if 0
-			auto pool = m_device->createCommandPool(gQueue->getFamilyIndex(), IGPUCommandPool::CREATE_FLAGS::RESET_COMMAND_BUFFER_BIT);
-
-		m_converter = CAssetConverter::create({ .device = m_device.get(), .optimizer = {} });
-
-		for (auto i = 0u; i < MaxFramesInFlight; i++)
-		{
-			if (!pool)
-				return logFail("Couldn't create Command Pool!");
-			if (!pool->createCommandBuffers(IGPUCommandPool::BUFFER_LEVEL::PRIMARY, { m_cmdBufs.data() + i, 1 }))
-				return logFail("Couldn't create Command Buffer!");
-		}
-#endif
-			m_winMgr->setWindowSize(m_window.get(), WIN_W, WIN_H);
-			m_surface->recreateSwapchain();
-
-#if 0
-		// create output images
-		m_hdrImage = m_device->createImage({
-			{
-			  .type = IGPUImage::ET_2D,
-			  .samples = ICPUImage::ESCF_1_BIT,
-			  .format = EF_R16G16B16A16_SFLOAT,
-			  .extent = {WIN_W, WIN_H, 1},
-			  .mipLevels = 1,
-			  .arrayLayers = 1,
-			  .flags = IImage::ECF_NONE,
-			  .usage = bitflag(IImage::EUF_STORAGE_BIT) | IImage::EUF_TRANSFER_SRC_BIT | IImage::EUF_SAMPLED_BIT
-			}
-			});
-
-		if (!m_hdrImage || !m_device->allocate(m_hdrImage->getMemoryReqs(), m_hdrImage.get()).isValid())
-			return logFail("Could not create HDR Image");
-
-		m_hdrImageView = m_device->createImageView({
-		  .flags = IGPUImageView::ECF_NONE,
-		  .subUsages = IGPUImage::E_USAGE_FLAGS::EUF_STORAGE_BIT | IGPUImage::E_USAGE_FLAGS::EUF_SAMPLED_BIT,
-		  .image = m_hdrImage,
-		  .viewType = IGPUImageView::E_TYPE::ET_2D,
-		  .format = asset::EF_R16G16B16A16_SFLOAT
-			});
-
-
-
-		// ray trace pipeline and descriptor set layout setup
-		{
-			const IGPUDescriptorSetLayout::SBinding bindings[] = {
-			  {
-				.binding = 0,
-				.type = asset::IDescriptor::E_TYPE::ET_ACCELERATION_STRUCTURE,
-				.createFlags = IGPUDescriptorSetLayout::SBinding::E_CREATE_FLAGS::ECF_NONE,
-				.stageFlags = asset::IShader::E_SHADER_STAGE::ESS_RAYGEN,
-				.count = 1,
-			  },
-			  {
-				.binding = 1,
-				.type = asset::IDescriptor::E_TYPE::ET_STORAGE_IMAGE,
-				.createFlags = IGPUDescriptorSetLayout::SBinding::E_CREATE_FLAGS::ECF_NONE,
-				.stageFlags = asset::IShader::E_SHADER_STAGE::ESS_RAYGEN,
-				.count = 1,
-			  }
-			};
-			const auto descriptorSetLayout = m_device->createDescriptorSetLayout(bindings);
-
-			const std::array<IGPUDescriptorSetLayout*, ICPUPipelineLayout::DESCRIPTOR_SET_COUNT> dsLayoutPtrs = { descriptorSetLayout.get() };
-			m_rayTracingDsPool = m_device->createDescriptorPoolForDSLayouts(IDescriptorPool::ECF_UPDATE_AFTER_BIND_BIT, std::span(dsLayoutPtrs.begin(), dsLayoutPtrs.end()));
-			m_rayTracingDs = m_rayTracingDsPool->createDescriptorSet(descriptorSetLayout);
-
-			const SPushConstantRange pcRange = {
-			  .stageFlags = IShader::E_SHADER_STAGE::ESS_ALL_RAY_TRACING,
-			  .offset = 0u,
-			  .size = sizeof(SPushConstants),
-			};
-			const auto pipelineLayout = m_device->createPipelineLayout({ &pcRange, 1 }, smart_refctd_ptr(descriptorSetLayout), nullptr, nullptr, nullptr);
-
-			IGPURayTracingPipeline::SCreationParams params = {};
-
-			enum RtDemoShader
-			{
-				RTDS_RAYGEN,
-				RTDS_MISS,
-				RTDS_MISS_SHADOW,
-				RTDS_CLOSEST_HIT,
-				RTDS_SPHERE_CLOSEST_HIT,
-				RTDS_ANYHIT_PRIMARY,
-				RTDS_ANYHIT_SHADOW,
-				RTDS_INTERSECTION,
-				RTDS_DIRECTIONAL_CALL,
-				RTDS_POINT_CALL,
-				RTDS_SPOT_CALL,
-				RTDS_COUNT
-			};
-
-			IGPUShader::SSpecInfo shaders[RTDS_COUNT];
-			shaders[RTDS_RAYGEN] = { .shader = raygenShader.get() };
-			shaders[RTDS_MISS] = { .shader = missShader.get() };
-			shaders[RTDS_MISS_SHADOW] = { .shader = missShadowShader.get() };
-			shaders[RTDS_CLOSEST_HIT] = { .shader = closestHitShader.get() };
-			shaders[RTDS_SPHERE_CLOSEST_HIT] = { .shader = proceduralClosestHitShader.get() };
-			shaders[RTDS_ANYHIT_PRIMARY] = { .shader = anyHitShaderColorPayload.get() };
-			shaders[RTDS_ANYHIT_SHADOW] = { .shader = anyHitShaderShadowPayload.get() };
-			shaders[RTDS_INTERSECTION] = { .shader = intersectionHitShader.get() };
-			shaders[RTDS_DIRECTIONAL_CALL] = { .shader = directionalLightCallShader.get() };
-			shaders[RTDS_POINT_CALL] = { .shader = pointLightCallShader.get() };
-			shaders[RTDS_SPOT_CALL] = { .shader = spotLightCallShader.get() };
-
-			params.layout = pipelineLayout.get();
-			params.shaders = std::span(shaders);
-			using RayTracingFlags = IGPURayTracingPipeline::SCreationParams::FLAGS;
-			params.flags = core::bitflag(RayTracingFlags::NO_NULL_MISS_SHADERS) |
-				RayTracingFlags::NO_NULL_INTERSECTION_SHADERS |
-				RayTracingFlags::NO_NULL_ANY_HIT_SHADERS;
-
-			auto& shaderGroups = params.shaderGroups;
-
-			shaderGroups.raygen = { .index = RTDS_RAYGEN };
-
-			IRayTracingPipelineBase::SGeneralShaderGroup missGroups[EMT_COUNT];
-			missGroups[EMT_PRIMARY] = { .index = RTDS_MISS };
-			missGroups[EMT_OCCLUSION] = { .index = RTDS_MISS_SHADOW };
-			shaderGroups.misses = missGroups;
-
-			auto getHitGroupIndex = [](E_GEOM_TYPE geomType, E_RAY_TYPE rayType)
-				{
-					return geomType * ERT_COUNT + rayType;
-				};
-			IRayTracingPipelineBase::SHitShaderGroup hitGroups[E_RAY_TYPE::ERT_COUNT * E_GEOM_TYPE::EGT_COUNT];
-			hitGroups[getHitGroupIndex(EGT_TRIANGLES, ERT_PRIMARY)] = {
-			  .closestHit = RTDS_CLOSEST_HIT,
-			  .anyHit = RTDS_ANYHIT_PRIMARY,
-			};
-			hitGroups[getHitGroupIndex(EGT_TRIANGLES, ERT_OCCLUSION)] = {
-			  .closestHit = IGPURayTracingPipeline::SGeneralShaderGroup::Unused,
-			  .anyHit = RTDS_ANYHIT_SHADOW,
-			};
-			hitGroups[getHitGroupIndex(EGT_PROCEDURAL, ERT_PRIMARY)] = {
-			  .closestHit = RTDS_SPHERE_CLOSEST_HIT,
-			  .anyHit = RTDS_ANYHIT_PRIMARY,
-			  .intersection = RTDS_INTERSECTION,
-			};
-			hitGroups[getHitGroupIndex(EGT_PROCEDURAL, ERT_OCCLUSION)] = {
-			  .closestHit = IGPURayTracingPipeline::SGeneralShaderGroup::Unused,
-			  .anyHit = RTDS_ANYHIT_SHADOW,
-			  .intersection = RTDS_INTERSECTION,
-			};
-			shaderGroups.hits = hitGroups;
-
-			IRayTracingPipelineBase::SGeneralShaderGroup callableGroups[ELT_COUNT];
-			callableGroups[ELT_DIRECTIONAL] = { .index = RTDS_DIRECTIONAL_CALL };
-			callableGroups[ELT_POINT] = { .index = RTDS_POINT_CALL };
-			callableGroups[ELT_SPOT] = { .index = RTDS_SPOT_CALL };
-			shaderGroups.callables = callableGroups;
-
-			params.cached.maxRecursionDepth = 1;
-			params.cached.dynamicStackSize = true;
-
-			if (!m_device->createRayTracingPipelines(nullptr, { &params, 1 }, &m_rayTracingPipeline))
-				return logFail("Failed to create ray tracing pipeline");
-
-			calculateRayTracingStackSize(m_rayTracingPipeline);
-
-			if (!createShaderBindingTable(m_rayTracingPipeline))
-				return logFail("Could not create shader binding table");
-
-		}
-
-		auto assetManager = make_smart_refctd_ptr<nbl::asset::IAssetManager>(smart_refctd_ptr(system));
-		auto* geometryCreator = assetManager->getGeometryCreator();
-
-		if (!createIndirectBuffer())
-			return logFail("Could not create indirect buffer");
-
-		if (!createAccelerationStructuresFromGeometry(geometryCreator))
-			return logFail("Could not create acceleration structures from geometry creator");
-
-		ISampler::SParams samplerParams = {
-		  .AnisotropicFilter = 0
-		};
-		auto defaultSampler = m_device->createSampler(samplerParams);
-
-		{
-			const IGPUDescriptorSetLayout::SBinding bindings[] = {
-			  {
-				.binding = 0u,
-				.type = nbl::asset::IDescriptor::E_TYPE::ET_COMBINED_IMAGE_SAMPLER,
-				.createFlags = ICPUDescriptorSetLayout::SBinding::E_CREATE_FLAGS::ECF_NONE,
-				.stageFlags = IShader::E_SHADER_STAGE::ESS_FRAGMENT,
-				.count = 1u,
-				.immutableSamplers = &defaultSampler
-			  }
-			};
-			auto gpuPresentDescriptorSetLayout = m_device->createDescriptorSetLayout(bindings);
-			const video::IGPUDescriptorSetLayout* const layouts[] = { gpuPresentDescriptorSetLayout.get() };
-			const uint32_t setCounts[] = { 1u };
-			m_presentDsPool = m_device->createDescriptorPoolForDSLayouts(IDescriptorPool::E_CREATE_FLAGS::ECF_NONE, layouts, setCounts);
-			m_presentDs = m_presentDsPool->createDescriptorSet(gpuPresentDescriptorSetLayout);
-
-			auto scRes = static_cast<CDefaultSwapchainFramebuffers*>(m_surface->getSwapchainResources());
-			ext::FullScreenTriangle::ProtoPipeline fsTriProtoPPln(m_assetMgr.get(), m_device.get(), m_logger.get());
-			if (!fsTriProtoPPln)
-				return logFail("Failed to create Full Screen Triangle protopipeline or load its vertex shader!");
-
-			const IGPUShader::SSpecInfo fragSpec = {
-			  .entryPoint = "main",
-			  .shader = fragmentShader.get()
-			};
-
-			auto presentLayout = m_device->createPipelineLayout(
-				{},
-				core::smart_refctd_ptr(gpuPresentDescriptorSetLayout),
-				nullptr,
-				nullptr,
-				nullptr
-			);
-			m_presentPipeline = fsTriProtoPPln.createPipeline(fragSpec, presentLayout.get(), scRes->getRenderpass());
-			if (!m_presentPipeline)
-				return logFail("Could not create Graphics Pipeline!");
-		}
-
-		// write descriptors
-		IGPUDescriptorSet::SDescriptorInfo infos[3];
-		infos[0].desc = m_gpuTlas;
-
-		infos[1].desc = m_hdrImageView;
-		if (!infos[1].desc)
-			return logFail("Failed to create image view");
-		infos[1].info.image.imageLayout = IImage::LAYOUT::GENERAL;
-
-		infos[2].desc = m_hdrImageView;
-		infos[2].info.image.imageLayout = IImage::LAYOUT::READ_ONLY_OPTIMAL;
-
-		IGPUDescriptorSet::SWriteDescriptorSet writes[] = {
-			{.dstSet = m_rayTracingDs.get(), .binding = 0, .arrayElement = 0, .count = 1, .info = &infos[0]},
-			{.dstSet = m_rayTracingDs.get(), .binding = 1, .arrayElement = 0, .count = 1, .info = &infos[1]},
-			{.dstSet = m_presentDs.get(), .binding = 0, .arrayElement = 0, .count = 1, .info = &infos[2] },
-		};
-		m_device->updateDescriptorSets(std::span(writes), {});
-
-		// gui descriptor setup
-		{
-			using binding_flags_t = IGPUDescriptorSetLayout::SBinding::E_CREATE_FLAGS;
-			{
-				IGPUSampler::SParams params;
-				params.AnisotropicFilter = 1u;
-				params.TextureWrapU = ETC_REPEAT;
-				params.TextureWrapV = ETC_REPEAT;
-				params.TextureWrapW = ETC_REPEAT;
-
-				m_ui.samplers.gui = m_device->createSampler(params);
-				m_ui.samplers.gui->setObjectDebugName("Nabla IMGUI UI Sampler");
-			}
-
-			std::array<core::smart_refctd_ptr<IGPUSampler>, 69u> immutableSamplers;
-			for (auto& it : immutableSamplers)
-				it = smart_refctd_ptr(m_ui.samplers.scene);
-
-			immutableSamplers[nbl::ext::imgui::UI::FontAtlasTexId] = smart_refctd_ptr(m_ui.samplers.gui);
-
-			nbl::ext::imgui::UI::SCreationParameters params;
-
-			params.resources.texturesInfo = { .setIx = 0u, .bindingIx = 0u };
-			params.resources.samplersInfo = { .setIx = 0u, .bindingIx = 1u };
-			params.assetManager = m_assetMgr;
-			params.pipelineCache = nullptr;
-			params.pipelineLayout = nbl::ext::imgui::UI::createDefaultPipelineLayout(m_utils->getLogicalDevice(), params.resources.texturesInfo, params.resources.samplersInfo, MaxUITextureCount);
-			params.renderpass = smart_refctd_ptr<IGPURenderpass>(renderpass);
-			params.streamingBuffer = nullptr;
-			params.subpassIx = 0u;
-			params.transfer = getGraphicsQueue();
-			params.utilities = m_utils;
-			{
-				m_ui.manager = ext::imgui::UI::create(std::move(params));
-
-				// note that we use default layout provided by our extension, but you are free to create your own by filling nbl::ext::imgui::UI::S_CREATION_PARAMETERS::resources
-				const auto* descriptorSetLayout = m_ui.manager->getPipeline()->getLayout()->getDescriptorSetLayout(0u);
-				const auto& params = m_ui.manager->getCreationParameters();
-
-				IDescriptorPool::SCreateInfo descriptorPoolInfo = {};
-				descriptorPoolInfo.maxDescriptorCount[static_cast<uint32_t>(asset::IDescriptor::E_TYPE::ET_SAMPLER)] = (uint32_t)nbl::ext::imgui::UI::DefaultSamplerIx::COUNT;
-				descriptorPoolInfo.maxDescriptorCount[static_cast<uint32_t>(asset::IDescriptor::E_TYPE::ET_SAMPLED_IMAGE)] = MaxUITextureCount;
-				descriptorPoolInfo.maxSets = 1u;
-				descriptorPoolInfo.flags = IDescriptorPool::E_CREATE_FLAGS::ECF_UPDATE_AFTER_BIND_BIT;
-
-				m_guiDescriptorSetPool = m_device->createDescriptorPool(std::move(descriptorPoolInfo));
-				assert(m_guiDescriptorSetPool);
-
-				m_guiDescriptorSetPool->createDescriptorSets(1u, &descriptorSetLayout, &m_ui.descriptorSet);
-				assert(m_ui.descriptorSet);
-			}
-		}
-
-		m_ui.manager->registerListener(
-			[this]() -> void {
-				ImGuiIO& io = ImGui::GetIO();
-
-				m_camera.setProjectionMatrix([&]()
-					{
-						static matrix4SIMD projection;
-
-						projection = matrix4SIMD::buildProjectionMatrixPerspectiveFovRH(
-							core::radians(m_cameraSetting.fov),
-							io.DisplaySize.x / io.DisplaySize.y,
-							m_cameraSetting.zNear,
-							m_cameraSetting.zFar);
-
-						return projection;
-					}());
-
-				ImGui::SetNextWindowPos(ImVec2(1024, 100), ImGuiCond_Appearing);
-				ImGui::SetNextWindowSize(ImVec2(256, 256), ImGuiCond_Appearing);
-
-				// create a window and insert the inspector
-				ImGui::SetNextWindowPos(ImVec2(10, 10), ImGuiCond_Appearing);
-				ImGui::SetNextWindowSize(ImVec2(320, 340), ImGuiCond_Appearing);
-				ImGui::Begin("Controls");
-
-				ImGui::SameLine();
-
-				ImGui::Text("Camera");
-
-				ImGui::SliderFloat("Move speed", &m_cameraSetting.moveSpeed, 0.1f, 10.f);
-				ImGui::SliderFloat("Rotate speed", &m_cameraSetting.rotateSpeed, 0.1f, 10.f);
-				ImGui::SliderFloat("Fov", &m_cameraSetting.fov, 20.f, 150.f);
-				ImGui::SliderFloat("zNear", &m_cameraSetting.zNear, 0.1f, 100.f);
-				ImGui::SliderFloat("zFar", &m_cameraSetting.zFar, 110.f, 10000.f);
-				Light m_oldLight = m_light;
-				int light_type = m_light.type;
-				ImGui::ListBox("LightType", &light_type, s_lightTypeNames, ELT_COUNT);
-				m_light.type = static_cast<E_LIGHT_TYPE>(light_type);
-				if (m_light.type == ELT_DIRECTIONAL)
-				{
-					ImGui::SliderFloat3("Light Direction", &m_light.direction.x, -1.f, 1.f);
-				}
-				else if (m_light.type == ELT_POINT)
-				{
-					ImGui::SliderFloat3("Light Position", &m_light.position.x, -20.f, 20.f);
-				}
-				else if (m_light.type == ELT_SPOT)
-				{
-					ImGui::SliderFloat3("Light Direction", &m_light.direction.x, -1.f, 1.f);
-					ImGui::SliderFloat3("Light Position", &m_light.position.x, -20.f, 20.f);
-
-					float32_t dOuterCutoff = hlsl::degrees(acos(m_light.outerCutoff));
-					if (ImGui::SliderFloat("Light Outer Cutoff", &dOuterCutoff, 0.0f, 45.0f))
-					{
-						m_light.outerCutoff = cos(hlsl::radians(dOuterCutoff));
-					}
-				}
-				ImGui::Checkbox("Use Indirect Command", &m_useIndirectCommand);
-				if (m_light != m_oldLight)
-				{
-					m_frameAccumulationCounter = 0;
-				}
-
-				ImGui::Text("X: %f Y: %f", io.MousePos.x, io.MousePos.y);
-
-				ImGui::End();
-			}
-		);
-#endif
-			// Set Camera
-			{
-				core::vectorSIMDf cameraPosition(0, 5, -10);
-				matrix4SIMD proj = matrix4SIMD::buildProjectionMatrixPerspectiveFovRH(
-					core::radians(60.0f),
-					WIN_W / WIN_H,
-					0.01f,
-					500.0f
-				);
-				m_camera = Camera(cameraPosition, core::vectorSIMDf(0, 0, 0), proj);
-			}
-
-			m_winMgr->setWindowSize(m_window.get(), WIN_W, WIN_H);
-			m_surface->recreateSwapchain();
-			m_winMgr->show(m_window.get());
-			m_oracle.reportBeginFrameRecord();
-			m_camera.mapKeysToWASD();
-
-			return true;
-		}
-
-		bool updateGUIDescriptorSet()
-		{
-			// texture atlas, note we don't create info & write pair for the font sampler because UI extension's is immutable and baked into DS layout
-			static std::array<IGPUDescriptorSet::SDescriptorInfo, MaxUITextureCount> descriptorInfo;
-			static IGPUDescriptorSet::SWriteDescriptorSet writes[MaxUITextureCount];
-
-			descriptorInfo[ext::imgui::UI::FontAtlasTexId].info.image.imageLayout = IImage::LAYOUT::READ_ONLY_OPTIMAL;
-			descriptorInfo[ext::imgui::UI::FontAtlasTexId].desc = smart_refctd_ptr<IGPUImageView>(m_ui.manager->getFontAtlasView());
-
-			for (uint32_t i = 0; i < descriptorInfo.size(); ++i)
-			{
-				writes[i].dstSet = m_ui.descriptorSet.get();
-				writes[i].binding = 0u;
-				writes[i].arrayElement = i;
-				writes[i].count = 1u;
-			}
-			writes[ext::imgui::UI::FontAtlasTexId].info = descriptorInfo.data() + ext::imgui::UI::FontAtlasTexId;
-
-			return m_device->updateDescriptorSets(writes, {});
-		}
-
-		inline void workLoopBody() override
-		{
-			// framesInFlight: ensuring safe execution of command buffers and acquires, `framesInFlight` only affect semaphore waits, don't use this to index your resources because it can change with swapchain recreation.
-			const uint32_t framesInFlight = core::min(MaxFramesInFlight, m_surface->getMaxAcquiresInFlight());
-			// We block for semaphores for 2 reasons here:
-			  // A) Resource: Can't use resource like a command buffer BEFORE previous use is finished! [MaxFramesInFlight]
-			  // B) Acquire: Can't have more acquires in flight than a certain threshold returned by swapchain or your surface helper class. [MaxAcquiresInFlight]
-			if (m_realFrameIx >= framesInFlight)
-			{
-				const ISemaphore::SWaitInfo cbDonePending[] =
-				{
-				  {
-					.semaphore = m_semaphore.get(),
-					.value = m_realFrameIx + 1 - framesInFlight
-				  }
-				};
-				if (m_device->blockForSemaphores(cbDonePending) != ISemaphore::WAIT_RESULT::SUCCESS)
-					return;
-			}
-			const auto resourceIx = m_realFrameIx % MaxFramesInFlight;
-
-			m_api->startCapture();
-
-//		update();
-
-			auto queue = getGraphicsQueue();
-			auto cmdbuf = m_cmdBufs[resourceIx].get();
-
-			if (!keepRunning())
-				return;
-
-			cmdbuf->reset(IGPUCommandBuffer::RESET_FLAGS::RELEASE_RESOURCES_BIT);
-			cmdbuf->begin(IGPUCommandBuffer::USAGE::ONE_TIME_SUBMIT_BIT);
-			cmdbuf->beginDebugMarker("Frame");
-#if 0
-		const auto viewMatrix = m_camera.getViewMatrix();
-		const auto projectionMatrix = m_camera.getProjectionMatrix();
-		const auto viewProjectionMatrix = m_camera.getConcatenatedMatrix();
-
-		core::matrix3x4SIMD modelMatrix;
-		modelMatrix.setTranslation(nbl::core::vectorSIMDf(0, 0, 0, 0));
-		modelMatrix.setRotation(quaternion(0, 0, 0));
-
-		core::matrix4SIMD invModelViewProjectionMatrix;
-		modelViewProjectionMatrix.getInverseTransform(invModelViewProjectionMatrix);
-
-		{
-			IGPUCommandBuffer::SPipelineBarrierDependencyInfo::image_barrier_t imageBarriers[1];
-			imageBarriers[0].barrier = {
-			   .dep = {
-				 .srcStageMask = PIPELINE_STAGE_FLAGS::FRAGMENT_SHADER_BIT, // previous frame read from framgent shader
-				 .srcAccessMask = ACCESS_FLAGS::SHADER_READ_BITS,
-				 .dstStageMask = PIPELINE_STAGE_FLAGS::RAY_TRACING_SHADER_BIT,
-				 .dstAccessMask = ACCESS_FLAGS::SHADER_WRITE_BITS
-			  }
-			};
-			imageBarriers[0].image = m_hdrImage.get();
-			imageBarriers[0].subresourceRange = {
-			  .aspectMask = IImage::EAF_COLOR_BIT,
-			  .baseMipLevel = 0u,
-			  .levelCount = 1u,
-			  .baseArrayLayer = 0u,
-			  .layerCount = 1u
-			};
-			imageBarriers[0].oldLayout = m_frameAccumulationCounter == 0 ? IImage::LAYOUT::UNDEFINED : IImage::LAYOUT::READ_ONLY_OPTIMAL;
-			imageBarriers[0].newLayout = IImage::LAYOUT::GENERAL;
-			cmdbuf->pipelineBarrier(E_DEPENDENCY_FLAGS::EDF_NONE, { .imgBarriers = imageBarriers });
-		}
-
-		// Trace Rays Pass
-		{
-			SPushConstants pc;
-			pc.light = m_light;
-			pc.proceduralGeomInfoBuffer = m_proceduralGeomInfoBuffer->getDeviceAddress();
-			pc.triangleGeomInfoBuffer = m_triangleGeomInfoBuffer->getDeviceAddress();
-			pc.frameCounter = m_frameAccumulationCounter;
-			const core::vector3df camPos = m_camera.getPosition().getAsVector3df();
-			pc.camPos = { camPos.X, camPos.Y, camPos.Z };
-			memcpy(&pc.invMVP, invModelViewProjectionMatrix.pointer(), sizeof(pc.invMVP));
-
-			cmdbuf->bindRayTracingPipeline(m_rayTracingPipeline.get());
-			cmdbuf->setRayTracingPipelineStackSize(m_rayTracingStackSize);
-			cmdbuf->pushConstants(m_rayTracingPipeline->getLayout(), IShader::E_SHADER_STAGE::ESS_ALL_RAY_TRACING, 0, sizeof(SPushConstants), &pc);
-			cmdbuf->bindDescriptorSets(EPBP_RAY_TRACING, m_rayTracingPipeline->getLayout(), 0, 1, &m_rayTracingDs.get());
-			if (m_useIndirectCommand)
-			{
-				cmdbuf->traceRaysIndirect(
-					SBufferBinding<const IGPUBuffer>{
-					.offset = 0,
-						.buffer = m_indirectBuffer,
-				});
-			}
-			else
-			{
-				cmdbuf->traceRays(
-					m_shaderBindingTable.raygenGroupRange,
-					m_shaderBindingTable.missGroupsRange, m_shaderBindingTable.missGroupsStride,
-					m_shaderBindingTable.hitGroupsRange, m_shaderBindingTable.hitGroupsStride,
-					m_shaderBindingTable.callableGroupsRange, m_shaderBindingTable.callableGroupsStride,
-					WIN_W, WIN_H, 1);
-			}
-		}
-
-		// pipeline barrier
-		{
-			IGPUCommandBuffer::SPipelineBarrierDependencyInfo::image_barrier_t imageBarriers[1];
-			imageBarriers[0].barrier = {
-			  .dep = {
-				.srcStageMask = PIPELINE_STAGE_FLAGS::RAY_TRACING_SHADER_BIT,
-				.srcAccessMask = ACCESS_FLAGS::SHADER_WRITE_BITS,
-				.dstStageMask = PIPELINE_STAGE_FLAGS::COLOR_ATTACHMENT_OUTPUT_BIT,
-				.dstAccessMask = ACCESS_FLAGS::COLOR_ATTACHMENT_WRITE_BIT
-			  }
-			};
-			imageBarriers[0].image = m_hdrImage.get();
-			imageBarriers[0].subresourceRange = {
-			  .aspectMask = IImage::EAF_COLOR_BIT,
-			  .baseMipLevel = 0u,
-			  .levelCount = 1u,
-			  .baseArrayLayer = 0u,
-			  .layerCount = 1u
-			};
-			imageBarriers[0].oldLayout = IImage::LAYOUT::GENERAL;
-			imageBarriers[0].newLayout = IImage::LAYOUT::READ_ONLY_OPTIMAL;
-
-			cmdbuf->pipelineBarrier(E_DEPENDENCY_FLAGS::EDF_NONE, { .imgBarriers = imageBarriers });
-		}
-
-		{
-			asset::SViewport viewport;
-			{
-				viewport.minDepth = 1.f;
-				viewport.maxDepth = 0.f;
-				viewport.x = 0u;
-				viewport.y = 0u;
-				viewport.width = WIN_W;
-				viewport.height = WIN_H;
-			}
-			cmdbuf->setViewport(0u, 1u, &viewport);
-
-
-			VkRect2D defaultScisors[] = { {.offset = {(int32_t)viewport.x, (int32_t)viewport.y}, .extent = {(uint32_t)viewport.width, (uint32_t)viewport.height}} };
-			cmdbuf->setScissor(defaultScisors);
-
-			auto scRes = static_cast<CDefaultSwapchainFramebuffers*>(m_surface->getSwapchainResources());
-			const VkRect2D currentRenderArea =
-			{
-			  .offset = {0,0},
-			  .extent = {m_window->getWidth(),m_window->getHeight()}
-			};
-			const IGPUCommandBuffer::SClearColorValue clearColor = { .float32 = {0.f,0.f,0.f,1.f} };
-			const IGPUCommandBuffer::SRenderpassBeginInfo info =
-			{
-			  .framebuffer = scRes->getFramebuffer(m_currentImageAcquire.imageIndex),
-			  .colorClearValues = &clearColor,
-			  .depthStencilClearValues = nullptr,
-			  .renderArea = currentRenderArea
-			};
-			nbl::video::ISemaphore::SWaitInfo waitInfo = { .semaphore = m_semaphore.get(), .value = m_realFrameIx + 1u };
-
-			cmdbuf->beginRenderPass(info, IGPUCommandBuffer::SUBPASS_CONTENTS::INLINE);
-
-			cmdbuf->bindGraphicsPipeline(m_presentPipeline.get());
-			cmdbuf->bindDescriptorSets(EPBP_GRAPHICS, m_presentPipeline->getLayout(), 0, 1u, &m_presentDs.get());
-			ext::FullScreenTriangle::recordDrawCall(cmdbuf);
-
-			const auto uiParams = m_ui.manager->getCreationParameters();
-			auto* uiPipeline = m_ui.manager->getPipeline();
-			cmdbuf->bindGraphicsPipeline(uiPipeline);
-			cmdbuf->bindDescriptorSets(EPBP_GRAPHICS, uiPipeline->getLayout(), uiParams.resources.texturesInfo.setIx, 1u, &m_ui.descriptorSet.get());
-			m_ui.manager->render(cmdbuf, waitInfo);
-
-			cmdbuf->endRenderPass();
-
-		}
-#endif
-			cmdbuf->endDebugMarker();
-			cmdbuf->end();
-
-			{
-				const IQueue::SSubmitInfo::SSemaphoreInfo rendered[] =
-				{
-				  {
-					.semaphore = m_semaphore.get(),
-					.value = ++m_realFrameIx,
-					.stageMask = PIPELINE_STAGE_FLAGS::ALL_TRANSFER_BITS
-				  }
-				};
-				{
-					{
-						const IQueue::SSubmitInfo::SCommandBufferInfo commandBuffers[] =
-						{
-						  {.cmdbuf = cmdbuf }
-						};
-
-						const IQueue::SSubmitInfo::SSemaphoreInfo acquired[] =
-						{
-						  {
-							.semaphore = m_currentImageAcquire.semaphore,
-							.value = m_currentImageAcquire.acquireCount,
-							.stageMask = PIPELINE_STAGE_FLAGS::NONE
-						  }
-						};
-						const IQueue::SSubmitInfo infos[] =
-						{
-						  {
-							.waitSemaphores = acquired,
-							.commandBuffers = commandBuffers,
-							.signalSemaphores = rendered
-						  }
-						};
-
-//						updateGUIDescriptorSet();
-
-						if (queue->submit(infos) != IQueue::RESULT::SUCCESS)
-							m_realFrameIx--;
-					}
-				}
-
-				m_window->setCaption("[Nabla Engine] Ray Tracing Pipeline");
-				m_surface->present(m_currentImageAcquire.imageIndex, rendered);
-			}
-			m_api->endCapture();
-			m_frameAccumulationCounter++;
-		}
-#if 0
-		inline void update()
-		{
-			m_camera.setMoveSpeed(m_cameraSetting.moveSpeed);
-			m_camera.setRotateSpeed(m_cameraSetting.rotateSpeed);
-
-			static std::chrono::microseconds previousEventTimestamp{};
-
-			m_inputSystem->getDefaultMouse(&m_mouse);
-			m_inputSystem->getDefaultKeyboard(&m_keyboard);
-
-			auto updatePresentationTimestamp = [&]()
-				{
-					m_currentImageAcquire = m_surface->acquireNextImage();
-
-					m_oracle.reportEndFrameRecord();
-					const auto timestamp = m_oracle.getNextPresentationTimeStamp();
-					m_oracle.reportBeginFrameRecord();
-
-					return timestamp;
-				};
-
-			const auto nextPresentationTimestamp = updatePresentationTimestamp();
-
-			struct
-			{
-				std::vector<SMouseEvent> mouse{};
-				std::vector<SKeyboardEvent> keyboard{};
-			} capturedEvents;
-
-			m_camera.beginInputProcessing(nextPresentationTimestamp);
-			{
-				const auto& io = ImGui::GetIO();
-				m_mouse.consumeEvents([&](const IMouseEventChannel::range_t& events) -> void
-					{
-						if (!io.WantCaptureMouse)
-							m_camera.mouseProcess(events); // don't capture the events, only let camera handle them with its impl
-
-						for (const auto& e : events) // here capture
-						{
-							if (e.timeStamp < previousEventTimestamp)
-								continue;
-
-							previousEventTimestamp = e.timeStamp;
-							capturedEvents.mouse.emplace_back(e);
-
-						}
-					}, m_logger.get());
-
-				m_keyboard.consumeEvents([&](const IKeyboardEventChannel::range_t& events) -> void
-					{
-						if (!io.WantCaptureKeyboard)
-							m_camera.keyboardProcess(events); // don't capture the events, only let camera handle them with its impl
-
-						for (const auto& e : events) // here capture
-						{
-							if (e.timeStamp < previousEventTimestamp)
-								continue;
-
-							previousEventTimestamp = e.timeStamp;
-							capturedEvents.keyboard.emplace_back(e);
-						}
-					}, m_logger.get());
-
-			}
-			m_camera.endInputProcessing(nextPresentationTimestamp);
-
-			const core::SRange<const nbl::ui::SMouseEvent> mouseEvents(capturedEvents.mouse.data(), capturedEvents.mouse.data() + capturedEvents.mouse.size());
-			const core::SRange<const nbl::ui::SKeyboardEvent> keyboardEvents(capturedEvents.keyboard.data(), capturedEvents.keyboard.data() + capturedEvents.keyboard.size());
-			const auto cursorPosition = m_window->getCursorControl()->getPosition();
-			const auto mousePosition = float32_t2(cursorPosition.x, cursorPosition.y) - float32_t2(m_window->getX(), m_window->getY());
-
-			const ext::imgui::UI::SUpdateParameters params =
-			{
-			  .mousePosition = mousePosition,
-			  .displaySize = { m_window->getWidth(), m_window->getHeight() },
-			  .mouseEvents = mouseEvents,
-			  .keyboardEvents = keyboardEvents
-			};
-
-			m_ui.manager->update(params);
-		}
-#endif
-		inline bool keepRunning() override
-		{
-			if (m_surface->irrecoverable())
-				return false;
-
-			return true;
-		}
-
-		inline bool onAppTerminated() override
-		{
-			return device_base_t::onAppTerminated();
-		}
-
-	private:
-#if 0
-		bool createAccelerationStructuresFromGeometry(const IGeometryCreator* gc)
-		{
-			auto queue = getGraphicsQueue();
-			// get geometries into ICPUBuffers
-			auto pool = m_device->createCommandPool(queue->getFamilyIndex(), IGPUCommandPool::CREATE_FLAGS::RESET_COMMAND_BUFFER_BIT);
-			if (!pool)
-				return logFail("Couldn't create Command Pool for geometry creation!");
-
-			const auto defaultMaterial = Material{
-			  .ambient = {0.2, 0.1, 0.1},
-			  .diffuse = {0.8, 0.3, 0.3},
-			  .specular = {0.8, 0.8, 0.8},
-			  .shininess = 1.0f,
-			  .alpha = 1.0f,
-			};
-
-			auto getTranslationMatrix = [](float32_t x, float32_t y, float32_t z)
-				{
-					core::matrix3x4SIMD transform;
-					transform.setTranslation(nbl::core::vectorSIMDf(x, y, z, 0));
-					return transform;
-				};
-
-			core::matrix3x4SIMD planeTransform;
-			planeTransform.setRotation(quaternion::fromAngleAxis(core::radians(-90.0f), vector3df_SIMD{ 1, 0, 0 }));
-
-			// triangles geometries
-			const auto cpuObjects = std::array{
-				ReferenceObjectCpu {
-					.meta = {.type = OT_RECTANGLE, .name = "Plane Mesh"},
-					.data = gc->createRectangleMesh(nbl::core::vector2df_SIMD(10, 10)),
-					.material = defaultMaterial,
-					.transform = planeTransform,
-				},
-				ReferenceObjectCpu {
-					.meta = {.type = OT_CUBE, .name = "Cube Mesh"},
-					.data = gc->createCubeMesh(nbl::core::vector3df(1, 1, 1)),
-					.material = defaultMaterial,
-					.transform = getTranslationMatrix(0, 0.5f, 0),
-				},
-				ReferenceObjectCpu {
-					.meta = {.type = OT_CUBE, .name = "Cube Mesh 2"},
-					.data = gc->createCubeMesh(nbl::core::vector3df(1.5, 1.5, 1.5)),
-					.material = Material{
-						.ambient = {0.1, 0.1, 0.2},
-						.diffuse = {0.2, 0.2, 0.8},
-						.specular = {0.8, 0.8, 0.8},
-						.shininess = 1.0f,
-					},
-					.transform = getTranslationMatrix(-5.0f, 1.0f, 0),
-				},
-				ReferenceObjectCpu {
-					.meta = {.type = OT_CUBE, .name = "Transparent Cube Mesh"},
-					.data = gc->createCubeMesh(nbl::core::vector3df(1.5, 1.5, 1.5)),
-					.material = Material{
-						.ambient = {0.1, 0.2, 0.1},
-						.diffuse = {0.2, 0.8, 0.2},
-						.specular = {0.8, 0.8, 0.8},
-						.shininess = 1.0f,
-						.alpha = 0.2,
-					},
-					.transform = getTranslationMatrix(5.0f, 1.0f, 0),
-				},
-			};
-
-			struct CPUTriBufferBindings
-			{
-				nbl::asset::SBufferBinding<ICPUBuffer> vertex, index;
-			};
-			std::array<CPUTriBufferBindings, std::size(cpuObjects)> cpuTriBuffers;
-
-			for (uint32_t i = 0; i < cpuObjects.size(); i++)
-			{
-				const auto& cpuObject = cpuObjects[i];
-
-				auto vBuffer = smart_refctd_ptr(cpuObject.data.bindings[0].buffer); // no offset
-				auto vUsage = bitflag(IGPUBuffer::EUF_STORAGE_BUFFER_BIT) | IGPUBuffer::EUF_TRANSFER_DST_BIT | IGPUBuffer::EUF_INLINE_UPDATE_VIA_CMDBUF |
-					IGPUBuffer::EUF_ACCELERATION_STRUCTURE_BUILD_INPUT_READ_ONLY_BIT | IGPUBuffer::EUF_SHADER_DEVICE_ADDRESS_BIT;
-				vBuffer->addUsageFlags(vUsage);
-				vBuffer->setContentHash(vBuffer->computeContentHash());
-
-				auto iBuffer = smart_refctd_ptr(cpuObject.data.indexBuffer.buffer); // no offset
-				auto iUsage = bitflag(IGPUBuffer::EUF_STORAGE_BUFFER_BIT) | IGPUBuffer::EUF_TRANSFER_DST_BIT | IGPUBuffer::EUF_INLINE_UPDATE_VIA_CMDBUF |
-					IGPUBuffer::EUF_ACCELERATION_STRUCTURE_BUILD_INPUT_READ_ONLY_BIT | IGPUBuffer::EUF_SHADER_DEVICE_ADDRESS_BIT;
-
-				if (cpuObject.data.indexType != EIT_UNKNOWN)
-					if (iBuffer)
-					{
-						iBuffer->addUsageFlags(iUsage);
-						iBuffer->setContentHash(iBuffer->computeContentHash());
-					}
-
-				cpuTriBuffers[i] = {
-				  .vertex = {.offset = 0, .buffer = vBuffer},
-				  .index = {.offset = 0, .buffer = iBuffer},
-				};
-
-			}
-
-			// procedural geometries
-			using Aabb = IGPUBottomLevelAccelerationStructure::AABB_t;
-
-			smart_refctd_ptr<ICPUBuffer> cpuProcBuffer;
-			{
-				ICPUBuffer::SCreationParams params;
-				params.size = NumberOfProceduralGeometries * sizeof(Aabb);
-				cpuProcBuffer = ICPUBuffer::create(std::move(params));
-			}
-
-			core::vector<SProceduralGeomInfo> proceduralGeoms;
-			proceduralGeoms.reserve(NumberOfProceduralGeometries);
-			auto proceduralGeometries = reinterpret_cast<Aabb*>(cpuProcBuffer->getPointer());
-			for (int32_t i = 0; i < NumberOfProceduralGeometries; i++)
-			{
-				const auto middle_i = NumberOfProceduralGeometries / 2.0;
-				SProceduralGeomInfo sphere = {
-						.material = hlsl::_static_cast<MaterialPacked>(Material{
-						.ambient = {0.1, 0.05 * i, 0.1},
-						.diffuse = {0.3, 0.2 * i, 0.3},
-						.specular = {0.8, 0.8, 0.8},
-						.shininess = 1.0f,
-					}),
-					.center = float32_t3((i - middle_i) * 4.0, 2, 5.0),
-					.radius = 1,
-				};
-
-				proceduralGeoms.push_back(sphere);
-				const auto sphereMin = sphere.center - sphere.radius;
-				const auto sphereMax = sphere.center + sphere.radius;
-				proceduralGeometries[i] = {
-					vector3d(sphereMin.x, sphereMin.y, sphereMin.z),
-					vector3d(sphereMax.x, sphereMax.y, sphereMax.z)
-				};
-			}
-
-			{
-				IGPUBuffer::SCreationParams params;
-				params.usage = IGPUBuffer::EUF_STORAGE_BUFFER_BIT | IGPUBuffer::EUF_TRANSFER_DST_BIT | IGPUBuffer::EUF_INLINE_UPDATE_VIA_CMDBUF | IGPUBuffer::EUF_SHADER_DEVICE_ADDRESS_BIT;
-				params.size = proceduralGeoms.size() * sizeof(SProceduralGeomInfo);
-				m_utils->createFilledDeviceLocalBufferOnDedMem(SIntendedSubmitInfo{ .queue = queue }, std::move(params), proceduralGeoms.data()).move_into(m_proceduralGeomInfoBuffer);
-			}
-
-			// get ICPUBuffers into ICPUBLAS
-			// TODO use one BLAS and multiple triangles/aabbs in one
-			const auto blasCount = std::size(cpuObjects) + 1;
-			const auto proceduralBlasIdx = std::size(cpuObjects);
-
-			std::array<smart_refctd_ptr<ICPUBottomLevelAccelerationStructure>, std::size(cpuObjects)+1u> cpuBlas;
-			for (uint32_t i = 0; i < blasCount; i++)
-			{
-				auto& blas = cpuBlas[i];
-				blas = make_smart_refctd_ptr<ICPUBottomLevelAccelerationStructure>();
-
-				if (i == proceduralBlasIdx)
-				{
-					auto aabbs = make_refctd_dynamic_array<smart_refctd_dynamic_array<ICPUBottomLevelAccelerationStructure::AABBs<ICPUBuffer>>>(1u);
-					auto primitiveCounts = make_refctd_dynamic_array<smart_refctd_dynamic_array<uint32_t>>(1u);
-
-					auto& aabb = aabbs->front();
-					auto& primCount = primitiveCounts->front();
-				
-					primCount = NumberOfProceduralGeometries;
-					aabb.data = { .offset = 0, .buffer = cpuProcBuffer };
-					aabb.stride = sizeof(IGPUBottomLevelAccelerationStructure::AABB_t);
-					aabb.geometryFlags = IGPUBottomLevelAccelerationStructure::GEOMETRY_FLAGS::OPAQUE_BIT; // only allow opaque for now
-
-					blas->setGeometries(std::move(aabbs), std::move(primitiveCounts));
-				}
-				else
-				{
-					auto triangles = make_refctd_dynamic_array<smart_refctd_dynamic_array<ICPUBottomLevelAccelerationStructure::Triangles<ICPUBuffer>>>(1u);
-					auto primitiveCounts = make_refctd_dynamic_array<smart_refctd_dynamic_array<uint32_t>>(1u);
-
-					auto& tri = triangles->front();
-					auto& primCount = primitiveCounts->front();
-					const auto& geom = cpuObjects[i];
-					const auto& cpuBuf = cpuTriBuffers[i];
-
-					const bool useIndex = geom.data.indexType != EIT_UNKNOWN;
-					const uint32_t vertexStride = geom.data.inputParams.bindings[0].stride;
-					const uint32_t numVertices = cpuBuf.vertex.buffer->getSize() / vertexStride;
-
-					if (useIndex)
-						primCount = geom.data.indexCount / 3;
-					else
-						primCount = numVertices / 3;
-
-					tri.vertexData[0] = cpuBuf.vertex;
-					tri.indexData = useIndex ? cpuBuf.index : cpuBuf.vertex;
-					tri.maxVertex = numVertices - 1;
-					tri.vertexStride = vertexStride;
-					tri.vertexFormat = EF_R32G32B32_SFLOAT;
-					tri.indexType = geom.data.indexType;
-					tri.geometryFlags = geom.material.isTransparent() ?
-						IGPUBottomLevelAccelerationStructure::GEOMETRY_FLAGS::NO_DUPLICATE_ANY_HIT_INVOCATION_BIT :
-						IGPUBottomLevelAccelerationStructure::GEOMETRY_FLAGS::OPAQUE_BIT;
-
-					blas->setGeometries(std::move(triangles), std::move(primitiveCounts));
-				}
-
-				auto blasFlags = bitflag(IGPUBottomLevelAccelerationStructure::BUILD_FLAGS::PREFER_FAST_TRACE_BIT) | IGPUBottomLevelAccelerationStructure::BUILD_FLAGS::ALLOW_COMPACTION_BIT;
-				if (i == proceduralBlasIdx)
-					blasFlags |= IGPUBottomLevelAccelerationStructure::BUILD_FLAGS::GEOMETRY_TYPE_IS_AABB_BIT;
-
-				blas->setBuildFlags(blasFlags);
-				blas->setContentHash(blas->computeContentHash());
-			}
-
-			auto geomInfoBuffer = ICPUBuffer::create({ std::size(cpuObjects) * sizeof(STriangleGeomInfo) });
-			STriangleGeomInfo* geomInfos = reinterpret_cast<STriangleGeomInfo*>(geomInfoBuffer->getPointer());
-
-			// get ICPUBLAS into ICPUTLAS
-			auto geomInstances = make_refctd_dynamic_array<smart_refctd_dynamic_array<ICPUTopLevelAccelerationStructure::PolymorphicInstance>>(blasCount);
-			{
-				uint32_t i = 0;
-				for (auto instance = geomInstances->begin(); instance != geomInstances->end(); instance++, i++)
-				{
-					const auto isProceduralInstance = i == proceduralBlasIdx;
-					ICPUTopLevelAccelerationStructure::StaticInstance inst;
-					inst.base.blas = cpuBlas[i];
-					inst.base.flags = static_cast<uint32_t>(IGPUTopLevelAccelerationStructure::INSTANCE_FLAGS::TRIANGLE_FACING_CULL_DISABLE_BIT);
-					inst.base.instanceCustomIndex = i;
-					inst.base.instanceShaderBindingTableRecordOffset = isProceduralInstance ? 2 : 0;;
-					inst.base.mask = 0xFF;
-					inst.transform = isProceduralInstance ? matrix3x4SIMD() : cpuObjects[i].transform;
-
-					instance->instance = inst;
-				}
-			}
-
-			auto cpuTlas = make_smart_refctd_ptr<ICPUTopLevelAccelerationStructure>();
-			cpuTlas->setInstances(std::move(geomInstances));
-			cpuTlas->setBuildFlags(IGPUTopLevelAccelerationStructure::BUILD_FLAGS::PREFER_FAST_TRACE_BIT);
-
-			// convert with asset converter
-			smart_refctd_ptr<CAssetConverter> converter = CAssetConverter::create({ .device = m_device.get(), .optimizer = {} });
-			struct MyInputs : CAssetConverter::SInputs
-			{
-				// For the GPU Buffers to be directly writeable and so that we don't need a Transfer Queue submit at all
-				inline uint32_t constrainMemoryTypeBits(const size_t groupCopyID, const IAsset* canonicalAsset, const blake3_hash_t& contentHash, const IDeviceMemoryBacked* memoryBacked) const override
-				{
-					assert(memoryBacked);
-					return memoryBacked->getObjectType() != IDeviceMemoryBacked::EOT_BUFFER ? (~0u) : rebarMemoryTypes;
-				}
-
-				uint32_t rebarMemoryTypes;
-			} inputs = {};
-			inputs.logger = m_logger.get();
-			inputs.rebarMemoryTypes = m_physicalDevice->getDirectVRAMAccessMemoryTypeBits();
-			// the allocator needs to be overriden to hand out memory ranges which have already been mapped so that the ReBAR fast-path can kick in
-			// (multiple buffers can be bound to same memory, but memory can only be mapped once at one place, so Asset Converter can't do it)
-			struct MyAllocator final : public IDeviceMemoryAllocator
-			{
-				ILogicalDevice* getDeviceForAllocations() const override { return device; }
-
-				SAllocation allocate(const SAllocateInfo& info) override
-				{
-					auto retval = device->allocate(info);
-					// map what is mappable by default so ReBAR checks succeed
-					if (retval.isValid() && retval.memory->isMappable())
-						retval.memory->map({ .offset = 0,.length = info.size });
-					return retval;
-				}
-
-				ILogicalDevice* device;
-			} myalloc;
-			myalloc.device = m_device.get();
-			inputs.allocator = &myalloc;
-
-			std::array<ICPUTopLevelAccelerationStructure*, 1u> tmpTlas;
-			std::array<ICPUBuffer*, 2 * std::size(cpuObjects) + 1u> tmpBuffers;
-			{
-				tmpTlas[0] = cpuTlas.get();
-				for (uint32_t i = 0; i < cpuObjects.size(); i++)
-				{
-					tmpBuffers[2 * i + 0] = cpuTriBuffers[i].vertex.buffer.get();
-					tmpBuffers[2 * i + 1] = cpuTriBuffers[i].index.buffer.get();
-				}
-				tmpBuffers[2 * proceduralBlasIdx] = cpuProcBuffer.get();
-
-				std::get<CAssetConverter::SInputs::asset_span_t<ICPUTopLevelAccelerationStructure>>(inputs.assets) = tmpTlas;
-				std::get<CAssetConverter::SInputs::asset_span_t<ICPUBuffer>>(inputs.assets) = tmpBuffers;
-			}
-
-			auto reservation = converter->reserve(inputs);
-			{
-				auto prepass = [&]<typename asset_type_t>(const auto & references) -> bool
-				{
-					auto objects = reservation.getGPUObjects<asset_type_t>();
-					uint32_t counter = {};
-					for (auto& object : objects)
-					{
-						auto gpu = object.value;
-						auto* reference = references[counter];
-
-						if (reference)
-						{
-							if (!gpu)
-							{
-								m_logger->log("Failed to convert a CPU object to GPU!", ILogger::ELL_ERROR);
-								return false;
-							}
-						}
-						counter++;
-					}
-					return true;
-				};
-
-				prepass.template operator() < ICPUTopLevelAccelerationStructure > (tmpTlas);
-				prepass.template operator() < ICPUBuffer > (tmpBuffers);
-			}
-
-			constexpr auto CompBufferCount = 2;
-			std::array<smart_refctd_ptr<IGPUCommandBuffer>, CompBufferCount> compBufs = {};
-			std::array<IQueue::SSubmitInfo::SCommandBufferInfo, CompBufferCount> compBufInfos = {};
-			{
-				auto pool = m_device->createCommandPool(queue->getFamilyIndex(), IGPUCommandPool::CREATE_FLAGS::RESET_COMMAND_BUFFER_BIT | IGPUCommandPool::CREATE_FLAGS::TRANSIENT_BIT);
-				pool->createCommandBuffers(IGPUCommandPool::BUFFER_LEVEL::PRIMARY, compBufs);
-				compBufs.front()->begin(IGPUCommandBuffer::USAGE::ONE_TIME_SUBMIT_BIT);
-				for (auto i = 0; i < CompBufferCount; i++)
-					compBufInfos[i].cmdbuf = compBufs[i].get();
-			}
-			auto compSema = m_device->createSemaphore(0u);
-			SIntendedSubmitInfo compute = {};
-			compute.queue = queue;
-			compute.scratchCommandBuffers = compBufInfos;
-			compute.scratchSemaphore = {
-				.semaphore = compSema.get(),
-				.value = 0u,
-				.stageMask = PIPELINE_STAGE_FLAGS::ACCELERATION_STRUCTURE_BUILD_BIT | PIPELINE_STAGE_FLAGS::ACCELERATION_STRUCTURE_COPY_BIT
-			};
-			// convert
-			{
-				smart_refctd_ptr<CAssetConverter::SConvertParams::scratch_for_device_AS_build_t> scratchAlloc;
-				{
-					constexpr auto MaxAlignment = 256;
-					constexpr auto MinAllocationSize = 1024;
-					const auto scratchSize = core::alignUp(reservation.getMaxASBuildScratchSize(false), MaxAlignment);
-
-
-					IGPUBuffer::SCreationParams creationParams = {};
-					creationParams.size = scratchSize;
-					creationParams.usage = IGPUBuffer::EUF_ACCELERATION_STRUCTURE_BUILD_INPUT_READ_ONLY_BIT | IGPUBuffer::EUF_SHADER_DEVICE_ADDRESS_BIT | IGPUBuffer::EUF_STORAGE_BUFFER_BIT;
-					auto scratchBuffer = m_device->createBuffer(std::move(creationParams));
-
-					auto reqs = scratchBuffer->getMemoryReqs();
-					reqs.memoryTypeBits &= m_physicalDevice->getDirectVRAMAccessMemoryTypeBits();
-
-					auto allocation = m_device->allocate(reqs, scratchBuffer.get(), IDeviceMemoryAllocation::EMAF_DEVICE_ADDRESS_BIT);
-					allocation.memory->map({ .offset = 0,.length = reqs.size });
-
-					scratchAlloc = make_smart_refctd_ptr<CAssetConverter::SConvertParams::scratch_for_device_AS_build_t>(
-						SBufferRange<video::IGPUBuffer>{0ull, scratchSize, std::move(scratchBuffer)},
-						core::allocator<uint8_t>(), MaxAlignment, MinAllocationSize
-					);
-				}
-
-				struct MyParams final : CAssetConverter::SConvertParams
-				{
-					inline uint32_t getFinalOwnerQueueFamily(const IGPUBuffer* buffer, const core::blake3_hash_t& createdFrom) override
-					{
-						return finalUser;
-					}
-					inline uint32_t getFinalOwnerQueueFamily(const IGPUAccelerationStructure* image, const core::blake3_hash_t& createdFrom) override
-					{
-						return finalUser;
-					}
-
-					uint8_t finalUser;
-				} params = {};
-				params.utilities = m_utils.get();
-				params.compute = &compute;
-				params.scratchForDeviceASBuild = scratchAlloc.get();
-				params.finalUser = queue->getFamilyIndex();
-
-				auto future = reservation.convert(params);
-				if (future.copy() != IQueue::RESULT::SUCCESS)
-				{
-					m_logger->log("Failed to await submission feature!", ILogger::ELL_ERROR);
-					return false;
-				}
-				// 2 submits, BLAS build, TLAS build, DO NOT ADD COMPACTIONS IN THIS EXAMPLE!
-				if (compute.getFutureScratchSemaphore().value>3)
-					m_logger->log("Overflow submitted on Compute Queue despite using ReBAR (no transfer submits or usage of staging buffer) and providing a AS Build Scratch Buffer of correctly queried max size!",system::ILogger::ELL_ERROR);
-
-				// assign gpu objects to output
-				auto&& tlases = reservation.getGPUObjects<ICPUTopLevelAccelerationStructure>();
-				m_gpuTlas = tlases[0].value;
-				auto&& buffers = reservation.getGPUObjects<ICPUBuffer>();
-				for (uint32_t i = 0; i < cpuObjects.size(); i++)
-				{
-					auto& cpuObject = cpuObjects[i];
-
-					m_gpuTriangleGeometries.push_back(ReferenceObjectGpu{
-					  .meta = cpuObject.meta,
-					  .bindings = {
-						.vertex = {.offset = 0, .buffer = buffers[2 * i + 0].value },
-						.index = {.offset = 0, .buffer = buffers[2 * i + 1].value },
-					  },
-					  .vertexStride = cpuObject.data.inputParams.bindings[0].stride,
-					  .indexType = cpuObject.data.indexType,
-					  .indexCount = cpuObject.data.indexCount,
-					  .material = hlsl::_static_cast<MaterialPacked>(cpuObject.material),
-					  .transform = cpuObject.transform,
-						});
-				}
-				m_proceduralAabbBuffer = buffers[2 * proceduralBlasIdx].value;
-
-				for (uint32_t i = 0; i < m_gpuTriangleGeometries.size(); i++)
-				{
-					const auto& gpuObject = m_gpuTriangleGeometries[i];
-					const uint64_t vertexBufferAddress = gpuObject.bindings.vertex.buffer->getDeviceAddress();
-					geomInfos[i] = {
-					  .material = gpuObject.material,
-					  .vertexBufferAddress = vertexBufferAddress,
-					  .indexBufferAddress = gpuObject.useIndex() ? gpuObject.bindings.index.buffer->getDeviceAddress() : vertexBufferAddress,
-					  .vertexStride = gpuObject.vertexStride,
-					  .objType = gpuObject.meta.type,
-					  .indexType = gpuObject.indexType,
-					  .smoothNormals = s_smoothNormals[gpuObject.meta.type],
-					};
-				}
-			}
-
-			{
-				IGPUBuffer::SCreationParams params;
-				params.usage = IGPUBuffer::EUF_STORAGE_BUFFER_BIT | IGPUBuffer::EUF_TRANSFER_DST_BIT | IGPUBuffer::EUF_INLINE_UPDATE_VIA_CMDBUF | IGPUBuffer::EUF_SHADER_DEVICE_ADDRESS_BIT;
-				params.size = geomInfoBuffer->getSize();
-				m_utils->createFilledDeviceLocalBufferOnDedMem(SIntendedSubmitInfo{ .queue = queue }, std::move(params), geomInfos).move_into(m_triangleGeomInfoBuffer);
-			}
-
-			return true;
-		}
-#endif
-		smart_refctd_ptr<CAssetConverter> m_converter;
-
-		smart_refctd_ptr<IWindow> m_window;
-		smart_refctd_ptr<CSimpleResizeSurface<ISimpleManagedSurface::ISwapchainResources>> m_surface;
-		smart_refctd_ptr<ISemaphore> m_semaphore;
-		uint64_t m_realFrameIx = 0;
-uint32_t m_frameAccumulationCounter = 0;
-		std::array<smart_refctd_ptr<IGPUCommandBuffer>, MaxFramesInFlight> m_cmdBufs;
-		ISimpleManagedSurface::SAcquireResult m_currentImageAcquire = {};
-
-		core::smart_refctd_ptr<InputSystem> m_inputSystem;
-		InputSystem::ChannelReader<IMouseEventChannel> m_mouse;
-		InputSystem::ChannelReader<IKeyboardEventChannel> m_keyboard;
-
-		struct CameraSetting
-		{
-			float fov = 60.f;
-			float zNear = 0.1f;
-			float zFar = 10000.f;
-			float moveSpeed = 1.f;
-			float rotateSpeed = 1.f;
-			float viewWidth = 10.f;
-			float camYAngle = 165.f / 180.f * 3.14159f;
-			float camXAngle = 32.f / 180.f * 3.14159f;
-
-		} m_cameraSetting;
-		Camera m_camera = Camera(core::vectorSIMDf(0, 0, 0), core::vectorSIMDf(0, 0, 0), core::matrix4SIMD());
-
-		video::CDumbPresentationOracle m_oracle;
-
-#if 0
-	struct C_UI
-	{
-		nbl::core::smart_refctd_ptr<nbl::ext::imgui::UI> manager;
-
-		struct
-		{
-			core::smart_refctd_ptr<video::IGPUSampler> gui, scene;
-		} samplers;
-
-		core::smart_refctd_ptr<IGPUDescriptorSet> descriptorSet;
-	} m_ui;
-	core::smart_refctd_ptr<IDescriptorPool> m_guiDescriptorSetPool;
-
-	core::vector<ReferenceObjectGpu> m_gpuTriangleGeometries;
-	core::vector<SProceduralGeomInfo> m_gpuIntersectionSpheres;
-	uint32_t m_intersectionHitGroupIdx;
-
-	smart_refctd_ptr<IGPUTopLevelAccelerationStructure> m_gpuTlas;
-	smart_refctd_ptr<IGPUBuffer> m_instanceBuffer;
-
-	smart_refctd_ptr<IGPUBuffer> m_triangleGeomInfoBuffer;
-	smart_refctd_ptr<IGPUBuffer> m_proceduralGeomInfoBuffer;
-	smart_refctd_ptr<IGPUBuffer> m_proceduralAabbBuffer;
-	smart_refctd_ptr<IGPUBuffer> m_indirectBuffer;
-
-	smart_refctd_ptr<IGPUImage> m_hdrImage;
-	smart_refctd_ptr<IGPUImageView> m_hdrImageView;
-
-	smart_refctd_ptr<IDescriptorPool> m_rayTracingDsPool;
-	smart_refctd_ptr<IGPUDescriptorSet> m_rayTracingDs;
-	smart_refctd_ptr<IGPURayTracingPipeline> m_rayTracingPipeline;
-	uint64_t m_rayTracingStackSize;
-	ShaderBindingTable m_shaderBindingTable;
-
-	smart_refctd_ptr<IGPUDescriptorSet> m_presentDs;
-	smart_refctd_ptr<IDescriptorPool> m_presentDsPool;
-	smart_refctd_ptr<IGPUGraphicsPipeline> m_presentPipeline;
-
-#endif
-};
-NBL_MAIN_FUNC(MeshLoadersApp)
diff --git a/29_MeshLoaders/pipeline.groovy b/29_MeshLoaders/pipeline.groovy
deleted file mode 100644
index 9a89cc786..000000000
--- a/29_MeshLoaders/pipeline.groovy
+++ /dev/null
@@ -1,50 +0,0 @@
-import org.DevshGraphicsProgramming.Agent
-import org.DevshGraphicsProgramming.BuilderInfo
-import org.DevshGraphicsProgramming.IBuilder
-
-class CPLYSTLDemoBuilder extends IBuilder
-{
-	public CPLYSTLDemoBuilder(Agent _agent, _info)
-	{
-		super(_agent, _info)
-	}
-	
-	@Override
-	public boolean prepare(Map axisMapping)
-	{
-		return true
-	}
-	
-	@Override
-  	public boolean build(Map axisMapping)
-	{
-		IBuilder.CONFIGURATION config = axisMapping.get("CONFIGURATION")
-		IBuilder.BUILD_TYPE buildType = axisMapping.get("BUILD_TYPE")
-		
-		def nameOfBuildDirectory = getNameOfBuildDirectory(buildType)
-		def nameOfConfig = getNameOfConfig(config)
-		
-		agent.execute("cmake --build ${info.rootProjectPath}/${nameOfBuildDirectory}/${info.targetProjectPathRelativeToRoot} --target ${info.targetBaseName} --config ${nameOfConfig} -j12 -v")
-		
-		return true
-	}
-	
-	@Override
-  	public boolean test(Map axisMapping)
-	{
-		return true
-	}
-	
-	@Override
-	public boolean install(Map axisMapping)
-	{
-		return true
-	}
-}
-
-def create(Agent _agent, _info)
-{
-	return new CPLYSTLDemoBuilder(_agent, _info)
-}
-
-return this
\ No newline at end of file
diff --git a/CMakeLists.txt b/CMakeLists.txt
index 8cf1364a5..d3c2e249a 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -42,7 +42,9 @@ if(NBL_BUILD_EXAMPLES)
 	# showcase use of FFT for post-FX Bloom  effect
 	add_subdirectory(11_FFT)
 	#
-	add_subdirectory(12_MeshLoaders EXCLUDE_FROM_ALL)
+	add_subdirectory(12_MeshLoaders)
+	#
+	#add_subdirectory(13_MaterialCompiler EXCLUDE_FROM_ALL)
 
 	# Waiting for a refactor
 	#add_subdirectory(27_PLYSTLDemo)

From 6d48373b21a76e24301f65b79bb92d3aedfd7d31 Mon Sep 17 00:00:00 2001
From: Arkadiusz Lachowicz <areklachowicz@gmail.com>
Date: Mon, 21 Jul 2025 17:38:25 +0200
Subject: [PATCH 493/529] update examples 23, 27 and 29, perform tests on both
 built-in off/on modes, some updates to BuiltinResourcesApplication.hpp to
 cover unique ex. build mount points

---
 23_Arithmetic2UnitTest/CMakeLists.txt         |  4 +-
 .../app_resources/shaderCommon.hlsl           |  2 +-
 .../app_resources/testSubgroup.comp.hlsl      |  2 +-
 .../app_resources/testWorkgroup.comp.hlsl     |  2 +-
 27_MPMCScheduler/CMakeLists.txt               | 54 ++++++++++++-------
 .../app_resources/schedulers/mpmc.hlsl        |  4 +-
 .../app_resources/shader.comp.hlsl            |  4 +-
 .../workgroup/pool_allocator.hlsl             |  2 +-
 27_MPMCScheduler/main.cpp                     | 40 ++++++++------
 29_Arithmetic2Bench/CMakeLists.txt            | 34 +++++-------
 .../app_resources/benchmarkSubgroup.comp.hlsl |  2 +-
 .../benchmarkWorkgroup.comp.hlsl              |  2 +-
 .../app_resources/shaderCommon.hlsl           |  2 +-
 CMakeLists.txt                                | 21 ++++----
 .../common/BuiltinResourcesApplication.hpp    | 32 ++++++-----
 15 files changed, 115 insertions(+), 92 deletions(-)

diff --git a/23_Arithmetic2UnitTest/CMakeLists.txt b/23_Arithmetic2UnitTest/CMakeLists.txt
index e510411f2..9e6a62f67 100644
--- a/23_Arithmetic2UnitTest/CMakeLists.txt
+++ b/23_Arithmetic2UnitTest/CMakeLists.txt
@@ -3,7 +3,9 @@ include(common)
 nbl_create_executable_project("" "" "" "")
 
 get_filename_component(MOUNT_POINT "${CMAKE_CURRENT_SOURCE_DIR}/app_resources" ABSOLUTE)
-file(GLOB_RECURSE KEYS RELATIVE ${MOUNT_POINT} CONFIGURE_DEPENDS app_resources/*.hlsl)
+file(GLOB_RECURSE KEYS RELATIVE ${MOUNT_POINT} app_resources/*.hlsl)
+list(FILTER KEYS EXCLUDE REGEX "preprocessed\\.hlsl$")
+
 NBL_CREATE_RESOURCE_ARCHIVE(
 	TARGET ${EXECUTABLE_NAME}_builtins
 	LINK_TO ${EXECUTABLE_NAME}
diff --git a/23_Arithmetic2UnitTest/app_resources/shaderCommon.hlsl b/23_Arithmetic2UnitTest/app_resources/shaderCommon.hlsl
index 3793b08f8..5baf9a28d 100644
--- a/23_Arithmetic2UnitTest/app_resources/shaderCommon.hlsl
+++ b/23_Arithmetic2UnitTest/app_resources/shaderCommon.hlsl
@@ -1,4 +1,4 @@
-#include "common.hlsl"
+#include "app_resources/common.hlsl"
 
 using namespace nbl;
 using namespace hlsl;
diff --git a/23_Arithmetic2UnitTest/app_resources/testSubgroup.comp.hlsl b/23_Arithmetic2UnitTest/app_resources/testSubgroup.comp.hlsl
index 3105aec56..de1e813f1 100644
--- a/23_Arithmetic2UnitTest/app_resources/testSubgroup.comp.hlsl
+++ b/23_Arithmetic2UnitTest/app_resources/testSubgroup.comp.hlsl
@@ -7,7 +7,7 @@
 #include "nbl/builtin/hlsl/subgroup2/arithmetic_portability.hlsl"
 #include "nbl/builtin/hlsl/subgroup2/arithmetic_params.hlsl"
 
-#include "shaderCommon.hlsl"
+#include "app_resources/shaderCommon.hlsl"
 #include "nbl/builtin/hlsl/workgroup2/basic.hlsl"
 
 template<class Binop, class device_capabilities>
diff --git a/23_Arithmetic2UnitTest/app_resources/testWorkgroup.comp.hlsl b/23_Arithmetic2UnitTest/app_resources/testWorkgroup.comp.hlsl
index a3e70b8ff..664e2f472 100644
--- a/23_Arithmetic2UnitTest/app_resources/testWorkgroup.comp.hlsl
+++ b/23_Arithmetic2UnitTest/app_resources/testWorkgroup.comp.hlsl
@@ -7,7 +7,7 @@
 
 using config_t = WORKGROUP_CONFIG_T;
 
-#include "shaderCommon.hlsl"
+#include "app_resources/shaderCommon.hlsl"
 
 typedef vector<uint32_t, config_t::ItemsPerInvocation_0> type_t;
 
diff --git a/27_MPMCScheduler/CMakeLists.txt b/27_MPMCScheduler/CMakeLists.txt
index a434ff32a..08bccdb1b 100644
--- a/27_MPMCScheduler/CMakeLists.txt
+++ b/27_MPMCScheduler/CMakeLists.txt
@@ -1,24 +1,40 @@
-include(common RESULT_VARIABLE RES)
-if(NOT RES)
-	message(FATAL_ERROR "common.cmake not found. Should be in {repo_root}/cmake directory")
-endif()
+include(common)
 
-nbl_create_executable_project("" "" "" "" "${NBL_EXECUTABLE_PROJECT_CREATION_PCH_TARGET}")
+nbl_create_executable_project("" "" "" "")
 
-if(NBL_EMBED_BUILTIN_RESOURCES)
-	set(_BR_TARGET_ ${EXECUTABLE_NAME}_builtinResourceData)
-	set(RESOURCE_DIR "app_resources")
+set(OUTPUT_DIRECTORY "${CMAKE_CURRENT_BINARY_DIR}/auto-gen")
+file(GLOB_RECURSE DEPENDS app_resources/*.hlsl)
+list(FILTER DEPENDS EXCLUDE REGEX "preprocessed\\.hlsl$")
 
-	get_filename_component(_SEARCH_DIRECTORIES_ "${CMAKE_CURRENT_SOURCE_DIR}" ABSOLUTE)
-	get_filename_component(_OUTPUT_DIRECTORY_SOURCE_ "${CMAKE_CURRENT_BINARY_DIR}/src" ABSOLUTE)
-	get_filename_component(_OUTPUT_DIRECTORY_HEADER_ "${CMAKE_CURRENT_BINARY_DIR}/include" ABSOLUTE)
+set(JSON [=[
+[
+    {
+		"INPUT": "app_resources/shader.comp.hlsl",
+		"KEY": "shader",
+        "COMPILE_OPTIONS": ["-T", "cs_6_8"],
+		"DEPENDS": [],
+		"CAPS": []
+    }
+]
+]=])
 
-    file(GLOB_RECURSE BUILTIN_RESOURCE_FILES RELATIVE "${CMAKE_CURRENT_SOURCE_DIR}/${RESOURCE_DIR}" CONFIGURE_DEPENDS "${CMAKE_CURRENT_SOURCE_DIR}/${RESOURCE_DIR}/*")
-    foreach(RES_FILE ${BUILTIN_RESOURCE_FILES})
-      LIST_BUILTIN_RESOURCE(RESOURCES_TO_EMBED "${RES_FILE}")
-    endforeach()
+NBL_CREATE_NSC_COMPILE_RULES(
+	TARGET ${EXECUTABLE_NAME}SPIRV
+	LINK_TO ${EXECUTABLE_NAME}
+	DEPENDS ${DEPENDS}
+	BINARY_DIR ${OUTPUT_DIRECTORY}
+	MOUNT_POINT_DEFINE NBL_THIS_EXAMPLE_BUILD_MOUNT_POINT
+	COMMON_OPTIONS -I "${CMAKE_CURRENT_SOURCE_DIR}"
+	OUTPUT_VAR KEYS
+	INCLUDE nbl/this_example/builtin/build/spirv/keys.hpp
+	NAMESPACE nbl::this_example::builtin::build
+	INPUTS ${JSON}
+)
 
-	ADD_CUSTOM_BUILTIN_RESOURCES(${_BR_TARGET_} RESOURCES_TO_EMBED "${_SEARCH_DIRECTORIES_}" "${RESOURCE_DIR}" "nbl::this_example::builtin" "${_OUTPUT_DIRECTORY_HEADER_}" "${_OUTPUT_DIRECTORY_SOURCE_}")
-
-	LINK_BUILTIN_RESOURCES_TO_TARGET(${EXECUTABLE_NAME} ${_BR_TARGET_})
-endif()
\ No newline at end of file
+NBL_CREATE_RESOURCE_ARCHIVE(
+	TARGET ${EXECUTABLE_NAME}_builtinsBuild
+	LINK_TO ${EXECUTABLE_NAME}
+	BIND "${OUTPUT_DIRECTORY}"
+	BUILTINS ${KEYS}
+	NAMESPACE nbl::this_example::builtin::build
+)
\ No newline at end of file
diff --git a/27_MPMCScheduler/app_resources/schedulers/mpmc.hlsl b/27_MPMCScheduler/app_resources/schedulers/mpmc.hlsl
index 184f3702a..836c91576 100644
--- a/27_MPMCScheduler/app_resources/schedulers/mpmc.hlsl
+++ b/27_MPMCScheduler/app_resources/schedulers/mpmc.hlsl
@@ -1,8 +1,8 @@
 #ifndef _NBL_HLSL_SCHEDULERS_MPMC_HLSL_
 #define _NBL_HLSL_SCHEDULERS_MPMC_HLSL_
 
-//#include "../workgroup/stack.hlsl"
-//#include "mpmc_queue.hlsl"
+//#include "app_resources/workgroup/stack.hlsl"
+//#include "app_resources/mpmc_queue.hlsl"
 
 #include "nbl/builtin/hlsl/workgroup/scratch_size.hlsl"
 #include "nbl/builtin/hlsl/workgroup/arithmetic.hlsl"
diff --git a/27_MPMCScheduler/app_resources/shader.comp.hlsl b/27_MPMCScheduler/app_resources/shader.comp.hlsl
index 966963761..3055ad618 100644
--- a/27_MPMCScheduler/app_resources/shader.comp.hlsl
+++ b/27_MPMCScheduler/app_resources/shader.comp.hlsl
@@ -1,6 +1,6 @@
 //#include "nbl/builtin/hlsl/memory_accessor.hlsl"
 
-#include "common.hlsl"
+#include "app_resources/common.hlsl"
 
 #include "nbl/builtin/hlsl/limits.hlsl"
 #include "nbl/builtin/hlsl/numbers.hlsl"
@@ -156,7 +156,7 @@ struct SharedAccessor
 };
 
 //
-#include "schedulers/mpmc.hlsl"
+#include "app_resources/schedulers/mpmc.hlsl"
 struct SubgroupCaps
 {
     NBL_CONSTEXPR_STATIC_INLINE bool shaderSubgroupArithmetic = true;
diff --git a/27_MPMCScheduler/app_resources/workgroup/pool_allocator.hlsl b/27_MPMCScheduler/app_resources/workgroup/pool_allocator.hlsl
index 6685fd5fc..e1532f945 100644
--- a/27_MPMCScheduler/app_resources/workgroup/pool_allocator.hlsl
+++ b/27_MPMCScheduler/app_resources/workgroup/pool_allocator.hlsl
@@ -1,7 +1,7 @@
 #ifndef _NBL_HLSL_WORKGROUP_POOL_ALLOCATOR_HLSL_
 #define _NBL_HLSL_WORKGROUP_POOL_ALLOCATOR_HLSL_
 
-#include "workgroup/stack.hlsl"
+#include "app_resources/workgroup/stack.hlsl"
 
 namespace nbl
 {
diff --git a/27_MPMCScheduler/main.cpp b/27_MPMCScheduler/main.cpp
index 580335a35..0963f86e5 100644
--- a/27_MPMCScheduler/main.cpp
+++ b/27_MPMCScheduler/main.cpp
@@ -4,6 +4,7 @@
 
 
 #include "nbl/examples/examples.hpp"
+#include "nbl/this_example/builtin/build/spirv/keys.hpp"
 
 using namespace nbl;
 using namespace nbl::core;
@@ -73,22 +74,29 @@ class MPMCSchedulerApp final : public SimpleWindowedApplication, public BuiltinR
 
 			smart_refctd_ptr<IShader> shader;
 			{
-				IAssetLoader::SAssetLoadParams lp = {};
-				lp.logger = m_logger.get();
-				lp.workingDirectory = ""; // virtual root
-				auto assetBundle = m_assetMgr->getAsset("app_resources/shader.comp.hlsl", lp);
-				const auto assets = assetBundle.getContents();
-				if (assets.empty())
-					return logFail("Failed to load shader from disk");
-
-				// lets go straight from ICPUSpecializedShader to IGPUSpecializedShader
-				auto source = IAsset::castDown<IShader>(assets[0]);
-				if (!source)
-					return logFail("Failed to load shader from disk");
-
-				shader = m_device->compileShader({ source.get() });
-				if (!shader)
-					return false;
+				// load shader
+				{
+					IAssetLoader::SAssetLoadParams lp = {};
+					lp.logger = m_logger.get();
+					lp.workingDirectory = "";
+
+					auto key = "app_resources/" + nbl::this_example::builtin::build::get_spirv_key<"shader">(m_device.get());
+					const auto bundle = m_assetMgr->getAsset(key.data(), lp);
+
+					const auto contents = bundle.getContents();
+
+					if (contents.empty())
+						return logFail("Failed to load shader from disk");
+
+					if (bundle.getAssetType() != IAsset::ET_SHADER)
+						return logFail("Loaded asset has wrong type!");
+
+					shader = IAsset::castDown<IShader>(contents[0]);
+
+					if (!shader)
+						false;
+				}
+
 			}
 			
 			smart_refctd_ptr<IGPUDescriptorSetLayout> dsLayout;
diff --git a/29_Arithmetic2Bench/CMakeLists.txt b/29_Arithmetic2Bench/CMakeLists.txt
index 0724366c9..9e6a62f67 100644
--- a/29_Arithmetic2Bench/CMakeLists.txt
+++ b/29_Arithmetic2Bench/CMakeLists.txt
@@ -1,25 +1,15 @@
+include(common)
 
-include(common RESULT_VARIABLE RES)
-if(NOT RES)
-	message(FATAL_ERROR "common.cmake not found. Should be in {repo_root}/cmake directory")
-endif()
+nbl_create_executable_project("" "" "" "")
 
-nbl_create_executable_project("" "" "" "" "${NBL_EXECUTABLE_PROJECT_CREATION_PCH_TARGET}")
+get_filename_component(MOUNT_POINT "${CMAKE_CURRENT_SOURCE_DIR}/app_resources" ABSOLUTE)
+file(GLOB_RECURSE KEYS RELATIVE ${MOUNT_POINT} app_resources/*.hlsl)
+list(FILTER KEYS EXCLUDE REGEX "preprocessed\\.hlsl$")
 
-if(NBL_EMBED_BUILTIN_RESOURCES)
-	set(_BR_TARGET_ ${EXECUTABLE_NAME}_builtinResourceData)
-	set(RESOURCE_DIR "app_resources")
-
-	get_filename_component(_SEARCH_DIRECTORIES_ "${CMAKE_CURRENT_SOURCE_DIR}" ABSOLUTE)
-	get_filename_component(_OUTPUT_DIRECTORY_SOURCE_ "${CMAKE_CURRENT_BINARY_DIR}/src" ABSOLUTE)
-	get_filename_component(_OUTPUT_DIRECTORY_HEADER_ "${CMAKE_CURRENT_BINARY_DIR}/include" ABSOLUTE)
-
-    file(GLOB_RECURSE BUILTIN_RESOURCE_FILES RELATIVE "${CMAKE_CURRENT_SOURCE_DIR}/${RESOURCE_DIR}" CONFIGURE_DEPENDS "${CMAKE_CURRENT_SOURCE_DIR}/${RESOURCE_DIR}/*")
-    foreach(RES_FILE ${BUILTIN_RESOURCE_FILES})
-      LIST_BUILTIN_RESOURCE(RESOURCES_TO_EMBED "${RES_FILE}")
-    endforeach()
-
-	ADD_CUSTOM_BUILTIN_RESOURCES(${_BR_TARGET_} RESOURCES_TO_EMBED "${_SEARCH_DIRECTORIES_}" "${RESOURCE_DIR}" "nbl::this_example::builtin" "${_OUTPUT_DIRECTORY_HEADER_}" "${_OUTPUT_DIRECTORY_SOURCE_}")
-
-	LINK_BUILTIN_RESOURCES_TO_TARGET(${EXECUTABLE_NAME} ${_BR_TARGET_})
-endif()
\ No newline at end of file
+NBL_CREATE_RESOURCE_ARCHIVE(
+	TARGET ${EXECUTABLE_NAME}_builtins
+	LINK_TO ${EXECUTABLE_NAME}
+	BIND ${MOUNT_POINT}
+	BUILTINS ${KEYS}
+	NAMESPACE nbl::this_example::builtin
+)
\ No newline at end of file
diff --git a/29_Arithmetic2Bench/app_resources/benchmarkSubgroup.comp.hlsl b/29_Arithmetic2Bench/app_resources/benchmarkSubgroup.comp.hlsl
index f6ad3e678..018672386 100644
--- a/29_Arithmetic2Bench/app_resources/benchmarkSubgroup.comp.hlsl
+++ b/29_Arithmetic2Bench/app_resources/benchmarkSubgroup.comp.hlsl
@@ -8,7 +8,7 @@
 #include "nbl/builtin/hlsl/subgroup2/arithmetic_portability.hlsl"
 #include "nbl/builtin/hlsl/random/xoroshiro.hlsl"
 
-#include "shaderCommon.hlsl"
+#include "app_resources/shaderCommon.hlsl"
 #include "nbl/builtin/hlsl/workgroup2/basic.hlsl"
 
 template<class Binop, class device_capabilities>
diff --git a/29_Arithmetic2Bench/app_resources/benchmarkWorkgroup.comp.hlsl b/29_Arithmetic2Bench/app_resources/benchmarkWorkgroup.comp.hlsl
index 58912691f..8442ecc38 100644
--- a/29_Arithmetic2Bench/app_resources/benchmarkWorkgroup.comp.hlsl
+++ b/29_Arithmetic2Bench/app_resources/benchmarkWorkgroup.comp.hlsl
@@ -8,7 +8,7 @@
 
 using config_t = WORKGROUP_CONFIG_T;
 
-#include "shaderCommon.hlsl"
+#include "app_resources/shaderCommon.hlsl"
 
 typedef vector<uint32_t, config_t::ItemsPerInvocation_0> type_t;
 
diff --git a/29_Arithmetic2Bench/app_resources/shaderCommon.hlsl b/29_Arithmetic2Bench/app_resources/shaderCommon.hlsl
index 242ededd8..ec5824a21 100644
--- a/29_Arithmetic2Bench/app_resources/shaderCommon.hlsl
+++ b/29_Arithmetic2Bench/app_resources/shaderCommon.hlsl
@@ -1,4 +1,4 @@
-#include "common.hlsl"
+#include "app_resources/common.hlsl"
 
 using namespace nbl;
 using namespace hlsl;
diff --git a/CMakeLists.txt b/CMakeLists.txt
index 80b3889e4..1798caa94 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -94,15 +94,18 @@ if(NBL_BUILD_EXAMPLES)
 
 	# we link common example api library and force examples to reuse its PCH
 	foreach(T IN LISTS TARGETS)
-        target_link_libraries(${T} PUBLIC ${NBL_EXAMPLES_API_TARGET})
-		target_include_directories(${T} PUBLIC $<TARGET_PROPERTY:${NBL_EXAMPLES_API_TARGET},INCLUDE_DIRECTORIES>)
-		set_target_properties(${T} PROPERTIES DISABLE_PRECOMPILE_HEADERS OFF)
-		target_precompile_headers(${T} REUSE_FROM "${NBL_EXAMPLES_API_TARGET}")
-
-		if(NBL_EMBED_BUILTIN_RESOURCES)
-			LINK_BUILTIN_RESOURCES_TO_TARGET(${T} NblExtExamplesAPIBuiltinsSource)
-			LINK_BUILTIN_RESOURCES_TO_TARGET(${T} NblExtExamplesAPIBuiltinsInclude)
-			LINK_BUILTIN_RESOURCES_TO_TARGET(${T} NblExtExamplesAPIBuiltinsBuild)
+		get_target_property(TYPE ${T} TYPE)
+		if(NOT ${TYPE} MATCHES INTERFACE)
+			target_link_libraries(${T} PUBLIC ${NBL_EXAMPLES_API_TARGET})
+			target_include_directories(${T} PUBLIC $<TARGET_PROPERTY:${NBL_EXAMPLES_API_TARGET},INCLUDE_DIRECTORIES>)
+			set_target_properties(${T} PROPERTIES DISABLE_PRECOMPILE_HEADERS OFF)
+			target_precompile_headers(${T} REUSE_FROM "${NBL_EXAMPLES_API_TARGET}")
+
+			if(NBL_EMBED_BUILTIN_RESOURCES)
+				LINK_BUILTIN_RESOURCES_TO_TARGET(${T} NblExtExamplesAPIBuiltinsSource)
+				LINK_BUILTIN_RESOURCES_TO_TARGET(${T} NblExtExamplesAPIBuiltinsInclude)
+				LINK_BUILTIN_RESOURCES_TO_TARGET(${T} NblExtExamplesAPIBuiltinsBuild)
+			endif()
 		endif()
     endforeach()
 
diff --git a/common/include/nbl/examples/common/BuiltinResourcesApplication.hpp b/common/include/nbl/examples/common/BuiltinResourcesApplication.hpp
index b0a21fb05..19a5482a0 100644
--- a/common/include/nbl/examples/common/BuiltinResourcesApplication.hpp
+++ b/common/include/nbl/examples/common/BuiltinResourcesApplication.hpp
@@ -14,13 +14,9 @@
 	#if __has_include("nbl/this_example/builtin/CArchive.h")
 		#include "nbl/this_example/builtin/CArchive.h"
 	#endif
-	// TODO: (**) there should be also 5th arch "nbl/this_example/builtin/build/CArchive.h"
-	/*
-		#if __has_include("nbl/this_example/builtin/build/CArchive.h")
-		#include "nbl/this_example/builtin/build/CArchive.h"
-		#endif
-	*/
-	//! this ain't meant to be the same as this_example ordinary archive
+	#if __has_include("nbl/this_example/builtin/build/CArchive.h")
+	#include "nbl/this_example/builtin/build/CArchive.h"
+	#endif
 #endif
 
 namespace nbl::examples
@@ -45,30 +41,38 @@ class BuiltinResourcesApplication : public virtual application_templates::MonoAs
 
 			using namespace core;
 
-			smart_refctd_ptr<system::IFileArchive> examplesHeaderArch,examplesSourceArch,examplesBuildSpirvArch,thisExampleArch;
+			smart_refctd_ptr<system::IFileArchive> examplesHeaderArch,examplesSourceArch,examplesBuildArch,thisExampleArch, thisExampleBuildArch;
 			#ifdef NBL_EMBED_BUILTIN_RESOURCES
 			examplesHeaderArch = core::make_smart_refctd_ptr<nbl::builtin::examples::include::CArchive>(smart_refctd_ptr(m_logger));
 			examplesSourceArch = core::make_smart_refctd_ptr<nbl::builtin::examples::src::CArchive>(smart_refctd_ptr(m_logger));
-			examplesBuildSpirvArch = core::make_smart_refctd_ptr<nbl::builtin::examples::build::CArchive>(smart_refctd_ptr(m_logger));
+			examplesBuildArch = core::make_smart_refctd_ptr<nbl::builtin::examples::build::CArchive>(smart_refctd_ptr(m_logger));
 
 			#ifdef _NBL_THIS_EXAMPLE_BUILTIN_C_ARCHIVE_H_
 				thisExampleArch = make_smart_refctd_ptr<nbl::this_example::builtin::CArchive>(smart_refctd_ptr(m_logger));
 			#endif
-			// TODO: (**)
+
+			#ifdef _NBL_THIS_EXAMPLE_BUILTIN_BUILD_C_ARCHIVE_H_
+				thisExampleBuildArch = make_smart_refctd_ptr<nbl::this_example::builtin::build::CArchive>(smart_refctd_ptr(m_logger));
+			#endif
+
 			#else
 			examplesHeaderArch = make_smart_refctd_ptr<system::CMountDirectoryArchive>(localInputCWD/"../common/include/nbl/examples",smart_refctd_ptr(m_logger),m_system.get());
 			examplesSourceArch = make_smart_refctd_ptr<system::CMountDirectoryArchive>(localInputCWD/"../common/src/nbl/examples",smart_refctd_ptr(m_logger),m_system.get());
-			examplesBuildSpirvArch = make_smart_refctd_ptr<system::CMountDirectoryArchive>(NBL_EXAMPLES_BUILD_MOUNT_POINT, smart_refctd_ptr(m_logger), m_system.get());
+			examplesBuildArch = make_smart_refctd_ptr<system::CMountDirectoryArchive>(NBL_EXAMPLES_BUILD_MOUNT_POINT, smart_refctd_ptr(m_logger), m_system.get());
 			thisExampleArch = make_smart_refctd_ptr<system::CMountDirectoryArchive>(localInputCWD/"app_resources",smart_refctd_ptr(m_logger),m_system.get());
-			// TODO: (**)
+			#ifdef NBL_THIS_EXAMPLE_BUILD_MOUNT_POINT
+				thisExampleBuildArch = make_smart_refctd_ptr<system::CMountDirectoryArchive>(NBL_THIS_EXAMPLE_BUILD_MOUNT_POINT, smart_refctd_ptr(m_logger), m_system.get());
+			#endif
 			#endif
 			// yes all 3 aliases are meant to be the same
 			m_system->mount(std::move(examplesHeaderArch),"nbl/examples");
 			m_system->mount(std::move(examplesSourceArch),"nbl/examples");
-			m_system->mount(std::move(examplesBuildSpirvArch),"nbl/examples");
+			m_system->mount(std::move(examplesBuildArch),"nbl/examples");
 			if (thisExampleArch)
 				m_system->mount(std::move(thisExampleArch),"app_resources");
-			// TODO: (**)
+
+			if(thisExampleBuildArch)
+				m_system->mount(std::move(thisExampleBuildArch), "app_resources");
 
 			return true;
 		}

From fbbec198ea42f911c9b7c05bbde40e83e2bc07dc Mon Sep 17 00:00:00 2001
From: Erfan Ahmadi <ahmadierfan99@gmail.com>
Date: Tue, 22 Jul 2025 10:10:18 +0400
Subject: [PATCH 494/529] Miter(PolylineConnector) Fix with Viewport Rotation

---
 62_CAD/Polyline.h                             |  3 --
 62_CAD/main.cpp                               |  4 +-
 62_CAD/shaders/main_pipeline/common.hlsl      |  7 +--
 .../main_pipeline/fragment_shader.hlsl        | 38 ++++++++++++---
 .../shaders/main_pipeline/vertex_shader.hlsl  | 46 +++++++++++--------
 5 files changed, 63 insertions(+), 35 deletions(-)

diff --git a/62_CAD/Polyline.h b/62_CAD/Polyline.h
index bee5650c7..31ba9eb15 100644
--- a/62_CAD/Polyline.h
+++ b/62_CAD/Polyline.h
@@ -1116,9 +1116,6 @@ class CPolyline : public CPolylineBase
 			if (crossProductZ < 0.0f)
 				res.v = -res.v;
 
-			// Negating y to avoid doing it in vertex shader when working in screen space, where y is in the opposite direction of worldspace y direction
-			res.v.y = -res.v.y;
-
 			m_polylineConnector.push_back(res);
 		}
 	}
diff --git a/62_CAD/main.cpp b/62_CAD/main.cpp
index 459ffe6ea..c920f0e1f 100644
--- a/62_CAD/main.cpp
+++ b/62_CAD/main.cpp
@@ -1492,9 +1492,9 @@ class ComputerAidedDesign final : public nbl::examples::SimpleWindowedApplicatio
 		projectionToNDC = m_Camera.constructViewProjection();
 
 		// TEST CAMERA ROTATION
-#if 0
+#if 1
 		// double rotation = 0.25 * PI<double>();
-		double rotation = abs(cos(m_timeElapsed * 0.0001)) * PI<double>();
+		double rotation = abs(cos(m_timeElapsed * 0.0004)) * 0.25 * PI<double>() ;
 		float64_t2 rotationVec = float64_t2(cos(rotation), sin(rotation));
 		float64_t3x3 rotationParameter = float64_t3x3 {
 			rotationVec.x, rotationVec.y, 0.0,
diff --git a/62_CAD/shaders/main_pipeline/common.hlsl b/62_CAD/shaders/main_pipeline/common.hlsl
index 7ce0e2adf..fc80c67f8 100644
--- a/62_CAD/shaders/main_pipeline/common.hlsl
+++ b/62_CAD/shaders/main_pipeline/common.hlsl
@@ -84,12 +84,13 @@ struct PSInput
     [[vk::location(5)]] nointerpolation float4 data4 : COLOR4;
     // Data segments that need interpolation, mostly for hatches
     [[vk::location(6)]] float4 interp_data5 : COLOR5;
+    [[vk::location(7)]] nointerpolation float data6 : COLOR6;
+    
 #ifdef FRAGMENT_SHADER_INPUT
-    [[vk::location(7)]] [[vk::ext_decorate(/*spv::DecoratePerVertexKHR*/5285)]] float3 vertexScreenSpacePos[3] : COLOR6;
+    [[vk::location(8)]] [[vk::ext_decorate(/*spv::DecoratePerVertexKHR*/5285)]] float3 vertexScreenSpacePos[3] : COLOR7;
 #else
-    [[vk::location(7)]] float3 vertexScreenSpacePos : COLOR6;
+    [[vk::location(8)]] float3 vertexScreenSpacePos : COLOR7;
 #endif
-    [[vk::location(14)]] nointerpolation float data6 : COLOR7; // TODO: Why is location 8 consumed by SV_Position
     // ArcLenCalculator<float>
 
     // Set functions used in vshader, get functions used in fshader
diff --git a/62_CAD/shaders/main_pipeline/fragment_shader.hlsl b/62_CAD/shaders/main_pipeline/fragment_shader.hlsl
index 222384c55..1783cb145 100644
--- a/62_CAD/shaders/main_pipeline/fragment_shader.hlsl
+++ b/62_CAD/shaders/main_pipeline/fragment_shader.hlsl
@@ -11,6 +11,7 @@
 //#include <nbl/builtin/hlsl/spirv_intrinsics/fragment_shader_barycentric.hlsl>
 
 // sdf of Isosceles Trapezoid y-aligned by https://iquilezles.org/articles/distfunctions2d/
+// Trapezoid centered around origin (0,0), the top edge has length r2, the bottom edge has length r1, the height of the trapezoid is he*2.0
 float sdTrapezoid(float2 p, float r1, float r2, float he)
 {
     float2 k1 = float2(r2, he);
@@ -34,14 +35,40 @@ float2 sdLineDstVec(float2 P, float2 A, float2 B)
     return PA - BA * h;
 }
 
+/*
+                    XXXXXXX b XXXXXX              Long Base (len = rb)
+                   X                X            
+                  X                 X            
+                 X                   X           
+                X    XXXXXXXXXXX      X          
+               X XXXX     |     XXXX  X          
+              XXX         |         XXXX         
+            XX            |            XX        
+           XX             |             XX       
+          XX              |              XX      
+         XX               T Trapz Center XX      (2) p.y = 0 after p.y = p.y - halfHeight + radius
+        XX                |               XX     
+       X X                C Circle Center  X     (1) p = (0,0) at circle center
+      X  X                |                X     
+     X    X               |               X X    
+    X     X               |               X  X   
+   X       X              |              X   X   
+  X         XX            |            XX     X  
+ X            XXX         |         XXX        X 
+X                XXXX     |     XXXX           X 
+XXXXXXXXXXXXXXXXXXXXXXXXX a XXXXXXXXXXXXXXXXXXXXX Short Base (len = ra)
+*/
+// p is in circle's space (the circle centered at line intersection and radius = thickness)
+// a and b are points at each trapezoid base (short and long base)
+// TODO[Optimization] we can probably send less info, since we only use length of b-a and the normalize vector
 float miterSDF(float2 p, float thickness, float2 a, float2 b, float ra, float rb)
 {
-    float h = length(b - a) / 2.0;
+    float halfHeight = length(b - a) / 2.0;
     float2 d = normalize(b - a);
     float2x2 rot = float2x2(d.y, -d.x, d.x, d.y);
-    p = mul(rot, p);
-    p.y -= h - thickness;
-    return sdTrapezoid(p, ra, rb, h);
+    p = mul(rot, p); // rotate(change of basis) such that the point is now in the space where trapezoid is y-axis aligned, see (1) above 
+    p.y = p.y - halfHeight + thickness; // see (2) above
+    return sdTrapezoid(p, ra, rb, halfHeight);
 }
 
 // We need to specialize color calculation based on FragmentShaderInterlock feature availability for our transparency algorithm
@@ -242,9 +269,6 @@ float4 fragMain(PSInput input) : SV_TARGET
 
             }
             localAlpha = 1.0f - smoothstep(-globals.antiAliasingFactor, globals.antiAliasingFactor, distance);
-            
-            // if (objType != ObjectType::POLYLINE_CONNECTOR)
-            //    localAlpha *= 0.3f;
         }
         else if (objType == ObjectType::CURVE_BOX) 
         {
diff --git a/62_CAD/shaders/main_pipeline/vertex_shader.hlsl b/62_CAD/shaders/main_pipeline/vertex_shader.hlsl
index 5280e7451..809e6d49b 100644
--- a/62_CAD/shaders/main_pipeline/vertex_shader.hlsl
+++ b/62_CAD/shaders/main_pipeline/vertex_shader.hlsl
@@ -68,6 +68,12 @@ float2 transformPointScreenSpace(pfloat64_t3x3 transformation, uint32_t2 resolut
 
     return _static_cast<float2>(result);
 }
+float2 transformVectorScreenSpace(pfloat64_t3x3 transformation, uint32_t2 resolution, pfloat64_t2 vec2d)
+{
+     pfloat64_t2 ndc = transformVectorNdc(transformation, vec2d);
+     pfloat64_t2 result = (ndc) * 0.5f * _static_cast<pfloat64_t2>(resolution);
+     return _static_cast<float2>(result);
+}
 float32_t4 transformFromSreenSpaceToNdc(float2 pos, uint32_t2 resolution)
 {
     return float32_t4((pos.xy / (float32_t2)resolution) * 2.0f - 1.0f, 0.0f, 1.0f);
@@ -444,29 +450,33 @@ PSInput vtxMain(uint vertexID : SV_VertexID)
                     const float2 circleCenterScreenSpace = transformPointScreenSpace(clipProjectionData.projectionToNDC, globals.resolution, circleCenter);
                     outV.setPolylineConnectorCircleCenter(circleCenterScreenSpace);
 
+                    // to better understand variables at play, and the circle space, see documentation of `miterSDF` in fragment shader
+                    // length of vector from circle center to intersection position (normalized so that circle radius = line thickness = 1.0)
+                    float vLen = length(v);
+                    float2 intersectionDirection_Screenspace = normalize(transformVectorScreenSpace(clipProjectionData.projectionToNDC, globals.resolution, v));
+                    const float2 v_Screenspace = intersectionDirection_Screenspace * vLen;
+
                     // Find other miter vertices
                     const float sinHalfAngleBetweenNormals = sqrt(1.0f - (cosHalfAngleBetweenNormals * cosHalfAngleBetweenNormals));
                     const float32_t2x2 rotationMatrix = float32_t2x2(cosHalfAngleBetweenNormals, -sinHalfAngleBetweenNormals, sinHalfAngleBetweenNormals, cosHalfAngleBetweenNormals);
 
                     // Pass the precomputed trapezoid values for the sdf
                     {
-                        float vLen = length(v);
-                        float2 intersectionDirection = v / vLen;
-
                         float longBase = sinHalfAngleBetweenNormals;
                         float shortBase = max((vLen - globals.miterLimit) * cosHalfAngleBetweenNormals / sinHalfAngleBetweenNormals, 0.0);
                         // height of the trapezoid / triangle
                         float hLen = min(globals.miterLimit, vLen);
 
-                        outV.setPolylineConnectorTrapezoidStart(-1.0 * intersectionDirection * sdfLineThickness);
-                        outV.setPolylineConnectorTrapezoidEnd(intersectionDirection * hLen * sdfLineThickness);
+                        outV.setPolylineConnectorTrapezoidStart(-1.0 * intersectionDirection_Screenspace * sdfLineThickness);
+                        outV.setPolylineConnectorTrapezoidEnd(intersectionDirection_Screenspace * hLen * sdfLineThickness);
                         outV.setPolylineConnectorTrapezoidLongBase(sinHalfAngleBetweenNormals * ((1.0 + vLen) / (vLen - cosHalfAngleBetweenNormals)) * sdfLineThickness);
                         outV.setPolylineConnectorTrapezoidShortBase(shortBase * sdfLineThickness);
                     }
 
                     if (vertexIdx == 0u)
                     {
-                        const float2 V1 = normalize(mul(v, rotationMatrix)) * antiAliasedLineThickness * 2.0f;
+                        // multiplying the other way to rotate by -theta
+                        const float2 V1 = normalize(mul(v_Screenspace, rotationMatrix)) * antiAliasedLineThickness * 2.0f;
                         const float2 screenSpaceV1 = circleCenterScreenSpace + V1;
                         outV.position = float4(screenSpaceV1, 0.0f, 1.0f);   
                     }
@@ -477,13 +487,13 @@ PSInput vtxMain(uint vertexID : SV_VertexID)
                     else if (vertexIdx == 2u)
                     {
                         // find intersection point vertex
-                        float2 intersectionPoint = v * antiAliasedLineThickness * 2.0f;
+                        float2 intersectionPoint = v_Screenspace * antiAliasedLineThickness * 2.0f;
                         intersectionPoint += circleCenterScreenSpace;
                         outV.position = float4(intersectionPoint, 0.0f, 1.0f);
                     }
                     else if (vertexIdx == 3u)
                     {
-                        const float2 V2 = normalize(mul(rotationMatrix, v)) * antiAliasedLineThickness * 2.0f;
+                        const float2 V2 = normalize(mul(rotationMatrix, v_Screenspace)) * antiAliasedLineThickness * 2.0f;
                         const float2 screenSpaceV2 = circleCenterScreenSpace + V2;
                         outV.position = float4(screenSpaceV2, 0.0f, 1.0f);
                     }
@@ -751,18 +761,14 @@ PSInput vtxMain(uint vertexID : SV_VertexID)
 
     // Make the cage fullscreen for testing: 
 #if 0
-        // disabled for object of POLYLINE_CONNECTOR type, since miters would cover whole screen
-        if(objType != ObjectType::POLYLINE_CONNECTOR)
-        {
-            if (vertexIdx == 0u)
-                outV.position = float4(-1, -1, 0, 1);
-            else if (vertexIdx == 1u)
-                outV.position = float4(-1, +1, 0, 1);
-            else if (vertexIdx == 2u)
-                outV.position = float4(+1, -1, 0, 1);
-            else if (vertexIdx == 3u)
-                outV.position = float4(+1, +1, 0, 1);
-        }
+        if (vertexIdx == 0u)
+            outV.position = float4(-1, -1, 0, 1);
+        else if (vertexIdx == 1u)
+            outV.position = float4(-1, +1, 0, 1);
+        else if (vertexIdx == 2u)
+            outV.position = float4(+1, -1, 0, 1);
+        else if (vertexIdx == 3u)
+            outV.position = float4(+1, +1, 0, 1);
 #endif
     }
     outV.clip = float4(outV.position.x - clipProjectionData.minClipNDC.x, outV.position.y - clipProjectionData.minClipNDC.y, clipProjectionData.maxClipNDC.x - outV.position.x, clipProjectionData.maxClipNDC.y - outV.position.y);

From fc0f38b38d572ea9403e448ee206cccc0c030df9 Mon Sep 17 00:00:00 2001
From: Erfan Ahmadi <ahmadierfan99@gmail.com>
Date: Tue, 22 Jul 2025 14:22:35 +0400
Subject: [PATCH 495/529] Fix Small Contour Bug

---
 62_CAD/shaders/main_pipeline/dtm.hlsl | 31 +++++++++++++++++++--------
 1 file changed, 22 insertions(+), 9 deletions(-)

diff --git a/62_CAD/shaders/main_pipeline/dtm.hlsl b/62_CAD/shaders/main_pipeline/dtm.hlsl
index be10c2f31..749d0fd6f 100644
--- a/62_CAD/shaders/main_pipeline/dtm.hlsl
+++ b/62_CAD/shaders/main_pipeline/dtm.hlsl
@@ -251,6 +251,15 @@ float calculateDTMContourSDF(in DTMContourSettings contourSettings, in LineStyle
     contourLineIdx = clamp(contourLineIdx, 0, maxContourLineIdx);
     float contourLineHeight = startHeight + interval * contourLineIdx;
 
+    
+    // Sort so that v[0].z >= v[1].z >= v[2].z
+    if (v[0].z < v[1].z)
+        nbl::hlsl::swap(v[0], v[1]);
+    if (v[0].z < v[2].z)
+        nbl::hlsl::swap(v[0], v[2]);
+    if (v[1].z < v[2].z)
+        nbl::hlsl::swap(v[1], v[2]);
+
     int contourLinePointsIdx = 0;
     float2 contourLinePoints[2];
     for (int i = 0; i < 3; ++i)
@@ -258,16 +267,20 @@ float calculateDTMContourSDF(in DTMContourSettings contourSettings, in LineStyle
         if (contourLinePointsIdx == 2)
             break;
 
-        float3 p0 = v[i];
-        float3 p1 = v[(i + 1) % 3];
-
-        if (p1.z < p0.z)
-            nbl::hlsl::swap(p0, p1);
-
-        if (contourLineHeight >= p0.z && contourLineHeight <= p1.z)
+        int minvIdx = 0;
+        int maxvIdx = 0;
+        
+        if (i == 0) { minvIdx = 2; maxvIdx = 0; }
+        if (i == 1) { minvIdx = 1; maxvIdx = 0; }
+        if (i == 2) { minvIdx = 2; maxvIdx = 1; }
+        
+        float3 minV = v[minvIdx];
+        float3 maxV = v[maxvIdx];
+        
+        if (contourLineHeight >= minV.z && contourLineHeight <= maxV.z)
         {
-            float interpolationVal = (contourLineHeight - p0.z) / (p1.z - p0.z);
-            contourLinePoints[contourLinePointsIdx] = lerp(p0.xy, p1.xy, clamp(interpolationVal, 0.0f, 1.0f));
+            float interpolationVal = (contourLineHeight - minV.z) / (maxV.z - minV.z);
+            contourLinePoints[contourLinePointsIdx] = lerp(minV.xy, maxV.xy, clamp(interpolationVal, 0.0f, 1.0f));
             ++contourLinePointsIdx;
         }
     }

From 620aba9e28c18c36347a8daabc1e4ed8ffcc85fd Mon Sep 17 00:00:00 2001
From: Erfan Ahmadi <ahmadierfan99@gmail.com>
Date: Tue, 22 Jul 2025 14:32:53 +0400
Subject: [PATCH 496/529] remove workaround

---
 62_CAD/shaders/globals.hlsl | 2 --
 1 file changed, 2 deletions(-)

diff --git a/62_CAD/shaders/globals.hlsl b/62_CAD/shaders/globals.hlsl
index dbf0cf390..5c3681910 100644
--- a/62_CAD/shaders/globals.hlsl
+++ b/62_CAD/shaders/globals.hlsl
@@ -475,8 +475,6 @@ struct DTMSettings
     uint32_t contourSettingsCount;
     DTMContourSettings contourSettings[MaxContourSettings];
 
-    uint32_t workaroundForSpirvOptimizerBugToMakeNextMembersAlignmentEqualTo16_LOL;
-    
     // height shading
     DTMHeightShadingSettings heightShadingSettings;
     

From 6dfc0fe1ff94608b93816c4360da9fbca6e56b90 Mon Sep 17 00:00:00 2001
From: Arkadiusz Lachowicz <areklachowicz@gmail.com>
Date: Tue, 22 Jul 2025 12:56:33 +0200
Subject: [PATCH 497/529] remove GLOB expressions from 23,27,29 exs

---
 23_Arithmetic2UnitTest/CMakeLists.txt | 14 +++++++-------
 27_MPMCScheduler/CMakeLists.txt       | 16 +++++++++++-----
 29_Arithmetic2Bench/CMakeLists.txt    | 14 +++++++-------
 3 files changed, 25 insertions(+), 19 deletions(-)

diff --git a/23_Arithmetic2UnitTest/CMakeLists.txt b/23_Arithmetic2UnitTest/CMakeLists.txt
index 9e6a62f67..a18b7a8c0 100644
--- a/23_Arithmetic2UnitTest/CMakeLists.txt
+++ b/23_Arithmetic2UnitTest/CMakeLists.txt
@@ -2,14 +2,14 @@ include(common)
 
 nbl_create_executable_project("" "" "" "")
 
-get_filename_component(MOUNT_POINT "${CMAKE_CURRENT_SOURCE_DIR}/app_resources" ABSOLUTE)
-file(GLOB_RECURSE KEYS RELATIVE ${MOUNT_POINT} app_resources/*.hlsl)
-list(FILTER KEYS EXCLUDE REGEX "preprocessed\\.hlsl$")
-
 NBL_CREATE_RESOURCE_ARCHIVE(
+	NAMESPACE nbl::this_example::builtin
 	TARGET ${EXECUTABLE_NAME}_builtins
 	LINK_TO ${EXECUTABLE_NAME}
-	BIND ${MOUNT_POINT}
-	BUILTINS ${KEYS}
-	NAMESPACE nbl::this_example::builtin
+	BIND app_resources
+	BUILTINS
+		common.hlsl
+		shaderCommon.hlsl
+		testSubgroup.comp.hlsl
+		testWorkgroup.comp.hlsl
 )
\ No newline at end of file
diff --git a/27_MPMCScheduler/CMakeLists.txt b/27_MPMCScheduler/CMakeLists.txt
index 08bccdb1b..92531a8d5 100644
--- a/27_MPMCScheduler/CMakeLists.txt
+++ b/27_MPMCScheduler/CMakeLists.txt
@@ -3,8 +3,14 @@ include(common)
 nbl_create_executable_project("" "" "" "")
 
 set(OUTPUT_DIRECTORY "${CMAKE_CURRENT_BINARY_DIR}/auto-gen")
-file(GLOB_RECURSE DEPENDS app_resources/*.hlsl)
-list(FILTER DEPENDS EXCLUDE REGEX "preprocessed\\.hlsl$")
+set(DEPENDS
+	app_resources/common.hlsl
+	app_resources/mpmc_queue.hlsl
+	app_resources/schedulers/mpmc.hlsl
+	app_resources/shader.comp.hlsl
+	app_resources/workgroup/pool_allocator.hlsl
+	app_resources/workgroup/stack.hlsl
+)
 
 set(JSON [=[
 [
@@ -24,7 +30,7 @@ NBL_CREATE_NSC_COMPILE_RULES(
 	DEPENDS ${DEPENDS}
 	BINARY_DIR ${OUTPUT_DIRECTORY}
 	MOUNT_POINT_DEFINE NBL_THIS_EXAMPLE_BUILD_MOUNT_POINT
-	COMMON_OPTIONS -I "${CMAKE_CURRENT_SOURCE_DIR}"
+	COMMON_OPTIONS -I ${CMAKE_CURRENT_SOURCE_DIR}
 	OUTPUT_VAR KEYS
 	INCLUDE nbl/this_example/builtin/build/spirv/keys.hpp
 	NAMESPACE nbl::this_example::builtin::build
@@ -32,9 +38,9 @@ NBL_CREATE_NSC_COMPILE_RULES(
 )
 
 NBL_CREATE_RESOURCE_ARCHIVE(
+	NAMESPACE nbl::this_example::builtin::build
 	TARGET ${EXECUTABLE_NAME}_builtinsBuild
 	LINK_TO ${EXECUTABLE_NAME}
-	BIND "${OUTPUT_DIRECTORY}"
+	BIND ${OUTPUT_DIRECTORY}
 	BUILTINS ${KEYS}
-	NAMESPACE nbl::this_example::builtin::build
 )
\ No newline at end of file
diff --git a/29_Arithmetic2Bench/CMakeLists.txt b/29_Arithmetic2Bench/CMakeLists.txt
index 9e6a62f67..99c51769c 100644
--- a/29_Arithmetic2Bench/CMakeLists.txt
+++ b/29_Arithmetic2Bench/CMakeLists.txt
@@ -2,14 +2,14 @@ include(common)
 
 nbl_create_executable_project("" "" "" "")
 
-get_filename_component(MOUNT_POINT "${CMAKE_CURRENT_SOURCE_DIR}/app_resources" ABSOLUTE)
-file(GLOB_RECURSE KEYS RELATIVE ${MOUNT_POINT} app_resources/*.hlsl)
-list(FILTER KEYS EXCLUDE REGEX "preprocessed\\.hlsl$")
-
 NBL_CREATE_RESOURCE_ARCHIVE(
+	NAMESPACE nbl::this_example::builtin
 	TARGET ${EXECUTABLE_NAME}_builtins
 	LINK_TO ${EXECUTABLE_NAME}
-	BIND ${MOUNT_POINT}
-	BUILTINS ${KEYS}
-	NAMESPACE nbl::this_example::builtin
+	BIND app_resources
+	BUILTINS
+		benchmarkSubgroup.comp.hlsl
+		benchmarkWorkgroup.comp.hlsl
+		common.hlsl
+		shaderCommon.hlsl
 )
\ No newline at end of file

From cafd6806fc8e593e00d2fe0d1aa64735e8f97457 Mon Sep 17 00:00:00 2001
From: Arkadiusz Lachowicz <areklachowicz@gmail.com>
Date: Wed, 23 Jul 2025 17:18:42 +0200
Subject: [PATCH 498/529] forgot about you 22, targetSpirvVersion fix

---
 22_CppCompat/ITester.h | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/22_CppCompat/ITester.h b/22_CppCompat/ITester.h
index 9f2353c95..4ecd522b9 100644
--- a/22_CppCompat/ITester.h
+++ b/22_CppCompat/ITester.h
@@ -71,7 +71,7 @@ class ITester
 
             asset::IShaderCompiler::SCompilerOptions options = {};
             options.stage = shaderStage;
-            options.targetSpirvVersion = m_device->getPhysicalDevice()->getLimits().spirvVersion;
+            options.preprocessorOptions.targetSpirvVersion = m_device->getPhysicalDevice()->getLimits().spirvVersion;
             options.spirvOptimizer = nullptr;
             options.debugInfoFlags |= asset::IShaderCompiler::E_DEBUG_INFO_FLAGS::EDIF_SOURCE_BIT;
             options.preprocessorOptions.sourceIdentifier = source->getFilepathHint();

From 3899c451c5c9b74d71f221ab3b0faadc2d9986c8 Mon Sep 17 00:00:00 2001
From: Przemek <minikers21@gmail.com>
Date: Thu, 24 Jul 2025 17:24:31 +0200
Subject: [PATCH 499/529] Fixed emulated float compilation error

---
 62_CAD/shaders/main_pipeline/vertex_shader.hlsl | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/62_CAD/shaders/main_pipeline/vertex_shader.hlsl b/62_CAD/shaders/main_pipeline/vertex_shader.hlsl
index 809e6d49b..90394e935 100644
--- a/62_CAD/shaders/main_pipeline/vertex_shader.hlsl
+++ b/62_CAD/shaders/main_pipeline/vertex_shader.hlsl
@@ -453,7 +453,7 @@ PSInput vtxMain(uint vertexID : SV_VertexID)
                     // to better understand variables at play, and the circle space, see documentation of `miterSDF` in fragment shader
                     // length of vector from circle center to intersection position (normalized so that circle radius = line thickness = 1.0)
                     float vLen = length(v);
-                    float2 intersectionDirection_Screenspace = normalize(transformVectorScreenSpace(clipProjectionData.projectionToNDC, globals.resolution, v));
+                    float2 intersectionDirection_Screenspace = normalize(transformVectorScreenSpace(clipProjectionData.projectionToNDC, globals.resolution, _static_cast<pfloat64_t2>(v)));
                     const float2 v_Screenspace = intersectionDirection_Screenspace * vLen;
 
                     // Find other miter vertices

From 1bbfd0979d10cb25c8faa19495939788a93e47f3 Mon Sep 17 00:00:00 2001
From: kevyuu <kevin.kayu@gmail.com>
Date: Fri, 25 Jul 2025 17:34:26 +0700
Subject: [PATCH 500/529] Fix example 67

---
 67_RayQueryGeometry/app_resources/common.hlsl |  27 ++--
 .../app_resources/render.comp.hlsl            |  57 ++++---
 67_RayQueryGeometry/include/common.hpp        |  60 +------
 67_RayQueryGeometry/main.cpp                  | 149 ++++++++++++++----
 4 files changed, 160 insertions(+), 133 deletions(-)

diff --git a/67_RayQueryGeometry/app_resources/common.hlsl b/67_RayQueryGeometry/app_resources/common.hlsl
index 9110cd4a1..6a74a1fbe 100644
--- a/67_RayQueryGeometry/app_resources/common.hlsl
+++ b/67_RayQueryGeometry/app_resources/common.hlsl
@@ -5,6 +5,13 @@
 
 NBL_CONSTEXPR uint32_t WorkgroupSize = 16;
 
+enum NormalType : uint32_t
+{
+    NT_R8G8B8A8_SNORM,
+    NT_R32G32B32_SFLOAT,
+    NT_UNKNOWN
+};
+
 // we need bitfield support in NBL_HLSL_DECLARE_STRUCT it seems
 struct SGeomInfo
 {
@@ -12,10 +19,8 @@ struct SGeomInfo
     uint64_t indexBufferAddress;
     uint64_t normalBufferAddress;
 
-    uint32_t objType : 29;
-    uint32_t indexType : 2; // 16 bit, 32 bit or none
-    uint32_t smoothNormals : 1;	// flat for cube, rectangle, disk
-    uint32_t padding;
+    uint32_t normalType : 2;
+    uint32_t indexType : 1; // 16 bit, 32 bit
 };
 
 struct SPushConstants
@@ -29,18 +34,4 @@ struct SPushConstants
     float32_t2 offsetNDC;
 };
 
-#ifdef __HLSL_VERSION
-enum ObjectType : uint32_t  // matches c++
-{
-    OT_CUBE = 0,
-    OT_SPHERE,
-    OT_CYLINDER,
-    OT_RECTANGLE,
-    OT_CONE,
-    OT_ICOSPHERE,
-
-    OT_COUNT
-};
-#endif
-
 #endif  // RQG_COMMON_HLSL
diff --git a/67_RayQueryGeometry/app_resources/render.comp.hlsl b/67_RayQueryGeometry/app_resources/render.comp.hlsl
index 937273767..135100573 100644
--- a/67_RayQueryGeometry/app_resources/render.comp.hlsl
+++ b/67_RayQueryGeometry/app_resources/render.comp.hlsl
@@ -25,41 +25,47 @@ float3 unpackNormals3x10(uint32_t v)
     return clamp(float3(pn) / 511.0, -1.0, 1.0);
 }
 
-float3 calculateSmoothNormals(int instID, int primID, SGeomInfo geom, float2 bary)
+float3 calculateNormals(int primID, SGeomInfo geom, float2 bary)
 {
     const uint indexType = geom.indexType;
-    const uint objType = geom.objType;
+    const uint normalType = geom.normalType;
 
     const uint64_t vertexBufferAddress = geom.vertexBufferAddress;
     const uint64_t indexBufferAddress = geom.indexBufferAddress;
     const uint64_t normalBufferAddress = geom.normalBufferAddress;
 
     uint32_t3 indices;
-    switch (indexType)
+    if (indexBufferAddress == 0)
     {
-        case 0: // EIT_16BIT
-            indices = uint32_t3((nbl::hlsl::bda::__ptr<uint16_t3>::create(indexBufferAddress)+primID).deref().load());
-            break;
-        case 1: // EIT_32BIT
-            indices = uint32_t3((nbl::hlsl::bda::__ptr<uint32_t3>::create(indexBufferAddress)+primID).deref().load());
-            break;
-        default:    // EIT_NONE
+        indices[0] = primID * 3;
+        indices[1] = indices[0] + 1;
+        indices[2] = indices[0] + 2;
+    }
+    else {
+        switch (indexType)
         {
-            indices[0] = primID * 3;
-            indices[1] = indices[0] + 1;
-            indices[2] = indices[0] + 2;
+            case 0: // EIT_16BIT
+                indices = uint32_t3((nbl::hlsl::bda::__ptr<uint16_t3>::create(indexBufferAddress)+primID).deref().load());
+                break;
+            case 1: // EIT_32BIT
+                indices = uint32_t3((nbl::hlsl::bda::__ptr<uint32_t3>::create(indexBufferAddress)+primID).deref().load());
+                break;
         }
     }
 
+    if (normalBufferAddress == 0 || normalType == NT_UNKNOWN)
+    {
+        float3 v0 = vk::RawBufferLoad<float3>(vertexBufferAddress + indices[0] * 12);
+        float3 v1 = vk::RawBufferLoad<float3>(vertexBufferAddress + indices[1] * 12);
+        float3 v2 = vk::RawBufferLoad<float3>(vertexBufferAddress + indices[2] * 12);
+
+        return normalize(cross(v2 - v0, v1 - v0));
+    }
+
     float3 n0, n1, n2;
-    switch (objType)
+    switch (normalType)
     {
-        case OT_CUBE:
-        case OT_SPHERE:
-        case OT_RECTANGLE:
-        case OT_CYLINDER:
-        //case OT_ARROW:
-        case OT_CONE:
+        case NT_R8G8B8A8_SNORM:
         {
             uint32_t v0 = vk::RawBufferLoad<uint32_t>(normalBufferAddress + indices[0] * 4);
             uint32_t v1 = vk::RawBufferLoad<uint32_t>(normalBufferAddress + indices[1] * 4);
@@ -70,13 +76,13 @@ float3 calculateSmoothNormals(int instID, int primID, SGeomInfo geom, float2 bar
             n2 = normalize(nbl::hlsl::spirv::unpackSnorm4x8(v2).xyz);
         }
         break;
-        case OT_ICOSPHERE:
-        default:
+        case NT_R32G32B32_SFLOAT:
         {
             n0 = normalize(vk::RawBufferLoad<float3>(normalBufferAddress + indices[0] * 12));
             n1 = normalize(vk::RawBufferLoad<float3>(normalBufferAddress + indices[1] * 12));
             n2 = normalize(vk::RawBufferLoad<float3>(normalBufferAddress + indices[2] * 12));
         }
+        break;
     }
 
     float3 barycentrics = float3(0.0, bary);
@@ -113,15 +119,16 @@ void main(uint32_t3 threadID : SV_DispatchThreadID)
 
     if (spirv::rayQueryGetIntersectionTypeKHR(query, true) == spv::RayQueryCommittedIntersectionTypeRayQueryCommittedIntersectionTriangleKHR)
     {
-        const int instID = spirv::rayQueryGetIntersectionInstanceIdKHR(query, true);
+        const int instanceCustomIndex = spirv::rayQueryGetIntersectionInstanceCustomIndexKHR(query, true);
+        const int geometryIndex = spirv::rayQueryGetIntersectionGeometryIndexKHR(query, true);
         const int primID = spirv::rayQueryGetIntersectionPrimitiveIndexKHR(query, true);
 
         // TODO: candidate for `bda::__ptr<SGeomInfo>`
-        const SGeomInfo geom = vk::RawBufferLoad<SGeomInfo>(pc.geometryInfoBuffer + instID * sizeof(SGeomInfo),8);
+        const SGeomInfo geom = vk::RawBufferLoad<SGeomInfo>(pc.geometryInfoBuffer + (instanceCustomIndex + geometryIndex) * sizeof(SGeomInfo), 8);
 
         float3 normals;
         float2 barycentrics = spirv::rayQueryGetIntersectionBarycentricsKHR(query, true);
-        normals = calculateSmoothNormals(instID, primID, geom, barycentrics);
+        normals = calculateNormals(primID, geom, barycentrics);
 
         normals = normalize(normals) * 0.5 + 0.5;
         color = float4(normals, 1.0);
diff --git a/67_RayQueryGeometry/include/common.hpp b/67_RayQueryGeometry/include/common.hpp
index b1759e9e3..84b0a3dcf 100644
--- a/67_RayQueryGeometry/include/common.hpp
+++ b/67_RayQueryGeometry/include/common.hpp
@@ -17,67 +17,17 @@ using namespace nbl::examples;
 
 namespace nbl::scene
 {
-enum ObjectType : uint8_t
-{
-	OT_CUBE,
-	OT_SPHERE,
-	OT_CYLINDER,
-	OT_RECTANGLE,
-	OT_CONE,
-	OT_ICOSPHERE,
-
-	OT_COUNT,
-	OT_UNKNOWN = std::numeric_limits<uint8_t>::max()
-};
-
-static constexpr uint32_t s_smoothNormals[OT_COUNT] = { 0, 1, 1, 0, 1, 1 };
-
-struct ObjectMeta
-{
-	ObjectType type = OT_UNKNOWN;
-	std::string_view name = "Unknown";
-};
-
-struct ObjectDrawHookCpu
-{
-	nbl::core::matrix3x4SIMD model;
-	ObjectMeta meta;
-};
-
-enum GeometryShader
-{
-	GP_BASIC = 0,
-	GP_CONE,
-	GP_ICO,
-
-	GP_COUNT
-};
 
+using PolygonGeometryData = core::smart_refctd_ptr<ICPUPolygonGeometry>;
+using GeometryCollectionData = core::smart_refctd_ptr<ICPUGeometryCollection>;
+using GeometryData = std::variant<PolygonGeometryData, GeometryCollectionData>;
 struct ReferenceObjectCpu
 {
-	ObjectMeta meta;
   core::matrix3x4SIMD transform;
-	core::smart_refctd_ptr<ICPUPolygonGeometry> data;
+	GeometryData data;
+  uint32_t instanceID;
 };
 
-struct ReferenceObjectGpu
-{
-	struct Bindings
-	{
-		nbl::asset::SBufferBinding<IGPUBuffer> vertex, index;
-	};
-
-	ObjectMeta meta;
-	Bindings bindings;
-	uint32_t vertexStride;
-	nbl::asset::E_INDEX_TYPE indexType = nbl::asset::E_INDEX_TYPE::EIT_UNKNOWN;
-	uint32_t indexCount = {};
-
-	const bool useIndex() const
-	{
-		return bindings.index.buffer && (indexType != E_INDEX_TYPE::EIT_UNKNOWN);
-	}
-};
 }
 
 
diff --git a/67_RayQueryGeometry/main.cpp b/67_RayQueryGeometry/main.cpp
index 76a4819e0..0f662c558 100644
--- a/67_RayQueryGeometry/main.cpp
+++ b/67_RayQueryGeometry/main.cpp
@@ -501,44 +501,91 @@ class RayQueryGeometryApp final : public SimpleWindowedApplication, public Built
 			};
 
 			std::vector<ReferenceObjectCpu> cpuObjects;
-			cpuObjects.push_back(ReferenceObjectCpu{ .meta = {.type = OT_CUBE, .name = "Cube Mesh" }, .transform = nextTransform(), .data = gc->createCube({1.f, 1.f, 1.f})});
-			cpuObjects.push_back(ReferenceObjectCpu{ .meta = {.type = OT_SPHERE, .name = "Sphere Mesh" }, .transform = nextTransform(), .data = gc->createSphere(2, 16, 16)});
-			cpuObjects.push_back(ReferenceObjectCpu{ .meta = {.type = OT_CYLINDER, .name = "Cylinder Mesh" }, .transform = nextTransform(), .data = gc->createCylinder(2, 2, 20)});
-			cpuObjects.push_back(ReferenceObjectCpu{ .meta = {.type = OT_RECTANGLE, .name = "Rectangle Mesh" }, .transform = nextTransform(), .data = gc->createRectangle({1.5, 3})});
-			cpuObjects.push_back(ReferenceObjectCpu{ .meta = {.type = OT_CONE, .name = "Cone Mesh" }, .transform = nextTransform(), .data = gc->createCone(2, 3, 10)});
-			cpuObjects.push_back(ReferenceObjectCpu{ .meta = {.type = OT_ICOSPHERE, .name = "Icosphere Mesh" }, .transform = nextTransform(), .data = gc->createIcoSphere(1, 3, true)});
-			const auto arrowPolygons = gc->createArrow();
-			const auto arrowTransform = nextTransform();
-			cpuObjects.push_back(ReferenceObjectCpu{ .meta = {.type = OT_CYLINDER, .name = "Arrow Mesh" }, .transform = arrowTransform, .data = arrowPolygons[0]});
-			cpuObjects.push_back(ReferenceObjectCpu{ .meta = {.type = OT_CONE, .name = "Arrow Mesh" }, .transform = arrowTransform, .data = arrowPolygons[1]});
-			auto geomInfoBuffer = ICPUBuffer::create({ cpuObjects.size() * sizeof(SGeomInfo) });
+			cpuObjects.push_back(ReferenceObjectCpu{ .transform = nextTransform(), .data = gc->createArrow() });
+			cpuObjects.push_back(ReferenceObjectCpu{ .transform = nextTransform(), .data = gc->createCube({1.f, 1.f, 1.f})});
+			cpuObjects.push_back(ReferenceObjectCpu{ .transform = nextTransform(), .data = gc->createSphere(2, 16, 16)});
+			cpuObjects.push_back(ReferenceObjectCpu{ .transform = nextTransform(), .data = gc->createCylinder(2, 2, 20)});
+			cpuObjects.push_back(ReferenceObjectCpu{ .transform = nextTransform(), .data = gc->createRectangle({1.5, 3})});
+			cpuObjects.push_back(ReferenceObjectCpu{ .transform = nextTransform(), .data = gc->createCone(2, 3, 10)});
+			cpuObjects.push_back(ReferenceObjectCpu{ .transform = nextTransform(), .data = gc->createIcoSphere(1, 3, true)});
+
+			const auto geometryCount = [&cpuObjects]
+			{
+        size_t count = 0;
+			  for (auto& cpuObject: cpuObjects)
+			  {
+					const auto data = cpuObject.data;
+					cpuObject.instanceID = count;
+			    if (std::holds_alternative<PolygonGeometryData>(data))
+			    {
+						count += 1;
+			    } else if (std::holds_alternative<GeometryCollectionData>(data))
+			    {
+						const auto colData = std::get<GeometryCollectionData>(data);
+						count += colData->getGeometries()->size();
+			    }
+			  }
+				return count;
+			}();
+
+			auto geomInfoBuffer = ICPUBuffer::create({ geometryCount * sizeof(SGeomInfo) });
 
 			SGeomInfo* geomInfos = reinterpret_cast<SGeomInfo*>(geomInfoBuffer->getPointer());
 
 			// get ICPUBuffers into ICPUBottomLevelAccelerationStructures
 			std::vector<smart_refctd_ptr<ICPUBottomLevelAccelerationStructure>> cpuBlas(cpuObjects.size());
-			for (uint32_t i = 0; i < cpuBlas.size(); i++)
+			for (uint32_t blas_i = 0; blas_i < cpuBlas.size(); blas_i++)
 			{
-				auto triangles = make_refctd_dynamic_array<smart_refctd_dynamic_array<ICPUBottomLevelAccelerationStructure::Triangles<ICPUBuffer>>>(1u);
-				auto primitiveCounts = make_refctd_dynamic_array<smart_refctd_dynamic_array<uint32_t>>(1u);
+        auto& blas = cpuBlas[blas_i];
+        blas = make_smart_refctd_ptr<ICPUBottomLevelAccelerationStructure>();
+				if (std::holds_alternative<PolygonGeometryData>(cpuObjects[blas_i].data))
+				{
+					const auto data = std::get<PolygonGeometryData>(cpuObjects[blas_i].data);
+
+          auto triangles = make_refctd_dynamic_array<smart_refctd_dynamic_array<ICPUBottomLevelAccelerationStructure::Triangles<ICPUBuffer>>>(1u);
+          auto primitiveCounts = make_refctd_dynamic_array<smart_refctd_dynamic_array<uint32_t>>(1u);
+
+          auto& tri = triangles->front();
+
+          auto& primCount = primitiveCounts->front();
+          primCount = data->getPrimitiveCount();
 
-				auto& tri = triangles->front();
+          tri = data->exportForBLAS();
 
-				auto& primCount = primitiveCounts->front();
-				primCount = cpuObjects[i].data->getPrimitiveCount();
+          blas->setGeometries(std::move(triangles), std::move(primitiveCounts));
 
-				tri = cpuObjects[i].data->exportForBLAS();
+				  
+				} else if (std::holds_alternative<GeometryCollectionData>(cpuObjects[blas_i].data))
+				{
+				  
+					const auto data = std::get<GeometryCollectionData>(cpuObjects[blas_i].data);
+
+					const auto& geometries = *data->getGeometries();
+					const auto geometryCount = geometries.size();
+
+          auto triangles = make_refctd_dynamic_array<smart_refctd_dynamic_array<ICPUBottomLevelAccelerationStructure::Triangles<ICPUBuffer>>>(geometryCount);
+          auto primitiveCounts = make_refctd_dynamic_array<smart_refctd_dynamic_array<uint32_t>>(geometryCount);
 
-				auto& blas = cpuBlas[i];
-				blas = make_smart_refctd_ptr<ICPUBottomLevelAccelerationStructure>();
-				blas->setGeometries(std::move(triangles), std::move(primitiveCounts));
+          for (auto geometry_i = 0u; geometry_i < geometryCount; geometry_i++)
+          {
+						const auto& geometry = geometries[geometry_i];
+            const auto* polyGeo = static_cast<const ICPUPolygonGeometry*>(geometry.geometry.get());
+						primitiveCounts->operator[](geometry_i) = polyGeo->getPrimitiveCount();
+						auto& triangle = triangles->operator[](geometry_i);
+						triangle = polyGeo->exportForBLAS();
+						if (geometry.hasTransform())
+							triangle.transform = geometry.transform;
+          }
 
-				auto blasFlags = bitflag(IGPUBottomLevelAccelerationStructure::BUILD_FLAGS::PREFER_FAST_TRACE_BIT) | IGPUBottomLevelAccelerationStructure::BUILD_FLAGS::ALLOW_COMPACTION_BIT;
-				if (m_physicalDevice->getProperties().limits.rayTracingPositionFetch)
-					blasFlags |= IGPUBottomLevelAccelerationStructure::BUILD_FLAGS::ALLOW_DATA_ACCESS;
+          blas->setGeometries(std::move(triangles), std::move(primitiveCounts));
+
+				}
+        auto blasFlags = bitflag(IGPUBottomLevelAccelerationStructure::BUILD_FLAGS::PREFER_FAST_TRACE_BIT) | IGPUBottomLevelAccelerationStructure::BUILD_FLAGS::ALLOW_COMPACTION_BIT;
+        if (m_physicalDevice->getProperties().limits.rayTracingPositionFetch)
+          blasFlags |= IGPUBottomLevelAccelerationStructure::BUILD_FLAGS::ALLOW_DATA_ACCESS;
 
-				blas->setBuildFlags(blasFlags);
-				blas->setContentHash(blas->computeContentHash());
+        blas->setBuildFlags(blasFlags);
+        blas->setContentHash(blas->computeContentHash());
 			}
 
 			// get ICPUBottomLevelAccelerationStructure into ICPUTopLevelAccelerationStructure
@@ -550,7 +597,7 @@ class RayQueryGeometryApp final : public SimpleWindowedApplication, public Built
 					ICPUTopLevelAccelerationStructure::StaticInstance inst;
 					inst.base.blas = cpuBlas[i];
 					inst.base.flags = static_cast<uint32_t>(IGPUTopLevelAccelerationStructure::INSTANCE_FLAGS::TRIANGLE_FACING_CULL_DISABLE_BIT);
-					inst.base.instanceCustomIndex = i;
+					inst.base.instanceCustomIndex = cpuObjects[i].instanceID;
 					inst.base.instanceShaderBindingTableRecordOffset = 0;
 					inst.base.mask = 0xFF;
 					inst.transform = cpuObjects[i].transform;
@@ -621,17 +668,37 @@ class RayQueryGeometryApp final : public SimpleWindowedApplication, public Built
 			CAssetConverter::patch_t<ICPUTopLevelAccelerationStructure> tlasPatch = {};
 			tlasPatch.compactAfterBuild = true;
 			std::vector<CAssetConverter::patch_t<ICPUBottomLevelAccelerationStructure>> tmpBLASPatches(cpuObjects.size());
-			std::vector<ICPUPolygonGeometry*> tmpGeometries(cpuObjects.size());
-			std::vector<CAssetConverter::patch_t<asset::ICPUPolygonGeometry>> tmpGeometryPatches(cpuObjects.size());
+			std::vector<ICPUPolygonGeometry*> tmpGeometries;
+			tmpGeometries.reserve(geometryCount);
+			std::vector<CAssetConverter::patch_t<asset::ICPUPolygonGeometry>> tmpGeometryPatches;
+			tmpGeometryPatches.reserve(geometryCount);
 			{
 				tmpBLASPatches.front().compactAfterBuild = true;
 				std::fill(tmpBLASPatches.begin(),tmpBLASPatches.end(),tmpBLASPatches.front());
 				//
 				for (uint32_t i = 0; i < cpuObjects.size(); i++)
 				{
-					tmpGeometries[i] = cpuObjects[i].data.get();
-					tmpGeometryPatches[i].indexBufferUsages= IGPUBuffer::E_USAGE_FLAGS::EUF_SHADER_DEVICE_ADDRESS_BIT;
+					const auto data = cpuObjects[i].data;
+					if (std::holds_alternative<PolygonGeometryData>(data))
+					{
+						const auto polygonData = std::get<PolygonGeometryData>(data);
+						tmpGeometries.push_back(polygonData.get());
+            tmpGeometryPatches.push_back({});
+						tmpGeometryPatches.back().indexBufferUsages = IGPUBuffer::E_USAGE_FLAGS::EUF_SHADER_DEVICE_ADDRESS_BIT;
+					} else if (std::holds_alternative<GeometryCollectionData>(data))
+					{
+						const auto collectionData = std::get<GeometryCollectionData>(data);
+						for (const auto& geometryRef : *collectionData->getGeometries())
+						{
+              auto* polyGeo = static_cast<ICPUPolygonGeometry*>(geometryRef.geometry.get());
+							tmpGeometries.push_back(polyGeo);
+              tmpGeometryPatches.push_back({});
+              tmpGeometryPatches.back().indexBufferUsages = IGPUBuffer::E_USAGE_FLAGS::EUF_SHADER_DEVICE_ADDRESS_BIT;
+						}
+					}
 				}
+				assert(tmpGeometries.size() == geometryCount);
+				assert(tmpGeometryPatches.size() == geometryCount);
 
 				std::get<CAssetConverter::SInputs::asset_span_t<ICPUDescriptorSet>>(inputs.assets) = {&descriptorSet.get(),1};
 				std::get<CAssetConverter::SInputs::asset_span_t<ICPUTopLevelAccelerationStructure>>(inputs.assets) = {&cpuTlas.get(),1};
@@ -771,7 +838,6 @@ class RayQueryGeometryApp final : public SimpleWindowedApplication, public Built
 				// assign gpu objects to output
 				for (uint32_t i = 0; i < gpuPolygonGeometries.size(); i++)
 				{
-					const auto& cpuObject = cpuObjects[i];
 					const auto& gpuPolygon = gpuPolygonGeometries[i].value;
 					const auto gpuTriangles = gpuPolygon->exportForBLAS();
 
@@ -781,15 +847,28 @@ class RayQueryGeometryApp final : public SimpleWindowedApplication, public Built
 					const auto& normalView = gpuPolygon->getNormalView();
 					const uint64_t normalBufferAddress = normalView ? normalView.src.buffer->getDeviceAddress() + normalView.src.offset : 0;
 
+					const auto normalType = [&normalView]
+					{
+            if (!normalView) return NT_UNKNOWN;
+						switch (normalView.composed.format)
+						{
+						case EF_R32G32B32_SFLOAT:
+							return NT_R32G32B32_SFLOAT;
+						case EF_R8G8B8A8_SNORM:
+							return NT_R8G8B8A8_SNORM;
+						default:
+							return NT_UNKNOWN;
+						}
+					}();
+
 					const auto& indexBufferBinding = gpuTriangles.indexData;
 					auto& geomInfo = geomInfos[i];
 					geomInfo = {
 						.vertexBufferAddress = vertexBufferAddress,
 						.indexBufferAddress = indexBufferBinding.buffer ? indexBufferBinding.buffer->getDeviceAddress() + indexBufferBinding.offset : vertexBufferAddress,
 						.normalBufferAddress = normalBufferAddress,
-						.objType = cpuObject.meta.type,
+						.normalType = normalType,
 						.indexType = gpuTriangles.indexType,
-						.smoothNormals = s_smoothNormals[cpuObject.meta.type],
 					};
 
 					m_gpuPolygons[i] = gpuPolygon;
@@ -800,7 +879,7 @@ class RayQueryGeometryApp final : public SimpleWindowedApplication, public Built
 			{
 				IGPUBuffer::SCreationParams params;
 				params.usage = IGPUBuffer::EUF_STORAGE_BUFFER_BIT | IGPUBuffer::EUF_TRANSFER_DST_BIT | IGPUBuffer::EUF_SHADER_DEVICE_ADDRESS_BIT;
-				params.size = cpuObjects.size() * sizeof(SGeomInfo);
+				params.size = geometryCount * sizeof(SGeomInfo);
 				m_utils->createFilledDeviceLocalBufferOnDedMem(SIntendedSubmitInfo{ .queue = gQueue }, std::move(params), geomInfos).move_into(geometryInfoBuffer);
 			}
 

From 25f1cd44e4f87a5ee9455bcf750fbcc1a1fc7ec5 Mon Sep 17 00:00:00 2001
From: kevyuu <kevin.kayu@gmail.com>
Date: Fri, 25 Jul 2025 17:34:35 +0700
Subject: [PATCH 501/529] Fix example 71

---
 .../app_resources/common.hlsl                 |  12 ++-
 .../app_resources/raytrace.rchit.hlsl         | 100 +++++++++---------
 71_RayTracingPipeline/include/common.hpp      |  24 -----
 71_RayTracingPipeline/main.cpp                |  21 ++--
 4 files changed, 71 insertions(+), 86 deletions(-)

diff --git a/71_RayTracingPipeline/app_resources/common.hlsl b/71_RayTracingPipeline/app_resources/common.hlsl
index 8f7a06a33..6a0f65253 100644
--- a/71_RayTracingPipeline/app_resources/common.hlsl
+++ b/71_RayTracingPipeline/app_resources/common.hlsl
@@ -86,6 +86,12 @@ struct SProceduralGeomInfo
     float32_t radius;
 };
 
+enum NormalType : uint32_t
+{
+    NT_R8G8B8A8_SNORM,
+    NT_R32G32B32_SFLOAT,
+    NT_UNKNOWN
+};
 
 struct STriangleGeomInfo
 {
@@ -94,10 +100,8 @@ struct STriangleGeomInfo
     uint64_t indexBufferAddress;
     uint64_t normalBufferAddress;
 
-    uint32_t vertexStride : 26;
-    uint32_t objType: 3;
-    uint32_t indexType : 2; // 16 bit, 32 bit or none
-    uint32_t smoothNormals : 1;	// flat for cube, rectangle, disk
+    uint32_t normalType : 2;
+    uint32_t indexType : 1; // 16 bit, 32 bit
 
 };
 
diff --git a/71_RayTracingPipeline/app_resources/raytrace.rchit.hlsl b/71_RayTracingPipeline/app_resources/raytrace.rchit.hlsl
index b513d5958..c63281782 100644
--- a/71_RayTracingPipeline/app_resources/raytrace.rchit.hlsl
+++ b/71_RayTracingPipeline/app_resources/raytrace.rchit.hlsl
@@ -1,89 +1,87 @@
 #include "common.hlsl"
 
+#include "nbl/builtin/hlsl/bda/__ptr.hlsl"
+
 [[vk::push_constant]] SPushConstants pc;
 
-float32_t3 fetchVertexNormal(int instID, int primID, STriangleGeomInfo geom, float2 bary)
+float3 calculateNormals(int primID, STriangleGeomInfo geom, float2 bary)
 {
-    uint idxOffset = primID * 3;
-    
     const uint indexType = geom.indexType;
-    const uint vertexStride = geom.vertexStride;
-    
-    const uint32_t objType = geom.objType;
+    const uint normalType = geom.normalType;
+
+    const uint64_t vertexBufferAddress = geom.vertexBufferAddress;
     const uint64_t indexBufferAddress = geom.indexBufferAddress;
-    
-    uint i0, i1, i2;
-    switch (indexType)
+    const uint64_t normalBufferAddress = geom.normalBufferAddress;
+
+    uint32_t3 indices;
+    if (indexBufferAddress == 0)
     {
-        case 0: // EIT_16BIT
-        {
-            i0 = uint32_t(vk::RawBufferLoad < uint16_t > (indexBufferAddress + (idxOffset + 0) * sizeof(uint16_t), 2u));
-            i1 = uint32_t(vk::RawBufferLoad < uint16_t > (indexBufferAddress + (idxOffset + 1) * sizeof(uint16_t), 2u));
-            i2 = uint32_t(vk::RawBufferLoad < uint16_t > (indexBufferAddress + (idxOffset + 2) * sizeof(uint16_t), 2u));
-        }
-        break;
-        case 1: // EIT_32BIT
-        {
-            i0 = vk::RawBufferLoad < uint32_t > (indexBufferAddress + (idxOffset + 0) * sizeof(uint32_t));
-            i1 = vk::RawBufferLoad < uint32_t > (indexBufferAddress + (idxOffset + 1) * sizeof(uint32_t));
-            i2 = vk::RawBufferLoad < uint32_t > (indexBufferAddress + (idxOffset + 2) * sizeof(uint32_t));
-        }
-        break;
-        default: // EIT_NONE
+        indices[0] = primID * 3;
+        indices[1] = indices[0] + 1;
+        indices[2] = indices[0] + 2;
+    }
+    else {
+        switch (indexType)
         {
-            i0 = idxOffset;
-            i1 = idxOffset + 1;
-            i2 = idxOffset + 2;
+            case 0: // EIT_16BIT
+                indices = uint32_t3((nbl::hlsl::bda::__ptr<uint16_t3>::create(indexBufferAddress)+primID).deref().load());
+                break;
+            case 1: // EIT_32BIT
+                indices = uint32_t3((nbl::hlsl::bda::__ptr<uint32_t3>::create(indexBufferAddress)+primID).deref().load());
+                break;
         }
     }
 
-    const uint64_t normalVertexBufferAddress = geom.normalBufferAddress;
-    float3 n0, n1, n2;
+    if (normalBufferAddress == 0 || normalType == NT_UNKNOWN)
+    {
+        float3 v0 = vk::RawBufferLoad<float3>(vertexBufferAddress + indices[0] * 12);
+        float3 v1 = vk::RawBufferLoad<float3>(vertexBufferAddress + indices[1] * 12);
+        float3 v2 = vk::RawBufferLoad<float3>(vertexBufferAddress + indices[2] * 12);
+
+        return normalize(cross(v2 - v0, v1 - v0));
+    }
 
     float3 n0, n1, n2;
-    switch (objType)
+    switch (normalType)
     {
-        case OT_CUBE:
-        case OT_SPHERE:
-        case OT_RECTANGLE:
-        case OT_CYLINDER:
-        //case OT_ARROW:
-        case OT_CONE:
+        case NT_R8G8B8A8_SNORM:
         {
-            // TODO: document why the alignment is 2 here and nowhere else? isnt the `vertexStride` aligned to more than 2 anyway?
-            uint32_t v0 = vk::RawBufferLoad<uint32_t>(normalBufferAddress + i0 * 4);
-            uint32_t v1 = vk::RawBufferLoad<uint32_t>(normalBufferAddress + i1 * 4);
-            uint32_t v2 = vk::RawBufferLoad<uint32_t>(normalBufferAddress + i2 * 4);
+            uint32_t v0 = vk::RawBufferLoad<uint32_t>(normalBufferAddress + indices[0] * 4);
+            uint32_t v1 = vk::RawBufferLoad<uint32_t>(normalBufferAddress + indices[1] * 4);
+            uint32_t v2 = vk::RawBufferLoad<uint32_t>(normalBufferAddress + indices[2] * 4);
 
             n0 = normalize(nbl::hlsl::spirv::unpackSnorm4x8(v0).xyz);
             n1 = normalize(nbl::hlsl::spirv::unpackSnorm4x8(v1).xyz);
             n2 = normalize(nbl::hlsl::spirv::unpackSnorm4x8(v2).xyz);
         }
         break;
-        case OT_ICOSPHERE:
-        default:
+        case NT_R32G32B32_SFLOAT:
         {
-            n0 = normalize(vk::RawBufferLoad<float3>(normalBufferAddress + i0 * 12));
-            n1 = normalize(vk::RawBufferLoad<float3>(normalBufferAddress + i1 * 12));
-            n2 = normalize(vk::RawBufferLoad<float3>(normalBufferAddress + i2 * 12));
+            n0 = normalize(vk::RawBufferLoad<float3>(normalBufferAddress + indices[0] * 12));
+            n1 = normalize(vk::RawBufferLoad<float3>(normalBufferAddress + indices[1] * 12));
+            n2 = normalize(vk::RawBufferLoad<float3>(normalBufferAddress + indices[2] * 12));
         }
+        break;
     }
 
     float3 barycentrics = float3(0.0, bary);
-    barycentrics.x = 1.0 - barycentrics.y - barycentrics.z;
-    return normalize(barycentrics.x * n0 + barycentrics.y * n1 + barycentrics.z * n2);
+    barycentrics.x = 1.0 - barycentrics.y - barycentrics.z;        
+
+    return barycentrics.x * n0 + barycentrics.y * n1 + barycentrics.z * n2;
 }
 
+
 [shader("closesthit")]
 void main(inout PrimaryPayload payload, in BuiltInTriangleIntersectionAttributes attribs)
 {
-    const int instID = InstanceID();
     const int primID = PrimitiveIndex();
-    const STriangleGeomInfo geom = vk::RawBufferLoad < STriangleGeomInfo > (pc.triangleGeomInfoBuffer + instID * sizeof(STriangleGeomInfo));
-    const float32_t3 vertexNormal = fetchVertexNormal(instID, primID, geom, attribs.barycentrics);
+    const int instanceCustomIndex = InstanceIndex();
+    const int geometryIndex = GeometryIndex();
+    const STriangleGeomInfo geom = vk::RawBufferLoad < STriangleGeomInfo > (pc.triangleGeomInfoBuffer + (instanceCustomIndex + geometryIndex) * sizeof(STriangleGeomInfo));
+    const float32_t3 vertexNormal = calculateNormals(primID, geom, attribs.barycentrics);
     const float32_t3 worldNormal = normalize(mul(vertexNormal, WorldToObject3x4()).xyz);
 
-    payload.materialId = MaterialId::createTriangle(instID);
+    payload.materialId = MaterialId::createTriangle(instanceCustomIndex);
 
     payload.worldNormal = worldNormal;
     payload.rayDistance = RayTCurrent();
diff --git a/71_RayTracingPipeline/include/common.hpp b/71_RayTracingPipeline/include/common.hpp
index 479b7fff6..6727c879c 100644
--- a/71_RayTracingPipeline/include/common.hpp
+++ b/71_RayTracingPipeline/include/common.hpp
@@ -22,32 +22,8 @@ using namespace nbl::examples;
 namespace nbl::scene
 {
 
-enum ObjectType : uint8_t
-{
-	OT_CUBE,
-	OT_SPHERE,
-	OT_CYLINDER,
-	OT_RECTANGLE,
-	OT_DISK,
-	OT_ARROW,
-	OT_CONE,
-	OT_ICOSPHERE,
-
-	OT_COUNT,
-	OT_UNKNOWN = std::numeric_limits<uint8_t>::max()
-};
-
-static constexpr uint32_t s_smoothNormals[OT_COUNT] = { 0, 1, 1, 0, 0, 1, 1, 1 };
-
-struct ObjectMeta
-{
-	ObjectType type = OT_UNKNOWN;
-	std::string_view name = "Unknown";
-};
-
 struct ReferenceObjectCpu
 {
-	ObjectMeta meta;
 	core::smart_refctd_ptr<ICPUPolygonGeometry> data;
 	Material material;
   core::matrix3x4SIMD transform;
diff --git a/71_RayTracingPipeline/main.cpp b/71_RayTracingPipeline/main.cpp
index c47eea1c4..dadff5c8d 100644
--- a/71_RayTracingPipeline/main.cpp
+++ b/71_RayTracingPipeline/main.cpp
@@ -1114,19 +1114,16 @@ class RaytracingPipelineApp final : public SimpleWindowedApplication, public Bui
 
 		const auto cpuObjects = std::array{
 			scene::ReferenceObjectCpu {
-				.meta = {.type = scene::OT_RECTANGLE, .name = "Plane Mesh"},
 				.data = geometryCreator->createRectangle({10, 10}),
 				.material = defaultMaterial,
 				.transform = planeTransform,
 			},
 			scene::ReferenceObjectCpu {
-				.meta = {.type = scene::OT_CUBE, .name = "Cube Mesh"},
 				.data = geometryCreator->createCube({1, 1, 1}),
 				.material = defaultMaterial,
 				.transform = getTranslationMatrix(0, 0.5f, 0),
 			},
 			scene::ReferenceObjectCpu {
-				.meta = {.type = scene::OT_CUBE, .name = "Cube Mesh 2"},
 				.data = geometryCreator->createCube({1.5, 1.5, 1.5}),
 				.material = Material{
 					.ambient = {0.1, 0.1, 0.2},
@@ -1138,7 +1135,6 @@ class RaytracingPipelineApp final : public SimpleWindowedApplication, public Bui
 				.transform = getTranslationMatrix(-5.0f, 1.0f, 0),
 			},
 			scene::ReferenceObjectCpu {
-				.meta = {.type = scene::OT_CUBE, .name = "Transparent Cube Mesh"},
 				.data = geometryCreator->createCube({1.5, 1.5, 1.5}),
 				.material = Material{
 					.ambient = {0.1, 0.2, 0.1},
@@ -1448,6 +1444,19 @@ class RaytracingPipelineApp final : public SimpleWindowedApplication, public Bui
 
 				const auto& normalView = gpuPolygon->getNormalView();
 				const uint64_t normalBufferAddress = normalView ? normalView.src.buffer->getDeviceAddress() + normalView.src.offset : 0;
+        const auto normalType = [&normalView]
+        {
+          if (!normalView) return NT_UNKNOWN;
+          switch (normalView.composed.format)
+          {
+          case EF_R32G32B32_SFLOAT:
+            return NT_R32G32B32_SFLOAT;
+          case EF_R8G8B8A8_SNORM:
+            return NT_R8G8B8A8_SNORM;
+          default:
+            return NT_UNKNOWN;
+          }
+        }();
 
 				const auto& indexBufferBinding = gpuTriangles.indexData;
 				auto& geomInfo = geomInfos[i];
@@ -1456,10 +1465,8 @@ class RaytracingPipelineApp final : public SimpleWindowedApplication, public Bui
 				  .vertexBufferAddress = vertexBufferAddress,
 				  .indexBufferAddress = indexBufferBinding.buffer ? indexBufferBinding.buffer->getDeviceAddress() + indexBufferBinding.offset : vertexBufferAddress,
 					.normalBufferAddress = normalBufferAddress,
-				  .vertexStride = gpuTriangles.vertexStride,
-				  .objType = cpuObject.meta.type,
+					.normalType = normalType,
 				  .indexType = gpuTriangles.indexType,
-				  .smoothNormals = scene::s_smoothNormals[cpuObject.meta.type],
 				};
 
 				m_gpuPolygons[i] = gpuPolygon;

From 6f58d05ea2e9769a759ca61c1720add59b10c1a6 Mon Sep 17 00:00:00 2001
From: kevyuu <kevin.kayu@gmail.com>
Date: Wed, 30 Jul 2025 17:00:53 +0700
Subject: [PATCH 502/529] Fix after merge

---
 71_RayTracingPipeline/app_resources/raytrace.rchit.hlsl | 1 -
 1 file changed, 1 deletion(-)

diff --git a/71_RayTracingPipeline/app_resources/raytrace.rchit.hlsl b/71_RayTracingPipeline/app_resources/raytrace.rchit.hlsl
index 1ea3f93be..c63281782 100644
--- a/71_RayTracingPipeline/app_resources/raytrace.rchit.hlsl
+++ b/71_RayTracingPipeline/app_resources/raytrace.rchit.hlsl
@@ -32,7 +32,6 @@ float3 calculateNormals(int primID, STriangleGeomInfo geom, float2 bary)
         }
     }
 
-    const uint64_t normalBufferAddress = geom.normalBufferAddress;
     if (normalBufferAddress == 0 || normalType == NT_UNKNOWN)
     {
         float3 v0 = vk::RawBufferLoad<float3>(vertexBufferAddress + indices[0] * 12);

From 261203dc132e2474f7589b80911b719a45cd427e Mon Sep 17 00:00:00 2001
From: kevyuu <kevin.kayu@gmail.com>
Date: Wed, 30 Jul 2025 17:03:06 +0700
Subject: [PATCH 503/529] Add no null intersection shader

---
 71_RayTracingPipeline/main.cpp | 1 +
 1 file changed, 1 insertion(+)

diff --git a/71_RayTracingPipeline/main.cpp b/71_RayTracingPipeline/main.cpp
index 6b3273b43..ce3775be9 100644
--- a/71_RayTracingPipeline/main.cpp
+++ b/71_RayTracingPipeline/main.cpp
@@ -316,6 +316,7 @@ class RaytracingPipelineApp final : public SimpleWindowedApplication, public Bui
 
 			const auto pipeline = ICPURayTracingPipeline::create(cpuPipelineLayout.get());
 			pipeline->getCachedCreationParams() = {
+				.flags = IGPURayTracingPipeline::SCreationParams::FLAGS::NO_NULL_INTERSECTION_SHADERS,
 				.maxRecursionDepth = 1,
 				.dynamicStackSize = true,
 			};

From 48bdacaf581729513a7e9342ed7421e7bd5435bd Mon Sep 17 00:00:00 2001
From: Erfan Ahmadi <ahmadierfan99@gmail.com>
Date: Thu, 31 Jul 2025 12:17:33 +0400
Subject: [PATCH 504/529] Cad Example: SingleLineText to take wchar

---
 62_CAD/SingleLineText.cpp | 4 ++--
 62_CAD/SingleLineText.h   | 2 +-
 62_CAD/main.cpp           | 6 +++---
 3 files changed, 6 insertions(+), 6 deletions(-)

diff --git a/62_CAD/SingleLineText.cpp b/62_CAD/SingleLineText.cpp
index ea755a2df..76eb797e7 100644
--- a/62_CAD/SingleLineText.cpp
+++ b/62_CAD/SingleLineText.cpp
@@ -1,6 +1,6 @@
 #include "SingleLineText.h"
 
-SingleLineText::SingleLineText(nbl::ext::TextRendering::FontFace* face, const std::string& text)
+SingleLineText::SingleLineText(nbl::ext::TextRendering::FontFace* face, const std::wstring& text)
 {
 	m_glyphBoxes.reserve(text.length());
 
@@ -11,7 +11,7 @@ SingleLineText::SingleLineText(nbl::ext::TextRendering::FontFace* face, const st
 	float64_t2 currentPos = float32_t2(0.0, 0.0);
 	for (uint32_t i = 0; i < text.length(); i++)
 	{
-		const auto glyphIndex = face->getGlyphIndex(wchar_t(text.at(i)));
+		const auto glyphIndex = face->getGlyphIndex(text.at(i));
 		const auto glyphMetrics = face->getGlyphMetrics(glyphIndex);
 		const bool skipGenerateGlyph = (glyphIndex == 0 || (glyphMetrics.size.x == 0.0 && glyphMetrics.size.y == 0.0));
 
diff --git a/62_CAD/SingleLineText.h b/62_CAD/SingleLineText.h
index aef22892a..624f3399f 100644
--- a/62_CAD/SingleLineText.h
+++ b/62_CAD/SingleLineText.h
@@ -12,7 +12,7 @@ class SingleLineText
 {
 public:
 	// constructs and fills the `glyphBoxes`
-	SingleLineText(nbl::ext::TextRendering::FontFace* face, const std::string& text);
+	SingleLineText(nbl::ext::TextRendering::FontFace* face, const std::wstring& text);
 	
 	struct BoundingBox
 	{
diff --git a/62_CAD/main.cpp b/62_CAD/main.cpp
index c920f0e1f..f4a886791 100644
--- a/62_CAD/main.cpp
+++ b/62_CAD/main.cpp
@@ -83,7 +83,7 @@ constexpr std::array<float, (uint32_t)ExampleMode::CASE_COUNT> cameraExtents =
 	1000.0	// CASE_11
 };
 
-constexpr ExampleMode mode = ExampleMode::CASE_5;
+constexpr ExampleMode mode = ExampleMode::CASE_8;
 
 class Camera2D
 {
@@ -1098,10 +1098,10 @@ class ComputerAidedDesign final : public nbl::examples::SimpleWindowedApplicatio
 		if (m_font->getFreetypeFace()->num_charmaps > 0)
 			FT_Set_Charmap(m_font->getFreetypeFace(), m_font->getFreetypeFace()->charmaps[0]);
 		
-		const auto str = "MSDF: ABCDEFGHIJKLMNOPQRSTUVWXYZ abcdefghijklmnoprstuvwxyz '1234567890-=\"!@#$%&*()_+";
+		const std::wstring str = L"MSDF: ABCDEFGHIJKLMNOPQRSTUVWXYZ abcdefghijklmnoprstuvwxyz '1234567890-=\"!@#$%&*()_+";
 		singleLineText = std::unique_ptr<SingleLineText>(new SingleLineText(
 			m_font.get(),
-			std::string(str)));
+			str));
 
 		drawResourcesFiller.setGlyphMSDFTextureFunction(
 			[&](nbl::ext::TextRendering::FontFace* face, uint32_t glyphIdx) -> core::smart_refctd_ptr<asset::ICPUImage>

From 30a5a19a698758f3ca77e68f24beb74a77c41dae Mon Sep 17 00:00:00 2001
From: kevyuu <kevin.kayu@gmail.com>
Date: Mon, 4 Aug 2025 13:04:29 +0700
Subject: [PATCH 505/529] Remore normal type unknown from demo

---
 67_RayQueryGeometry/app_resources/common.hlsl    |  3 +--
 .../app_resources/render.comp.hlsl               |  2 +-
 67_RayQueryGeometry/main.cpp                     | 16 +++-------------
 71_RayTracingPipeline/app_resources/common.hlsl  |  3 +--
 .../app_resources/raytrace.rchit.hlsl            |  2 +-
 71_RayTracingPipeline/main.cpp                   | 16 +++-------------
 6 files changed, 10 insertions(+), 32 deletions(-)

diff --git a/67_RayQueryGeometry/app_resources/common.hlsl b/67_RayQueryGeometry/app_resources/common.hlsl
index 6a74a1fbe..68a353adc 100644
--- a/67_RayQueryGeometry/app_resources/common.hlsl
+++ b/67_RayQueryGeometry/app_resources/common.hlsl
@@ -9,7 +9,6 @@ enum NormalType : uint32_t
 {
     NT_R8G8B8A8_SNORM,
     NT_R32G32B32_SFLOAT,
-    NT_UNKNOWN
 };
 
 // we need bitfield support in NBL_HLSL_DECLARE_STRUCT it seems
@@ -19,7 +18,7 @@ struct SGeomInfo
     uint64_t indexBufferAddress;
     uint64_t normalBufferAddress;
 
-    uint32_t normalType : 2;
+    uint32_t normalType : 1;
     uint32_t indexType : 1; // 16 bit, 32 bit
 };
 
diff --git a/67_RayQueryGeometry/app_resources/render.comp.hlsl b/67_RayQueryGeometry/app_resources/render.comp.hlsl
index 135100573..6bfde98e5 100644
--- a/67_RayQueryGeometry/app_resources/render.comp.hlsl
+++ b/67_RayQueryGeometry/app_resources/render.comp.hlsl
@@ -53,7 +53,7 @@ float3 calculateNormals(int primID, SGeomInfo geom, float2 bary)
         }
     }
 
-    if (normalBufferAddress == 0 || normalType == NT_UNKNOWN)
+    if (normalBufferAddress == 0)
     {
         float3 v0 = vk::RawBufferLoad<float3>(vertexBufferAddress + indices[0] * 12);
         float3 v1 = vk::RawBufferLoad<float3>(vertexBufferAddress + indices[1] * 12);
diff --git a/67_RayQueryGeometry/main.cpp b/67_RayQueryGeometry/main.cpp
index 0f662c558..820b165b0 100644
--- a/67_RayQueryGeometry/main.cpp
+++ b/67_RayQueryGeometry/main.cpp
@@ -847,19 +847,9 @@ class RayQueryGeometryApp final : public SimpleWindowedApplication, public Built
 					const auto& normalView = gpuPolygon->getNormalView();
 					const uint64_t normalBufferAddress = normalView ? normalView.src.buffer->getDeviceAddress() + normalView.src.offset : 0;
 
-					const auto normalType = [&normalView]
-					{
-            if (!normalView) return NT_UNKNOWN;
-						switch (normalView.composed.format)
-						{
-						case EF_R32G32B32_SFLOAT:
-							return NT_R32G32B32_SFLOAT;
-						case EF_R8G8B8A8_SNORM:
-							return NT_R8G8B8A8_SNORM;
-						default:
-							return NT_UNKNOWN;
-						}
-					}();
+					auto normalType = NT_R32G32B32_SFLOAT;
+					if (normalView && normalView.composed.format == EF_R8G8B8A8_SNORM)
+						normalType = NT_R8G8B8A8_SNORM;
 
 					const auto& indexBufferBinding = gpuTriangles.indexData;
 					auto& geomInfo = geomInfos[i];
diff --git a/71_RayTracingPipeline/app_resources/common.hlsl b/71_RayTracingPipeline/app_resources/common.hlsl
index 6a0f65253..fd719b239 100644
--- a/71_RayTracingPipeline/app_resources/common.hlsl
+++ b/71_RayTracingPipeline/app_resources/common.hlsl
@@ -90,7 +90,6 @@ enum NormalType : uint32_t
 {
     NT_R8G8B8A8_SNORM,
     NT_R32G32B32_SFLOAT,
-    NT_UNKNOWN
 };
 
 struct STriangleGeomInfo
@@ -100,7 +99,7 @@ struct STriangleGeomInfo
     uint64_t indexBufferAddress;
     uint64_t normalBufferAddress;
 
-    uint32_t normalType : 2;
+    uint32_t normalType : 1;
     uint32_t indexType : 1; // 16 bit, 32 bit
 
 };
diff --git a/71_RayTracingPipeline/app_resources/raytrace.rchit.hlsl b/71_RayTracingPipeline/app_resources/raytrace.rchit.hlsl
index c63281782..891ba2f95 100644
--- a/71_RayTracingPipeline/app_resources/raytrace.rchit.hlsl
+++ b/71_RayTracingPipeline/app_resources/raytrace.rchit.hlsl
@@ -32,7 +32,7 @@ float3 calculateNormals(int primID, STriangleGeomInfo geom, float2 bary)
         }
     }
 
-    if (normalBufferAddress == 0 || normalType == NT_UNKNOWN)
+    if (normalBufferAddress == 0)
     {
         float3 v0 = vk::RawBufferLoad<float3>(vertexBufferAddress + indices[0] * 12);
         float3 v1 = vk::RawBufferLoad<float3>(vertexBufferAddress + indices[1] * 12);
diff --git a/71_RayTracingPipeline/main.cpp b/71_RayTracingPipeline/main.cpp
index dadff5c8d..766018788 100644
--- a/71_RayTracingPipeline/main.cpp
+++ b/71_RayTracingPipeline/main.cpp
@@ -1444,19 +1444,9 @@ class RaytracingPipelineApp final : public SimpleWindowedApplication, public Bui
 
 				const auto& normalView = gpuPolygon->getNormalView();
 				const uint64_t normalBufferAddress = normalView ? normalView.src.buffer->getDeviceAddress() + normalView.src.offset : 0;
-        const auto normalType = [&normalView]
-        {
-          if (!normalView) return NT_UNKNOWN;
-          switch (normalView.composed.format)
-          {
-          case EF_R32G32B32_SFLOAT:
-            return NT_R32G32B32_SFLOAT;
-          case EF_R8G8B8A8_SNORM:
-            return NT_R8G8B8A8_SNORM;
-          default:
-            return NT_UNKNOWN;
-          }
-        }();
+        auto normalType = NT_R32G32B32_SFLOAT;
+        if (normalView && normalView.composed.format == EF_R8G8B8A8_SNORM)
+          normalType = NT_R8G8B8A8_SNORM;
 
 				const auto& indexBufferBinding = gpuTriangles.indexData;
 				auto& geomInfo = geomInfos[i];

From 33015eee1762836cbf0d2194e2059d2c6a37de0d Mon Sep 17 00:00:00 2001
From: YasInvolved <szrtxm.op@gmail.com>
Date: Tue, 5 Aug 2025 21:11:37 +0200
Subject: [PATCH 506/529] parse --savemesh argument

---
 12_MeshLoaders/main.cpp | 45 +++++++++++++++++++++++++++++++++++++++++
 1 file changed, 45 insertions(+)

diff --git a/12_MeshLoaders/main.cpp b/12_MeshLoaders/main.cpp
index 3a4d8b13b..54349362d 100644
--- a/12_MeshLoaders/main.cpp
+++ b/12_MeshLoaders/main.cpp
@@ -4,6 +4,7 @@
 #include "common.hpp"
 
 #include "../3rdparty/portable-file-dialogs/portable-file-dialogs.h"
+#include "../3rdparty/argparse/include/argparse/argparse.hpp"
 
 #ifdef NBL_BUILD_MITSUBA_LOADER
 #include "nbl/ext/MitsubaLoader/CSerializedLoader.h"
@@ -21,6 +22,8 @@ class MeshLoadersApp final : public MonoWindowApplication, public BuiltinResourc
 
 		inline bool onAppInitialized(smart_refctd_ptr<ISystem>&& system) override
 		{
+			namespace fs = std::filesystem;
+
 			if (!asset_base_t::onAppInitialized(smart_refctd_ptr(system)))
 				return false;
 		#ifdef NBL_BUILD_MITSUBA_LOADER
@@ -29,6 +32,32 @@ class MeshLoadersApp final : public MonoWindowApplication, public BuiltinResourc
 			if (!device_base_t::onAppInitialized(smart_refctd_ptr(system)))
 				return false;
 
+			// parse args
+			argparse::ArgumentParser parser("12_meshloaders");
+			parser.add_argument("--savemesh")
+				.help("Save the displayed mesh on program termination at specified path")
+				.nargs(argparse::nargs_pattern::at_least_one);
+
+			try
+			{
+				parser.parse_args({ argv.data(), argv.data() + argv.size() });
+			}
+			catch (const std::exception& e)
+			{
+				return logFail(e.what());
+			}
+
+			if (parser.is_used("--savemesh"))
+			{
+				m_saveMeshOnExit = true;
+				std::string savePath = parser.get<std::string>("--savemesh");
+
+				if (!fs::exists(fs::path(savePath).parent_path()))
+					return logFail("Parent path for %s doesn't exist!", savePath.c_str());
+
+				m_savePath = std::move(savePath);
+			}
+
 			m_semaphore = m_device->createSemaphore(m_realFrameIx);
 			if (!m_semaphore)
 				return logFail("Failed to Create a Semaphore!");
@@ -176,6 +205,19 @@ class MeshLoadersApp final : public MonoWindowApplication, public BuiltinResourc
 			return retval;
 		}
 
+		inline bool onAppTerminated() override
+		{
+			// TODO: Save mesh if arg is enabled
+			if (m_saveMeshOnExit)
+			{ }
+
+			if (!device_base_t::onAppTerminated())
+				return false;
+
+			if (!asset_base_t::onAppTerminated())
+				return false;
+		}
+
 	protected:
 		const video::IGPURenderpass::SCreationParams::SSubpassDependency* getDefaultSubpassDependencies() const override
 		{
@@ -416,6 +458,9 @@ class MeshLoadersApp final : public MonoWindowApplication, public BuiltinResourc
 		Camera camera = Camera(core::vectorSIMDf(0, 0, 0), core::vectorSIMDf(0, 0, 0), core::matrix4SIMD());
 		// mutables
 		std::string m_modelPath;
+
+		std::string m_savePath;
+		bool m_saveMeshOnExit;
 };
 
 NBL_MAIN_FUNC(MeshLoadersApp)
\ No newline at end of file

From 4a01642e0e1f27519c8861688d39952a2bc612db Mon Sep 17 00:00:00 2001
From: YasInvolved <szrtxm.op@gmail.com>
Date: Tue, 5 Aug 2025 23:29:39 +0200
Subject: [PATCH 507/529] save loaded mesh (requires improvements)

---
 12_MeshLoaders/main.cpp | 51 ++++++++++++++++++++++-------------------
 1 file changed, 28 insertions(+), 23 deletions(-)

diff --git a/12_MeshLoaders/main.cpp b/12_MeshLoaders/main.cpp
index 54349362d..b876dd7ec 100644
--- a/12_MeshLoaders/main.cpp
+++ b/12_MeshLoaders/main.cpp
@@ -1,10 +1,10 @@
 // Copyright (C) 2018-2020 - DevSH Graphics Programming Sp. z O.O.
 // This file is part of the "Nabla Engine".
 // For conditions of distribution and use, see copyright notice in nabla.h
+#include "../3rdparty/argparse/include/argparse/argparse.hpp"
 #include "common.hpp"
 
 #include "../3rdparty/portable-file-dialogs/portable-file-dialogs.h"
-#include "../3rdparty/argparse/include/argparse/argparse.hpp"
 
 #ifdef NBL_BUILD_MITSUBA_LOADER
 #include "nbl/ext/MitsubaLoader/CSerializedLoader.h"
@@ -22,8 +22,6 @@ class MeshLoadersApp final : public MonoWindowApplication, public BuiltinResourc
 
 		inline bool onAppInitialized(smart_refctd_ptr<ISystem>&& system) override
 		{
-			namespace fs = std::filesystem;
-
 			if (!asset_base_t::onAppInitialized(smart_refctd_ptr(system)))
 				return false;
 		#ifdef NBL_BUILD_MITSUBA_LOADER
@@ -35,8 +33,8 @@ class MeshLoadersApp final : public MonoWindowApplication, public BuiltinResourc
 			// parse args
 			argparse::ArgumentParser parser("12_meshloaders");
 			parser.add_argument("--savemesh")
-				.help("Save the displayed mesh on program termination at specified path")
-				.nargs(argparse::nargs_pattern::at_least_one);
+				.help("Save the displayed mesh on program termination to the file with specified name. Takes filename without extension as an argument")
+				.flag();
 
 			try
 			{
@@ -47,16 +45,8 @@ class MeshLoadersApp final : public MonoWindowApplication, public BuiltinResourc
 				return logFail(e.what());
 			}
 
-			if (parser.is_used("--savemesh"))
-			{
+			if (parser["--savemesh"] == true)
 				m_saveMeshOnExit = true;
-				std::string savePath = parser.get<std::string>("--savemesh");
-
-				if (!fs::exists(fs::path(savePath).parent_path()))
-					return logFail("Parent path for %s doesn't exist!", savePath.c_str());
-
-				m_savePath = std::move(savePath);
-			}
 
 			m_semaphore = m_device->createSemaphore(m_realFrameIx);
 			if (!m_semaphore)
@@ -207,15 +197,28 @@ class MeshLoadersApp final : public MonoWindowApplication, public BuiltinResourc
 
 		inline bool onAppTerminated() override
 		{
-			// TODO: Save mesh if arg is enabled
 			if (m_saveMeshOnExit)
-			{ }
+			{
+				// make save path
+				static const auto prefix = std::filesystem::absolute("saved/");
+
+				if (!std::filesystem::exists(prefix))
+					m_system->createDirectory(prefix);
+
+				auto savePath = (prefix / path(m_modelPath).filename()).generic_string();
+			
+				m_logger->log("Saving mesh to %S", ILogger::ELL_INFO, savePath.c_str()); 
+				// TODO (Yas): learn how to get out the geometry from renderer and transform it into IAsset
+				
+				auto& asset = m_currentBundle.getContents()[0];
+				IAssetWriter::SAssetWriteParams params{ asset.get() };
+				m_assetMgr->writeAsset(savePath, params);
+			}
 
 			if (!device_base_t::onAppTerminated())
 				return false;
 
-			if (!asset_base_t::onAppTerminated())
-				return false;
+			return true;
 		}
 
 	protected:
@@ -291,16 +294,16 @@ class MeshLoadersApp final : public MonoWindowApplication, public BuiltinResourc
 			//! load the geometry
 			IAssetLoader::SAssetLoadParams params = {};
 			params.logger = m_logger.get();
-			auto bundle = m_assetMgr->getAsset(m_modelPath,params);
-			if (bundle.getContents().empty())
+			m_currentBundle = m_assetMgr->getAsset(m_modelPath,params);
+			if (m_currentBundle.getContents().empty())
 				return false;
 
 			// 
 			core::vector<smart_refctd_ptr<const ICPUPolygonGeometry>> geometries;
-			switch (bundle.getAssetType())
+			switch (m_currentBundle.getAssetType())
 			{
 				case IAsset::E_TYPE::ET_GEOMETRY:
-					for (const auto& item : bundle.getContents())
+					for (const auto& item : m_currentBundle.getContents())
 					if (auto polyGeo=IAsset::castDown<ICPUPolygonGeometry>(item); polyGeo)
 						geometries.push_back(polyGeo);
 					break;
@@ -459,7 +462,9 @@ class MeshLoadersApp final : public MonoWindowApplication, public BuiltinResourc
 		// mutables
 		std::string m_modelPath;
 
-		std::string m_savePath;
+		SAssetBundle m_currentBundle;
+
+		std::string m_saveFileName; // NOTE: no extension
 		bool m_saveMeshOnExit;
 };
 

From 4551f28b73a158e7a37e4f6d635917e35b5e7f1e Mon Sep 17 00:00:00 2001
From: YasInvolved <szrtxm.op@gmail.com>
Date: Mon, 11 Aug 2025 12:26:12 +0200
Subject: [PATCH 508/529] argparse by relative path

---
 12_MeshLoaders/CMakeLists.txt | 5 ++++-
 12_MeshLoaders/main.cpp       | 2 +-
 2 files changed, 5 insertions(+), 2 deletions(-)

diff --git a/12_MeshLoaders/CMakeLists.txt b/12_MeshLoaders/CMakeLists.txt
index dee195066..d2ea26ef5 100644
--- a/12_MeshLoaders/CMakeLists.txt
+++ b/12_MeshLoaders/CMakeLists.txt
@@ -15,4 +15,7 @@ endif()
 	# TODO; Arek I removed `NBL_EXECUTABLE_PROJECT_CREATION_PCH_TARGET` from the last parameter here, doesn't this macro have 4 arguments anyway !?
 nbl_create_executable_project("" "" "${NBL_INCLUDE_SERACH_DIRECTORIES}" "${NBL_LIBRARIES}")
 # TODO: Arek temporarily disabled cause I haven't figured out how to make this target yet
-# LINK_BUILTIN_RESOURCES_TO_TARGET(${EXECUTABLE_NAME} nblExamplesGeometrySpirvBRD)
\ No newline at end of file
+# LINK_BUILTIN_RESOURCES_TO_TARGET(${EXECUTABLE_NAME} nblExamplesGeometrySpirvBRD)
+
+add_dependencies(${EXECUTABLE_NAME} argparse)
+target_include_directories(${EXECUTABLE_NAME} PUBLIC $<TARGET_PROPERTY:argparse,INTERFACE_INCLUDE_DIRECTORIES>)
\ No newline at end of file
diff --git a/12_MeshLoaders/main.cpp b/12_MeshLoaders/main.cpp
index b876dd7ec..94e4a2aec 100644
--- a/12_MeshLoaders/main.cpp
+++ b/12_MeshLoaders/main.cpp
@@ -1,7 +1,7 @@
 // Copyright (C) 2018-2020 - DevSH Graphics Programming Sp. z O.O.
 // This file is part of the "Nabla Engine".
 // For conditions of distribution and use, see copyright notice in nabla.h
-#include "../3rdparty/argparse/include/argparse/argparse.hpp"
+#include "argparse/argparse.hpp"
 #include "common.hpp"
 
 #include "../3rdparty/portable-file-dialogs/portable-file-dialogs.h"

From 128fe5bca23c6a6796c56b1d466197d673851875 Mon Sep 17 00:00:00 2001
From: YasInvolved <szrtxm.op@gmail.com>
Date: Mon, 11 Aug 2025 13:17:14 +0200
Subject: [PATCH 509/529] only geometry is intended to be written

---
 12_MeshLoaders/main.cpp | 18 ++++++++++--------
 1 file changed, 10 insertions(+), 8 deletions(-)

diff --git a/12_MeshLoaders/main.cpp b/12_MeshLoaders/main.cpp
index 94e4a2aec..c820485c6 100644
--- a/12_MeshLoaders/main.cpp
+++ b/12_MeshLoaders/main.cpp
@@ -208,10 +208,10 @@ class MeshLoadersApp final : public MonoWindowApplication, public BuiltinResourc
 				auto savePath = (prefix / path(m_modelPath).filename()).generic_string();
 			
 				m_logger->log("Saving mesh to %S", ILogger::ELL_INFO, savePath.c_str()); 
-				// TODO (Yas): learn how to get out the geometry from renderer and transform it into IAsset
 				
-				auto& asset = m_currentBundle.getContents()[0];
-				IAssetWriter::SAssetWriteParams params{ asset.get() };
+				// should I do a const cast here?
+				const IAsset* asset = m_currentGeom.get();
+				IAssetWriter::SAssetWriteParams params{ const_cast<IAsset*>(asset) };
 				m_assetMgr->writeAsset(savePath, params);
 			}
 
@@ -294,16 +294,16 @@ class MeshLoadersApp final : public MonoWindowApplication, public BuiltinResourc
 			//! load the geometry
 			IAssetLoader::SAssetLoadParams params = {};
 			params.logger = m_logger.get();
-			m_currentBundle = m_assetMgr->getAsset(m_modelPath,params);
-			if (m_currentBundle.getContents().empty())
+			auto asset = m_assetMgr->getAsset(m_modelPath,params);
+			if (asset.getContents().empty())
 				return false;
 
 			// 
 			core::vector<smart_refctd_ptr<const ICPUPolygonGeometry>> geometries;
-			switch (m_currentBundle.getAssetType())
+			switch (asset.getAssetType())
 			{
 				case IAsset::E_TYPE::ET_GEOMETRY:
-					for (const auto& item : m_currentBundle.getContents())
+					for (const auto& item : asset.getContents())
 					if (auto polyGeo=IAsset::castDown<ICPUPolygonGeometry>(item); polyGeo)
 						geometries.push_back(polyGeo);
 					break;
@@ -314,6 +314,8 @@ class MeshLoadersApp final : public MonoWindowApplication, public BuiltinResourc
 			if (geometries.empty())
 				return false;
 
+			m_currentGeom = geometries[0];
+
 			using aabb_t = hlsl::shapes::AABB<3,double>;
 			auto printAABB = [&](const aabb_t& aabb, const char* extraMsg="")->void
 			{
@@ -462,7 +464,7 @@ class MeshLoadersApp final : public MonoWindowApplication, public BuiltinResourc
 		// mutables
 		std::string m_modelPath;
 
-		SAssetBundle m_currentBundle;
+		smart_refctd_ptr<const ICPUPolygonGeometry> m_currentGeom;
 
 		std::string m_saveFileName; // NOTE: no extension
 		bool m_saveMeshOnExit;

From 4dabe03c97f67664cda83becffdeab5759cf0afc Mon Sep 17 00:00:00 2001
From: YasInvolved <szrtxm.op@gmail.com>
Date: Mon, 11 Aug 2025 13:27:42 +0200
Subject: [PATCH 510/529] fix naming, write mesh on reload

---
 12_MeshLoaders/main.cpp | 42 ++++++++++++++++++++++++-----------------
 1 file changed, 25 insertions(+), 17 deletions(-)

diff --git a/12_MeshLoaders/main.cpp b/12_MeshLoaders/main.cpp
index c820485c6..b84588b8f 100644
--- a/12_MeshLoaders/main.cpp
+++ b/12_MeshLoaders/main.cpp
@@ -46,7 +46,7 @@ class MeshLoadersApp final : public MonoWindowApplication, public BuiltinResourc
 			}
 
 			if (parser["--savemesh"] == true)
-				m_saveMeshOnExit = true;
+				m_saveGeomOnExit = true;
 
 			m_semaphore = m_device->createSemaphore(m_realFrameIx);
 			if (!m_semaphore)
@@ -197,22 +197,9 @@ class MeshLoadersApp final : public MonoWindowApplication, public BuiltinResourc
 
 		inline bool onAppTerminated() override
 		{
-			if (m_saveMeshOnExit)
+			if (m_saveGeomOnExit)
 			{
-				// make save path
-				static const auto prefix = std::filesystem::absolute("saved/");
-
-				if (!std::filesystem::exists(prefix))
-					m_system->createDirectory(prefix);
-
-				auto savePath = (prefix / path(m_modelPath).filename()).generic_string();
-			
-				m_logger->log("Saving mesh to %S", ILogger::ELL_INFO, savePath.c_str()); 
-				
-				// should I do a const cast here?
-				const IAsset* asset = m_currentGeom.get();
-				IAssetWriter::SAssetWriteParams params{ const_cast<IAsset*>(asset) };
-				m_assetMgr->writeAsset(savePath, params);
+				writeGeometry();
 			}
 
 			if (!device_base_t::onAppTerminated())
@@ -286,6 +273,9 @@ class MeshLoadersApp final : public MonoWindowApplication, public BuiltinResourc
 				m_modelPath = file.result()[0];
 			}
 
+			if (m_saveGeomOnExit && m_currentGeom)
+				writeGeometry();
+
 			// free up
 			m_renderer->m_instances.clear();
 			m_renderer->clearGeometries({.semaphore=m_semaphore.get(),.value=m_realFrameIx});
@@ -448,6 +438,24 @@ class MeshLoadersApp final : public MonoWindowApplication, public BuiltinResourc
 			return true;
 		}
 
+		void writeGeometry()
+		{
+			// make save path
+			static const auto prefix = std::filesystem::absolute("saved/");
+
+			if (!std::filesystem::exists(prefix))
+				m_system->createDirectory(prefix);
+
+			auto savePath = (prefix / path(m_modelPath).filename()).generic_string();
+
+			m_logger->log("Saving mesh to %S", ILogger::ELL_INFO, savePath.c_str());
+
+			// should I do a const cast here?
+			const IAsset* asset = m_currentGeom.get();
+			IAssetWriter::SAssetWriteParams params{ const_cast<IAsset*>(asset) };
+			m_assetMgr->writeAsset(savePath, params);
+		}
+
 		// Maximum frames which can be simultaneously submitted, used to cycle through our per-frame resources like command buffers
 		constexpr static inline uint32_t MaxFramesInFlight = 3u;
 		//
@@ -467,7 +475,7 @@ class MeshLoadersApp final : public MonoWindowApplication, public BuiltinResourc
 		smart_refctd_ptr<const ICPUPolygonGeometry> m_currentGeom;
 
 		std::string m_saveFileName; // NOTE: no extension
-		bool m_saveMeshOnExit;
+		bool m_saveGeomOnExit;
 };
 
 NBL_MAIN_FUNC(MeshLoadersApp)
\ No newline at end of file

From 3de2363c2a91dbe6305cfe43117685a1d2842ffb Mon Sep 17 00:00:00 2001
From: devsh <devsh@devsh.eu>
Date: Tue, 12 Aug 2025 00:03:23 +0200
Subject: [PATCH 511/529] re-add newly reworked examples to CI pipeline

---
 CMakeLists.txt | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/CMakeLists.txt b/CMakeLists.txt
index 5e02eadc1..5ef4c0efc 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -82,11 +82,11 @@ if(NBL_BUILD_EXAMPLES)
 	add_subdirectory(0_ImportanceSamplingEnvMaps EXCLUDE_FROM_ALL) #TODO: integrate back into 42
 
 	add_subdirectory(66_HLSLBxDFTests EXCLUDE_FROM_ALL)
-	add_subdirectory(67_RayQueryGeometry EXCLUDE_FROM_ALL) # TODO: resurrect before `mesh_loaders` merge
+	add_subdirectory(67_RayQueryGeometry)
 	add_subdirectory(68_JpegLoading)
 
   	add_subdirectory(70_FLIPFluids)
-	add_subdirectory(71_RayTracingPipeline EXCLUDE_FROM_ALL) # TODO: resurrect before `mesh_loaders` merge
+	add_subdirectory(71_RayTracingPipeline)
 
 	# add new examples *before* NBL_GET_ALL_TARGETS invocation, it gathers recursively all targets created so far in this subdirectory
 	NBL_GET_ALL_TARGETS(TARGETS)

From 7899653c3ede67a9ac2ecc2d7beda580e75bd222 Mon Sep 17 00:00:00 2001
From: YasInvolved <szrtxm.op@gmail.com>
Date: Tue, 12 Aug 2025 14:29:13 +0200
Subject: [PATCH 512/529] prompt user to choose the file to which the geometry
 will be saved

---
 12_MeshLoaders/main.cpp | 18 ++++++++++--------
 1 file changed, 10 insertions(+), 8 deletions(-)

diff --git a/12_MeshLoaders/main.cpp b/12_MeshLoaders/main.cpp
index b84588b8f..001a8447e 100644
--- a/12_MeshLoaders/main.cpp
+++ b/12_MeshLoaders/main.cpp
@@ -254,6 +254,8 @@ class MeshLoadersApp final : public MonoWindowApplication, public BuiltinResourc
 
 		bool reloadModel()
 		{
+			m_currentGeom = nullptr;
+
 			if (m_nonInteractiveTest) // TODO: maybe also take from argv and argc
 				m_modelPath = (sharedInputCWD/"ply/Spanner-ply.ply").string();
 			else
@@ -440,20 +442,20 @@ class MeshLoadersApp final : public MonoWindowApplication, public BuiltinResourc
 
 		void writeGeometry()
 		{
-			// make save path
-			static const auto prefix = std::filesystem::absolute("saved/");
-
-			if (!std::filesystem::exists(prefix))
-				m_system->createDirectory(prefix);
+			auto dest = pfd::save_file("Save Geometry", sharedInputCWD.string(),
+				{ "All Supported Formats", "*.stl *.ply *.serialized" },
+				pfd::opt::force_overwrite
+			).result();
 
-			auto savePath = (prefix / path(m_modelPath).filename()).generic_string();
+			if (dest.empty())
+				return;
 
-			m_logger->log("Saving mesh to %S", ILogger::ELL_INFO, savePath.c_str());
+			m_logger->log("Saving mesh to %S", ILogger::ELL_INFO, dest.c_str());
 
 			// should I do a const cast here?
 			const IAsset* asset = m_currentGeom.get();
 			IAssetWriter::SAssetWriteParams params{ const_cast<IAsset*>(asset) };
-			m_assetMgr->writeAsset(savePath, params);
+			m_assetMgr->writeAsset(dest, params);
 		}
 
 		// Maximum frames which can be simultaneously submitted, used to cycle through our per-frame resources like command buffers

From 9c24b5249312411c42ae13707b26fdb1a1c9aab9 Mon Sep 17 00:00:00 2001
From: YasInvolved <szrtxm.op@gmail.com>
Date: Tue, 12 Aug 2025 14:35:02 +0200
Subject: [PATCH 513/529] check if geometry is null on exit and null it out
 after saving, log when invalid path has been selected

---
 12_MeshLoaders/main.cpp | 6 +++++-
 1 file changed, 5 insertions(+), 1 deletion(-)

diff --git a/12_MeshLoaders/main.cpp b/12_MeshLoaders/main.cpp
index 001a8447e..95062a493 100644
--- a/12_MeshLoaders/main.cpp
+++ b/12_MeshLoaders/main.cpp
@@ -197,7 +197,7 @@ class MeshLoadersApp final : public MonoWindowApplication, public BuiltinResourc
 
 		inline bool onAppTerminated() override
 		{
-			if (m_saveGeomOnExit)
+			if (m_saveGeomOnExit && m_currentGeom)
 			{
 				writeGeometry();
 			}
@@ -448,7 +448,10 @@ class MeshLoadersApp final : public MonoWindowApplication, public BuiltinResourc
 			).result();
 
 			if (dest.empty())
+			{
+				m_logger->log("Invalid path has been selected. Geometry won't be saved.", ILogger::ELL_ERROR);
 				return;
+			}
 
 			m_logger->log("Saving mesh to %S", ILogger::ELL_INFO, dest.c_str());
 
@@ -456,6 +459,7 @@ class MeshLoadersApp final : public MonoWindowApplication, public BuiltinResourc
 			const IAsset* asset = m_currentGeom.get();
 			IAssetWriter::SAssetWriteParams params{ const_cast<IAsset*>(asset) };
 			m_assetMgr->writeAsset(dest, params);
+			m_currentGeom = nullptr;
 		}
 
 		// Maximum frames which can be simultaneously submitted, used to cycle through our per-frame resources like command buffers

From 2c879de20c36096d0492309f581f5a095a0e2d3d Mon Sep 17 00:00:00 2001
From: YasInvolved <szrtxm.op@gmail.com>
Date: Tue, 12 Aug 2025 15:10:38 +0200
Subject: [PATCH 514/529] allow user to specify path in additional optional
 argument

---
 12_MeshLoaders/main.cpp | 36 +++++++++++++++++++++++++++---------
 1 file changed, 27 insertions(+), 9 deletions(-)

diff --git a/12_MeshLoaders/main.cpp b/12_MeshLoaders/main.cpp
index 95062a493..2cfe0dc53 100644
--- a/12_MeshLoaders/main.cpp
+++ b/12_MeshLoaders/main.cpp
@@ -33,9 +33,13 @@ class MeshLoadersApp final : public MonoWindowApplication, public BuiltinResourc
 			// parse args
 			argparse::ArgumentParser parser("12_meshloaders");
 			parser.add_argument("--savemesh")
-				.help("Save the displayed mesh on program termination to the file with specified name. Takes filename without extension as an argument")
+				.help("Save the mesh on exit or reload")
 				.flag();
 
+			parser.add_argument("--savepath")
+				.nargs(1)
+				.help("Specify the file to which the mesh will be saved");
+
 			try
 			{
 				parser.parse_args({ argv.data(), argv.data() + argv.size() });
@@ -48,6 +52,19 @@ class MeshLoadersApp final : public MonoWindowApplication, public BuiltinResourc
 			if (parser["--savemesh"] == true)
 				m_saveGeomOnExit = true;
 
+			if (parser.present("--savepath"))
+			{
+				auto tmp = path(parser.get<std::string>("--savepath"));
+				
+				if (tmp.empty() || !tmp.has_filename())
+					return logFail("Invalid path has been specified in --savepath argument");
+
+				if (!std::filesystem::exists(tmp.parent_path()))
+					return logFail("Path specified in --savepath argument doesn't exist");
+
+				m_geomSavePath.emplace(std::move(tmp));
+			}
+
 			m_semaphore = m_device->createSemaphore(m_realFrameIx);
 			if (!m_semaphore)
 				return logFail("Failed to Create a Semaphore!");
@@ -198,9 +215,7 @@ class MeshLoadersApp final : public MonoWindowApplication, public BuiltinResourc
 		inline bool onAppTerminated() override
 		{
 			if (m_saveGeomOnExit && m_currentGeom)
-			{
 				writeGeometry();
-			}
 
 			if (!device_base_t::onAppTerminated())
 				return false;
@@ -442,10 +457,13 @@ class MeshLoadersApp final : public MonoWindowApplication, public BuiltinResourc
 
 		void writeGeometry()
 		{
-			auto dest = pfd::save_file("Save Geometry", sharedInputCWD.string(),
-				{ "All Supported Formats", "*.stl *.ply *.serialized" },
-				pfd::opt::force_overwrite
-			).result();
+			if (!m_geomSavePath.has_value())
+				m_geomSavePath = pfd::save_file("Save Geometry", sharedInputCWD.string(),
+					{ "All Supported Formats (.stl, .ply, .serialized)", "*.stl *.ply *.serialized" },
+					pfd::opt::force_overwrite
+				).result();
+
+			auto& dest = m_geomSavePath.value();
 
 			if (dest.empty())
 			{
@@ -458,7 +476,7 @@ class MeshLoadersApp final : public MonoWindowApplication, public BuiltinResourc
 			// should I do a const cast here?
 			const IAsset* asset = m_currentGeom.get();
 			IAssetWriter::SAssetWriteParams params{ const_cast<IAsset*>(asset) };
-			m_assetMgr->writeAsset(dest, params);
+			m_assetMgr->writeAsset(dest.string(), params);
 			m_currentGeom = nullptr;
 		}
 
@@ -480,8 +498,8 @@ class MeshLoadersApp final : public MonoWindowApplication, public BuiltinResourc
 
 		smart_refctd_ptr<const ICPUPolygonGeometry> m_currentGeom;
 
-		std::string m_saveFileName; // NOTE: no extension
 		bool m_saveGeomOnExit;
+		std::optional<nbl::system::path> m_geomSavePath;
 };
 
 NBL_MAIN_FUNC(MeshLoadersApp)
\ No newline at end of file

From b51add6e53f470577da89f6aaa66b3d03713a684 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Jakub=20B=C4=85czyk?= <szrtxm.op@gmail.com>
Date: Tue, 12 Aug 2025 18:29:31 +0200
Subject: [PATCH 515/529] change sharedInputCWD to localOutputCWD in geometry
 saving function

---
 12_MeshLoaders/main.cpp | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/12_MeshLoaders/main.cpp b/12_MeshLoaders/main.cpp
index 2cfe0dc53..187861441 100644
--- a/12_MeshLoaders/main.cpp
+++ b/12_MeshLoaders/main.cpp
@@ -458,7 +458,7 @@ class MeshLoadersApp final : public MonoWindowApplication, public BuiltinResourc
 		void writeGeometry()
 		{
 			if (!m_geomSavePath.has_value())
-				m_geomSavePath = pfd::save_file("Save Geometry", sharedInputCWD.string(),
+				m_geomSavePath = pfd::save_file("Save Geometry", localOutputCWD.string(),
 					{ "All Supported Formats (.stl, .ply, .serialized)", "*.stl *.ply *.serialized" },
 					pfd::opt::force_overwrite
 				).result();

From 052158c22b934e28082b33dfa7ce01cdba5cdf3a Mon Sep 17 00:00:00 2001
From: kevyuu <kevin.kayu@gmail.com>
Date: Fri, 15 Aug 2025 16:48:34 +0700
Subject: [PATCH 516/529] Use spirv intrinsics for raytracing command and
 builtin

---
 .../app_resources/raytrace.rahit.hlsl         |  4 ++++
 .../app_resources/raytrace.rchit.hlsl         | 11 ++++++----
 .../app_resources/raytrace.rgen.hlsl          | 21 +++++++++++++------
 .../app_resources/raytrace.rint.hlsl          |  8 +++++--
 .../raytrace_procedural.rchit.hlsl            |  7 +++++--
 .../app_resources/raytrace_shadow.rahit.hlsl  |  3 +++
 6 files changed, 40 insertions(+), 14 deletions(-)

diff --git a/71_RayTracingPipeline/app_resources/raytrace.rahit.hlsl b/71_RayTracingPipeline/app_resources/raytrace.rahit.hlsl
index 97713b3ec..4bc226473 100644
--- a/71_RayTracingPipeline/app_resources/raytrace.rahit.hlsl
+++ b/71_RayTracingPipeline/app_resources/raytrace.rahit.hlsl
@@ -1,5 +1,9 @@
 #include "common.hlsl"
 
+#include "nbl/builtin/hlsl/spirv_intrinsics/raytracing.hlsl"
+
+using namespace nbl::hlsl;
+
 [[vk::push_constant]] SPushConstants pc;
 
 [shader("anyhit")]
diff --git a/71_RayTracingPipeline/app_resources/raytrace.rchit.hlsl b/71_RayTracingPipeline/app_resources/raytrace.rchit.hlsl
index 891ba2f95..3e0b4b102 100644
--- a/71_RayTracingPipeline/app_resources/raytrace.rchit.hlsl
+++ b/71_RayTracingPipeline/app_resources/raytrace.rchit.hlsl
@@ -1,7 +1,10 @@
 #include "common.hlsl"
 
+#include "nbl/builtin/hlsl/spirv_intrinsics/raytracing.hlsl"
 #include "nbl/builtin/hlsl/bda/__ptr.hlsl"
 
+using namespace nbl::hlsl;
+
 [[vk::push_constant]] SPushConstants pc;
 
 float3 calculateNormals(int primID, STriangleGeomInfo geom, float2 bary)
@@ -75,15 +78,15 @@ float3 calculateNormals(int primID, STriangleGeomInfo geom, float2 bary)
 void main(inout PrimaryPayload payload, in BuiltInTriangleIntersectionAttributes attribs)
 {
     const int primID = PrimitiveIndex();
-    const int instanceCustomIndex = InstanceIndex();
-    const int geometryIndex = GeometryIndex();
+    const int instanceCustomIndex = spirv::InstanceCustomIndexKHR;
+    const int geometryIndex = spirv::RayGeometryIndexKHR;
     const STriangleGeomInfo geom = vk::RawBufferLoad < STriangleGeomInfo > (pc.triangleGeomInfoBuffer + (instanceCustomIndex + geometryIndex) * sizeof(STriangleGeomInfo));
     const float32_t3 vertexNormal = calculateNormals(primID, geom, attribs.barycentrics);
-    const float32_t3 worldNormal = normalize(mul(vertexNormal, WorldToObject3x4()).xyz);
+    const float32_t3 worldNormal = normalize(mul(vertexNormal, transpose(spirv::WorldToObjectKHR)).xyz);
 
     payload.materialId = MaterialId::createTriangle(instanceCustomIndex);
 
     payload.worldNormal = worldNormal;
-    payload.rayDistance = RayTCurrent();
+    payload.rayDistance = spirv::RayTmaxKHR;
 
 }
\ No newline at end of file
diff --git a/71_RayTracingPipeline/app_resources/raytrace.rgen.hlsl b/71_RayTracingPipeline/app_resources/raytrace.rgen.hlsl
index 55b014d07..43d16f161 100644
--- a/71_RayTracingPipeline/app_resources/raytrace.rgen.hlsl
+++ b/71_RayTracingPipeline/app_resources/raytrace.rgen.hlsl
@@ -9,6 +9,8 @@
 static const int32_t s_sampleCount = 10;
 static const float32_t3 s_clearColor = float32_t3(0.3, 0.3, 0.8);
 
+using namespace nbl::hlsl;
+
 [[vk::push_constant]] SPushConstants pc;
 
 [[vk::binding(0, 0)]] RaytracingAccelerationStructure topLevelAS;
@@ -23,8 +25,8 @@ float32_t nextRandomUnorm(inout nbl::hlsl::Xoroshiro64StarStar rnd)
 [shader("raygeneration")]
 void main()
 {
-    const uint32_t3 launchID = DispatchRaysIndex();
-    const uint32_t3 launchSize = DispatchRaysDimensions();
+    const uint32_t3 launchID = spirv::LaunchIdKHR;
+    const uint32_t3 launchSize = spirv::LaunchSizeKHR;
     const uint32_t2 coords = launchID.xy;
 
     const uint32_t seed1 = nbl::hlsl::random::Pcg::create(pc.frameCounter)();
@@ -53,9 +55,11 @@ void main()
         rayDesc.TMin = 0.01;
         rayDesc.TMax = 10000.0;
         
+        [[vk::ext_storage_class(spv::StorageClassRayPayloadKHR)]]
         PrimaryPayload payload;
         payload.pcg = PrimaryPayload::generator_t::create(rnd());
-        TraceRay(topLevelAS, RAY_FLAG_NONE, 0xff, ERT_PRIMARY, 0, EMT_PRIMARY, rayDesc, payload);
+        spirv::traceRayKHR(topLevelAS, spv::RayFlagsMaskNone, 0xff, ERT_PRIMARY, 0, EMT_PRIMARY, rayDesc.Origin, rayDesc.TMin, rayDesc.Direction, rayDesc.TMax, payload);
+        // TraceRay(topLevelAS, RAY_FLAG_NONE, 0xff, ERT_PRIMARY, 0, EMT_PRIMARY, rayDesc, payload);
 
         const float32_t rayDistance = payload.rayDistance;
         if (rayDistance < 0)
@@ -67,9 +71,10 @@ void main()
         const float32_t3 worldPosition = pc.camPos + (camDirection * rayDistance);
 
         // make sure to call with least live state
+        [[vk::ext_storage_class(spv::StorageClassCallableDataKHR)]]
         RayLight cLight;
         cLight.inHitPosition = worldPosition;
-        CallShader(pc.light.type, cLight);
+        spirv::executeCallable(pc.light.type, cLight);
 
         const float32_t3 worldNormal = payload.worldNormal;
 
@@ -97,12 +102,16 @@ void main()
             rayDesc.TMin = 0.01;
             rayDesc.TMax = cLight.outLightDistance;
 
+            [[vk::ext_storage_class(spv::StorageClassRayPayloadKHR)]]
             OcclusionPayload occlusionPayload;
             // negative means its a hit, the miss shader will flip it back around to positive
             occlusionPayload.attenuation = -1.f;
             // abuse of miss shader to mean "not hit shader" solves us having to call closest hit shaders
-            uint32_t shadowRayFlags = RAY_FLAG_ACCEPT_FIRST_HIT_AND_END_SEARCH | RAY_FLAG_SKIP_CLOSEST_HIT_SHADER;
-            TraceRay(topLevelAS, shadowRayFlags, 0xFF, ERT_OCCLUSION, 0, EMT_OCCLUSION, rayDesc, occlusionPayload);
+            uint32_t shadowRayFlags = spv::RayFlagsTerminateOnFirstHitKHRMask | spv::RayFlagsSkipClosestHitShaderKHRMask;
+            spirv::traceRayKHR(topLevelAS, shadowRayFlags, 0xFF, ERT_OCCLUSION, 0, EMT_OCCLUSION, rayDesc.Origin, rayDesc.TMin, rayDesc.Direction, rayDesc.TMax, occlusionPayload);
+
+            // uint32_t shadowRayFlags = RAY_FLAG_ACCEPT_FIRST_HIT_AND_END_SEARCH | RAY_FLAG_SKIP_CLOSEST_HIT_SHADER;
+            // TraceRay(topLevelAS, shadowRayFlags, 0xFF, ERT_OCCLUSION, 0, EMT_OCCLUSION, rayDesc, occlusionPayload);
 
             attenuation = occlusionPayload.attenuation;
             if (occlusionPayload.attenuation > 1.f/1024.f)
diff --git a/71_RayTracingPipeline/app_resources/raytrace.rint.hlsl b/71_RayTracingPipeline/app_resources/raytrace.rint.hlsl
index d081c9248..e4646512d 100644
--- a/71_RayTracingPipeline/app_resources/raytrace.rint.hlsl
+++ b/71_RayTracingPipeline/app_resources/raytrace.rint.hlsl
@@ -1,5 +1,9 @@
 #include "common.hlsl"
 
+#include "nbl/builtin/hlsl/spirv_intrinsics/raytracing.hlsl"
+
+using namespace nbl::hlsl;
+
 [[vk::push_constant]] SPushConstants pc;
 
 struct Ray
@@ -26,8 +30,8 @@ float32_t hitSphere(SProceduralGeomInfo s, Ray r)
 void main()
 {
     Ray ray;
-    ray.origin = WorldRayOrigin();
-    ray.direction = WorldRayDirection();
+    ray.origin = spirv::WorldRayOriginKHR;
+    ray.direction = spirv::WorldRayDirectionKHR;
 
     const int primID = PrimitiveIndex();
 
diff --git a/71_RayTracingPipeline/app_resources/raytrace_procedural.rchit.hlsl b/71_RayTracingPipeline/app_resources/raytrace_procedural.rchit.hlsl
index df9ef9623..29f26c902 100644
--- a/71_RayTracingPipeline/app_resources/raytrace_procedural.rchit.hlsl
+++ b/71_RayTracingPipeline/app_resources/raytrace_procedural.rchit.hlsl
@@ -1,16 +1,19 @@
 #include "common.hlsl"
 
+#include "nbl/builtin/hlsl/spirv_intrinsics/raytracing.hlsl"
+using namespace nbl::hlsl;
+
 [[vk::push_constant]] SPushConstants pc;
 
 [shader("closesthit")]
 void main(inout PrimaryPayload payload, in ProceduralHitAttribute attrib)
 {
-    const float32_t3 worldPosition = WorldRayOrigin() + WorldRayDirection() * RayTCurrent();
+    const float32_t3 worldPosition = spirv::WorldRayOriginKHR + spirv::WorldRayDirectionKHR * spirv::RayTmaxKHR;
     const float32_t3 worldNormal = normalize(worldPosition - attrib.center);
 
     payload.materialId = MaterialId::createProcedural(PrimitiveIndex()); // we use negative value to indicate that this is procedural
 
     payload.worldNormal = worldNormal;
-    payload.rayDistance = RayTCurrent();
+    payload.rayDistance = spirv::RayTmaxKHR;
 
 }
\ No newline at end of file
diff --git a/71_RayTracingPipeline/app_resources/raytrace_shadow.rahit.hlsl b/71_RayTracingPipeline/app_resources/raytrace_shadow.rahit.hlsl
index a3432b812..0741a138a 100644
--- a/71_RayTracingPipeline/app_resources/raytrace_shadow.rahit.hlsl
+++ b/71_RayTracingPipeline/app_resources/raytrace_shadow.rahit.hlsl
@@ -1,5 +1,8 @@
 #include "common.hlsl"
 #include "nbl/builtin/hlsl/spirv_intrinsics/raytracing.hlsl"
+#include "nbl/builtin/hlsl/spirv_intrinsics/core.hlsl"
+
+using namespace nbl::hlsl;
 
 [[vk::push_constant]] SPushConstants pc;
 

From cd6b9f0e93bfe76d4585b73b920048400588790f Mon Sep 17 00:00:00 2001
From: kevyuu <kevin.kayu@gmail.com>
Date: Fri, 15 Aug 2025 17:27:48 +0700
Subject: [PATCH 517/529] Add comment referring to
 https://github.com/microsoft/DirectXShaderCompiler/issues/7279 as the reason
 we don't use spirv intrinsics for ignoreIntersection and terminateRay

---
 71_RayTracingPipeline/app_resources/raytrace.rahit.hlsl        | 1 +
 71_RayTracingPipeline/app_resources/raytrace_shadow.rahit.hlsl | 3 +++
 2 files changed, 4 insertions(+)

diff --git a/71_RayTracingPipeline/app_resources/raytrace.rahit.hlsl b/71_RayTracingPipeline/app_resources/raytrace.rahit.hlsl
index 4bc226473..cdbdeaf3e 100644
--- a/71_RayTracingPipeline/app_resources/raytrace.rahit.hlsl
+++ b/71_RayTracingPipeline/app_resources/raytrace.rahit.hlsl
@@ -13,6 +13,7 @@ void main(inout PrimaryPayload payload, in BuiltInTriangleIntersectionAttributes
     const STriangleGeomInfo geom = vk::RawBufferLoad < STriangleGeomInfo > (pc.triangleGeomInfoBuffer + instID * sizeof(STriangleGeomInfo));
 
     const uint32_t bitpattern = payload.pcg();
+    // Cannot use spirv::ignoreIntersectionKHR and spirv::terminateRayKHR due to https://github.com/microsoft/DirectXShaderCompiler/issues/7279
     if (geom.material.alphaTest(bitpattern))
         IgnoreHit();
 }
diff --git a/71_RayTracingPipeline/app_resources/raytrace_shadow.rahit.hlsl b/71_RayTracingPipeline/app_resources/raytrace_shadow.rahit.hlsl
index 0741a138a..147f6af70 100644
--- a/71_RayTracingPipeline/app_resources/raytrace_shadow.rahit.hlsl
+++ b/71_RayTracingPipeline/app_resources/raytrace_shadow.rahit.hlsl
@@ -16,6 +16,9 @@ void main(inout OcclusionPayload payload, in BuiltInTriangleIntersectionAttribut
     const float attenuation = (1.f-material.alpha) * payload.attenuation;
     // DXC cogegens weird things in the presence of termination instructions
     payload.attenuation = attenuation;
+
+
+    // Cannot use spirv::ignoreIntersectionKHR and spirv::terminateRayKHR due to https://github.com/microsoft/DirectXShaderCompiler/issues/7279
     // arbitrary constant, whatever you want the smallest attenuation to be. Remember until miss, the attenuatio is negative
     if (attenuation > -1.f/1024.f)
         AcceptHitAndEndSearch();

From f472acc23a72e5455b3e1c0455bacc87e16bd3dc Mon Sep 17 00:00:00 2001
From: kevyuu <kevin.kayu@gmail.com>
Date: Mon, 18 Aug 2025 18:04:17 +0700
Subject: [PATCH 518/529] Use inline spirv to replace ReportHit

---
 71_RayTracingPipeline/app_resources/raytrace.rint.hlsl | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/71_RayTracingPipeline/app_resources/raytrace.rint.hlsl b/71_RayTracingPipeline/app_resources/raytrace.rint.hlsl
index e4646512d..940c597ad 100644
--- a/71_RayTracingPipeline/app_resources/raytrace.rint.hlsl
+++ b/71_RayTracingPipeline/app_resources/raytrace.rint.hlsl
@@ -40,12 +40,13 @@ void main()
 
     const float32_t tHit = hitSphere(sphere, ray);
     
+    [[vk::ext_storage_class(spv::StorageClassHitAttributeKHR)]]
     ProceduralHitAttribute hitAttrib;
 
     // Report hit point
     if (tHit > 0)
     {
         hitAttrib.center = sphere.center;
-        ReportHit(tHit, 0, hitAttrib);
+        spirv::reportIntersectionKHR(tHit, 0);
     }
 }
\ No newline at end of file

From 10263b0e913ff839491580f0977b94dc4e11202e Mon Sep 17 00:00:00 2001
From: YasInvolved <szrtxm.op@gmail.com>
Date: Tue, 19 Aug 2025 00:42:47 +0200
Subject: [PATCH 519/529] simplify geometry writing logic

---
 12_MeshLoaders/main.cpp | 60 ++++++++++-------------------------------
 1 file changed, 14 insertions(+), 46 deletions(-)

diff --git a/12_MeshLoaders/main.cpp b/12_MeshLoaders/main.cpp
index 187861441..aee9b0683 100644
--- a/12_MeshLoaders/main.cpp
+++ b/12_MeshLoaders/main.cpp
@@ -50,7 +50,7 @@ class MeshLoadersApp final : public MonoWindowApplication, public BuiltinResourc
 			}
 
 			if (parser["--savemesh"] == true)
-				m_saveGeomOnExit = true;
+				m_saveGeom = true;
 
 			if (parser.present("--savepath"))
 			{
@@ -61,8 +61,6 @@ class MeshLoadersApp final : public MonoWindowApplication, public BuiltinResourc
 
 				if (!std::filesystem::exists(tmp.parent_path()))
 					return logFail("Path specified in --savepath argument doesn't exist");
-
-				m_geomSavePath.emplace(std::move(tmp));
 			}
 
 			m_semaphore = m_device->createSemaphore(m_realFrameIx);
@@ -212,17 +210,6 @@ class MeshLoadersApp final : public MonoWindowApplication, public BuiltinResourc
 			return retval;
 		}
 
-		inline bool onAppTerminated() override
-		{
-			if (m_saveGeomOnExit && m_currentGeom)
-				writeGeometry();
-
-			if (!device_base_t::onAppTerminated())
-				return false;
-
-			return true;
-		}
-
 	protected:
 		const video::IGPURenderpass::SCreationParams::SSubpassDependency* getDefaultSubpassDependencies() const override
 		{
@@ -269,8 +256,6 @@ class MeshLoadersApp final : public MonoWindowApplication, public BuiltinResourc
 
 		bool reloadModel()
 		{
-			m_currentGeom = nullptr;
-
 			if (m_nonInteractiveTest) // TODO: maybe also take from argv and argc
 				m_modelPath = (sharedInputCWD/"ply/Spanner-ply.ply").string();
 			else
@@ -290,9 +275,6 @@ class MeshLoadersApp final : public MonoWindowApplication, public BuiltinResourc
 				m_modelPath = file.result()[0];
 			}
 
-			if (m_saveGeomOnExit && m_currentGeom)
-				writeGeometry();
-
 			// free up
 			m_renderer->m_instances.clear();
 			m_renderer->clearGeometries({.semaphore=m_semaphore.get(),.value=m_realFrameIx});
@@ -321,7 +303,12 @@ class MeshLoadersApp final : public MonoWindowApplication, public BuiltinResourc
 			if (geometries.empty())
 				return false;
 
-			m_currentGeom = geometries[0];
+			// TODO: do it async
+			if (m_saveGeom)
+				writeGeometry(
+					const_cast<ICPUPolygonGeometry*>(geometries[0].get()), 
+					m_specifiedGeomSavePath.value_or((m_saveGeomPrefixPath / path(m_modelPath).filename()).generic_string())
+				);
 
 			using aabb_t = hlsl::shapes::AABB<3,double>;
 			auto printAABB = [&](const aabb_t& aabb, const char* extraMsg="")->void
@@ -455,29 +442,11 @@ class MeshLoadersApp final : public MonoWindowApplication, public BuiltinResourc
 			return true;
 		}
 
-		void writeGeometry()
+		void writeGeometry(ICPUPolygonGeometry* geometry, const std::string& savePath)
 		{
-			if (!m_geomSavePath.has_value())
-				m_geomSavePath = pfd::save_file("Save Geometry", localOutputCWD.string(),
-					{ "All Supported Formats (.stl, .ply, .serialized)", "*.stl *.ply *.serialized" },
-					pfd::opt::force_overwrite
-				).result();
-
-			auto& dest = m_geomSavePath.value();
-
-			if (dest.empty())
-			{
-				m_logger->log("Invalid path has been selected. Geometry won't be saved.", ILogger::ELL_ERROR);
-				return;
-			}
-
-			m_logger->log("Saving mesh to %S", ILogger::ELL_INFO, dest.c_str());
-
-			// should I do a const cast here?
-			const IAsset* asset = m_currentGeom.get();
-			IAssetWriter::SAssetWriteParams params{ const_cast<IAsset*>(asset) };
-			m_assetMgr->writeAsset(dest.string(), params);
-			m_currentGeom = nullptr;
+			IAssetWriter::SAssetWriteParams params{ reinterpret_cast<IAsset*>(geometry) };
+			m_logger->log("Saving mesh to %S", ILogger::ELL_INFO, savePath.c_str());
+			m_assetMgr->writeAsset(savePath, params);
 		}
 
 		// Maximum frames which can be simultaneously submitted, used to cycle through our per-frame resources like command buffers
@@ -496,10 +465,9 @@ class MeshLoadersApp final : public MonoWindowApplication, public BuiltinResourc
 		// mutables
 		std::string m_modelPath;
 
-		smart_refctd_ptr<const ICPUPolygonGeometry> m_currentGeom;
-
-		bool m_saveGeomOnExit;
-		std::optional<nbl::system::path> m_geomSavePath;
+		bool m_saveGeom;
+		std::optional<const std::string> m_specifiedGeomSavePath;
+		const nbl::system::path m_saveGeomPrefixPath = localOutputCWD / "saved";
 };
 
 NBL_MAIN_FUNC(MeshLoadersApp)
\ No newline at end of file

From 0e2115ddf0de644e59afcb3ac3f7300bcbb540be Mon Sep 17 00:00:00 2001
From: YasInvolved <szrtxm.op@gmail.com>
Date: Tue, 19 Aug 2025 21:11:07 +0200
Subject: [PATCH 520/529] initialize m_saveGeomPrefixPath when everything is
 ready

---
 12_MeshLoaders/main.cpp | 7 +++++--
 1 file changed, 5 insertions(+), 2 deletions(-)

diff --git a/12_MeshLoaders/main.cpp b/12_MeshLoaders/main.cpp
index aee9b0683..37c48964e 100644
--- a/12_MeshLoaders/main.cpp
+++ b/12_MeshLoaders/main.cpp
@@ -18,7 +18,8 @@ class MeshLoadersApp final : public MonoWindowApplication, public BuiltinResourc
 	public:
 		inline MeshLoadersApp(const path& _localInputCWD, const path& _localOutputCWD, const path& _sharedInputCWD, const path& _sharedOutputCWD)
 			: IApplicationFramework(_localInputCWD, _localOutputCWD, _sharedInputCWD, _sharedOutputCWD),
-			device_base_t({1280,720}, EF_D32_SFLOAT, _localInputCWD, _localOutputCWD, _sharedInputCWD, _sharedOutputCWD) {}
+			device_base_t({ 1280,720 }, EF_D32_SFLOAT, _localInputCWD, _localOutputCWD, _sharedInputCWD, _sharedOutputCWD)
+		{}
 
 		inline bool onAppInitialized(smart_refctd_ptr<ISystem>&& system) override
 		{
@@ -30,6 +31,8 @@ class MeshLoadersApp final : public MonoWindowApplication, public BuiltinResourc
 			if (!device_base_t::onAppInitialized(smart_refctd_ptr(system)))
 				return false;
 
+			m_saveGeomPrefixPath = localOutputCWD / "saved";
+
 			// parse args
 			argparse::ArgumentParser parser("12_meshloaders");
 			parser.add_argument("--savemesh")
@@ -467,7 +470,7 @@ class MeshLoadersApp final : public MonoWindowApplication, public BuiltinResourc
 
 		bool m_saveGeom;
 		std::optional<const std::string> m_specifiedGeomSavePath;
-		const nbl::system::path m_saveGeomPrefixPath = localOutputCWD / "saved";
+		nbl::system::path m_saveGeomPrefixPath;
 };
 
 NBL_MAIN_FUNC(MeshLoadersApp)
\ No newline at end of file

From 6537b72300b4044caab4c38a8b15f99b30658896 Mon Sep 17 00:00:00 2001
From: YasInvolved <szrtxm.op@gmail.com>
Date: Tue, 19 Aug 2025 22:01:15 +0200
Subject: [PATCH 521/529] fix a bug where m_specifiedGeomSavePath is never set

---
 12_MeshLoaders/main.cpp | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/12_MeshLoaders/main.cpp b/12_MeshLoaders/main.cpp
index 37c48964e..5e730b5a1 100644
--- a/12_MeshLoaders/main.cpp
+++ b/12_MeshLoaders/main.cpp
@@ -64,6 +64,8 @@ class MeshLoadersApp final : public MonoWindowApplication, public BuiltinResourc
 
 				if (!std::filesystem::exists(tmp.parent_path()))
 					return logFail("Path specified in --savepath argument doesn't exist");
+
+				m_specifiedGeomSavePath.emplace(std::move(tmp.generic_string()));
 			}
 
 			m_semaphore = m_device->createSemaphore(m_realFrameIx);

From c21e4ccc736ff67c8f8435316cf2057516fad0f5 Mon Sep 17 00:00:00 2001
From: YasInvolved <szrtxm.op@gmail.com>
Date: Tue, 19 Aug 2025 22:02:10 +0200
Subject: [PATCH 522/529] change cmdline argument naming to match the naming
 scheme

---
 12_MeshLoaders/main.cpp | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/12_MeshLoaders/main.cpp b/12_MeshLoaders/main.cpp
index 5e730b5a1..418255fb8 100644
--- a/12_MeshLoaders/main.cpp
+++ b/12_MeshLoaders/main.cpp
@@ -35,7 +35,7 @@ class MeshLoadersApp final : public MonoWindowApplication, public BuiltinResourc
 
 			// parse args
 			argparse::ArgumentParser parser("12_meshloaders");
-			parser.add_argument("--savemesh")
+			parser.add_argument("--savegeometry")
 				.help("Save the mesh on exit or reload")
 				.flag();
 
@@ -52,7 +52,7 @@ class MeshLoadersApp final : public MonoWindowApplication, public BuiltinResourc
 				return logFail(e.what());
 			}
 
-			if (parser["--savemesh"] == true)
+			if (parser["--savegeometry"] == true)
 				m_saveGeom = true;
 
 			if (parser.present("--savepath"))

From 872d71dd8c456d5cfc91bc8b529eb83247df6601 Mon Sep 17 00:00:00 2001
From: YasInvolved <szrtxm.op@gmail.com>
Date: Tue, 19 Aug 2025 22:15:02 +0200
Subject: [PATCH 523/529] made geometry writer work async

---
 12_MeshLoaders/main.cpp | 36 +++++++++++++++++++++++++++---------
 1 file changed, 27 insertions(+), 9 deletions(-)

diff --git a/12_MeshLoaders/main.cpp b/12_MeshLoaders/main.cpp
index 418255fb8..900760bfc 100644
--- a/12_MeshLoaders/main.cpp
+++ b/12_MeshLoaders/main.cpp
@@ -58,7 +58,7 @@ class MeshLoadersApp final : public MonoWindowApplication, public BuiltinResourc
 			if (parser.present("--savepath"))
 			{
 				auto tmp = path(parser.get<std::string>("--savepath"));
-				
+
 				if (tmp.empty() || !tmp.has_filename())
 					return logFail("Invalid path has been specified in --savepath argument");
 
@@ -215,6 +215,17 @@ class MeshLoadersApp final : public MonoWindowApplication, public BuiltinResourc
 			return retval;
 		}
 
+		inline bool onAppTerminated() override
+		{
+			if (m_saveGeomTaskFuture.valid())
+			{
+				m_logger->log("Waiting for geometry writer to finish writing...", ILogger::ELL_INFO);
+				m_saveGeomTaskFuture.wait();
+			}
+
+			return device_base_t::onAppTerminated();
+		}
+
 	protected:
 		const video::IGPURenderpass::SCreationParams::SSubpassDependency* getDefaultSubpassDependencies() const override
 		{
@@ -310,9 +321,12 @@ class MeshLoadersApp final : public MonoWindowApplication, public BuiltinResourc
 
 			// TODO: do it async
 			if (m_saveGeom)
-				writeGeometry(
-					const_cast<ICPUPolygonGeometry*>(geometries[0].get()), 
-					m_specifiedGeomSavePath.value_or((m_saveGeomPrefixPath / path(m_modelPath).filename()).generic_string())
+				m_saveGeomTaskFuture = std::async(
+					std::launch::async,
+					[this, geometries] { writeGeometry(
+						geometries[0],
+						m_specifiedGeomSavePath.value_or((m_saveGeomPrefixPath / path(m_modelPath).filename()).generic_string())
+					); }
 				);
 
 			using aabb_t = hlsl::shapes::AABB<3,double>;
@@ -447,11 +461,14 @@ class MeshLoadersApp final : public MonoWindowApplication, public BuiltinResourc
 			return true;
 		}
 
-		void writeGeometry(ICPUPolygonGeometry* geometry, const std::string& savePath)
+		void writeGeometry(smart_refctd_ptr<const ICPUPolygonGeometry> geometry, const std::string& savePath)
 		{
-			IAssetWriter::SAssetWriteParams params{ reinterpret_cast<IAsset*>(geometry) };
-			m_logger->log("Saving mesh to %S", ILogger::ELL_INFO, savePath.c_str());
-			m_assetMgr->writeAsset(savePath, params);
+			IAsset* assetPtr = const_cast<IAsset*>(static_cast<const IAsset*>(geometry.get()));
+			IAssetWriter::SAssetWriteParams params{ assetPtr };
+			m_logger->log("Saving mesh to %s", ILogger::ELL_INFO, savePath.c_str());
+			if (!m_assetMgr->writeAsset(savePath, params))
+				m_logger->log("Failed to save %s", ILogger::ELL_ERROR, savePath.c_str());
+			m_logger->log("Mesh successfully saved!", ILogger::ELL_INFO);
 		}
 
 		// Maximum frames which can be simultaneously submitted, used to cycle through our per-frame resources like command buffers
@@ -470,7 +487,8 @@ class MeshLoadersApp final : public MonoWindowApplication, public BuiltinResourc
 		// mutables
 		std::string m_modelPath;
 
-		bool m_saveGeom;
+		bool m_saveGeom = false;
+		std::future<void> m_saveGeomTaskFuture;
 		std::optional<const std::string> m_specifiedGeomSavePath;
 		nbl::system::path m_saveGeomPrefixPath;
 };

From a19bf5fd3fa258977764e05d9b36e1753533b70f Mon Sep 17 00:00:00 2001
From: kevyuu <kevin.kayu@gmail.com>
Date: Wed, 20 Aug 2025 14:40:06 +0700
Subject: [PATCH 524/529] Turn InstanceID() and PrimitiveIndex() to inline
 spirv

---
 71_RayTracingPipeline/app_resources/raytrace.rahit.hlsl        | 2 +-
 71_RayTracingPipeline/app_resources/raytrace.rchit.hlsl        | 3 ++-
 71_RayTracingPipeline/app_resources/raytrace.rint.hlsl         | 3 ++-
 71_RayTracingPipeline/app_resources/raytrace_shadow.rahit.hlsl | 2 +-
 4 files changed, 6 insertions(+), 4 deletions(-)

diff --git a/71_RayTracingPipeline/app_resources/raytrace.rahit.hlsl b/71_RayTracingPipeline/app_resources/raytrace.rahit.hlsl
index cdbdeaf3e..956ad5fe6 100644
--- a/71_RayTracingPipeline/app_resources/raytrace.rahit.hlsl
+++ b/71_RayTracingPipeline/app_resources/raytrace.rahit.hlsl
@@ -9,7 +9,7 @@ using namespace nbl::hlsl;
 [shader("anyhit")]
 void main(inout PrimaryPayload payload, in BuiltInTriangleIntersectionAttributes attribs)
 {
-    const int instID = InstanceID();
+    const int instID = spirv::InstanceCustomIndexKHR;
     const STriangleGeomInfo geom = vk::RawBufferLoad < STriangleGeomInfo > (pc.triangleGeomInfoBuffer + instID * sizeof(STriangleGeomInfo));
 
     const uint32_t bitpattern = payload.pcg();
diff --git a/71_RayTracingPipeline/app_resources/raytrace.rchit.hlsl b/71_RayTracingPipeline/app_resources/raytrace.rchit.hlsl
index 3e0b4b102..0a8bc5ec8 100644
--- a/71_RayTracingPipeline/app_resources/raytrace.rchit.hlsl
+++ b/71_RayTracingPipeline/app_resources/raytrace.rchit.hlsl
@@ -1,5 +1,6 @@
 #include "common.hlsl"
 
+#include "nbl/builtin/hlsl/spirv_intrinsics/core.hlsl"
 #include "nbl/builtin/hlsl/spirv_intrinsics/raytracing.hlsl"
 #include "nbl/builtin/hlsl/bda/__ptr.hlsl"
 
@@ -77,7 +78,7 @@ float3 calculateNormals(int primID, STriangleGeomInfo geom, float2 bary)
 [shader("closesthit")]
 void main(inout PrimaryPayload payload, in BuiltInTriangleIntersectionAttributes attribs)
 {
-    const int primID = PrimitiveIndex();
+    const int primID = spirv::PrimitiveId;
     const int instanceCustomIndex = spirv::InstanceCustomIndexKHR;
     const int geometryIndex = spirv::RayGeometryIndexKHR;
     const STriangleGeomInfo geom = vk::RawBufferLoad < STriangleGeomInfo > (pc.triangleGeomInfoBuffer + (instanceCustomIndex + geometryIndex) * sizeof(STriangleGeomInfo));
diff --git a/71_RayTracingPipeline/app_resources/raytrace.rint.hlsl b/71_RayTracingPipeline/app_resources/raytrace.rint.hlsl
index 940c597ad..72f9beffd 100644
--- a/71_RayTracingPipeline/app_resources/raytrace.rint.hlsl
+++ b/71_RayTracingPipeline/app_resources/raytrace.rint.hlsl
@@ -1,5 +1,6 @@
 #include "common.hlsl"
 
+#include "nbl/builtin/hlsl/spirv_intrinsics/core.hlsl"
 #include "nbl/builtin/hlsl/spirv_intrinsics/raytracing.hlsl"
 
 using namespace nbl::hlsl;
@@ -33,7 +34,7 @@ void main()
     ray.origin = spirv::WorldRayOriginKHR;
     ray.direction = spirv::WorldRayDirectionKHR;
 
-    const int primID = PrimitiveIndex();
+    const int primID = spirv::PrimitiveId;
 
     // Sphere data
     SProceduralGeomInfo sphere = vk::RawBufferLoad<SProceduralGeomInfo>(pc.proceduralGeomInfoBuffer + primID * sizeof(SProceduralGeomInfo));
diff --git a/71_RayTracingPipeline/app_resources/raytrace_shadow.rahit.hlsl b/71_RayTracingPipeline/app_resources/raytrace_shadow.rahit.hlsl
index 147f6af70..e41551512 100644
--- a/71_RayTracingPipeline/app_resources/raytrace_shadow.rahit.hlsl
+++ b/71_RayTracingPipeline/app_resources/raytrace_shadow.rahit.hlsl
@@ -9,7 +9,7 @@ using namespace nbl::hlsl;
 [shader("anyhit")]
 void main(inout OcclusionPayload payload, in BuiltInTriangleIntersectionAttributes attribs)
 {
-    const int instID = InstanceID();
+    const int instID = spirv::InstanceCustomIndexKHR;
     const STriangleGeomInfo geom = vk::RawBufferLoad < STriangleGeomInfo > (pc.triangleGeomInfoBuffer + instID * sizeof(STriangleGeomInfo));
     const Material material = nbl::hlsl::_static_cast<Material>(geom.material);
     

From e1972c7eb39fb0a0b970e03cc98126204ac97313 Mon Sep 17 00:00:00 2001
From: kevyuu <kevin.kayu@gmail.com>
Date: Wed, 20 Aug 2025 17:39:41 +0700
Subject: [PATCH 525/529] Convert more PrimitiveIndex() into inline spirv

---
 .../app_resources/raytrace_procedural.rchit.hlsl               | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/71_RayTracingPipeline/app_resources/raytrace_procedural.rchit.hlsl b/71_RayTracingPipeline/app_resources/raytrace_procedural.rchit.hlsl
index 29f26c902..6c2dc9903 100644
--- a/71_RayTracingPipeline/app_resources/raytrace_procedural.rchit.hlsl
+++ b/71_RayTracingPipeline/app_resources/raytrace_procedural.rchit.hlsl
@@ -1,5 +1,6 @@
 #include "common.hlsl"
 
+#include "nbl/builtin/hlsl/spirv_intrinsics/core.hlsl"
 #include "nbl/builtin/hlsl/spirv_intrinsics/raytracing.hlsl"
 using namespace nbl::hlsl;
 
@@ -11,7 +12,7 @@ void main(inout PrimaryPayload payload, in ProceduralHitAttribute attrib)
     const float32_t3 worldPosition = spirv::WorldRayOriginKHR + spirv::WorldRayDirectionKHR * spirv::RayTmaxKHR;
     const float32_t3 worldNormal = normalize(worldPosition - attrib.center);
 
-    payload.materialId = MaterialId::createProcedural(PrimitiveIndex()); // we use negative value to indicate that this is procedural
+    payload.materialId = MaterialId::createProcedural(spirv::PrimitiveId); // we use negative value to indicate that this is procedural
 
     payload.worldNormal = worldNormal;
     payload.rayDistance = spirv::RayTmaxKHR;

From 7ed614c0219b154a7144d261dedae5e5d6da0da5 Mon Sep 17 00:00:00 2001
From: YasInvolved <szrtxm.op@gmail.com>
Date: Thu, 21 Aug 2025 20:31:21 +0200
Subject: [PATCH 526/529] fix data race hazard, await running async before
 running a new one

---
 12_MeshLoaders/main.cpp | 14 +++++++++++---
 1 file changed, 11 insertions(+), 3 deletions(-)

diff --git a/12_MeshLoaders/main.cpp b/12_MeshLoaders/main.cpp
index 900760bfc..31ab795df 100644
--- a/12_MeshLoaders/main.cpp
+++ b/12_MeshLoaders/main.cpp
@@ -319,15 +319,23 @@ class MeshLoadersApp final : public MonoWindowApplication, public BuiltinResourc
 			if (geometries.empty())
 				return false;
 
-			// TODO: do it async
 			if (m_saveGeom)
+			{
+				if (m_saveGeomTaskFuture.valid())
+				{
+					m_logger->log("Waiting for previous geometry saving task to complete...", ILogger::ELL_INFO);
+					m_saveGeomTaskFuture.wait();
+				}
+
+				std::string currentGeomSavePath = m_specifiedGeomSavePath.value_or((m_saveGeomPrefixPath / path(m_modelPath).filename()).generic_string());
 				m_saveGeomTaskFuture = std::async(
 					std::launch::async,
-					[this, geometries] { writeGeometry(
+					[this, geometries, currentGeomSavePath] { writeGeometry(
 						geometries[0],
-						m_specifiedGeomSavePath.value_or((m_saveGeomPrefixPath / path(m_modelPath).filename()).generic_string())
+						currentGeomSavePath
 					); }
 				);
+			}
 
 			using aabb_t = hlsl::shapes::AABB<3,double>;
 			auto printAABB = [&](const aabb_t& aabb, const char* extraMsg="")->void

From 1a63f5ad1ef90d8b73dd77570f957b433251f2df Mon Sep 17 00:00:00 2001
From: YasInvolved <szrtxm.op@gmail.com>
Date: Thu, 21 Aug 2025 21:36:47 +0200
Subject: [PATCH 527/529] multiple tasks support

---
 12_MeshLoaders/main.cpp | 96 +++++++++++++++++++++++++++++++++--------
 1 file changed, 79 insertions(+), 17 deletions(-)

diff --git a/12_MeshLoaders/main.cpp b/12_MeshLoaders/main.cpp
index 31ab795df..af592a473 100644
--- a/12_MeshLoaders/main.cpp
+++ b/12_MeshLoaders/main.cpp
@@ -10,6 +10,70 @@
 #include "nbl/ext/MitsubaLoader/CSerializedLoader.h"
 #endif
 
+class ThreadPool
+{
+public:
+	ThreadPool(size_t numThreads = 4)
+	{
+		for (size_t i = 0; i < numThreads; i++)
+			m_workers.emplace_back(
+				[this]
+				{
+					while (true)
+					{
+						std::function<void()> task;
+						{
+							std::unique_lock lock(m_queueMutex);
+							m_taskReady.wait(lock, [this] { return m_stop.load() || !m_tasks.empty(); });
+							if (m_stop.load() && m_tasks.empty())
+								return;
+
+							task = std::move(m_tasks.front());
+							m_tasks.pop();
+						}
+
+						task();
+					}
+				}
+			);
+	}
+
+	template <typename Function, typename... Args>
+	auto enqueue(Function&& func, Args&& ...args) -> std::future<std::invoke_result_t<Function, Args...>>
+	{
+		using ret_t = std::invoke_result_t<Function, Args...>;
+
+		auto task = std::bind(std::forward<Function>(func), std::forward<Args>(args)...);
+		auto taskPtr = std::make_shared<std::packaged_task<ret_t()>>(std::move(task));
+
+		std::future<ret_t> future = taskPtr->get_future();
+
+		{
+			std::unique_lock lock(m_queueMutex);
+			m_tasks.emplace([taskPtr]() { (*taskPtr)(); });
+		}
+
+		m_taskReady.notify_one();
+		return future;
+	}
+
+	~ThreadPool()
+	{
+		m_stop.store(true);
+		m_taskReady.notify_all();
+		for (auto& worker : m_workers)
+			worker.join();
+	}
+
+private:
+	std::vector<std::thread> m_workers;
+	std::queue<std::function<void()>> m_tasks;
+	
+	std::mutex m_queueMutex;
+	std::condition_variable m_taskReady;
+	std::atomic<bool> m_stop;
+};
+
 class MeshLoadersApp final : public MonoWindowApplication, public BuiltinResourcesApplication
 {
 		using device_base_t = MonoWindowApplication;
@@ -217,10 +281,17 @@ class MeshLoadersApp final : public MonoWindowApplication, public BuiltinResourc
 
 		inline bool onAppTerminated() override
 		{
-			if (m_saveGeomTaskFuture.valid())
+			m_logger->log("Waiting for all geometry saving tasks (%u) to complete...", ILogger::ELL_INFO, m_saveGeomTaskFutures.size());
+
+			for (size_t i = 0; i < m_saveGeomTaskFutures.size(); i++)
 			{
-				m_logger->log("Waiting for geometry writer to finish writing...", ILogger::ELL_INFO);
-				m_saveGeomTaskFuture.wait();
+				const auto& task = m_saveGeomTaskFutures[i];
+
+				if (!task.valid())
+					continue;
+
+				task.wait();
+				m_logger->log("Task %u of %u completed!", ILogger::ELL_INFO, i+1, m_saveGeomTaskFutures.size());
 			}
 
 			return device_base_t::onAppTerminated();
@@ -321,19 +392,9 @@ class MeshLoadersApp final : public MonoWindowApplication, public BuiltinResourc
 
 			if (m_saveGeom)
 			{
-				if (m_saveGeomTaskFuture.valid())
-				{
-					m_logger->log("Waiting for previous geometry saving task to complete...", ILogger::ELL_INFO);
-					m_saveGeomTaskFuture.wait();
-				}
-
-				std::string currentGeomSavePath = m_specifiedGeomSavePath.value_or((m_saveGeomPrefixPath / path(m_modelPath).filename()).generic_string());
-				m_saveGeomTaskFuture = std::async(
-					std::launch::async,
-					[this, geometries, currentGeomSavePath] { writeGeometry(
-						geometries[0],
-						currentGeomSavePath
-					); }
+				std::string savePath = m_specifiedGeomSavePath.value_or((m_saveGeomPrefixPath / path(m_modelPath).filename()).generic_string());
+				m_saveGeomTaskFutures.emplace_back(
+					m_threadPool->enqueue([this, geometries, savePath] { writeGeometry(geometries[0], savePath); })
 				);
 			}
 
@@ -496,7 +557,8 @@ class MeshLoadersApp final : public MonoWindowApplication, public BuiltinResourc
 		std::string m_modelPath;
 
 		bool m_saveGeom = false;
-		std::future<void> m_saveGeomTaskFuture;
+		std::unique_ptr<ThreadPool> m_threadPool = std::make_unique<ThreadPool>(3);
+		std::vector<std::future<void>> m_saveGeomTaskFutures;
 		std::optional<const std::string> m_specifiedGeomSavePath;
 		nbl::system::path m_saveGeomPrefixPath;
 };

From 99945dcce68fe7cf6ac9fd6e3b7966e49451596e Mon Sep 17 00:00:00 2001
From: YasInvolved <szrtxm.op@gmail.com>
Date: Thu, 21 Aug 2025 22:49:56 +0200
Subject: [PATCH 528/529] revert: fix data race hazard, await running async
 before running a new one

---
 12_MeshLoaders/main.cpp | 873 +++++++++++++++++++---------------------
 1 file changed, 406 insertions(+), 467 deletions(-)

diff --git a/12_MeshLoaders/main.cpp b/12_MeshLoaders/main.cpp
index af592a473..d80fa8998 100644
--- a/12_MeshLoaders/main.cpp
+++ b/12_MeshLoaders/main.cpp
@@ -10,557 +10,496 @@
 #include "nbl/ext/MitsubaLoader/CSerializedLoader.h"
 #endif
 
-class ThreadPool
+class MeshLoadersApp final : public MonoWindowApplication, public BuiltinResourcesApplication
 {
+	using device_base_t = MonoWindowApplication;
+	using asset_base_t = BuiltinResourcesApplication;
+
 public:
-	ThreadPool(size_t numThreads = 4)
+	inline MeshLoadersApp(const path& _localInputCWD, const path& _localOutputCWD, const path& _sharedInputCWD, const path& _sharedOutputCWD)
+		: IApplicationFramework(_localInputCWD, _localOutputCWD, _sharedInputCWD, _sharedOutputCWD),
+		device_base_t({ 1280,720 }, EF_D32_SFLOAT, _localInputCWD, _localOutputCWD, _sharedInputCWD, _sharedOutputCWD)
 	{
-		for (size_t i = 0; i < numThreads; i++)
-			m_workers.emplace_back(
-				[this]
-				{
-					while (true)
-					{
-						std::function<void()> task;
-						{
-							std::unique_lock lock(m_queueMutex);
-							m_taskReady.wait(lock, [this] { return m_stop.load() || !m_tasks.empty(); });
-							if (m_stop.load() && m_tasks.empty())
-								return;
-
-							task = std::move(m_tasks.front());
-							m_tasks.pop();
-						}
-
-						task();
-					}
-				}
-			);
 	}
 
-	template <typename Function, typename... Args>
-	auto enqueue(Function&& func, Args&& ...args) -> std::future<std::invoke_result_t<Function, Args...>>
+	inline bool onAppInitialized(smart_refctd_ptr<ISystem>&& system) override
 	{
-		using ret_t = std::invoke_result_t<Function, Args...>;
+		if (!asset_base_t::onAppInitialized(smart_refctd_ptr(system)))
+			return false;
+#ifdef NBL_BUILD_MITSUBA_LOADER
+		m_assetMgr->addAssetLoader(make_smart_refctd_ptr<ext::MitsubaLoader::CSerializedLoader>());
+#endif
+		if (!device_base_t::onAppInitialized(smart_refctd_ptr(system)))
+			return false;
 
-		auto task = std::bind(std::forward<Function>(func), std::forward<Args>(args)...);
-		auto taskPtr = std::make_shared<std::packaged_task<ret_t()>>(std::move(task));
+		m_saveGeomPrefixPath = localOutputCWD / "saved";
 
-		std::future<ret_t> future = taskPtr->get_future();
+		// parse args
+		argparse::ArgumentParser parser("12_meshloaders");
+		parser.add_argument("--savegeometry")
+			.help("Save the mesh on exit or reload")
+			.flag();
 
+		parser.add_argument("--savepath")
+			.nargs(1)
+			.help("Specify the file to which the mesh will be saved");
+
+		try
 		{
-			std::unique_lock lock(m_queueMutex);
-			m_tasks.emplace([taskPtr]() { (*taskPtr)(); });
+			parser.parse_args({ argv.data(), argv.data() + argv.size() });
 		}
-
-		m_taskReady.notify_one();
-		return future;
-	}
-
-	~ThreadPool()
-	{
-		m_stop.store(true);
-		m_taskReady.notify_all();
-		for (auto& worker : m_workers)
-			worker.join();
-	}
-
-private:
-	std::vector<std::thread> m_workers;
-	std::queue<std::function<void()>> m_tasks;
-	
-	std::mutex m_queueMutex;
-	std::condition_variable m_taskReady;
-	std::atomic<bool> m_stop;
-};
-
-class MeshLoadersApp final : public MonoWindowApplication, public BuiltinResourcesApplication
-{
-		using device_base_t = MonoWindowApplication;
-		using asset_base_t = BuiltinResourcesApplication;
-
-	public:
-		inline MeshLoadersApp(const path& _localInputCWD, const path& _localOutputCWD, const path& _sharedInputCWD, const path& _sharedOutputCWD)
-			: IApplicationFramework(_localInputCWD, _localOutputCWD, _sharedInputCWD, _sharedOutputCWD),
-			device_base_t({ 1280,720 }, EF_D32_SFLOAT, _localInputCWD, _localOutputCWD, _sharedInputCWD, _sharedOutputCWD)
-		{}
-
-		inline bool onAppInitialized(smart_refctd_ptr<ISystem>&& system) override
+		catch (const std::exception& e)
 		{
-			if (!asset_base_t::onAppInitialized(smart_refctd_ptr(system)))
-				return false;
-		#ifdef NBL_BUILD_MITSUBA_LOADER
-			m_assetMgr->addAssetLoader(make_smart_refctd_ptr<ext::MitsubaLoader::CSerializedLoader>());
-		#endif
-			if (!device_base_t::onAppInitialized(smart_refctd_ptr(system)))
-				return false;
+			return logFail(e.what());
+		}
 
-			m_saveGeomPrefixPath = localOutputCWD / "saved";
+		if (parser["--savegeometry"] == true)
+			m_saveGeom = true;
 
-			// parse args
-			argparse::ArgumentParser parser("12_meshloaders");
-			parser.add_argument("--savegeometry")
-				.help("Save the mesh on exit or reload")
-				.flag();
+		if (parser.present("--savepath"))
+		{
+			auto tmp = path(parser.get<std::string>("--savepath"));
 
-			parser.add_argument("--savepath")
-				.nargs(1)
-				.help("Specify the file to which the mesh will be saved");
+			if (tmp.empty() || !tmp.has_filename())
+				return logFail("Invalid path has been specified in --savepath argument");
 
-			try
-			{
-				parser.parse_args({ argv.data(), argv.data() + argv.size() });
-			}
-			catch (const std::exception& e)
-			{
-				return logFail(e.what());
-			}
+			if (!std::filesystem::exists(tmp.parent_path()))
+				return logFail("Path specified in --savepath argument doesn't exist");
 
-			if (parser["--savegeometry"] == true)
-				m_saveGeom = true;
+			m_specifiedGeomSavePath.emplace(std::move(tmp.generic_string()));
+		}
 
-			if (parser.present("--savepath"))
-			{
-				auto tmp = path(parser.get<std::string>("--savepath"));
+		m_semaphore = m_device->createSemaphore(m_realFrameIx);
+		if (!m_semaphore)
+			return logFail("Failed to Create a Semaphore!");
 
-				if (tmp.empty() || !tmp.has_filename())
-					return logFail("Invalid path has been specified in --savepath argument");
+		auto pool = m_device->createCommandPool(getGraphicsQueue()->getFamilyIndex(), IGPUCommandPool::CREATE_FLAGS::RESET_COMMAND_BUFFER_BIT);
+		for (auto i = 0u; i < MaxFramesInFlight; i++)
+		{
+			if (!pool)
+				return logFail("Couldn't create Command Pool!");
+			if (!pool->createCommandBuffers(IGPUCommandPool::BUFFER_LEVEL::PRIMARY, { m_cmdBufs.data() + i,1 }))
+				return logFail("Couldn't create Command Buffer!");
+		}
 
-				if (!std::filesystem::exists(tmp.parent_path()))
-					return logFail("Path specified in --savepath argument doesn't exist");
 
-				m_specifiedGeomSavePath.emplace(std::move(tmp.generic_string()));
-			}
+		auto scRes = static_cast<CDefaultSwapchainFramebuffers*>(m_surface->getSwapchainResources());
+		m_renderer = CSimpleDebugRenderer::create(m_assetMgr.get(), scRes->getRenderpass(), 0, {});
+		if (!m_renderer)
+			return logFail("Failed to create renderer!");
 
-			m_semaphore = m_device->createSemaphore(m_realFrameIx);
-			if (!m_semaphore)
-				return logFail("Failed to Create a Semaphore!");
-
-			auto pool = m_device->createCommandPool(getGraphicsQueue()->getFamilyIndex(),IGPUCommandPool::CREATE_FLAGS::RESET_COMMAND_BUFFER_BIT);
-			for (auto i=0u; i<MaxFramesInFlight; i++)
-			{
-				if (!pool)
-					return logFail("Couldn't create Command Pool!");
-				if (!pool->createCommandBuffers(IGPUCommandPool::BUFFER_LEVEL::PRIMARY,{m_cmdBufs.data()+i,1}))
-					return logFail("Couldn't create Command Buffer!");
-			}
-			
+		//
+		if (!reloadModel())
+			return false;
 
-			auto scRes = static_cast<CDefaultSwapchainFramebuffers*>(m_surface->getSwapchainResources());
-			m_renderer = CSimpleDebugRenderer::create(m_assetMgr.get(),scRes->getRenderpass(),0,{});
-			if (!m_renderer)
-				return logFail("Failed to create renderer!");
+		camera.mapKeysToArrows();
 
-			//
-			if (!reloadModel())
-				return false;
+		onAppInitializedFinish();
+		return true;
+	}
 
-			camera.mapKeysToArrows();
+	inline IQueue::SSubmitInfo::SSemaphoreInfo renderFrame(const std::chrono::microseconds nextPresentationTimestamp) override
+	{
+		m_inputSystem->getDefaultMouse(&mouse);
+		m_inputSystem->getDefaultKeyboard(&keyboard);
 
-			onAppInitializedFinish();
-			return true;
-		}
+		//
+		const auto resourceIx = m_realFrameIx % MaxFramesInFlight;
 
-		inline IQueue::SSubmitInfo::SSemaphoreInfo renderFrame(const std::chrono::microseconds nextPresentationTimestamp) override
+		auto* const cb = m_cmdBufs.data()[resourceIx].get();
+		cb->reset(IGPUCommandBuffer::RESET_FLAGS::RELEASE_RESOURCES_BIT);
+		cb->begin(IGPUCommandBuffer::USAGE::ONE_TIME_SUBMIT_BIT);
+		// clear to black for both things
 		{
-			m_inputSystem->getDefaultMouse(&mouse);
-			m_inputSystem->getDefaultKeyboard(&keyboard);
-
-			//
-			const auto resourceIx = m_realFrameIx % MaxFramesInFlight;
-
-			auto* const cb = m_cmdBufs.data()[resourceIx].get();
-			cb->reset(IGPUCommandBuffer::RESET_FLAGS::RELEASE_RESOURCES_BIT);
-			cb->begin(IGPUCommandBuffer::USAGE::ONE_TIME_SUBMIT_BIT);
-			// clear to black for both things
+			// begin renderpass
 			{
-				// begin renderpass
-				{
-					auto scRes = static_cast<CDefaultSwapchainFramebuffers*>(m_surface->getSwapchainResources());
-					auto* framebuffer = scRes->getFramebuffer(device_base_t::getCurrentAcquire().imageIndex);
-					const IGPUCommandBuffer::SClearColorValue clearValue = { .float32 = {1.f,0.f,1.f,1.f} };
-					const IGPUCommandBuffer::SClearDepthStencilValue depthValue = { .depth = 0.f };
-					const VkRect2D currentRenderArea =
-					{
-						.offset = {0,0},
-						.extent = {framebuffer->getCreationParameters().width,framebuffer->getCreationParameters().height}
-					};
-					const IGPUCommandBuffer::SRenderpassBeginInfo info =
-					{
-						.framebuffer = framebuffer,
-						.colorClearValues = &clearValue,
-						.depthStencilClearValues = &depthValue,
-						.renderArea = currentRenderArea
-					};
-					cb->beginRenderPass(info,IGPUCommandBuffer::SUBPASS_CONTENTS::INLINE);
-
-					const SViewport viewport = {
-						.x = static_cast<float>(currentRenderArea.offset.x),
-						.y = static_cast<float>(currentRenderArea.offset.y),
-						.width = static_cast<float>(currentRenderArea.extent.width),
-						.height = static_cast<float>(currentRenderArea.extent.height)
-					};
-					cb->setViewport(0u,1u,&viewport);
-		
-					cb->setScissor(0u,1u,&currentRenderArea);
-				}
-				// late latch input
+				auto scRes = static_cast<CDefaultSwapchainFramebuffers*>(m_surface->getSwapchainResources());
+				auto* framebuffer = scRes->getFramebuffer(device_base_t::getCurrentAcquire().imageIndex);
+				const IGPUCommandBuffer::SClearColorValue clearValue = { .float32 = {1.f,0.f,1.f,1.f} };
+				const IGPUCommandBuffer::SClearDepthStencilValue depthValue = { .depth = 0.f };
+				const VkRect2D currentRenderArea =
 				{
-					bool reload = false;
-					camera.beginInputProcessing(nextPresentationTimestamp);
-					mouse.consumeEvents([&](const IMouseEventChannel::range_t& events) -> void { camera.mouseProcess(events); }, m_logger.get());
-					keyboard.consumeEvents([&](const IKeyboardEventChannel::range_t& events) -> void
-						{
-							for (const auto& event : events)
-							if (event.keyCode==E_KEY_CODE::EKC_R && event.action==SKeyboardEvent::ECA_RELEASED)
-								reload = true;
-							camera.keyboardProcess(events);
-						},
-						m_logger.get()
-					);
-					camera.endInputProcessing(nextPresentationTimestamp);
-					if (reload)
-						reloadModel();
-				}
-				// draw scene
+					.offset = {0,0},
+					.extent = {framebuffer->getCreationParameters().width,framebuffer->getCreationParameters().height}
+				};
+				const IGPUCommandBuffer::SRenderpassBeginInfo info =
 				{
-					float32_t3x4 viewMatrix;
-					float32_t4x4 viewProjMatrix;
-					// TODO: get rid of legacy matrices
-					{
-						memcpy(&viewMatrix,camera.getViewMatrix().pointer(),sizeof(viewMatrix));
-						memcpy(&viewProjMatrix,camera.getConcatenatedMatrix().pointer(),sizeof(viewProjMatrix));
-					}
- 					m_renderer->render(cb,CSimpleDebugRenderer::SViewParams(viewMatrix,viewProjMatrix));
-				}
-				cb->endRenderPass();
+					.framebuffer = framebuffer,
+					.colorClearValues = &clearValue,
+					.depthStencilClearValues = &depthValue,
+					.renderArea = currentRenderArea
+				};
+				cb->beginRenderPass(info, IGPUCommandBuffer::SUBPASS_CONTENTS::INLINE);
+
+				const SViewport viewport = {
+					.x = static_cast<float>(currentRenderArea.offset.x),
+					.y = static_cast<float>(currentRenderArea.offset.y),
+					.width = static_cast<float>(currentRenderArea.extent.width),
+					.height = static_cast<float>(currentRenderArea.extent.height)
+				};
+				cb->setViewport(0u, 1u, &viewport);
+
+				cb->setScissor(0u, 1u, &currentRenderArea);
 			}
-			cb->end();
-
-			IQueue::SSubmitInfo::SSemaphoreInfo retval =
-			{
-				.semaphore = m_semaphore.get(),
-				.value = ++m_realFrameIx,
-				.stageMask = PIPELINE_STAGE_FLAGS::ALL_GRAPHICS_BITS
-			};
-			const IQueue::SSubmitInfo::SCommandBufferInfo commandBuffers[] =
+			// late latch input
 			{
-				{.cmdbuf = cb }
-			};
-			const IQueue::SSubmitInfo::SSemaphoreInfo acquired[] = {
-				{
-					.semaphore = device_base_t::getCurrentAcquire().semaphore,
-					.value = device_base_t::getCurrentAcquire().acquireCount,
-					.stageMask = PIPELINE_STAGE_FLAGS::NONE
-				}
-			};
-			const IQueue::SSubmitInfo infos[] =
+				bool reload = false;
+				camera.beginInputProcessing(nextPresentationTimestamp);
+				mouse.consumeEvents([&](const IMouseEventChannel::range_t& events) -> void { camera.mouseProcess(events); }, m_logger.get());
+				keyboard.consumeEvents([&](const IKeyboardEventChannel::range_t& events) -> void
+					{
+						for (const auto& event : events)
+							if (event.keyCode == E_KEY_CODE::EKC_R && event.action == SKeyboardEvent::ECA_RELEASED)
+								reload = true;
+						camera.keyboardProcess(events);
+					},
+					m_logger.get()
+				);
+				camera.endInputProcessing(nextPresentationTimestamp);
+				if (reload)
+					reloadModel();
+			}
+			// draw scene
 			{
+				float32_t3x4 viewMatrix;
+				float32_t4x4 viewProjMatrix;
+				// TODO: get rid of legacy matrices
 				{
-					.waitSemaphores = acquired,
-					.commandBuffers = commandBuffers,
-					.signalSemaphores = {&retval,1}
+					memcpy(&viewMatrix, camera.getViewMatrix().pointer(), sizeof(viewMatrix));
+					memcpy(&viewProjMatrix, camera.getConcatenatedMatrix().pointer(), sizeof(viewProjMatrix));
 				}
-			};
-			
-			if (getGraphicsQueue()->submit(infos) != IQueue::RESULT::SUCCESS)
-			{
-				retval.semaphore = nullptr; // so that we don't wait on semaphore that will never signal
-				m_realFrameIx--;
+				m_renderer->render(cb, CSimpleDebugRenderer::SViewParams(viewMatrix, viewProjMatrix));
 			}
+			cb->endRenderPass();
+		}
+		cb->end();
 
-			std::string caption = "[Nabla Engine] Mesh Loaders";
+		IQueue::SSubmitInfo::SSemaphoreInfo retval =
+		{
+			.semaphore = m_semaphore.get(),
+			.value = ++m_realFrameIx,
+			.stageMask = PIPELINE_STAGE_FLAGS::ALL_GRAPHICS_BITS
+		};
+		const IQueue::SSubmitInfo::SCommandBufferInfo commandBuffers[] =
+		{
+			{.cmdbuf = cb }
+		};
+		const IQueue::SSubmitInfo::SSemaphoreInfo acquired[] = {
+			{
+				.semaphore = device_base_t::getCurrentAcquire().semaphore,
+				.value = device_base_t::getCurrentAcquire().acquireCount,
+				.stageMask = PIPELINE_STAGE_FLAGS::NONE
+			}
+		};
+		const IQueue::SSubmitInfo infos[] =
+		{
 			{
-				caption += ", displaying [";
-				caption += m_modelPath;
-				caption += "]";
-				m_window->setCaption(caption);
+				.waitSemaphores = acquired,
+				.commandBuffers = commandBuffers,
+				.signalSemaphores = {&retval,1}
 			}
-			return retval;
+		};
+
+		if (getGraphicsQueue()->submit(infos) != IQueue::RESULT::SUCCESS)
+		{
+			retval.semaphore = nullptr; // so that we don't wait on semaphore that will never signal
+			m_realFrameIx--;
 		}
 
-		inline bool onAppTerminated() override
+		std::string caption = "[Nabla Engine] Mesh Loaders";
 		{
-			m_logger->log("Waiting for all geometry saving tasks (%u) to complete...", ILogger::ELL_INFO, m_saveGeomTaskFutures.size());
+			caption += ", displaying [";
+			caption += m_modelPath;
+			caption += "]";
+			m_window->setCaption(caption);
+		}
+		return retval;
+	}
 
-			for (size_t i = 0; i < m_saveGeomTaskFutures.size(); i++)
-			{
-				const auto& task = m_saveGeomTaskFutures[i];
+	inline bool onAppTerminated() override
+	{
+		if (m_saveGeomTaskFuture.valid())
+		{
+			m_logger->log("Waiting for geometry writer to finish writing...", ILogger::ELL_INFO);
+			m_saveGeomTaskFuture.wait();
+		}
 
-				if (!task.valid())
-					continue;
+		return device_base_t::onAppTerminated();
+	}
 
-				task.wait();
-				m_logger->log("Task %u of %u completed!", ILogger::ELL_INFO, i+1, m_saveGeomTaskFutures.size());
+protected:
+	const video::IGPURenderpass::SCreationParams::SSubpassDependency* getDefaultSubpassDependencies() const override
+	{
+		// Subsequent submits don't wait for each other, hence its important to have External Dependencies which prevent users of the depth attachment overlapping.
+		const static IGPURenderpass::SCreationParams::SSubpassDependency dependencies[] = {
+			// wipe-transition of Color to ATTACHMENT_OPTIMAL and depth
+			{
+				.srcSubpass = IGPURenderpass::SCreationParams::SSubpassDependency::External,
+				.dstSubpass = 0,
+				.memoryBarrier = {
+				// last place where the depth can get modified in previous frame, `COLOR_ATTACHMENT_OUTPUT_BIT` is implicitly later
+				.srcStageMask = PIPELINE_STAGE_FLAGS::LATE_FRAGMENT_TESTS_BIT,
+				// don't want any writes to be available, we'll clear 
+				.srcAccessMask = ACCESS_FLAGS::NONE,
+				// destination needs to wait as early as possible
+				// TODO: `COLOR_ATTACHMENT_OUTPUT_BIT` shouldn't be needed, because its a logically later stage, see TODO in `ECommonEnums.h`
+				.dstStageMask = PIPELINE_STAGE_FLAGS::EARLY_FRAGMENT_TESTS_BIT | PIPELINE_STAGE_FLAGS::COLOR_ATTACHMENT_OUTPUT_BIT,
+				// because depth and color get cleared first no read mask
+				.dstAccessMask = ACCESS_FLAGS::DEPTH_STENCIL_ATTACHMENT_WRITE_BIT | ACCESS_FLAGS::COLOR_ATTACHMENT_WRITE_BIT
 			}
+			// leave view offsets and flags default
+		},
+			// color from ATTACHMENT_OPTIMAL to PRESENT_SRC
+			{
+				.srcSubpass = 0,
+				.dstSubpass = IGPURenderpass::SCreationParams::SSubpassDependency::External,
+				.memoryBarrier = {
+				// last place where the color can get modified, depth is implicitly earlier
+				.srcStageMask = PIPELINE_STAGE_FLAGS::COLOR_ATTACHMENT_OUTPUT_BIT,
+				// only write ops, reads can't be made available
+				.srcAccessMask = ACCESS_FLAGS::COLOR_ATTACHMENT_WRITE_BIT
+				// spec says nothing is needed when presentation is the destination
+			}
+			// leave view offsets and flags default
+		},
+		IGPURenderpass::SCreationParams::DependenciesEnd
+		};
+		return dependencies;
+	}
 
-			return device_base_t::onAppTerminated();
-		}
+private:
+	// TODO: standardise this across examples, and take from `argv`
+	bool m_nonInteractiveTest = false;
 
-	protected:
-		const video::IGPURenderpass::SCreationParams::SSubpassDependency* getDefaultSubpassDependencies() const override
+	bool reloadModel()
+	{
+		if (m_nonInteractiveTest) // TODO: maybe also take from argv and argc
+			m_modelPath = (sharedInputCWD / "ply/Spanner-ply.ply").string();
+		else
 		{
-			// Subsequent submits don't wait for each other, hence its important to have External Dependencies which prevent users of the depth attachment overlapping.
-			const static IGPURenderpass::SCreationParams::SSubpassDependency dependencies[] = {
-				// wipe-transition of Color to ATTACHMENT_OPTIMAL and depth
-				{
-					.srcSubpass = IGPURenderpass::SCreationParams::SSubpassDependency::External,
-					.dstSubpass = 0,
-					.memoryBarrier = {
-						// last place where the depth can get modified in previous frame, `COLOR_ATTACHMENT_OUTPUT_BIT` is implicitly later
-						.srcStageMask = PIPELINE_STAGE_FLAGS::LATE_FRAGMENT_TESTS_BIT,
-						// don't want any writes to be available, we'll clear 
-						.srcAccessMask = ACCESS_FLAGS::NONE,
-						// destination needs to wait as early as possible
-						// TODO: `COLOR_ATTACHMENT_OUTPUT_BIT` shouldn't be needed, because its a logically later stage, see TODO in `ECommonEnums.h`
-						.dstStageMask = PIPELINE_STAGE_FLAGS::EARLY_FRAGMENT_TESTS_BIT | PIPELINE_STAGE_FLAGS::COLOR_ATTACHMENT_OUTPUT_BIT,
-						// because depth and color get cleared first no read mask
-						.dstAccessMask = ACCESS_FLAGS::DEPTH_STENCIL_ATTACHMENT_WRITE_BIT | ACCESS_FLAGS::COLOR_ATTACHMENT_WRITE_BIT
-					}
-					// leave view offsets and flags default
-				},
-				// color from ATTACHMENT_OPTIMAL to PRESENT_SRC
+			pfd::open_file file("Choose a supported Model File", sharedInputCWD.string(),
 				{
-					.srcSubpass = 0,
-					.dstSubpass = IGPURenderpass::SCreationParams::SSubpassDependency::External,
-					.memoryBarrier = {
-						// last place where the color can get modified, depth is implicitly earlier
-						.srcStageMask = PIPELINE_STAGE_FLAGS::COLOR_ATTACHMENT_OUTPUT_BIT,
-						// only write ops, reads can't be made available
-						.srcAccessMask = ACCESS_FLAGS::COLOR_ATTACHMENT_WRITE_BIT
-						// spec says nothing is needed when presentation is the destination
-					}
-					// leave view offsets and flags default
+					"All Supported Formats", "*.ply *.stl *.serialized *.obj",
+					"TODO (.ply)", "*.ply",
+					"TODO (.stl)", "*.stl",
+					"Mitsuba 0.6 Serialized (.serialized)", "*.serialized",
+					"Wavefront Object (.obj)", "*.obj"
 				},
-				IGPURenderpass::SCreationParams::DependenciesEnd
-			};
-			return dependencies;
+				false
+			);
+			if (file.result().empty())
+				return false;
+			m_modelPath = file.result()[0];
 		}
 
-	private:
-		// TODO: standardise this across examples, and take from `argv`
-		bool m_nonInteractiveTest = false;
+		// free up
+		m_renderer->m_instances.clear();
+		m_renderer->clearGeometries({ .semaphore = m_semaphore.get(),.value = m_realFrameIx });
+		m_assetMgr->clearAllAssetCache();
+
+		//! load the geometry
+		IAssetLoader::SAssetLoadParams params = {};
+		params.logger = m_logger.get();
+		auto asset = m_assetMgr->getAsset(m_modelPath, params);
+		if (asset.getContents().empty())
+			return false;
+
+		// 
+		core::vector<smart_refctd_ptr<const ICPUPolygonGeometry>> geometries;
+		switch (asset.getAssetType())
+		{
+		case IAsset::E_TYPE::ET_GEOMETRY:
+			for (const auto& item : asset.getContents())
+				if (auto polyGeo = IAsset::castDown<ICPUPolygonGeometry>(item); polyGeo)
+					geometries.push_back(polyGeo);
+			break;
+		default:
+			m_logger->log("Asset loaded but not a supported type (ET_GEOMETRY,ET_GEOMETRY_COLLECTION)", ILogger::ELL_ERROR);
+			break;
+		}
+		if (geometries.empty())
+			return false;
 
-		bool reloadModel()
+		if (m_saveGeom)
 		{
-			if (m_nonInteractiveTest) // TODO: maybe also take from argv and argc
-				m_modelPath = (sharedInputCWD/"ply/Spanner-ply.ply").string();
-			else
+			if (m_saveGeomTaskFuture.valid())
 			{
-				pfd::open_file file("Choose a supported Model File", sharedInputCWD.string(),
-					{
-						"All Supported Formats", "*.ply *.stl *.serialized *.obj",
-						"TODO (.ply)", "*.ply",
-						"TODO (.stl)", "*.stl",
-						"Mitsuba 0.6 Serialized (.serialized)", "*.serialized",
-						"Wavefront Object (.obj)", "*.obj"
-					},
-					false
-				);
-				if (file.result().empty())
-					return false;
-				m_modelPath = file.result()[0];
+				m_logger->log("Waiting for previous geometry saving task to complete...", ILogger::ELL_INFO);
+				m_saveGeomTaskFuture.wait();
 			}
 
-			// free up
-			m_renderer->m_instances.clear();
-			m_renderer->clearGeometries({.semaphore=m_semaphore.get(),.value=m_realFrameIx});
-			m_assetMgr->clearAllAssetCache();
+			std::string currentGeomSavePath = m_specifiedGeomSavePath.value_or((m_saveGeomPrefixPath / path(m_modelPath).filename()).generic_string());
+			m_saveGeomTaskFuture = std::async(
+				std::launch::async,
+				[this, geometries, currentGeomSavePath] { writeGeometry(
+					geometries[0],
+					currentGeomSavePath
+				); }
+			);
+		}
 
-			//! load the geometry
-			IAssetLoader::SAssetLoadParams params = {};
-			params.logger = m_logger.get();
-			auto asset = m_assetMgr->getAsset(m_modelPath,params);
-			if (asset.getContents().empty())
-				return false;
+		using aabb_t = hlsl::shapes::AABB<3, double>;
+		auto printAABB = [&](const aabb_t& aabb, const char* extraMsg = "")->void
+			{
+				m_logger->log("%s AABB is (%f,%f,%f) -> (%f,%f,%f)", ILogger::ELL_INFO, extraMsg, aabb.minVx.x, aabb.minVx.y, aabb.minVx.z, aabb.maxVx.x, aabb.maxVx.y, aabb.maxVx.z);
+			};
+		auto bound = aabb_t::create();
+		// convert the geometries
+		{
+			smart_refctd_ptr<CAssetConverter> converter = CAssetConverter::create({ .device = m_device.get() });
 
-			// 
-			core::vector<smart_refctd_ptr<const ICPUPolygonGeometry>> geometries;
-			switch (asset.getAssetType())
+			const auto transferFamily = getTransferUpQueue()->getFamilyIndex();
+
+			struct SInputs : CAssetConverter::SInputs
 			{
-				case IAsset::E_TYPE::ET_GEOMETRY:
-					for (const auto& item : asset.getContents())
-					if (auto polyGeo=IAsset::castDown<ICPUPolygonGeometry>(item); polyGeo)
-						geometries.push_back(polyGeo);
-					break;
-				default:
-					m_logger->log("Asset loaded but not a supported type (ET_GEOMETRY,ET_GEOMETRY_COLLECTION)",ILogger::ELL_ERROR);
-					break;
-			}
-			if (geometries.empty())
-				return false;
+				virtual inline std::span<const uint32_t> getSharedOwnershipQueueFamilies(const size_t groupCopyID, const asset::ICPUBuffer* buffer, const CAssetConverter::patch_t<asset::ICPUBuffer>& patch) const
+				{
+					return sharedBufferOwnership;
+				}
 
-			if (m_saveGeom)
+				core::vector<uint32_t> sharedBufferOwnership;
+			} inputs = {};
+			core::vector<CAssetConverter::patch_t<ICPUPolygonGeometry>> patches(geometries.size(), CSimpleDebugRenderer::DefaultPolygonGeometryPatch);
 			{
-				std::string savePath = m_specifiedGeomSavePath.value_or((m_saveGeomPrefixPath / path(m_modelPath).filename()).generic_string());
-				m_saveGeomTaskFutures.emplace_back(
-					m_threadPool->enqueue([this, geometries, savePath] { writeGeometry(geometries[0], savePath); })
-				);
+				inputs.logger = m_logger.get();
+				std::get<CAssetConverter::SInputs::asset_span_t<ICPUPolygonGeometry>>(inputs.assets) = { &geometries.front().get(),geometries.size() };
+				std::get<CAssetConverter::SInputs::patch_span_t<ICPUPolygonGeometry>>(inputs.patches) = patches;
+				// set up shared ownership so we don't have to 
+				core::unordered_set<uint32_t> families;
+				families.insert(transferFamily);
+				families.insert(getGraphicsQueue()->getFamilyIndex());
+				if (families.size() > 1)
+					for (const auto fam : families)
+						inputs.sharedBufferOwnership.push_back(fam);
 			}
 
-			using aabb_t = hlsl::shapes::AABB<3,double>;
-			auto printAABB = [&](const aabb_t& aabb, const char* extraMsg="")->void
-			{
-				m_logger->log("%s AABB is (%f,%f,%f) -> (%f,%f,%f)",ILogger::ELL_INFO,extraMsg,aabb.minVx.x,aabb.minVx.y,aabb.minVx.z,aabb.maxVx.x,aabb.maxVx.y,aabb.maxVx.z);
-			};
-			auto bound = aabb_t::create();
-			// convert the geometries
+			// reserve
+			auto reservation = converter->reserve(inputs);
+			if (!reservation)
 			{
-				smart_refctd_ptr<CAssetConverter> converter = CAssetConverter::create({.device=m_device.get()});
-
-				const auto transferFamily = getTransferUpQueue()->getFamilyIndex();
+				m_logger->log("Failed to reserve GPU objects for CPU->GPU conversion!", ILogger::ELL_ERROR);
+				return false;
+			}
 
-				struct SInputs : CAssetConverter::SInputs
-				{
-					virtual inline std::span<const uint32_t> getSharedOwnershipQueueFamilies(const size_t groupCopyID, const asset::ICPUBuffer* buffer, const CAssetConverter::patch_t<asset::ICPUBuffer>& patch) const
-					{
-						return sharedBufferOwnership;
-					}
+			// convert
+			{
+				auto semaphore = m_device->createSemaphore(0u);
 
-					core::vector<uint32_t> sharedBufferOwnership;
-				} inputs = {};
-				core::vector<CAssetConverter::patch_t<ICPUPolygonGeometry>> patches(geometries.size(),CSimpleDebugRenderer::DefaultPolygonGeometryPatch);
+				constexpr auto MultiBuffering = 2;
+				std::array<smart_refctd_ptr<IGPUCommandBuffer>, MultiBuffering> commandBuffers = {};
 				{
-					inputs.logger = m_logger.get();
-					std::get<CAssetConverter::SInputs::asset_span_t<ICPUPolygonGeometry>>(inputs.assets) = {&geometries.front().get(),geometries.size()};
-					std::get<CAssetConverter::SInputs::patch_span_t<ICPUPolygonGeometry>>(inputs.patches) = patches;
-					// set up shared ownership so we don't have to 
-					core::unordered_set<uint32_t> families;
-					families.insert(transferFamily);
-					families.insert(getGraphicsQueue()->getFamilyIndex());
-					if (families.size()>1)
-					for (const auto fam : families)
-						inputs.sharedBufferOwnership.push_back(fam);
+					auto pool = m_device->createCommandPool(transferFamily, IGPUCommandPool::CREATE_FLAGS::RESET_COMMAND_BUFFER_BIT | IGPUCommandPool::CREATE_FLAGS::TRANSIENT_BIT);
+					pool->createCommandBuffers(IGPUCommandPool::BUFFER_LEVEL::PRIMARY, commandBuffers, smart_refctd_ptr(m_logger));
 				}
-				
-				// reserve
-				auto reservation = converter->reserve(inputs);
-				if (!reservation)
+				commandBuffers.front()->begin(IGPUCommandBuffer::USAGE::ONE_TIME_SUBMIT_BIT);
+
+				std::array<IQueue::SSubmitInfo::SCommandBufferInfo, MultiBuffering> commandBufferSubmits;
+				for (auto i = 0; i < MultiBuffering; i++)
+					commandBufferSubmits[i].cmdbuf = commandBuffers[i].get();
+
+				SIntendedSubmitInfo transfer = {};
+				transfer.queue = getTransferUpQueue();
+				transfer.scratchCommandBuffers = commandBufferSubmits;
+				transfer.scratchSemaphore = {
+					.semaphore = semaphore.get(),
+					.value = 0u,
+					.stageMask = PIPELINE_STAGE_FLAGS::ALL_TRANSFER_BITS
+				};
+
+				CAssetConverter::SConvertParams cpar = {};
+				cpar.utilities = m_utils.get();
+				cpar.transfer = &transfer;
+
+				// basically it records all data uploads and submits them right away
+				auto future = reservation.convert(cpar);
+				if (future.copy() != IQueue::RESULT::SUCCESS)
 				{
-					m_logger->log("Failed to reserve GPU objects for CPU->GPU conversion!",ILogger::ELL_ERROR);
+					m_logger->log("Failed to await submission feature!", ILogger::ELL_ERROR);
 					return false;
 				}
-
-				// convert
-				{
-					auto semaphore = m_device->createSemaphore(0u);
-
-					constexpr auto MultiBuffering = 2;
-					std::array<smart_refctd_ptr<IGPUCommandBuffer>,MultiBuffering> commandBuffers = {};
-					{
-						auto pool = m_device->createCommandPool(transferFamily,IGPUCommandPool::CREATE_FLAGS::RESET_COMMAND_BUFFER_BIT|IGPUCommandPool::CREATE_FLAGS::TRANSIENT_BIT);
-						pool->createCommandBuffers(IGPUCommandPool::BUFFER_LEVEL::PRIMARY,commandBuffers,smart_refctd_ptr(m_logger));
-					}
-					commandBuffers.front()->begin(IGPUCommandBuffer::USAGE::ONE_TIME_SUBMIT_BIT);
-
-					std::array<IQueue::SSubmitInfo::SCommandBufferInfo,MultiBuffering> commandBufferSubmits;
-					for (auto i=0; i<MultiBuffering; i++)
-						commandBufferSubmits[i].cmdbuf = commandBuffers[i].get();
-
-					SIntendedSubmitInfo transfer = {};
-					transfer.queue = getTransferUpQueue();
-					transfer.scratchCommandBuffers = commandBufferSubmits;
-					transfer.scratchSemaphore = {
-						.semaphore = semaphore.get(),
-						.value = 0u,
-						.stageMask = PIPELINE_STAGE_FLAGS::ALL_TRANSFER_BITS
-					};
-
-					CAssetConverter::SConvertParams cpar = {};
-					cpar.utilities = m_utils.get();
-					cpar.transfer = &transfer;
-
-					// basically it records all data uploads and submits them right away
-					auto future = reservation.convert(cpar);
-					if (future.copy()!=IQueue::RESULT::SUCCESS)
-					{
-						m_logger->log("Failed to await submission feature!", ILogger::ELL_ERROR);
-						return false;
-					}
-				}
-				
-				auto tmp = hlsl::float32_t4x3(
-					hlsl::float32_t3(1,0,0),
-					hlsl::float32_t3(0,1,0),
-					hlsl::float32_t3(0,0,1),
-					hlsl::float32_t3(0,0,0)
-				);
-				core::vector<hlsl::float32_t3x4> worldTforms;
-				const auto& converted = reservation.getGPUObjects<ICPUPolygonGeometry>();
-				for (const auto& geom : converted)
-				{
-					const auto promoted = geom.value->getAABB<aabb_t>();
-					printAABB(promoted,"Geometry");
-					tmp[3].x += promoted.getExtent().x;
-					const auto promotedWorld = hlsl::float64_t3x4(worldTforms.emplace_back(hlsl::transpose(tmp)));
-					const auto transformed = hlsl::shapes::util::transform(promotedWorld,promoted);
-					printAABB(transformed,"Transformed");
-					bound = hlsl::shapes::util::union_(transformed,bound);
-				}
-				printAABB(bound,"Total");
-				if (!m_renderer->addGeometries({ &converted.front().get(),converted.size() }))
-					return false;
-
-				auto worlTformsIt = worldTforms.begin();
-				for (const auto& geo : m_renderer->getGeometries())
-					m_renderer->m_instances.push_back({
-						.world = *(worlTformsIt++),
-						.packedGeo = &geo
-					});
 			}
 
-			// get scene bounds and reset camera
+			auto tmp = hlsl::float32_t4x3(
+				hlsl::float32_t3(1, 0, 0),
+				hlsl::float32_t3(0, 1, 0),
+				hlsl::float32_t3(0, 0, 1),
+				hlsl::float32_t3(0, 0, 0)
+			);
+			core::vector<hlsl::float32_t3x4> worldTforms;
+			const auto& converted = reservation.getGPUObjects<ICPUPolygonGeometry>();
+			for (const auto& geom : converted)
 			{
-				const double distance = 0.05;
-				const auto diagonal = bound.getExtent();
-				{
-					const auto measure = hlsl::length(diagonal);
-					const auto aspectRatio = float(m_window->getWidth())/float(m_window->getHeight());
-					camera.setProjectionMatrix(core::matrix4SIMD::buildProjectionMatrixPerspectiveFovRH(1.2f,aspectRatio,distance*measure*0.1,measure*4.0));
-					camera.setMoveSpeed(measure*0.04);
-				}
-				const auto pos = bound.maxVx+diagonal*distance;
-				camera.setPosition(vectorSIMDf(pos.x,pos.y,pos.z));
-				const auto center = (bound.minVx+bound.maxVx)*0.5;
-				camera.setTarget(vectorSIMDf(center.x,center.y,center.z));
+				const auto promoted = geom.value->getAABB<aabb_t>();
+				printAABB(promoted, "Geometry");
+				tmp[3].x += promoted.getExtent().x;
+				const auto promotedWorld = hlsl::float64_t3x4(worldTforms.emplace_back(hlsl::transpose(tmp)));
+				const auto transformed = hlsl::shapes::util::transform(promotedWorld, promoted);
+				printAABB(transformed, "Transformed");
+				bound = hlsl::shapes::util::union_(transformed, bound);
 			}
+			printAABB(bound, "Total");
+			if (!m_renderer->addGeometries({ &converted.front().get(),converted.size() }))
+				return false;
 
-			// TODO: write out the geometry
-
-			return true;
+			auto worlTformsIt = worldTforms.begin();
+			for (const auto& geo : m_renderer->getGeometries())
+				m_renderer->m_instances.push_back({
+					.world = *(worlTformsIt++),
+					.packedGeo = &geo
+					});
 		}
 
-		void writeGeometry(smart_refctd_ptr<const ICPUPolygonGeometry> geometry, const std::string& savePath)
+		// get scene bounds and reset camera
 		{
-			IAsset* assetPtr = const_cast<IAsset*>(static_cast<const IAsset*>(geometry.get()));
-			IAssetWriter::SAssetWriteParams params{ assetPtr };
-			m_logger->log("Saving mesh to %s", ILogger::ELL_INFO, savePath.c_str());
-			if (!m_assetMgr->writeAsset(savePath, params))
-				m_logger->log("Failed to save %s", ILogger::ELL_ERROR, savePath.c_str());
-			m_logger->log("Mesh successfully saved!", ILogger::ELL_INFO);
+			const double distance = 0.05;
+			const auto diagonal = bound.getExtent();
+			{
+				const auto measure = hlsl::length(diagonal);
+				const auto aspectRatio = float(m_window->getWidth()) / float(m_window->getHeight());
+				camera.setProjectionMatrix(core::matrix4SIMD::buildProjectionMatrixPerspectiveFovRH(1.2f, aspectRatio, distance * measure * 0.1, measure * 4.0));
+				camera.setMoveSpeed(measure * 0.04);
+			}
+			const auto pos = bound.maxVx + diagonal * distance;
+			camera.setPosition(vectorSIMDf(pos.x, pos.y, pos.z));
+			const auto center = (bound.minVx + bound.maxVx) * 0.5;
+			camera.setTarget(vectorSIMDf(center.x, center.y, center.z));
 		}
 
-		// Maximum frames which can be simultaneously submitted, used to cycle through our per-frame resources like command buffers
-		constexpr static inline uint32_t MaxFramesInFlight = 3u;
-		//
-		smart_refctd_ptr<CSimpleDebugRenderer> m_renderer;
-		//
-		smart_refctd_ptr<ISemaphore> m_semaphore;
-		uint64_t m_realFrameIx = 0;
-		std::array<smart_refctd_ptr<IGPUCommandBuffer>,MaxFramesInFlight> m_cmdBufs;
-		//
-		InputSystem::ChannelReader<IMouseEventChannel> mouse;
-		InputSystem::ChannelReader<IKeyboardEventChannel> keyboard;
-		//
-		Camera camera = Camera(core::vectorSIMDf(0, 0, 0), core::vectorSIMDf(0, 0, 0), core::matrix4SIMD());
-		// mutables
-		std::string m_modelPath;
-
-		bool m_saveGeom = false;
-		std::unique_ptr<ThreadPool> m_threadPool = std::make_unique<ThreadPool>(3);
-		std::vector<std::future<void>> m_saveGeomTaskFutures;
-		std::optional<const std::string> m_specifiedGeomSavePath;
-		nbl::system::path m_saveGeomPrefixPath;
+		// TODO: write out the geometry
+
+		return true;
+	}
+
+	void writeGeometry(smart_refctd_ptr<const ICPUPolygonGeometry> geometry, const std::string& savePath)
+	{
+		IAsset* assetPtr = const_cast<IAsset*>(static_cast<const IAsset*>(geometry.get()));
+		IAssetWriter::SAssetWriteParams params{ assetPtr };
+		m_logger->log("Saving mesh to %s", ILogger::ELL_INFO, savePath.c_str());
+		if (!m_assetMgr->writeAsset(savePath, params))
+			m_logger->log("Failed to save %s", ILogger::ELL_ERROR, savePath.c_str());
+		m_logger->log("Mesh successfully saved!", ILogger::ELL_INFO);
+	}
+
+	// Maximum frames which can be simultaneously submitted, used to cycle through our per-frame resources like command buffers
+	constexpr static inline uint32_t MaxFramesInFlight = 3u;
+	//
+	smart_refctd_ptr<CSimpleDebugRenderer> m_renderer;
+	//
+	smart_refctd_ptr<ISemaphore> m_semaphore;
+	uint64_t m_realFrameIx = 0;
+	std::array<smart_refctd_ptr<IGPUCommandBuffer>, MaxFramesInFlight> m_cmdBufs;
+	//
+	InputSystem::ChannelReader<IMouseEventChannel> mouse;
+	InputSystem::ChannelReader<IKeyboardEventChannel> keyboard;
+	//
+	Camera camera = Camera(core::vectorSIMDf(0, 0, 0), core::vectorSIMDf(0, 0, 0), core::matrix4SIMD());
+	// mutables
+	std::string m_modelPath;
+
+	bool m_saveGeom = false;
+	std::future<void> m_saveGeomTaskFuture;
+	std::optional<const std::string> m_specifiedGeomSavePath;
+	nbl::system::path m_saveGeomPrefixPath;
 };
 
 NBL_MAIN_FUNC(MeshLoadersApp)
\ No newline at end of file

From ddafc219376598c5d2116949341ee8ee1e4080fc Mon Sep 17 00:00:00 2001
From: kevyuu <kevin.kayu@gmail.com>
Date: Mon, 25 Aug 2025 19:28:31 +0700
Subject: [PATCH 529/529] Add disk geometry to example 67

---
 67_RayQueryGeometry/main.cpp | 1 +
 1 file changed, 1 insertion(+)

diff --git a/67_RayQueryGeometry/main.cpp b/67_RayQueryGeometry/main.cpp
index 820b165b0..0d36ca368 100644
--- a/67_RayQueryGeometry/main.cpp
+++ b/67_RayQueryGeometry/main.cpp
@@ -502,6 +502,7 @@ class RayQueryGeometryApp final : public SimpleWindowedApplication, public Built
 
 			std::vector<ReferenceObjectCpu> cpuObjects;
 			cpuObjects.push_back(ReferenceObjectCpu{ .transform = nextTransform(), .data = gc->createArrow() });
+			cpuObjects.push_back(ReferenceObjectCpu{ .transform = nextTransform(), .data = CPolygonGeometryManipulator::createTriangleListIndexing(gc->createDisk(1.0f, 12).get()) });
 			cpuObjects.push_back(ReferenceObjectCpu{ .transform = nextTransform(), .data = gc->createCube({1.f, 1.f, 1.f})});
 			cpuObjects.push_back(ReferenceObjectCpu{ .transform = nextTransform(), .data = gc->createSphere(2, 16, 16)});
 			cpuObjects.push_back(ReferenceObjectCpu{ .transform = nextTransform(), .data = gc->createCylinder(2, 2, 20)});